facebookincubator · kjmph · May 15, 2026 · May 20, 2026 · karthikeyann · Jun 2, 2026
@@ -101,11 +101,7 @@ std::unique_ptr<cudf::table> getConcatenatedTable(
   // the wrong stream.
   auto output = cudf::concatenate(tableViews, stream, mr);
 
-  // Order input deallocations after the concatenate read.
-  // Since memory resources are stream-ordered, deallocations
-  // on inputStreams will be ordered after the concatenate completes.
-  CudaEvent event(cudaEventDisableTiming);
-  streamsWaitForStream(event, inputStreams, stream);
+  orderCudfVectorDeallocationsAfterStream(tables, inputStreams, stream);
   // Input tables are deallocated here when 'tables' goes out of scope.
   return output;
 }
@@ -168,20 +164,15 @@ std::vector<std::unique_ptr<cudf::table>> getConcatenatedTableBatched(
             stream,
             mr));
   }
-  // Order input deallocations after the concatenate reads by making all input
-  // streams wait for the output stream.
-  // Since memory resources are stream-ordered, deallocations
-  // on inputStreams will be ordered after the concatenate completes.
-  CudaEvent event(cudaEventDisableTiming);
-  streamsWaitForStream(event, inputStreams, stream);
+  orderCudfVectorDeallocationsAfterStream(tables, inputStreams, stream);
 
   // Input tables are deallocated here when 'tables' goes out of scope.
   return outputTables;
 }
 
 void streamsWaitForStream(
     CudaEvent& event,
-    const std::vector<rmm::cuda_stream_view>& streams,
+    std::span<const rmm::cuda_stream_view> streams,
     rmm::cuda_stream_view stream) {
   event.recordFrom(stream);
   for (const auto& strm : streams) {
@@ -216,4 +207,20 @@ const CudaEvent& CudaEvent::waitOn(rmm::cuda_stream_view stream) const {
   return *this;
 }
 
+void orderCudfVectorDeallocationsAfterStream(
+    std::span<const CudfVectorPtr> vectors,
+    std::span<const rmm::cuda_stream_view> inputStreams,
+    rmm::cuda_stream_view stream) {
+  // Attempt the rebind on every input, even after one has already failed.
+  bool anyRebindFailed = false;
+  for (const auto& vector : vectors) {
+    VELOX_CHECK_NOT_NULL(vector);
+    anyRebindFailed |= !vector->rebindStream(stream);
+  }
+  if (anyRebindFailed) {
+    // Fallback path: event-based wait of 'inputStreams' on 'stream'.
+    CudaEvent fallbackEvent(cudaEventDisableTiming);
+    streamsWaitForStream(fallbackEvent, inputStreams, stream);
+  }
+}
+
 } // namespace facebook::velox::cudf_velox
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <memory>
+#include <span>
 
 namespace facebook::velox::cudf_velox {
 
@@ -178,6 +179,19 @@ class CudaEvent {
  */
 void streamsWaitForStream(
     CudaEvent& event,
-    const std::vector<rmm::cuda_stream_view>& streams,
+    std::span<const rmm::cuda_stream_view> streams,
+    rmm::cuda_stream_view stream);
+
+/**
+ * @brief Orders CudfVector deallocations after work on a target stream.
+ *
+ * Prefers rebinding owned table buffers to @p stream so stream-ordered memory
+ * resources free the inputs after prior work on that stream. Falls back to an
+ * event wait on @p inputStreams when an input cannot be rebound without
+ * materializing, e.g. packed-table inputs or older cuDF builds.
+ *
+ * @param vectors Input vectors to rebind; each entry must be non-null.
+ * @param inputStreams Streams made to wait on @p stream if any rebind fails.
+ * @param stream Stream the deallocations must be ordered after.
+ */
+void orderCudfVectorDeallocationsAfterStream(
+    std::span<const CudfVectorPtr> vectors,
+    std::span<const rmm::cuda_stream_view> inputStreams,
     rmm::cuda_stream_view stream);
 } // namespace facebook::velox::cudf_velox
@@ -25,6 +25,13 @@
 #include <cudf/column/column.hpp>
 #include <cudf/table/table.hpp>
 
+// Detect at compile time whether the cuDF headers on the include path provide
+// <cudf/column/column_stream.hpp>, and expose the result as
+// VELOX_CUDF_HAS_COLUMN_REBIND_STREAM (1 when present, 0 otherwise).
+#if __has_include(<cudf/column/column_stream.hpp>)
+#include <cudf/column/column_stream.hpp>
+#define VELOX_CUDF_HAS_COLUMN_REBIND_STREAM 1
+#else
+#define VELOX_CUDF_HAS_COLUMN_REBIND_STREAM 0
+#endif
+
 namespace facebook::velox::cudf_velox {
 namespace {
 
@@ -175,6 +182,32 @@ std::unique_ptr<cudf::table> CudfVector::release() {
   return materializedTable;
 }
 
+// Rebinds the columns of the owned cudf::table to 'stream'. Returns false
+// when there is no owned table to rebind or the cuDF rebind API is missing.
+bool CudfVector::rebindStream(rmm::cuda_stream_view stream) {
+  // Already bound to the requested stream: nothing to do.
+  if (stream.value() == stream_.value()) {
+    return true;
+  }
+#if VELOX_CUDF_HAS_COLUMN_REBIND_STREAM
+  auto* ownedTable = std::get_if<std::unique_ptr<cudf::table>>(&tableStorage_);
+  if (ownedTable != nullptr && *ownedTable != nullptr) {
+    // Take the columns out, rebind each one, then rebuild the owning table.
+    auto reboundColumns = (*ownedTable)->release();
+    for (auto& col : reboundColumns) {
+      col = cudf::rebind_stream(std::move(*col), stream);
+    }
+    *ownedTable = std::make_unique<cudf::table>(std::move(reboundColumns));
+    // Refresh the cached view from the rebuilt table and record the stream.
+    tabView_ = (*ownedTable)->view();
+    stream_ = stream;
+    return true;
+  }
+#endif
+  // Storage is not an owned cudf::table (or it is null), or no rebind API.
+  return false;
+}
+
 uint64_t CudfVector::estimateFlatSize() const {
   return flatSize_;
 }

@@ -68,6 +68,11 @@ class CudfVector : public RowVector {
   /// first (which copies the data).
   std::unique_ptr<cudf::table> release();
 
+  /// Rebinds owned table buffers to use 'stream' for future deallocation.
+  /// Returns true when the vector is already bound to 'stream', or when its
+  /// owned cudf::table was rebound; in the latter case the stored table, its
+  /// view and the vector's stream are replaced. Returns false when the storage
+  /// cannot be rebound without materializing or when the cuDF rebind API is
+  /// unavailable.
+  bool rebindStream(rmm::cuda_stream_view stream);
+
   uint64_t estimateFlatSize() const override;
 
  private:

diff --git a/velox/experimental/ucx-exchange/UcxPartitionedOutput.cpp b/velox/experimental/ucx-exchange/UcxPartitionedOutput.cpp
@@ -19,11 +19,13 @@
 #include "velox/core/QueryConfig.h"
 #include "velox/exec/Driver.h"
 #include "velox/exec/Operator.h"
+#include "velox/experimental/cudf/exec/Utilities.h"
 #include "velox/experimental/cudf/vector/CudfVector.h"
 
 #include <cudf/concatenate.hpp>
 #include <cudf/contiguous_split.hpp>
 #include <cudf/copying.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/partitioning.hpp>
 
 using namespace facebook::velox::cudf_velox;
@@ -128,24 +130,26 @@ void UcxPartitionedOutput::flushPending() {
           ? cv->getTableView()
           : cv->getTableView().select(remap_.begin(), remap_.end());
     } else {
-      // Sync all input streams so their GPU data is ready to read.
-      for (auto& v : pendingInputs_) {
-        v->stream().synchronize();
-      }
-
       // Collect (remapped) table views.
       std::vector<cudf::table_view> views;
+      std::vector<rmm::cuda_stream_view> inputStreams;
       views.reserve(pendingInputs_.size());
+      inputStreams.reserve(pendingInputs_.size());
       for (auto& v : pendingInputs_) {
+        inputStreams.push_back(v->stream());
         views.push_back(
             remap_.empty()
                 ? v->getTableView()
                 : v->getTableView().select(remap_.begin(), remap_.end()));
       }
 
+      cudf::detail::join_streams(inputStreams, stream);
       mergedTable = cudf::concatenate(
           views, stream, cudf::get_current_device_resource_ref());
 
+      orderCudfVectorDeallocationsAfterStream(
+          pendingInputs_, inputStreams, stream);
+
       // Free input GPU memory before partitioning (peak = 2x -> 1x).
       pendingInputs_.clear();