Skip to content

Commit 85806ed

Browse files
mzient
authored and stiepan committed
Fix stream ordering in Tensor::Copy and Tensor(List)GPU.as_cpu
Signed-off-by: Michal Zientkiewicz <michalz@nvidia.com>
1 parent aa2d26f commit 85806ed

File tree

3 files changed

+35
-9
lines changed

3 files changed

+35
-9
lines changed

dali/pipeline/data/tensor.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,22 +113,29 @@ class Tensor : public Buffer<Backend> {
113113
inline void Copy(const Tensor<InBackend> &other, AccessOrder order = {}) {
114114
constexpr bool is_host_to_host = std::is_same<Backend, CPUBackend>::value &&
115115
std::is_same<InBackend, CPUBackend>::value;
116+
auto src_order = other.order();
117+
auto dst_order = order_;
116118
if (!order) {
117119
if (is_host_to_host)
118120
order = AccessOrder::host();
119-
else
120-
order = other.order() ? other.order() : order_;
121+
else // use device order, if available; if not, use whichever (dst, src) is set
122+
order = dst_order.is_device()
123+
? dst_order
124+
: src_order.is_device()
125+
? src_order
126+
: dst_order ? dst_order : src_order;
121127
}
122128
DALI_ENFORCE(!is_host_to_host || !order.is_device(),
123129
"Cannot issue a host-to-host copy on a device stream.");
124130
this->Resize(other.shape(), other.type());
125-
order.wait(order_);
131+
order.wait(dst_order); // wait for the destination to avoid overwriting while in use
132+
order.wait(other.order()); // wait for the source to avoid reading while not ready
126133
this->SetLayout(other.GetLayout());
127134
this->SetSourceInfo(other.GetSourceInfo());
128135
this->SetSkipSample(other.ShouldSkipSample());
129136
type_.template Copy<Backend, InBackend>(this->raw_mutable_data(),
130137
other.raw_data(), this->size(), order.stream());
131-
order_.wait(order);
138+
dst_order.wait(order);
132139
}
133140

134141
/**

dali/pipeline/data/tensor_list.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,17 @@ namespace copy_impl {
4545
*
4646
* The copy ordering can be:
4747
* - explicit, as specified in `order`
48-
* - the one from `src_order`, if set
49-
* - the one from `dst_order`
48+
* - the one from `dst_order`, if set
49+
* - the one from `src_order`
5050
* @return copy_order - order on which we will do the copy
5151
*/
5252
AccessOrder SyncBefore(AccessOrder dst_order, AccessOrder src_order, AccessOrder order) {
5353
if (!order)
54-
order = src_order ? src_order : dst_order;
55-
54+
order = dst_order.is_device()
55+
? dst_order
56+
: src_order.is_device()
57+
? src_order
58+
: dst_order ? dst_order : src_order;
5659
// The destination buffer must be ready to be overwritten
5760
order.wait(dst_order);
5861
// The source buffer must be ready to consume

dali/python/backend_impl.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,9 @@ void ExposeTensor(py::module &m) {
832832
},
833833
R"code(Passthrough, since the object is already an instance of `TensorCPU`.)code",
834834
py::return_value_policy::reference_internal)
835+
.def("_set_stream", [](Tensor<CPUBackend> &t, py::object stream) {
836+
t.set_order(AccessOrderFromPythonStreamObj(stream));
837+
})
835838
.def("_make_copy", [](const Tensor<CPUBackend> &t) {
836839
auto dst = std::make_unique<Tensor<CPUBackend>>();
837840
dst->set_device_id(t.device_id());
@@ -997,6 +1000,7 @@ void ExposeTensor(py::module &m) {
9971000
DeviceGuard g(t.device_id());
9981001
auto ret = std::make_unique<Tensor<CPUBackend>>();
9991002
ret->set_pinned(false);
1003+
ret->set_order(AccessOrder::host());
10001004
UserStream * us = UserStream::Get();
10011005
cudaStream_t s = us->GetStream(t);
10021006
ret->Copy(t, s);
@@ -1007,6 +1011,9 @@ void ExposeTensor(py::module &m) {
10071011
Returns a `TensorCPU` object being a copy of this `TensorGPU`.
10081012
)code",
10091013
py::return_value_policy::take_ownership)
1014+
.def("_set_stream", [](Tensor<GPUBackend> &t, py::object stream) {
1015+
t.set_order(AccessOrderFromPythonStreamObj(stream));
1016+
})
10101017
.def("_make_copy", [](const Tensor<GPUBackend> &t) {
10111018
DeviceGuard dg(t.device_id());
10121019
auto dst = std::make_unique<Tensor<GPUBackend>>();
@@ -1112,7 +1119,9 @@ std::unique_ptr<Tensor<Backend> > TensorListGetItemImpl(TensorList<Backend> &t,
11121119
auto ptr = std::make_unique<Tensor<Backend>>();
11131120
// TODO(klecki): Rework this with proper sample-based tensor batch data structure
11141121
auto &sample_shared_ptr = unsafe_sample_owner(t, id);
1115-
ptr->ShareData(sample_shared_ptr, t.capacity(), t.is_pinned(), t.shape()[id], t.type(),
1122+
auto &tshape = t.tensor_shape(id);
1123+
size_t num_bytes = tshape.num_elements() * t.type_info().size();
1124+
ptr->ShareData(sample_shared_ptr, num_bytes, t.is_pinned(), tshape, t.type(),
11161125
t.device_id(), t.order(), t.ready_event());
11171126
ptr->SetMeta(t.GetMeta(id));
11181127
return ptr;
@@ -1360,6 +1369,9 @@ void ExposeTensorListCPU(py::module &m) {
13601369
return t;
13611370
}, R"code(Passthrough, as it is already an instance of `TensorListCPU`.)code",
13621371
py::return_value_policy::reference_internal)
1372+
.def("_set_stream", [](TensorList<CPUBackend> &t, py::object stream) {
1373+
t.set_order(AccessOrderFromPythonStreamObj(stream));
1374+
})
13631375
.def("_make_copy", [](const TensorList<CPUBackend> &t) {
13641376
auto dst = std::make_shared<TensorList<CPUBackend>>();
13651377
dst->set_device_id(t.device_id());
@@ -1625,6 +1637,7 @@ void ExposeTesorListGPU(py::module &m) {
16251637
DeviceGuard g(t.device_id());
16261638
auto ret = std::make_shared<TensorList<CPUBackend>>();
16271639
ret->set_pinned(false);
1640+
ret->set_order(AccessOrder::host());
16281641
ret->SetContiguity(BatchContiguity::Contiguous);
16291642
UserStream * us = UserStream::Get();
16301643
cudaStream_t s = us->GetStream(t);
@@ -1636,6 +1649,9 @@ void ExposeTesorListGPU(py::module &m) {
16361649
Returns a `TensorListCPU` object being a copy of this `TensorListGPU`.
16371650
)code",
16381651
py::return_value_policy::take_ownership)
1652+
.def("_set_stream", [](TensorList<GPUBackend> &t, py::object stream) {
1653+
t.set_order(AccessOrderFromPythonStreamObj(stream));
1654+
})
16391655
.def("_make_copy", [](const TensorList<GPUBackend> &tl) {
16401656
DeviceGuard dg(tl.device_id());
16411657
auto dst = std::make_shared<TensorList<GPUBackend>>();

0 commit comments

Comments
 (0)