Merge pull request tensorflow#51094 from benbarsdell:gpu-war-radix-sort-sparse-segment-reduce-grad

tensorflower-gardener · tensorflower-gardener · commit 8cca9d147a52 · 2021-08-11T11:23:08.000-07:00
PiperOrigin-RevId: 390177302
Change-Id: Idd8e881b54b6a3693de0471df9cee106498162b0
diff --git a/tensorflow/core/kernels/gpu_prim_helpers.h b/tensorflow/core/kernels/gpu_prim_helpers.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/gpu_prim.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
+#include "tensorflow/stream_executor/stream.h"
 
 namespace tensorflow {
 
@@ -57,6 +58,36 @@ Status GpuRadixSort(OpKernelContext* context, int size, const Tkey* keys_in,
                     const Tindex* indices_in,  // Optional
                     Tindex* indices_out, int num_bits = sizeof(Tkey) * 8) {
   if (size == 0) return Status::OK();
+  if (num_bits == 0) {
+    // Workaround for CUB failing when begin_bit = end_bit = 0 (e.g., when all
+    // keys are 0, so no sorting is needed).
+    se::Stream* stream = context->op_device_context()->stream();
+    if (keys_out) {
+      // Copy keys_in to keys_out.
+      size_t num_bytes = size * sizeof(Tkey);
+      se::DeviceMemoryBase src(const_cast<Tkey*>(keys_in), num_bytes);
+      se::DeviceMemoryBase dst(keys_out, num_bytes);
+      if (!stream->ThenMemcpy(&dst, src, num_bytes).ok()) {
+        return errors::Internal("Failed to copy keys_in to keys_out");
+      }
+    }
+    if (indices_in) {
+      // Copy indices_in to indices_out.
+      size_t num_bytes = size * sizeof(Tindex);
+      se::DeviceMemoryBase src(const_cast<Tindex*>(indices_in), num_bytes);
+      se::DeviceMemoryBase dst(indices_out, num_bytes);
+      if (!stream->ThenMemcpy(&dst, src, num_bytes).ok()) {
+        return errors::Internal("Failed to copy indices_in to indices_out");
+      }
+    } else {
+      // Set output indices to range.
+      const Eigen::GpuDevice& device =
+          context->eigen_device<Eigen::GpuDevice>();
+      TF_RETURN_IF_ERROR(detail::RangeInit(device, Tindex(0), Tindex(1),
+                                           Tindex(size), indices_out));
+    }
+    return Status::OK();
+  }
   // Allocate temporary inputs/outputs if necessary.
   Tensor tmp_indices_in;
   if (!indices_in) {
diff --git a/tensorflow/core/kernels/gpu_prim_helpers_test.cu.cc b/tensorflow/core/kernels/gpu_prim_helpers_test.cu.cc
@@ -271,6 +271,38 @@ TEST_F(GpuPrimHelpersTest, GpuRadixSort_WithNumBits) {
   test::ExpectTensorEqual<int32>(expected_indices_out, *GetOutput(1));
 }
 
+TEST_F(GpuPrimHelpersTest, GpuRadixSort_WithNumBitsZero) {
+  // num_bits=0 with an empty inds input: keys stay unsorted, indices are 0..7.
+  MakeRadixSort(DT_INT32, DT_INT32, /*need_keys_out=*/true, /*num_bits=*/0);
+  AddInputFromArray<int32>(TensorShape({8}), {4, 2, 6, 7, 1, 3, 0, 5});  // keys
+  AddInputFromArray<int32>(TensorShape({0}), {});                        // inds
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected_keys_out(allocator(), DT_INT32, TensorShape({8}));
+  test::FillValues<int32>(&expected_keys_out, {4, 2, 6, 7, 1, 3, 0, 5});
+  test::ExpectTensorEqual<int32>(expected_keys_out, *GetOutput(0));
+
+  Tensor expected_indices_out(allocator(), DT_INT32, TensorShape({8}));
+  test::FillValues<int32>(&expected_indices_out, {0, 1, 2, 3, 4, 5, 6, 7});
+  test::ExpectTensorEqual<int32>(expected_indices_out, *GetOutput(1));
+}
+
+TEST_F(GpuPrimHelpersTest, GpuRadixSort_KeysAndIndices_WithNumBitsZero) {
+  // num_bits=0 with indices_in given: keys and indices both pass through as-is.
+  MakeRadixSort(DT_INT32, DT_INT32, /*need_keys_out=*/true, /*num_bits=*/0);
+  AddInputFromArray<int32>(TensorShape({8}), {4, 2, 6, 7, 1, 3, 0, 5});  // keys
+  AddInputFromArray<int32>(TensorShape({8}), {7, 6, 5, 4, 3, 2, 1, 0});  // inds
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor expected_keys_out(allocator(), DT_INT32, TensorShape({8}));
+  test::FillValues<int32>(&expected_keys_out, {4, 2, 6, 7, 1, 3, 0, 5});
+  test::ExpectTensorEqual<int32>(expected_keys_out, *GetOutput(0));
+
+  Tensor expected_indices_out(allocator(), DT_INT32, TensorShape({8}));
+  test::FillValues<int32>(&expected_indices_out, {7, 6, 5, 4, 3, 2, 1, 0});
+  test::ExpectTensorEqual<int32>(expected_indices_out, *GetOutput(1));
+}
+
 TEST_F(GpuPrimHelpersTest, GpuInclusivePrefixSum) {
   MakeInclusivePrefixSum(DT_INT32);
   AddInputFromArray<int32>(TensorShape({8}), {4, 2, 6, 7, 1, 3, 0, 5});
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
@@ -790,23 +790,31 @@ struct SparseSegmentGradFunctor<GPUDevice, T, Index, SegmentId> {
                                   segment_offsets_ptr, weights_ptr));
     }
 
-    // Sort indices and permute segments.
-    Tensor sorted_indices;
-    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<Index>::value,
-                                                   TensorShape({nouter}),
-                                                   &sorted_indices));
-    Index* sorted_indices_ptr = sorted_indices.flat<Index>().data();
-    Tensor sorted_segment;
-    OP_REQUIRES_OK(context, context->allocate_temp(
-                                DataTypeToEnum<SegmentId>::value,
-                                TensorShape({nouter}), &sorted_segment));
-    SegmentId* sorted_segment_ptr = sorted_segment.flat<SegmentId>().data();
-    OP_REQUIRES_OK(context, GpuRadixSort(context, nouter,
-                                         /*keys_in=*/indices_vec.data(),
-                                         /*keys_out=*/sorted_indices_ptr,
-                                         /*indices_in=*/segment_vec.data(),
-                                         /*indices_out=*/sorted_segment_ptr,
-                                         /*num_bits=*/Log2Ceiling64(noutput)));
+    const Index* sorted_indices_ptr = indices_vec.data();
+    const SegmentId* sorted_segment_ptr = segment_vec.data();
+    Tensor tmp_sorted_indices;
+    Tensor tmp_sorted_segment;
+    if (noutput > 1) {
+      // Sort indices and permute segments.
+      OP_REQUIRES_OK(context, context->allocate_temp(
+                                  DataTypeToEnum<Index>::value,
+                                  TensorShape({nouter}), &tmp_sorted_indices));
+      Index* tmp_sorted_indices_ptr = tmp_sorted_indices.flat<Index>().data();
+      OP_REQUIRES_OK(context, context->allocate_temp(
+                                  DataTypeToEnum<SegmentId>::value,
+                                  TensorShape({nouter}), &tmp_sorted_segment));
+      SegmentId* tmp_sorted_segment_ptr =
+          tmp_sorted_segment.flat<SegmentId>().data();
+      OP_REQUIRES_OK(context,
+                     GpuRadixSort(context, nouter,
+                                  /*keys_in=*/indices_vec.data(),
+                                  /*keys_out=*/tmp_sorted_indices_ptr,
+                                  /*indices_in=*/segment_vec.data(),
+                                  /*indices_out=*/tmp_sorted_segment_ptr,
+                                  /*num_bits=*/Log2Ceiling64(noutput)));
+      sorted_indices_ptr = tmp_sorted_indices_ptr;
+      sorted_segment_ptr = tmp_sorted_segment_ptr;
+    }
 
     // Compute the gradient using a weighted SegmentReduceGPU with the segment
     // IDs and indices swapped.
diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc
@@ -217,7 +217,6 @@ TF_CALL_FLOAT_TYPES(REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 #undef REGISTER_GPU_SPARSE_KERNELS
 
-#if 0  // TODO(b/192086735): Enable once bug is fixed.
 #define REGISTER_GPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("SparseSegmentMeanGrad")                                     \
@@ -229,7 +228,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
       SparseSegmentMeanGradOp<GPUDevice, type, index_type, segment_ids_type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 #undef REGISTER_GPU_SPARSE_KERNELS
-#endif
 
 #define REGISTER_GPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \
   REGISTER_KERNEL_BUILDER(                                              \
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -902,6 +902,27 @@ def testGradientExplicit(self):
           tf_xgrad = tf_op(tf_ygrad, indices, segment_ids, output_dim0)
           self.assertAllClose(tf_xgrad, np_xgrad)
 
+  def testGradientExplicitSingleOutput(self):
+    # The GPU implementation has a special case (no sort) for a single output.
+    for inner_size in (1, 2, 3, 32):
+      with self.session():
+        tf_ygrad, np_ygrad = self._input([3, inner_size],
+                                         dtype=dtypes_lib.float32)
+        segment_ids = [0, 1, 2, 2, 2]
+        indices = [0, 0, 0, 0, 0]
+        output_dim0 = 1
+        ops_list = [
+            (math_ops.sparse_segment_sum_grad, "sum"),
+            (math_ops.sparse_segment_mean_grad, "mean"),
+            (math_ops.sparse_segment_sqrt_n_grad, "sqrtn"),
+        ]
+        for tf_op, mode in ops_list:
+          np_xgrad = self._sparseSegmentReduceGrad(np_ygrad, indices,
+                                                   segment_ids, output_dim0,
+                                                   mode)
+          tf_xgrad = tf_op(tf_ygrad, indices, segment_ids, output_dim0)
+          self.assertAllClose(tf_xgrad, np_xgrad)
+
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)