Re-enable and optimize the SparseSegmentMeanGrad GPU kernel

benbarsdell · benbarsdell · commit d9873fbc4e05 · 2021-08-02T23:37:25.000+10:00
- Re-enables this kernel now that the CUB issue has been worked
  around.
- Optimizes the kernel to skip the sort call when output_dim0 = 1.
- Adds a test case for when output_dim0 = 1.
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
@@ -790,23 +790,31 @@ struct SparseSegmentGradFunctor<GPUDevice, T, Index, SegmentId> {
                                   segment_offsets_ptr, weights_ptr));
     }
 
-    // Sort indices and permute segments.
-    Tensor sorted_indices;
-    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<Index>::value,
-                                                   TensorShape({nouter}),
-                                                   &sorted_indices));
-    Index* sorted_indices_ptr = sorted_indices.flat<Index>().data();
-    Tensor sorted_segment;
-    OP_REQUIRES_OK(context, context->allocate_temp(
-                                DataTypeToEnum<SegmentId>::value,
-                                TensorShape({nouter}), &sorted_segment));
-    SegmentId* sorted_segment_ptr = sorted_segment.flat<SegmentId>().data();
-    OP_REQUIRES_OK(context, GpuRadixSort(context, nouter,
-                                         /*keys_in=*/indices_vec.data(),
-                                         /*keys_out=*/sorted_indices_ptr,
-                                         /*indices_in=*/segment_vec.data(),
-                                         /*indices_out=*/sorted_segment_ptr,
-                                         /*num_bits=*/Log2Ceiling64(noutput)));
+    const Index* sorted_indices_ptr = indices_vec.data();
+    const SegmentId* sorted_segment_ptr = segment_vec.data();
+    Tensor tmp_sorted_indices;
+    Tensor tmp_sorted_segment;
+    if (noutput > 1) {
+      // Sort indices and permute segments.
+      OP_REQUIRES_OK(context, context->allocate_temp(
+                                  DataTypeToEnum<Index>::value,
+                                  TensorShape({nouter}), &tmp_sorted_indices));
+      Index* tmp_sorted_indices_ptr = tmp_sorted_indices.flat<Index>().data();
+      OP_REQUIRES_OK(context, context->allocate_temp(
+                                  DataTypeToEnum<SegmentId>::value,
+                                  TensorShape({nouter}), &tmp_sorted_segment));
+      SegmentId* tmp_sorted_segment_ptr =
+          tmp_sorted_segment.flat<SegmentId>().data();
+      OP_REQUIRES_OK(context,
+                     GpuRadixSort(context, nouter,
+                                  /*keys_in=*/indices_vec.data(),
+                                  /*keys_out=*/tmp_sorted_indices_ptr,
+                                  /*indices_in=*/segment_vec.data(),
+                                  /*indices_out=*/tmp_sorted_segment_ptr,
+                                  /*num_bits=*/Log2Ceiling64(noutput)));
+      sorted_indices_ptr = tmp_sorted_indices_ptr;
+      sorted_segment_ptr = tmp_sorted_segment_ptr;
+    }
 
     // Compute the gradient using a weighted SegmentReduceGPU with the segment
     // IDs and indices swapped.
diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc
@@ -217,7 +217,6 @@ TF_CALL_FLOAT_TYPES(REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 #undef REGISTER_GPU_SPARSE_KERNELS
 
-#if 0  // TODO(b/192086735): Enable once bug is fixed.
 #define REGISTER_GPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \
   REGISTER_KERNEL_BUILDER(                                              \
       Name("SparseSegmentMeanGrad")                                     \
@@ -229,7 +228,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
       SparseSegmentMeanGradOp<GPUDevice, type, index_type, segment_ids_type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE);
 #undef REGISTER_GPU_SPARSE_KERNELS
-#endif
 
 #define REGISTER_GPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \
   REGISTER_KERNEL_BUILDER(                                              \
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -902,6 +902,27 @@ def testGradientExplicit(self):
           tf_xgrad = tf_op(tf_ygrad, indices, segment_ids, output_dim0)
           self.assertAllClose(tf_xgrad, np_xgrad)
 
+  def testGradientExplicitSingleOutput(self):
+    # The GPU implementation skips its sort step when output_dim0 == 1.
+    for inner_size in (1, 2, 3, 32):
+      with self.session():
+        tf_ygrad, np_ygrad = self._input([3, inner_size],
+                                         dtype=dtypes_lib.float32)
+        segment_ids = [0, 1, 2, 2, 2]
+        indices = [0, 0, 0, 0, 0]
+        output_dim0 = 1
+        ops_list = [
+            (math_ops.sparse_segment_sum_grad, "sum"),
+            (math_ops.sparse_segment_mean_grad, "mean"),
+            (math_ops.sparse_segment_sqrt_n_grad, "sqrtn"),
+        ]
+        for tf_op, mode in ops_list:
+          np_xgrad = self._sparseSegmentReduceGrad(np_ygrad, indices,
+                                                   segment_ids, output_dim0,
+                                                   mode)
+          tf_xgrad = tf_op(tf_ygrad, indices, segment_ids, output_dim0)
+          self.assertAllClose(tf_xgrad, np_xgrad)
+
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)