Skip to content

Commit 1ce8e3b

Browse files
[JAX SC] Parallelize device loop for extraction, sorting and grouping.
* `9.28%` geomean reduction in wall time, with a `0.61%` CPU time increase and a `6.05%` reduction in cycles.
* Use a separate thread pool to avoid deadlocks; the fixed cost of scheduling should be less than 0.1%.
* Add default-constructible objects to support parallelization.

PiperOrigin-RevId: 826509091
1 parent 00afdd2 commit 1ce8e3b

File tree

6 files changed

+98
-35
lines changed

6 files changed

+98
-35
lines changed

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.cc

Lines changed: 42 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ struct TableState {
126126
MinibatchingSplit table_minibatching_split = 0;
127127
std::vector<ExtractedCooTensors> extracted_coo_tensors_per_device;
128128
std::vector<PartitionedCooTensors> partitioned_coo_tensors_per_device;
129+
std::vector<int> dropped_id_count_per_device;
129130

130131
TableState(const std::string& name,
131132
absl::Span<const StackedTableMetadata> metadata,
@@ -147,8 +148,9 @@ struct TableState {
147148
stats_per_host(options.local_device_count, options.GetNumScs(),
148149
options.num_sc_per_device),
149150
batch_size_for_device(0) {
150-
extracted_coo_tensors_per_device.reserve(options.local_device_count);
151-
partitioned_coo_tensors_per_device.reserve(options.local_device_count);
151+
extracted_coo_tensors_per_device.resize(options.local_device_count);
152+
partitioned_coo_tensors_per_device.resize(options.local_device_count);
153+
dropped_id_count_per_device.resize(options.local_device_count, 0);
152154
}
153155
};
154156

@@ -165,26 +167,40 @@ void ExtractSortAndGroupCooTensorsForTable(
165167
state.stacked_table_name);
166168
});
167169

170+
absl::BlockingCounter counter(options.local_device_count);
168171
for (int local_device = 0; local_device < options.local_device_count;
169172
++local_device) {
170-
ExtractedCooTensors extracted_coo_tensors =
171-
internal::ExtractCooTensorsForAllFeaturesPerLocalDevice(
172-
state.stacked_table_metadata, input_batches, local_device, options);
173-
state.extracted_coo_tensors_per_device.push_back(extracted_coo_tensors);
174-
if (local_device == 0)
175-
state.batch_size_for_device = extracted_coo_tensors.batch_size_for_device;
176-
else
177-
CHECK_EQ(state.batch_size_for_device,
178-
extracted_coo_tensors.batch_size_for_device);
179-
180-
internal::StatsPerDevice stats_per_device =
181-
state.stats_per_host.GetStatsPerDevice(local_device);
182-
const PartitionedCooTensors grouped_coo_tensors =
183-
SortAndGroupCooTensorsPerLocalDevice(
184-
extracted_coo_tensors, state.stacked_table_metadata[0], options,
185-
stats_per_device, state.table_minibatching_required);
186-
state.partitioned_coo_tensors_per_device.push_back(grouped_coo_tensors);
187-
state.stats_per_host.dropped_id_count += stats_per_device.dropped_id_count;
173+
DeviceProcessingThreadPool()->Schedule([&, local_device] {
174+
state.extracted_coo_tensors_per_device[local_device] =
175+
internal::ExtractCooTensorsForAllFeaturesPerLocalDevice(
176+
state.stacked_table_metadata, input_batches, local_device,
177+
options);
178+
179+
internal::StatsPerDevice stats_per_device =
180+
state.stats_per_host.GetStatsPerDevice(local_device);
181+
state.partitioned_coo_tensors_per_device[local_device] =
182+
SortAndGroupCooTensorsPerLocalDevice(
183+
state.extracted_coo_tensors_per_device[local_device],
184+
state.stacked_table_metadata[0], options, stats_per_device,
185+
state.table_minibatching_required);
186+
state.dropped_id_count_per_device[local_device] =
187+
stats_per_device.dropped_id_count;
188+
counter.DecrementCount();
189+
});
190+
}
191+
counter.Wait();
192+
193+
// Post-process results after all threads are done.
194+
state.batch_size_for_device =
195+
state.extracted_coo_tensors_per_device[0].batch_size_for_device;
196+
state.stats_per_host.dropped_id_count = 0;
197+
for (int local_device = 0; local_device < options.local_device_count;
198+
++local_device) {
199+
DCHECK_EQ(state.batch_size_for_device,
200+
state.extracted_coo_tensors_per_device[local_device]
201+
.batch_size_for_device);
202+
state.stats_per_host.dropped_id_count +=
203+
state.dropped_id_count_per_device[local_device];
188204
}
189205
}
190206

@@ -518,9 +534,9 @@ PreprocessSparseDenseMatmulInput(
518534
// Stage 1: COO Extraction and Initial Sort/Group
519535
{
520536
tsl::profiler::TraceMe traceme("ExtractSortAndGroupCooTensors");
521-
absl::BlockingCounter counter(stacked_tables.size());
537+
absl::BlockingCounter counter(table_states.size());
522538
for (auto& state : table_states) {
523-
PreprocessingThreadPool()->Schedule([&, &state = state] {
539+
TableProcessingThreadPool()->Schedule([&, &state = state] {
524540
ExtractSortAndGroupCooTensorsForTable(state, input_batches, options);
525541
counter.DecrementCount();
526542
});
@@ -536,9 +552,9 @@ PreprocessSparseDenseMatmulInput(
536552
if (options.enable_minibatching && global_minibatching_required) {
537553
{
538554
tsl::profiler::TraceMe traceme("CreateMinibatchingBuckets");
539-
absl::BlockingCounter counter(stacked_tables.size());
555+
absl::BlockingCounter counter(table_states.size());
540556
for (auto& state : table_states) {
541-
PreprocessingThreadPool()->Schedule([&, &state = state] {
557+
TableProcessingThreadPool()->Schedule([&, &state = state] {
542558
CreateMinibatchingBucketsForTable(state, options);
543559
counter.DecrementCount();
544560
});
@@ -553,9 +569,9 @@ PreprocessSparseDenseMatmulInput(
553569
// Stage 3: Fill Device Buffers
554570
{
555571
tsl::profiler::TraceMe traceme("FillDeviceBuffers");
556-
absl::BlockingCounter counter(stacked_tables.size());
572+
absl::BlockingCounter counter(table_states.size());
557573
for (auto& state : table_states) {
558-
PreprocessingThreadPool()->Schedule([&, &state = state,
574+
TableProcessingThreadPool()->Schedule([&, &state = state,
559575
global_minibatching_required,
560576
global_minibatching_split] {
561577
FillDeviceBuffersForTable(state, options, global_minibatching_required,

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_threads.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,14 @@ namespace jax_sc_embedding {
2828
namespace {
2929

3030
constexpr char kScEnv[] = "SPARSECORE_INPUT_PREPROCESSING_THREADS";
31-
constexpr char kScPool[] = "SparseCoreInputPreprocessingThreadPool";
31+
constexpr char kDevicePool[] = "SparseCoreDeviceProcessingThreadPool";
32+
constexpr char kTablePool[] = "SparseCoreTableProcessingThreadPool";
3233

3334
// Returns at least one but the minimum of NumSchedulableCPUs() and the value
3435
// specified by the environment variable
3536
// `SPARSECORE_INPUT_PREPROCESSING_THREADS`.
37+
// NOTE: This size applies to *each* thread pool (Device and Table). If the env
38+
// var is set to N, 2*N threads may be created in total.
3639
int GetThreadPoolSize() {
3740
int num_threads = tsl::port::NumSchedulableCPUs();
3841
if (const char* env = std::getenv(kScEnv); env != nullptr) {
@@ -46,14 +49,30 @@ int GetThreadPoolSize() {
4649

4750
} // namespace
4851

49-
tsl::thread::ThreadPool* PreprocessingThreadPool() {
52+
tsl::thread::ThreadPool* DeviceProcessingThreadPool() {
5053
static tsl::thread::ThreadPool* pool = []() {
5154
const int num_threads = GetThreadPoolSize();
5255
DCHECK_GE(num_threads, 1);
53-
LOG(INFO) << "Creating thread pool for SparseCore input preprocessing: "
56+
LOG(INFO) << "Creating device processing thread pool for SparseCore input "
57+
"preprocessing: "
5458
<< num_threads << " threads";
5559
auto thread_pool = new tsl::thread::ThreadPool(
56-
tsl::Env::Default(), tsl::ThreadOptions(), kScPool, num_threads,
60+
tsl::Env::Default(), tsl::ThreadOptions(), kDevicePool, num_threads,
61+
/*low_latency_hint=*/false);
62+
return thread_pool;
63+
}();
64+
return pool;
65+
}
66+
67+
tsl::thread::ThreadPool* TableProcessingThreadPool() {
68+
static tsl::thread::ThreadPool* pool = []() {
69+
const int num_threads = GetThreadPoolSize();
70+
DCHECK_GE(num_threads, 1);
71+
LOG(INFO) << "Creating table processing thread pool for SparseCore input "
72+
"preprocessing: "
73+
<< num_threads << " threads";
74+
auto thread_pool = new tsl::thread::ThreadPool(
75+
tsl::Env::Default(), tsl::ThreadOptions(), kTablePool, num_threads,
5776
/*low_latency_hint=*/false);
5877
return thread_pool;
5978
}();

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_threads.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,20 @@
1818

1919
namespace jax_sc_embedding {
2020

21-
// Global thread pool for all computations done by input preprocessing.
22-
tsl::thread::ThreadPool* PreprocessingThreadPool();
21+
// We use two separate thread pools to handle nested parallelism in input
22+
// preprocessing. Table-level tasks are scheduled onto TableProcessingThreadPool,
23+
// and each of these tasks may schedule multiple device-level tasks onto
24+
// DeviceProcessingThreadPool.
25+
// If a single pool were used, it could lead to deadlock: if all threads in the
26+
// pool were occupied by table-level tasks blocked waiting for device-level
27+
// tasks to complete, no threads would be available to run the device-level
28+
// tasks, and the system would hang. Using separate pools prevents this issue.
29+
30+
// Thread pool for device-level computations in input preprocessing.
31+
tsl::thread::ThreadPool* DeviceProcessingThreadPool();
32+
33+
// Thread pool for table-level computations in input preprocessing.
34+
tsl::thread::ThreadPool* TableProcessingThreadPool();
2335

2436
} // namespace jax_sc_embedding
2537

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_util.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ struct ExtractedCooTensors {
222222
// grouping them. Might be lower after deduplication.
223223
std::vector<int> coo_tensors_per_sc;
224224

225+
ExtractedCooTensors() : ExtractedCooTensors(0, 0) {}
225226
ExtractedCooTensors(int num_sc_per_device, int batch_size_for_device)
226227
: batch_size_for_device(batch_size_for_device),
227228
coo_tensors_per_sc(num_sc_per_device, 0) {}

jax_tpu_embedding/sparsecore/lib/core/partitioned_coo_tensors.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ namespace jax_sc_embedding {
3232

3333
class PartitionedCooTensors {
3434
public:
35+
PartitionedCooTensors() : PartitionedCooTensors(0, 0, 0, 1) {}
3536
PartitionedCooTensors(int reserve_count, int num_sc_per_device,
3637
uint32_t global_sc_count, int bucket_count_per_sc = 1)
3738
: coo_tensors_(),

jax_tpu_embedding/sparsecore/lib/nn/tests/preprocess_input_benchmarks.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,31 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
"""Simple benchmarks for preprocessing input for sparse-dense matmul.
14+
r"""Simple benchmarks for preprocessing input for sparse-dense matmul.
1515
1616
Example usage:
1717
1818
On perflab comparing against HEAD:
19-
benchy --perflab --runs=10 --reference=srcfs --benchmark_filter=all
19+
benchy --perflab --runs=10 --reference=srcfs --benchmark_filter=all \
2020
:preprocess_input_benchmarks
2121
2222
Or locally:
23-
bazel run -c opt --dynamic_mode=off --copt=-gmlt :preprocess_input_benchmarks --
23+
bazel run -c opt --dynamic_mode=off --copt=-gmlt :preprocess_input_benchmarks -- \
2424
--benchmark_filter=all --cpu_profile=/tmp/preprocess.prof
25+
26+
The --benchmark_filter flag uses a regex to select benchmarks. For parameterized
27+
benchmarks, the name is typically formatted as:
28+
`[benchmark_name]/[param1]:[value1]/[param2]:[value2]`.
29+
Boolean parameters are often represented as 0 for False and 1 for True.
30+
31+
For example, to run only the `sparse_coo` benchmarks:
32+
`--benchmark_filter=preprocess_input_benchmark_sparse_coo`
33+
34+
To run only the `sparse_coo` benchmark where `has_leading_dimension` is `False`:
35+
`--benchmark_filter='preprocess_input_benchmark_sparse_coo/has_leading_dimension:0'`
36+
37+
To run all benchmarks across all suites where `has_leading_dimension` is `False`:
38+
`--benchmark_filter='/has_leading_dimension:0'`
2539
"""
2640

2741
import concurrent

0 commit comments

Comments (0)