
Commit 2df1473

qibaoyuan authored and brian-dellabetta committed
[Performance] Speed up DataLoader with Multiple DataLoader Workers (#1821)
When the dataset is large, DataLoader throughput can be improved by using min(8, cpu_count // 2) worker processes.

SUMMARY: When the dataset is large, the default single-process DataLoader is too slow. In my experiment, preparing the cache took 1.5 hours; after setting num_workers to min(8, cpu_count // 2), the time dropped to 7 minutes (roughly a 13x speedup).

TEST PLAN: Tested with a 5K-sample dataset, comparing the performance of this patch against the original implementation.

Signed-off-by: Baoyuan Qi <[email protected]>
Co-authored-by: Brian Dellabetta <[email protected]>
1 parent 54dba55 · commit 2df1473

File tree

1 file changed: +10 −0 lines changed


src/llmcompressor/datasets/utils.py

Lines changed: 10 additions & 0 deletions
@@ -1,3 +1,4 @@
+import multiprocessing
 import re
 from typing import Any, Callable, Dict, List, Optional

@@ -138,13 +139,22 @@ def format_calibration_data(
     tokenized_dataset = tokenized_dataset.shuffle()
     tokenized_calibration = tokenized_dataset.select(range(safe_calibration_samples))

+    MAX_DATALOADER_WORKERS = 8
+    try:
+        num_workers = min(MAX_DATALOADER_WORKERS, multiprocessing.cpu_count() // 2)
+    except NotImplementedError:
+        logger.warning(
+            "Could not determine number of CPUs, defaulting to 0 dataloader workers."
+        )
+        num_workers = 0
     dataloader_params = {
         "batch_size": 1,
         "sampler": RandomSampler(tokenized_calibration)
         if do_shuffle
         else SequentialSampler(tokenized_calibration),
         "collate_fn": collate_fn,
         "pin_memory": True,
+        "num_workers": num_workers,
     }

     calibration_dataloader = DataLoader(tokenized_calibration, **dataloader_params)
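For context, below is a minimal standalone sketch of the same worker-count heuristic applied to a generic PyTorch DataLoader. The TensorDataset and the consuming loop are hypothetical stand-ins for llmcompressor's tokenized calibration set, not code from this commit.

import multiprocessing

import torch
from torch.utils.data import DataLoader, TensorDataset

MAX_DATALOADER_WORKERS = 8


def pick_num_workers() -> int:
    # Use at most half the available CPUs, capped at 8, to avoid
    # oversubscribing the machine with prefetch subprocesses.
    try:
        return min(MAX_DATALOADER_WORKERS, multiprocessing.cpu_count() // 2)
    except NotImplementedError:
        # cpu_count() may be unavailable on some platforms; fall back to
        # loading in the main process.
        return 0


if __name__ == "__main__":
    # Hypothetical 5K-sample dataset, mirroring the commit's test plan.
    dataset = TensorDataset(torch.arange(5000))
    loader = DataLoader(
        dataset,
        batch_size=1,
        num_workers=pick_num_workers(),  # batches prefetched in subprocesses
        pin_memory=True,
    )
    for (sample,) in loader:
        pass  # consume batches; workers keep the pipeline fed

With num_workers > 0, PyTorch spawns that many subprocesses to tokenize and collate batches in parallel, which is what turns the 1.5-hour cache preparation into minutes when the dataset is large.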
