@@ -242,6 +242,28 @@ def check_flash_attn_enabled(disable_flash_attn: bool, use_dolomite: bool) -> bo
 
 @numba.njit
 def simple_bucket(length):
245+ """
246+ This bucket algorithm merely relies on the given number instead of based on
247+ slicing the known (min, max) range for several reasons:
248+ 1) Due to the use of the first-fit-decreasing (FFD) algorithm, the
249+ (min, max) sequence length of each rank will be much smaller than the
250+ (min, max) sequence length of the dataset. Bucketing on the
251+ (min, max) sequence length of the dataset is not practical
252+ 2) The (min, max) sequence length of a given rank is unknown until
253+ finishing 1 epoch since the packing is done on the fly
254+ 3) Due to the shuffling, the (min, max) sequence length of a given rank
255+ may vary between ranks. Once the (min, max) sequence length of a
256+ given rank changes, the bucketing also needs adjustment
257+
258+ This bucket algorithm is based on the most significant set bit of the input number.
259+ It first check what’s the most significant set bit, assuming it's bit "S",
260+ and then slice the range [2 ** S, 2 ** (S+1)] into buckets with the same size.
261+ By default the range is divided into 16 buckets, so the bucket size will be
262+ 2 ** (S - 4)
263+ For example, 0b10001 will be padded to 0b10010.
264+ This approach can limit the overhead of bucketing (at most 1/16 of the input
265+ number) and also prevent recompilation due to a too small bucket size.
266+ """
     l = length
     msb = 0
     while l > 0:
@@ -439,7 +461,7 @@ def reduce_sum_forward(
         output_attentions=output_attentions,
         output_hidden_states=output_hidden_states,
         return_dict=return_dict,
-        **deprecated_arguments if is_torch_hpu_available() else None,
+        **_deprecated_arguments if is_torch_hpu_available() else None,
     )
 
     return_dict = isinstance(output, dict)
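As a rough illustration of the bucketing described in the docstring above, here is a minimal, self-contained sketch. The full body of simple_bucket is not visible in this diff, so the bit counting and round-up details below are assumptions based only on the docstring and its 0b10001 -> 0b10010 example; the name simple_bucket_sketch is hypothetical, and only the first three statements (l = length, msb = 0, while l > 0:) appear in the actual code.

# Minimal sketch of MSB-based bucketing (an assumption guided by the
# docstring and its example, not the actual simple_bucket implementation).
def simple_bucket_sketch(length: int) -> int:
    # Count how many bits are needed to represent `length`, i.e. the
    # position of the most significant set bit counted from 1.
    l = length
    msb = 0
    while l > 0:
        msb += 1
        l >>= 1
    # Split the power-of-two range into 16 buckets; for small numbers fall
    # back to a bucket size of 1 so the result never shrinks below `length`.
    bucket_size = 1 << max(msb - 4, 0)
    # Pad up to the next bucket boundary.
    return ((length + bucket_size - 1) // bucket_size) * bucket_size

print(bin(simple_bucket_sketch(0b10001)))  # 0b10010, matching the docstring example

The property this sketch preserves is the one the docstring relies on: padding overhead is bounded by one bucket (roughly 1/16 of the input), while the set of distinct padded lengths stays small enough to avoid frequent recompilation.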