[Pipeline RL] Add support for PipelineRL #428
Draft
jlamypoirier wants to merge 109 commits into jlp_entropy_loss_tweaks from jlp_pipeline_rl
Changes from 96 commits
Commits
1a18929
Dataset interface
jlamypoirier fd63846
misc
jlamypoirier 2486caf
fix
jlamypoirier 92e93e8
Language model sample
jlamypoirier d6f6944
fix
jlamypoirier 5c802fa
fixes
jlamypoirier 95d1840
test
jlamypoirier eafd9cb
fixes
jlamypoirier c56df69
cleanup
jlamypoirier 7f437e1
misc
jlamypoirier dfd27f5
misc
jlamypoirier 90cd009
Memmap dataset
jlamypoirier acfd30e
fixes
jlamypoirier 34939e9
fixes
jlamypoirier c5fa072
int64
jlamypoirier cd28676
Test and fix preparator
jlamypoirier 435d214
fix
jlamypoirier f6bef55
fix
jlamypoirier e05d9a1
fix
jlamypoirier 9ba8d1b
fix
jlamypoirier b35b297
fixes
jlamypoirier abe2357
misc
jlamypoirier 1801d87
fix
jlamypoirier 2223b85
fix right stage mode
bigximik a9a4ace
newer transformers fixes
bigximik 97f2b60
fix distributed tests skip on single gpu
bigximik 0fdc978
set mamba 2 style model conversions to broke
bigximik 665deb5
Merge branch 'jlp/dataset_interface' of github.com:ServiceNow/Fast-LL…
bigximik 4d03889
Merge branch 'jlp/lm_sample' of github.com:ServiceNow/Fast-LLM into d…
bigximik 224c2ec
mmaba2 enable conversion tests
bigximik f1afbf2
Merge branch 'jlp/memmap_dataset' of github.com:ServiceNow/Fast-LLM i…
bigximik 00bba27
added model_and_sequence_data_group
bigximik 5b20276
added Iterable dataset base classes
bigximik 978a68f
added naive sampled iterable dataset
bigximik 066a0bf
added iterable dataset configs, streaming dataset and PipelineRL samp…
bigximik 68b3d65
added distributed data loader wrapper
bigximik 2fbfe99
added iterable dataset to gpt data
bigximik 0892523
appended comment
bigximik 54fadb4
changed base classes for iterable dataset configs
bigximik 4e11bf3
fix batch type
bigximik 8428df8
fix added name property to the class
bigximik 04ee4d7
add eof for tests
bigximik 1217998
change base class to torch iterable
bigximik c542dac
added straming dataset, sampling and base data tests
bigximik 3999a8e
merge from main
bigximik c6ef780
merge from main
bigximik a1556f8
change import
bigximik 63737b1
fix iterable sampler for spawn, add fake redis server to multi proces…
bigximik e843c8e
preparation for multi gpu tests
bigximik d5ce3f2
added multi gpu gptdata streaming test
bigximik c13c6df
added streming dataset requirements
bigximik e6d8f49
added streaming dataset installation to tests
bigximik 1e92dd4
removed cheking for max samples
bigximik 3ac4882
remved test eof, reduces timeout
bigximik 46db991
changed tests to work without eof or max_samplmes_count
bigximik 187055b
fix quen2 converter to accept qkv biases properly
bigximik 21833a0
fix import errors
rafapi 2f5f848
changes to config
bigximik 1e07bad
Merge branch 'denis/new_datasets' of github.com:ServiceNow/Fast-LLM i…
bigximik c8cb9fd
added tensor iterator
bigximik e367998
added trainer events
bigximik 5230b74
update test for changed config
bigximik 1a94de5
added 2 gpus trainer events test
bigximik 6cfd445
fix for multiple gpus
bigximik 333665d
updated test to multiple gpus
bigximik 5d1f474
added not implemented for pp streaming
bigximik 5f7cb29
removed PipelineRL sample and batch
bigximik d07a900
base radis and streaming dataset config class refactoring
bigximik 3a7ba92
refactoring of redis config, trainer event config, corresponding tests
bigximik 59f6f7d
removed eof message which is not supported
bigximik 2c20ebd
added implementation for initial_weights_step_message_type event
bigximik f4107c3
removed explicit msg ack
bigximik c32ef89
fix of training finished event
bigximik f637649
alternative streaming immplementaions: one stream and n streams witho…
bigximik e43ce95
Merge remote-tracking branch 'origin/main' into denis/new_datasets
jlamypoirier 5545598
merge from main
bigximik 0d198ff
fix after merge added preprocessing empty configs
bigximik 70ef5c4
fix for tests with no import
bigximik 058c93c
fixes
jlamypoirier d34d39a
Merge remote-tracking branch 'origin/denis/new_datasets' into denis/n…
jlamypoirier ffb0a5f
Merge remote-tracking branch 'origin/main' into denis/new_datasets
jlamypoirier 359231f
removed cloudpickle
bigximik ca9e94e
Simplify distributed
jlamypoirier 9f0704c
Simplified pipeline RL
jlamypoirier 992f447
stuff
jlamypoirier 4144317
misc
jlamypoirier 6cf1e70
Merge remote-tracking branch 'origin/main' into jlp_pipeline_rl
jlamypoirier 4d07494
stuff
jlamypoirier c9d66dd
fixes
jlamypoirier b8b3d68
Merge remote-tracking branch 'origin/main' into jlp_pipeline_rl
jlamypoirier a4c1aa5
cleanup
jlamypoirier b2c99d3
stuff
jlamypoirier 4ddacde
Merge branch 'jlp_subtest' into jlp_pipeline_rl
jlamypoirier ea3fa20
stuff
jlamypoirier e8f3873
stuff
jlamypoirier 07dce8d
Merge branch 'jlp_subtest' into jlp_pipeline_rl
jlamypoirier 4b3b8d6
Merge remote-tracking branch 'origin/main' into jlp_subtest
jlamypoirier b62813a
Merge branch 'jlp_subtest' into jlp_pipeline_rl
jlamypoirier f1ca739
Merge remote-tracking branch 'origin/main' into jlp_pipeline_rl
jlamypoirier 961cdb9
comments
jlamypoirier 518cd66
Merge branch 'main' into jlp_pipeline_rl
tscholak 99bdad4
Merge remote-tracking branch 'origin/main' into jlp_pipeline_rl
jlamypoirier 39356fb
GRPO loss (#454)
jlamypoirier c044964
[Pipeline RL] GRPO sample, streaming dataset improvements (#458)
jlamypoirier caaa9f8
[Pipeline RL] Tensor-parallel GRPO loss (#464)
jlamypoirier 8f87302
Merge remote-tracking branch 'origin/main' into jlp_pipeline_rl
jlamypoirier 2e6c8a2
Merge branch 'jlp_entropy_loss_tweaks' into jlp_pipeline_rl
jlamypoirier afe8070
Fix merge
jlamypoirier 902d1df
Merge branch 'jlp_entropy_loss_tweaks' into jlp_pipeline_rl
jlamypoirier
New file (+72 lines):

```python
import itertools
import typing

import torch.utils.data

from fast_llm.core.distributed import broadcast_object


class SampledDatasetIterator(torch.utils.data.Sampler):
    """
    A distributed sampler generating indices for a `SampledDataset` (i.e., the natural numbers).
    To be used as the `batch_sampler` of a `torch.utils.data.DataLoader`.
    """

    def __init__(self, total_samples, begin_index, micro_batch_size, data_rank, data_parallel):
        super().__init__()
        self._total_samples = total_samples
        self._begin_index = begin_index
        self._batch_size = micro_batch_size * data_parallel
        self._start_idx = data_rank * micro_batch_size
        self._end_idx = (data_rank + 1) * micro_batch_size

    def __len__(self) -> int:
        return self._total_samples

    def __iter__(self) -> typing.Iterator[list[int]]:
        for idx in range(self._begin_index, self._total_samples - self._batch_size + 1, self._batch_size):
            yield list(range(idx + self._start_idx, idx + self._end_idx))


class DistributedDataLoaderWrapper:
    """
    Wraps a regular dataloader so that only the process group leader
    loads data, and then broadcasts the batch to other ranks in the group.
    """

    def __init__(
        self,
        data_loader: torch.utils.data.dataloader.DataLoader,
        process_group: torch.distributed.ProcessGroup | None,
    ):
        self._data_loader = data_loader
        self._rank = 0 if process_group is None else process_group.rank()
        self._process_group = process_group

    def __iter__(self):
        if self._rank == 0:
            self._iterator = iter(self._data_loader)
        else:
            self._iterator = itertools.repeat(None)
        if self._process_group is None:
            return self._iterator
        return self

    def __next__(self):
        # TODO:
        # Instead of broadcasting a general object, make this iterator yield an actual Batch class.
        # Implement `get_state_dict` and `from_state_dict` in the Batch class so that we can
        # efficiently broadcast tensors directly. This avoids using `broadcast_object` on the
        # entire Batch object, which is inefficient for tensors because it serializes
        # (pickles) them before sending.

        try:
            data = next(self._iterator)  # may raise StopIteration
        except Exception as e:
            data = e
        data = broadcast_object(data, self._process_group, 0)

        if isinstance(data, Exception):
            raise data

        return data
```
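As a quick orientation, here is a minimal, hypothetical sketch of how these two classes might be wired together. The `MyMapDataset` stand-in and the single-process `process_group=None` path are illustrative only and not part of the diff.

```python
# Assumes SampledDatasetIterator and DistributedDataLoaderWrapper (defined above) are in scope.
import torch
import torch.utils.data


class MyMapDataset(torch.utils.data.Dataset):
    """Stand-in for any map-style dataset indexed by the sampler."""

    def __len__(self) -> int:
        return 1024

    def __getitem__(self, index: int) -> torch.Tensor:
        return torch.tensor([index])


# One list of `micro_batch_size` consecutive indices is yielded per step for this rank.
sampler = SampledDatasetIterator(
    total_samples=1024,
    begin_index=0,
    micro_batch_size=4,
    data_rank=0,
    data_parallel=1,
)
data_loader = torch.utils.data.DataLoader(MyMapDataset(), batch_sampler=sampler)

# With `process_group=None` (single process) the wrapper simply returns the underlying
# iterator; with a real group, only group rank 0 loads and the batch is broadcast.
wrapped = DistributedDataLoaderWrapper(data_loader, process_group=None)
for batch in wrapped:
    pass  # each `batch` is the collated micro-batch for this rank
```

The TODO in `__next__` points at a leaner alternative: broadcast the batch's tensors directly instead of pickling the whole object. A rough sketch of what that could look like, assuming CPU tensors and a gloo-style group (the `broadcast_tensor_state` helper is hypothetical, not part of the PR):

```python
import torch
import torch.distributed


def broadcast_tensor_state(
    state_dict: dict[str, torch.Tensor] | None,
    process_group: torch.distributed.ProcessGroup,
    src_rank: int,
) -> dict[str, torch.Tensor]:
    """Broadcast a dict of tensors from `src_rank` (a global rank) to the whole group."""
    is_src = torch.distributed.get_rank() == src_rank
    # Small metadata (names, shapes, dtypes) still travels via object broadcast...
    meta = [{name: (tuple(t.shape), t.dtype) for name, t in state_dict.items()}] if is_src else [None]
    torch.distributed.broadcast_object_list(meta, src=src_rank, group=process_group)
    # ...but the tensor payloads are broadcast directly, with no pickling step.
    out: dict[str, torch.Tensor] = {}
    for name, (shape, dtype) in meta[0].items():
        tensor = state_dict[name] if is_src else torch.empty(shape, dtype=dtype)
        torch.distributed.broadcast(tensor, src=src_rank, group=process_group)
        out[name] = tensor
    return out
```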
This is also not needed if we only use consumer groups; see the comment above for the implementation.
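For context: the consumer groups referenced here are presumably Redis Streams consumer groups, which the streaming dataset in this PR already builds on. With a consumer group, each data-parallel rank claims its own messages directly from the stream, so no single leader needs to load batches and re-broadcast them. A rough sketch with `redis-py`; the stream name, group name, and `read_next` helper are hypothetical:

```python
import redis

r = redis.Redis(host="localhost", port=6379)

STREAM = "rollouts"  # hypothetical stream name
GROUP = "trainer"    # hypothetical consumer group name

# Create the consumer group once; ignore the error if it already exists.
try:
    r.xgroup_create(STREAM, GROUP, id="0", mkstream=True)
except redis.ResponseError:
    pass


def read_next(consumer_name: str) -> dict | None:
    # Each rank uses a unique consumer name; Redis delivers every new message
    # to exactly one consumer in the group, so ranks never see duplicates.
    messages = r.xreadgroup(GROUP, consumer_name, {STREAM: ">"}, count=1, block=1000)
    if not messages:
        return None
    _, entries = messages[0]
    entry_id, fields = entries[0]
    r.xack(STREAM, GROUP, entry_id)  # acknowledge so the message is not redelivered
    return fields
```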