# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
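
# FtTrainer extends torchtitan's Trainer with fault tolerance: train() runs the
# usual training loop inside ft.maybe_semi_sync_training(), so that (when a
# torchft ft_manager is configured) replica groups train semi-synchronously and
# exchange weights only every `sync_every` steps rather than on every iteration.
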
import os
import time
from datetime import timedelta
from typing import Optional

import torch

import torchtitan.components.ft as ft

from torch.distributed.elastic.multiprocessing.errors import record
from torchtitan.config_manager import JobConfig
from torchtitan.distributed import utils as dist_utils
from torchtitan.tools.logging import init_logger, logger
from torchtitan.tools.profiling import (
    maybe_enable_memory_snapshot,
    maybe_enable_profiling,
)
from torchtitan.train import Trainer


class FtTrainer(Trainer):
    # Override the train method to add fault tolerance
    @record
    def train(self):
        job_config = self.job_config

        self.checkpointer.load(step=job_config.checkpoint.load_step)
        logger.info(f"Training starts at step {self.step + 1}.")

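        # Each of these context managers is a conditional no-op: profiling and the
        # memory snapshot are gated by job_config, and semi-sync training only
        # kicks in when an ft_manager is available. sync_every=2 is hardcoded here;
        # per torchft's semi-sync (LocalSGD/DiLoCo-style) semantics it controls how
        # many local steps run between weight synchronizations.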
        with maybe_enable_profiling(
            job_config, global_step=self.step
        ) as torch_profiler, maybe_enable_memory_snapshot(
            job_config, global_step=self.step
        ) as memory_profiler, ft.maybe_semi_sync_training(
            job_config,
            ft_manager=self.ft_manager,
            model=self.model_parts[0],
            optimizer=self.optimizers,
            sync_every=2,
        ) as semi_sync_training:
            data_iterator = iter(self.dataloader)

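            # Main training loop (mirrors Trainer.train()): fetch a batch, run one
            # training step via self.train_step, checkpoint when due, and tick the
            # profilers.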
            while self.step < job_config.training.steps:
                self.step += 1
                self.gc_handler.run(self.step)
                inputs, labels = self.next_batch(data_iterator)
                self.train_step(inputs, labels)
                self.checkpointer.save(
                    self.step, force=(self.step == job_config.training.steps)
                )

                # signal the profiler that the next profiling step has started
                if torch_profiler:
                    torch_profiler.step()
                if memory_profiler:
                    memory_profiler.step()

                # reduce timeout after first train step for faster signal
                # (assuming lazy init and compilation are finished)
                if self.step == 1:
                    dist_utils.set_pg_timeouts(
                        timeout=timedelta(
                            seconds=job_config.comm.train_timeout_seconds
                        ),
                        world_mesh=self.world_mesh,
                    )

        if torch.distributed.get_rank() == 0:
            logger.info("Sleeping 2 seconds for other ranks to complete")
            time.sleep(2)

        self.metrics_processor.close()
        logger.info("Training completed")


if __name__ == "__main__":
    init_logger()
    config = JobConfig()
    config.maybe_add_custom_args()
    config.parse_args()
    trainer: Optional[Trainer] = None

    try:
        trainer = FtTrainer(config)

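        # Seed-checkpoint mode: instead of training, write an initial, unsharded
        # checkpoint from a single-process run (WORLD_SIZE=1) that distributed
        # jobs can later load as their starting point.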
        if config.checkpoint.create_seed_checkpoint:
            assert (
                int(os.environ["WORLD_SIZE"]) == 1
            ), "Must create seed checkpoint using a single device, to disable sharding."
            assert (
                config.checkpoint.enable_checkpoint
            ), "Must enable checkpointing when creating a seed checkpoint."
            trainer.checkpointer.save(curr_step=0, force=True)
            logger.info("Created seed checkpoint")
        else:
            trainer.train()
    finally:
        if trainer:
            trainer.close()

        if torch.distributed.is_initialized():
            torch.distributed.destroy_process_group()
            logger.info("Process group destroyed.")
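
# Example launch (illustrative only: the script path, GPU count, and config file
# below are placeholders, and the --job.config_file flag follows torchtitan's
# --<section>.<option> CLI convention; verify against your torchtitan version):
#
#   torchrun --nproc_per_node=8 train.py --job.config_file ./my_job_config.toml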