 
 import copy
 import importlib
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional
+from typing import ContextManager, Optional, TYPE_CHECKING, Union
 
 import torch
 import torch.distributed as dist

 if importlib.util.find_spec("torchft") is not None:
     import torchft as ft
 
+    if TYPE_CHECKING:
+        from torchft import local_sgd
+
     has_torchft = True
 else:
     has_torchft = False

@@ -85,13 +89,16 @@ def init_ft_manager(job: JobConfig) -> FTManager:
 
     pg = ft.ProcessGroupNCCL()
 
+    # If a semi-sync training method is specified, the quorum should be synchronous.
+    use_async_quorum = job.fault_tolerance.semi_sync_method is None
+
     return FTManager(
         ft.Manager(
             pg=pg,
             min_replica_size=job.fault_tolerance.min_replica_size,
             load_state_dict=None,
             state_dict=None,
-            use_async_quorum=True,
+            use_async_quorum=use_async_quorum,
             replica_id=f"torchtitan_ft_{job.fault_tolerance.replica_id}",
         ),
         group_size=job.fault_tolerance.group_size,
@@ -158,3 +165,50 @@ def ft_clip_grad_norm_util(total_norm: DTensor) -> torch.Tensor:
             return DTensor.from_local(local_tensor, mesh.mesh, placements)
 
     return total_norm
+
+
+def maybe_semi_sync_training(
+    config: JobConfig,
+    ft_manager: FTManager,
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    sync_every: int,
+) -> ContextManager[Union["local_sgd.DiLoCo", "local_sgd.LocalSGD", None]]:
+    """
+    Return a semi-sync training context (LocalSGD or DiLoCo) when TorchFT and semi_sync_method are enabled; otherwise a null context.
+    """
+    semi_sync_method = config.fault_tolerance.semi_sync_method
+    torchft_enabled = config.fault_tolerance.enable
+    if torchft_enabled and semi_sync_method is not None:
+        from torchft import local_sgd
+
+        assert (
+            ft_manager._manager is not None
+        ), "FTManager must be enabled to use semi-sync training."
+        if semi_sync_method.lower() == "diloco":
+            # Create the outer optimizer based on the inner optimizer parameters.
+            params = [group["params"] for group in optimizer.param_groups]
+            params = [param for sublist in params for param in sublist]
+            outer_optimizer = torch.optim.SGD(
+                params, lr=0.7, momentum=0.9, nesterov=True
+            )
+
+            return local_sgd.DiLoCo(
+                manager=ft_manager._manager,
+                model=model,
+                inner_optimizer=optimizer,
+                outer_optimizer=outer_optimizer,
+                sync_every=sync_every,
+            )
+        elif semi_sync_method.lower() == "local_sgd":
+            return local_sgd.LocalSGD(
+                manager=ft_manager._manager,
+                model=model,
+                optimizer=optimizer,
+                sync_every=sync_every,
+            )
+        else:
+            raise ValueError(
+                f"Unknown training method: {semi_sync_method}, only 'diloco' and 'local_sgd' are supported."
+            )
+    return nullcontext()
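
For context, a minimal usage sketch of how the new `maybe_semi_sync_training` context manager might wrap a training loop. This is an illustration, not part of the diff: `job_config` (a parsed `JobConfig`), `ft_manager` (from `init_ft_manager`), and the `fault_tolerance.sync_steps` field are assumed to exist in the surrounding trainer.

```python
import torch

# Sketch only: job_config and ft_manager are assumed to come from the trainer
# setup (config parsing + init_ft_manager); sync_steps is an assumed field name.
model = torch.nn.Linear(16, 16)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

with maybe_semi_sync_training(
    config=job_config,
    ft_manager=ft_manager,
    model=model,
    optimizer=optimizer,
    sync_every=job_config.fault_tolerance.sync_steps,  # assumed config field
):
    for _ in range(100):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 16)).square().mean()
        loss.backward()
        # Inside the context, each optimizer.step() is a local step; LocalSGD /
        # DiLoCo synchronize replicas every `sync_every` steps, while the
        # nullcontext path leaves ordinary synchronous training unchanged.
        optimizer.step()
```

The training loop itself needs no explicit sync calls: the torchft context managers count optimizer steps internally and trigger the averaging (or outer-optimizer update for DiLoCo) on their own.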