[Feature] Hide 75% of the communication in tensor parallelism using DoMiNo #292

Open · wants to merge 38 commits into base: main

Commits (38)
d7bf8be  first draft of domino forward pass (xrsrke, Jan 29, 2025)
3803b19  support the backward pass (xrsrke, Jan 30, 2025)
d765fd5  the first draft for bwd overlapping (xrsrke, Jan 31, 2025)
9924608  add backward pass overlapping (xrsrke, Feb 3, 2025)
d6bc8da  fix some ops dont execute in the bwd pass (xrsrke, Feb 4, 2025)
93b2f10  fix can't find an ops in fwd (xrsrke, Feb 5, 2025)
31db05d  partially overlapping bwd pass (xrsrke, Feb 5, 2025)
23f2108  fix stream not sync (xrsrke, Feb 10, 2025)
3e3ae8c  exp2a1c7c2_like_exp2a1c1_domini_llama3_3b_with_tp8_and_seqlen4096_and… (xrsrke, Feb 21, 2025)
c261488  refactor (xrsrke, Feb 21, 2025)
841c7d6  add tests and more refactoring (xrsrke, Feb 21, 2025)
8a0f993  add domino config, fix breaks in _RowLinearAsyncCommunication (xrsrke, Feb 24, 2025)
a61d2df  add bwd.layer_mlp_x_batch_1 as async op (xrsrke, Feb 25, 2025)
06e17bc  - add cuda stream sync after attn_output0[work] (xrsrke, Feb 25, 2025)
8d44942  wait default_stream instead of current_stream (xrsrke, Feb 25, 2025)
aa77e6c  put torch.cuda.synchronize() everywhere (xrsrke, Feb 25, 2025)
76b5f9a  only bwd.layer_attn_{}_batch_0 as non async (xrsrke, Feb 25, 2025)
fe7ee7e  exp7a7_like_exp7a6_but_remove_fwd_pass_cuda_syncronization (xrsrke, Feb 25, 2025)
e0a9bd0  remove torch.cuda.synchronize in WaitComm.backward (xrsrke, Feb 26, 2025)
a772ff0  add back torch.cuda.synchronize in WaitComm.backward and small refactors (xrsrke, Feb 26, 2025)
543ef56  add ctx.comm_stream.wait_stream(torch.cuda.default_stream()) to WaitC… (xrsrke, Feb 27, 2025)
36c9980  exp7a10_like_exp7a6_but_remove_fwd_pass_cuda_syncronization_and_remov… (xrsrke, Feb 27, 2025)
613eb16  remove comments and add typing (xrsrke, Feb 28, 2025)
600f01a  remove explicite async_op arg (xrsrke, Feb 28, 2025)
320e55d  Merge remote-tracking branch 'origin/main' into domino_revert_from_fi… (xrsrke, Mar 5, 2025)
29a8914  pass stream amanger to llama's modules (xrsrke, Mar 7, 2025)
75abb32  move domino's assert args to config (xrsrke, Mar 7, 2025)
da4220c  add retrieving async distributed handle from comm bucket instead of r… (xrsrke, Mar 7, 2025)
d7a636f  small refactor (xrsrke, Mar 7, 2025)
d3d8c10  add CudaStreamManager.init_default_comm_stream and fix domino test (xrsrke, Mar 7, 2025)
74d415c  removing op_name in the forward pass by adding OpNameContext (xrsrke, Mar 7, 2025)
08a4472  add CudaStreamManager as context (xrsrke, Mar 8, 2025)
684b1b9  small refactor (xrsrke, Mar 8, 2025)
f8e8b1f  Reverting repository to commit 74d415c1c02b9463214fb46db060c0efbfa5a0e4 (xrsrke, Mar 10, 2025)
61ff007  add todos (xrsrke, Mar 10, 2025)
9039ce2  add todo (xrsrke, Mar 10, 2025)
62fb3b2  add todos (xrsrke, Mar 10, 2025)
7c7b6f7  add todo and undo torch_nn (xrsrke, Mar 13, 2025)
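For orientation before the file diffs: DoMiNo hides tensor-parallel all-reduce latency by splitting each layer's input into two micro-batches and overlapping one micro-batch's communication with the other micro-batch's compute, which is what the llama.py changes below implement with per-device CUDA streams and tagged async handles. The following is a minimal sketch of that overlap pattern, not the PR's actual code; attn, mlp, and tp_group are stand-in names, and it assumes torch.distributed is already initialized with the NCCL backend and that the tensors live on the GPU.

import torch
import torch.distributed as dist

def domino_layer_forward(attn, mlp, hidden_states, tp_group):
    # Split the batch dimension into two halves so one half's all-reduce can
    # be overlapped with the other half's compute.
    half0, half1 = hidden_states.chunk(2, dim=1)

    out0 = attn(half0)
    work0 = dist.all_reduce(out0, group=tp_group, async_op=True)  # comm for half 0 ...

    out1 = attn(half1)                                            # ... overlaps with compute for half 1
    work1 = dist.all_reduce(out1, group=tp_group, async_op=True)

    work0.wait()        # blocks the current CUDA stream until half 0's reduce is done
    out0 = mlp(out0)    # half 0's MLP overlaps with half 1's in-flight all-reduce

    work1.wait()
    out1 = mlp(out1)
    return torch.cat([out0, out1], dim=1)

The PR goes further than this sketch: it routes the waits through a dedicated communication stream (constants.CUDA_STREAMS) and adds a WaitComm autograd function plus an AsyncCommBucket so the same overlap also happens for the gradient all-reduces in the backward pass.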
3 changes: 3 additions & 0 deletions src/nanotron/constants.py
@@ -10,3 +10,6 @@

CHECKPOINT_FILE_NAME = "checkpoint_metadata.json"
MODEL_CONFIG_FILE_NAME = "model_config.json"


CUDA_STREAMS = {}
2 changes: 2 additions & 0 deletions src/nanotron/helpers.py
@@ -482,7 +482,9 @@ def get_profiler(config: Config):
on_trace_ready=on_trace_ready,
# record_shapes=True,
# profile_memory=True,
with_flops=True,
with_stack=True,
with_modules=True,
)
else:
prof = contextlib.nullcontext()
57 changes: 48 additions & 9 deletions src/nanotron/models/llama.py
@@ -30,6 +30,7 @@
from nanotron.nn.activations import ACT2FN
from nanotron.nn.layer_norm import TritonRMSNorm
from nanotron.parallel import ParallelContext
from nanotron.parallel.comm import WaitComm
from nanotron.parallel.parameters import NanotronParameter
from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer
from nanotron.parallel.pipeline_parallel.p2p import P2P
@@ -46,6 +47,8 @@

logger = logging.get_logger(__name__)

DOMINO_COMM_STREAM = "domino_comm_stream_{}"


class RotaryEmbedding(nn.Module):
def __init__(self, dim: int, end: int, theta: float = 10000.0):
@@ -241,8 +244,8 @@ def __init__(
)
self.split_silu_mul = GLUActivation(config.hidden_act)

def forward(self, hidden_states): # [seq_length, batch_size, hidden_dim]
merged_states = self.gate_up_proj(hidden_states)
def forward(self, hidden_states, handle_idx=None): # [seq_length, batch_size, hidden_dim]
merged_states = self.gate_up_proj(hidden_states, handle_idx)
hidden_states, work = self.down_proj(self.split_silu_mul(merged_states))
return {"hidden_states": hidden_states, "work": work}

@@ -437,6 +440,7 @@ def forward(
self,
hidden_states, # [seq_length, batch_size, hidden_size]
sequence_mask, # [batch_size, seq_length]
handle_idx=None,
):
from flash_attn import bert_padding
from flash_attn.flash_attn_interface import (
@@ -445,7 +449,7 @@ def forward(
)

qkv_states = self.qkv_proj(
hidden_states
hidden_states, handle_idx=handle_idx
) # [seq_length, batch_size, n_local_q_heads * d_qk + 2 * n_local_kv_heads * d_qk]
q_length, batch_size, _ = qkv_states.shape

@@ -720,6 +724,18 @@ def __init__(
self.recompute_layer = parallel_config.recompute_layer
self.parallel_config = parallel_config

# if parallel_config.domino is not None and parallel_config.domino.num_input_batches > 1:
# from nanotron.parallel.comm import CudaStreamManager
# # NOTE: we use different cuda streams for different gpus, so it can overlaps the communication
# CudaStreamManager.create(DOMINO_COMM_STREAM.format(torch.cuda.current_device()))
num_gpus = torch.cuda.device_count()
for i in range(num_gpus):
from nanotron import constants

constants.CUDA_STREAMS[i] = torch.cuda.Stream(device=torch.device(f"cuda:{i}"))

self.layer_idx = layer_idx

def _core_forward(
self,
hidden_states: Union[torch.Tensor, TensorPointer],
@@ -747,29 +763,52 @@ def _core_forward(
hidden_states0 = self.input_layernorm(hidden_states0)
hidden_states1 = self.input_layernorm(hidden_states1)

attn_output0 = self.attn(hidden_states=hidden_states0, sequence_mask=sequence_mask0)
attn_output0 = self.attn(
hidden_states=hidden_states0, sequence_mask=sequence_mask0, handle_idx=f"layer_{self.layer_idx}_batch_0"
)
attn_output0_work = attn_output0["work"]

attn_output1 = self.attn(hidden_states=hidden_states1, sequence_mask=sequence_mask1)
attn_output1 = self.attn(
hidden_states=hidden_states1, sequence_mask=sequence_mask1, handle_idx=f"layer_{self.layer_idx}_batch_1"
)
attn_output1_work = attn_output1["work"]

attn_output0_work.wait()
from nanotron import constants

comm_stream = constants.CUDA_STREAMS[torch.cuda.current_device()]
# comm_stream = CudaStreamManager.get(DOMINO_COMM_STREAM.format(torch.cuda.current_device()))
with torch.cuda.stream(comm_stream):
attn_output0_work.wait()
# attn_output0_work.wait()

hidden_states0 = attn_output0["hidden_states"]
hidden_states0 = hidden_states0 + residual0
residual0 = hidden_states0
hidden_states0 = self.post_attention_layernorm(hidden_states0)
hidden_states0 = WaitComm.apply(hidden_states0, f"layer_{self.layer_idx}_batch_0")

# mlp_output0 = self.mlp(hidden_states=hidden_states0, handle_idx=f"layer_{self.layer_idx}_batch_0")
mlp_output0 = self.mlp(hidden_states=hidden_states0)

attn_output1_work.wait()
with torch.cuda.stream(comm_stream):
attn_output1_work.wait()
# attn_output1_work.wait()

hidden_states1 = attn_output1["hidden_states"]
hidden_states1 = hidden_states1 + residual1
residual1 = hidden_states1
hidden_states1 = self.post_attention_layernorm(hidden_states1)
hidden_states1 = WaitComm.apply(hidden_states1, f"layer_{self.layer_idx}_batch_1")

# mlp_output1 = self.mlp(hidden_states=hidden_states1, handle_idx=f"layer_{self.layer_idx}_batch_1")
mlp_output1 = self.mlp(hidden_states=hidden_states1)
mlp_output0["work"].wait()
mlp_output1["work"].wait()

with torch.cuda.stream(comm_stream):
mlp_output0["work"].wait()
mlp_output1["work"].wait()

# mlp_output0["work"].wait()
# mlp_output1["work"].wait()

hidden_states0 = mlp_output0["hidden_states"]
hidden_states1 = mlp_output1["hidden_states"]
46 changes: 46 additions & 0 deletions src/nanotron/parallel/comm.py
@@ -1,5 +1,27 @@
from contextlib import contextmanager
from typing import Dict

import torch


class CudaStreamManager:
_streams: Dict[str, "torch.cuda.Stream"] = {}

@staticmethod
def create(name: str):
assert name not in CudaStreamManager._streams
CudaStreamManager._streams[name] = torch.cuda.Stream()

@staticmethod
def get(name: str):
return CudaStreamManager._streams.get(name)

@contextmanager
def run_on_stream(name: str):
stream = CudaStreamManager.get(name)
with torch.cuda.stream(stream):
yield stream


class AsyncCommBucket:
"""
@@ -14,13 +36,37 @@ class AsyncCommBucket:

@staticmethod
def add(tensor_id: int, work: "dist.Work"):
assert (
tensor_id not in AsyncCommBucket._async_op
), f"tensor_id: {tensor_id}, keys: {AsyncCommBucket._async_op.keys()}"
AsyncCommBucket._async_op[tensor_id] = work

@staticmethod
def get(tensor_id: int):
return AsyncCommBucket._async_op.get(tensor_id)

@staticmethod
def pop(tensor_id: int):
return AsyncCommBucket._async_op.pop(tensor_id)

@staticmethod
def wait(tensor_id: int):
work = AsyncCommBucket._async_op.pop(tensor_id)
work.wait()


class WaitComm(torch.autograd.Function):
@staticmethod
def forward(ctx, input, wait_handle_idx):
ctx.wait_handle_idx = wait_handle_idx
return input

@staticmethod
def backward(ctx, grad_output):
import pydevd

pydevd.settrace(suspend=False, trace_only_current_thread=True)
if ctx.wait_handle_idx != "layer_1_batch_1":
handle = AsyncCommBucket.pop(ctx.wait_handle_idx)
handle.wait()
return grad_output, None
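Reading the two pieces above together: an asynchronous all-reduce is launched with async_op=True and its dist.Work handle is parked in AsyncCommBucket under a string tag, while WaitComm, an identity in the forward pass, pops that tag in its backward and waits before letting the gradient flow on. The following is a condensed, self-contained sketch of the same deferred-wait pattern with simplified names; it is not the PR's exact classes and omits the PR's per-layer bookkeeping.

from typing import Dict
import torch
import torch.distributed as dist

_pending: Dict[str, "dist.Work"] = {}   # tag -> async Work, analogous to AsyncCommBucket._async_op

class DeferredWait(torch.autograd.Function):
    """Identity in forward; in backward, waits for whatever async collective
    was registered under `tag` before letting the gradient continue."""

    @staticmethod
    def forward(ctx, x: torch.Tensor, tag: str) -> torch.Tensor:
        ctx.tag = tag
        return x

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        work = _pending.pop(ctx.tag, None)
        if work is not None:
            work.wait()
        return grad_output, None

In the diff the tag is the handle_idx string such as layer_{i}_batch_{j}, so each micro-batch of each layer waits only for its own collective.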
src/nanotron/parallel/tensor_parallel/distributed_differentiable_primitives.py
@@ -26,22 +26,31 @@ class DifferentiableIdentity(torch.autograd.Function):
"""All-reduce gradients in a differentiable fashion"""

@staticmethod
def forward(ctx, tensor, group: Optional[ProcessGroup]):
def forward(ctx, tensor, group: Optional[ProcessGroup], handle_idx=None):
# assert handle_idx is not None
ctx.handle_idx = handle_idx
ctx.group = group
return tensor

@staticmethod
def backward(ctx, grad_output):
# import pydevd
# pydevd.settrace(suspend=False, trace_only_current_thread=True)
# NOTE: lm_head is TensorParallelColumnLinear, and it doesn't do async
# assert ctx.handle_idx is not None
group = ctx.group
return DifferentiableAllReduceSum.apply(grad_output, group, False), None
if ctx.handle_idx is not None:
assert 1 == 1

return DifferentiableAllReduceSum.apply(grad_output, group, True, ctx.handle_idx), None, None


class DifferentiableAllReduceSum(torch.autograd.Function):
"""All-reduce in a differentiable fashion"""

@staticmethod
def forward(
ctx, tensor, group: Optional[ProcessGroup], async_all_reduce: bool
ctx, tensor, group: Optional[ProcessGroup], async_all_reduce: bool, handle_idx: Optional[int] = None
) -> Tuple[torch.Tensor, Optional["dist.Work"]]:
# ctx.mark_non_differentiable(async_all_reduce)
ctx.async_all_reduce = async_all_reduce
@@ -63,13 +72,17 @@ def forward(
if async_all_reduce:
# AsyncCommBucket.add(tensor, handle)
# AsyncCommBucket.add(id(tensor), handle)
AsyncCommBucket.add(orig_id, handle)
# try:
# AsyncCommBucket.add(orig_id if handle_idx is None else handle_idx, handle)
# except Exception as e:
# assert 1 == 1
AsyncCommBucket.add(orig_id if handle_idx is None else handle_idx, handle)

return tensor

@staticmethod
def backward(ctx, grad_output):
return grad_output, None, None
return grad_output, None, None, None


class DifferentiableAllGather(torch.autograd.Function):
@@ -151,8 +164,8 @@ def backward(ctx, grad_output):
# -----------------


def differentiable_identity(tensor, group: Optional[ProcessGroup] = None):
return DifferentiableIdentity.apply(tensor, group)
def differentiable_identity(tensor, group: Optional[ProcessGroup] = None, handle_idx=None):
return DifferentiableIdentity.apply(tensor, group, handle_idx)


def differentiable_all_reduce_sum(tensor, group: Optional[ProcessGroup] = None, async_all_reduce: bool = False):
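This file is the registration side of that pattern: DifferentiableIdentity.forward stashes handle_idx, and its backward hands it to DifferentiableAllReduceSum, which launches the all-reduce with async_op=True and stores the handle in AsyncCommBucket keyed by handle_idx (falling back to a tensor-derived orig_id when no tag is given). Continuing the _pending dict from the sketch above, the equivalent is roughly the following; the helper name is invented for illustration.

def all_reduce_and_register(grad: torch.Tensor, group, tag=None) -> torch.Tensor:
    # Launch the gradient all-reduce without blocking the backward pass, and
    # park the handle so a later DeferredWait/WaitComm can wait on it.
    work = dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=group, async_op=True)
    _pending[tag if tag is not None else id(grad)] = work
    return grad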
6 changes: 4 additions & 2 deletions src/nanotron/parallel/tensor_parallel/functional.py
@@ -436,12 +436,13 @@ def column_linear(
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool = True,
handle_idx: Optional[int] = None,
):
if async_communication:
return _ColumnLinearAsyncCommunication.apply(input, weight, bias, group, tp_mode, tp_recompute_allgather)

if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
input = differentiable_identity(input, group=group)
input = differentiable_identity(input, group=group, handle_idx=handle_idx)
return F.linear(input, weight, bias)
if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
return _ColumnLinearNoAsyncCommunicationReduceScatterMode.apply(
@@ -604,7 +605,8 @@ def row_linear(
if async_all_reduce:
from nanotron.parallel.comm import AsyncCommBucket

work = AsyncCommBucket.get(orig_out_id)
# work = AsyncCommBucket.get(orig_out_id)
work = AsyncCommBucket.pop(orig_out_id)
assert 1 == 1
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
assert async_all_reduce is False, "Async communication is not supported for REDUCE_SCATTER mode."
5 changes: 4 additions & 1 deletion src/nanotron/parallel/tensor_parallel/nn.py
@@ -52,6 +52,7 @@ def __init__(
async_communication: bool = False,
contiguous_chunks: Optional[Tuple[int, ...]] = None,
tp_recompute_allgather: bool = True,
# handle_idx: Optional[int] = None,
):
self.pg = pg
self.world_size = pg.size()
@@ -72,6 +73,7 @@ def __init__(

self.mode = mode
self.async_communication = async_communication
# self.handle_idx = handle_idx

if contiguous_chunks is not None:
assert (
@@ -85,7 +87,7 @@ def __init__(
split_config=split_config,
)

def forward(self, x: torch.Tensor) -> torch.Tensor:
def forward(self, x: torch.Tensor, handle_idx=None) -> torch.Tensor:
return column_linear(
input=x,
weight=self.weight,
@@ -94,6 +96,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
tp_mode=self.mode,
async_communication=self.async_communication,
tp_recompute_allgather=self.tp_recompute_allgather,
handle_idx=handle_idx,
)

def extra_repr(self) -> str:
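For callers, the only visible change from this file is the optional tag on forward. A hypothetical call site, assuming proj is an already-constructed TensorParallelColumnLinear in ALL_REDUCE mode:

out = proj(x)                                # untagged: the async handle is keyed by the tensor-derived orig_id
out = proj(x, handle_idx="layer_0_batch_0")  # tagged: the handle is keyed by this string for WaitComm to pop later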