
Commit c0f86f0 ("fix tp")
1 parent: 8d0c61f

1 file changed: +4 -1 lines

src/nanotron/parallel/tensor_parallel/functional.py (+4 -1)
@@ -389,6 +389,9 @@ def forward(
 
     @staticmethod
     def backward(ctx, grad_output: torch.Tensor):
+        import debugpy
+
+        debugpy.breakpoint()
         # Either allgather the inputs again or get them from context.
         group = ctx.group
         tp_recompute_allgather = ctx.tp_recompute_allgather
@@ -414,7 +417,7 @@ def backward(ctx, grad_output: torch.Tensor):
         grad_weight = grad_output.T @ total_input
         grad_input = grad_output @ weight
         if group.size() == 1:
-            sub_grad_input = grad_input
+            sub_grad_input = grad_input.reshape(input_size)  # [s*b, h_in] -> [s, b, h_in]
         else:
             # Seems that `reduce_scatter` need contiguous tensors: https://github.com/pytorch/pytorch/blob/2b267fa7f28e18ca6ea1de4201d2541a40411457/torch/distributed/nn/functional.py#L305
             # We set grad_input to be contiguous in case it isn't already.
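Below is a minimal standalone sketch (not part of the commit) of why the reshape in the `group.size() == 1` branch is needed, assuming the forward pass flattens a `[seq, batch, hidden]` input to `[seq*batch, hidden]` before the matmul. The shapes and the variables `s`, `b`, `h_in`, `h_out`, and `x` are illustrative assumptions; `weight`, `grad_output`, `grad_input`, `sub_grad_input`, and `input_size` only mirror the names used in the diff.

```python
import torch

# Illustrative shapes (assumption): sequence length 4, batch 2, hidden sizes 8 -> 6.
s, b, h_in, h_out = 4, 2, 8, 6
x = torch.randn(s, b, h_in)
weight = torch.randn(h_out, h_in)

# Forward-style flattening: [s, b, h_in] -> [s*b, h_in] before the matmul.
input_size = x.shape                      # saved so backward can restore the shape
grad_output = torch.randn(s * b, h_out)   # gradient w.r.t. the flattened output

# Backward math as in the diff: grad_input is produced in flattened form.
grad_input = grad_output @ weight         # [s*b, h_out] @ [h_out, h_in] -> [s*b, h_in]

# Without the fix, sub_grad_input would keep the flattened [s*b, h_in] shape and
# no longer match the original input; the reshape restores [s, b, h_in].
sub_grad_input = grad_input.reshape(input_size)
assert sub_grad_input.shape == x.shape
```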
