Fix tensor parallel backward: reshape grad_input to the input shape when the group size is 1

zzhhjjj · zzhhjjj · commit a300f23f677d · 2025-02-17T12:24:12.000Z
diff --git a/src/nanotron/parallel/tensor_parallel/functional.py b/src/nanotron/parallel/tensor_parallel/functional.py
@@ -414,7 +414,7 @@ def backward(ctx, grad_output: torch.Tensor):
         grad_weight = grad_output.T @ total_input
         grad_input = grad_output @ weight
         if group.size() == 1:
-            sub_grad_input = grad_input
+            sub_grad_input = grad_input.reshape(input_size)  # [s*b, h_in] -> [s, b, h_in]
         else:
             # Seems that `reduce_scatter` need contiguous tensors: https://github.com/pytorch/pytorch/blob/2b267fa7f28e18ca6ea1de4201d2541a40411457/torch/distributed/nn/functional.py#L305
             # We set grad_input to be contiguous in case it isn't already.