Add tensor parallelism for RWKV #1237

Open

wants to merge 30 commits into base: main

Changes from 1 commit

Commits (30)

4c7cb11
inital tp commits
jahatef Jun 4, 2024
46904d5
setup
jahatef Jun 19, 2024
e2933ef
configs
jahatef Sep 25, 2024
d1112ab
merge
jahatef Oct 3, 2024
43d641d
time mixing tp
jahatef Oct 3, 2024
de02f37
time-mixing
jahatef Oct 11, 2024
dd441b6
time mixing debugging
jahatef Oct 12, 2024
a418670
reset time_faaaa
jahatef Oct 13, 2024
540d856
Add additional asserts and update post training readme (#1300)
AI-WAIFU Oct 8, 2024
12aac35
Fix failling tests (#1301)
AI-WAIFU Oct 8, 2024
97c7915
inital tp commits
jahatef Jun 4, 2024
5f89ed8
merge
jahatef Nov 5, 2024
91cb759
Add ERROR logging prefix and sort the prefixes alphabetically (#1308)
TheBatmanofButler Oct 17, 2024
49b263a
inital tp commits
jahatef Jun 4, 2024
48de682
cleanup
jahatef Nov 6, 2024
c6fac96
cleanup
jahatef Nov 6, 2024
5a259c0
Update local_setup.yml
jahatef Nov 6, 2024
c2d6c85
add Triton FLA
jahatef Nov 10, 2024
bdb3658
change version of rwkv-fla
jahatef Nov 12, 2024
ff7f328
fix a GQA issue (#1314) (#1315)
tiandeyu-cs Nov 13, 2024
1350b2c
fix 'intermediate_size' in Llama configuration files after the 'mlp_t…
tiandeyu-cs Nov 13, 2024
c4d7a54
Python 3.10 support (#1313)
markNZed Nov 13, 2024
ee2f142
Fix documentation for converting SFT/DPO weights back to HF Llama (#1…
jacobthebanana Nov 13, 2024
6e81f0b
fix bug (#1311)
AI-WAIFU Nov 13, 2024
df95419
Add support for dropout in sparse attention (#1312)
michaelc-yu Nov 16, 2024
d682529
adds pyproject files and tests (#1302)
LouisCastricato Nov 16, 2024
0bc11d6
undo merge error (#1325)
Quentin-Anthony Nov 27, 2024
c6db95c
inital tp commits
jahatef Jun 4, 2024
daac503
setup
jahatef Jun 19, 2024
bf478ce
Merge branch 'main' into rwkv-tp
Quentin-Anthony Dec 19, 2024
Merge branch 'main' into rwkv-tp
Quentin-Anthony committed Dec 19, 2024
commit bf478ce9c5337a5d41d8ad845e34b8134bf5ca8a
28 changes: 14 additions & 14 deletions megatron/model/transformer.py
@@ -1303,20 +1303,20 @@ def forward(self, x, attention_mask, layer_past=None):
 else:
     raise KeyError(self.moe_type)

-with torch.enable_grad() if not self.eval else nullcontext():
-    if mlp_bias == None or (
-        self.num_experts > 1 and self.moe_type == "deepspeed"
-    ):
-        # No dropout either
-        assert mlp_bias is None
-        output = mlp_output + attention_output
-    else:
-        output = bias_dropout_fn(
-            mlp_output,
-            bias=mlp_bias.expand_as(attention_output),
-            residual=attention_output,
-            prob=self.hidden_dropout,
-        )
+with torch.enable_grad() if not self.eval else nullcontext():
+    if mlp_bias == None or (
+        self.num_experts > 1 and self.moe_type == "deepspeed"
+    ):
+        # No dropout either
+        assert mlp_bias is None
+        output = mlp_output + attention_output
+    else:
+        output = bias_dropout_fn(
+            mlp_output,
+            bias=mlp_bias.expand_as(attention_output),
+            residual=attention_output,
+            prob=self.hidden_dropout,
+        )

return output, moe_loss
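
For context, the `bias_dropout_fn` called in this hunk follows the usual Megatron-style bias-dropout-add pattern: the MLP bias is added, dropout is applied, and the attention output is added back as the residual. A minimal sketch of that pattern (illustrative, not necessarily the repo's exact implementation):

```python
import torch
import torch.nn.functional as F

def bias_dropout_add(x, bias, residual, prob, training):
    # Add the broadcast bias, apply dropout, then add the residual branch.
    out = F.dropout(x + bias, p=prob, training=training)
    return residual + out

# The bias-less branch in the hunk reduces to a plain residual add:
#     output = mlp_output + attention_output
```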

77 changes: 76 additions & 1 deletion megatron/neox_arguments/neox_args.py
@@ -502,11 +502,86 @@ class NeoXArgsModel(NeoXArgsTemplate):

# Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905)
output_layer_parallelism: Literal["column"] = "column"

"""
Parameter controlling whether the output layer is parallelized over the hidden dim (row) or the vocab dim (column)
"""

serve_model_weights: bool = False
"""
If true, serve model weight pointers over a socket connection
"""

weight_server_port: Union[int, List[int]] = 6000
"""
Port(s) to serve model weights over
If an integer is provided, the port for each GPU will be 6000 + global rank
If a list is provided, the ports will be used in order, e.g. rank0 will be weight_server_port[0]
"""

online_dataserver_ips: Union[str, List[str]] = "localhost"
"""
ip addresses to connect to for online data serving, defaults to localhost
"""

online_dataserver_ports: Union[int, List[int]] = 10000
"""
Port(s) to connect to for online data serving, defaults to 10000
"""

te_columnparallel: bool = False
"""
Use TransformerEngine for ColumnParallelLinear layer.
"""

te_rowparallel: bool = False
"""
Use TransformerEngine for RowParallelLinear layer.
"""

te_layernorm_mlp: bool = False
"""
Use TransformerEngine for LayerNormMLP layer.
"""

te_mha: bool = False
"""
Use TransformerEngine for MultiheadAttention layer.
"""

te_fp8_format: Literal["e4m3", "hybrid"] = "hybrid"
"""
Controls the FP8 data format used during forward and backward pass by TransformerEngine.
Hybrid uses E4M3 during forward pass, E5M2 during backward pass.
"""

te_fp8_wgrad: bool = True
"""
When set to False, override FP8 config options and do the wgrad computation
in higher precision.
"""

te_fp8_amax_history_len: int = 1
"""
The length of the amax history window used for scaling factor computation.
"""

te_fp8_amax_compute_algo: str = "most_recent"
"""
Algorithm used for choosing the `amax` value for the scaling factor computation. There are 2
predefined choices: `max` chooses the largest `amax` in the history window, while `most_recent`
always chooses the most recently seen value.
"""

te_fp8_margin: int = 0
"""
Margin for the scaling factor computation.
"""

te_fp8_mha: bool = False
"""
When set to True, use the FP8 implementation of Multi Head Attention.
"""

dim_att: int = None
"""
Total dimension of the attention mechanism for RWKV. If not set, defaults to hidden_size.
"""
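
The fallback described here is straightforward; a one-line illustration (hypothetical helper, not the PR's code):

```python
def resolve_dim_att(dim_att, hidden_size):
    # An unset dim_att means the RWKV attention dimension equals the model width.
    return hidden_size if dim_att is None else dim_att
```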