
Commit 19a9b78

Apply black and isort autoformatting to core
No code changes, just formatting.
Add pyproject.toml with config for black and isort.
Add script to run black and isort on core.
Add CI check for core.
1 parent 156533e commit 19a9b78

35 files changed: +1272 -910 lines

.gitlab-ci.yml (+9)

@@ -30,6 +30,15 @@ unit_tests:
   only:
     - merge_requests
 
+formatting:
+  tags:
+    - docker_local_runner
+  stage: test
+  script:
+    - pip install black==19.10b0 isort
+    - black megatron/core --check --verbose --diff --color
+    - isort megatron/core --check
+
 .selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
   tags:
     - ssh_selene_runner

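For reference, the two --check invocations in the new formatting job fail the pipeline whenever reformatting would change a file under megatron/core. Below is a rough Python sketch of the condition being enforced, using black's and isort's Python APIs; the helper name is_clean and the 100-character line length are assumptions (the pyproject.toml this commit adds is not shown in this view), not part of the CI job, which only runs the CLIs above.

import black
import isort


def is_clean(path, line_length=100):
    # True if neither black nor isort would modify the file,
    # i.e. roughly what `black --check` and `isort --check` verify per file.
    src = open(path).read()
    blackened = black.format_str(src, mode=black.FileMode(line_length=line_length))
    sorted_src = isort.code(src)
    return blackened == src and sorted_src == src


print(is_clean("megatron/core/enums.py"))
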
megatron/core/__init__.py (+1 -6)

@@ -7,9 +7,4 @@
 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
 
-__all__ = [
-    "parallel_state",
-    "tensor_parallel",
-    "utils",
-    "ModelParallelConfig"
-]
+__all__ = ["parallel_state", "tensor_parallel", "utils", "ModelParallelConfig"]

megatron/core/enums.py (+1)

@@ -2,6 +2,7 @@
 
 import enum
 
+
 class ModelType(enum.Enum):
     encoder_or_decoder = 1
     encoder_and_decoder = 2

megatron/core/fusions/fused_bias_dropout.py (+9 -7)

@@ -1,7 +1,9 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+from typing import Optional, Tuple
+
 import torch
-from typing import Tuple, Optional
+
 
 def _bias_dropout_add_func(x, bias, residual, prob, training):
     # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor
@@ -16,28 +18,28 @@ def _bias_dropout_add_func(x, bias, residual, prob, training):
     out = residual + out
     return out
 
-def get_bias_dropout_add(training, fused):
 
+def get_bias_dropout_add(training, fused):
     def unfused_bias_dropout_add(x_with_bias, residual, prob):
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
         return _bias_dropout_add_func(x, bias, residual, prob, training)
 
     @torch.jit.script
     def bias_dropout_add_fused_train(
         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
         residual: torch.Tensor,
-        prob: float
+        prob: float,
     ) -> torch.Tensor:
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
        return _bias_dropout_add_func(x, bias, residual, prob, True)
 
     @torch.jit.script
     def bias_dropout_add_fused_inference(
         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
         residual: torch.Tensor,
-        prob: float
+        prob: float,
     ) -> torch.Tensor:
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
         return _bias_dropout_add_func(x, bias, residual, prob, False)
 
     if fused:

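All the wrappers reflowed above dispatch to `_bias_dropout_add_func`, which adds the bias, applies dropout, and adds the residual. A minimal standalone sketch of that computation in plain PyTorch, for orientation only (the real module additionally JIT-scripts the fused train/inference variants that `get_bias_dropout_add` returns):

import torch
import torch.nn.functional as F


def bias_dropout_add(x, bias, residual, prob, training):
    # dropout(x + bias) + residual: the pattern shared by the fused and unfused paths
    if bias is not None:
        x = x + bias
    out = F.dropout(x, p=prob, training=training)
    return residual + out


x = torch.randn(4, 8)
bias = torch.zeros(8)
residual = torch.randn(4, 8)
out = bias_dropout_add(x, bias, residual, prob=0.1, training=True)
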
megatron/core/fusions/fused_bias_gelu.py (+9 -4)

@@ -2,7 +2,6 @@
 
 import torch
 
-
 ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 # 1/sqrt(2*pi)-> 0.3989423
 # 1/sqrt(2) -> 0.70710678
@@ -11,10 +10,12 @@
 # actual gelu is:
 # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
 
+
 @torch.jit.script
 def bias_gelu(bias, y):
     x = bias + y
-    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+
 
 # gradient of tanh approximation of gelu
 # gradient of actual gelu is:
@@ -24,8 +25,11 @@ def bias_gelu_back(g, bias, y):
     x = bias + y
     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
-    return ff*g
+    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
+        1 + tanh_out
+    )
+    return ff * g
+
 
 class GeLUFunction(torch.autograd.Function):
     @staticmethod
@@ -40,4 +44,5 @@ def backward(ctx, grad_output):
         tmp = bias_gelu_back(grad_output, bias, input)
         return tmp, tmp
 
+
 bias_gelu_impl = GeLUFunction.apply
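
The constants in bias_gelu come from the tanh approximation of GELU noted in the comments (0.79788456 ≈ sqrt(2/pi), 0.70710678 ≈ 1/sqrt(2)). A quick standalone check of that approximation against the exact erf form, independent of the fused autograd function above:

import torch


def gelu_exact(x):
    # exact GELU, written with erf as in the comment block above
    return x * 0.5 * (1.0 + torch.erf(x * 0.70710678))


def gelu_tanh(x):
    # tanh approximation used by bias_gelu (bias already folded into x here)
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


x = torch.linspace(-5.0, 5.0, 1001)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # small; well under 1e-2
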
megatron/core/fusions/fused_layer_norm.py (+62 -32)

@@ -1,42 +1,71 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+import importlib
 import numbers
+
 import torch
-from torch.nn.parameter import Parameter
 from torch.nn import init
-import importlib
+from torch.nn.parameter import Parameter
 
 from megatron.core.utils import make_viewless_tensor
 
 try:
     from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
+
     HAVE_PERSIST_LAYER_NORM = True
 except:
     HAVE_PERSIST_LAYER_NORM = False
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
+
     HAVE_FUSED_LAYER_NORM = True
 except:
     HAVE_FUSED_LAYER_NORM = False
 
 
 class FusedLayerNorm(torch.nn.Module):
-
-    def __init__(self, hidden_size, eps=1e-5,
-                 persist_layer_norm=True,
-                 sequence_parallel=False,
-                 zero_centered_gamma=False):
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        persist_layer_norm=True,
+        sequence_parallel=False,
+        zero_centered_gamma=False,
+    ):
         super().__init__()
 
         self.zero_centered_gamma = zero_centered_gamma
 
         # List of hiddens sizes supported in the persistent layer norm kernel
         # If the hidden size is not supported, fall back to the non-persistent
         # kernel.
-        persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
-            5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
-            24576, 25600, 30720, 32768, 40960, 49152, 65536]
+        persist_ln_hidden_sizes = [
+            1024,
+            1536,
+            2048,
+            2304,
+            3072,
+            3840,
+            4096,
+            5120,
+            6144,
+            8192,
+            10240,
+            12288,
+            12800,
+            15360,
+            16384,
+            18432,
+            20480,
+            24576,
+            25600,
+            30720,
+            32768,
+            40960,
+            49152,
+            65536,
+        ]
         if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
             persist_layer_norm = False
 
@@ -58,32 +87,33 @@ def __init__(self, hidden_size, eps=1e-5,
         setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
         setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
 
+    def reset_parameters(self):
 
-    def reset_parameters(self):
-
-        if self.zero_centered_gamma:
-            init.zeros_(self.weight)
-            init.zeros_(self.bias)
-        else:
-            init.ones_(self.weight)
-            init.zeros_(self.bias)
+        if self.zero_centered_gamma:
+            init.zeros_(self.weight)
+            init.zeros_(self.bias)
+        else:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
 
-    def forward(self, input):
+    def forward(self, input):
 
-        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
+        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
 
-        if self.persist_layer_norm:
-            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
+        if self.persist_layer_norm:
+            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
 
-            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-            # a populated '_base' field). This will result in schedule.py's
-            # deallocate_output_tensor() throwing an error, so a viewless tensor is
-            # created to prevent this.
-            output = make_viewless_tensor(inp = output,
-                                          requires_grad = input.requires_grad,
-                                          keep_graph = True)
+            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
+            # a populated '_base' field). This will result in schedule.py's
+            # deallocate_output_tensor() throwing an error, so a viewless tensor is
+            # created to prevent this.
+            output = make_viewless_tensor(
+                inp=output, requires_grad=input.requires_grad, keep_graph=True
+            )
 
-        else:
-            output = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.hidden_size, self.eps)
+        else:
+            output = FusedLayerNormAffineFunction.apply(
+                input, weight, self.bias, self.hidden_size, self.eps
+            )
 
-        return output
+        return output

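One detail visible in the re-indented methods above: with zero_centered_gamma the learnable weight is stored as (gamma - 1), initialized to zeros in reset_parameters, and 1 is added back in forward before the kernel is called. A plain-PyTorch sketch of that idea, with torch.nn.functional.layer_norm standing in for the Apex kernels this module actually requires:

import torch
import torch.nn.functional as F

hidden_size = 1024
# reset_parameters with zero_centered_gamma=True: the weight starts at zero
weight = torch.zeros(hidden_size, requires_grad=True)
bias = torch.zeros(hidden_size, requires_grad=True)

x = torch.randn(2, 4, hidden_size)
gamma = weight + 1  # forward: weight + 1 if zero_centered_gamma else weight
out = F.layer_norm(x, (hidden_size,), gamma, bias, eps=1e-5)
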
megatron/core/fusions/fused_softmax.py (+7 -16)

@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn as nn
+
 from megatron.core.transformer.enums import AttnMaskType
 
 
@@ -19,9 +20,7 @@ def forward(ctx, inputs, scale):
         import scaled_upper_triang_masked_softmax_cuda
 
         scale_t = torch.tensor([scale])
-        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
+        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
 
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
@@ -62,9 +61,7 @@ def backward(ctx, output_grads):
 
         softmax_results, scale_t = ctx.saved_tensors
 
-        input_grads = scaled_masked_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
+        input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
         return input_grads, None, None
 
 
@@ -81,9 +78,7 @@ def forward(ctx, inputs, scale):
 
         scale_t = torch.tensor([scale])
 
-        softmax_results = scaled_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
+        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
 
@@ -93,9 +88,7 @@ def backward(ctx, output_grads):
 
         softmax_results, scale_t = ctx.saved_tensors
 
-        input_grads = scaled_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
+        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
         return input_grads, None, None
 
 
@@ -136,9 +129,7 @@ def __init__(
         self.softmax_in_fp32 = softmax_in_fp32
         self.scale = scale
 
-        assert (
-            self.scale is None or softmax_in_fp32
-        ), "softmax should be in fp32 when scaled"
+        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
 
     def forward(self, input, mask):
         # [b, np, sq, sk]
@@ -157,7 +148,7 @@ def is_kernel_available(self, mask, b, np, sq, sk):
             and self.input_in_float16  # input must be fp16
             and 16 < sk <= 4096  # sk must be 16 ~ 2048
             and sq % 4 == 0  # sq must be divisor of 4
-            and sk % 4 == 0 # sk must be divisor of 4
+            and sk % 4 == 0  # sk must be divisor of 4
             and attn_batches % 4 == 0  # np * b must be divisor of 4
         ):
             if 0 <= sk <= 4096:

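For orientation, the CUDA kernels whose call sites were collapsed onto single lines above compute a scaled, optionally masked softmax over the last dimension of [b, np, sq, sk] attention scores. A rough plain-PyTorch stand-in; the mask convention assumed here (True = masked out, filled with -10000.0) mirrors the module's unfused fallback but is not shown in this diff:

import torch


def scaled_masked_softmax(inputs, mask, scale):
    # inputs: [b, np, sq, sk]; mask broadcastable to the same shape
    x = inputs * scale if scale is not None else inputs
    if mask is not None:
        x = x.masked_fill(mask, -10000.0)
    return torch.softmax(x, dim=-1)


scores = torch.randn(2, 8, 16, 16)
causal_mask = torch.triu(torch.ones(16, 16, dtype=torch.bool), diagonal=1)
probs = scaled_masked_softmax(scores, causal_mask, scale=0.125)
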
megatron/core/model_parallel_config.py (+5 -2)

@@ -5,6 +5,7 @@
 
 import torch
 
+
 @dataclass
 class ModelParallelConfig:
     """Base configuration for Megatron Core
@@ -128,7 +129,7 @@ class ModelParallelConfig:
     # Optimizations
     gradient_accumulation_fusion: bool = False
     async_tensor_model_parallel_allreduce: bool = False
-
+
     # Pipeline Parallel
     pipeline_dtype: torch.dtype = None
     grad_scale_func: Callable = None
@@ -158,7 +159,9 @@ def __post_init__(self):
 
         if self.pipeline_model_parallel_size > 1:
             if self.pipeline_dtype is None:
-                raise ValueError("When using pipeline parallelism, pipeline_dtype must be specified")
+                raise ValueError(
+                    "When using pipeline parallelism, pipeline_dtype must be specified"
+                )
 
         if self.autocast_dtype is None:
             self.autocast_dtype = self.params_dtype

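The reflowed raise above does not change behavior: ModelParallelConfig still rejects pipeline parallelism without a pipeline_dtype. A quick illustration of that check, assuming the remaining fields can stay at their defaults:

import torch

from megatron.core import ModelParallelConfig

try:
    ModelParallelConfig(pipeline_model_parallel_size=2)  # no pipeline_dtype given
except ValueError as err:
    print(err)  # "When using pipeline parallelism, pipeline_dtype must be specified"

# Supplying a dtype satisfies the check in __post_init__.
config = ModelParallelConfig(pipeline_model_parallel_size=2, pipeline_dtype=torch.bfloat16)
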
megatron/core/models/common/rotary_pos_embedding.py (+2 -1)

@@ -1,12 +1,13 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 import importlib.util
-import torch
 
+import torch
 from torch import einsum, nn
 
 __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
 
+
 class RotaryEmbedding(nn.Module):
     def __init__(self, dim):
         super().__init__()

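RotaryEmbedding and apply_rotary_pos_emb, re-exported through __all__ above, implement rotary position embeddings. A self-contained sketch of the underlying rotation, independent of the Megatron classes; the 10000 frequency base and the rotate-half convention are the usual RoPE defaults, assumed here rather than read from this diff:

import torch


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


dim, seq_len = 64, 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.einsum('i,j->ij', torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]

q = torch.randn(seq_len, dim)
q_rotated = q * emb.cos() + rotate_half(q) * emb.sin()
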