
Commit 19a9b78

Apply black and isort autoformatting to core
No code changes, just formatting.
Add pyproject.toml with config for black and isort.
Add script to run black and isort on core.
Add CI check for core.
1 parent 156533e commit 19a9b78

35 files changed: +1272 -910 lines

.gitlab-ci.yml (+9)

@@ -30,6 +30,15 @@ unit_tests:
   only:
     - merge_requests
 
+formatting:
+  tags:
+    - docker_local_runner
+  stage: test
+  script:
+    - pip install black==19.10b0 isort
+    - black megatron/core --check --verbose --diff --color
+    - isort megatron/core --check
+
 .selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
   tags:
     - ssh_selene_runner

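For reference, the two --check invocations in the new formatting job fail the pipeline whenever reformatting would change a file under megatron/core. Below is a rough Python sketch of the condition being enforced, using black's and isort's Python APIs; the helper name is_clean and the 100-character line length are assumptions (the pyproject.toml this commit adds is not shown in this view), not part of the CI job, which only runs the CLIs above.

import black
import isort


def is_clean(path, line_length=100):
    # True if neither black nor isort would modify the file,
    # i.e. roughly what `black --check` and `isort --check` verify per file.
    src = open(path).read()
    blackened = black.format_str(src, mode=black.FileMode(line_length=line_length))
    sorted_src = isort.code(src)
    return blackened == src and sorted_src == src


print(is_clean("megatron/core/enums.py"))
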
megatron/core/__init__.py (+1 -6)

@@ -7,9 +7,4 @@
 # Alias parallel_state as mpu, its legacy name
 mpu = parallel_state
 
-__all__ = [
-    "parallel_state",
-    "tensor_parallel",
-    "utils",
-    "ModelParallelConfig"
-]
+__all__ = ["parallel_state", "tensor_parallel", "utils", "ModelParallelConfig"]

megatron/core/enums.py (+1)

@@ -2,6 +2,7 @@
 
 import enum
 
+
 class ModelType(enum.Enum):
     encoder_or_decoder = 1
     encoder_and_decoder = 2

megatron/core/fusions/fused_bias_dropout.py (+9 -7)

@@ -1,7 +1,9 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+from typing import Optional, Tuple
+
 import torch
-from typing import Tuple, Optional
+
 
 def _bias_dropout_add_func(x, bias, residual, prob, training):
     # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor
@@ -16,28 +18,28 @@ def _bias_dropout_add_func(x, bias, residual, prob, training):
     out = residual + out
     return out
 
-def get_bias_dropout_add(training, fused):
 
+def get_bias_dropout_add(training, fused):
     def unfused_bias_dropout_add(x_with_bias, residual, prob):
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
         return _bias_dropout_add_func(x, bias, residual, prob, training)
 
     @torch.jit.script
     def bias_dropout_add_fused_train(
         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
         residual: torch.Tensor,
-        prob: float
+        prob: float,
     ) -> torch.Tensor:
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
        return _bias_dropout_add_func(x, bias, residual, prob, True)
 
     @torch.jit.script
     def bias_dropout_add_fused_inference(
         x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]],
         residual: torch.Tensor,
-        prob: float
+        prob: float,
     ) -> torch.Tensor:
-        x, bias = x_with_bias # unpack
+        x, bias = x_with_bias  # unpack
         return _bias_dropout_add_func(x, bias, residual, prob, False)
 
     if fused:

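All the wrappers reflowed above dispatch to `_bias_dropout_add_func`, which adds the bias, applies dropout, and adds the residual. A minimal standalone sketch of that computation in plain PyTorch, for orientation only (the real module additionally JIT-scripts the fused train/inference variants that `get_bias_dropout_add` returns):

import torch
import torch.nn.functional as F


def bias_dropout_add(x, bias, residual, prob, training):
    # dropout(x + bias) + residual: the pattern shared by the fused and unfused paths
    if bias is not None:
        x = x + bias
    out = F.dropout(x, p=prob, training=training)
    return residual + out


x = torch.randn(4, 8)
bias = torch.zeros(8)
residual = torch.randn(4, 8)
out = bias_dropout_add(x, bias, residual, prob=0.1, training=True)
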
megatron/core/fusions/fused_bias_gelu.py (+9 -4)

@@ -2,7 +2,6 @@
 
 import torch
 
-
 ###### BIAS GELU FUSION/ NO AUTOGRAD ################
 # 1/sqrt(2*pi)-> 0.3989423
 # 1/sqrt(2) -> 0.70710678
@@ -11,10 +10,12 @@
 # actual gelu is:
 # x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
 
+
 @torch.jit.script
 def bias_gelu(bias, y):
     x = bias + y
-    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+
 
 # gradient of tanh approximation of gelu
 # gradient of actual gelu is:
@@ -24,8 +25,11 @@ def bias_gelu_back(g, bias, y):
     x = bias + y
     tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
     # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243
-    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
-    return ff*g
+    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (
+        1 + tanh_out
+    )
+    return ff * g
+
 
 class GeLUFunction(torch.autograd.Function):
     @staticmethod
@@ -40,4 +44,5 @@ def backward(ctx, grad_output):
         tmp = bias_gelu_back(grad_output, bias, input)
         return tmp, tmp
 
+
 bias_gelu_impl = GeLUFunction.apply
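
The constants in bias_gelu come from the tanh approximation of GELU noted in the comments (0.79788456 ≈ sqrt(2/pi), 0.70710678 ≈ 1/sqrt(2)). A quick standalone check of that approximation against the exact erf form, independent of the fused autograd function above:

import torch


def gelu_exact(x):
    # exact GELU, written with erf as in the comment block above
    return x * 0.5 * (1.0 + torch.erf(x * 0.70710678))


def gelu_tanh(x):
    # tanh approximation used by bias_gelu (bias already folded into x here)
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))


x = torch.linspace(-5.0, 5.0, 1001)
print((gelu_exact(x) - gelu_tanh(x)).abs().max())  # small; well under 1e-2
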
megatron/core/fusions/fused_layer_norm.py (+62 -32)

@@ -1,42 +1,71 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
+import importlib
 import numbers
+
 import torch
-from torch.nn.parameter import Parameter
 from torch.nn import init
-import importlib
+from torch.nn.parameter import Parameter
 
 from megatron.core.utils import make_viewless_tensor
 
 try:
     from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
+
     HAVE_PERSIST_LAYER_NORM = True
 except:
     HAVE_PERSIST_LAYER_NORM = False
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction
+
     HAVE_FUSED_LAYER_NORM = True
 except:
     HAVE_FUSED_LAYER_NORM = False
 
 
 class FusedLayerNorm(torch.nn.Module):
-
-    def __init__(self, hidden_size, eps=1e-5,
-                 persist_layer_norm=True,
-                 sequence_parallel=False,
-                 zero_centered_gamma=False):
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        persist_layer_norm=True,
+        sequence_parallel=False,
+        zero_centered_gamma=False,
+    ):
         super().__init__()
 
         self.zero_centered_gamma = zero_centered_gamma
 
         # List of hiddens sizes supported in the persistent layer norm kernel
         # If the hidden size is not supported, fall back to the non-persistent
         # kernel.
-        persist_ln_hidden_sizes = [1024, 1536, 2048, 2304, 3072, 3840, 4096,
-            5120, 6144, 8192, 10240, 12288, 12800, 15360, 16384, 18432, 20480,
-            24576, 25600, 30720, 32768, 40960, 49152, 65536]
+        persist_ln_hidden_sizes = [
+            1024,
+            1536,
+            2048,
+            2304,
+            3072,
+            3840,
+            4096,
+            5120,
+            6144,
+            8192,
+            10240,
+            12288,
+            12800,
+            15360,
+            16384,
+            18432,
+            20480,
+            24576,
+            25600,
+            30720,
+            32768,
+            40960,
+            49152,
+            65536,
+        ]
         if hidden_size not in persist_ln_hidden_sizes or not HAVE_PERSIST_LAYER_NORM:
             persist_layer_norm = False
 
@@ -58,32 +87,33 @@ def __init__(self, hidden_size, eps=1e-5,
         setattr(self.weight, 'sequence_parallel', self.sequence_parallel)
         setattr(self.bias, 'sequence_parallel', self.sequence_parallel)
 
+    def reset_parameters(self):
 
-    def reset_parameters(self):
-
-        if self.zero_centered_gamma:
-            init.zeros_(self.weight)
-            init.zeros_(self.bias)
-        else:
-            init.ones_(self.weight)
-            init.zeros_(self.bias)
+        if self.zero_centered_gamma:
+            init.zeros_(self.weight)
+            init.zeros_(self.bias)
+        else:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
 
-    def forward(self, input):
+    def forward(self, input):
 
-        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
+        weight = self.weight + 1 if self.zero_centered_gamma else self.weight
 
-        if self.persist_layer_norm:
-            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
+        if self.persist_layer_norm:
+            output = FastLayerNormFN.apply(input, weight, self.bias, self.eps)
 
-            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-            # a populated '_base' field). This will result in schedule.py's
-            # deallocate_output_tensor() throwing an error, so a viewless tensor is
-            # created to prevent this.
-            output = make_viewless_tensor(inp = output,
-                                          requires_grad = input.requires_grad,
-                                          keep_graph = True)
+            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
+            # a populated '_base' field). This will result in schedule.py's
+            # deallocate_output_tensor() throwing an error, so a viewless tensor is
+            # created to prevent this.
+            output = make_viewless_tensor(
+                inp=output, requires_grad=input.requires_grad, keep_graph=True
+            )
 
-        else:
-            output = FusedLayerNormAffineFunction.apply(input, weight, self.bias, self.hidden_size, self.eps)
+        else:
+            output = FusedLayerNormAffineFunction.apply(
+                input, weight, self.bias, self.hidden_size, self.eps
+            )
 
-        return output
+        return output

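One detail visible in the re-indented methods above: with zero_centered_gamma the learnable weight is stored as (gamma - 1), initialized to zeros in reset_parameters, and 1 is added back in forward before the kernel is called. A plain-PyTorch sketch of that idea, with torch.nn.functional.layer_norm standing in for the Apex kernels this module actually requires:

import torch
import torch.nn.functional as F

hidden_size = 1024
# reset_parameters with zero_centered_gamma=True: the weight starts at zero
weight = torch.zeros(hidden_size, requires_grad=True)
bias = torch.zeros(hidden_size, requires_grad=True)

x = torch.randn(2, 4, hidden_size)
gamma = weight + 1  # forward: weight + 1 if zero_centered_gamma else weight
out = F.layer_norm(x, (hidden_size,), gamma, bias, eps=1e-5)
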
megatron/core/fusions/fused_softmax.py (+7 -16)

@@ -3,6 +3,7 @@
 
 import torch
 import torch.nn as nn
+
 from megatron.core.transformer.enums import AttnMaskType
 
 
@@ -19,9 +20,7 @@ def forward(ctx, inputs, scale):
         import scaled_upper_triang_masked_softmax_cuda
 
         scale_t = torch.tensor([scale])
-        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
+        softmax_results = scaled_upper_triang_masked_softmax_cuda.forward(inputs, scale_t[0])
 
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
@@ -62,9 +61,7 @@ def backward(ctx, output_grads):
 
         softmax_results, scale_t = ctx.saved_tensors
 
-        input_grads = scaled_masked_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
+        input_grads = scaled_masked_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
         return input_grads, None, None
 
 
@@ -81,9 +78,7 @@ def forward(ctx, inputs, scale):
 
         scale_t = torch.tensor([scale])
 
-        softmax_results = scaled_softmax_cuda.forward(
-            inputs, scale_t[0]
-        )
+        softmax_results = scaled_softmax_cuda.forward(inputs, scale_t[0])
         ctx.save_for_backward(softmax_results, scale_t)
         return softmax_results
 
@@ -93,9 +88,7 @@ def backward(ctx, output_grads):
 
         softmax_results, scale_t = ctx.saved_tensors
 
-        input_grads = scaled_softmax_cuda.backward(
-            output_grads, softmax_results, scale_t[0]
-        )
+        input_grads = scaled_softmax_cuda.backward(output_grads, softmax_results, scale_t[0])
         return input_grads, None, None
 
 
@@ -136,9 +129,7 @@ def __init__(
         self.softmax_in_fp32 = softmax_in_fp32
         self.scale = scale
 
-        assert (
-            self.scale is None or softmax_in_fp32
-        ), "softmax should be in fp32 when scaled"
+        assert self.scale is None or softmax_in_fp32, "softmax should be in fp32 when scaled"
 
     def forward(self, input, mask):
         # [b, np, sq, sk]
@@ -157,7 +148,7 @@ def is_kernel_available(self, mask, b, np, sq, sk):
             and self.input_in_float16  # input must be fp16
             and 16 < sk <= 4096  # sk must be 16 ~ 2048
             and sq % 4 == 0  # sq must be divisor of 4
-            and sk % 4 == 0 # sk must be divisor of 4
+            and sk % 4 == 0  # sk must be divisor of 4
             and attn_batches % 4 == 0  # np * b must be divisor of 4
         ):
             if 0 <= sk <= 4096:

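For orientation, the CUDA kernels whose call sites were collapsed onto single lines above compute a scaled, optionally masked softmax over the last dimension of [b, np, sq, sk] attention scores. A rough plain-PyTorch stand-in; the mask convention assumed here (True = masked out, filled with -10000.0) mirrors the module's unfused fallback but is not shown in this diff:

import torch


def scaled_masked_softmax(inputs, mask, scale):
    # inputs: [b, np, sq, sk]; mask broadcastable to the same shape
    x = inputs * scale if scale is not None else inputs
    if mask is not None:
        x = x.masked_fill(mask, -10000.0)
    return torch.softmax(x, dim=-1)


scores = torch.randn(2, 8, 16, 16)
causal_mask = torch.triu(torch.ones(16, 16, dtype=torch.bool), diagonal=1)
probs = scaled_masked_softmax(scores, causal_mask, scale=0.125)
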
megatron/core/model_parallel_config.py (+5 -2)

@@ -5,6 +5,7 @@
 
 import torch
 
+
 @dataclass
 class ModelParallelConfig:
     """Base configuration for Megatron Core
@@ -128,7 +129,7 @@ class ModelParallelConfig:
     # Optimizations
     gradient_accumulation_fusion: bool = False
     async_tensor_model_parallel_allreduce: bool = False
-
+
     # Pipeline Parallel
     pipeline_dtype: torch.dtype = None
     grad_scale_func: Callable = None
@@ -158,7 +159,9 @@ def __post_init__(self):
 
         if self.pipeline_model_parallel_size > 1:
             if self.pipeline_dtype is None:
-                raise ValueError("When using pipeline parallelism, pipeline_dtype must be specified")
+                raise ValueError(
+                    "When using pipeline parallelism, pipeline_dtype must be specified"
+                )
 
         if self.autocast_dtype is None:
             self.autocast_dtype = self.params_dtype

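The reflowed raise above does not change behavior: ModelParallelConfig still rejects pipeline parallelism without a pipeline_dtype. A quick illustration of that check, assuming the remaining fields can stay at their defaults:

import torch

from megatron.core import ModelParallelConfig

try:
    ModelParallelConfig(pipeline_model_parallel_size=2)  # no pipeline_dtype given
except ValueError as err:
    print(err)  # "When using pipeline parallelism, pipeline_dtype must be specified"

# Supplying a dtype satisfies the check in __post_init__.
config = ModelParallelConfig(pipeline_model_parallel_size=2, pipeline_dtype=torch.bfloat16)
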
megatron/core/models/common/rotary_pos_embedding.py (+2 -1)

@@ -1,12 +1,13 @@
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 
 import importlib.util
-import torch
 
+import torch
 from torch import einsum, nn
 
 __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb']
 
+
 class RotaryEmbedding(nn.Module):
     def __init__(self, dim):
         super().__init__()

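RotaryEmbedding and apply_rotary_pos_emb, re-exported through __all__ above, implement rotary position embeddings. A self-contained sketch of the underlying rotation, independent of the Megatron classes; the 10000 frequency base and the rotate-half convention are the usual RoPE defaults, assumed here rather than read from this diff:

import torch


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


dim, seq_len = 64, 16
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.einsum('i,j->ij', torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]

q = torch.randn(seq_len, dim)
q_rotated = q * emb.cos() + rotate_half(q) * emb.sin()
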