Use AWS Neuron SDK 2.21 #754

Open
Wants to merge 18 commits into base: main
3 changes: 2 additions & 1 deletion .github/workflows/inference_cache_llm.yml
@@ -39,7 +39,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
3 changes: 2 additions & 1 deletion .github/workflows/inference_cache_stable_diffusion.yml
@@ -29,7 +29,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_export.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_full_export.yml
@@ -30,7 +30,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_inference.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_tgi.yml
@@ -34,7 +34,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_common.yml
@@ -34,7 +34,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_distributed.yml
@@ -33,7 +33,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_examples.yml
@@ -41,7 +41,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
8 changes: 7 additions & 1 deletion optimum/neuron/distributed/base.py
@@ -484,7 +484,7 @@ def initialize(mod: GQAQKVColumnParallelLinear, proj_name: str, output_size: int
             else:
                 # TODO: change kv heads.
                 maybe_load_linear_weight_to_gqa_qkv_column_parallel_linear(
-                    mod, f"weight_{proj_name}", linear_layer=fake_linear_mod
+                    mod, proj_name, f"weight_{proj_name}", linear_layer=fake_linear_mod
                 )
             del fake_linear_mod

@@ -678,6 +678,9 @@ def should_parallelize_layer_predicate_func(layer):
             "num_attention_heads": None,
             "num_key_value_heads": None,
             "kv_size_multiplier": None,
+            "fuse_qkv": None,
+            "q_output_size_per_partition": None,
+            "kv_output_size_per_partition": None,
         }
         for mod in model.modules():
             if isinstance(mod, OptimumGQAQKVColumnParallelLinear):
@@ -690,6 +693,9 @@ def should_parallelize_layer_predicate_func(layer):
                 "num_attention_heads": num_attention_heads,
                 "num_key_value_heads": num_key_value_heads,
                 "kv_size_multiplier": kv_size_multiplier,
+                "fuse_qkv": mod.fuse_qkv,
+                "q_output_size_per_partition": mod.q_output_size_per_partition,
+                "kv_output_size_per_partition": mod.kv_output_size_per_partition,
             }
             break
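For reference, a minimal sketch of how the two new metadata entries, q_output_size_per_partition and kv_output_size_per_partition, can be related to the model and tensor-parallel configuration. The helper below is illustrative only (it is not part of the PR) and assumes the usual GQA layout in which key/value heads are replicated kv_size_multiplier times so that they divide evenly across tensor-parallel ranks:

# Hypothetical helper, for illustration only: derives the per-rank output sizes that the
# new "q_output_size_per_partition" / "kv_output_size_per_partition" keys are meant to record.
def expected_partition_sizes(
    hidden_size: int,
    num_attention_heads: int,
    num_key_value_heads: int,
    kv_size_multiplier: int,
    tp_size: int,
) -> tuple[int, int]:
    head_dim = hidden_size // num_attention_heads
    q_output_size_per_partition = (num_attention_heads * head_dim) // tp_size
    kv_output_size_per_partition = (num_key_value_heads * kv_size_multiplier * head_dim) // tp_size
    return q_output_size_per_partition, kv_output_size_per_partition

# Example with Llama-3-8B-like shapes, tp_size=8 and kv_size_multiplier=2:
print(expected_partition_sizes(4096, 32, 8, 2, 8))  # (512, 256)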
93 changes: 60 additions & 33 deletions optimum/neuron/distributed/checkpointing.py
@@ -134,46 +134,73 @@ def consolidate_tensor_parallel_checkpoints(
     for name in parameter_names:
         # We need to handle the mapping between the GQA parameter names and the original names.
         is_gqa_qkv_weight = name in gqa_qkv_names_to_original_names
+        is_fuse_qkv = gqa_qkv_metadata["fuse_qkv"]
         if is_gqa_qkv_weight:
-            original_name = gqa_qkv_names_to_original_names[name]
-            weight_name = name.rsplit(".", maxsplit=1)[1]
+            if is_fuse_qkv:
+                original_names = [k for k, v in original_parameter_names_to_gqa_qkv_names.items() if v == name]
+                weight_names = [name.rsplit(".", maxsplit=1)[1] for name in original_names]
+                weight_names = ["weight_q", "weight_k", "weight_v"]
+            else:
+                original_names = [gqa_qkv_names_to_original_names[name]]
+                weight_names = [name.rsplit(".", maxsplit=1)[1]]
         else:
-            original_name = name
-            weight_name = ""  # Not needed.
+            original_names = [name]
+            weight_names = [""]  # Not needed.

         # For now all parameter metadatas are equal so it is enough to take the first element.
         # This might not be the case anymore when `ParameterMetadata` uses slices.
         sharded_metadata = sharded_metadatas[name]
-        if sharded_metadata.is_tied:
-            consolidated_state_dict[original_name] = state_dicts[0][name].to("cpu").contiguous()
-        else:
-            # Ensure that all tensors are contiguous before concatenating or further processing
-            weights = [state_dict[name].contiguous() for state_dict in state_dicts]
-            tp_size = len(weights)
-
-            full_weight = (
-                torch.cat(
-                    weights,
-                    dim=sharded_metadata.partition_dim,
-                )
-                .to("cpu")
-                .contiguous()
-            )  # Ensure the result is also contiguous
-
-            if weight_name in ["weight_k", "weight_v", "bias_k", "bias_v"]:
+        for original_name, weight_name in zip(original_names, weight_names):
+            if sharded_metadata.is_tied:
+                consolidated_state_dict[original_name] = state_dicts[0][name].to("cpu").contiguous()
+            else:
+                if is_fuse_qkv:
+                    if weight_name == "weight_q":
+                        s = slice(0, gqa_qkv_metadata["q_output_size_per_partition"])
+                    elif weight_name == "weight_k":
+                        s = slice(
+                            gqa_qkv_metadata["q_output_size_per_partition"],
+                            gqa_qkv_metadata["q_output_size_per_partition"]
+                            + gqa_qkv_metadata["kv_output_size_per_partition"],
+                        )
+                    elif weight_name == "weight_v":
+                        s = slice(
+                            gqa_qkv_metadata["q_output_size_per_partition"]
+                            + gqa_qkv_metadata["kv_output_size_per_partition"],
+                            None,
+                        )
+                    else:
+                        s = slice(None, None)
+                else:
+                    s = slice(None, None)
+
+                # Ensure that all tensors are contiguous before concatenating or further processing
+                weights = [state_dict[name][s].contiguous() for state_dict in state_dicts]
+                tp_size = len(weights)
+
                 full_weight = (
-                    torch.chunk(full_weight, gqa_qkv_metadata["kv_size_multiplier"], dim=0)[0].detach().clone()
-                )
-            elif weight_name == "weight_q" or original_name in gqa_qkv_output_projections_names:
-                full_weight = create_gqa_query_or_output_projection_weight_from_full_weight(
-                    full_weight,
-                    tp_size,
-                    gqa_qkv_metadata["num_attention_heads"],
-                    gqa_qkv_metadata["num_key_value_heads"],
-                    gqa_qkv_metadata["kv_size_multiplier"],
-                    "query" if weight_name == "weight_q" else "output",
-                )
-            consolidated_state_dict[original_name] = full_weight
+                    torch.cat(
+                        weights,
+                        dim=sharded_metadata.partition_dim,
+                    )
+                    .to("cpu")
+                    .contiguous()
+                )  # Ensure the result is also contiguous
+
+                if weight_name in ["weight_k", "weight_v", "bias_k", "bias_v"]:
+                    full_weight = (
+                        torch.chunk(full_weight, gqa_qkv_metadata["kv_size_multiplier"], dim=0)[0].detach().clone()
+                    )
+                elif weight_name == "weight_q" or original_name in gqa_qkv_output_projections_names:
+                    full_weight = create_gqa_query_or_output_projection_weight_from_full_weight(
+                        full_weight,
+                        tp_size,
+                        gqa_qkv_metadata["num_attention_heads"],
+                        gqa_qkv_metadata["num_key_value_heads"],
+                        gqa_qkv_metadata["kv_size_multiplier"],
+                        "query" if weight_name == "weight_q" else "output",
+                    )
+                consolidated_state_dict[original_name] = full_weight

     return consolidated_state_dict
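To make the fused-QKV branch above more concrete, here is a small self-contained sketch of the consolidation idea: with fuse_qkv enabled, each tensor-parallel shard stores its Q, K and V blocks stacked along dim 0, so the code slices the same block out of every shard and only then concatenates across ranks. The shapes and variable names below are illustrative and not taken from the library:

import torch

# Illustrative shard layout: each TP rank holds [Q_part; K_part; V_part] stacked on dim 0.
q_out, kv_out = 512, 256            # per-partition sizes, as recorded in gqa_qkv_metadata (assumed)
hidden_size, tp_size = 4096, 2

shards = [torch.randn(q_out + 2 * kv_out, hidden_size) for _ in range(tp_size)]

slices = {
    "weight_q": slice(0, q_out),
    "weight_k": slice(q_out, q_out + kv_out),
    "weight_v": slice(q_out + kv_out, None),
}

consolidated = {}
for weight_name, s in slices.items():
    # Cut the same block out of every shard, then concatenate along the partitioned dim (0 here).
    consolidated[weight_name] = torch.cat([shard[s] for shard in shards], dim=0).contiguous()

print({k: tuple(v.shape) for k, v in consolidated.items()})
# {'weight_q': (1024, 4096), 'weight_k': (512, 4096), 'weight_v': (512, 4096)}

When fuse_qkv is disabled, the new code sets s = slice(None, None), so each parameter is consolidated exactly as before.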
6 changes: 3 additions & 3 deletions optimum/neuron/distributed/parallel_layers.py
@@ -379,8 +379,8 @@ def replace_qkv_by_gqa_qkv_column_parallel_linear(
         key_linear = getattr(attention_layer, cls.KEYS_NAME)

         hidden_size = query_linear.weight.size(1)
-        query_in_features = query_linear.weight.size(0)
-        key_value_in_features = key_linear.weight.size(0)
+        query_out_features = query_linear.out_features
+        key_value_out_features = key_linear.out_features

         if kv_size_multiplier is None:
             kv_size_multiplier = get_tensor_model_parallel_size() // num_key_value_heads
@@ -397,7 +397,7 @@ def replace_qkv_by_gqa_qkv_column_parallel_linear(
             num_attention_heads,
             num_key_value_heads,
             hidden_size,
-            [query_in_features, key_value_in_features],
+            [query_out_features, key_value_out_features],
             gather_output=False,
             bias=query_linear.bias is not None,
             sequence_parallel_enabled=sequence_parallel_enabled,
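On the weight.size(0) → out_features change: for a plain torch.nn.Linear the two values are identical (the weight is stored as (out_features, in_features)), so the rename mainly corrects the misleading *_in_features variable names; for a layer whose weight is not fully materialized locally (for example a column-sharded parallel linear), out_features still reports the full logical output size while weight.size(0) only reflects the local shard. A minimal sketch with a hypothetical sharded layer (not the library's class):

import torch
from torch import nn

class ShardedLinear(nn.Module):
    """Hypothetical column-sharded linear: each rank stores out_features // tp_size rows."""

    def __init__(self, in_features: int, out_features: int, tp_size: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features  # full, logical output size
        self.weight = nn.Parameter(torch.empty(out_features // tp_size, in_features))

query_linear = ShardedLinear(in_features=4096, out_features=4096, tp_size=8)

print(query_linear.weight.size(0))  # 512  -> size of the local shard only
print(query_linear.out_features)    # 4096 -> the value passed on to GQAQKVColumnParallelLinear in the diff above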