Use AWS Neuron SDK 2.21 #754

Open
Wants to merge 18 commits into base: main
3 changes: 2 additions & 1 deletion .github/workflows/inference_cache_llm.yml
@@ -39,7 +39,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
3 changes: 2 additions & 1 deletion .github/workflows/inference_cache_stable_diffusion.yml
@@ -29,7 +29,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_export.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_full_export.yml
@@ -30,7 +30,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_inference.yml
@@ -32,7 +32,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_inf2_tgi.yml
@@ -34,7 +34,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_common.yml
@@ -34,7 +34,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_distributed.yml
@@ -33,7 +33,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
3 changes: 2 additions & 1 deletion .github/workflows/test_trainium_examples.yml
@@ -41,7 +41,8 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
+          sudo apt-get install aws-neuronx-tools=2.20.204.0 aws-neuronx-runtime-lib=2.23.110.0-9b5179492 aws-neuronx-collectives=2.23.133.0-3e70920f2 -y
+          dpkg -l | grep neuron
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Install cv2 dependencies
         run: |
8 changes: 7 additions & 1 deletion optimum/neuron/distributed/base.py
@@ -484,7 +484,7 @@ def initialize(mod: GQAQKVColumnParallelLinear, proj_name: str, output_size: int
             else:
                 # TODO: change kv heads.
                 maybe_load_linear_weight_to_gqa_qkv_column_parallel_linear(
-                    mod, f"weight_{proj_name}", linear_layer=fake_linear_mod
+                    mod, proj_name, f"weight_{proj_name}", linear_layer=fake_linear_mod
                 )
             del fake_linear_mod

@@ -678,6 +678,9 @@ def should_parallelize_layer_predicate_func(layer):
             "num_attention_heads": None,
             "num_key_value_heads": None,
             "kv_size_multiplier": None,
+            "fuse_qkv": None,
+            "q_output_size_per_partition": None,
+            "kv_output_size_per_partition": None,
         }
         for mod in model.modules():
             if isinstance(mod, OptimumGQAQKVColumnParallelLinear):
@@ -690,6 +693,9 @@ def should_parallelize_layer_predicate_func(layer):
                 "num_attention_heads": num_attention_heads,
                 "num_key_value_heads": num_key_value_heads,
                 "kv_size_multiplier": kv_size_multiplier,
+                "fuse_qkv": mod.fuse_qkv,
+                "q_output_size_per_partition": mod.q_output_size_per_partition,
+                "kv_output_size_per_partition": mod.kv_output_size_per_partition,
             }
             break
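For reference, a minimal sketch of how the two new metadata entries, q_output_size_per_partition and kv_output_size_per_partition, can be related to the model and tensor-parallel configuration. The helper below is illustrative only (it is not part of the PR) and assumes the usual GQA layout in which key/value heads are replicated kv_size_multiplier times so that they divide evenly across tensor-parallel ranks:

# Hypothetical helper, for illustration only: derives the per-rank output sizes that the
# new "q_output_size_per_partition" / "kv_output_size_per_partition" keys are meant to record.
def expected_partition_sizes(
    hidden_size: int,
    num_attention_heads: int,
    num_key_value_heads: int,
    kv_size_multiplier: int,
    tp_size: int,
) -> tuple[int, int]:
    head_dim = hidden_size // num_attention_heads
    q_output_size_per_partition = (num_attention_heads * head_dim) // tp_size
    kv_output_size_per_partition = (num_key_value_heads * kv_size_multiplier * head_dim) // tp_size
    return q_output_size_per_partition, kv_output_size_per_partition

# Example with Llama-3-8B-like shapes, tp_size=8 and kv_size_multiplier=2:
print(expected_partition_sizes(4096, 32, 8, 2, 8))  # (512, 256)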
93 changes: 60 additions & 33 deletions optimum/neuron/distributed/checkpointing.py
@@ -134,46 +134,73 @@ def consolidate_tensor_parallel_checkpoints(
     for name in parameter_names:
         # We need to handle the mapping between the GQA parameter names and the original names.
         is_gqa_qkv_weight = name in gqa_qkv_names_to_original_names
+        is_fuse_qkv = gqa_qkv_metadata["fuse_qkv"]
         if is_gqa_qkv_weight:
-            original_name = gqa_qkv_names_to_original_names[name]
-            weight_name = name.rsplit(".", maxsplit=1)[1]
+            if is_fuse_qkv:
+                original_names = [k for k, v in original_parameter_names_to_gqa_qkv_names.items() if v == name]
+                weight_names = [name.rsplit(".", maxsplit=1)[1] for name in original_names]
+                weight_names = ["weight_q", "weight_k", "weight_v"]
+            else:
+                original_names = [gqa_qkv_names_to_original_names[name]]
+                weight_names = [name.rsplit(".", maxsplit=1)[1]]
         else:
-            original_name = name
-            weight_name = ""  # Not needed.
+            original_names = [name]
+            weight_names = [""]  # Not needed.

         # For now all parameter metadatas are equal so it is enough to take the first element.
         # This might not be the case anymore when `ParameterMetadata` uses slices.
         sharded_metadata = sharded_metadatas[name]
-        if sharded_metadata.is_tied:
-            consolidated_state_dict[original_name] = state_dicts[0][name].to("cpu").contiguous()
-        else:
-            # Ensure that all tensors are contiguous before concatenating or further processing
-            weights = [state_dict[name].contiguous() for state_dict in state_dicts]
-            tp_size = len(weights)
-
-            full_weight = (
-                torch.cat(
-                    weights,
-                    dim=sharded_metadata.partition_dim,
-                )
-                .to("cpu")
-                .contiguous()
-            )  # Ensure the result is also contiguous
-
-            if weight_name in ["weight_k", "weight_v", "bias_k", "bias_v"]:
+        for original_name, weight_name in zip(original_names, weight_names):
+            if sharded_metadata.is_tied:
+                consolidated_state_dict[original_name] = state_dicts[0][name].to("cpu").contiguous()
+            else:
+                if is_fuse_qkv:
+                    if weight_name == "weight_q":
+                        s = slice(0, gqa_qkv_metadata["q_output_size_per_partition"])
+                    elif weight_name == "weight_k":
+                        s = slice(
+                            gqa_qkv_metadata["q_output_size_per_partition"],
+                            gqa_qkv_metadata["q_output_size_per_partition"]
+                            + gqa_qkv_metadata["kv_output_size_per_partition"],
+                        )
+                    elif weight_name == "weight_v":
+                        s = slice(
+                            gqa_qkv_metadata["q_output_size_per_partition"]
+                            + gqa_qkv_metadata["kv_output_size_per_partition"],
+                            None,
+                        )
+                    else:
+                        s = slice(None, None)
+                else:
+                    s = slice(None, None)
+
+                # Ensure that all tensors are contiguous before concatenating or further processing
+                weights = [state_dict[name][s].contiguous() for state_dict in state_dicts]
+                tp_size = len(weights)
+
                 full_weight = (
-                    torch.chunk(full_weight, gqa_qkv_metadata["kv_size_multiplier"], dim=0)[0].detach().clone()
-                )
-            elif weight_name == "weight_q" or original_name in gqa_qkv_output_projections_names:
-                full_weight = create_gqa_query_or_output_projection_weight_from_full_weight(
-                    full_weight,
-                    tp_size,
-                    gqa_qkv_metadata["num_attention_heads"],
-                    gqa_qkv_metadata["num_key_value_heads"],
-                    gqa_qkv_metadata["kv_size_multiplier"],
-                    "query" if weight_name == "weight_q" else "output",
-                )
-            consolidated_state_dict[original_name] = full_weight
+                    torch.cat(
+                        weights,
+                        dim=sharded_metadata.partition_dim,
+                    )
+                    .to("cpu")
+                    .contiguous()
+                )  # Ensure the result is also contiguous
+
+                if weight_name in ["weight_k", "weight_v", "bias_k", "bias_v"]:
+                    full_weight = (
+                        torch.chunk(full_weight, gqa_qkv_metadata["kv_size_multiplier"], dim=0)[0].detach().clone()
+                    )
+                elif weight_name == "weight_q" or original_name in gqa_qkv_output_projections_names:
+                    full_weight = create_gqa_query_or_output_projection_weight_from_full_weight(
+                        full_weight,
+                        tp_size,
+                        gqa_qkv_metadata["num_attention_heads"],
+                        gqa_qkv_metadata["num_key_value_heads"],
+                        gqa_qkv_metadata["kv_size_multiplier"],
+                        "query" if weight_name == "weight_q" else "output",
+                    )
+                consolidated_state_dict[original_name] = full_weight

     return consolidated_state_dict
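To make the fused-QKV branch above more concrete, here is a small self-contained sketch of the consolidation idea: with fuse_qkv enabled, each tensor-parallel shard stores its Q, K and V blocks stacked along dim 0, so the code slices the same block out of every shard and only then concatenates across ranks. The shapes and variable names below are illustrative and not taken from the library:

import torch

# Illustrative shard layout: each TP rank holds [Q_part; K_part; V_part] stacked on dim 0.
q_out, kv_out = 512, 256            # per-partition sizes, as recorded in gqa_qkv_metadata (assumed)
hidden_size, tp_size = 4096, 2

shards = [torch.randn(q_out + 2 * kv_out, hidden_size) for _ in range(tp_size)]

slices = {
    "weight_q": slice(0, q_out),
    "weight_k": slice(q_out, q_out + kv_out),
    "weight_v": slice(q_out + kv_out, None),
}

consolidated = {}
for weight_name, s in slices.items():
    # Cut the same block out of every shard, then concatenate along the partitioned dim (0 here).
    consolidated[weight_name] = torch.cat([shard[s] for shard in shards], dim=0).contiguous()

print({k: tuple(v.shape) for k, v in consolidated.items()})
# {'weight_q': (1024, 4096), 'weight_k': (512, 4096), 'weight_v': (512, 4096)}

When fuse_qkv is disabled, the new code sets s = slice(None, None), so each parameter is consolidated exactly as before.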
6 changes: 3 additions & 3 deletions optimum/neuron/distributed/parallel_layers.py
@@ -379,8 +379,8 @@ def replace_qkv_by_gqa_qkv_column_parallel_linear(
         key_linear = getattr(attention_layer, cls.KEYS_NAME)

         hidden_size = query_linear.weight.size(1)
-        query_in_features = query_linear.weight.size(0)
-        key_value_in_features = key_linear.weight.size(0)
+        query_out_features = query_linear.out_features
+        key_value_out_features = key_linear.out_features

         if kv_size_multiplier is None:
             kv_size_multiplier = get_tensor_model_parallel_size() // num_key_value_heads
@@ -397,7 +397,7 @@ def replace_qkv_by_gqa_qkv_column_parallel_linear(
             num_attention_heads,
             num_key_value_heads,
             hidden_size,
-            [query_in_features, key_value_in_features],
+            [query_out_features, key_value_out_features],
             gather_output=False,
             bias=query_linear.bias is not None,
             sequence_parallel_enabled=sequence_parallel_enabled,
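On the weight.size(0) → out_features change: for a plain torch.nn.Linear the two values are identical (the weight is stored as (out_features, in_features)), so the rename mainly corrects the misleading *_in_features variable names; for a layer whose weight is not fully materialized locally (for example a column-sharded parallel linear), out_features still reports the full logical output size while weight.size(0) only reflects the local shard. A minimal sketch with a hypothetical sharded layer (not the library's class):

import torch
from torch import nn

class ShardedLinear(nn.Module):
    """Hypothetical column-sharded linear: each rank stores out_features // tp_size rows."""

    def __init__(self, in_features: int, out_features: int, tp_size: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features  # full, logical output size
        self.weight = nn.Parameter(torch.empty(out_features // tp_size, in_features))

query_linear = ShardedLinear(in_features=4096, out_features=4096, tp_size=8)

print(query_linear.weight.size(0))  # 512  -> size of the local shard only
print(query_linear.out_features)    # 4096 -> the value passed on to GQAQKVColumnParallelLinear in the diff above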