From 35e06900f0951e375911138c0d8df21536246e8e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:21:53 +0000 Subject: [PATCH 01/11] doc: update installation instructions To allow optimum-neuron's transformers dependency to be updated independently of optimum releases, we now recommend a direct installation of optimum-neuron instead of installing it as an optimum extra. --- README.md | 4 ++-- docs/source/guides/export_model.mdx | 4 ++-- docs/source/inference_tutorials/stable_diffusion.mdx | 4 ++-- docs/source/installation.mdx | 4 ++-- notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb | 2 +- notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb | 2 +- optimum/exporters/neuron/base.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index dbc9b0a26..6ce564ed9 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,13 @@ To install the latest release of this package: * For AWS Trainium (trn1) or AWS inferentia2 (inf2) ```bash -pip install --upgrade-strategy eager optimum[neuronx] +pip install --upgrade-strategy eager optimum-neuron[neuronx] ``` * For AWS inferentia (inf1) ```bash -pip install --upgrade-strategy eager optimum[neuron] +pip install --upgrade-strategy eager optimum-neuron[neuron] ``` Optimum Neuron is a fast-moving project, and you may want to install it from source: diff --git a/docs/source/guides/export_model.mdx b/docs/source/guides/export_model.mdx index 3741aad1d..bb98a090d 100644 --- a/docs/source/guides/export_model.mdx +++ b/docs/source/guides/export_model.mdx @@ -65,13 +65,13 @@ To export a 🤗 Transformers model to Neuron, you'll first need to install some **For Inf2** ```bash -pip install optimum[neuronx] +pip install optimum-neuron[neuronx] ``` **For Inf1** ```bash -pip install optimum[neuron] +pip install optimum-neuron[neuron] ``` The Optimum Neuron export can be used through Optimum command-line: diff --git a/docs/source/inference_tutorials/stable_diffusion.mdx b/docs/source/inference_tutorials/stable_diffusion.mdx index f42e9340e..9245406e0 100644 --- a/docs/source/inference_tutorials/stable_diffusion.mdx +++ b/docs/source/inference_tutorials/stable_diffusion.mdx @@ -25,7 +25,7 @@ limitations under the License. To get started, make sure you have [configured your inf2 / trn1 instance](../installation), and installed optimum: ```bash -pip install "optimum[neuronx, diffusers]" +pip install optimum-neuron[neuronx] diffusers ``` ### Compile Stable Diffusion @@ -585,7 +585,7 @@ pipe.save_pretrained("sd_neuron_controlnet") ### Text-to-Image -For text-to-image, we can specify an additional conditioning input. +For text-to-image, we can specify an additional conditioning input. Here is an example with a canny image, a white outline of an image on a black background. The ControlNet will use the canny image as a control to guide the model to generate an image with the same outline. 
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 1758f1a72..f5e368d13 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -29,11 +29,11 @@ python -m pip config set global.extra-index-url https://pip.repos.neuron.amazona ## Installing `optimum-neuron` for AWS Trainium (`trn1`) or AWS inferentia2 (`inf2`) ```bash -python -m pip install --upgrade-strategy eager optimum[neuronx] +python -m pip install --upgrade-strategy eager optimum-neuron[neuronx] ``` ## Installing `optimum-neuron` for AWS inferentia (`inf1`) ```bash -python -m pip install --upgrade-strategy eager optimum[neuron] +python -m pip install --upgrade-strategy eager optimum-neuron[neuron] ``` diff --git a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb index 775fd254c..364c4a593 100644 --- a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"optimum[neuronx, diffusers]\" matplotlib" + "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib" ] }, { diff --git a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb index c8fafda7c..1529c601d 100644 --- a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"optimum[neuronx, diffusers]\" matplotlib" + "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib" ] }, { diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 62e0d91a8..3e7c4740c 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -458,7 +458,7 @@ class NeuronDecoderConfig(NeuronConfig): def __init__(self, task: str): if not is_transformers_neuronx_available(): raise ModuleNotFoundError( - "The mandatory transformers-neuronx package is missing. Please install optimum[neuronx]." + "The mandatory transformers-neuronx package is missing. Please install optimum-neuron[neuronx]." 
) if isinstance(self.NEURONX_CLASS, type): self._neuronx_class = self.NEURONX_CLASS From 24bc42f66396a44b14eacb1f490b58caca44fc19 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:25:26 +0000 Subject: [PATCH 02/11] feat: bump transformers version --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 71f287de9..3a3893fd2 100644 --- a/setup.py +++ b/setup.py @@ -13,10 +13,10 @@ INSTALL_REQUIRES = [ - "transformers == 4.46.2", + "transformers ~= 4.48.1", "accelerate == 0.29.2", - "optimum ~= 1.23.0", - "huggingface_hub >= 0.20.1", + "optimum ~= 1.23.3", + "huggingface_hub >= 0.28.0", "numpy>=1.22.2, <=1.25.2", "protobuf>=3.20.3, <4", ] From 5735e7b12e3e5d2334b3fce5515b66b9a64a8743 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:58:43 +0000 Subject: [PATCH 03/11] refactor(training): shard_checkpoint does not exist anymore --- optimum/neuron/distributed/checkpointing.py | 29 ++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/optimum/neuron/distributed/checkpointing.py b/optimum/neuron/distributed/checkpointing.py index 19075afd1..372ed1ff9 100644 --- a/optimum/neuron/distributed/checkpointing.py +++ b/optimum/neuron/distributed/checkpointing.py @@ -20,7 +20,7 @@ from typing import Any, Callable, Dict, List, Literal, Union import torch -from transformers.modeling_utils import shard_checkpoint +from huggingface_hub import split_torch_state_dict_into_shards from transformers.utils import ( SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, @@ -255,16 +255,27 @@ def consolidate_model_parallel_checkpoints_to_unified_checkpoint( output_dir.mkdir(parents=True, exist_ok=True) state_dict = consolidate_model_parallel_checkpoints(checkpoint_dir) - shards, index = shard_checkpoint( - state_dict, weights_name=safe_weights_name if save_format == "safetensors" else weights_name + state_dict_split = split_torch_state_dict_into_shards( + state_dict, filename_pattern=safe_weights_name if save_format == "safetensors" else weights_name ) - for shard_file, shard in shards.items(): - if save_format == "safetensors": - save_file(shard, output_dir / shard_file, metadata={"format": "pt"}) - else: - torch.save(shard, output_dir / shard_file) - if index is not None: + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } save_index_file = SAFE_WEIGHTS_INDEX_NAME if save_format == "safetensors" else WEIGHTS_INDEX_NAME with open(output_dir / save_index_file, "w") as fp: content = json.dumps(index, indent=2, sort_keys=True) + "\n" fp.write(content) + # Save the model + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in filename_to_tensors: + shard = {} + for tensor in tensors: + shard[tensor] = state_dict[tensor].contiguous() + del state_dict[tensor] + if save_format == "safetensors": + save_file(shard, output_dir / shard_file, metadata={"format": "pt"}) + else: + torch.save(shard, output_dir / shard_file) From e5b2cfeca836bdc990d91c2c7b5ae73ae1d04436 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 10:11:40 +0000 Subject: [PATCH 04/11] chore: bump dev version --- optimum/neuron/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index e0ce222fe..f9326a281 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 
+12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.0.28.dev0" +__version__ = "0.0.28.dev1" __sdk_version__ = "2.20.2" From 3a7d0235919e070361e22fa965900a0348d6d8e1 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 10:47:28 +0000 Subject: [PATCH 05/11] chore(ami): use AWS Neuron SDK 2.20.2 base image --- infrastructure/ami/hcl2-files/variables.pkr.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/ami/hcl2-files/variables.pkr.hcl b/infrastructure/ami/hcl2-files/variables.pkr.hcl index f833dc17e..53e19353a 100644 --- a/infrastructure/ami/hcl2-files/variables.pkr.hcl +++ b/infrastructure/ami/hcl2-files/variables.pkr.hcl @@ -10,7 +10,7 @@ variable "instance_type" { } variable "source_ami" { - default = "ami-0980ce83654efe544" + default = "ami-034a7ef9c22c72085" description = "Base Image" type = string /* From 50a82292ac54f602045a7a35192647be799fbac5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 13:01:48 +0000 Subject: [PATCH 06/11] fix(ami): use new AWS venv path --- infrastructure/ami/hcl2-files/build.pkr.hcl | 2 +- infrastructure/ami/scripts/install-huggingface-libraries.sh | 6 +++--- infrastructure/ami/scripts/validate-neuron.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/infrastructure/ami/hcl2-files/build.pkr.hcl b/infrastructure/ami/hcl2-files/build.pkr.hcl index e412ca2e8..cda8df5bd 100644 --- a/infrastructure/ami/hcl2-files/build.pkr.hcl +++ b/infrastructure/ami/hcl2-files/build.pkr.hcl @@ -14,7 +14,7 @@ build { ] } provisioner "shell" { - inline = ["echo 'source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"] + inline = ["echo 'source /opt/aws_neuronx_venv_pytorch/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"] } provisioner "file" { source = "scripts/welcome-msg.sh" diff --git a/infrastructure/ami/scripts/install-huggingface-libraries.sh b/infrastructure/ami/scripts/install-huggingface-libraries.sh index ead353e75..bf9cc36fa 100644 --- a/infrastructure/ami/scripts/install-huggingface-libraries.sh +++ b/infrastructure/ami/scripts/install-huggingface-libraries.sh @@ -1,7 +1,7 @@ #!/bin/bash # Activate the neuron virtual environment -source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate +source /opt/aws_neuronx_venv_pytorch/bin/activate echo "Step: install-hugging-face-libraries" @@ -15,12 +15,12 @@ pip install --upgrade --no-cache-dir \ "markupsafe==2.1.1" \ "jinja2==3.1.2" \ "attrs==23.1.0" \ - "hf_transfer>=0.1.4" + "hf_transfer>=0.1.4" # Temporary fix for the issue: https://github.com/huggingface/optimum-neuron/issues/142 pip install -U optimum echo 'export PATH="${HOME}/.local/bin:$PATH"' >> "${HOME}/.bashrc" -# Add HF_TRANSFER by default +# Add HF_TRANSFER by default echo 'export HF_HUB_ENABLE_HF_TRANSFER=1' >> "${HOME}/.bashrc" echo "Step: install-and-copy-optimum-neuron-examples" diff --git a/infrastructure/ami/scripts/validate-neuron.sh b/infrastructure/ami/scripts/validate-neuron.sh index 5d8c99109..c3687ec16 100644 --- a/infrastructure/ami/scripts/validate-neuron.sh +++ b/infrastructure/ami/scripts/validate-neuron.sh @@ -3,7 +3,7 @@ echo "Step: validate-neuron-devices" neuron-ls # Activate the neuron virtual environment -source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate +source /opt/aws_neuronx_venv_pytorch/bin/activate python -c 'import torch' python -c 'import torch_neuronx' From 048447226778866f234a53f7e883d40491149f29 Mon Sep 17 00:00:00 2001 From: 
David Corvoysier Date: Wed, 29 Jan 2025 13:35:31 +0000 Subject: [PATCH 07/11] fix(tests): set minimum timm version This is because otherwise controlnet-aux will pull an extremely old version of timm. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3a3893fd2..effc0de20 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ "opencv-python-headless", "controlnet-aux", "mediapipe", + "timm >= 1.0.0", ] QUALITY_REQUIRES = [ From 172fbc54f3b2495ca76ea14be308960c0147026f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 16:32:10 +0000 Subject: [PATCH 08/11] fix: add legacy attributes to Llama attention The parallelization code expects these parameters to be set. A proper fix would be to write a specific Llama parallel model. --- optimum/neuron/distributed/decoder_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 1e8add2fa..109feb1fd 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -588,6 +588,10 @@ def _parallelize( layers = model.model.layers for layer in layers: + # FIXME: temporary workaround to avoid too many changes in the transformation code + layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads + layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads + layer.self_attn.hidden_size = layer.self_attn.config.hidden_size layer.self_attn = LlamaParallelSelfAttention.transform( model, layer.self_attn, From 53f7fe4882ada2e554391a6fc97e325f4237cfb3 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 08:50:32 +0000 Subject: [PATCH 09/11] fix(distributed): align Llama attention forward --- optimum/neuron/distributed/decoder_models.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 109feb1fd..a8a8f4f1e 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -441,12 +441,11 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if self.config.pretraining_tp > 1: @@ -489,8 +488,8 @@ def attention_forward( value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache @@ -539,7 +538,7 @@ def attention_forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return 
attn_output, attn_weights for module in model.modules(): if isinstance(module, LlamaAttention): From c51652a424e72fab39ff7ba61a83f4572c57cff3 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 09:16:11 +0000 Subject: [PATCH 10/11] fix(distributed): add legacy attributes to Mistral attention The transformation code expects some attributes to be set. --- optimum/neuron/distributed/decoder_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index a8a8f4f1e..629a070e9 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -834,6 +834,10 @@ def _parallelize( **parallel_layer_specific_kwargs, ) for layer in model.model.layers: + # FIXME: temporary workaround to avoid too many changes in the transformation code + layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads + layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads + layer.self_attn.hidden_size = layer.self_attn.config.hidden_size layer.self_attn = MistralParallelSelfAttention.transform( model, layer.self_attn, From 2fc13fc89c21091bc3132e08b19ad190f3ce65d0 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 09:14:23 +0000 Subject: [PATCH 11/11] fix(distributed): align mistral attention forward --- optimum/neuron/distributed/decoder_models.py | 25 ++++---------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 629a070e9..431c2a264 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -720,12 +720,10 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -748,12 +746,8 @@ def attention_forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += cache_position[0] - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models @@ -767,18 +761,7 @@ def attention_forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, 
q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask # upcast attention to fp32 @@ -804,7 +787,7 @@ def attention_forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return attn_output, attn_weights for module in model.modules(): if isinstance(module, MistralAttention):
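A note on PATCH 03: `shard_checkpoint` was removed from `transformers.modeling_utils`, so the consolidation code now builds on `huggingface_hub.split_torch_state_dict_into_shards`. The sketch below shows the same save flow in isolation, as a hedged illustration rather than the exact optimum-neuron code: it covers only the safetensors path, and the `save_sharded` helper and `output_dir` argument are illustrative names that do not appear in the patch.

```python
import json
from pathlib import Path

import torch
from huggingface_hub import split_torch_state_dict_into_shards
from safetensors.torch import save_file


def save_sharded(state_dict: dict[str, torch.Tensor], output_dir: Path) -> None:
    # Illustrative helper (not part of the patch): persist a consolidated
    # state dict as one or more safetensors files.
    output_dir.mkdir(parents=True, exist_ok=True)
    # Plan the shards; "{suffix}" is expanded to e.g. "-00001-of-00002".
    split = split_torch_state_dict_into_shards(
        state_dict, filename_pattern="model{suffix}.safetensors"
    )
    # When the checkpoint spans several files, write the index that maps
    # each tensor name to the shard file containing it.
    if split.is_sharded:
        index = {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
        (output_dir / "model.safetensors.index.json").write_text(
            json.dumps(index, indent=2, sort_keys=True) + "\n"
        )
    # Write each shard, dropping saved tensors from the source dict to
    # bound peak host memory, mirroring the deletion done in the patch.
    for shard_file, tensor_names in split.filename_to_tensors.items():
        shard = {name: state_dict[name].contiguous() for name in tensor_names}
        save_file(shard, output_dir / shard_file, metadata={"format": "pt"})
        for name in tensor_names:
            del state_dict[name]
```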