From 35e06900f0951e375911138c0d8df21536246e8e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:21:53 +0000 Subject: [PATCH 01/11] doc: update installation instructions To allow optimum-neuron's transformers dependency to be updated independently of optimum releases, we now recommend a direct installation of optimum-neuron instead of installing it as an optimum extra. --- README.md | 4 ++-- docs/source/guides/export_model.mdx | 4 ++-- docs/source/inference_tutorials/stable_diffusion.mdx | 4 ++-- docs/source/installation.mdx | 4 ++-- notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb | 2 +- notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb | 2 +- optimum/exporters/neuron/base.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index dbc9b0a26..6ce564ed9 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,13 @@ To install the latest release of this package: * For AWS Trainium (trn1) or AWS inferentia2 (inf2) ```bash -pip install --upgrade-strategy eager optimum[neuronx] +pip install --upgrade-strategy eager optimum-neuron[neuronx] ``` * For AWS inferentia (inf1) ```bash -pip install --upgrade-strategy eager optimum[neuron] +pip install --upgrade-strategy eager optimum-neuron[neuron] ``` Optimum Neuron is a fast-moving project, and you may want to install it from source: diff --git a/docs/source/guides/export_model.mdx b/docs/source/guides/export_model.mdx index 3741aad1d..bb98a090d 100644 --- a/docs/source/guides/export_model.mdx +++ b/docs/source/guides/export_model.mdx @@ -65,13 +65,13 @@ To export a 🤗 Transformers model to Neuron, you'll first need to install some **For Inf2** ```bash -pip install optimum[neuronx] +pip install optimum-neuron[neuronx] ``` **For Inf1** ```bash -pip install optimum[neuron] +pip install optimum-neuron[neuron] ``` The Optimum Neuron export can be used through Optimum command-line: diff --git a/docs/source/inference_tutorials/stable_diffusion.mdx b/docs/source/inference_tutorials/stable_diffusion.mdx index f42e9340e..9245406e0 100644 --- a/docs/source/inference_tutorials/stable_diffusion.mdx +++ b/docs/source/inference_tutorials/stable_diffusion.mdx @@ -25,7 +25,7 @@ limitations under the License. To get started, make sure you have [configured your inf2 / trn1 instance](../installation), and installed optimum: ```bash -pip install "optimum[neuronx, diffusers]" +pip install optimum-neuron[neuronx] diffusers ``` ### Compile Stable Diffusion @@ -585,7 +585,7 @@ pipe.save_pretrained("sd_neuron_controlnet") ### Text-to-Image -For text-to-image, we can specify an additional conditioning input. +For text-to-image, we can specify an additional conditioning input. Here is an example with a canny image, a white outline of an image on a black background. The ControlNet will use the canny image as a control to guide the model to generate an image with the same outline. 
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 1758f1a72..f5e368d13 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -29,11 +29,11 @@ python -m pip config set global.extra-index-url https://pip.repos.neuron.amazona ## Installing `optimum-neuron` for AWS Trainium (`trn1`) or AWS inferentia2 (`inf2`) ```bash -python -m pip install --upgrade-strategy eager optimum[neuronx] +python -m pip install --upgrade-strategy eager optimum-neuron[neuronx] ``` ## Installing `optimum-neuron` for AWS inferentia (`inf1`) ```bash -python -m pip install --upgrade-strategy eager optimum[neuron] +python -m pip install --upgrade-strategy eager optimum-neuron[neuron] ``` diff --git a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb index 775fd254c..364c4a593 100644 --- a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"optimum[neuronx, diffusers]\" matplotlib" + "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib" ] }, { diff --git a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb index c8fafda7c..1529c601d 100644 --- a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"optimum[neuronx, diffusers]\" matplotlib" + "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib" ] }, { diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 62e0d91a8..3e7c4740c 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -458,7 +458,7 @@ class NeuronDecoderConfig(NeuronConfig): def __init__(self, task: str): if not is_transformers_neuronx_available(): raise ModuleNotFoundError( - "The mandatory transformers-neuronx package is missing. Please install optimum[neuronx]." + "The mandatory transformers-neuronx package is missing. Please install optimum-neuron[neuronx]." 
) if isinstance(self.NEURONX_CLASS, type): self._neuronx_class = self.NEURONX_CLASS From 24bc42f66396a44b14eacb1f490b58caca44fc19 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:25:26 +0000 Subject: [PATCH 02/11] feat: bump transformers version --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 71f287de9..3a3893fd2 100644 --- a/setup.py +++ b/setup.py @@ -13,10 +13,10 @@ INSTALL_REQUIRES = [ - "transformers == 4.46.2", + "transformers ~= 4.48.1", "accelerate == 0.29.2", - "optimum ~= 1.23.0", - "huggingface_hub >= 0.20.1", + "optimum ~= 1.23.3", + "huggingface_hub >= 0.28.0", "numpy>=1.22.2, <=1.25.2", "protobuf>=3.20.3, <4", ] From 5735e7b12e3e5d2334b3fce5515b66b9a64a8743 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 09:58:43 +0000 Subject: [PATCH 03/11] refactor(training): shard_checkpoint does not exist anymore --- optimum/neuron/distributed/checkpointing.py | 29 ++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/optimum/neuron/distributed/checkpointing.py b/optimum/neuron/distributed/checkpointing.py index 19075afd1..372ed1ff9 100644 --- a/optimum/neuron/distributed/checkpointing.py +++ b/optimum/neuron/distributed/checkpointing.py @@ -20,7 +20,7 @@ from typing import Any, Callable, Dict, List, Literal, Union import torch -from transformers.modeling_utils import shard_checkpoint +from huggingface_hub import split_torch_state_dict_into_shards from transformers.utils import ( SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, @@ -255,16 +255,27 @@ def consolidate_model_parallel_checkpoints_to_unified_checkpoint( output_dir.mkdir(parents=True, exist_ok=True) state_dict = consolidate_model_parallel_checkpoints(checkpoint_dir) - shards, index = shard_checkpoint( - state_dict, weights_name=safe_weights_name if save_format == "safetensors" else weights_name + state_dict_split = split_torch_state_dict_into_shards( + state_dict, filename_pattern=safe_weights_name if save_format == "safetensors" else weights_name ) - for shard_file, shard in shards.items(): - if save_format == "safetensors": - save_file(shard, output_dir / shard_file, metadata={"format": "pt"}) - else: - torch.save(shard, output_dir / shard_file) - if index is not None: + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } save_index_file = SAFE_WEIGHTS_INDEX_NAME if save_format == "safetensors" else WEIGHTS_INDEX_NAME with open(output_dir / save_index_file, "w") as fp: content = json.dumps(index, indent=2, sort_keys=True) + "\n" fp.write(content) + # Save the model + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in filename_to_tensors: + shard = {} + for tensor in tensors: + shard[tensor] = state_dict[tensor].contiguous() + del state_dict[tensor] + if save_format == "safetensors": + save_file(shard, output_dir / shard_file, metadata={"format": "pt"}) + else: + torch.save(shard, output_dir / shard_file) From e5b2cfeca836bdc990d91c2c7b5ae73ae1d04436 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 10:11:40 +0000 Subject: [PATCH 04/11] chore: bump dev version --- optimum/neuron/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index e0ce222fe..f9326a281 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 
+12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.0.28.dev0" +__version__ = "0.0.28.dev1" __sdk_version__ = "2.20.2" From 3a7d0235919e070361e22fa965900a0348d6d8e1 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 10:47:28 +0000 Subject: [PATCH 05/11] chore(ami): use AWS Neuron SDK 2.20.2 base image --- infrastructure/ami/hcl2-files/variables.pkr.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/ami/hcl2-files/variables.pkr.hcl b/infrastructure/ami/hcl2-files/variables.pkr.hcl index f833dc17e..53e19353a 100644 --- a/infrastructure/ami/hcl2-files/variables.pkr.hcl +++ b/infrastructure/ami/hcl2-files/variables.pkr.hcl @@ -10,7 +10,7 @@ variable "instance_type" { } variable "source_ami" { - default = "ami-0980ce83654efe544" + default = "ami-034a7ef9c22c72085" description = "Base Image" type = string /* From 50a82292ac54f602045a7a35192647be799fbac5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 13:01:48 +0000 Subject: [PATCH 06/11] fix(ami): use new AWS venv path --- infrastructure/ami/hcl2-files/build.pkr.hcl | 2 +- infrastructure/ami/scripts/install-huggingface-libraries.sh | 6 +++--- infrastructure/ami/scripts/validate-neuron.sh | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/infrastructure/ami/hcl2-files/build.pkr.hcl b/infrastructure/ami/hcl2-files/build.pkr.hcl index e412ca2e8..cda8df5bd 100644 --- a/infrastructure/ami/hcl2-files/build.pkr.hcl +++ b/infrastructure/ami/hcl2-files/build.pkr.hcl @@ -14,7 +14,7 @@ build { ] } provisioner "shell" { - inline = ["echo 'source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"] + inline = ["echo 'source /opt/aws_neuronx_venv_pytorch/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"] } provisioner "file" { source = "scripts/welcome-msg.sh" diff --git a/infrastructure/ami/scripts/install-huggingface-libraries.sh b/infrastructure/ami/scripts/install-huggingface-libraries.sh index ead353e75..bf9cc36fa 100644 --- a/infrastructure/ami/scripts/install-huggingface-libraries.sh +++ b/infrastructure/ami/scripts/install-huggingface-libraries.sh @@ -1,7 +1,7 @@ #!/bin/bash # Activate the neuron virtual environment -source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate +source /opt/aws_neuronx_venv_pytorch/bin/activate echo "Step: install-hugging-face-libraries" @@ -15,12 +15,12 @@ pip install --upgrade --no-cache-dir \ "markupsafe==2.1.1" \ "jinja2==3.1.2" \ "attrs==23.1.0" \ - "hf_transfer>=0.1.4" + "hf_transfer>=0.1.4" # Temporary fix for the issue: https://github.com/huggingface/optimum-neuron/issues/142 pip install -U optimum echo 'export PATH="${HOME}/.local/bin:$PATH"' >> "${HOME}/.bashrc" -# Add HF_TRANSFER by default +# Add HF_TRANSFER by default echo 'export HF_HUB_ENABLE_HF_TRANSFER=1' >> "${HOME}/.bashrc" echo "Step: install-and-copy-optimum-neuron-examples" diff --git a/infrastructure/ami/scripts/validate-neuron.sh b/infrastructure/ami/scripts/validate-neuron.sh index 5d8c99109..c3687ec16 100644 --- a/infrastructure/ami/scripts/validate-neuron.sh +++ b/infrastructure/ami/scripts/validate-neuron.sh @@ -3,7 +3,7 @@ echo "Step: validate-neuron-devices" neuron-ls # Activate the neuron virtual environment -source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate +source /opt/aws_neuronx_venv_pytorch/bin/activate python -c 'import torch' python -c 'import torch_neuronx' From 048447226778866f234a53f7e883d40491149f29 Mon Sep 17 00:00:00 2001 From: 
David Corvoysier Date: Wed, 29 Jan 2025 13:35:31 +0000 Subject: [PATCH 07/11] fix(tests): set minimum timm version This is because otherwise controlnet-aux will pull an extremely old version of timm. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3a3893fd2..effc0de20 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ "opencv-python-headless", "controlnet-aux", "mediapipe", + "timm >= 1.0.0", ] QUALITY_REQUIRES = [ From 172fbc54f3b2495ca76ea14be308960c0147026f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 29 Jan 2025 16:32:10 +0000 Subject: [PATCH 08/11] fix: add legacy attributes to Llama attention The parallelization code expects these parameters to be set. A proper fix would be to write a specific Llama parallel model. --- optimum/neuron/distributed/decoder_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 1e8add2fa..109feb1fd 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -588,6 +588,10 @@ def _parallelize( layers = model.model.layers for layer in layers: + # FIXME: temporary workaround to avoid too many changes in the transformation code + layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads + layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads + layer.self_attn.hidden_size = layer.self_attn.config.hidden_size layer.self_attn = LlamaParallelSelfAttention.transform( model, layer.self_attn, From 53f7fe4882ada2e554391a6fc97e325f4237cfb3 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 08:50:32 +0000 Subject: [PATCH 09/11] fix(distributed): align Llama attention forward --- optimum/neuron/distributed/decoder_models.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 109feb1fd..a8a8f4f1e 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -441,12 +441,11 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if self.config.pretraining_tp > 1: @@ -489,8 +488,8 @@ def attention_forward( value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache @@ -539,7 +538,7 @@ def attention_forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return 
attn_output, attn_weights for module in model.modules(): if isinstance(module, LlamaAttention): From c51652a424e72fab39ff7ba61a83f4572c57cff3 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 09:16:11 +0000 Subject: [PATCH 10/11] fix(distributed): add legacy attributes to Mistral attention The transformation code expects some attributes to be set. --- optimum/neuron/distributed/decoder_models.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index a8a8f4f1e..629a070e9 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -834,6 +834,10 @@ def _parallelize( **parallel_layer_specific_kwargs, ) for layer in model.model.layers: + # FIXME: temporary workaround to avoid too many changes in the transformation code + layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads + layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads + layer.self_attn.hidden_size = layer.self_attn.config.hidden_size layer.self_attn = MistralParallelSelfAttention.transform( model, layer.self_attn, From 2fc13fc89c21091bc3132e08b19ad190f3ce65d0 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 30 Jan 2025 09:14:23 +0000 Subject: [PATCH 11/11] fix(distributed): align mistral attention forward --- optimum/neuron/distributed/decoder_models.py | 25 ++++---------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/optimum/neuron/distributed/decoder_models.py b/optimum/neuron/distributed/decoder_models.py index 629a070e9..431c2a264 100644 --- a/optimum/neuron/distributed/decoder_models.py +++ b/optimum/neuron/distributed/decoder_models.py @@ -720,12 +720,10 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral def attention_forward( self, hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) @@ -748,12 +746,8 @@ def attention_forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += cache_position[0] - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models @@ -767,18 +761,7 @@ def attention_forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, 
q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask # upcast attention to fp32 @@ -804,7 +787,7 @@ def attention_forward( if not output_attentions: attn_weights = None - return attn_output, attn_weights, past_key_value + return attn_output, attn_weights for module in model.modules(): if isinstance(module, MistralAttention):
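A note on PATCH 03: `shard_checkpoint` was removed from `transformers.modeling_utils`, so the consolidation code now builds on `huggingface_hub.split_torch_state_dict_into_shards`. The sketch below shows the same save flow in isolation, as a hedged illustration rather than the exact optimum-neuron code: it covers only the safetensors path, and the `save_sharded` helper and `output_dir` argument are illustrative names that do not appear in the patch.

```python
import json
from pathlib import Path

import torch
from huggingface_hub import split_torch_state_dict_into_shards
from safetensors.torch import save_file


def save_sharded(state_dict: dict[str, torch.Tensor], output_dir: Path) -> None:
    # Illustrative helper (not part of the patch): persist a consolidated
    # state dict as one or more safetensors files.
    output_dir.mkdir(parents=True, exist_ok=True)
    # Plan the shards; "{suffix}" is expanded to e.g. "-00001-of-00002".
    split = split_torch_state_dict_into_shards(
        state_dict, filename_pattern="model{suffix}.safetensors"
    )
    # When the checkpoint spans several files, write the index that maps
    # each tensor name to the shard file containing it.
    if split.is_sharded:
        index = {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
        (output_dir / "model.safetensors.index.json").write_text(
            json.dumps(index, indent=2, sort_keys=True) + "\n"
        )
    # Write each shard, dropping saved tensors from the source dict to
    # bound peak host memory, mirroring the deletion done in the patch.
    for shard_file, tensor_names in split.filename_to_tensors.items():
        shard = {name: state_dict[name].contiguous() for name in tensor_names}
        save_file(shard, output_dir / shard_file, metadata={"format": "pt"})
        for name in tensor_names:
            del state_dict[name]
```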