From 7e8a33bb6cd18184270743475b78afb2ae5f14af Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 2 Feb 2024 16:40:54 +0100 Subject: [PATCH 01/12] Fix sdxl inpaint pipeline for diffusers 0.26.* (#458) * chore: update dev version * ci: split inf1 & inf2 workflows * update inpaint pipe for sdxl * fix setup * fix typo * revert trfrs version --------- Co-authored-by: David Corvoysier --- .github/workflows/test_inf1_export.yml | 49 ++++++ ...{test_inf1.yml => test_inf1_inference.yml} | 10 +- .github/workflows/test_inf2.yml | 12 -- .github/workflows/test_inf2_export.yml | 41 +++++ .github/workflows/test_inf2_inference.yml | 45 ++++++ .../pipeline_stable_diffusion_xl_inpaint.py | 145 +++++++++++++++--- optimum/neuron/version.py | 2 +- setup.py | 2 +- 8 files changed, 264 insertions(+), 42 deletions(-) create mode 100644 .github/workflows/test_inf1_export.yml rename .github/workflows/{test_inf1.yml => test_inf1_inference.yml} (77%) create mode 100644 .github/workflows/test_inf2_export.yml create mode 100644 .github/workflows/test_inf2_inference.yml diff --git a/.github/workflows/test_inf1_export.yml b/.github/workflows/test_inf1_export.yml new file mode 100644 index 000000000..be3bf5954 --- /dev/null +++ b/.github/workflows/test_inf1_export.yml @@ -0,0 +1,49 @@ +name: Optimum neuron / Test INF1 export + +on: + push: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + pull_request: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + do-the-job: + name: Run INF1 tests + runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci] + env: + AWS_REGION: us-east-1 + steps: + - name: Check AMI + run: dpkg -l | grep neuron + - name: Checkout + uses: actions/checkout@v2 + - name: Install system packages + run: | + sudo apt install python3.8-venv -y + - name: Install python packages + run: | + python3 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + python -m pip install -U pip + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install .[neuron,tests] + python -m pip uninstall optimum -y + python -m pip install optimum + - name: Run CLI tests + run: | + source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli + - name: Run export tests + run: | + source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters diff --git a/.github/workflows/test_inf1.yml b/.github/workflows/test_inf1_inference.yml similarity index 77% rename from .github/workflows/test_inf1.yml rename to .github/workflows/test_inf1_inference.yml index 780d4df3f..5bdd9571a 100644 --- a/.github/workflows/test_inf1.yml +++ b/.github/workflows/test_inf1_inference.yml @@ -1,4 +1,4 @@ -name: Optimum neuron / Test INF1 +name: Optimum neuron / Test INF1 inference & pipelines on: push: @@ -39,14 +39,6 @@ jobs: python -m pip install .[neuron,tests] python -m pip uninstall optimum -y python -m pip install optimum - - name: Run CLI tests - run: | - source aws_neuron_venv_pytorch/bin/activate - HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli - - name: Run export tests - run: | - source aws_neuron_venv_pytorch/bin/activate - HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m 
is_inferentia_test tests/exporters - name: Run inference tests run: | source aws_neuron_venv_pytorch/bin/activate diff --git a/.github/workflows/test_inf2.yml b/.github/workflows/test_inf2.yml index e1e8b3015..333c0bd08 100644 --- a/.github/workflows/test_inf2.yml +++ b/.github/workflows/test_inf2.yml @@ -47,15 +47,3 @@ jobs: run: | source aws_neuron_venv_pytorch/bin/activate HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/generation - - name: Run exporters tests - run: | - source aws_neuron_venv_pytorch/bin/activate - HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters - - name: Run inference tests - run: | - source aws_neuron_venv_pytorch/bin/activate - HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/inference - - name: Run pipelines tests - run: | - source aws_neuron_venv_pytorch/bin/activate - HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines diff --git a/.github/workflows/test_inf2_export.yml b/.github/workflows/test_inf2_export.yml new file mode 100644 index 000000000..796b0933a --- /dev/null +++ b/.github/workflows/test_inf2_export.yml @@ -0,0 +1,41 @@ +name: Optimum neuron / Test INF2 export + +on: + push: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + pull_request: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + do-the-job: + name: Run INF2 tests + runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner + env: + AWS_REGION: us-east-1 + steps: + - name: Check AMI + run: dpkg -l | grep neuron + - name: Checkout + uses: actions/checkout@v2 + - name: Install python dependencies + run: | + sudo apt install python3.8-venv -y + python3 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + python -m pip install -U pip + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install .[neuronx,tests] + - name: Run exporters tests + run: | + source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters diff --git a/.github/workflows/test_inf2_inference.yml b/.github/workflows/test_inf2_inference.yml new file mode 100644 index 000000000..a70f44599 --- /dev/null +++ b/.github/workflows/test_inf2_inference.yml @@ -0,0 +1,45 @@ +name: Optimum neuron / Test INF2 inference & pipelines + +on: + push: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + pull_request: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + do-the-job: + name: Run INF2 tests + runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner + env: + AWS_REGION: us-east-1 + steps: + - name: Check AMI + run: dpkg -l | grep neuron + - name: Checkout + uses: actions/checkout@v2 + - name: Install python dependencies + run: | + sudo apt install python3.8-venv -y + python3 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + python -m pip install -U pip + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + python -m pip install .[neuronx,tests] + - name: Run inference tests + run: | + 
source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/inference + - name: Run pipelines tests + run: | + source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines diff --git a/optimum/neuron/pipelines/diffusers/pipeline_stable_diffusion_xl_inpaint.py b/optimum/neuron/pipelines/diffusers/pipeline_stable_diffusion_xl_inpaint.py index c796c35ae..03198e43c 100644 --- a/optimum/neuron/pipelines/diffusers/pipeline_stable_diffusion_xl_inpaint.py +++ b/optimum/neuron/pipelines/diffusers/pipeline_stable_diffusion_xl_inpaint.py @@ -19,9 +19,13 @@ import torch from diffusers import StableDiffusionXLInpaintPipeline -from diffusers.image_processor import VaeImageProcessor +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput -from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import rescale_noise_cfg +from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import ( + rescale_noise_cfg, + retrieve_timesteps, +) +from diffusers.utils import deprecate from .pipeline_utils import StableDiffusionXLPipelineMixin @@ -80,8 +84,10 @@ def __call__( image: Optional["PipelineImageInput"] = None, mask_image: Optional["PipelineImageInput"] = None, masked_image_latents: Optional[torch.FloatTensor] = None, + padding_mask_crop: Optional[int] = None, strength: float = 0.9999, num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, denoising_start: Optional[float] = None, denoising_end: Optional[float] = None, guidance_scale: float = 7.5, @@ -95,10 +101,9 @@ def __call__( negative_prompt_embeds: Optional[torch.FloatTensor] = None, pooled_prompt_embeds: Optional[torch.FloatTensor] = None, negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, original_size: Tuple[int, int] = None, @@ -110,6 +115,9 @@ def __call__( aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" Function invoked when calling the pipeline for generation. @@ -129,6 +137,12 @@ def __call__( repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + padding_mask_crop (`Optional[int]`, defaults to `None`): + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If + `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and + contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on + the expanded area before resizing to the original image size for inpainting. 
This is useful when the masked area is small while the image is large + and contain information inreleant for inpainging, such as background. strength (`float`, defaults to 0.9999): Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the @@ -140,6 +154,10 @@ def __call__( num_inference_steps (`int`, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. + timesteps (`Optional[List[int]]`, defaults to `None`): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. denoising_start (`Optional[float]`, defaults to `None`): When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and @@ -182,6 +200,7 @@ def __call__( Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` input argument. + ip_adapter_image: (`Optional[PipelineImageInput]`, defaults to `None`): Optional image input to work with IP Adapters. num_images_per_prompt (`int`, defaults to 1): The number of images to generate per prompt. eta (`float`, defaults to 0.0): @@ -200,12 +219,6 @@ def __call__( return_dict (`bool`, defaults to `True`): Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a plain tuple. - callback (`Optional[Callable]`, defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. cross_attention_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in @@ -250,6 +263,15 @@ def __call__( clip_skip (`Optional[int]`, defaults to `None`): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Optional[Callable[[int, int, Dict], None]]`, defaults to `None`): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List[str]`, defaults to ["latents"]): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
Examples: @@ -279,6 +301,22 @@ def __call__( [`diffusers.pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. `tuple. When returning a tuple, the first element is a list with the generated images. """ + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + # -1. Check `num_images_per_prompt` if self.num_images_per_prompt != num_images_per_prompt and not self.dynamic_batch_size: logger.warning( @@ -295,16 +333,29 @@ def __call__( self.check_inputs( prompt, prompt_2, + image, + mask_image, height, width, strength, callback_steps, + output_type, negative_prompt, negative_prompt_2, prompt_embeds, negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + padding_mask_crop, ) + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._denoising_start = denoising_start + self._interrupt = False + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -324,7 +375,7 @@ def __call__( # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None ) ( @@ -344,13 +395,14 @@ def __call__( pooled_prompt_embeds=pooled_prompt_embeds, negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, ) # 4. set timesteps def denoising_value_valid(dnv): return isinstance(denoising_end, float) and 0 < dnv < 1 - self.scheduler.set_timesteps(num_inference_steps, device=None) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, None, timesteps) timesteps, num_inference_steps = self.get_timesteps( num_inference_steps, strength, None, denoising_start=denoising_start if denoising_value_valid else None ) @@ -366,13 +418,25 @@ def denoising_value_valid(dnv): is_strength_max = strength == 1.0 # 5. 
Preprocess mask and image - init_image = self.image_processor.preprocess(image, height=height, width=width) + if padding_mask_crop is not None: + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ) init_image = init_image.to(dtype=torch.float32) self.mask_processor = VaeImageProcessor( vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True ) - mask = self.mask_processor.preprocess(mask_image, height=height, width=width) + mask = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) if masked_image_latents is not None: masked_image = masked_image_latents @@ -481,6 +545,11 @@ def denoising_value_valid(dnv): add_text_embeds = add_text_embeds add_time_ids = add_time_ids + if ip_adapter_image is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, None, batch_size * num_images_per_prompt + ) + # 11. Denoising loop num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) @@ -495,18 +564,29 @@ def denoising_value_valid(dnv): f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: " + f" {denoising_end} when using type float." ) - elif denoising_end is not None and denoising_value_valid(denoising_end): + elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): discrete_timestep_cutoff = int( round( self.scheduler.config.num_train_timesteps - - (denoising_end * self.scheduler.config.num_train_timesteps) + - (self.denoising_end * self.scheduler.config.num_train_timesteps) ) ) num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) timesteps = timesteps[:num_inference_steps] + # 11.1 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(dtype=latents.dtype) + + self._num_timesteps = len(timesteps) with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): + if self.interrupt: + continue # expand the latents if we are doing classifier free guidance latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents @@ -518,6 +598,8 @@ def denoising_value_valid(dnv): # predict the noise residual added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds # [modified for neuron] Remove not traced inputs: cross_attention_kwargs, return_dict noise_pred = self.unet( sample=latent_model_input, @@ -533,14 +615,17 @@ def denoising_value_valid(dnv): if do_classifier_free_guidance and guidance_rescale > 0.0: # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] if num_channels_unet == 4: init_latents_proper = image_latents[:1] - init_mask = mask[:1] + if do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask if i < len(timesteps) - 1: noise_timestep = timesteps[i + 1] @@ -550,11 +635,30 @@ def denoising_value_valid(dnv): latents = (1 - init_mask) * init_latents_proper + init_mask * latents + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) + mask = callback_outputs.pop("mask", mask) + masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents) + # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if not output_type == "latent": # [Modified] Replace with pre-compiled vae decoder @@ -568,6 +672,9 @@ def denoising_value_valid(dnv): image = self.image_processor.postprocess(image, output_type=output_type) + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image] + if not return_dict: return (image,) diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index 8d48f1965..9097032e5 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.0.18.dev0" +__version__ = "0.0.19.dev0" __sdk_version__ = "2.16.1" diff --git a/setup.py b/setup.py index 64e1588c4..031e1bafa 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "torchvision==0.14.*", "neuronx_distributed==0.6.0", ], - "diffusers": ["diffusers >= 0.25.0"], + "diffusers": ["diffusers ~= 0.26.1"], "sentence-transformers": ["sentence-transformers >= 2.2.0"], } From 7c7d5089355012b2993370d269139d204443df69 Mon Sep 17 00:00:00 2001 From: Shubham Krishna Date: Sat, 3 Feb 2024 13:23:04 +0100 Subject: [PATCH 02/12] Add packer support for building AWS AMI (#441) * Add packer support for bulding AWS AMI * Update README * Add slack notification on failure * Add comment to explain workflow --- .github/workflows/build-ami.yml | 82 +++++++++++++++++++ infrastructure/ami/README.md | 78 ++++++++++++++++++ infrastructure/ami/hcl2-files/build.pkr.hcl | 29 +++++++ infrastructure/ami/hcl2-files/packer.pkr.hcl | 8 ++ infrastructure/ami/hcl2-files/sources.pkr.hcl | 15 ++++ .../ami/hcl2-files/variables.pkr.hcl | 54 ++++++++++++ .../scripts/install-huggingface-libraries.sh | 39 +++++++++ infrastructure/ami/scripts/validate-neuron.sh | 13 +++ infrastructure/ami/scripts/welcome-msg.sh | 11 +++ 9 files changed, 329 insertions(+) create mode 100644 .github/workflows/build-ami.yml create mode 100644 infrastructure/ami/README.md create mode 100644 infrastructure/ami/hcl2-files/build.pkr.hcl create mode 100644 infrastructure/ami/hcl2-files/packer.pkr.hcl create mode 100644 infrastructure/ami/hcl2-files/sources.pkr.hcl create mode 100644 infrastructure/ami/hcl2-files/variables.pkr.hcl create mode 100644 infrastructure/ami/scripts/install-huggingface-libraries.sh create mode 100644 infrastructure/ami/scripts/validate-neuron.sh create mode 100644 infrastructure/ami/scripts/welcome-msg.sh diff --git a/.github/workflows/build-ami.yml b/.github/workflows/build-ami.yml new file mode 100644 index 000000000..e41c5f712 --- /dev/null +++ b/.github/workflows/build-ami.yml @@ -0,0 +1,82 @@ +# The workflow file for building the AWS Neuron AMI using Packer +# It can be triggered by push and pull request to main when changes made to infrastructure/ami folder, manually and scheduler. 
+name: Build AWS Neuron AMI +on: + push: + branches: + - main + paths: + - 'infrastructure/ami/**' + pull_request: + branches: + - main + paths: + - 'infrastructure/ami/**' + workflow_dispatch: + inputs: + tag: + description: 'Tag to use for the AMI build' + default: 'main' + schedule: + # Schedule the workflow to run every second day at midnight UTC + - cron: '0 0 */2 * *' + +jobs: + build-ami: + defaults: + run: + working-directory: infrastructure/ami + runs-on: ubuntu-latest + env: + AWS_REGION: us-east-1 + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + # If the workflow is triggered manually or by schedule, uses the tag, otherwise uses the current branch + ref: ${{ github.event.inputs.tag || github.ref }} + + - name: Setup Packer + uses: hashicorp/setup-packer@main + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID_BUILD_AMI }} + aws-secret-access-key: ${{ secrets.AWS_ACCESS_KEY_SECRET_BUILD_AMI }} + aws-region: ${{ env.AWS_REGION }} + + - name: Packer format + id: format + run: packer fmt hcl2-files + continue-on-error: true + + - name: Packer Init + id: init + run: packer init hcl2-files + continue-on-error: true + + - name: Packer Validate + id: validate + # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the main branch of optimum-neuron repo for building the AMI + run: packer validate -var "optimum_version=${{ github.event.inputs.tag || github.event.repository.default_branch }}" -var "region=${{ env.AWS_REGION }}" hcl2-files + continue-on-error: true + + - name: Packer Build + id: build + # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the main branch of optimum-neuron repo for building the AMI + run: | + packer build -var "optimum_version=${{ github.event.inputs.tag || github.event.repository.default_branch }}" -var "region=${{ env.AWS_REGION }}" hcl2-files + + - name: Slack Notification on Failure + id: slack + uses: slackapi/slack-github-action@v1.25.0 + if: ${{ failure() && github.event_name == 'schedule' }} + with: + channel-id: 'C06GAEQJLNN' #copied from slack channel + payload: | + { + "text": "GitHub Action HuggingFace Neuron AMI Build result: ${{job.status}}" + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/infrastructure/ami/README.md b/infrastructure/ami/README.md new file mode 100644 index 000000000..006812b14 --- /dev/null +++ b/infrastructure/ami/README.md @@ -0,0 +1,78 @@ +# Building AMI with Packer + +This directory contains the files for building AMI using [Packer](https://github.com/hashicorp/packer) that is later published as a AWS Marketplace asset. + + +## Folder Structure + +- [hcl2-files](./hcl2-files/) - Includes different files which are used by a Packer pipeline to build an AMI. The files are: + - [build.pkr.hcl](./hcl2-files/build.pkr.hcl): contains the [build](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/build) block, defining the builders to start, provisioning them using [provisioner](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/build/provisioner), and specifying actions to take with the built artifacts using `post-process`. + - [variables.pkr.hcl](./hcl2-files/variables.pkr.hcl): contains the [variables](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/variable) block, defining variables within your Packer configuration. 
+ - [sources.pkr.hcl](./hcl2-files/sources.pkr.hcl): contains the [source](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/source) block, defining reusable builder configuration blocks. + - [packer.pkr.hcl](./hcl2-files/packer.pkr.hcl): contains the [packer](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/packer) block, used to configure some behaviors of Packer itself, such as the minimum required Packer version needed to apply to your configuration. +- [scripts](./scripts): contains scripts used by [provisioner](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/blocks/build/provisioner) for installing additonal packages/softwares. + + +### Prerequisites + - [Packer](https://developer.hashicorp.com/packer/docs/intro): Packer is an open source tool for creating identical machine images for multiple platforms from a single source configuration. + + - AWS Credentials: You need to have AWS credentials configured on your machine. You can configure AWS credentials using [AWS CLI](https://github.com/aws/aws-cli) or by setting environment variables. + + #### Install Packer on Ubuntu/Debian + ```bash + curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo apt-key add - + sudo apt-add-repository "deb [arch=amd64] https://apt.releases.hashicorp.com $(lsb_release -cs) main" + sudo apt-get update && sudo apt-get install packer + ``` + +You can also install Packer for other OS from [here](https://developer.hashicorp.com/packer/tutorials/docker-get-started/get-started-install-cli). + +#### Configure AWS Credentials + +Using Environment Variables: +```bash +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= +``` + +Using AWS CLI: +```bash +aws configure sso +``` + +There are other ways to configure AWS credentials. You can read more about it [here](https://github.com/aws/aws-cli?tab=readme-ov-file#configuration). + +### Build AMI + +#### Format Packer blocks +You can format your HCL2 files locally. This command will update your files in place. + +Format a single file: +```bash +packer fmt build.pkr.hcl +``` + +Format all files in a directory: +```bash +packer fmt ./hcl2-files +``` + +#### Validate Packer blocks +You can validate the syntax and configuration of your files locally. This command will return a zero exit status on success, and a non-zero exit status on failure. + +```bash +packer validate -var 'region=us-west-2' -var 'optimum_version=v0.0.17' ./hcl2-files +``` + +#### Run Packer build +You can run Packer locally. This command will build the AMI and upload it to AWS. + +You need to set variables with no default values using `-var` flag. 
For example: +```bash +packer build -var 'region=us-west-2' -var 'optimum_version=v0.0.17' ./hcl2-files +``` + +To trigger a github action workflow manually, you can use GitHub CLI: +```bash +gh workflow run build-ami.yml -f tag= +``` diff --git a/infrastructure/ami/hcl2-files/build.pkr.hcl b/infrastructure/ami/hcl2-files/build.pkr.hcl new file mode 100644 index 000000000..f9327dacf --- /dev/null +++ b/infrastructure/ami/hcl2-files/build.pkr.hcl @@ -0,0 +1,29 @@ +build { + name = "build-hf-dl-neuron" + sources = [ + "source.amazon-ebs.ubuntu" + ] + provisioner "shell" { + script = "scripts/validate-neuron.sh" + } + provisioner "shell" { + script = "scripts/install-huggingface-libraries.sh" + environment_vars = [ + "TRANSFORMERS_VERSION=${var.transformers_version}", + "OPTIMUM_VERSION=${var.optimum_version}", + ] + } + provisioner "shell" { + inline = ["echo 'source /opt/aws_neuron_venv_pytorch/bin/activate' >> /home/ubuntu/.bashrc"] + } + provisioner "file" { + source = "scripts/welcome-msg.sh" + destination = "/tmp/99-custom-message" + } + provisioner "shell" { + inline = [ + "sudo mv /tmp/99-custom-message /etc/update-motd.d/", + "sudo chmod +x /etc/update-motd.d/99-custom-message", + ] + } +} \ No newline at end of file diff --git a/infrastructure/ami/hcl2-files/packer.pkr.hcl b/infrastructure/ami/hcl2-files/packer.pkr.hcl new file mode 100644 index 000000000..fa4a5e49e --- /dev/null +++ b/infrastructure/ami/hcl2-files/packer.pkr.hcl @@ -0,0 +1,8 @@ +packer { + required_plugins { + amazon = { + version = ">= 1.2.8" + source = "github.com/hashicorp/amazon" + } + } +} \ No newline at end of file diff --git a/infrastructure/ami/hcl2-files/sources.pkr.hcl b/infrastructure/ami/hcl2-files/sources.pkr.hcl new file mode 100644 index 000000000..fe3d5c22b --- /dev/null +++ b/infrastructure/ami/hcl2-files/sources.pkr.hcl @@ -0,0 +1,15 @@ +source "amazon-ebs" "ubuntu" { + ami_name = "huggingface-neuron-{{isotime \"2006-01-02T15-04-05Z\"}}" + instance_type = var.instance_type + region = var.region + source_ami = var.source_ami + ssh_username = var.ssh_username + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = 512 + volume_type = "gp2" + delete_on_termination = true + } + ami_users = var.ami_users + ami_regions = var.ami_regions +} \ No newline at end of file diff --git a/infrastructure/ami/hcl2-files/variables.pkr.hcl b/infrastructure/ami/hcl2-files/variables.pkr.hcl new file mode 100644 index 000000000..57e7214a8 --- /dev/null +++ b/infrastructure/ami/hcl2-files/variables.pkr.hcl @@ -0,0 +1,54 @@ +variable "region" { + description = "The AWS region" + type = string +} + +variable "instance_type" { + default = "trn1.2xlarge" + description = "EC2 machine type for building AMI" + type = string +} + +variable "source_ami" { + default = "ami-0fbea04d7389bcd4e" + description = "Base Image" + type = string + /* + To get latest value, run the following command: + aws ec2 describe-images \ + --region us-east-1 \ + --owners amazon \ + --filters 'Name=name,Values=Deep Learning AMI Neuron PyTorch 1.13 (Ubuntu 20.04) ????????' 
'Name=state,Values=available' \ + --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \ + --output text + */ +} + +variable "ssh_username" { + default = "ubuntu" + description = "Username to connect to SSH with" + type = string +} + +variable "optimum_version" { + description = "Optimum Neuron version to install" + type = string +} + +variable "transformers_version" { + default = "4.36.2" + description = "Transformers version to install" + type = string +} + +variable "ami_users" { + default = ["754289655784", "558105141721"] + description = "AWS accounts to share AMI with" + type = list(string) +} + +variable "ami_regions" { + default = ["eu-west-1"] + description = "AWS regions to share AMI with" + type = list(string) +} \ No newline at end of file diff --git a/infrastructure/ami/scripts/install-huggingface-libraries.sh b/infrastructure/ami/scripts/install-huggingface-libraries.sh new file mode 100644 index 000000000..b89c08822 --- /dev/null +++ b/infrastructure/ami/scripts/install-huggingface-libraries.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Activate the neuron virtual environment +source /opt/aws_neuron_venv_pytorch/bin/activate + +echo "Step: install-hugging-face-libraries" + +echo "TRANSFORMERS_VERSION: $TRANSFORMERS_VERSION" +echo "OPTIMUM_VERSION: $OPTIMUM_VERSION" + +pip install --upgrade --no-cache-dir \ + "transformers[sklearn,sentencepiece,vision]==$TRANSFORMERS_VERSION" \ + "datasets==2.16.1" \ + "accelerate==0.23.0" \ + "diffusers==0.25.0" \ + "evaluate==0.4.1" \ + "requests==2.31.0" \ + "notebook==7.0.6" \ + "markupsafe==2.1.1" \ + "jinja2==3.1.2" \ + "attrs==23.1.0" + +echo 'export PATH="${HOME}/.local/bin:$PATH"' >> "${HOME}/.bashrc" + +echo "Step: install-and-copy-optimum-neuron-examples" +git clone -b $OPTIMUM_VERSION https://github.com/huggingface/optimum-neuron.git + +cd optimum-neuron +python setup.py install +cd .. 
+ +mkdir /home/ubuntu/huggingface-neuron-samples/ /home/ubuntu/huggingface-neuron-notebooks/ +mv optimum-neuron/examples/* /home/ubuntu/huggingface-neuron-samples/ +mv optimum-neuron/notebooks/* /home/ubuntu/huggingface-neuron-notebooks/ +rm -rf optimum-neuron +chmod -R 777 /home/ubuntu/huggingface-neuron-samples /home/ubuntu/huggingface-neuron-notebooks + +echo "Step: validate-imports-of-huggingface-libraries" +bash -c 'python -c "import transformers;import datasets;import accelerate;import evaluate;import tensorboard; import torch;"' \ No newline at end of file diff --git a/infrastructure/ami/scripts/validate-neuron.sh b/infrastructure/ami/scripts/validate-neuron.sh new file mode 100644 index 000000000..c2fdcb7de --- /dev/null +++ b/infrastructure/ami/scripts/validate-neuron.sh @@ -0,0 +1,13 @@ +#!/bin/bash +echo "Step: validate-neuron-devices" +neuron-ls + +# Activate the neuron virtual environment +source /opt/aws_neuron_venv_pytorch/bin/activate + +python -c 'import torch' +python -c 'import torch_neuronx' + +echo "Installing Tensorboard Plugin for Neuron" +pip install --upgrade --no-cache-dir \ + "tensorboard-plugin-neuronx" \ No newline at end of file diff --git a/infrastructure/ami/scripts/welcome-msg.sh b/infrastructure/ami/scripts/welcome-msg.sh new file mode 100644 index 000000000..256228200 --- /dev/null +++ b/infrastructure/ami/scripts/welcome-msg.sh @@ -0,0 +1,11 @@ +#!/bin/bash +printf "=============================================================================\n" +printf " __| __|_ )\n" +printf " _| ( / HuggingFace Deep Learning Neuron AMI (Ubuntu 20.04)\n" +printf " ___|\___|___|\n" +printf "=============================================================================\n" +printf "Welcome to the HuggingFace Deep Learning Neuron AMI (Ubuntu 20.04)\n" +printf "* Examples: /home/ubuntu/huggingface-neuron-samples \n" +printf "* Notebooks: /home/ubuntu/huggingface-neuron-notebooks \n" +printf "* Documentation: https://huggingface.co/docs/optimum-neuron/ \n" +printf "=============================================================================\n" From 165ba05dea60d3c139e5b6ceaa2b0c2227a86887 Mon Sep 17 00:00:00 2001 From: Shubham Krishna Date: Tue, 6 Feb 2024 15:11:34 +0100 Subject: [PATCH 03/12] Fix path, update versions (#462) * Fix packer version * Update build-ami workflow --- .github/workflows/build-ami.yml | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build-ami.yml b/.github/workflows/build-ami.yml index e41c5f712..8d7711a6a 100644 --- a/.github/workflows/build-ami.yml +++ b/.github/workflows/build-ami.yml @@ -5,13 +5,9 @@ on: push: branches: - main - paths: - - 'infrastructure/ami/**' pull_request: branches: - main - paths: - - 'infrastructure/ami/**' workflow_dispatch: inputs: tag: @@ -31,16 +27,16 @@ jobs: AWS_REGION: us-east-1 steps: - name: Checkout - uses: actions/checkout@v3 - with: - # If the workflow is triggered manually or by schedule, uses the tag, otherwise uses the current branch - ref: ${{ github.event.inputs.tag || github.ref }} + uses: actions/checkout@v4 - name: Setup Packer uses: hashicorp/setup-packer@main + id: setup-packer + with: + version: "1.10.1" - name: configure aws credentials - uses: aws-actions/configure-aws-credentials@v1 + uses: aws-actions/configure-aws-credentials@v4 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID_BUILD_AMI }} aws-secret-access-key: ${{ secrets.AWS_ACCESS_KEY_SECRET_BUILD_AMI }} @@ -55,18 +51,19 @@ jobs: id: init run: packer init hcl2-files 
continue-on-error: true + - name: Packer Validate id: validate - # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the main branch of optimum-neuron repo for building the AMI - run: packer validate -var "optimum_version=${{ github.event.inputs.tag || github.event.repository.default_branch }}" -var "region=${{ env.AWS_REGION }}" hcl2-files + # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the name of branch that triggered workflow for building the AMI + run: packer validate -var "optimum_version=${{ github.event.inputs.tag || github.head_ref || github.ref_name }}" -var "region=${{ env.AWS_REGION }}" hcl2-files continue-on-error: true - name: Packer Build id: build - # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the main branch of optimum-neuron repo for building the AMI + # If the workflow is triggered manually or scheduled, uses the tag, otherwise uses the name of branch that triggered workflow for building the AMI run: | - packer build -var "optimum_version=${{ github.event.inputs.tag || github.event.repository.default_branch }}" -var "region=${{ env.AWS_REGION }}" hcl2-files + packer build -var "optimum_version=${{ github.event.inputs.tag || github.head_ref || github.ref_name }}" -var "region=${{ env.AWS_REGION }}" hcl2-files - name: Slack Notification on Failure id: slack From 18ed6949440800418a9bb752f3f48fc153cd0609 Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Wed, 7 Feb 2024 09:27:03 +0100 Subject: [PATCH 04/12] Fix trigger for actions (#468) Update build-ami.yml --- .github/workflows/build-ami.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-ami.yml b/.github/workflows/build-ami.yml index 8d7711a6a..4eb92e62a 100644 --- a/.github/workflows/build-ami.yml +++ b/.github/workflows/build-ami.yml @@ -5,9 +5,13 @@ on: push: branches: - main + paths: + - 'infrastructure/ami/**' pull_request: branches: - main + paths: + - 'infrastructure/ami/**' workflow_dispatch: inputs: tag: From d2ac20a6cf7ff4ebe662f3a61b3c13c0108fc1f9 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 7 Feb 2024 10:45:38 +0100 Subject: [PATCH 05/12] Add TGI tests and CI workflow (#355) * feat(tgi): fetch TGI .proto if not provided This allows to build the server outside of the Dockerfile more easily. Note that when built out of the Dockerfile on Debian/Ubuntu, only the wheel can be installed. This might be related to an issue with python-pip. 
* feat(tgi): align server version to optimum-neuron * test(Makefile): added test_tgi target * test(tgi): test decoding with streamed tokens * ci: added TGI workflow * fix(tgi): use git ignored build dir name --- .github/workflows/test_inf2_tgi.yml | 40 ++++++++++++ Makefile | 16 ++++- text-generation-inference/Dockerfile | 10 +-- text-generation-inference/server/Makefile | 32 +++++++--- .../server/pyproject.toml | 5 +- .../tests/test_generator_slot.py | 61 +++++++++++++++++++ 6 files changed, 147 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/test_inf2_tgi.yml create mode 100644 text-generation-inference/tests/test_generator_slot.py diff --git a/.github/workflows/test_inf2_tgi.yml b/.github/workflows/test_inf2_tgi.yml new file mode 100644 index 000000000..fe0cb87e6 --- /dev/null +++ b/.github/workflows/test_inf2_tgi.yml @@ -0,0 +1,40 @@ +name: Optimum neuron / Test TGI on INF2 + +on: + push: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + - "text-generation-inference/**" + pull_request: + branches: [ main ] + paths: + - "setup.py" + - "optimum/**.py" + - "text-generation-inference/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + do-the-job: + name: Run TGI tests + runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner + env: + AWS_REGION: us-east-1 + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Install python and create venv + run: | + sudo apt install python3.8-venv -y + python3 -m venv aws_neuron_venv_pytorch + source aws_neuron_venv_pytorch/bin/activate + python -m pip install -U pip + python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com + - name: Run TGI server python tests + run: | + source aws_neuron_venv_pytorch/bin/activate + HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} make tgi_test diff --git a/Makefile b/Makefile index 118b37484..71011ca12 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ clean: rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*)))) -VERSION := $(shell python -W ignore -c "from optimum.neuron.version import __version__; print(__version__)") +VERSION := $(shell gawk 'match($$0, /__version__ = "(.*)"/, a) {print a[1]}' optimum/neuron/version.py) PACKAGE_DIST = dist/optimum-neuron-$(VERSION).tar.gz PACKAGE_WHEEL = dist/optimum_neuron-$(VERSION)-py3-none-any.whl @@ -71,6 +71,20 @@ build_dist: ${PACKAGE_DIST} ${PACKAGE_WHEEL} pypi_upload: ${PACKAGE_DIST} ${PACKAGE_WHEEL} python -m twine upload ${PACKAGE_DIST} ${PACKAGE_WHEEL} +# Tests + test_installs: python -m pip install .[tests] python -m pip install git+https://github.com/huggingface/transformers.git + +# Stand-alone TGI server for unit tests outside of TGI container +tgi_server: + python -m pip install -r text-generation-inference/server/build-requirements.txt + make -C text-generation-inference/server clean + VERSION=${VERSION} make -C text-generation-inference/server gen-server + +tgi_test: tgi_server + python -m pip install .[neuronx] pytest + find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \ + -exec python -m pip install --force-reinstall {} \; + python -m pytest -s text-generation-inference/tests diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 5b2036c7f..c92479478 100644 --- a/text-generation-inference/Dockerfile +++ 
b/text-generation-inference/Dockerfile @@ -51,6 +51,10 @@ RUN apt-get update -y \ && apt-get clean RUN pip3 --no-cache-dir install --upgrade pip +# VERSION is a mandatory parameter +ARG VERSION +RUN test -n ${VERSION:?} + # Python server build image FROM base AS pyserver @@ -66,15 +70,11 @@ WORKDIR /pyserver COPY text-generation-inference/server server COPY --from=tgi /tgi/proto proto RUN pip3 install -r server/build-requirements.txt -RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server gen-server +RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server # Neuron base image (used for deployment) FROM base AS neuron -# VERSION is a mandatory parameter -ARG VERSION -RUN test -n ${VERSION:?} - # Install system prerequisites RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile index 6ae41d4ca..da5e38ffb 100644 --- a/text-generation-inference/server/Makefile +++ b/text-generation-inference/server/Makefile @@ -1,6 +1,7 @@ # Initialize base variables pkg_name := text_generation_server -BUILDDIR ?= $(CURDIR)/build_$(pkg_name) +BUILDDIR ?= $(CURDIR)/build +VERSION ?= 0.0.1 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) pkg_dir := $(BUILDDIR)/$(pkg_name) @@ -13,14 +14,6 @@ src_dir := $(mkfile_dir)/$(pkg_name) sources := $(wildcard $(src_dir)/*.py) deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources)) -# Three python files are generated for each protobuf -protobufs := $(wildcard $(PROTODIR)/*.proto) -pkg_pb_dir := $(pkg_dir)/pb -generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py)) -generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base)) -generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi)) -generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py)) - # Static files are just copied define COPY @@ -30,6 +23,7 @@ endef $(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml mkdir -p $(BUILDDIR) $(COPY) + sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@ $(pkg_dir)/%.py: $(src_dir)/%.py mkdir -p $(pkg_dir) @@ -37,6 +31,24 @@ $(pkg_dir)/%.py: $(src_dir)/%.py # Generated files are produced by grpcio tools +# If not provided, fetch proto files from TGI +ifndef PROTODIR +PROTODIR := $(BUILDDIR)/tgi/proto +endif + +$(BUILDDIR)/tgi/proto/%.proto: + install -d $(BUILDDIR)/tgi + curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz + tar -C $(BUILDDIR)/tgi -xf $(BUILDDIR)/tgi/sources.tar.gz --strip-components=1 + +# Three python files are generated for each protobuf +protobufs := $(PROTODIR)/generate.proto +pkg_pb_dir := $(pkg_dir)/pb +generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py)) +generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base)) +generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi)) +generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py)) + $(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto mkdir -p $(pkg_pb_dir) python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \ @@ -44,4 +56,4 @@ $(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi 
$(pkg_pb_dir)/%_pb2_grpc.py: $(PR sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py gen-server: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources) - python -m build $(BUILDDIR) --sdist + python -m build $(BUILDDIR) diff --git a/text-generation-inference/server/pyproject.toml b/text-generation-inference/server/pyproject.toml index 7261e8d44..869d341cb 100644 --- a/text-generation-inference/server/pyproject.toml +++ b/text-generation-inference/server/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "text-generation-server" -version = "0.0.1" +version = "VERSION" authors = [{name="David Corvoysier", email="david@huggingface.co" }] description = "TGI compatible inference server for AWS Neuronx platforms" dependencies = [ @@ -18,5 +18,8 @@ dependencies = [ 'loguru == 0.6.0' ] +[tool.setuptools] +packages = ["text_generation_server", "text_generation_server.pb"] + [project.scripts] text-generation-server = 'text_generation_server.cli:app' diff --git a/text-generation-inference/tests/test_generator_slot.py b/text-generation-inference/tests/test_generator_slot.py new file mode 100644 index 000000000..2f243b5d4 --- /dev/null +++ b/text-generation-inference/tests/test_generator_slot.py @@ -0,0 +1,61 @@ +import pytest +import torch +from text_generation_server.generator import Slot +from text_generation_server.pb.generate_pb2 import Request +from transformers import AutoTokenizer, GenerationConfig + + +TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"] + + +@pytest.fixture(params=TOKENIZERS) +def tokenizer(request): + t = AutoTokenizer.from_pretrained(request.param) + t.padding_side = "left" + t.pad_token_id = t.eos_token_id + return t + + +@pytest.mark.parametrize( + "input_text, generated_text", + [ + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind," + " slipped quickly through the glass doors of Victory Mansions, though not quickly enough" + " to prevent a swirl of gritty dust from entering along with him.", + ], + ["This sentence is written in chinese:", "我很感谢你的热情"], + ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"], + ], + ids=["spaces", "chinese-utf8", "emojis"], +) +def test_decode_streaming(tokenizer, input_text, generated_text): + slot = Slot(0, tokenizer) + request = Request(id=0, inputs=input_text) + slot.assign(request, GenerationConfig()) + assert slot.cached_text == input_text + + inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt") + input_ids = inputs["input_ids"][0] + attention_mask = inputs["attention_mask"][0] + generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"] + + # We need to regenerate the full text as the tokenizer might change it (extra spaces might be added) + all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)]) + full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True) + regenerated_text = full_text[len(input_text) :] + + # Initialize the slot with the inputs + slot.reset(input_ids, attention_mask, selector=None) + + assert slot.generated_tokens == 0 + + # Simulate an iterative generation (i.e. 
don't call select and use known tokens instead) + decoded_text = "" + for i in range(len(generated_tokens)): + text = slot.append(generated_tokens[i]) + assert slot.generated_tokens == i + 1 + decoded_text += text + + assert decoded_text == regenerated_text From 0767cc443fd2f64070a7525812ca49c4ed6530a6 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:41:30 +0100 Subject: [PATCH 06/12] Add issue and PR templates & build optimum env cli for Neuron (#463) * add templates & override env cli * address comments * workaround --- .github/ISSUE_TEMPLATE/bug-report.yml | 73 +++++++++++++++ .github/ISSUE_TEMPLATE/config.yml | 12 +++ .github/ISSUE_TEMPLATE/feature-request.yml | 31 +++++++ .github/PULL_REQUEST_TEMPLATE.md | 21 +++++ optimum/commands/env.py | 100 +++++++++++++++++++++ 5 files changed, 237 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug-report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature-request.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 optimum/commands/env.py diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 000000000..b93b03135 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,73 @@ +name: "\U0001F41B Bug Report" +description: Submit a bug report to help us improve Optimum Neuron +labels: [ "bug" ] +body: + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us. You can run the command optimum-cli env and copy-paste its output below. + render: shell + placeholder: optimum-neuron version, optimum version, platform, python version, ... + validations: + required: true + + - type: textarea + id: who-can-help + attributes: + label: Who can help? + description: | + Your issue will be replied to more quickly if you can figure out the right person to tag with @ + If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. + Please tag fewer than 3 people. + + - Neuron Training: `@michaelbenayoun` + - Export models to Neuron format: `@JingyaHuang` + - Inference: `@dacorvo`, `@JingyaHuang` + - TGI: `@dacorvo` + - Pipelines: `@philschmid`, `@JingyaHuang` + + placeholder: "@Username ..." + + - type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "The official example scripts" + - label: "My own modified scripts" + + - type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The tasks I am working on are:" + options: + - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" + - label: "My own task or dataset (give details below)" + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction (minimal, reproducible, runnable) + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + If you have code snippets, error messages, stack traces please provide them here as well. + Important! Use code tags to correctly format your code. 
See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting + Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. + Providing a **minimal**, **reproducible** reproduction using a **publicly available model** significantly increase the chances of a fix in a timely manner. + + placeholder: | + Providing a minimal, reproducible reproduction using a publicly available model significantly increase the chances of a fix in a timely manner. + + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "A clear and concise description of what you would expect to happen." \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..c5ffe928a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,12 @@ +blank_issues_enabled: true +version: 2.1 +contact_links: + - name: Model checkpoints on the Hugging Face Hub + url: https://huggingface.co/models + about: Open a Pull request / Discussion related to a specific model checkpoint directly on the Hugging Face Hub + - name: Website Related + url: https://github.com/huggingface/hub-docs/issues + about: Feature requests and bug reports related to the website + - name: Forum + url: https://discuss.huggingface.co/ + about: General usage questions and community discussions \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 000000000..1e17554c0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,31 @@ +name: "\U0001F680 Feature request" +description: Submit a proposal/request a new feature on Optimum Neuron +labels: [ "feature" ] +body: + - type: textarea + id: feature-request + validations: + required: true + attributes: + label: Feature request + description: | + A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. + + - type: textarea + id: motivation + validations: + required: true + attributes: + label: Motivation + description: | + Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. + + + - type: textarea + id: contribution + validations: + required: true + attributes: + label: Your contribution + description: | + Is there any way that you could help, e.g. by submitting a PR? \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..0fe201944 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,21 @@ +# What does this PR do? + + + + + +Fixes # (issue) + + +## Before submitting +- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). +- [ ] Did you make sure to update the documentation with your changes? +- [ ] Did you write any new necessary tests? diff --git a/optimum/commands/env.py b/optimum/commands/env.py new file mode 100644 index 000000000..6dfae72c1 --- /dev/null +++ b/optimum/commands/env.py @@ -0,0 +1,100 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import platform +import subprocess +from typing import Dict, List, Optional + +import huggingface_hub +from pkg_resources import get_distribution +from transformers import __version__ as transformers_version +from transformers.utils import is_torch_available + +from ..neuron.utils import is_neuron_available, is_neuronx_available +from ..neuron.version import __sdk_version__ as neuron_sdk_version +from ..neuron.version import __version__ as optimum_neuron_version +from ..version import __version__ as optimum_version +from . import BaseOptimumCLICommand, CommandInfo + + +class EnvironmentCommand(BaseOptimumCLICommand): + COMMAND = CommandInfo(name="env", help="Get information about the environment used.") + + @staticmethod + def format_dict(d): + return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" + + @staticmethod + def get_pip_pkgs_version(pkg_list: Optional[List], info: Dict): + if pkg_list is not None: + for pkg in pkg_list: + try: + num_version = get_distribution(pkg).version + except Exception: + num_version = "NA" + info[f"`{pkg}` version"] = num_version + return info + + @staticmethod + def print_apt_pkgs(): + apt = subprocess.Popen(["apt", "list", "--installed"], stdout=subprocess.PIPE) + grep = subprocess.Popen(["grep", "aws-neuron"], stdin=apt.stdout, stdout=subprocess.PIPE) + pkgs_list = list(grep.stdout) + for pkg in pkgs_list: + print(pkg.decode("utf-8").split("\n")[0]) + + def run(self): + pt_version = "not installed" + if is_torch_available(): + import torch + + pt_version = torch.__version__ + + platform_info = { + "Platform": platform.platform(), + "Python version": platform.python_version(), + } + info = { + "`optimum-neuron` version": optimum_neuron_version, + "`neuron-sdk` version": neuron_sdk_version, + "`optimum` version": optimum_version, + "`transformers` version": transformers_version, + "`huggingface_hub` version": huggingface_hub.__version__, + "`torch` version": f"{pt_version}", + } + + if is_neuron_available(): + neuron_python_pkgs = ["dmlc-tvm", "neuron-cc", "torch-neuron"] + elif is_neuronx_available(): + neuron_python_pkgs = [ + "aws-neuronx-runtime-discovery", + "libneuronxla", + "neuronx-cc", + "neuronx-distributed", + "neuronx-hwm", + "torch-neuronx", + "torch-xla", + "transformers-neuronx", + ] + else: + neuron_python_pkgs = None + + info = self.get_pip_pkgs_version(neuron_python_pkgs, info) + + print("\nCopy-and-paste the text below in your GitHub issue:\n") + print("\nPlatform:\n") + print(self.format_dict(platform_info)) + print("\nPython packages:\n") + print(self.format_dict(info)) + print("\nNeuron Driver:\n") + self.print_apt_pkgs() From 1b7d07d685f098952c2bef05697f75463390ead6 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Thu, 8 Feb 2024 12:42:48 +0100 Subject: [PATCH 07/12] Add contribution guide for Neuron exporter (#461) * init * add guide and example * complete * improve * improve * exclude causallm * fix extension * fix typo * 
Update docs/source/community/contributing.mdx Co-authored-by: David Corvoysier * Update docs/source/community/contributing.mdx Co-authored-by: Michael Benayoun * Update docs/source/community/contributing.mdx Co-authored-by: Michael Benayoun --------- Co-authored-by: JingyaHuang Co-authored-by: David Corvoysier Co-authored-by: Michael Benayoun --- docs/source/_toctree.yml | 4 + docs/source/community/contributing.mdx | 115 ++++++++++++++++++++++ docs/source/guides/overview.mdx | 2 +- docs/source/package_reference/export.mdx | 21 ---- optimum/exporters/neuron/model_configs.py | 10 ++ tests/exporters/exporters_utils.py | 1 + tests/exporters/test_export.py | 1 - 7 files changed, 131 insertions(+), 23 deletions(-) create mode 100644 docs/source/community/contributing.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 2416dd141..f71a67b51 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -41,6 +41,10 @@ - local: guides/pipelines title: Inference pipelines with AWS Neuron title: How-To Guides + - sections: + - local: community/contributing + title: Add support for a new model architecture + title: Contribute - sections: - local: package_reference/trainer title: Neuron Trainer diff --git a/docs/source/community/contributing.mdx b/docs/source/community/contributing.mdx new file mode 100644 index 000000000..a53e54297 --- /dev/null +++ b/docs/source/community/contributing.mdx @@ -0,0 +1,115 @@ + + +# Adding support for new architectures + + + +> **_NOTE:_** ❗This section does not apply to the decoder model’s inference with autoregressive sampling integrated through `transformers-neuronx`. If you want to add support for these models, please open an issue on the Optimum Neuron GitHub repo, and ping maintainers for help. + +You want to export and run a new model on AWS Inferentia or Trainium? Check the guideline, and submit a pull request to [🤗 Optimum Neuron's GitHub repo](https://github.com/huggingface/optimum-neuron/)! + +To support a new model architecture in the Optimum Neuron library here are some steps to follow: + +1. Implement a custom Neuron configuration. +2. Export and validate the model. +3. Contribute to the GitHub repo. + +## Implement a custom Neuron configuration + +To support the export of a new model to a Neuron compatible format, the first thing to do is to define a Neuron configuration, describing how to export the PyTorch model by specifying: + +1. The input names. +2. The output names. +3. The dummy inputs used to trace the model: the Neuron Compiler records the computational graph via tracing and works on the resulting `TorchScript` module. +4. The compilation arguments used to control the trade-off between hardware efficiency (latency, throughput) and accuracy. + +Depending on the choice of model and task, we represent the data above with configuration classes. Each configuration class is associated with +a specific model architecture, and follows the naming convention `ArchitectureNameNeuronConfig`. For instance, the configuration that specifies the Neuron +export of BERT models is `BertNeuronConfig`. + +Since many architectures share similar properties for their Neuron configuration, 🤗 Optimum adopts a 3-level class hierarchy: + +1. Abstract and generic base classes. These handle all the fundamental features, while being agnostic to the modality (text, image, audio, etc). +2. Middle-end classes. These are aware of the modality. Multiple config classes could exist for the same modality, depending on the inputs they support. 
They specify which input generators should be used for generating the dummy inputs, but remain model-agnostic. +3. Model-specific classes like the `BertNeuronConfig` mentioned above. These are the ones actually used to export models. + +### Example: Adding support for ESM models + +Here we take the support of [ESM models](https://huggingface.co/docs/transformers/model_doc/esm#esm) as an example. Let's create an `EsmNeuronConfig` class in the `optimum/exporters/neuron/model_configs.py`. + +When an Esm model interprets as a text encoder, we are able to inherit from the middle-end class [`TextEncoderNeuronConfig`](https://github.com/huggingface/optimum-neuron/blob/v0.0.18/optimum/exporters/neuron/config.py#L36). +Since the modeling and configuration of Esm is almost the same as BERT when it is interpreted as an encoder, we can use the `NormalizedConfigManager` with `model_type=bert` to normalize the configuration to generate dummy inputs for tracing the model. + +And one last step, since `optimum-neuron` is an extension of `optimum`, we need to register the Neuron config that we create to the [TasksManager](https://huggingface.co/docs/optimum/main/en/exporters/task_manager#optimum.exporters.TasksManager) with the `register_in_tasks_manager` decorator by specifying the model type and supported tasks. + +```python + +@register_in_tasks_manager("esm", *["feature-extraction", "fill-mask", "text-classification", "token-classification"]) +class EsmNeuronConfig(TextEncoderNeuronConfig): + NORMALIZED_CONFIG_CLASS = NormalizedConfigManager.get_normalized_config_class("bert") + ATOL_FOR_VALIDATION = 1e-3 # absolute tolerance to compare for comparing model on CPUs + + @property + def inputs(self) -> List[str]: + return ["input_ids", "attention_mask"] + +``` + +## Export and validate the model + +With the Neuron configuration class that you implemented, now do a quick test if it works as expected: + +* Export + +```bash +optimum-cli export neuron --model facebook/esm2_t33_650M_UR50D --task text-classification --batch_size 1 --sequence_length 16 esm_neuron/ +``` + +During the export [`validate_model_outputs`](https://github.com/huggingface/optimum-neuron/blob/7b18de9ddfa5c664c94051304c651eaf855c3e0b/optimum/exporters/neuron/convert.py#L136) will be called to validate the outputs of your exported Neuron model by comparing them to the results of PyTorch on the CPU. You could also validate the model manually with: + +```python +from optimum.exporters.neuron import validate_model_outputs + +validate_model_outputs( + neuron_config, base_model, neuron_model_path, neuron_named_outputs, neuron_config.ATOL_FOR_VALIDATION +) +``` + +* Inference (optional) + +```python +from transformers import AutoTokenizer +from optimum.neuron import NeuronModelForSequenceClassification + +model = NeuronModelForSequenceClassification.from_pretrained("esm_neuron/") +tokenizer = AutoTokenizer.from_pretrained("esm_neuron/") +inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") +logits = model(**inputs).logits +``` + +## Contribute to the GitHub repo + +We are almost all set. Now submit a pull request to make your work accessible to all community members! + +* Open an issue in the [Optimum Neuron GitHub repo](https://github.com/huggingface/optimum-neuron/issues) to describe the new feature and make it visible to Optimum Neuron's maintainers. 
+* Add the model to the exporter test in [`optimum-neuron/tests/exporters/exporters_utils.py`](https://github.com/huggingface/optimum-neuron/blob/v0.0.18/tests/exporters/exporters_utils.py) and the inference test in [`optimum-neuron/tests/inference/inference_utils.py`](https://github.com/huggingface/optimum-neuron/blob/v0.0.18/tests/inference/inference_utils.py). +* Open a pull request! (Don't forget to link it to the issue you opened, so that the maintainers could better track it and provide help when needed.) + + + + +We usually test smaller checkpoints to accelerate the CIs, you could find tiny models for testing under the [`Hugging Face Internal Testing Organization`](https://huggingface.co/hf-internal-testing). + + + +You have made a new model accessible on Neuron for the community! Thanks for joining us in the endeavor of democratizing good machine learning 🤗. \ No newline at end of file diff --git a/docs/source/guides/overview.mdx b/docs/source/guides/overview.mdx index 64ebfaa0c..0255ccc75 100644 --- a/docs/source/guides/overview.mdx +++ b/docs/source/guides/overview.mdx @@ -21,9 +21,9 @@ Welcome to the 🤗 Optimum Neuron how-to guides! These guides tackle more advanced topics and will show you how to easily get the best from AWS Trainium / Inferentia: - [How to setup AWS Trainium instance](./setup_aws_instance) -- [How to fine-tune a Transformers model with AWS Trainium](./fine_tune) - [Training and Deployment using Amazon Sagemaker](./sagemaker) - [Neuron model cache](./cache_system) +- [How to fine-tune a Transformers model with AWS Trainium](./fine_tune) - [Distributed training with AWS Neuron](./distributed_training.mdx) - [Export a model to Inferentia](./export_model) - [Neuron Model Inference](./models) diff --git a/docs/source/package_reference/export.mdx b/docs/source/package_reference/export.mdx index 912ae5d81..7f0102ecf 100644 --- a/docs/source/package_reference/export.mdx +++ b/docs/source/package_reference/export.mdx @@ -28,27 +28,6 @@ exporting function according to the environment. Besides, you can check if the exported model is valid via [`~optimum.exporters.neuron.convert.validate_model_outputs`], which compares the compiled model's output on Neuron devices to the PyTorch model's output on CPU. -## Configuration classes for Neuron exports - -Exporting a PyTorch model to a Neuron compiled model involves specifying: - -1. The input names. -2. The output names. -3. The dummy inputs used to trace the model. This is needed by the Neuron Compiler to record the computational graph and convert it to a TorchScript module. -4. The compilation arguments used to control the trade-off between hardware efficiency (latency, throughput) and accuracy. - -Depending on the choice of model and task, we represent the data above with _configuration classes_. Each configuration class is associated with -a specific model architecture, and follows the naming convention `ArchitectureNameNeuronConfig`. For instance, the configuration which specifies the Neuron -export of BERT models is `BertNeuronConfig`. - -Since many architectures share similar properties for their Neuron configuration, 🤗 Optimum adopts a 3-level class hierarchy: - -1. Abstract and generic base classes. These handle all the fundamental features, while being agnostic to the modality (text, image, audio, etc). -2. Middle-end classes. These are aware of the modality, but multiple can exist for the same modality depending on the inputs they support. 
- They specify which input generators should be used for the dummy inputs, but remain model-agnostic. -3. Model-specific classes like the `BertNeuronConfig` mentioned above. These are the ones actually used to export models. - - ## Supported architectures diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index a08da0826..294928bbb 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -101,6 +101,16 @@ def outputs(self) -> List[str]: return self._TASK_TO_COMMON_OUTPUTS[self.task] +@register_in_tasks_manager("esm", *["feature-extraction", "fill-mask", "text-classification", "token-classification"]) +class EsmNeuronConfig(TextEncoderNeuronConfig): + NORMALIZED_CONFIG_CLASS = NormalizedConfigManager.get_normalized_config_class("bert") + ATOL_FOR_VALIDATION = 1e-3 + + @property + def inputs(self) -> List[str]: + return ["input_ids", "attention_mask"] + + @register_in_tasks_manager("flaubert", *COMMON_TEXT_TASKS) class FlaubertNeuronConfig(ElectraNeuronConfig): pass diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c373e5588..b4b8e32b9 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -23,6 +23,7 @@ "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", # Failed for INF1: 'XSoftmax' "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", + "esm": "hf-internal-testing/tiny-random-EsmModel", "flaubert": "flaubert/flaubert_small_cased", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mpnet": "hf-internal-testing/tiny-random-MPNetModel", diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index f59656252..80316db9a 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -178,7 +178,6 @@ def test_export_separated_weights(self, test_name, name, model_name, task, neuro @parameterized.expand(_get_models_to_test(SENTENCE_TRANSFORMERS_MODELS)) @is_inferentia_test - @require_vision @require_sentence_transformers @requires_neuronx def test_export_sentence_transformers(self, test_name, name, model_name, task, neuron_config_constructor): From ab582ce362c00c0d4d203b2ffb022d8ea41f8ad6 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 8 Feb 2024 16:43:59 +0100 Subject: [PATCH 08/12] TGI: update to controller version 1.4.0 & bug fixes (#470) * feat(tgi): update controller version to 1.4.0 * fix(tgi): correctly handle single token inputs * feat(tgi): support seed generation parameter * fix(tgi): return correct finish reason * fix(tgi): return only pending requests in next batch On decode, finished requests were also included. * fix(decoder): do not modify generation_config parameter * fix(tgi): avoid repeated token in continuous batching The last generated token of paused slots was recreated and sent back instead of generating a new one. * fix(tgi): update max_new_tokens in continuous batching The max_new_tokens was not updated for pending requests while the generated tokens were now actually seen as input tokens. This was effectively as if the number of generated tokens had been reset. 
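The bookkeeping behind the last two fixes can be summarised with a minimal, self-contained sketch (illustration only — the real `Slot` class in `generator.py` below also carries token tensors, attention masks and a token selector; the names and numbers here are simplified stand-ins):

```python
from enum import Enum


class State(Enum):
    READY = 0
    PAUSE = 1


class Slot:
    """Illustrative stand-in for the Slot class modified in generator.py."""

    def __init__(self, max_new_tokens: int):
        self.state = State.READY
        self.max_new_tokens = max_new_tokens
        self.generated_tokens = 0
        self.last_token = None

    def append(self, token: int) -> None:
        self.generated_tokens += 1
        self.last_token = token

    def pause(self) -> None:
        # The last generated token will be appended back on resume, so it must
        # not be counted twice ...
        self.generated_tokens -= 1
        # ... and the budget shrinks by the tokens already produced, which the
        # next prefill treats as input tokens; without this the request would
        # effectively get its max_new_tokens budget reset.
        self.max_new_tokens -= self.generated_tokens
        self.state = State.PAUSE

    def resume(self) -> None:
        self.state = State.READY


# A slot that produced 5 tokens keeps a budget of 20 - 4 = 16 new tokens after
# a pause/resume cycle, and its cached last token is appended back exactly once
# instead of being regenerated and sent to the client a second time.
slot = Slot(max_new_tokens=20)
for token in range(5):
    slot.append(token)
cached = slot.last_token
slot.pause()
slot.resume()
slot.append(cached)
assert slot.max_new_tokens == 16
assert slot.generated_tokens == 5
```
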
* test(tgi): add generator python tests * test(tgi): add docker tests --- Makefile | 19 +- optimum/neuron/generation/token_selector.py | 11 +- text-generation-inference/Dockerfile | 6 +- .../integration-tests/conftest.py | 155 +++++++++++++ .../integration-tests/pytest.ini | 2 + .../integration-tests/requirements.txt | 18 ++ .../integration-tests/test_gpt2.py | 99 +++++++++ text-generation-inference/server/Makefile | 3 +- .../text_generation_server/generator.py | 50 +++-- .../tests/test_generator.py | 207 ++++++++++++++++++ 10 files changed, 543 insertions(+), 27 deletions(-) create mode 100644 text-generation-inference/integration-tests/conftest.py create mode 100644 text-generation-inference/integration-tests/pytest.ini create mode 100644 text-generation-inference/integration-tests/requirements.txt create mode 100644 text-generation-inference/integration-tests/test_gpt2.py create mode 100644 text-generation-inference/tests/test_generator.py diff --git a/Makefile b/Makefile index 71011ca12..e9ec19103 100644 --- a/Makefile +++ b/Makefile @@ -40,12 +40,21 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \ $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES) python -m build +TGI_VERSION ?= 1.4.0 + neuronx-tgi: $(PACKAGE_DIST) - docker build --rm -f text-generation-inference/Dockerfile --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) . + docker build --rm -f text-generation-inference/Dockerfile \ + --build-arg VERSION=$(VERSION) \ + --build-arg TGI_VERSION=$(TGI_VERSION) \ + -t neuronx-tgi:$(VERSION) . docker tag neuronx-tgi:$(VERSION) neuronx-tgi:latest neuronx-tgi-sagemaker: $(PACKAGE_DIST) - docker build --rm -f text-generation-inference/Dockerfile --target sagemaker --build-arg VERSION=$(VERSION) -t neuronx-tgi:$(VERSION) . + docker build --rm -f text-generation-inference/Dockerfile \ + --build-arg VERSION=$(VERSION) \ + --build-arg TGI_VERSION=$(TGI_VERSION) \ + --target sagemaker \ + -t neuronx-tgi:$(VERSION) . 
# Creates example scripts from Transformers transformers_examples: @@ -81,10 +90,14 @@ test_installs: tgi_server: python -m pip install -r text-generation-inference/server/build-requirements.txt make -C text-generation-inference/server clean - VERSION=${VERSION} make -C text-generation-inference/server gen-server + VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server tgi_test: tgi_server python -m pip install .[neuronx] pytest find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \ -exec python -m pip install --force-reinstall {} \; python -m pytest -s text-generation-inference/tests + +tgi_docker_test: neuronx-tgi + python -m pip install -r text-generation-inference/integration-tests/requirements.txt + python -m pytest -s text-generation-inference/integration-tests diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py index 2d64d03fc..9cc0994ed 100644 --- a/optimum/neuron/generation/token_selector.py +++ b/optimum/neuron/generation/token_selector.py @@ -1,3 +1,4 @@ +import copy import logging from typing import Optional @@ -43,6 +44,7 @@ def __init__( eos_token_id: int, pad_token_id: int, logits_warper: Optional[LogitsProcessorList] = None, + seed: Optional[int] = 0, ): self.mode = mode self.logits_processor = logits_processor @@ -50,6 +52,8 @@ def __init__( self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id self.logits_warper = logits_warper + self.generator = torch.Generator() + self.generator.manual_seed(seed) @classmethod def create( @@ -59,6 +63,7 @@ def create( model: GenerationMixin, max_seq_length: int, stopping_criteria: Optional[StoppingCriteriaList] = None, + seed: Optional[int] = 0, ) -> "TokenSelector": r"""Creates the `TokenSelector` for a specific generation configuration. @@ -74,10 +79,13 @@ def create( stopping_criteria (`Optional[transformers.generation.StoppingCriteriaList], defaults to `None`): Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. + seed(`Optional[int]`): + The optional seed for sampling. Defaults to zero. Return: `torch.LongTensor`: A `torch.LongTensor` containing the selected tokens. 
""" generation_config.validate() + generation_config = copy.deepcopy(generation_config) unsupported_generation_flags = [ "output_attentions", @@ -145,6 +153,7 @@ def create( logits_warper=logits_warper, eos_token_id=eos_token_id, pad_token_id=generation_config.pad_token_id, + seed=seed, ) def select(self, input_ids: torch.LongTensor, logits: torch.Tensor) -> torch.LongTensor: @@ -171,7 +180,7 @@ def _sample(self, scores: torch.Tensor) -> torch.LongTensor: # sample probs = torch.nn.functional.softmax(scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=1) + next_tokens = torch.multinomial(probs, num_samples=1, generator=self.generator) # Convert the filtered tokens to actual vocabulary tokens next_tokens = torch.gather(next_token_indices, 1, next_tokens) return next_tokens.squeeze(1) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index c92479478..bc8249e4b 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -1,7 +1,9 @@ -# Fetch and extract the TGI sources +# Fetch and extract the TGI sources (TGI_VERSION is mandatory) FROM alpine AS tgi +ARG TGI_VERSION +RUN test -n ${TGI_VERSION:?} RUN mkdir -p /tgi -ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz /tgi/sources.tar.gz +ADD https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz /tgi/sources.tar.gz RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) diff --git a/text-generation-inference/integration-tests/conftest.py b/text-generation-inference/integration-tests/conftest.py new file mode 100644 index 000000000..dd8616a3f --- /dev/null +++ b/text-generation-inference/integration-tests/conftest.py @@ -0,0 +1,155 @@ +import asyncio +import contextlib +import os +import random +import shlex +import subprocess +import sys +import time +from tempfile import TemporaryDirectory +from typing import List + +import docker +import pytest +from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError +from docker.errors import NotFound +from text_generation import AsyncClient +from text_generation.types import Response + + +DOCKER_IMAGE = os.getenv("DOCKER_IMAGE", "neuronx-tgi:latest") +HUGGING_FACE_HUB_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN", None) +DOCKER_VOLUME = os.getenv("DOCKER_VOLUME", "/data") + + +class LauncherHandle: + def __init__(self, port: int): + self.client = AsyncClient(f"http://localhost:{port}") + + def _inner_health(self): + raise NotImplementedError + + async def health(self, timeout: int = 60): + assert timeout > 0 + for _ in range(timeout): + if not self._inner_health(): + raise RuntimeError("Launcher crashed") + + try: + await self.client.generate("test") + return + except (ClientConnectorError, ClientOSError, ServerDisconnectedError): + time.sleep(1) + raise RuntimeError("Health check failed") + + +class ContainerLauncherHandle(LauncherHandle): + def __init__(self, docker_client, container_name, port: int): + super(ContainerLauncherHandle, self).__init__(port) + self.docker_client = docker_client + self.container_name = container_name + + def _inner_health(self) -> bool: + container = self.docker_client.containers.get(self.container_name) + return container.status in ["running", "created"] + + +class ProcessLauncherHandle(LauncherHandle): + def __init__(self, process, port: int): + super(ProcessLauncherHandle, self).__init__(port) + self.process = 
process + + def _inner_health(self) -> bool: + return self.process.poll() is None + + +@pytest.fixture(scope="module") +def event_loop(): + loop = asyncio.get_event_loop() + yield loop + loop.close() + + +@pytest.fixture(scope="module") +def data_volume(): + tmpdir = TemporaryDirectory() + yield tmpdir.name + # Cleanup the temporary directory using sudo as it contains root files created by the container + subprocess.run(shlex.split(f"sudo rm -rf {tmpdir.name}")) + + +@pytest.fixture(scope="module") +def launcher(event_loop, data_volume): + @contextlib.contextmanager + def docker_launcher( + model_id: str, + trust_remote_code: bool = False, + ): + port = random.randint(8000, 10_000) + + args = ["--model-id", model_id, "--env"] + + if trust_remote_code: + args.append("--trust-remote-code") + + client = docker.from_env() + + container_name = f"tgi-tests-{model_id.split('/')[-1]}" + + try: + container = client.containers.get(container_name) + container.stop() + container.wait() + except NotFound: + pass + + env = {"LOG_LEVEL": "info,text_generation_router=debug"} + + if HUGGING_FACE_HUB_TOKEN is not None: + env["HUGGING_FACE_HUB_TOKEN"] = HUGGING_FACE_HUB_TOKEN + + for var in ["HF_BATCH_SIZE", "HF_SEQUENCE_LENGTH", "HF_AUTOCAST_TYPE", "HF_NUM_CORES"]: + if var in os.environ: + env[var] = os.environ[var] + + volumes = [f"{data_volume}:/data"] + + container = client.containers.run( + DOCKER_IMAGE, + command=args, + name=container_name, + environment=env, + auto_remove=False, + detach=True, + devices=["/dev/neuron0"], + volumes=volumes, + ports={"80/tcp": port}, + shm_size="1G", + ) + + yield ContainerLauncherHandle(client, container.name, port) + + try: + container.stop() + container.wait() + except NotFound: + pass + + container_output = container.logs().decode("utf-8") + print(container_output, file=sys.stderr) + + container.remove() + + return docker_launcher + + +@pytest.fixture(scope="module") +def generate_load(): + async def generate_load_inner(client: AsyncClient, prompt: str, max_new_tokens: int, n: int) -> List[Response]: + futures = [ + client.generate(prompt, max_new_tokens=max_new_tokens, decoder_input_details=True) for _ in range(n) + ] + + return await asyncio.gather(*futures) + + return generate_load_inner diff --git a/text-generation-inference/integration-tests/pytest.ini b/text-generation-inference/integration-tests/pytest.ini new file mode 100644 index 000000000..2f4c80e30 --- /dev/null +++ b/text-generation-inference/integration-tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +asyncio_mode = auto diff --git a/text-generation-inference/integration-tests/requirements.txt b/text-generation-inference/integration-tests/requirements.txt new file mode 100644 index 000000000..58765d39c --- /dev/null +++ b/text-generation-inference/integration-tests/requirements.txt @@ -0,0 +1,18 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+text-generation >= 0.6.0 +pytest >= 7.4.0 +pytest-asyncio >= 0.21.1 +docker >= 6.1.3 +Levenshtein diff --git a/text-generation-inference/integration-tests/test_gpt2.py b/text-generation-inference/integration-tests/test_gpt2.py new file mode 100644 index 000000000..b1f5275b5 --- /dev/null +++ b/text-generation-inference/integration-tests/test_gpt2.py @@ -0,0 +1,99 @@ +import os + +import huggingface_hub +import Levenshtein +import pytest + + +MODEL_ID = "gpt2" +NEURON_MODEL_ID = "aws-neuron/gpt2-neuronx-bs4-seqlen1024" +BATCH_SIZE = 4 +SEQUENCE_LENGTH = 1024 +NUM_CORES = 2 + + +@pytest.fixture(scope="module", params=["hub-neuron", "hub", "local-neuron"]) +def model_name_or_path(request, data_volume): + if request.param == "hub": + os.environ["HF_BATCH_SIZE"] = str(BATCH_SIZE) + os.environ["HF_SEQUENCE_LENGTH"] = str(SEQUENCE_LENGTH) + os.environ["HF_NUM_CORES"] = str(NUM_CORES) + yield MODEL_ID + elif request.param == "hub-neuron": + yield NEURON_MODEL_ID + else: + model_dir = f"gpt2-neuron-{BATCH_SIZE}x{SEQUENCE_LENGTH}x{NUM_CORES}" + local_path = os.path.join(data_volume, model_dir) + huggingface_hub.snapshot_download(NEURON_MODEL_ID, local_dir=local_path) + # Return the path of the model inside the mounted volume + yield os.path.join("/data", model_dir) + + +@pytest.fixture(scope="module") +def tgi_service(launcher, model_name_or_path): + with launcher(model_name_or_path) as tgi_service: + yield tgi_service + + +@pytest.fixture(scope="module") +async def tgi_client(tgi_service): + await tgi_service.health(300) + return tgi_service.client + + +@pytest.mark.asyncio +async def test_model_single_request(tgi_client): + + # Greedy bounded without input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert response.generated_text == "\n\nDeep learning is a new field of research that has been around for a while" + + # Greedy bounded with input + response = await tgi_client.generate( + "What is Deep Learning?", + max_new_tokens=17, + return_full_text=True, + decoder_input_details=True, + ) + assert response.details.generated_tokens == 17 + assert ( + response.generated_text + == "What is Deep Learning?\n\nDeep learning is a new field of research that has been around for a while" + ) + + # Sampling + response = await tgi_client.generate( + "What is Deep Learning?", + do_sample=True, + top_k=50, + top_p=0.9, + repetition_penalty=1.2, + max_new_tokens=1000, + seed=42, + decoder_input_details=True, + ) + assert "The purpose of the current post is" in response.generated_text + + +@pytest.mark.asyncio +async def test_model_multiple_requests(tgi_client, generate_load): + num_requests = 4 + responses = await generate_load( + tgi_client, + "What is Deep Learning?", + max_new_tokens=17, + n=num_requests, + ) + + assert len(responses) == 4 + expected = "\n\nDeep learning is a new field of research that has been around for a while" + for r in responses: + assert r.details.generated_tokens == 17 + # Compute the similarity with the expectation using the levenshtein distance + # We should not have more than two substitutions or additions + assert Levenshtein.distance(r.generated_text, expected) < 3 diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile index da5e38ffb..e16ab6585 100644 --- a/text-generation-inference/server/Makefile +++ b/text-generation-inference/server/Makefile @@ -2,6 +2,7 @@ pkg_name := text_generation_server BUILDDIR ?= 
$(CURDIR)/build VERSION ?= 0.0.1 +TGI_VERSION ?= 1.4.0 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) pkg_dir := $(BUILDDIR)/$(pkg_name) @@ -38,7 +39,7 @@ endif $(BUILDDIR)/tgi/proto/%.proto: install -d $(BUILDDIR)/tgi - curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz + curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v${TGI_VERSION}.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz tar -C $(BUILDDIR)/tgi -xf $(BUILDDIR)/tgi/sources.tar.gz --strip-components=1 # Three python files are generated for each protobuf diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py index 7f9f0c310..a568d1049 100644 --- a/text-generation-inference/server/text_generation_server/generator.py +++ b/text-generation-inference/server/text_generation_server/generator.py @@ -21,6 +21,7 @@ Generation, InfoResponse, Request, + Tokens, ) @@ -158,7 +159,8 @@ def assign(self, request: Request, generation_config: GenerationConfig): self._generation_config.typical_p = request.parameters.typical_p self._generation_config.do_sample = request.parameters.do_sample self._generation_config.repetition_penalty = request.parameters.repetition_penalty - # TODO: seed, watermark + self.seed = request.parameters.seed + # TODO: watermark self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens # TODO: stop_sequences, ignore_eos_token @@ -176,6 +178,7 @@ def reset(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, s self._tokens = input_ids.clone() self._next_text_token_start = 0 self._next_text_token_end = torch.numel(self._tokens) + self._next_text = "" self._mask = attention_mask.clone() self._selector = selector @@ -184,14 +187,14 @@ def pause(self): Note that the KV cache for this slot will still be filled. """ + # Drop the last token as it will be added back when resuming the slot + self._generated_tokens -= 1 + # Subtract the number of cached tokens from the maximum number of tokens + self._generation_config.max_new_tokens -= self._generated_tokens self._state = Slot.State.PAUSE def resume(self): """Mark the slot as ready for generation.""" - if self._state == Slot.State.PAUSE and self.next_token is not None: - # The generation of this slot was inhibited during a prefill, but it - # already had a pending token, so we need to increase attention mask - self._mask = torch.cat([self._mask, torch.LongTensor([1])]) self._state = Slot.State.READY def _decode_next_tokens( @@ -362,27 +365,32 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: seq_length = min(padded_inputs.input_ids.shape[-1], self.model.max_length) input_ids = padded_inputs.input_ids[:, :seq_length] attention_mask = padded_inputs.attention_mask[:, :seq_length] + # Pause previously active slots during generation and store their last token. 
+ next_tokens = [] + for slot in active_slots: + next_tokens.append(slot.next_token) + slot.pause() # Each slot must be reset with the padded inputs and masks for i, slot in enumerate(self.slots): if slot.state != slot.state.EMPTY: slot_input_ids = input_ids[i : i + 1, :] # Padded input ids are also required to set logits processors and stopping criterias selector = TokenSelector.create( - slot_input_ids, slot.generation_config, self.model, self.model.max_length + slot_input_ids, slot.generation_config, self.model, self.model.max_length, seed=slot.seed ) - slot_input_ids = slot_input_ids.squeeze().type(torch.int64) + slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64) slot_attention_mask = attention_mask[i] slot.reset(slot_input_ids, slot_attention_mask, selector) # Clear KV cache self.model.reset_generation() # Pause previously active slots during generation. - # Their KV cache will be prefilled but new tokens will be ignored, as they - # have already been generated and sent back in the last decode. - for slot in active_slots: - slot.pause() + # The KV cache of paused slots will be prefilled during generation but new tokens + # will be ignored, as they have already been generated and sent back in the last decode. generation, next_batch = self._generate_token(batch.id, input_ids, attention_mask) - # Reactivate previously active slots for the next decode. - for slot in active_slots: + # Reactivate previously active slots for the next decode, and append + # back their next token. + for slot, next_token in zip(active_slots, next_tokens): + slot.append(next_token) slot.resume() logger.debug("Model ready for decoding") return generation, next_batch @@ -436,13 +444,11 @@ def _generate_token( return_dict=True, ) generations = [] - request_ids = [] active_slots = False for i, slot in enumerate(self.slots): if slot.state != Slot.State.READY: continue request_id = slot.request_id - request_ids.append(request_id) next_token_logits = outputs.logits[i : i + 1, -1, :] slot_input_ids = input_ids[i : i + 1, :] next_token = slot.select(slot_input_ids, next_token_logits) @@ -452,7 +458,8 @@ def _generate_token( if next_token == self.tokenizer.eos_token_id: finish_reason = FinishReason.FINISH_REASON_EOS_TOKEN elif slot.stopped: - finish_reason = FinishReason.FINISH_REASON_STOP_SEQUENCE + # For now we only support the length stopping criteria + finish_reason = FinishReason.FINISH_REASON_LENGTH if finish_reason is not None: # We must include the generated text for each finished sequence in the response generated_text = GeneratedText( @@ -467,16 +474,19 @@ def _generate_token( Generation( request_id=request_id, prefill_tokens=None, - token_id=next_token, - token_logprob=None, - token_text=next_token_text, - token_is_special=(next_token in self.special_tokens), + tokens=Tokens( + ids=[next_token], + logprobs=[0], + texts=[next_token_text], + is_special=[next_token in self.special_tokens], + ), generated_text=generated_text, ) ) batch = None if active_slots: # Whatever initial batch these requests came from, we always return all pending requests in a single batch + request_ids = [slot.request_id for slot in self.slots if slot.state == Slot.State.READY] batch = self._cached_batch(next_batch_id, request_ids) else: logger.debug("No more pending requests") diff --git a/text-generation-inference/tests/test_generator.py b/text-generation-inference/tests/test_generator.py new file mode 100644 index 000000000..174f554a0 --- /dev/null +++ b/text-generation-inference/tests/test_generator.py @@ -0,0 +1,207 @@ 
+from tempfile import TemporaryDirectory + +import pytest +from text_generation_server.generator import NeuronGenerator +from text_generation_server.pb.generate_pb2 import ( + Batch, + NextTokenChooserParameters, + Request, + StoppingCriteriaParameters, +) +from transformers import AutoTokenizer + +from optimum.neuron import NeuronModelForCausalLM + + +MODEL_ID = "gpt2" +BATCH_SIZE = 4 +SEQUENCE_LENGTH = 1024 +NUM_CORES = 2 + + +@pytest.fixture(scope="module") +def model_path(): + with TemporaryDirectory() as tmpdir: + AutoTokenizer.from_pretrained(MODEL_ID).save_pretrained(tmpdir) + model = NeuronModelForCausalLM.from_pretrained( + MODEL_ID, export=True, batch_size=BATCH_SIZE, sequence_length=SEQUENCE_LENGTH, num_cores=NUM_CORES + ) + model.save_pretrained(tmpdir) + yield tmpdir + + +def test_info(model_path): + generator = NeuronGenerator.from_pretrained(model_path) + info = generator.info + assert info.requires_padding is True + assert info.device_type == "xla" + assert info.window_size == 0 + assert info.speculate == 0 + + +def create_request( + id: int, + inputs: str, + max_new_tokens=20, + do_sample: bool = False, + top_k: int = 50, + top_p: float = 0.9, + temperature: float = 1.0, + seed: int = 0, + repetition_penalty: float = 1.0, +): + parameters = NextTokenChooserParameters( + temperature=temperature, + top_k=top_k, + top_p=top_p, + do_sample=do_sample, + seed=seed, + repetition_penalty=repetition_penalty, + ) + stopping_parameters = StoppingCriteriaParameters(max_new_tokens=max_new_tokens) + return Request(id=id, inputs=inputs, parameters=parameters, stopping_parameters=stopping_parameters) + + +@pytest.mark.parametrize( + "input_text, token_id, token_text, do_sample", + [ + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + 383, + " The", + False, + ], + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + 198, + "\n", + True, + ], + ], + ids=["greedy", "sample"], +) +@pytest.mark.parametrize("batch_size", [1, 4], ids=["single", "multiple"]) +def test_prefill(input_text, token_id, token_text, do_sample, batch_size, model_path): + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size >= batch_size + requests = [] + max_new_tokens = 20 + for i in range(batch_size): + requests.append(create_request(id=0, inputs=input_text, do_sample=do_sample, max_new_tokens=max_new_tokens)) + # Let's be pessimistic when estimating max_tokens + batch_size * (len(input_text) + max_new_tokens) + batch = Batch(id=0, requests=requests, size=batch_size, max_tokens=batch_size * SEQUENCE_LENGTH) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == batch_size + # Whatever was passed as max_tokens, the server will correct it + # because of static batching + assert next_batch.max_tokens == batch_size * SEQUENCE_LENGTH + assert len(generations) == batch_size + for g in generations: + tokens = g.tokens + assert tokens.ids == [token_id] + assert tokens.texts == [token_text] + + +@pytest.mark.parametrize( + "input_text, max_new_tokens, generated_text, do_sample", + [ + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + 20, + " The sun was setting, and the wind was blowing. The sun was setting, and the wind was", + False, + ], + [ + "It was a bright cold day in April, and the clocks were striking thirteen.", + 20, + "\n\nAt 11:45 a.m. 
a small group of friends gathered outside the hotel to", + True, + ], + ], + ids=["greedy", "sample"], +) +def test_decode_single(input_text, max_new_tokens, generated_text, do_sample, model_path): + generator = NeuronGenerator.from_pretrained(model_path) + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens, do_sample=do_sample) + batch = Batch(id=0, requests=[request], size=1, max_tokens=SEQUENCE_LENGTH) + generations, next_batch = generator.prefill(batch) + # We already generated one token: call decode max_new_tokens - 1 times + for _ in range(max_new_tokens - 1): + assert next_batch.size == 1 + assert next_batch.max_tokens == 1024 + assert len(generations) == 1 + assert len(generations[0].tokens.ids) == 1 + generations, next_batch = generator.decode([next_batch]) + assert next_batch is None + assert len(generations) == 1 + output = generations[0].generated_text + assert output.generated_tokens == max_new_tokens + assert output.finish_reason == 0 + assert output.text == generated_text + + +def test_decode_multiple(model_path): + generator = NeuronGenerator.from_pretrained(model_path) + assert generator.model.batch_size > 1 + input_text = "Once upon a time" + max_new_tokens = 20 + # Prefill a single request, remembering the generated token + tokens = {0: [], 1: []} + request = create_request(id=0, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=0, requests=[request], size=1, max_tokens=SEQUENCE_LENGTH) + generations, next_batch = generator.prefill(batch) + assert next_batch.size == 1 + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == 1 + # Decode a few tokens + gen_tokens = 4 + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert next_batch.size == 1 + # Add a second request + request = create_request(id=1, inputs=input_text, max_new_tokens=max_new_tokens) + batch = Batch(id=1, requests=[request], size=1, max_tokens=SEQUENCE_LENGTH) + generations, next_batch_1 = generator.prefill(batch) + assert next_batch_1.size == 1 + # We should have generated only a single token + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert len(tokens[0]) == gen_tokens + assert len(tokens[1]) == 1 + # Decode more tokens until we reach the maximum for the first request + batches = [next_batch, next_batch_1] + for _ in range(max_new_tokens - gen_tokens): + generations, next_batch = generator.decode(batches) + for g in generations: + tokens[g.request_id].append(g.tokens.ids[0]) + batches = [next_batch] + # Verify we now only have one pending request + assert next_batch.size == 1 + assert len(tokens[0]) == max_new_tokens + assert len(tokens[1]) == max_new_tokens - gen_tokens + 1 + # Verify we have the output for the first request + for g in generations: + if g.request_id == 0: + output = g.generated_text + assert output.text != "" + assert output.generated_tokens == max_new_tokens + generated_text = output.text + # Continue decoding until the end of the second request + for _ in range(gen_tokens - 1): + generations, next_batch = generator.decode([next_batch]) + assert len(generations) == 1 + g = generations[0] + tokens[g.request_id].append(g.tokens.ids[0]) + assert next_batch is None + output = generations[0].generated_text + assert output.generated_tokens 
== max_new_tokens + assert tokens[0] == tokens[1] + assert output.text == generated_text From 00ac6ce26e187f9753793ea0a60493781482ffbb Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Mon, 12 Feb 2024 09:45:58 +0100 Subject: [PATCH 09/12] Fix optimum-cli export for inf1 (#474) * fix + test * improve tests --------- Co-authored-by: JingyaHuang --- optimum/commands/export/neuron.py | 25 ++++++ optimum/exporters/neuron/__main__.py | 19 +++-- optimum/exporters/neuron/convert.py | 22 ++++- tests/cli/test_export_cli.py | 120 ++++++--------------------- tests/exporters/test_export.py | 1 - 5 files changed, 84 insertions(+), 103 deletions(-) diff --git a/optimum/commands/export/neuron.py b/optimum/commands/export/neuron.py index 12213d0d1..43305aeeb 100644 --- a/optimum/commands/export/neuron.py +++ b/optimum/commands/export/neuron.py @@ -46,6 +46,21 @@ def parse_args_neuron(parser: "ArgumentParser"): f" {str(list(TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS.keys()) + list(TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS.keys()))}." ), ) + optional_group.add_argument( + "--library-name", + type=str, + choices=["transformers", "sentence_transformers"], + default=None, + help=("The library on the model. If not provided, will attempt to infer the local checkpoint's library."), + ) + optional_group.add_argument( + "--subfolder", + type=str, + default="", + help=( + "In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, specify the folder name here." + ), + ) optional_group.add_argument( "--atol", type=float, @@ -58,6 +73,16 @@ def parse_args_neuron(parser: "ArgumentParser"): action="store_true", help="Allow to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository.", ) + optional_group.add_argument( + "--compiler_workdir", + type=Path, + help="Path indicating the directory where to store intermediary files generated by Neuron compiler.", + ) + optional_group.add_argument( + "--disable-weights-neff-inline", + action="store_true", + help="Whether to disable the weights / neff graph inline. You can only replace weights of neuron-compiled models when the weights-neff inlining has been disabled during the compilation.", + ) optional_group.add_argument( "--disable-validation", action="store_true", diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 8db4f4a75..f53f008af 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -150,16 +150,19 @@ def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]: def parse_optlevel(args: argparse.Namespace) -> Dict[str, bool]: """ - Parse the level of optimization the compiler should perform. If not specified apply `O2`(the best balance between model performance and compile time). + (NEURONX ONLY) Parse the level of optimization the compiler should perform. If not specified apply `O2`(the best balance between model performance and compile time). 
""" - if args.O1: - optlevel = "1" - elif args.O2: - optlevel = "2" - elif args.O3: - optlevel = "3" + if is_neuronx_available(): + if args.O1: + optlevel = "1" + elif args.O2: + optlevel = "2" + elif args.O3: + optlevel = "3" + else: + optlevel = "2" else: - optlevel = "2" + optlevel = None return optlevel diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 63e85cd44..fb0bcadb9 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -405,7 +405,17 @@ def export( disable_fallback: bool = False, ) -> Tuple[List[str], List[str]]: if is_neuron_available(): - return export_neuron(model, config, output, auto_cast, auto_cast_type, disable_fast_relayout, disable_fallback) + return export_neuron( + model=model, + config=config, + output=output, + compiler_workdir=compiler_workdir, + inline_weights_to_neff=inline_weights_to_neff, + auto_cast=auto_cast, + auto_cast_type=auto_cast_type, + disable_fast_relayout=disable_fast_relayout, + disable_fallback=disable_fallback, + ) elif is_neuronx_available(): return export_neuronx( model=model, @@ -570,6 +580,8 @@ def export_neuron( model: "PreTrainedModel", config: "NeuronDefaultConfig", output: Path, + compiler_workdir: Optional[Path] = None, + inline_weights_to_neff: bool = True, auto_cast: Optional[str] = None, auto_cast_type: str = "bf16", disable_fast_relayout: bool = False, @@ -585,6 +597,10 @@ def export_neuron( The Neuron configuration associated with the exported model. output (`Path`): Directory to store the exported Neuron model. + compiler_workdir (`Optional[Path]`, defaults to `None`): + The directory used by neuron-cc, where you can find intermediary outputs (neff, weight, hlo...). + inline_weights_to_neff (`bool`, defaults to `True`): + Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff. auto_cast (`Optional[str]`, defaults to `None`): Whether to cast operations from FP32 to lower precision to speed up the inference. Can be `None`, `"matmul"` or `"all"`, you should use `None` to disable any auto-casting, use `"matmul"` to cast FP32 matrix multiplication operations, and use `"all"` to cast all FP32 operations. auto_cast_type (`str`, defaults to `"bf16"`): @@ -599,6 +615,8 @@ def export_neuron( the Neuron configuration. """ output.parent.mkdir(parents=True, exist_ok=True) + if isinstance(compiler_workdir, Path): + compiler_workdir = compiler_workdir.as_posix() if hasattr(model, "config"): model.config.return_dict = True @@ -626,6 +644,8 @@ def export_neuron( dummy_inputs_tuple, dynamic_batch_size=config.dynamic_batch_size, compiler_args=compiler_args, + compiler_workdir=compiler_workdir, + separate_weights=not inline_weights_to_neff, fallback=not disable_fallback, ) torch.jit.save(neuron_model, output) diff --git a/tests/cli/test_export_cli.py b/tests/cli/test_export_cli.py index c48c5015e..2bc38eaef 100644 --- a/tests/cli/test_export_cli.py +++ b/tests/cli/test_export_cli.py @@ -13,102 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -import random import subprocess import tempfile import unittest -from itertools import product -from typing import Dict, Optional from optimum.exporters.neuron.model_configs import * # noqa: F403 -from optimum.exporters.tasks import TasksManager -from optimum.neuron.utils import is_neuron_available, is_neuronx_available +from optimum.neuron.utils import is_neuronx_available from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -from optimum.utils import DEFAULT_DUMMY_SHAPES, logging +from optimum.utils import logging logger = logging.get_logger(__name__) # pylint: disable=invalid-name -_COMMOM_COMMANDS = { - "--auto_cast": ["none", "matmul", "all"], - "--auto_cast_type": ["bf16", "fp16"], # "tf32", "mixed" -} -_NEURON_COMMANDS = {} -_NEURONX_COMMANDS = {} -_DYNAMIC_COMMANDS = {"neuron": ["--disable-fast-relayout"], "neuronx": []} - - -def _get_models_to_test(export_models_dict: Dict, random_pick: Optional[int] = 1): - models_to_test = [] - for model_type, model_names_tasks in export_models_dict.items(): - model_type = model_type.replace("_", "-") - task_config_mapping = TasksManager.get_supported_tasks_for_model_type(model_type, "neuron") - - if isinstance(model_names_tasks, str): # test export of all tasks on the same model - tasks = list(task_config_mapping.keys()) - model_tasks = {model_names_tasks: tasks} - else: - n_tested_tasks = sum(len(tasks) for tasks in model_names_tasks.values()) - if n_tested_tasks != len(task_config_mapping): - logger.warning(f"Not all tasks are tested for {model_type}.") - model_tasks = model_names_tasks # possibly, test different tasks on different models - - for model_name, tasks in model_tasks.items(): - for task in tasks: - default_shapes = dict(DEFAULT_DUMMY_SHAPES) - TasksManager.get_exporter_config_constructor( - model_type=model_type, - exporter="neuron", - task=task, - model_name=model_name, - exporter_config_kwargs={**default_shapes}, - ) - - models_to_test.append((f"{model_type}_{task}", model_name, task)) - - if random_pick is not None: - return sorted(random.choices(models_to_test, k=random_pick)) - else: - return sorted(models_to_test) - - -def _get_commands_to_test(models_to_test): - commands_to_test = [] - for test_name, model_name, task in models_to_test: - if is_neuron_available(): - command_items = dict(_COMMOM_COMMANDS, **_NEURON_COMMANDS) - dynamic_args = _DYNAMIC_COMMANDS["neuron"] - elif is_neuronx_available(): - command_items = dict(_COMMOM_COMMANDS, **_NEURONX_COMMANDS) - dynamic_args = _DYNAMIC_COMMANDS["neuronx"] - else: - continue - - base_command = f"optimum-cli export neuron --model {model_name} --task {task}" - - # mandatory shape arguments - model = TasksManager.get_model_from_task(task, model_name, framework="pt") - neuron_config_constructor = TasksManager.get_exporter_config_constructor( - model=model, exporter="neuron", task=task - ) - for axis in neuron_config_constructor.func.get_mandatory_axes_for_task(task): - default_size = DEFAULT_DUMMY_SHAPES[axis] - base_command += f" --{axis} {default_size}" - - # compilation arguments - for extra_arg_options in product(*command_items.values()): - extra_command = " ".join( - [" ".join([arg, option]) for arg, option in zip(command_items, extra_arg_options)] - ) - extra_command += " " + " ".join(random.choices(dynamic_args, k=random.randint(0, len(dynamic_args)))) - command = base_command + " " + extra_command - - commands_to_test.append((test_name + extra_command.strip(), command)) - - return sorted(commands_to_test) - - @is_inferentia_test 
class TestExportCLI(unittest.TestCase): def test_helps_no_raise(self): @@ -121,12 +38,27 @@ def test_helps_no_raise(self): for command in commands: subprocess.run(command, shell=True, check=True) - # @parameterized.expand(_get_commands_to_test(_get_models_to_test(EXPORT_MODELS_TINY)), skip_on_empty=True) - # def test_export_commands(self, test_name, command_content): - # with tempfile.TemporaryDirectory() as tempdir: - # command = command_content + f" {tempdir}" - - # subprocess.run(command, shell=True, check=True) + def test_export_commands(self): + model_id = "hf-internal-testing/tiny-random-BertModel" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + [ + "optimum-cli", + "export", + "neuron", + "--model", + model_id, + "--sequence_length", + "16", + "--batch_size", + "1", + "--task", + "text-classification", + tempdir, + ], + shell=False, + check=True, + ) @requires_neuronx def test_dynamic_batching(self): @@ -178,12 +110,14 @@ def test_opt_level(self): check=True, ) - @requires_neuronx def test_store_intemediary(self): model_id = "hf-internal-testing/tiny-random-BertModel" with tempfile.TemporaryDirectory() as tempdir: save_path = f"{tempdir}/neff" - neff_path = os.path.join(save_path, model_id.split("/")[-1], "graph.neff") + if is_neuronx_available(): + neff_path = os.path.join(save_path, model_id.split("/")[-1], "graph.neff") + else: + neff_path = os.path.join(save_path, model_id.split("/")[-1], "32", "neff.json") subprocess.run( [ "optimum-cli", diff --git a/tests/exporters/test_export.py b/tests/exporters/test_export.py index 80316db9a..a1b8c1ccd 100644 --- a/tests/exporters/test_export.py +++ b/tests/exporters/test_export.py @@ -170,7 +170,6 @@ def test_export(self, test_name, name, model_name, task, neuron_config_construct _get_models_to_test(EXPORT_MODELS_TINY, exclude_model_types=WEIGHTS_NEFF_SEPARATION_UNSUPPORTED_ARCH) ) @is_inferentia_test - @requires_neuronx def test_export_separated_weights(self, test_name, name, model_name, task, neuron_config_constructor): self._neuronx_export( test_name, name, model_name, task, neuron_config_constructor, inline_weights_to_neff=False From 8c2babe1eaeeba1d598d93cf69897196a2591e0b Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Tue, 13 Feb 2024 14:22:24 +0100 Subject: [PATCH 10/12] TGI: bump rust version (#477) chore(tgi): bump rust version --- text-generation-inference/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index bc8249e4b..8a7554bd8 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -8,7 +8,7 @@ RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) # Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04) -FROM lukemathwalker/cargo-chef:latest-rust-1.71-bookworm AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse From 49b14d03bda9faaf7e1e9bed1cd8636a04c8b44c Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:20:20 +0100 Subject: [PATCH 11/12] [AMI] Updates base ami to new id (#482) Update variables.pkr.hcl --- infrastructure/ami/hcl2-files/variables.pkr.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/ami/hcl2-files/variables.pkr.hcl 
b/infrastructure/ami/hcl2-files/variables.pkr.hcl index 57e7214a8..cb77d333e 100644 --- a/infrastructure/ami/hcl2-files/variables.pkr.hcl +++ b/infrastructure/ami/hcl2-files/variables.pkr.hcl @@ -10,7 +10,7 @@ variable "instance_type" { } variable "source_ami" { - default = "ami-0fbea04d7389bcd4e" + default = "ami-0da38db779978a5f7" description = "Base Image" type = string /* @@ -51,4 +51,4 @@ variable "ami_regions" { default = ["eu-west-1"] description = "AWS regions to share AMI with" type = list(string) -} \ No newline at end of file +} From ee0c1f4104ee817daf84107776d9a2d7b92499dd Mon Sep 17 00:00:00 2001 From: Philipp Schmid <32632186+philschmid@users.noreply.github.com> Date: Thu, 15 Feb 2024 16:20:32 +0100 Subject: [PATCH 12/12] [documentation] Add Container overview page. (#481) * add new overview * latest only * fix code intents * correct versions --- docs/source/_toctree.yml | 2 ++ docs/source/containers.mdx | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 docs/source/containers.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index f71a67b51..fd492c926 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -5,6 +5,8 @@ title: Installation - local: quickstart title: Quickstart + - local: containers + title: Optimum Containers - sections: - local: tutorials/overview title: Overview diff --git a/docs/source/containers.mdx b/docs/source/containers.mdx new file mode 100644 index 000000000..2668c46ab --- /dev/null +++ b/docs/source/containers.mdx @@ -0,0 +1,29 @@ + + +# Optimum Neuron Container + +We provide pre-built Optimum Neuron containers for Amazon SageMaker. These containers come with all of the Hugging Face libraries and dependencies pre-installed, so you can start using them right away. +We have containers for training and inference, and optimized text generation containers with TGI. The table is up to date and only includes the latest versions of each container. You can find older versions in the [Deep Learning Container Release Notes](https://github.com/aws/deep-learning-containers/releases?q=hf-neuronx&expanded=true) + +We recommend using the `sagemaker` Python SDK to retrieve the image URI for the container you want to use. + +## Available Optimum Neuron Containers + +| Type | Optimum Version | Image URI | +|-----------------------------|-----------------|---------------------------------------------| +| Training | 0.0.13 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training-neuronx:1.13.1-neuronx-py310-sdk2.15.0-ubuntu20.04` | +| Inference | 0.0.13 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference-neuronx:1.13.1-neuronx-py310-sdk2.15.0-ubuntu20.04` | +| Text Generation Inference | 0.0.17 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:1.13.1-optimum0.0.17-neuronx-py310-ubuntu22.04` | + + +Please replace `763104351884` with the correct [AWS account ID](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/image_uri_config/huggingface-neuronx.json) and `region` with the AWS region you are working in. \ No newline at end of file
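
For illustration, a minimal sketch of consuming one of these images with the `sagemaker` Python SDK is shown below. The image URI is the Text Generation Inference entry from the table above; the execution role ARN, the instance type, and the `HF_*` environment variables (mirroring the batch size, sequence length and core count used in the integration tests) are placeholders and assumptions to adapt to your own setup, not a prescribed configuration.

```python
from sagemaker.huggingface import HuggingFaceModel

# Placeholders: replace the role ARN, and adjust the account/region in the
# image URI as explained above.
role = "arn:aws:iam::111122223333:role/SageMakerExecutionRole"
image_uri = (
    "763104351884.dkr.ecr.us-west-2.amazonaws.com/"
    "huggingface-pytorch-tgi-inference:1.13.1-optimum0.0.17-neuronx-py310-ubuntu22.04"
)

# Point the container at a model (here a pre-compiled Neuron checkpoint from the
# Hub) through environment variables, then deploy on an Inferentia2 instance.
model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    env={
        "HF_MODEL_ID": "aws-neuron/gpt2-neuronx-bs4-seqlen1024",
        "HF_BATCH_SIZE": "4",
        "HF_SEQUENCE_LENGTH": "1024",
        "HF_NUM_CORES": "2",
    },
)
predictor = model.deploy(initial_instance_count=1, instance_type="ml.inf2.xlarge")
print(predictor.endpoint_name)
```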