From 8ebbf9a7e010c4045240d458db66e3adfc9b1b14 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 15 Feb 2024 08:47:16 +0000 Subject: [PATCH 01/13] test(generation): move decoder tests in their own file --- tests/generation/test_generate.py | 62 +-------------------- tests/generation/test_tnx_generate.py | 79 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 61 deletions(-) create mode 100644 tests/generation/test_tnx_generate.py diff --git a/tests/generation/test_generate.py b/tests/generation/test_generate.py index 92170b40a..41eb4bc08 100644 --- a/tests/generation/test_generate.py +++ b/tests/generation/test_generate.py @@ -18,20 +18,12 @@ import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers.generation import StoppingCriteria -from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM +from optimum.neuron import NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, is_trainium_test, requires_neuronx from optimum.neuron.utils.training_utils import patch_generation_mixin_to_general_neuron_generation_mixin -def _test_model_generation(model, tokenizer, batch_size, input_length, **gen_kwargs): - input_ids = torch.ones((batch_size, input_length), dtype=torch.int64) - with torch.inference_mode(): - sample_output = model.generate(input_ids, **gen_kwargs) - assert sample_output.shape[0] == batch_size - - def _test_model_generation_trn(model, tokenizer, batch_size, input_length, **gen_kwargs): import torch_xla.core.xla_model as xm @@ -43,58 +35,6 @@ def _test_model_generation_trn(model, tokenizer, batch_size, input_length, **gen assert sample_output.shape[0] == batch_size -@pytest.mark.parametrize( - "gen_kwargs", - [ - {"do_sample": True}, - {"do_sample": True, "temperature": 0.7}, - {"do_sample": False}, - {"do_sample": False, "repetition_penalty": 1.2}, - ], - ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"], -) -@is_inferentia_test -@requires_neuronx -def test_decoder_generation(neuron_decoder_path, gen_kwargs): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) - _test_model_generation(model, tokenizer, model.batch_size, 10, **gen_kwargs) - - -@is_inferentia_test -@requires_neuronx -def test_model_generation_input_dimensions(neuron_decoder_path): - model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) - tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) - # Using valid input dimensions - _test_model_generation(model, tokenizer, model.batch_size, model.max_length // 2) - # Using an incompatible batch_size - with pytest.raises(ValueError, match="The specified batch_size"): - _test_model_generation(model, tokenizer, model.batch_size + 1, model.max_length) - # Using an incompatible input length - with pytest.raises(ValueError, match="The input sequence length"): - _test_model_generation(model, tokenizer, model.batch_size, input_length=model.max_length * 2) - - -@is_inferentia_test -@requires_neuronx -def test_decoder_generation_custom_stopping_criteria(): - model_id = "hf-internal-testing/tiny-random-gpt2" - model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, batch_size=1) - - class CustomStoppingCriteria(StoppingCriteria): - def __init__(self): - self.called = False - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: - self.called = True - return True - - criteria = 
CustomStoppingCriteria() - model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria]) - assert criteria.called, "Custom StoppingCriteria should have been called" - - @is_inferentia_test @requires_neuronx def test_seq2seq_generation_beam(neuron_seq2seq_beam_path): diff --git a/tests/generation/test_tnx_generate.py b/tests/generation/test_tnx_generate.py new file mode 100644 index 000000000..5838175e5 --- /dev/null +++ b/tests/generation/test_tnx_generate.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch +from transformers import AutoTokenizer +from transformers.generation import StoppingCriteria + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx + + +def _test_generation(model, batch_size, input_length, **gen_kwargs): + input_ids = torch.ones((batch_size, input_length), dtype=torch.int64) + with torch.inference_mode(): + sample_output = model.generate(input_ids, **gen_kwargs) + assert sample_output.shape[0] == batch_size + + +@pytest.mark.parametrize( + "gen_kwargs", + [ + {"do_sample": True}, + {"do_sample": True, "temperature": 0.7}, + {"do_sample": False}, + {"do_sample": False, "repetition_penalty": 1.2}, + ], + ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"], +) +@is_inferentia_test +@requires_neuronx +def test_decoder_generation(neuron_decoder_path, gen_kwargs): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + _test_generation(model, model.batch_size, 10, **gen_kwargs) + + +@is_inferentia_test +@requires_neuronx +def test_model_generation_input_dimensions(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + AutoTokenizer.from_pretrained(neuron_decoder_path) + # Using valid input dimensions + _test_generation(model, model.batch_size, model.max_length // 2) + # Using an incompatible batch_size + with pytest.raises(ValueError, match="The specified batch_size"): + _test_generation(model, model.batch_size + 1, model.max_length) + # Using an incompatible input length + with pytest.raises(ValueError, match="The input sequence length"): + _test_generation(model, model.batch_size, input_length=model.max_length * 2) + + +@is_inferentia_test +@requires_neuronx +def test_decoder_generation_custom_stopping_criteria(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + + class CustomStoppingCriteria(StoppingCriteria): + def __init__(self): + self.called = False + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: + self.called = True + return True + + criteria = CustomStoppingCriteria() + model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria]) + assert criteria.called, "Custom StoppingCriteria should have been called" From 
0d5e4ed1946742eda72cf45681bd26d9b03ee78a Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 15 Feb 2024 08:55:44 +0000 Subject: [PATCH 02/13] test(decoder): increase batch size --- tests/generation/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py index b9d70505f..9d69c5579 100644 --- a/tests/generation/conftest.py +++ b/tests/generation/conftest.py @@ -72,7 +72,7 @@ def export_seq2seq_model_class(request): @requires_neuronx def neuron_decoder_path(export_decoder_id): model = NeuronModelForCausalLM.from_pretrained( - export_decoder_id, export=True, batch_size=1, sequence_length=100, num_cores=2 + export_decoder_id, export=True, batch_size=2, sequence_length=100, num_cores=2 ) model_dir = TemporaryDirectory() model_path = model_dir.name From 81db49ccce4588c5af71c50df06d84238749d258 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 15 Feb 2024 10:07:09 +0000 Subject: [PATCH 03/13] test(decoder): add test to check for unk tokens --- tests/generation/test_tnx_generate.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/generation/test_tnx_generate.py b/tests/generation/test_tnx_generate.py index 5838175e5..94b0f06c8 100644 --- a/tests/generation/test_tnx_generate.py +++ b/tests/generation/test_tnx_generate.py @@ -77,3 +77,29 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa criteria = CustomStoppingCriteria() model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria]) assert criteria.called, "Custom StoppingCriteria should have been called" + + +@is_inferentia_test +@requires_neuronx +def test_decoder_generation_padded_inputs(neuron_decoder_path): + model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path) + assert model.batch_size >= 2 + tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path) + prompt = ( + "It was a bright cold day in April, and the clocks were striking thirteen." + " Winston Smith, his chin nuzzled into his breast in an effort to escape the" + " vile wind, slipped quickly through the glass doors of Victory Mansions," + ) + first_input = tokenizer(prompt) + first_ids = first_input["input_ids"] + first_mask = first_input["attention_mask"] + max_padding = 12 + input_len = len(first_ids) + for i in range(max_padding): + second_ids = [tokenizer.eos_token_id] * i + first_ids[: input_len - i] + second_mask = [0] * i + [1] * (input_len - i) + input_ids = torch.tensor([first_ids, second_ids], dtype=torch.int64) + attention_mask = torch.tensor([first_mask, second_mask], dtype=torch.int64) + outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=False) + # Verify we did not generate any unknown token + assert torch.all(outputs[:, -1] != 0) From b607992ebcf0933de386362757dd38d3d407a64f Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Thu, 15 Feb 2024 18:21:12 +0000 Subject: [PATCH 04/13] test(decoder): test LLama padding issues --- tests/generation/test_tnx_llama.py | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/generation/test_tnx_llama.py diff --git a/tests/generation/test_tnx_llama.py b/tests/generation/test_tnx_llama.py new file mode 100644 index 000000000..b2aff36b0 --- /dev/null +++ b/tests/generation/test_tnx_llama.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer + +from optimum.neuron import NeuronModelForCausalLM +from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx + + +@is_inferentia_test +@requires_neuronx +def test_generation_llama_padded_inputs(): + model_id = "NousResearch/Llama-2-7b-chat-hf" + model_kwargs = {"batch_size": 2, "sequence_length": 2048, "auto_cast_type": "f16", "num_cores": 2} + model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **model_kwargs) + tokenizer = AutoTokenizer.from_pretrained(model_id) + prompt = "One of my fondest memory is of my grandmother making homemade bread" + first_input = tokenizer(prompt) + first_ids = first_input["input_ids"] + first_mask = first_input["attention_mask"] + max_padding = 12 + input_len = len(first_ids) + for i in range(max_padding): + second_ids = [tokenizer.eos_token_id] * i + first_ids[: input_len - i] + second_mask = [0] * i + [1] * (input_len - i) + input_ids = torch.tensor([first_ids, second_ids], dtype=torch.int64) + attention_mask = torch.tensor([first_mask, second_mask], dtype=torch.int64) + outputs = model.generate( + input_ids=input_ids, attention_mask=attention_mask, do_sample=False, max_new_tokens=10 + ) + # Verify we did not generate any unknown token + assert torch.all(outputs[:, -1] != 0) From 101a0ae3ced874c89dabf8d415c2ea7b80f7624c Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 14 Feb 2024 16:22:01 +0000 Subject: [PATCH 05/13] feat(decoder): add default attention_mask --- optimum/neuron/modeling.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py index fa2fdb574..be31ca072 100644 --- a/optimum/neuron/modeling.py +++ b/optimum/neuron/modeling.py @@ -775,7 +775,7 @@ def generate( f"The input sequence length ({sequence_length}) exceeds the model static sequence length ({self.max_length})" ) padded_input_ids = input_ids - padded_attention_mask = attention_mask + padded_attention_mask = torch.ones_like(input_ids) if attention_mask is None else attention_mask if batch_size > self.batch_size: raise ValueError( f"The specified batch_size ({batch_size}) exceeds the model static batch size ({self.batch_size})" @@ -784,10 +784,9 @@ def generate( logger.warning("Inputs will be padded to match the model static batch size. 
This will increase latency.") padding_shape = [self.batch_size - batch_size, sequence_length] padding = torch.full(padding_shape, fill_value=self.config.eos_token_id, dtype=torch.int64) - padded_input_ids = torch.cat([input_ids, padding]) - if attention_mask is not None: - padding = torch.zeros(padding_shape, dtype=torch.int64) - padded_attention_mask = torch.cat([attention_mask, padding]) + padded_input_ids = torch.cat([padded_input_ids, padding]) + padding = torch.zeros(padding_shape, dtype=torch.int64) + padded_attention_mask = torch.cat([padded_attention_mask, padding]) # Drop the current generation context and clear the Key/Value cache self.reset_generation() From d91213c882cf4a10e41dfacf6eae9c85d38b2595 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 14 Feb 2024 10:12:15 +0000 Subject: [PATCH 06/13] refactor(decoder): isolate prefill from decode --- optimum/neuron/modeling.py | 64 +++++++++++-------- .../text_generation_server/generator.py | 19 +++--- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py index be31ca072..abbe8b235 100644 --- a/optimum/neuron/modeling.py +++ b/optimum/neuron/modeling.py @@ -656,15 +656,11 @@ def __init__( generation_config: Optional["GenerationConfig"] = None, ): super().__init__(config, checkpoint_dir, compiled_dir=compiled_dir, generation_config=generation_config) - self.cur_len = 0 self.batch_size = self.model.config.batch_size self.max_length = self.model.config.n_positions # The generate method from GenerationMixin expects the device attribute to be set self.device = torch.device("cpu") - def reset_generation(self): - self.cur_len = 0 - @add_start_docstrings_to_model_forward( NEURON_CAUSALLM_MODEL_FORWARD_DOCSTRING + TEXT_GENERATION_EXAMPLE.format( @@ -688,7 +684,23 @@ def forward( return ModelOutput([("logits", out_logits)]) return (out_logits,) - def prepare_inputs_for_generation( + def prepare_inputs_for_prefill( + self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs + ) -> Dict[str, torch.Tensor]: + # convert attention_mask to start_ids + start_ids = None + if attention_mask is not None: + _, start_ids = attention_mask.max(axis=1) + + model_inputs = { + "input_ids": input_ids, + "cache_ids": None, + "start_ids": start_ids, + } + + return model_inputs + + def prepare_inputs_for_decode( self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs ) -> Dict[str, torch.Tensor]: # convert attention_mask to start_ids @@ -696,17 +708,12 @@ def prepare_inputs_for_generation( if attention_mask is not None: _, start_ids = attention_mask.max(axis=1) - if self.cur_len > 0: - # Only pass the last tokens of each sample - input_ids = input_ids[:, -1:] - # Specify the single index at which the new keys and values need to be stored - cache_ids = torch.as_tensor([self.cur_len], dtype=torch.int32) - else: - # cache_ids will be set directly by the parallel context encoding code - cache_ids = None - - # Increment the current cache index - self.cur_len += input_ids.shape[-1] + # Only pass the last tokens of each sample + input_ids = input_ids[:, -1:] + # Specify the single index at which the new keys and values need to be stored + cache_len = attention_mask.shape[1] + cache_ids = torch.as_tensor([cache_len - 1], dtype=torch.int32) + model_inputs = { "input_ids": input_ids, "cache_ids": cache_ids, @@ -787,8 +794,6 @@ def generate( padded_input_ids = torch.cat([padded_input_ids, padding]) padding = torch.zeros(padding_shape, 
dtype=torch.int64) padded_attention_mask = torch.cat([padded_attention_mask, padding]) - # Drop the current generation context and clear the Key/Value cache - self.reset_generation() output_ids = self.generate_tokens( padded_input_ids, @@ -830,17 +835,15 @@ def generate_tokens( unfinished_sequences = torch.zeros(input_ids.shape[0], dtype=torch.long, device=input_ids.device) unfinished_sequences[:batch_size] = 1 + # Prefill and obtain the first token + model_inputs = self.prepare_inputs_for_prefill(input_ids, attention_mask, **model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + ) + # auto-regressive generation while True: - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, attention_mask, **model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - ) - next_token_logits = outputs.logits[:, -1, :] next_tokens = selector.select(input_ids, next_token_logits) @@ -866,4 +869,11 @@ def generate_tokens( if selector.stopping_criteria(input_ids, None): break + # forward pass to get next token + model_inputs = self.prepare_inputs_for_decode(input_ids, attention_mask, **model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + ) + return input_ids diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py index a568d1049..a867a4583 100644 --- a/text-generation-inference/server/text_generation_server/generator.py +++ b/text-generation-inference/server/text_generation_server/generator.py @@ -381,12 +381,12 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]: slot_input_ids = slot_input_ids.squeeze(dim=0).type(torch.int64) slot_attention_mask = attention_mask[i] slot.reset(slot_input_ids, slot_attention_mask, selector) - # Clear KV cache - self.model.reset_generation() # Pause previously active slots during generation. # The KV cache of paused slots will be prefilled during generation but new tokens # will be ignored, as they have already been generated and sent back in the last decode. - generation, next_batch = self._generate_token(batch.id, input_ids, attention_mask) + model_inputs = self.model.prepare_inputs_for_prefill(input_ids, attention_mask) + logits = self.model(**model_inputs)[0] + generation, next_batch = self._generate_token(batch.id, logits, input_ids) # Reactivate previously active slots for the next decode, and append # back their next token. 
for slot, next_token in zip(active_slots, next_tokens): @@ -433,23 +433,20 @@ def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBa attention_mask[i, :] = slot.attention_mask if input_ids is None: raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)") - return self._generate_token(next_batch_id, input_ids, attention_mask) + model_inputs = self.model.prepare_inputs_for_decode(input_ids, attention_mask) + logits = self.model(**model_inputs)[0] + return self._generate_token(next_batch_id, logits, input_ids) def _generate_token( - self, next_batch_id: int, input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None + self, next_batch_id: int, logits: torch.Tensor, input_ids: torch.LongTensor ) -> Tuple[List[Generation], CachedBatch]: - model_inputs = self.model.prepare_inputs_for_generation(input_ids, attention_mask) - outputs = self.model( - **model_inputs, - return_dict=True, - ) generations = [] active_slots = False for i, slot in enumerate(self.slots): if slot.state != Slot.State.READY: continue request_id = slot.request_id - next_token_logits = outputs.logits[i : i + 1, -1, :] + next_token_logits = logits[i : i + 1, -1, :] slot_input_ids = input_ids[i : i + 1, :] next_token = slot.select(slot_input_ids, next_token_logits) next_token_text = slot.append(next_token) From 6b988abec2a21dd4555cda770039fa7d65657504 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 14 Feb 2024 15:52:31 +0000 Subject: [PATCH 07/13] feat(decoder): add support for continuous batching --- optimum/neuron/modeling.py | 100 ++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 30 deletions(-) diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py index abbe8b235..f9f9182c0 100644 --- a/optimum/neuron/modeling.py +++ b/optimum/neuron/modeling.py @@ -16,7 +16,7 @@ import copy import logging -from typing import TYPE_CHECKING, Dict, Optional, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Union import torch from transformers import ( @@ -656,11 +656,15 @@ def __init__( generation_config: Optional["GenerationConfig"] = None, ): super().__init__(config, checkpoint_dir, compiled_dir=compiled_dir, generation_config=generation_config) - self.batch_size = self.model.config.batch_size - self.max_length = self.model.config.n_positions + self.batch_size = self.config.neuron["batch_size"] + self.max_length = self.config.neuron["sequence_length"] + self.continuous_batching = self.model.neuron_config and self.model.neuron_config.continuous_batching # The generate method from GenerationMixin expects the device attribute to be set self.device = torch.device("cpu") + def reset_generation(self): + pass + @add_start_docstrings_to_model_forward( NEURON_CAUSALLM_MODEL_FORWARD_DOCSTRING + TEXT_GENERATION_EXAMPLE.format( @@ -684,44 +688,83 @@ def forward( return ModelOutput([("logits", out_logits)]) return (out_logits,) - def prepare_inputs_for_prefill( - self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs - ) -> Dict[str, torch.Tensor]: - # convert attention_mask to start_ids + def get_start_ids( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + seq_ids: Optional[torch.Tensor] = None, + ): + # The start_ids parameter has different meanings: + # - for continuous (unpadded) batching it corresponds to the sequence id, + # - for static batching it corresponds to the start of the padded sequence. 
+ if self.continuous_batching: + if seq_ids is None: + seq_ids = torch.arange(input_ids.shape[0]) + else: + assert seq_ids.shape[0] == input_ids.shape[0] + return seq_ids start_ids = None if attention_mask is not None: _, start_ids = attention_mask.max(axis=1) + return start_ids + + def get_cache_ids(self, attention_mask: torch.tensor, prefill: bool): + cache_n, cache_len = attention_mask.shape + if self.continuous_batching: + # Evaluate the inputs that are not masked for each sequence + input_length = attention_mask.sum(axis=1) + if not prefill: + # When decoding, cache_ids contains a single value per sequence + return (input_length - 1).unsqueeze(1) + # When prefilling, cache_ids is an increasing range + cache_ids = torch.zeros_like(attention_mask) + for i in range(cache_n): + cur_length = input_length[i] + cache_ids[i, :cur_length] = torch.arange(cur_length) + return cache_ids + # Static batching + return None if prefill else torch.tensor([cache_len - 1], dtype=torch.int32) - model_inputs = { + def prepare_inputs_for_prefill( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + seq_ids: Optional[List[int]] = None, + **kwargs, + ) -> Dict[str, torch.Tensor]: + start_ids = self.get_start_ids(input_ids, attention_mask, seq_ids=seq_ids) + cache_ids = self.get_cache_ids(attention_mask, prefill=True) + if self.continuous_batching and torch.any(attention_mask[:, 0] == 0): + # Inputs are left padded: we need to invert padding as continuous batching requires right-padding + batch_size, seq_len = input_ids.shape + input_length = attention_mask.sum(axis=1) + new_input_ids = torch.zeros_like(input_ids) + for i in range(batch_size): + cur_length = input_length[i] + new_input_ids[i, :cur_length] = input_ids[i, seq_len - cur_length :] + input_ids = new_input_ids + return { "input_ids": input_ids, - "cache_ids": None, + "cache_ids": cache_ids, "start_ids": start_ids, } - return model_inputs - def prepare_inputs_for_decode( - self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, **kwargs + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + seq_ids: Optional[List[int]] = None, ) -> Dict[str, torch.Tensor]: - # convert attention_mask to start_ids - start_ids = None - if attention_mask is not None: - _, start_ids = attention_mask.max(axis=1) - + start_ids = self.get_start_ids(input_ids, attention_mask, seq_ids=seq_ids) + cache_ids = self.get_cache_ids(attention_mask, prefill=False) # Only pass the last tokens of each sample input_ids = input_ids[:, -1:] - # Specify the single index at which the new keys and values need to be stored - cache_len = attention_mask.shape[1] - cache_ids = torch.as_tensor([cache_len - 1], dtype=torch.int32) - - model_inputs = { + return { "input_ids": input_ids, "cache_ids": cache_ids, "start_ids": start_ids, } - return model_inputs - def can_generate(self) -> bool: """Returns True to validate the check made in `GenerationMixin.generate()`.""" return True @@ -799,7 +842,7 @@ def generate( padded_input_ids, selector, batch_size, - attention_mask=padded_attention_mask, + padded_attention_mask, **model_kwargs, ) return output_ids[:batch_size, :] @@ -809,7 +852,7 @@ def generate_tokens( input_ids: torch.LongTensor, selector: TokenSelector, batch_size: int, - attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor, **model_kwargs, ) -> torch.LongTensor: r""" @@ -853,10 +896,7 @@ def generate_tokens( # update inputs for the next step input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if 
attention_mask is not None: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) + attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1) # if eos_token was found in one sentence, set sentence to finished unfinished_sequences = unfinished_sequences * next_tokens.ne(selector.eos_token_id) From 2a9d31288aec96a5db8e11cba5bc48e037cb07a5 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 14 Feb 2024 09:50:19 +0000 Subject: [PATCH 08/13] feat(exporters): decoders with continuous batching --- optimum/exporters/neuron/base.py | 7 ++++++- optimum/exporters/neuron/model_configs.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 240859e69..0b11b869b 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -379,7 +379,7 @@ class NeuronDecoderConfig(NeuronConfig): be passed to export the model, - NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model. It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling` - [`~optimum.utils.DummyInputGenerator`] specifying how to create dummy inputs. + - CONTINUOUS_BATCHING (`bool`) -- Whether the model supports continuous batching or not. The NEURONX_CLASS must always be defined in each model configuration. @@ -389,6 +389,7 @@ class NeuronDecoderConfig(NeuronConfig): INPUT_ARGS = ("batch_size", "sequence_length") NEURONX_CLASS = None + CONTINUOUS_BATCHING = False def __init__(self, task: str): if not is_transformers_neuronx_available(): @@ -404,3 +405,7 @@ def __init__(self, task: str): @property def neuronx_class(self): return self._neuronx_class + + @property + def continuous_batching(self): + return self.CONTINUOUS_BATCHING diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 1b6ce4b2e..045589f3b 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -431,6 +431,7 @@ class GPT2NeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("llama", "text-generation") class LLamaNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "llama.model.LlamaForSampling" + CONTINUOUS_BATCHING = True @register_in_tasks_manager("t5-encoder", "text2text-generation") @@ -533,3 +534,4 @@ def generate_io_aliases(self, model): @register_in_tasks_manager("mistral", "text-generation") class MistralNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "mistral.model.MistralForSampling" + CONTINUOUS_BATCHING = True From f5dec76a70b48caf53e37fde624033d64c270add Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Wed, 14 Feb 2024 09:51:37 +0000 Subject: [PATCH 09/13] feat(decoder): continuous_batching used by default --- optimum/neuron/modeling_decoder.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py index fdf6fbaa7..e09b0a18e 100644 --- a/optimum/neuron/modeling_decoder.py +++ b/optimum/neuron/modeling_decoder.py @@ -35,6 +35,7 @@ if is_transformers_neuronx_available(): + from transformers_neuronx.config import ContinuousBatchingConfig, NeuronConfig from transformers_neuronx.module import save_split @@ -131,16 +132,26 @@ def __init__( exporter = get_exporter(config, task) - # transformers-neuronx uses f32/f16 instead 
of fp32/fp16 - auto_cast_type = auto_cast_type.replace("p", "") + tnx_kwargs = { + "batch_size": batch_size, + "tp_degree": num_cores, + # transformers-neuronx uses f32/f16 instead of fp32/fp16 + "amp": auto_cast_type.replace("p", ""), + } + if batch_size > 1 and exporter.continuous_batching: + # Continuous batching is always enabled for models that support it because static batching + # is broken for these models: see https://github.com/aws-neuron/transformers-neuronx/issues/79 + tnx_kwargs["neuron_config"] = NeuronConfig( + continuous_batching=ContinuousBatchingConfig(batch_size_for_shared_caches=batch_size) + ) + tnx_kwargs["n_positions"] = [sequence_length] + tnx_kwargs["context_length_estimate"] = [sequence_length] + else: + tnx_kwargs["n_positions"] = sequence_length + + # Instantiate neuronx model checkpoint_path = checkpoint_dir.name if isinstance(checkpoint_dir, TemporaryDirectory) else checkpoint_dir - neuronx_model = exporter.neuronx_class.from_pretrained( - checkpoint_path, - batch_size=batch_size, - n_positions=sequence_length, - tp_degree=num_cores, - amp=auto_cast_type, - ) + neuronx_model = exporter.neuronx_class.from_pretrained(checkpoint_path, **tnx_kwargs) if compiled_dir is not None: # Specify the path where compiled artifacts are stored before conversion From c637e710f2de862689b3f6e2ec2f229ba5ad395c Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 16 Feb 2024 09:15:49 +0000 Subject: [PATCH 10/13] fix(tgi): use SDK 2.17 torch-neuronx version --- text-generation-inference/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 28568edc2..494651ef4 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -102,7 +102,7 @@ ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}" RUN pip3 install \ neuronx-cc==2.12.68.0 \ - torch-neuronx==1.13.1.1.13.0 \ + torch-neuronx==1.13.1.1.13.1 \ transformers-neuronx==0.9.474 \ --extra-index-url=https://pip.repos.neuron.amazonaws.com From ce3f88eea3d7e061004a0131fd91d0d2fbe4791e Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Fri, 16 Feb 2024 09:27:54 +0000 Subject: [PATCH 11/13] fix(tgi): use correct versions in Dockerfile --- text-generation-inference/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 494651ef4..c5c65e152 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -92,8 +92,8 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ aws-neuronx-dkms=2.15.9.0 \ - aws-neuronx-collectives=2.20.11.0 \ - aws-neuronx-runtime-lib=2.20.11.0 \ + aws-neuronx-collectives=2.20.11.0-c101c322e \ + aws-neuronx-runtime-lib=2.20.11.0-b7d33e68b \ aws-neuronx-tools=2.17.0.0 \ && rm -rf /var/lib/apt/lists/* \ && apt-get clean From f45d62028bc8a8cfa6184a0330a6754e8a10e459 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 19 Feb 2024 09:15:19 +0000 Subject: [PATCH 12/13] feat(tgi): bump version and use max-batch-size --- Makefile | 2 +- text-generation-inference/README.md | 20 ++++++-------------- text-generation-inference/server/Makefile | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index e9ec19103..036eaca83 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,7 @@ PACKAGE_FILES = 
$(PACKAGE_PYTHON_FILES) \ $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES) python -m build -TGI_VERSION ?= 1.4.0 +TGI_VERSION ?= 1.4.1 neuronx-tgi: $(PACKAGE_DIST) docker build --rm -f text-generation-inference/Dockerfile \ diff --git a/text-generation-inference/README.md b/text-generation-inference/README.md index c0c2fd949..adfe8b983 100644 --- a/text-generation-inference/README.md +++ b/text-generation-inference/README.md @@ -100,11 +100,9 @@ docker run -p 8080:80 \ -e HF_TOKEN=${HF_TOKEN} \ ghcr.io/huggingface/neuronx-tgi:latest \ --model-id aws-neuron/Llama-2-7b-hf-neuron-budget \ - --max-concurrent-requests 1 \ + --max-batch-size 1 \ --max-input-length 1024 \ - --max-total-tokens 2048 \ - --max-batch-prefill-tokens 1024 \ - --max-batch-total-tokens 2048 + --max-total-tokens 2048 ``` ### Using a standard model from the 🤗 [HuggingFace Hub](https://huggingface.co/aws-neuron) @@ -130,11 +128,9 @@ docker run -p 8080:80 \ -e HF_NUM_CORES=2 \ ghcr.io/huggingface/neuronx-tgi:latest \ --model-id aws-neuron/Llama-2-7b-hf-neuron-budget \ - --max-concurrent-requests 1 \ + --max-batch-size 1 \ --max-input-length 512 \ - --max-total-tokens 1024 \ - --max-batch-prefill-tokens 512 \ - --max-batch-total-tokens 1024 + --max-total-tokens 1024 ``` ### Using a model exported to a local path @@ -162,15 +158,11 @@ The configuration of an inference endpoint is always a compromise between throug The neuron models have static input dimensions `[batch_size, max_length]`. -It leads to a maximum number of tokens of `max_tokens = batch_size * max_length`. - This adds several restrictions to the following parameters: -- `--max-concurrent-requests` must be set to `batch size`, +- `--max-batch-size` must be set to `batch size`, - `--max-input-length` must be lower than `max_length`, -- `--max-total-tokens` must be set to `max_length` (it is per-request), -- `--max-batch-prefill-tokens` must be set to `batch_size * max_input_length`, -- `--max-batch-total-tokens` must be set to `max_tokens`. +- `--max-total-tokens` must be set to `max_length` (it is per-request). ### Choosing the correct batch size diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile index e16ab6585..dc8bf3290 100644 --- a/text-generation-inference/server/Makefile +++ b/text-generation-inference/server/Makefile @@ -2,7 +2,7 @@ pkg_name := text_generation_server BUILDDIR ?= $(CURDIR)/build VERSION ?= 0.0.1 -TGI_VERSION ?= 1.4.0 +TGI_VERSION ?= 1.4.1 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST))) mkfile_dir := $(dir $(mkfile_path)) pkg_dir := $(BUILDDIR)/$(pkg_name) From 513a21454c6f51365fe24f22b2efae157b62bd23 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 19 Feb 2024 10:44:05 +0000 Subject: [PATCH 13/13] review: address comments --- optimum/exporters/neuron/base.py | 2 +- optimum/neuron/modeling.py | 10 +++------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/neuron/base.py b/optimum/exporters/neuron/base.py index 0b11b869b..303e56793 100644 --- a/optimum/exporters/neuron/base.py +++ b/optimum/exporters/neuron/base.py @@ -379,7 +379,7 @@ class NeuronDecoderConfig(NeuronConfig): be passed to export the model, - NEURONX_CLASS (`str`) -- the name of the transformers-neuronx class to instantiate for the model. It is a full class name defined relatively to the transformers-neuronx module, e.g. `gpt2.model.GPT2ForSampling` - - CONTINUOUS_BATCHING (`bool`) -- Whether the model supports continuous batching or not. 
+ - CONTINUOUS_BATCHING (`bool`, , defaults to `False`) -- Whether the model supports continuous batching or not. The NEURONX_CLASS must always be defined in each model configuration. diff --git a/optimum/neuron/modeling.py b/optimum/neuron/modeling.py index f9f9182c0..8d26c83fc 100644 --- a/optimum/neuron/modeling.py +++ b/optimum/neuron/modeling.py @@ -726,11 +726,7 @@ def get_cache_ids(self, attention_mask: torch.tensor, prefill: bool): return None if prefill else torch.tensor([cache_len - 1], dtype=torch.int32) def prepare_inputs_for_prefill( - self, - input_ids: torch.Tensor, - attention_mask: torch.Tensor, - seq_ids: Optional[List[int]] = None, - **kwargs, + self, input_ids: torch.Tensor, attention_mask: torch.Tensor, seq_ids: Optional[List[int]] = None ) -> Dict[str, torch.Tensor]: start_ids = self.get_start_ids(input_ids, attention_mask, seq_ids=seq_ids) cache_ids = self.get_cache_ids(attention_mask, prefill=True) @@ -879,7 +875,7 @@ def generate_tokens( unfinished_sequences[:batch_size] = 1 # Prefill and obtain the first token - model_inputs = self.prepare_inputs_for_prefill(input_ids, attention_mask, **model_kwargs) + model_inputs = self.prepare_inputs_for_prefill(input_ids, attention_mask) outputs = self( **model_inputs, return_dict=True, @@ -910,7 +906,7 @@ def generate_tokens( break # forward pass to get next token - model_inputs = self.prepare_inputs_for_decode(input_ids, attention_mask, **model_kwargs) + model_inputs = self.prepare_inputs_for_decode(input_ids, attention_mask) outputs = self( **model_inputs, return_dict=True,
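
Editor's note: the sketch below is not part of the patch series. It is a minimal illustration of how the prefill/decode split introduced in patches 06 and 07 can be driven directly, mirroring what `generate_tokens` and the TGI `generator.py` changes do. The tiny GPT-2 checkpoint and the export shapes are borrowed from the test fixtures in the series; the greedy argmax loop, the dummy all-ones inputs and the 10-token budget are assumptions made for brevity rather than the library's canonical usage.

```python
import torch

from optimum.neuron import NeuronModelForCausalLM

# Export a small decoder with the static shapes used in the test fixtures
model = NeuronModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gpt2", export=True, batch_size=2, sequence_length=100, num_cores=2
)

# Dummy inputs matching the model static batch size, as in _test_generation
input_ids = torch.ones((model.batch_size, 10), dtype=torch.int64)
attention_mask = torch.ones_like(input_ids)

# Prefill: populate the KV cache for the whole prompt and get the first logits
model_inputs = model.prepare_inputs_for_prefill(input_ids, attention_mask)
logits = model(**model_inputs)[0]

for _ in range(10):
    # Greedy selection of the next token for every sequence in the batch
    # (generate_tokens delegates this to a TokenSelector instead)
    next_tokens = logits[:, -1, :].argmax(dim=-1, keepdim=True)
    input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    attention_mask = torch.cat(
        [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
    )
    # Decode: only the last token is passed, cache_ids are deduced from the mask
    model_inputs = model.prepare_inputs_for_decode(input_ids, attention_mask)
    logits = model(**model_inputs)[0]

print(input_ids.shape)  # torch.Size([2, 20])
```

Separating prefill from decode is what lets the TGI generator prefill new requests and keep decoding running ones without calling `reset_generation()` between batches. For models whose exporter config sets CONTINUOUS_BATCHING (llama, mistral), `prepare_inputs_for_prefill` and `prepare_inputs_for_decode` also accept per-request `seq_ids`, so each sequence keeps its own slot in the shared KV cache.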