From b8c2b6489e363e16785963256ea2c51c51b7ac6d Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Thu, 23 Nov 2023 12:18:01 +0000
Subject: [PATCH 1/6] feat(tgi): fetch TGI .proto if not provided

This makes it easier to build the server outside of the Dockerfile.

Note that when the server is built outside of the Dockerfile on
Debian/Ubuntu, only the wheel can be installed. This might be related
to an issue with python-pip.
---
 text-generation-inference/server/Makefile | 28 +++++++++++++------
 .../server/pyproject.toml                 |  3 ++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index 6ae41d4ca..adcaf6488 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -13,14 +13,6 @@ src_dir := $(mkfile_dir)/$(pkg_name)
 sources := $(wildcard $(src_dir)/*.py)
 deployed_sources := $(subst $(src_dir), $(pkg_dir), $(sources))
 
-# Three python files are generated for each protobuf
-protobufs := $(wildcard $(PROTODIR)/*.proto)
-pkg_pb_dir := $(pkg_dir)/pb
-generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
-generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
-generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
-generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))
-
 # Static files are just copied
 
 define COPY
@@ -37,6 +29,24 @@ $(pkg_dir)/%.py: $(src_dir)/%.py
 
 # Generated files are produced by grpcio tools
 
+# If not provided, fetch proto files from TGI
+ifndef PROTODIR
+PROTODIR := $(BUILDDIR)/tgi/proto
+endif
+
+$(BUILDDIR)/tgi/proto/%.proto:
+	install -d $(BUILDDIR)/tgi
+	curl -L https://github.com/huggingface/text-generation-inference/archive/refs/tags/v1.0.2.tar.gz --output $(BUILDDIR)/tgi/sources.tar.gz
+	tar -C $(BUILDDIR)/tgi -xf $(BUILDDIR)/tgi/sources.tar.gz --strip-components=1
+
+# Three python files are generated for each protobuf
+protobufs := $(PROTODIR)/generate.proto
+pkg_pb_dir := $(pkg_dir)/pb
+generated_sources_base := $(foreach proto, $(protobufs), $(proto:.proto=_pb2.py))
+generated_sources := $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base))
+generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=.pyi))
+generated_sources += $(subst $(PROTODIR), $(pkg_pb_dir), $(generated_sources_base:.py=_grpc.py))
+
 $(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
 	mkdir -p $(pkg_pb_dir)
 	python -m grpc_tools.protoc -I$(PROTODIR) --python_out=$(pkg_pb_dir) \
@@ -44,4 +54,4 @@ $(pkg_pb_dir)/%_pb2.py $(pkg_pb_dir)/%_pb2.pyi $(pkg_pb_dir)/%_pb2_grpc.py: $(PROTODIR)/%.proto
 	sed -i -e 's/^\(import.*pb2\)/from . \1/g' $(pkg_pb_dir)/$*_pb2_grpc.py
 
 gen-server: $(BUILDDIR)/pyproject.toml $(deployed_sources) $(generated_sources)
-	python -m build $(BUILDDIR) --sdist
+	python -m build $(BUILDDIR)

diff --git a/text-generation-inference/server/pyproject.toml b/text-generation-inference/server/pyproject.toml
index 7261e8d44..bc1eed640 100644
--- a/text-generation-inference/server/pyproject.toml
+++ b/text-generation-inference/server/pyproject.toml
@@ -18,5 +18,8 @@ dependencies = [
   'loguru == 0.6.0'
 ]
 
+[tool.setuptools]
+packages = ["text_generation_server", "text_generation_server.pb"]
+
 [project.scripts]
 text-generation-server = 'text_generation_server.cli:app'
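
Note on the generated sources above: after `make gen-server`, grpcio-tools
emits generate_pb2.py, generate_pb2.pyi and generate_pb2_grpc.py under
text_generation_server/pb. A minimal sketch of how the generated message
stubs are consumed (the Request message and its id/inputs fields come from
TGI's generate.proto at v1.0.2; the literal values are illustrative):

    # Sketch: build a Request message from the stubs produced by grpc_tools.protoc.
    from text_generation_server.pb.generate_pb2 import Request

    request = Request(id=0, inputs="It was a bright cold day in April")
    print(request.id, request.inputs)
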
From 289cd827890e2985bab697e2a212cc17b4f4ed61 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 24 Nov 2023 09:10:34 +0000
Subject: [PATCH 2/6] feat(tgi): align server version to optimum-neuron

---
 text-generation-inference/Dockerfile            | 10 +++++-----
 text-generation-inference/server/Makefile       |  2 ++
 text-generation-inference/server/pyproject.toml |  2 +-
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index 5b2036c7f..c92479478 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -51,6 +51,10 @@ RUN apt-get update -y \
   && apt-get clean
 RUN pip3 --no-cache-dir install --upgrade pip
 
+# VERSION is a mandatory parameter
+ARG VERSION
+RUN test -n ${VERSION:?}
+
 # Python server build image
 FROM base AS pyserver
 
@@ -66,15 +70,11 @@ WORKDIR /pyserver
 COPY text-generation-inference/server server
 COPY --from=tgi /tgi/proto proto
 RUN pip3 install -r server/build-requirements.txt
-RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto make -C server gen-server
+RUN VERBOSE=1 BUILDDIR=/pyserver/build PROTODIR=/pyserver/proto VERSION=${VERSION} make -C server gen-server
 
 # Neuron base image (used for deployment)
 FROM base AS neuron
 
-# VERSION is a mandatory parameter
-ARG VERSION
-RUN test -n ${VERSION:?}
-
 # Install system prerequisites
 RUN apt-get update -y \
   && apt-get install -y --no-install-recommends \
diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index adcaf6488..de1c403f8 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -1,6 +1,7 @@
 # Initialize base variables
 pkg_name := text_generation_server
 BUILDDIR ?= $(CURDIR)/build_$(pkg_name)
+VERSION ?= 0.0.1
 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
 pkg_dir := $(BUILDDIR)/$(pkg_name)
@@ -22,6 +23,7 @@ endef
 $(BUILDDIR)/pyproject.toml: $(mkfile_dir)/pyproject.toml
 	mkdir -p $(BUILDDIR)
 	$(COPY)
+	sed -i -e 's/version = "VERSION"/version = \"${VERSION}\"/' $@
 
 $(pkg_dir)/%.py: $(src_dir)/%.py
 	mkdir -p $(pkg_dir)
diff --git a/text-generation-inference/server/pyproject.toml b/text-generation-inference/server/pyproject.toml
index bc1eed640..869d341cb 100644
--- a/text-generation-inference/server/pyproject.toml
+++ b/text-generation-inference/server/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "text-generation-server"
-version = "0.0.1"
+version = "VERSION"
 authors = [{name="David Corvoysier", email="david@huggingface.co" }]
 description = "TGI compatible inference server for AWS Neuronx platforms"
 dependencies = [
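
The version is now injected in two steps: pyproject.toml ships with the
placeholder version = "VERSION", and the Makefile's sed call rewrites it in
the build copy. A rough Python equivalent of that substitution, for
illustration only (0.0.17 is a hypothetical version string):

    # Illustration of the Makefile's sed substitution on the copied pyproject.toml.
    import re

    toml_line = 'version = "VERSION"'
    print(re.sub(r'version = "VERSION"', 'version = "0.0.17"', toml_line))
    # version = "0.0.17"
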
From 83910e99c251c15012cfcc6a8c28540ee3316093 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 24 Nov 2023 11:05:59 +0000
Subject: [PATCH 3/6] test(Makefile): added tgi_test target

---
 Makefile | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 118b37484..71011ca12 100644
--- a/Makefile
+++ b/Makefile
@@ -24,7 +24,7 @@ clean:
 
 rwildcard=$(wildcard $1) $(foreach d,$1,$(call rwildcard,$(addsuffix /$(notdir $d),$(wildcard $(dir $d)*))))
 
-VERSION := $(shell python -W ignore -c "from optimum.neuron.version import __version__; print(__version__)")
+VERSION := $(shell gawk 'match($$0, /__version__ = "(.*)"/, a) {print a[1]}' optimum/neuron/version.py)
 
 PACKAGE_DIST = dist/optimum-neuron-$(VERSION).tar.gz
 PACKAGE_WHEEL = dist/optimum_neuron-$(VERSION)-py3-none-any.whl
@@ -71,6 +71,20 @@ build_dist: ${PACKAGE_DIST} ${PACKAGE_WHEEL}
 pypi_upload: ${PACKAGE_DIST} ${PACKAGE_WHEEL}
 	python -m twine upload ${PACKAGE_DIST} ${PACKAGE_WHEEL}
 
+# Tests
+
 test_installs:
 	python -m pip install .[tests]
 	python -m pip install git+https://github.com/huggingface/transformers.git
+
+# Stand-alone TGI server for unit tests outside of TGI container
+tgi_server:
+	python -m pip install -r text-generation-inference/server/build-requirements.txt
+	make -C text-generation-inference/server clean
+	VERSION=${VERSION} make -C text-generation-inference/server gen-server
+
+tgi_test: tgi_server
+	python -m pip install .[neuronx] pytest
+	find text-generation-inference -name "text_generation_server-$(VERSION)-py3-none-any.whl" \
+	    -exec python -m pip install --force-reinstall {} \;
+	python -m pytest -s text-generation-inference/tests
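
The gawk one-liner reads the version straight from the source file, so
computing VERSION no longer requires optimum-neuron and its dependencies to
be importable. A Python equivalent, for reference (it assumes
optimum/neuron/version.py contains a line matching __version__ = "..."):

    # Reference equivalent of the Makefile's gawk version extraction.
    import re

    with open("optimum/neuron/version.py") as f:
        version = re.search(r'__version__ = "(.*)"', f.read()).group(1)
    print(version)
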
From 595bf8da35811b9d12733caf9bf749185976f33b Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 24 Nov 2023 11:07:54 +0000
Subject: [PATCH 4/6] test(tgi): test decoding with streamed tokens

---
 .../tests/test_generator_slot.py | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 text-generation-inference/tests/test_generator_slot.py

diff --git a/text-generation-inference/tests/test_generator_slot.py b/text-generation-inference/tests/test_generator_slot.py
new file mode 100644
index 000000000..2f243b5d4
--- /dev/null
+++ b/text-generation-inference/tests/test_generator_slot.py
@@ -0,0 +1,61 @@
+import pytest
+import torch
+from text_generation_server.generator import Slot
+from text_generation_server.pb.generate_pb2 import Request
+from transformers import AutoTokenizer, GenerationConfig
+
+
+TOKENIZERS = ["NousResearch/Llama-2-7b-hf", "gpt2"]
+
+
+@pytest.fixture(params=TOKENIZERS)
+def tokenizer(request):
+    t = AutoTokenizer.from_pretrained(request.param)
+    t.padding_side = "left"
+    t.pad_token_id = t.eos_token_id
+    return t
+
+
+@pytest.mark.parametrize(
+    "input_text, generated_text",
+    [
+        [
+            "It was a bright cold day in April, and the clocks were striking thirteen.",
+            " Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind,"
+            " slipped quickly through the glass doors of Victory Mansions, though not quickly enough"
+            " to prevent a swirl of gritty dust from entering along with him.",
+        ],
+        ["This sentence is written in chinese:", "我很感谢你的热情"],
+        ["Some text might contain a lot of emojis like 😃", "😍💪 👉 👀"],
+    ],
+    ids=["spaces", "chinese-utf8", "emojis"],
+)
+def test_decode_streaming(tokenizer, input_text, generated_text):
+    slot = Slot(0, tokenizer)
+    request = Request(id=0, inputs=input_text)
+    slot.assign(request, GenerationConfig())
+    assert slot.cached_text == input_text
+
+    inputs = tokenizer(input_text, padding="max_length", max_length=len(input_text) + 1, return_tensors="pt")
+    input_ids = inputs["input_ids"][0]
+    attention_mask = inputs["attention_mask"][0]
+    generated_tokens = tokenizer(generated_text, add_special_tokens=False)["input_ids"]
+
+    # We need to regenerate the full text as the tokenizer might change it (extra spaces might be added)
+    all_input_ids = torch.cat([input_ids, torch.tensor(generated_tokens)])
+    full_text = tokenizer.decode(all_input_ids, skip_special_tokens=True)
+    regenerated_text = full_text[len(input_text) :]
+
+    # Initialize the slot with the inputs
+    slot.reset(input_ids, attention_mask, selector=None)
+
+    assert slot.generated_tokens == 0
+
+    # Simulate an iterative generation (i.e. don't call select and use known tokens instead)
+    decoded_text = ""
+    for i in range(len(generated_tokens)):
+        text = slot.append(generated_tokens[i])
+        assert slot.generated_tokens == i + 1
+        decoded_text += text
+
+    assert decoded_text == regenerated_text

From 667e238eb233a47aefd7dc1533c32edcff2aaa23 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 24 Nov 2023 14:58:26 +0000
Subject: [PATCH 5/6] ci: added TGI workflow

---
 .github/workflows/test_inf2_tgi.yml | 40 +++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/test_inf2_tgi.yml

diff --git a/.github/workflows/test_inf2_tgi.yml b/.github/workflows/test_inf2_tgi.yml
new file mode 100644
index 000000000..fe0cb87e6
--- /dev/null
+++ b/.github/workflows/test_inf2_tgi.yml
@@ -0,0 +1,40 @@
+name: Optimum neuron / Test TGI on INF2
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - "setup.py"
+      - "optimum/**.py"
+      - "text-generation-inference/**"
+  pull_request:
+    branches: [ main ]
+    paths:
+      - "setup.py"
+      - "optimum/**.py"
+      - "text-generation-inference/**"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  do-the-job:
+    name: Run TGI tests
+    runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
+    env:
+      AWS_REGION: us-east-1
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Install python and create venv
+        run: |
+          sudo apt install python3.8-venv -y
+          python3 -m venv aws_neuron_venv_pytorch
+          source aws_neuron_venv_pytorch/bin/activate
+          python -m pip install -U pip
+          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+      - name: Run TGI server python tests
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} make tgi_test

From 7aefeacee77c2cb317caa1f887c4698e9f8869a4 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 7 Feb 2024 08:39:28 +0000
Subject: [PATCH 6/6] fix(tgi): use git ignored build dir name

---
 text-generation-inference/server/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/text-generation-inference/server/Makefile b/text-generation-inference/server/Makefile
index de1c403f8..da5e38ffb 100644
--- a/text-generation-inference/server/Makefile
+++ b/text-generation-inference/server/Makefile
@@ -1,6 +1,6 @@
 # Initialize base variables
 pkg_name := text_generation_server
-BUILDDIR ?= $(CURDIR)/build_$(pkg_name)
+BUILDDIR ?= $(CURDIR)/build
 VERSION ?= 0.0.1
 mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
 mkfile_dir := $(dir $(mkfile_path))
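
For context on the streaming test added in PATCH 4/6: it verifies that the
text recovered token by token through Slot.append matches a one-shot decode
of the same sequence. Below is a simplified sketch of the kind of
incremental detokenization this exercises; it is an assumed approach for
illustration, not Slot's actual implementation:

    # Re-decode the growing sequence at each step and emit only the stable new
    # suffix, so multi-byte characters (emojis, CJK text) are yielded only once
    # they are complete. Assumed approach, not Slot's actual code.
    from transformers import AutoTokenizer

    def stream_decode(tokenizer, token_ids):
        emitted = ""
        seen = []
        for token_id in token_ids:
            seen.append(token_id)
            text = tokenizer.decode(seen, skip_special_tokens=True)
            # An incomplete UTF-8 sequence decodes to the replacement character.
            if text.endswith("\ufffd"):
                continue
            yield text[len(emitted):]
            emitted = text

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    token_ids = tokenizer("Some emojis: 😃😍", add_special_tokens=False)["input_ids"]
    assert "".join(stream_decode(tokenizer, token_ids)) == "Some emojis: 😃😍"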