From b62f08089454a1c7e74a504180ebb6d39df2f226 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Fri, 21 Feb 2025 12:37:47 -0800
Subject: [PATCH 1/5] Added main logic

---
 .../openai_frontend/engine/triton_engine.py | 10 ++++++-
 python/openai/openai_frontend/main.py       | 26 +++++++++++++++++--
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
index de6c1ed2cf..242f3d85e0 100644
--- a/python/openai/openai_frontend/engine/triton_engine.py
+++ b/python/openai/openai_frontend/engine/triton_engine.py
@@ -89,7 +89,15 @@ def __init__(
     ):
         # Assume an already configured and started server
         self.server = server
-        self.tokenizer = self._get_tokenizer(tokenizer)
+        self.tokenizer_map = {}
+        if tokenizer_map:
+            for model_name, tokenizer_path in tokenizer_map.items():
+                try:
+                    self.tokenizer_map[model_name] = get_tokenizer(tokenizer_path)
+                except Exception as e:
+                    print(
+                        f"Warning: Failed to load tokenizer for {model_name} from {tokenizer_path}: {e}"
+                    )
         # TODO: Reconsider name of "backend" vs. something like "request_format"
         self.backend = backend
         self.lora_separator = lora_separator
diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index d8899875c4..9e1df8ede1 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -95,12 +95,20 @@ def parse_args():
         required=True,
         help="Path to the Triton model repository holding the models to be served",
     )
+    # TODO: determine what to do with single tokenizer flag
     triton_group.add_argument(
         "--tokenizer",
         type=str,
         default=None,
         help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
     )
+    triton_group.add_argument(
+        "--tokenizers",
+        type=str,
+        nargs="+",  # Accept multiple arguments
+        default=[],
+        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+    )
     triton_group.add_argument(
         "--backend",
         type=str,
@@ -166,8 +174,22 @@ def parse_args():
 def main():
     args = parse_args()

-    # Initialize a Triton Inference Server pointing at LLM models
-    server: tritonserver.Server = tritonserver.Server(
+    # Parse tokenizer mappings
+    tokenizer_map = {}
+    for tokenizer_spec in args.tokenizers:
+        try:
+            model_name, tokenizer_path = tokenizer_spec.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(
+                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
+            )
+
+    if args.tokenizer:
+        tokenizer_map["default"] = args.tokenizer
+
+    # Initialize Triton server
+    server = tritonserver.Server(
         model_repository=args.model_repository,
         log_verbose=args.tritonserver_log_verbose_level,
         log_info=True,

From 10d9e47adb1800f72ee8c93d8e6d378d7c66c4c5 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 10:38:13 -0700
Subject: [PATCH 2/5] Fixing issues after rebase

---
 python/openai/openai_frontend/engine/triton_engine.py | 5 +++--
 python/openai/openai_frontend/main.py                 | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
index 242f3d85e0..4624fa91aa 100644
--- a/python/openai/openai_frontend/engine/triton_engine.py
+++ b/python/openai/openai_frontend/engine/triton_engine.py
@@ -83,9 +83,9 @@ class TritonLLMEngine(LLMEngine):
     def __init__(
         self,
         server: tritonserver.Server,
-        tokenizer: str,
         backend: Optional[str] = None,
         lora_separator: Optional[str] = None,
+        tokenizer_map: Dict[str, str] = None,
     ):
         # Assume an already configured and started server
         self.server = server
@@ -302,12 +302,13 @@ def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
                 self.server.options.model_repository, name, model.version
             )

+            default_tokenizer = self.tokenizer_map.get("default", None)
             metadata = TritonModelMetadata(
                 name=name,
                 backend=backend,
                 model=model,
-                tokenizer=self.tokenizer,
                 lora_names=lora_names,
+                tokenizer=self.tokenizer_map.get(name, default_tokenizer),
                 create_time=self.create_time,
                 request_converter=self._determine_request_converter(backend),
             )
diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index 9e1df8ede1..02d6d843e9 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -200,9 +200,9 @@ def main():
     # Wrap Triton Inference Server in an interface-conforming "LLMEngine"
     engine: TritonLLMEngine = TritonLLMEngine(
         server=server,
-        tokenizer=args.tokenizer,
         backend=args.backend,
         lora_separator=args.lora_separator,
+        tokenizer_map=tokenizer_map,
     )

     # Attach TritonLLMEngine as the backbone for inference and model management

From 3fa2832811bbe4b958a9032f1c2caf74b41b453a Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 11:25:30 -0700
Subject: [PATCH 3/5] Refactor

---
 python/openai/openai_frontend/main.py | 50 ++++++++++++++++-----------
 python/openai/tests/utils.py          |  6 ++--
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index 02d6d843e9..d959f80478 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -82,6 +82,27 @@ def start_kserve_frontends(server, args):
     return http_service, grpc_service


+def parse_tokenizer_arg(tokenizer_args):
+    if not tokenizer_args:
+        return {}
+
+    tokenizer_map = {}
+    # Single tokenizer case
+    if len(tokenizer_args) == 1 and ":" not in tokenizer_args[0]:
+        tokenizer_map["default"] = tokenizer_args[0]
+        return tokenizer_map
+
+    # Multiple tokenizers case
+    for arg in tokenizer_args:
+        try:
+            model_name, tokenizer_path = arg.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(f"Warning: Skipping invalid tokenizer specification: {arg}")
+
+    return tokenizer_map
+
+
 def parse_args():
     parser = argparse.ArgumentParser(
         description="Triton Inference Server with OpenAI-Compatible RESTful API server."
     )
@@ -95,19 +116,17 @@ def parse_args():
         required=True,
         help="Path to the Triton model repository holding the models to be served",
     )
-    # TODO: determine what to do with single tokenizer flag
     triton_group.add_argument(
         "--tokenizer",
         type=str,
+        nargs="+",  # Accept either single value or multiple
         default=None,
-        help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
-    )
-    triton_group.add_argument(
-        "--tokenizers",
-        type=str,
-        nargs="+",  # Accept multiple arguments
-        default=[],
-        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+        help=(
+            "HuggingFace ID or local folder path of Tokenizer(s). "
+            "For single tokenizer: provide path directly. "
+            "For multiple tokenizers: use format 'model_name:tokenizer_path' for each entry. "
+            "Example: --tokenizer default:/path/to/tokenizer model1:path1 model2:path2"
+        ),
     )
     triton_group.add_argument(
         "--backend",
@@ -175,18 +194,7 @@ def main():
     args = parse_args()

     # Parse tokenizer mappings
-    tokenizer_map = {}
-    for tokenizer_spec in args.tokenizers:
-        try:
-            model_name, tokenizer_path = tokenizer_spec.split(":")
-            tokenizer_map[model_name] = tokenizer_path
-        except ValueError:
-            print(
-                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
-            )
-
-    if args.tokenizer:
-        tokenizer_map["default"] = args.tokenizer
+    tokenizer_map = parse_tokenizer_arg(args.tokenizer)

     # Initialize Triton server
     server = tritonserver.Server(
diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py
index 3ed46f70b4..bcbd0a96fc 100644
--- a/python/openai/tests/utils.py
+++ b/python/openai/tests/utils.py
@@ -29,7 +29,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 import openai
 import requests
@@ -52,7 +52,9 @@ def setup_server(model_repository: str):
     return server


-def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
+def setup_fastapi_app(
+    tokenizer: Union[str, Dict[str, str]], server: tritonserver.Server, backend: str
+):
     engine: TritonLLMEngine = TritonLLMEngine(
         server=server, tokenizer=tokenizer, backend=backend
     )

From aaf4f6dd587469de3ba50c655341311e82ab5c70 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 12:51:36 -0700
Subject: [PATCH 4/5] Tests

---
 python/openai/tests/test_chat_completions.py | 63 +++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py
index 401601c526..66434d6986 100644
--- a/python/openai/tests/test_chat_completions.py
+++ b/python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -619,3 +619,64 @@ def test_chat_completions_invalid_chat_tokenizer(
     assert any(
         error in response.json()["detail"].lower() for error in expected_errors
     )
+
+
+class TestMultipleTokenizers:
+    @pytest.fixture(scope="class")
+    def model_repository(self):
+        # Custom model repository for these specific tests
+        return str(Path(__file__).parent / "vllm_tiny_models")
+
+    # Re-use a single Triton server for different frontend configurations
+    @pytest.fixture(scope="class")
+    def server(self, model_repository: str):
+        server = setup_server(model_repository)
+        yield server
+        server.stop()
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        return ["tiny_llama", "phi-4"]
+
+    def test_chat_completions_multiple_tokenizers(
+        self,
+        server: tritonserver.Server,
+        models: List[str],
+        messages: List[dict],
+    ):
+        app = setup_fastapi_app(
+            tokenizer={
+                "tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+                "phi-4": "microsoft/Phi-4-mini-instruct",
+            },
+            server=server,
+            backend="vllm",
+        )
+        for model in models:
+            with TestClient(app) as client:
+                response = client.post(
+                    "/v1/chat/completions",
+                    json={"model": model, "messages": messages},
+                )
+
+            assert response.status_code == 200
+            message = response.json()["choices"][0]["message"]
+            assert message["content"].strip()
+            assert message["role"] == "assistant"
+
+    def test_chat_completions_unknown_tokenizers(
+        self,
+        server: tritonserver.Server,
+        models: List[str],
+        messages: List[dict],
+    ):
+        app = setup_fastapi_app(tokenizer="", server=server, backend="vllm")
+        for model in models:
+            with TestClient(app) as client:
+                response = client.post(
+                    "/v1/chat/completions",
+                    json={"model": model, "messages": messages},
+                )
+
+            assert response.status_code == 400
+            assert response.json()["detail"] == "Unknown tokenizer"

From 5ae2d84a9c1623a7f85ca3482f83225785debb73 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 13:04:43 -0700
Subject: [PATCH 5/5] Models for testing

---
 .../tests/vllm_tiny_models/phi-4/1/model.json |  1 +
 .../tests/vllm_tiny_models/phi-4/config.pbtxt | 28 +++++++++++++++++++
 .../vllm_tiny_models/tiny_llama/1/model.json  |  1 +
 .../vllm_tiny_models/tiny_llama/config.pbtxt  | 28 +++++++++++++++++++
 4 files changed, 58 insertions(+)
 create mode 100644 python/openai/tests/vllm_tiny_models/phi-4/1/model.json
 create mode 100644 python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
 create mode 100644 python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
 create mode 100644 python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt

diff --git a/python/openai/tests/vllm_tiny_models/phi-4/1/model.json b/python/openai/tests/vllm_tiny_models/phi-4/1/model.json
new file mode 100644
index 0000000000..b6b61e1fc1
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/phi-4/1/model.json
@@ -0,0 +1 @@
+{"model": "microsoft/Phi-4-mini-instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.8}
diff --git a/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt b/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
new file mode 100644
index 0000000000..fc7bdc0534
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
@@ -0,0 +1,28 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
\ No newline at end of file
diff --git a/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json b/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
new file mode 100644
index 0000000000..0444f38b14
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
@@ -0,0 +1 @@
+{"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "disable_log_requests": true, "gpu_memory_utilization": 0.2}
diff --git a/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt b/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt
new file mode 100644
index 0000000000..fc7bdc0534
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt
@@ -0,0 +1,28 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
\ No newline at end of file
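
Editor's note (not part of the patch series): a minimal sketch of how the
reworked --tokenizer flag maps tokenizers onto models, exercising
parse_tokenizer_arg exactly as added in PATCH 3/5. The import path is an
assumption for illustration; adjust it to wherever openai_frontend/main.py is
importable from.

    # Illustration only: behavior of parse_tokenizer_arg from PATCH 3/5.
    # Hypothetical import path; not part of the patches.
    from openai_frontend.main import parse_tokenizer_arg

    # A single bare path becomes the "default" tokenizer for all served models.
    assert parse_tokenizer_arg(["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]) == {
        "default": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    }

    # "model_name:tokenizer_path" entries map tokenizers per model; models
    # without their own entry fall back to the "default" entry when model
    # metadata is built (see _get_model_metadata in PATCH 2/5).
    assert parse_tokenizer_arg(
        [
            "tiny_llama:TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "phi-4:microsoft/Phi-4-mini-instruct",
        ]
    ) == {
        "tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "phi-4": "microsoft/Phi-4-mini-instruct",
    }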