From b62f08089454a1c7e74a504180ebb6d39df2f226 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Fri, 21 Feb 2025 12:37:47 -0800
Subject: [PATCH 1/5] Added main logic

---
 .../openai_frontend/engine/triton_engine.py | 10 ++++++-
 python/openai/openai_frontend/main.py       | 26 +++++++++++++++++--
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
index de6c1ed2cf..242f3d85e0 100644
--- a/python/openai/openai_frontend/engine/triton_engine.py
+++ b/python/openai/openai_frontend/engine/triton_engine.py
@@ -89,7 +89,15 @@ def __init__(
     ):
         # Assume an already configured and started server
         self.server = server
-        self.tokenizer = self._get_tokenizer(tokenizer)
+        self.tokenizer_map = {}
+        if tokenizer_map:
+            for model_name, tokenizer_path in tokenizer_map.items():
+                try:
+                    self.tokenizer_map[model_name] = get_tokenizer(tokenizer_path)
+                except Exception as e:
+                    print(
+                        f"Warning: Failed to load tokenizer for {model_name} from {tokenizer_path}: {e}"
+                    )
         # TODO: Reconsider name of "backend" vs. something like "request_format"
         self.backend = backend
         self.lora_separator = lora_separator
diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index d8899875c4..9e1df8ede1 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -95,12 +95,20 @@ def parse_args():
         required=True,
         help="Path to the Triton model repository holding the models to be served",
     )
+    # TODO: determine what to do with single tokenizer flag
     triton_group.add_argument(
         "--tokenizer",
         type=str,
         default=None,
         help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
     )
+    triton_group.add_argument(
+        "--tokenizers",
+        type=str,
+        nargs="+",  # Accept multiple arguments
+        default=[],
+        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+    )
     triton_group.add_argument(
         "--backend",
         type=str,
@@ -166,8 +174,22 @@ def parse_args():
 def main():
     args = parse_args()

-    # Initialize a Triton Inference Server pointing at LLM models
-    server: tritonserver.Server = tritonserver.Server(
+    # Parse tokenizer mappings
+    tokenizer_map = {}
+    for tokenizer_spec in args.tokenizers:
+        try:
+            model_name, tokenizer_path = tokenizer_spec.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(
+                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
+            )
+
+    if args.tokenizer:
+        tokenizer_map["default"] = args.tokenizer
+
+    # Initialize Triton server
+    server = tritonserver.Server(
         model_repository=args.model_repository,
         log_verbose=args.tritonserver_log_verbose_level,
         log_info=True,

From 10d9e47adb1800f72ee8c93d8e6d378d7c66c4c5 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 10:38:13 -0700
Subject: [PATCH 2/5] Fixing issues after rebase

---
 python/openai/openai_frontend/engine/triton_engine.py | 5 +++--
 python/openai/openai_frontend/main.py                 | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/python/openai/openai_frontend/engine/triton_engine.py b/python/openai/openai_frontend/engine/triton_engine.py
index 242f3d85e0..4624fa91aa 100644
--- a/python/openai/openai_frontend/engine/triton_engine.py
+++ b/python/openai/openai_frontend/engine/triton_engine.py
@@ -83,9 +83,9 @@ class TritonLLMEngine(LLMEngine):
     def __init__(
         self,
         server: tritonserver.Server,
-        tokenizer: str,
         backend: Optional[str] = None,
         lora_separator: Optional[str] = None,
+        tokenizer_map: Dict[str, str] = None,
     ):
         # Assume an already configured and started server
         self.server = server
@@ -302,12 +302,13 @@ def _get_model_metadata(self) -> Dict[str, TritonModelMetadata]:
                 self.server.options.model_repository, name, model.version
             )

+            default_tokenizer = self.tokenizer_map.get("default", None)
             metadata = TritonModelMetadata(
                 name=name,
                 backend=backend,
                 model=model,
-                tokenizer=self.tokenizer,
                 lora_names=lora_names,
+                tokenizer=self.tokenizer_map.get(name, default_tokenizer),
                 create_time=self.create_time,
                 request_converter=self._determine_request_converter(backend),
             )
diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index 9e1df8ede1..02d6d843e9 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -200,9 +200,9 @@ def main():
     # Wrap Triton Inference Server in an interface-conforming "LLMEngine"
     engine: TritonLLMEngine = TritonLLMEngine(
         server=server,
-        tokenizer=args.tokenizer,
         backend=args.backend,
         lora_separator=args.lora_separator,
+        tokenizer_map=tokenizer_map,
     )

     # Attach TritonLLMEngine as the backbone for inference and model management

From 3fa2832811bbe4b958a9032f1c2caf74b41b453a Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 11:25:30 -0700
Subject: [PATCH 3/5] Refactor

---
 python/openai/openai_frontend/main.py | 50 ++++++++++++++++-----------
 python/openai/tests/utils.py          |  6 ++--
 2 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/python/openai/openai_frontend/main.py b/python/openai/openai_frontend/main.py
index 02d6d843e9..d959f80478 100755
--- a/python/openai/openai_frontend/main.py
+++ b/python/openai/openai_frontend/main.py
@@ -82,6 +82,27 @@ def start_kserve_frontends(server, args):
     return http_service, grpc_service


+def parse_tokenizer_arg(tokenizer_args):
+    if not tokenizer_args:
+        return {}
+
+    tokenizer_map = {}
+    # Single tokenizer case
+    if len(tokenizer_args) == 1 and ":" not in tokenizer_args[0]:
+        tokenizer_map["default"] = tokenizer_args[0]
+        return tokenizer_map
+
+    # Multiple tokenizers case
+    for arg in tokenizer_args:
+        try:
+            model_name, tokenizer_path = arg.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(f"Warning: Skipping invalid tokenizer specification: {arg}")
+
+    return tokenizer_map
+
+
 def parse_args():
     parser = argparse.ArgumentParser(
         description="Triton Inference Server with OpenAI-Compatible RESTful API server."
     )
@@ -95,19 +116,17 @@ def parse_args():
         required=True,
         help="Path to the Triton model repository holding the models to be served",
     )
-    # TODO: determine what to do with single tokenizer flag
     triton_group.add_argument(
         "--tokenizer",
         type=str,
+        nargs="+",  # Accept either single value or multiple
         default=None,
-        help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
-    )
-    triton_group.add_argument(
-        "--tokenizers",
-        type=str,
-        nargs="+",  # Accept multiple arguments
-        default=[],
-        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+        help=(
+            "HuggingFace ID or local folder path of Tokenizer(s). "
+            "For single tokenizer: provide path directly. "
+            "For multiple tokenizers: use format 'model_name:tokenizer_path' for each entry. "
+            "Example: --tokenizer default:/path/to/tokenizer model1:path1 model2:path2"
+        ),
     )
     triton_group.add_argument(
         "--backend",
@@ -175,18 +194,7 @@ def main():
     args = parse_args()

     # Parse tokenizer mappings
-    tokenizer_map = {}
-    for tokenizer_spec in args.tokenizers:
-        try:
-            model_name, tokenizer_path = tokenizer_spec.split(":")
-            tokenizer_map[model_name] = tokenizer_path
-        except ValueError:
-            print(
-                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
-            )
-
-    if args.tokenizer:
-        tokenizer_map["default"] = args.tokenizer
+    tokenizer_map = parse_tokenizer_arg(args.tokenizer)

     # Initialize Triton server
     server = tritonserver.Server(
diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py
index 3ed46f70b4..bcbd0a96fc 100644
--- a/python/openai/tests/utils.py
+++ b/python/openai/tests/utils.py
@@ -29,7 +29,7 @@
 import sys
 import time
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 import openai
 import requests
@@ -52,7 +52,9 @@ def setup_server(model_repository: str):
     return server


-def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
+def setup_fastapi_app(
+    tokenizer: Union[str, Dict[str, str]], server: tritonserver.Server, backend: str
+):
     engine: TritonLLMEngine = TritonLLMEngine(
         server=server, tokenizer=tokenizer, backend=backend
     )

From aaf4f6dd587469de3ba50c655341311e82ab5c70 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 12:51:36 -0700
Subject: [PATCH 4/5] Tests

---
 python/openai/tests/test_chat_completions.py | 63 +++++++++++++++++++-
 1 file changed, 62 insertions(+), 1 deletion(-)

diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py
index 401601c526..66434d6986 100644
--- a/python/openai/tests/test_chat_completions.py
+++ b/python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -619,3 +619,64 @@ def test_chat_completions_invalid_chat_tokenizer(
     assert any(
         error in response.json()["detail"].lower() for error in expected_errors
     )
+
+
+class TestMultipleTokenizers:
+    @pytest.fixture(scope="class")
+    def model_repository(self):
+        # Custom model repository for these specific tests
+        return str(Path(__file__).parent / "vllm_tiny_models")
+
+    # Re-use a single Triton server for different frontend configurations
+    @pytest.fixture(scope="class")
+    def server(self, model_repository: str):
+        server = setup_server(model_repository)
+        yield server
+        server.stop()
+
+    @pytest.fixture(scope="class")
+    def models(self):
+        return ["tiny_llama", "phi-4"]
+
+    def test_chat_completions_multiple_tokenizers(
+        self,
+        server: tritonserver.Server,
+        models: List[str],
+        messages: List[dict],
+    ):
+        app = setup_fastapi_app(
+            tokenizer={
+                "tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+                "phi-4": "microsoft/Phi-4-mini-instruct",
+            },
+            server=server,
+            backend="vllm",
+        )
+        for model in models:
+            with TestClient(app) as client:
+                response = client.post(
+                    "/v1/chat/completions",
+                    json={"model": model, "messages": messages},
+                )
+
+            assert response.status_code == 200
+            message = response.json()["choices"][0]["message"]
+            assert message["content"].strip()
+            assert message["role"] == "assistant"
+
+    def test_chat_completions_unknown_tokenizers(
+        self,
+        server: tritonserver.Server,
+        models: List[str],
+        messages: List[dict],
+    ):
+        app = setup_fastapi_app(tokenizer="", server=server, backend="vllm")
+        for model in models:
+            with TestClient(app) as client:
+                response = client.post(
+                    "/v1/chat/completions",
+                    json={"model": model, "messages": messages},
+                )
+
+            assert response.status_code == 400
+            assert response.json()["detail"] == "Unknown tokenizer"

From 5ae2d84a9c1623a7f85ca3482f83225785debb73 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Tue, 15 Apr 2025 13:04:43 -0700
Subject: [PATCH 5/5] Models for testing

---
 .../tests/vllm_tiny_models/phi-4/1/model.json |  1 +
 .../tests/vllm_tiny_models/phi-4/config.pbtxt | 28 +++++++++++++++++++
 .../vllm_tiny_models/tiny_llama/1/model.json  |  1 +
 .../vllm_tiny_models/tiny_llama/config.pbtxt  | 28 +++++++++++++++++++
 4 files changed, 58 insertions(+)
 create mode 100644 python/openai/tests/vllm_tiny_models/phi-4/1/model.json
 create mode 100644 python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
 create mode 100644 python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
 create mode 100644 python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt

diff --git a/python/openai/tests/vllm_tiny_models/phi-4/1/model.json b/python/openai/tests/vllm_tiny_models/phi-4/1/model.json
new file mode 100644
index 0000000000..b6b61e1fc1
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/phi-4/1/model.json
@@ -0,0 +1 @@
+{"model": "microsoft/Phi-4-mini-instruct", "disable_log_requests": true, "gpu_memory_utilization": 0.8}
diff --git a/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt b/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
new file mode 100644
index 0000000000..fc7bdc0534
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/phi-4/config.pbtxt
@@ -0,0 +1,28 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
\ No newline at end of file
diff --git a/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json b/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
new file mode 100644
index 0000000000..0444f38b14
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/tiny_llama/1/model.json
@@ -0,0 +1 @@
+{"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "disable_log_requests": true, "gpu_memory_utilization": 0.2}
diff --git a/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt b/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt
new file mode 100644
index 0000000000..fc7bdc0534
--- /dev/null
+++ b/python/openai/tests/vllm_tiny_models/tiny_llama/config.pbtxt
@@ -0,0 +1,28 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+backend: "vllm"
+instance_group [{kind: KIND_MODEL}]
\ No newline at end of file
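
Editor's note (not part of the patch series): a minimal sketch of how the
reworked --tokenizer flag maps tokenizers onto models, exercising
parse_tokenizer_arg exactly as added in PATCH 3/5. The import path is an
assumption for illustration; adjust it to wherever openai_frontend/main.py is
importable from.

    # Illustration only: behavior of parse_tokenizer_arg from PATCH 3/5.
    # Hypothetical import path; not part of the patches.
    from openai_frontend.main import parse_tokenizer_arg

    # A single bare path becomes the "default" tokenizer for all served models.
    assert parse_tokenizer_arg(["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]) == {
        "default": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    }

    # "model_name:tokenizer_path" entries map tokenizers per model; models
    # without their own entry fall back to the "default" entry when model
    # metadata is built (see _get_model_metadata in PATCH 2/5).
    assert parse_tokenizer_arg(
        [
            "tiny_llama:TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "phi-4:microsoft/Phi-4-mini-instruct",
        ]
    ) == {
        "tiny_llama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "phi-4": "microsoft/Phi-4-mini-instruct",
    }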