 #!/usr/bin/env python3
 
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -95,12 +95,20 @@ def parse_args():
         required=True,
         help="Path to the Triton model repository holding the models to be served",
     )
+    # TODO: determine what to do with single tokenizer flag
     triton_group.add_argument(
         "--tokenizer",
         type=str,
         default=None,
         help="HuggingFace ID or local folder path of the Tokenizer to use for chat templates",
     )
+    triton_group.add_argument(
+        "--tokenizers",
+        type=str,
+        nargs="+",  # Accept multiple arguments
+        default=[],
+        help="List of HuggingFace IDs or local folder paths of Tokenizers to use. Format: model_name:tokenizer_path",
+    )
     triton_group.add_argument(
         "--backend",
         type=str,
@@ -160,8 +168,22 @@ def parse_args():
 def main():
     args = parse_args()
 
-    # Initialize a Triton Inference Server pointing at LLM models
-    server: tritonserver.Server = tritonserver.Server(
+    # Parse tokenizer mappings
+    tokenizer_map = {}
+    for tokenizer_spec in args.tokenizers:
+        try:
+            model_name, tokenizer_path = tokenizer_spec.split(":")
+            tokenizer_map[model_name] = tokenizer_path
+        except ValueError:
+            print(
+                f"Warning: Skipping invalid tokenizer specification: {tokenizer_spec}. Format should be 'model_name:tokenizer_path'"
+            )
+
+    if args.tokenizer:
+        tokenizer_map["default"] = args.tokenizer
+
+    # Initialize Triton server
+    server = tritonserver.Server(
         model_repository=args.model_repository,
         log_verbose=args.tritonserver_log_verbose_level,
         log_info=True,
@@ -170,8 +192,8 @@ def main():
     ).start(wait_until_ready=True)
 
     # Wrap Triton Inference Server in an interface-conforming "LLMEngine"
-    engine: TritonLLMEngine = TritonLLMEngine(
-        server=server, tokenizer=args.tokenizer, backend=args.backend
+    engine = TritonLLMEngine(
+        server=server, tokenizer_map=tokenizer_map, backend=args.backend
     )
 
     # Attach TritonLLMEngine as the backbone for inference and model management
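
For reference, a minimal standalone sketch of how the new flag composes the tokenizer map, mirroring the parsing loop added in main() above; the model names and tokenizer paths here are illustrative assumptions, not taken from the commit:

# Hypothetical invocation this sketch corresponds to:
#   --tokenizer gpt2 --tokenizers llama3:meta-llama/Meta-Llama-3-8B-Instruct missing-colon
specs = ["llama3:meta-llama/Meta-Llama-3-8B-Instruct", "missing-colon"]
tokenizer_map = {}
for spec in specs:
    try:
        # Exactly two parts expected: "model_name:tokenizer_path"
        model_name, tokenizer_path = spec.split(":")
        tokenizer_map[model_name] = tokenizer_path
    except ValueError:  # zero colons, or more than one colon, in the spec
        print(f"Warning: Skipping invalid tokenizer specification: {spec}")

# A single --tokenizer value, if provided, is stored under the "default" key:
tokenizer_map["default"] = "gpt2"
print(tokenizer_map)
# -> {'llama3': 'meta-llama/Meta-Llama-3-8B-Instruct', 'default': 'gpt2'}

Note that spec.split(":") with no maxsplit also rejects tokenizer paths that themselves contain a colon (e.g. Windows-style drive paths), since unpacking then sees more than two parts and raises ValueError, which routes the spec into the warning branch.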