@@ -150,8 +150,14 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
150
150
# (temp reason): we don't support re-routing prefill requests
151
151
# (long-term reason): prefill engine should pull from a global queue so there is
152
152
# only a few in-flight requests that can be quickly finished
153
- generate_endpoint .serve_endpoint (handler .generate , graceful_shutdown = True ),
154
- clear_endpoint .serve_endpoint (handler .clear_kv_blocks ),
153
+ generate_endpoint .serve_endpoint (
154
+ handler .generate ,
155
+ graceful_shutdown = True ,
156
+ metrics_labels = [("model" , config .model )],
157
+ ),
158
+ clear_endpoint .serve_endpoint (
159
+ handler .clear_kv_blocks , metrics_labels = [("model" , config .model )]
160
+ ),
155
161
)
156
162
except Exception as e :
157
163
logger .error (f"Failed to serve endpoints: { e } " )
@@ -178,7 +184,11 @@ async def init(runtime: DistributedRuntime, config: Config):
178
184
.client ()
179
185
)
180
186
181
- factory = StatLoggerFactory (component , config .engine_args .data_parallel_rank or 0 )
187
+ factory = StatLoggerFactory (
188
+ component ,
189
+ config .engine_args .data_parallel_rank or 0 ,
190
+ metrics_labels = [("model" , config .model )],
191
+ )
182
192
engine_client , vllm_config , default_sampling_params = setup_vllm_engine (
183
193
config , factory
184
194
)
@@ -239,8 +249,14 @@ async def init(runtime: DistributedRuntime, config: Config):
239
249
await asyncio .gather (
240
250
# for decode, we want to transfer the in-flight requests to other decode engines,
241
251
# because waiting for them to finish can take a long time for long OSLs
242
- generate_endpoint .serve_endpoint (handler .generate , graceful_shutdown = False ),
243
- clear_endpoint .serve_endpoint (handler .clear_kv_blocks ),
252
+ generate_endpoint .serve_endpoint (
253
+ handler .generate ,
254
+ graceful_shutdown = False ,
255
+ metrics_labels = [("model" , config .model )],
256
+ ),
257
+ clear_endpoint .serve_endpoint (
258
+ handler .clear_kv_blocks , metrics_labels = [("model" , config .model )]
259
+ ),
244
260
)
245
261
except Exception as e :
246
262
logger .error (f"Failed to serve endpoints: { e } " )
0 commit comments