Commit 0051af7

Merge branch 'main' into support-abort-request
2 parents 6ad84d9 + a6aa375

14 files changed: +168 additions, -82 deletions


lmdeploy/pytorch/engine/guided_process.py

Lines changed: 11 additions & 6 deletions
@@ -10,7 +10,7 @@
 logger = logging.getLogger('lmdeploy')


-class GuidedDecodingMangager:
+class GuidedDecodingManager:
     processors = {}

     def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]):
@@ -26,7 +26,8 @@ def get_processors(self, session_ctx: List[Dict[str, Any]],
         processors = {}
         for i, _format in enumerate(response_formats):
             if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
-                if _format['type'] == 'json_schema':
+                schema_type = _format['type']
+                if schema_type == 'json_schema':
                     schema = _format['json_schema']
                     if isinstance(schema, Dict):
                         for key in ['json_schema', 'schema']:
@@ -37,15 +38,17 @@ def get_processors(self, session_ctx: List[Dict[str, Any]],
                         raise ValueError(f'Cannot parse schema {schema}. The schema must be '
                                          'either a dictionary or a string that contains the'
                                          ' JSON Schema specification')
-                elif _format['type'] == 'regex_schema':
+                elif schema_type == 'regex_schema':
                     schema = _format.get('regex_schema', '')
+                elif schema_type == 'json_object':
+                    schema = '{"type" : "object", "additionalProperties": true}'
                 else:
-                    raise ValueError(f"unsupported format type: {_format['type']}")
+                    raise ValueError(f'unsupported format type: {schema_type}')

                 session_id = session_ctx[i]['session_id']
                 seq_id = session_ctx[i]['seq_id']

-                processors[i] = self.get_processor(session_id, seq_id, schema, _format['type'])
+                processors[i] = self.get_processor(session_id, seq_id, schema, schema_type)

         return processors

@@ -63,7 +66,9 @@ def get_processor(self, session_id: int, seq_id: int, schema: str, type: str) ->
             assert isinstance(schema, dict)
             compiled = self.compiler.compile_json_schema(schema)
         elif type == 'regex_schema':
-            compiled = self.compiler.compile_regex_grammar(schema)
+            compiled = self.compiler.compile_regex(schema)
+        elif type == 'json_object':
+            compiled = self.compiler.compile_json_schema(schema)
         else:
             assert False, f'Do not support schema type {type}'
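
As a point of reference, the three response_format shapes that GuidedDecodingManager.get_processors now accepts could look like the sketch below (the schema contents and regex are illustrative, not taken from this commit):

# Illustrative response_format values; only the 'type' handling mirrors the code above.
response_formats = [
    {'type': 'json_schema', 'json_schema': {'schema': {'type': 'object', 'properties': {'name': {'type': 'string'}}}}},
    {'type': 'regex_schema', 'regex_schema': r'\d{4}-\d{2}-\d{2}'},
    {'type': 'json_object'},  # no schema required; expands to a permissive object schema
]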

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 2 additions & 7 deletions
@@ -8,7 +8,7 @@
 from lmdeploy.messages import LogitsProcessor

 from ..messages import SchedulerSequence
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager


 def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor):
@@ -143,12 +143,10 @@ class FusedLogitsProcessor:
     def __init__(
         self,
         sampling_inputs: SamplingInputs,
-        sampling_vocab_size: Optional[int] = None,
         logprobs_mode: Optional[str] = None,
-        guided_decoding_manager: Optional[GuidedDecodingMangager] = None,
+        guided_decoding_manager: Optional[GuidedDecodingManager] = None,
     ):
         self.sampling_inputs: SamplingInputs = sampling_inputs
-        self.sampling_vocab_size = sampling_vocab_size
         self.logprobs_mode = logprobs_mode
         self.guided_decoding_manager = guided_decoding_manager
         if sampling_inputs.session_to_cleanup:
@@ -266,9 +264,6 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
             offsets = sampling_inputs.random_offsets
             return _multinomial_sampling(softmax_scores, seeds, offsets, indices)

-        if self.sampling_vocab_size is not None and logits.size(1) > self.sampling_vocab_size:
-            logits = logits[..., :self.sampling_vocab_size]
-
        if sampling_inputs.max_top_k == 1:
            result = logits.argmax(-1)
        else:

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 5 additions & 9 deletions
@@ -30,7 +30,7 @@
 from ..utils import get_gpu_memory
 from ..weight_loader.model_weight_loader import load_model_weights
 from .cache_engine import CacheEngine
-from .guided_process import GuidedDecodingMangager
+from .guided_process import GuidedDecodingManager
 from .logits_process import FusedLogitsProcessor, SamplingInputs

 logger = get_logger('lmdeploy')
@@ -248,7 +248,8 @@ def model_forward(
         output = model(**input_dict)

     # InternVL-3.5-Flash will change the seqlen, model_metas during forward
-    model_metas = context.model_metas
+    if context.model_metas is not None and context.model_metas[0] is not None:
+        model_metas = context.model_metas
     seq_length = context.q_seqlens[:len(inputs.seq_length)]

     return dict(hidden_states=output, model_metas=model_metas, seq_length=seq_length)
@@ -315,10 +316,6 @@ def __init__(self,
         self.cache_config = cache_config
         # use raw tokenizer
         self.tokenizer = Tokenizer(model_path).model.model
-        try:
-            self.sampling_vocab_size = len(self.tokenizer)
-        except BaseException:
-            self.sampling_vocab_size = None

         self._pre_in_que = None
         self._in_que = None
@@ -354,9 +351,9 @@ def __init__(self,
         self.cache_engine = None
         self.profiler: AgentProfiler = None
         try:
-            self.guided_decoding_manager = GuidedDecodingMangager(self.tokenizer, self.sampling_vocab_size)
+            self.guided_decoding_manager = GuidedDecodingManager(self.tokenizer, model_config.vocab_size)
         except ValueError as e:
-            logger.warning(f'Failed to create GuidedManager for tokenizer {self.tokenizer}: {e}')
+            logger.warning(f'Failed to create GuidedManager for tokenizer {type(self.tokenizer)}: {e}')
             self.guided_decoding_manager = None

         # microbatch
@@ -552,7 +549,6 @@ async def async_sampling_logits(self, logits: torch.Tensor, sampling_inputs: Sam
         with record_function('sampling_logits'):
             logits_processor = FusedLogitsProcessor(
                 sampling_inputs,
-                sampling_vocab_size=self.sampling_vocab_size,
                 logprobs_mode=self.misc_config.logprobs_mode,
                 guided_decoding_manager=self.guided_decoding_manager,
             )

lmdeploy/serve/openai/api_server.py

Lines changed: 22 additions & 3 deletions
@@ -591,7 +591,7 @@ async def completion_stream_generator() -> AsyncGenerator[str, None]:
             tool_calls = None
             reasoning_content = None
             if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
-                try:  # TODO add json_schema guidance to turbomind
+                try:
                     tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
                     text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
                     if isinstance(tool_calls, List) and len(tool_calls):
@@ -907,6 +907,25 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
         return error_check_ret
     if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
         return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
+    if (request.prompt is not None) ^ (request.input_ids is None):
+        return create_error_response(HTTPStatus.BAD_REQUEST, 'You must specify exactly one of prompt or input_ids')
+
+    prompt = request.prompt
+    input_ids = request.input_ids
+    image_data = request.image_data
+    if image_data is not None:
+        # convert to openai format
+        image_input = []
+        if not isinstance(image_data, List):
+            image_data = [image_data]
+        for img in image_data:
+            if isinstance(img, str):
+                image_input.append(dict(type='image_url', image_url=dict(url=img)))
+            else:
+                image_input.append(dict(type='image_url', image_url=img))
+        text_input = dict(type='text', text=prompt if prompt else input_ids)
+        prompt = [dict(role='user', content=[text_input] + image_input)]
+        input_ids = None

     gen_config = GenerationConfig(
         max_new_tokens=request.max_tokens,
@@ -926,9 +945,9 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
     )

     result_generator = VariableInterface.async_engine.generate(
-        messages=request.prompt,
+        messages=prompt,
         session_id=request.session_id,
-        input_ids=request.input_ids,
+        input_ids=input_ids,
         gen_config=gen_config,
         stream_response=True,  # always use stream to enable batching
         sequence_start=True,
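
For illustration, a hypothetical client call exercising the new image_data handling in /generate might look like the sketch below (host, port, and the image URL are assumptions; only the field names come from the handler above):

# Minimal sketch of a /generate request carrying an image; values are placeholders.
import requests

payload = {
    'session_id': 1,
    'prompt': 'Describe the image.',
    'image_data': 'https://example.com/cat.jpg',  # str, dict, or a list of either
    'max_tokens': 64,
}
resp = requests.post('http://0.0.0.0:23333/generate', json=payload)
print(resp.text)

The handler converts such a request into a single user message containing one text item and one image_url item before handing it to the async engine.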

lmdeploy/serve/openai/protocol.py

Lines changed: 6 additions & 0 deletions
@@ -439,11 +439,17 @@ class UpdateParamsRequest(BaseModel):
     finished: bool = False


+# str for url/base64, base64 should be data:image/jpeg;base64, dict should be {'url': url/base64, 'options': ...}
+ImageDataInputItem = Union[str, Dict]
+ImageDataFormat = Union[ImageDataInputItem, List[ImageDataInputItem]]
+
+
 # /generate input
 class GenerateReqInput(BaseModel):
     session_id: Optional[int] = -1
     prompt: Optional[str] = None
     input_ids: Optional[List[int]] = None
+    image_data: Optional[ImageDataFormat] = None
     return_logprob: Optional[bool] = None
     max_tokens: int = 128
     stop: Optional[Union[str, List[str]]] = None
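
The alias admits several shapes for image_data; the values below are placeholders that only illustrate the str/dict/list structure defined above:

# Each assignment is a valid ImageDataFormat value (contents are made up).
image_data = 'https://example.com/cat.jpg'                            # plain URL
image_data = 'data:image/jpeg;base64,...'                             # base64 data URI
image_data = {'url': 'https://example.com/cat.jpg', 'options': {}}    # dict form
image_data = ['https://example.com/a.jpg', {'url': 'https://example.com/b.jpg'}]  # list of items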

lmdeploy/turbomind/deploy/config.py

Lines changed: 0 additions & 3 deletions
@@ -54,9 +54,6 @@ class ModelConfig:
     # Therefore, we add a new attr "embedding_size" to represent the vocab dim
     # of token_embedding
     embedding_size: int = 0
-    # for some models like qwen2.5, the vocab size of the model is larger than
-    # the vocab size of the tokenizer.
-    tokenizer_size: int = None
     num_layer: int = None
     inter_size: List[int] = None
     norm_eps: float = None

lmdeploy/turbomind/deploy/target_model/base.py

Lines changed: 0 additions & 4 deletions
@@ -101,10 +101,6 @@ def update_model_config(self):
         final_cfg.update(self.input_model_info)
         if 'embedding_size' not in self.input_model_info.keys():
             final_cfg.update(embedding_size=self.input_model_info['vocab_size'])
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.input_model.tokenizer_path, trust_remote_code=True)
-        tokenizer_size = min(len(tokenizer), final_cfg['vocab_size'])
-        final_cfg.update(tokenizer_size=tokenizer_size)

         self.model_config = config_from_dict(ModelConfig, final_cfg)

lmdeploy/turbomind/turbomind.py

Lines changed: 10 additions & 2 deletions
@@ -720,7 +720,12 @@ async def async_stream_infer(self,
             try:
                 tokenizer_info = TokenizerInfo.from_huggingface(tokenizer.model.model, vocab_size=vocab_size)
                 decode_grammar_type = gen_config.response_format['type']
-                decode_grammar = gen_config.response_format[decode_grammar_type]['schema']
+                if decode_grammar_type == 'json_schema':
+                    decode_grammar = gen_config.response_format[decode_grammar_type]['schema']
+                elif decode_grammar_type == 'regex_schema':
+                    decode_grammar = gen_config.response_format[decode_grammar_type]
+                elif decode_grammar_type == 'json_object':
+                    decode_grammar = '{"type" : "object", "additionalProperties": true}'

                 compiler = _xgr.GrammarCompiler(tokenizer_info)

@@ -730,9 +735,12 @@
                 elif decode_grammar_type == 'regex_schema':
                     decode_grammar = str(decode_grammar)
                     grammar = compiler.compile_regex(decode_grammar)
+                elif decode_grammar_type == 'json_object':
+                    decode_grammar = str(decode_grammar)
+                    grammar = compiler.compile_json_schema(decode_grammar)
                 else:
                     assert False, f'Decode grammar type {decode_grammar_type} should be in ' \
-                                  '["json_schema", "regex_schema"]'
+                                  '["json_schema", "regex_schema", "json_object"]'

                 self.model_inst.set_grammar(grammar)
             except ValueError as e:
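
As a usage sketch, the new json_object type can be requested through GenerationConfig.response_format, which the code above expands into a permissive JSON schema before compiling it with xgrammar (the pipeline call and model path are assumptions, not part of this commit):

# Minimal sketch, assuming lmdeploy's pipeline API; the model path is a placeholder.
from lmdeploy import GenerationConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat')
gen_config = GenerationConfig(response_format={'type': 'json_object'})
print(pipe(['List three fruits as a JSON object.'], gen_config=gen_config))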

lmdeploy/vl/engine.py

Lines changed: 10 additions & 6 deletions
@@ -86,12 +86,16 @@ async def wrap_for_pytorch(
             ]
         )
         """
-        result = self.model.to_pytorch(messages,
-                                       chat_template,
-                                       tokenizer,
-                                       sequence_start,
-                                       tools=tools,
-                                       enable_thinking=enable_thinking)
+        has_input_ids = self.model.has_input_ids(messages)
+        if not has_input_ids:
+            result = self.model.to_pytorch(messages,
+                                           chat_template,
+                                           tokenizer,
+                                           sequence_start,
+                                           tools=tools,
+                                           enable_thinking=enable_thinking)
+        else:
+            result = self.model.to_pytorch_with_input_ids(messages)
         # clear data
         for i, message in enumerate(messages):
             if isinstance(message['content'], List):

lmdeploy/vl/model/base.py

Lines changed: 50 additions & 0 deletions
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
+from itertools import groupby
 from typing import Dict, List, Union

 import numpy as np
@@ -104,6 +105,18 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """  # noqa
         raise NotImplementedError()

+    def has_input_ids(self, messages: List[Dict]) -> bool:
+        """Check whether the messages contain input_ids directly.
+
+        Args:
+            messages (List[Dict]): a list of message, which is supposed to be
+                the output of `preprocess`
+        Returns:
+            bool: whether the messages contain input_ids directly
+        """
+        users = [x['content'] for x in messages if x['role'] == 'user']
+        return len(users) == 1 and isinstance(users[0], List) and isinstance(users[0][0].get('text', ''), List)
+
     def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
@@ -168,6 +181,43 @@ def collect_images(messages):
             }) for x in content if x['type'] == 'image'])
         return images

+    def to_pytorch_with_input_ids(self, messages):
+        """Pack the preprocessing results in a format compatible with what is
+        required by pytorch engine when input_ids are provided directly.
+
+        Args:
+            messages(List[Dict]): the output of `preprocess`
+        """
+        # collect all preprocessing result from messages
+        preps = [x['content'] for x in messages if x['role'] == 'preprocess']
+        assert len(preps) == 1
+        preps = preps[0]
+
+        _input_ids = messages[0]['content'][0]['text']
+        segs = []
+        for k, g in groupby(_input_ids, lambda x: x == self.image_token_id):
+            if not k:
+                segs.append(list(g))
+            else:
+                segs.extend([[]] * (len(list(g)) - 1))
+        if _input_ids[0] == self.image_token_id:
+            segs = [[]] + segs
+        if _input_ids[-1] == self.image_token_id:
+            segs = segs + [[]]
+
+        assert self.image_token_id == preps[0]['image_token_id']
+        assert len(segs) == len(preps) + 1, (f'the number of image token id {self.image_token_id} is not equal '
+                                             f'to input images, {len(segs) - 1} vs {len(preps)}')
+        input_ids = []
+        for i, seg in enumerate(segs):
+            if i > 0 and i <= len(preps):
+                preps[i - 1].update(offset=len(input_ids))
+                image_tokens = preps[i - 1]['image_tokens']
+                input_ids.extend([self.image_token_id] * image_tokens)
+            input_ids.extend(seg)
+
+        return dict(prompt=None, input_ids=input_ids, multimodal=preps)
+
     def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_start):
         """Auxiliary function to pack the preprocessing results in a format
         compatible with what is required by pytorch engine.
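
To make the interleaving in to_pytorch_with_input_ids concrete, here is a small worked sketch with made-up values (image_token_id, the token ids, and image_tokens are all hypothetical):

# Hypothetical inputs: one image placeholder that expands to 3 image tokens.
from itertools import groupby

image_token_id = 92544
_input_ids = [1, 5, 92544, 7, 8]
preps = [{'image_token_id': 92544, 'image_tokens': 3}]

segs = []
for is_img, g in groupby(_input_ids, lambda x: x == image_token_id):
    if not is_img:
        segs.append(list(g))
    else:
        segs.extend([[]] * (len(list(g)) - 1))
# segs == [[1, 5], [7, 8]]: one image slot sits between the two text segments

input_ids = []
for i, seg in enumerate(segs):
    if 0 < i <= len(preps):
        preps[i - 1]['offset'] = len(input_ids)
        input_ids.extend([image_token_id] * preps[i - 1]['image_tokens'])
    input_ids.extend(seg)
# input_ids == [1, 5, 92544, 92544, 92544, 7, 8]; preps[0]['offset'] == 2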
