model: qwen2.5 omni (thinker only) #4969
Changes from all commits: 6840b0c, 28d8e6a, e9e0551, 1780594, 2c527f3, 5c7a68e, 05c5fcc, 6c78337, 175e3b0.
First file in the diff (the base multimodal processor and its special-token helpers):
```diff
@@ -65,12 +65,25 @@ def convert_to_strs(self, processor):
     video_token_regex: Optional[re.Pattern] = None
     audio_token_regex: Optional[re.Pattern] = None

-    def __post_init__(self):
-        if self.image_token_regex is None and self.image_token is not None:
+    def compile_regex(self):
+        # TODO: move convert_to_strs to here, before compiling regex
+        if (
+            self.image_token_regex is None
+            and self.image_token is not None
+            and isinstance(self.image_token, str)
+        ):
             self.image_token_regex = re.compile(re.escape(self.image_token))
-        if self.video_token_regex is None and self.video_token is not None:
+        if (
+            self.video_token_regex is None
+            and self.video_token is not None
+            and isinstance(self.video_token, str)
+        ):
             self.video_token_regex = re.compile(re.escape(self.video_token))
-        if self.audio_token_regex is None and self.audio_token is not None:
+        if (
+            self.audio_token_regex is None
+            and self.audio_token is not None
+            and isinstance(self.audio_token, str)
+        ):
             self.audio_token_regex = re.compile(re.escape(self.audio_token))

     def collect(self) -> re.Pattern:
```
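For context, a minimal runnable sketch (names invented, not the actual sglang dataclass) of what the new `isinstance` guard in `compile_regex` buys: token fields may now hold either a literal string or an integer token id, and only string tokens are escaped and compiled into regexes.

```python
import re
from dataclasses import dataclass
from typing import Optional, Union

@dataclass
class Tokens:
    image_token: Optional[Union[str, int]] = None
    image_token_regex: Optional[re.Pattern] = None

    def compile_regex(self):
        # Only literal string tokens are escaped and compiled; integer token
        # ids (as the Omni path passes for audio/video below) are skipped.
        if (
            self.image_token_regex is None
            and self.image_token is not None
            and isinstance(self.image_token, str)
        ):
            self.image_token_regex = re.compile(re.escape(self.image_token))

t_str = Tokens(image_token="<|image_pad|>")
t_str.compile_regex()
print(t_str.image_token_regex.pattern)  # <\|image_pad\|>

t_id = Tokens(image_token=151655)  # an integer token id (value made up)
t_id.compile_regex()
print(t_id.image_token_regex)  # None: no regex is compiled for an id
```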
```diff
@@ -216,34 +229,37 @@ def submit_data_loading_tasks(
         # Submit all tasks
         futures = []
         task_info = []
-        image_index, audio_index = 0, 0
+        image_iter, audio_iter = None, None
+        if isinstance(image_data, list):
+            image_iter = iter(image_data)
+        if isinstance(audio_data, list):
+            audio_iter = iter(audio_data)

         for text_part in text_parts:
             if (
                 multimodal_tokens.image_token_regex
                 and multimodal_tokens.image_token_regex.match(text_part)
             ):
-                data = image_data[image_index]
+                assert image_iter
+                data = next(image_iter)
                 is_video = isinstance(data, str) and data.startswith("video:")
-                estimated_frames = estimated_frames_list[image_index]
-                frame_count_limit = max(1, int(estimated_frames * scaling_factor))
                 futures.append(
                     self.io_executor.submit(
                         BaseMultimodalProcessor._load_single_item,
                         data,
                         is_video,
                         False,
-                        frame_count_limit,
+                        None,
                         discard_alpha_channel,
                     )
                 )
-                task_info.append((Modality.IMAGE, data, frame_count_limit))
-                image_index += 1
+                task_info.append((Modality.IMAGE, data, None))
```
Contributor (commented on lines 246 to +256): The logic for […]
```diff
             elif (
                 multimodal_tokens.audio_token_regex
                 and multimodal_tokens.audio_token_regex.match(text_part)
             ):
-                data = audio_data[audio_index]
+                assert audio_iter
+                data = next(audio_iter)
                 futures.append(
                     self.io_executor.submit(
                         BaseMultimodalProcessor._load_single_item,
@@ -255,7 +271,6 @@ def submit_data_loading_tasks(
                     )
                 )
                 task_info.append((Modality.AUDIO, data, None))
-                audio_index += 1

         return futures, task_info
```
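A toy illustration of the index-to-iterator refactor above (data values invented): `next()` consumes the media list in lockstep with the matched placeholders, which removes the separate `image_index`/`audio_index` counters.

```python
image_data = ["img0.png", "video:clip.mp4", "img1.png"]
image_iter = iter(image_data) if isinstance(image_data, list) else None

for text_part in ["<img>", "<img>", "<img>"]:
    assert image_iter is not None
    data = next(image_iter)  # advances in order, no manual index bookkeeping
    is_video = isinstance(data, str) and data.startswith("video:")
    print(data, is_video)
# img0.png False
# video:clip.mp4 True
# img1.png False
```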
```diff
@@ -284,6 +299,8 @@ def load_mm_data(
             image_data = []

         multimodal_tokens.convert_to_strs(self._processor)
+        # TODO: remove this
+        multimodal_tokens.compile_regex()
```
Contributor (commented on lines +302 to +303): There's a […]
```diff
         multimodal_tokens_pattern = multimodal_tokens.collect()

         if isinstance(prompt, list) and return_text:
```
```diff
@@ -361,6 +378,8 @@ def get_mm_items_offset(
         mm_token_id = 3
         return result = [(2,4),(6,7)]
         """
+        assert isinstance(mm_token_id, int), type(mm_token_id)
+        assert isinstance(input_ids, torch.Tensor), type(input_ids)
         mask = input_ids == mm_token_id

         start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
```
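The mask-and-roll trick finds contiguous runs of the multimodal token id. Below is a standalone check against the values in the docstring; the end-position line is the assumed symmetric counterpart of the start-position line shown above, since the diff truncates there.

```python
import torch

input_ids = torch.tensor([0, 1, 3, 3, 3, 5, 3, 3])
mm_token_id = 3

mask = input_ids == mm_token_id
# A run starts where the mask is True but the previous position was not...
start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
# ...and ends where the mask is True but the next position is not.
end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]

print(list(zip(start_positions.tolist(), end_positions.tolist())))
# [(2, 4), (6, 7)]
```

Note that `torch.roll` wraps around, so a run touching both the first and last position of the sequence would need extra handling.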
Second file in the diff (the Qwen2-VL-family multimodal processor):
```diff
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import asyncio
 import math
 import re
```
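The `from __future__ import annotations` line presumably enables postponed evaluation of annotations (PEP 563), so type hints are stored as strings and forward references resolve lazily. A small self-contained demonstration of that behavior:

```python
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional

@dataclass
class Holder:
    # With postponed evaluation, this annotation is stored as the string
    # "Optional[NotYetDefined]" and is never evaluated at class definition.
    field: Optional[NotYetDefined] = None

class NotYetDefined:
    pass

print(Holder())                         # Holder(field=None)
print(Holder.__annotations__["field"])  # Optional[NotYetDefined]
```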
```diff
@@ -14,28 +16,46 @@
     MultimodalSpecialTokens,
 )
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.qwen2_5_omni import Qwen2_5OmniModel
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration


-# Compatible with Qwen2VL and Qwen2_5VL
+# Compatible with Qwen2VL, Qwen2_5VL and Qwen2_5Omni
 class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
-    models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
+    models = [
+        Qwen2VLForConditionalGeneration,
+        Qwen2_5_VLForConditionalGeneration,
+        Qwen2_5OmniModel,
+    ]

     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
-        # The single, pre-expanded image token.
-        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
-        # The regex that matches expanded image tokens.
-        self.IMAGE_TOKEN_REGEX = re.compile(
-            r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
-        )
-        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
-        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-        self.IM_TOKEN_ID = hf_config.image_token_id
-        self.VIDEO_TOKEN_ID = hf_config.video_token_id
-        self.vision_start_token_id = hf_config.vision_start_token_id
-        self.vision_end_token_id = hf_config.vision_end_token_id
+        if self.arch == Qwen2_5OmniModel.__name__:
+            self.image_token_id = hf_config.thinker_config.image_token_index
+            self.image_start_id = hf_config.thinker_config.vision_start_token_id
+            self.image_end_id = hf_config.thinker_config.vision_end_token_id
+            self.audio_token_id = hf_config.thinker_config.audio_token_index
+            self.audio_start_id = hf_config.thinker_config.audio_start_token_id
+            self.audio_end_id = hf_config.thinker_config.audio_end_token_id
+            self.video_token_id = hf_config.thinker_config.video_token_index
+            # TODO: precomputed features might not need pre-processing anymore, try removing this
+            self.IMAGE_TOKEN_REGEX = re.compile(
+                r"<\|vision_bos\|>(?:<\|IMAGE\|>)+<\|vision_eos\|>"
+            )
+            self.image_token = "<|vision_bos|><|IMAGE|><|vision_eo|>"
```
Contributor: There is a typo in the `image_token` string: `<|vision_eo|>` should be `<|vision_eos|>`, matching the `IMAGE_TOKEN_REGEX` above.

Suggested change: `self.image_token = "<|vision_bos|><|IMAGE|><|vision_eos|>"`
```diff
+        else:
+            self.image_token_id = hf_config.image_token_id
+            self.image_start_id = hf_config.vision_start_token_id
+            self.image_end_id = hf_config.vision_end_token_id
+            self.video_token_id = hf_config.video_token_id
+            # The regex that matches expanded image tokens.
+            self.IMAGE_TOKEN_REGEX = re.compile(
+                r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
+            )
+            self.image_token = "<|vision_start|><|image_pad|><|vision_end|>"

         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
```
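To make the two token formats concrete: each regex matches one or more expanded pad tokens between its start/end markers, and the Omni and VL marker sets do not overlap. A quick check with an invented prompt:

```python
import re

OMNI_REGEX = re.compile(r"<\|vision_bos\|>(?:<\|IMAGE\|>)+<\|vision_eos\|>")
VL_REGEX = re.compile(r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>")

prompt = "Describe <|vision_start|><|image_pad|><|image_pad|><|image_pad|><|vision_end|> please"
match = VL_REGEX.search(prompt)
print(match.group(0).count("<|image_pad|>"))  # 3 pads collapse into one match
print(OMNI_REGEX.search(prompt))              # None: Omni uses different markers
```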
```diff
@@ -57,9 +77,12 @@ async def process_mm_data_async(
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
+            audio_data=request_obj.audio_data,
             multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.IMAGE_TOKEN,
+                image_token=self.image_token,
                 image_token_regex=self.IMAGE_TOKEN_REGEX,
+                audio_token=getattr(self, "audio_token_id", None),
+                video_token=getattr(self, "video_token_id", None),
             ),
             max_req_input_len=max_req_input_len,
         )
```
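For intuition about how these special tokens are used, here is a toy sketch (assumed pattern handling, not the actual `load_mm_data` implementation) of splitting a prompt so media placeholders become standalone chunks between text parts:

```python
import re

image_part = r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
combined = re.compile(f"({image_part})")  # capture group keeps the separators

prompt = "look at <|vision_start|><|image_pad|><|vision_end|> and describe it"
parts = [p for p in combined.split(prompt) if p]
print(parts)
# ['look at ', '<|vision_start|><|image_pad|><|vision_end|>', ' and describe it']
```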
```diff
@@ -130,6 +153,18 @@ async def resize_image_async(image):
         resize_tasks = [resize_image_async(image) for image in base_output.images]
         base_output.images = await asyncio.gather(*resize_tasks)

+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=None if images_are_preprocessed else base_output.images,
+            audio=base_output.audios,
+        )
+
+        input_ids = ret["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=self.image_token_id
+        )
+
+        image_grid_thw = None
+        video_grid_thw = None  # TODO
+
         combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
```
Contributor (commented on lines +156 to 170): The method […]
```diff
@@ -141,28 +176,63 @@ async def resize_image_async(image):
         video_grid_thw = None  # TODO
         second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)

-        mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
-            spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-            image_token_id=self.IM_TOKEN_ID,
-            video_token_id=self.VIDEO_TOKEN_ID,
-            vision_start_token_id=self.vision_start_token_id,
-            model_type=self.hf_config.model_type,
-            tokens_per_second=getattr(
-                self.hf_config.vision_config, "tokens_per_second", None
-            ),
-            input_ids=input_ids.unsqueeze(0),
-            image_grid_thw=combined_mm_item.image_grid_thw,
-            video_grid_thw=video_grid_thw,
-            second_per_grid_ts=second_per_grid_ts,
-        )
+        if "input_features" in ret and ret["input_features"] is not None:
+            audio_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=getattr(self, "audio_token_id", None),
+            )
+            item = MultimodalDataItem(
+                audio_features=ret["input_features"],
+                feature_attention_mask=ret["feature_attention_mask"],
+                attention_mask=ret["attention_mask"],
+                # TODO: unify feature and offsets across modalities
+                audio_offsets=audio_offsets,
+                modality=Modality.AUDIO,
+            )
+            items += [item]
+
+        if self.hf_config.model_type == "qwen2_5_omni":
+            feature_attention_mask = ret.get("feature_attention_mask", None)
+            if feature_attention_mask is not None:
+                audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+            else:
+                audio_feature_lengths = None
+            mrope_positions, mrope_position_delta = (
+                MRotaryEmbedding.get_rope_index_omni(
+                    input_ids=input_ids.unsqueeze(0),
+                    config=self.hf_config.thinker_config,
+                    image_grid_thw=ret.get("image_grid_thw", None),
+                    video_grid_thw=ret.get("video_grid_thw", None),
+                    audio_seqlens=audio_feature_lengths,
+                    second_per_grids=ret.get("second_per_grids", None),
+                )
+            )
+        else:
+            mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
+                spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
+                image_token_id=self.IM_TOKEN_ID,
+                video_token_id=self.VIDEO_TOKEN_ID,
+                vision_start_token_id=self.image_start_id,
+                model_type=self.hf_config.model_type,
+                tokens_per_second=getattr(
+                    self.hf_config.vision_config, "tokens_per_second", None
+                ),
+                input_ids=input_ids.unsqueeze(0),
+                image_grid_thw=combined_mm_item.image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
         mrope_positions = mrope_positions.squeeze(1)

         return {
             "input_ids": input_ids.tolist(),
             "mm_items": [combined_mm_item],
-            "im_start_id": self.IM_START_TOKEN_ID,
-            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_start_id": self.image_start_id,
+            "im_end_id": self.image_end_id,
             "im_token_id": self.IM_TOKEN_ID,
```
Contributor: The returned dictionary uses `self.IM_TOKEN_ID`, which is no longer set in `__init__` after this change.

Suggested change: `"im_token_id": self.image_token_id,`
| "audio_start_id": getattr(self, "audio_start_id", None), | ||||||
| "audio_end_id": getattr(self, "audio_end_id", None), | ||||||
| "audio_token_id": getattr(self, "audio_token_id", None), | ||||||
| "video_token_id": self.VIDEO_TOKEN_ID, | ||||||
| "mrope_positions": mrope_positions, | ||||||
| "mrope_position_delta": mrope_position_delta, | ||||||
|
|
||||||
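The `audio_feature_lengths` computation in the Omni branch above reduces a padding mask to per-clip feature lengths; a standalone illustration with made-up mask values:

```python
import torch

# feature_attention_mask: (num_audio_clips, max_feature_len); 1 = real frame.
feature_attention_mask = torch.tensor([
    [1, 1, 1, 0, 0],
    [1, 1, 1, 1, 1],
])
audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
print(audio_feature_lengths)  # tensor([3, 5])
```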
Contributor: The logic for adding a newline to `image_token` has been inverted. While this seems to correctly handle the `qwen2` family of models, this change is quite broad and could have unintended consequences for other models that are not in the `qwen2` family. It would be safer to make this logic more specific to the models that require no newline, rather than making it the default for all `qwen2` models and changing the behavior for all other models.
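A hypothetical sketch of the reviewer's suggestion (model list and helper name invented for illustration): opt specific models out of the trailing newline instead of inverting the default for everyone.

```python
# Models known to expect no newline after the image token (assumed set).
NO_NEWLINE_MODELS = {"qwen2_vl", "qwen2_5_vl", "qwen2_5_omni"}

def image_token_with_separator(image_token: str, model_type: str) -> str:
    # Default behavior (newline appended) is preserved for every other model.
    if model_type in NO_NEWLINE_MODELS:
        return image_token
    return image_token + "\n"

print(repr(image_token_with_separator("<image>", "qwen2_vl")))  # '<image>'
print(repr(image_token_with_separator("<image>", "llava")))     # '<image>\n'
```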