QEfficient/transformers/models/modeling_auto.py (10 additions & 5 deletions)

@@ -1313,9 +1313,14 @@ def kv_offload_generate(
      vision_end = perf_counter()

      lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
-     lang_inputs["position_ids"] = np.where(
-         lang_inputs.pop("attention_mask"), np.arange(padded_len), -1
-     )  # Need to use -1 as position_ids for invalid tokens
+
+     if "position_ids" in inputs:
+         lang_inputs["position_ids"] = inputs["position_ids"]
+         lang_inputs.pop("attention_mask")
+     else:
+         lang_inputs["position_ids"] = np.where(
+             lang_inputs.pop("attention_mask"), np.arange(padded_len), -1
+         )  # Need to use -1 as position_ids for invalid tokens

      not_mllama = hasattr(self.model.config, "model_type") and self.model.config.model_type != "mllama"
      if not_mllama:
@@ -1336,7 +1341,7 @@ def kv_offload_generate(
      for i in range(num_chunks):
          chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
          chunk_inputs["position_ids"] = lang_inputs["position_ids"][
-             :, i * prefill_seq_len : (i + 1) * prefill_seq_len
+             ..., i * prefill_seq_len : (i + 1) * prefill_seq_len
          ]
          outputs = lang_session.run(chunk_inputs)
          chunk_inputs["image_idx"] = outputs["image_idx_output"]
@@ -1353,7 +1358,7 @@ def kv_offload_generate(

      # Get first token
      lang_inputs["input_ids"] = outputs["logits"].argmax(2)
-     lang_inputs["position_ids"] = input_len.numpy()
+     lang_inputs["position_ids"] = np.max(lang_inputs["position_ids"], axis=-1, keepdims=True) + 1
      if "cross_attention_mask" in lang_inputs:
          bs, _, num_images, img_tiles = lang_inputs["cross_attention_mask"].shape
          lang_inputs["cross_attention_mask"] = torch.ones((bs, 1, num_images, img_tiles), dtype=torch.int64).numpy()
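Taken together, these three hunks let `kv_offload_generate` accept precomputed position ids: Qwen2.5-VL supplies 3-D mRoPE position_ids of shape (3, batch, seq) (temporal/height/width planes), while text-only inputs keep the 2-D (batch, seq) layout derived from the attention mask. Slicing with `...` instead of `:` chunks the last (sequence) axis regardless of rank, and `np.max(..., axis=-1, keepdims=True) + 1` yields the first decode position in either layout, replacing the 2-D-only `input_len`. A minimal numpy sketch of both cases (the -1 padding convention and the two expressions come from the diff; the values are illustrative):

```python
import numpy as np

prefill_seq_len = 4

# 2-D case: positions derived from the attention mask; padding marked -1.
pos_2d = np.array([[0, 1, 2, 3, 4, 5, -1, -1]])    # (batch, padded_len)
# 3-D mRoPE case: one plane each for temporal / height / width positions.
pos_3d = np.stack([pos_2d, pos_2d, pos_2d])        # (3, batch, padded_len)

for pos in (pos_2d, pos_3d):
    # "..." slices the final axis whatever the rank, so the same chunked
    # prefill loop serves both layouts.
    chunk0 = pos[..., 0 * prefill_seq_len : 1 * prefill_seq_len]
    # First decode position: one past the largest valid prompt position,
    # per batch row (and per mRoPE plane); keepdims preserves the rank.
    next_pos = np.max(pos, axis=-1, keepdims=True) + 1
    print(pos.shape, chunk0.shape, next_pos.ravel()[:1])  # position 6 in both
```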
QEfficient/transformers/models/pytorch_transforms.py (30 additions & 0 deletions)

@@ -152,6 +152,18 @@
      Qwen2Model,
      Qwen2RMSNorm,
  )
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+     Qwen2_5_VisionTransformerPretrainedModel,
+     Qwen2_5_VLAttention,
+     Qwen2_5_VLDecoderLayer,
+     Qwen2_5_VLForConditionalGeneration,
+     Qwen2_5_VLModel,
+     Qwen2_5_VLTextModel,
+     Qwen2_5_VLVisionAttention,
+ )
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+     Qwen2RMSNorm as Qwen2_5RMSNorm,
+ )
  from transformers.models.qwen3.modeling_qwen3 import (
      Qwen3Attention,
      Qwen3DecoderLayer,
@@ -356,6 +368,15 @@
      QEffQwen2ForCausalLM,
      QEffQwen2Model,
  )
+ from QEfficient.transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+     QEffQwen2_5_VisionTransformerPretrainedModel,
+     QEffQwen2_5_VLAttention,
+     QEffQwen2_5_VLDecoderLayer,
+     QEffQwen2_5_VLModel,
+     QEffQwen2_5_VLTextModel,
+     QEffQwen2_5_VLVisionAttention,
+     QEffQwen_2_5_vl_ForConditionalGeneration,
+ )
  from QEfficient.transformers.models.qwen3.modeling_qwen3 import (
      QEffQwen3Attention,
      QEffQwen3DecoderLayer,
@@ -404,6 +425,7 @@ class CustomOpsTransform(ModuleMappingTransform):
      Phi3RMSNorm: CustomRMSNormAIC,
      Qwen2RMSNorm: CustomRMSNormAIC,
      Qwen3RMSNorm: CustomRMSNormAIC,
+     Qwen2_5RMSNorm: CustomRMSNormAIC,
      MllamaTextRMSNorm: CustomRMSNormAIC,
      GraniteRMSNorm: CustomRMSNormAIC,
      PixtralRMSNorm: CustomRMSNormAIC,
@@ -544,6 +566,14 @@ class KVCacheTransform(ModuleMappingTransform):
      Qwen3DecoderLayer: QEffQwen3DecoderLayer,
      Qwen3Model: QEffQwen3Model,
      Qwen3ForCausalLM: QEffQwen3ForCausalLM,
+     # Qwen2.5 VL
+     Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
+     Qwen2_5_VLModel: QEffQwen2_5_VLModel,
+     Qwen2_5_VLAttention: QEffQwen2_5_VLAttention,
+     Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer,
+     Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel,
+     Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention,
+     Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel,
      # Starcoder2
      Starcoder2Attention: QEffStarcoder2Attention,
      Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer,
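Both tables above extend `ModuleMappingTransform` subclasses, which swap each matching Hugging Face module class for its QEff counterpart so the optimized forward takes over while the loaded weights stay in place. A minimal sketch of that class-swap pattern, written from the mapping style visible in this diff (the real implementation inside QEfficient may differ in detail):

```python
from typing import Dict, Tuple, Type

import torch.nn as nn


class ModuleSwapSketch:
    """Illustrative stand-in for a ModuleMappingTransform subclass:
    reassigns the class of every matching submodule in place, so the
    QEff forward() takes over without copying weights or buffers."""

    # In the real transforms, this table is what the diff extends
    # (e.g. Qwen2_5_VLAttention -> QEffQwen2_5_VLAttention).
    _module_mapping: Dict[Type[nn.Module], Type[nn.Module]] = {}

    @classmethod
    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
        transformed = False
        for module in model.modules():
            replacement = cls._module_mapping.get(type(module))
            if replacement is not None:
                module.__class__ = replacement  # weights stay, behavior changes
                transformed = True
        return model, transformed
```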
QEfficient/transformers/models/qwen2_5_vl/__init__.py (6 additions & 0 deletions)

@@ -0,0 +1,6 @@
+ # -----------------------------------------------------------------------------
+ #
+ # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+ # SPDX-License-Identifier: BSD-3-Clause
+ #
+ # -----------------------------------------------------------------------------
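With these pieces in place, Qwen2.5-VL should be reachable through the same high-level flow as the other vision-language models in the repo. A hypothetical end-to-end sketch, assuming QEfficient's documented QEFFAutoModelForImageTextToText API (the from_pretrained/compile/generate names and kwargs follow the project README and may differ by version; the checkpoint and dummy image are illustrative):

```python
from PIL import Image
from transformers import AutoProcessor

from QEfficient import QEFFAutoModelForImageTextToText

model_id = "Qwen/Qwen2.5-VL-3B-Instruct"  # illustrative checkpoint
processor = AutoProcessor.from_pretrained(model_id)

# kv_offload=True selects the two-QPC (separate vision / language) path,
# whose decode loop is the kv_offload_generate method patched above.
model = QEFFAutoModelForImageTextToText.from_pretrained(model_id, kv_offload=True)
model.compile(prefill_seq_len=128, ctx_len=4096, num_cores=16)

image = Image.new("RGB", (448, 448), "white")  # dummy image for the sketch
inputs = processor(images=image, text="Describe this image.", return_tensors="pt")
model.generate(inputs=inputs, generation_len=64)
```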