2 changes: 1 addition & 1 deletion QEfficient/base/pytorch_transforms.py
@@ -177,4 +177,4 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
return model, transformed


VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration"}
VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration", "QEffLlama4ForCausalLM"}
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/gemma3/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
778 changes: 778 additions & 0 deletions QEfficient/transformers/models/gemma3/modeling_gemma3.py

Large diffs are not rendered by default.
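The new 778-line Gemma3 modeling file is not rendered here, but the QEff classes it defines are registered in the transform maps further down. A rough usage sketch follows, assuming the new architecture plugs into QEfficient's existing high-level flow; the model card and the compile/generate arguments are placeholders, not values taken from this PR:

```python
# Hedged sketch of exercising the new Gemma3 support through QEfficient's
# high-level API; model card and compile/generate values are illustrative.
from transformers import AutoTokenizer
from QEfficient import QEFFAutoModelForCausalLM

model_card = "google/gemma-3-4b-it"  # assumed Gemma3 checkpoint, not from this PR
tokenizer = AutoTokenizer.from_pretrained(model_card)

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_card)
qeff_model.compile(prefill_seq_len=128, ctx_len=1024, num_cores=16)  # placeholder values
qeff_model.generate(prompts=["What is on-device AI?"], tokenizer=tokenizer)
```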

24 changes: 10 additions & 14 deletions QEfficient/transformers/models/modeling_auto.py
@@ -769,12 +769,13 @@ def kv_offload_generate(
device_ids: List[int] = None,
generation_len: int = None,
):
if not self.vision_model.qpc_path or not self.lang_model.qpc_path:
raise TypeError("Please run compile API for vision and language model first!")
if not self.lang_model.qpc_path:
raise TypeError("Please run compile API for language model first!")

lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_ids, activate=False)

vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)
if self.vision_model.qpc_path:
vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)

batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path)

@@ -849,7 +850,8 @@ def kv_offload_generate(
if not_mllama:
lang_inputs["image_idx"] = np.array([[0]])

vision_session.deactivate()
if self.vision_model.qpc_path:
vision_session.deactivate()
lang_session.activate()

lang_session.set_buffers(vision_outputs)
@@ -859,6 +861,8 @@
prefill_start = perf_counter()

# Run prefill
chunk_inputs = lang_inputs.copy()
chunk_inputs["index"] = np.array([[0]])
for i in range(num_chunks):
chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
chunk_inputs["position_ids"] = lang_inputs["position_ids"][
@@ -1087,11 +1091,8 @@ def cloud_ai_100_generate(
qpc_session = QAICInferenceSession(
self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False
)

batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path)

pad_token_id = 1

# Skip inputs/outputs
qpc_session.skip_buffers(
[
@@ -1699,6 +1700,7 @@ def build_decode_specialization(
"ctx_len": ctx_len,
"num_logits_to_keep": (num_speculative_tokens + 1) if self.is_tlm else None,
}

if self.continuous_batching:
spec["full_batch_size"] = kv_cache_batch_size
else:
@@ -1785,7 +1787,6 @@ def compile(

# --- Specializations ---
specializations = []

if prefill_only is None or prefill_only or prefill_seq_len == 1:
specializations.append(
self.build_prefill_specialization(
@@ -1833,11 +1834,6 @@
**compiler_options,
)

if compiler_options.get("io_encrypt", None):
logger.warning(
"Compilation for IO-Encrypt has been successfully completed. However, Efficient-Transformers do not support IO-Encrypt execution. Please run the execution separately with QPC compiled without io-encrypt."
)

return qpc_path

# FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
@@ -1890,7 +1886,7 @@ def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[
elif num_speculative_tokens is None:
raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` instance variable is True.")

if not isinstance(num_speculative_tokens, int) and num_speculative_tokens < 2:
if not isinstance(num_speculative_tokens, int) and num_speculative_tokens:
ValueError(
f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}"
)
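The kv_offload_generate changes above make the vision QPC optional (text-only prompts only need the language QPC) and seed chunk_inputs with an index buffer before the prefill loop. A condensed sketch of the resulting control flow; the import path is assumed and elided steps are marked with comments:

```python
# Condensed sketch of the guarded vision-session flow introduced above.
# Import path assumed; error messages mirror the diff.
from QEfficient.generation.cloud_infer import QAICInferenceSession


def kv_offload_generate_sketch(vision_model, lang_model, device_ids=None):
    if not lang_model.qpc_path:
        raise TypeError("Please run compile API for language model first!")

    lang_session = QAICInferenceSession(lang_model.qpc_path, device_ids, activate=False)

    vision_session = None
    if vision_model.qpc_path:  # text-only runs simply skip the vision graph
        vision_session = QAICInferenceSession(vision_model.qpc_path, device_ids)

    # ... run the vision graph (if any), build lang_inputs / vision_outputs ...

    if vision_session is not None:
        vision_session.deactivate()
    lang_session.activate()
    return lang_session
```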
23 changes: 23 additions & 0 deletions QEfficient/transformers/models/pytorch_transforms.py
@@ -36,6 +36,14 @@
Gemma2Model,
Gemma2RMSNorm,
)
from transformers.models.gemma3.modeling_gemma3 import (
Gemma3Attention,
Gemma3DecoderLayer,
Gemma3ForCausalLM,
Gemma3ForConditionalGeneration,
Gemma3RMSNorm,
Gemma3TextModel,
)
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel, GPT2Model
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import (
GPTBigCodeAttention,
@@ -171,6 +179,14 @@
QEffGemma2ForCausalLM,
QEffGemma2Model,
)
from QEfficient.transformers.models.gemma3.modeling_gemma3 import (
QEffGemma3Attention,
QEffGemma3CustomRMSNormAIC,
QEffGemma3DecoderLayer,
QEffGemma3ForCausalLMModel,
QEffGemma3ForConditionalGeneration,
QEffGemma3TextModel,
)
from QEfficient.transformers.models.gpt2.modeling_gpt2 import (
QEffGPT2Attention,
QEffGPT2Block,
@@ -319,6 +335,7 @@ class CustomOpsTransform(ModuleMappingTransform):
MllamaTextRMSNorm: CustomRMSNormAIC,
GraniteRMSNorm: CustomRMSNormAIC,
GraniteMoeRMSNorm: CustomRMSNormAIC,
Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC,
}
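The new Gemma3RMSNorm entry maps to a dedicated QEffGemma3CustomRMSNormAIC rather than the shared CustomRMSNormAIC because Gemma-family RMSNorm scales by (1 + weight) instead of weight. A reference formulation of that numerically distinct norm is sketched below; it is not the repository's QEffGemma3CustomRMSNormAIC implementation:

```python
# Hedged reference formulation of Gemma-style RMSNorm, illustrating why a
# dedicated custom-op mapping is needed for Gemma3.
import torch


def gemma_rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    variance = x.pow(2).mean(-1, keepdim=True)
    x_normed = x * torch.rsqrt(variance + eps)
    return x_normed * (1.0 + weight)  # Gemma-specific (1 + weight) scaling
```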


@@ -373,6 +390,12 @@ class KVCacheTransform(ModuleMappingTransform):
Gemma2DecoderLayer: QEffGemma2DecoderLayer,
Gemma2Model: QEffGemma2Model,
Gemma2ForCausalLM: QEffGemma2ForCausalLM,
# Gemma3
Gemma3Attention: QEffGemma3Attention,
Gemma3DecoderLayer: QEffGemma3DecoderLayer,
Gemma3TextModel: QEffGemma3TextModel,
Gemma3ForCausalLM: QEffGemma3ForCausalLMModel,
Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration,
# Granite
GraniteModel: QEffGraniteModel,
GraniteForCausalLM: QEffGraniteForCausalLM,
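The Gemma3 entries above follow the same pattern as the other architectures: each mapping pairs a Hugging Face module class with its QEfficient replacement. A simplified restatement of how such a ModuleMappingTransform-style mapping is applied (not the repository's exact implementation):

```python
# Hedged sketch: every module whose class appears in the mapping has its
# __class__ swapped for the QEfficient counterpart.
import torch.nn as nn


def apply_module_mapping(model: nn.Module, module_mapping: dict) -> tuple[nn.Module, bool]:
    transformed = False
    for module in model.modules():
        if repl := module_mapping.get(type(module)):
            module.__class__ = repl  # e.g. Gemma3Attention -> QEffGemma3Attention
            transformed = True
    return model, transformed
```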
2 changes: 1 addition & 1 deletion QEfficient/utils/constants.py
@@ -138,7 +138,7 @@ class QnnConstants:
# Converter Arguments
FLOAT_BITWIDTH = 16
FLOAT_BIAS_BITWIDTH = 32
CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification "
CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "

# Context-Binary-Generator Arguments
LOG_LEVEL = "error"
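For reference, one way the updated converter defaults could be turned into an actual command line; the converter binary name and the input/output flags below are assumptions for illustration and are not part of this change:

```python
# Hedged sketch: assembling CONVERTER_DEFAULT_ARGS into a command list.
# Binary name and input/output flags are assumed, not taken from this PR.
import shlex

CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "

cmd = [
    "qnn-onnx-converter",                 # assumed converter entry point
    *shlex.split(CONVERTER_DEFAULT_ARGS),
    "--input_network", "model.onnx",      # assumed flags / paths
    "--output_path", "model.cpp",
]
print(" ".join(cmd))
```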
67 changes: 28 additions & 39 deletions QEfficient/utils/generate_qnn_network_specialization_config.py
@@ -66,55 +66,44 @@ def generate_qnn_specialization(
raise AttributeError(f"ERROR: {input_shape} Shape not Found")
shapes.append(shape)

# Filling shape value for nodes with shape size != 2, example: past_key / past_value nodes.
if len(shapes) != 2:
shape_list = []
prefill_decode_shapes = False
if len(specializations) > 1 and (node.name in ["input_ids", "position_ids"]):
prefill_decode_shapes = True
for input_shape in shapes:
# If shape contains the parameter string, its value is extracted from the specialization file.
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
if (
not prefill_decode_shapes
and len(specializations) > 1
and input_shape in specializations[1]
and specializations[0][input_shape] != specializations[1][input_shape]
):
prefill_decode_shapes = True
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
# If shape contains the value, then that value is used as it is.
else:
shape_list.append(input_shape)
# Calculated shape is now assigned to the input node.
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")

if prefill_decode_shapes:
shape_list = []
for input_shape in shapes:
# If shape contains the parameter string, its value is extracted from the specialization file.
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
if input_shape in specializations[1]:
shape_list.append(int(specializations[1][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
# If shape contains the value, then that value is used as it is.
else:
shape_list.append(input_shape)

# Calculated shape is now assigned to the input node.
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
# Filling shape value for nodes with shape size == 2, example: input_ids, position_ids, etc.
else:
shape_list = []
for input_shape in shapes:
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
else:
shape_list.append(input_shape)
# If the specializations file contains more than one parameter list, the first list is used for the prefill graph and the second for the decode graph.
if len(specializations) > 1:
prefill_shape_list = shape_list
decode_shape_list = []
for input_shape in shapes:
if isinstance(input_shape, str):
if input_shape in specializations[1]:
decode_shape_list.append(int(specializations[1][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
else:
decode_shape_list.append(input_shape)

input_info["Shape"] = (
str(prefill_shape_list).replace("[", "(").replace("]", ")")
+ ", "
+ str(decode_shape_list).replace("[", "(").replace("]", ")")
)

# If the specializations file contains only one parameter list, that list is used for the decode graph information.
else:
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
input_info["Shape"] += ", " + str(shape_list).replace("[", "(").replace("]", ")")

# Finally, input node is created with its name, and desired model parameters {DataType, Shape}
input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info})
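A standalone restatement of the shape-string construction above, reduced to the two-specialization case: each symbolic dimension is resolved against the prefill list and the decode list, and the two resulting shapes are emitted as a comma-separated pair. The specialization values below are hypothetical:

```python
# Self-contained illustration of the prefill/decode shape formatting above;
# the specialization values are hypothetical.
specializations = [
    {"batch_size": "1", "seq_len": "128", "ctx_len": "1024"},  # prefill graph
    {"batch_size": "1", "seq_len": "1", "ctx_len": "1024"},    # decode graph
]


def resolve(shapes, spec):
    # Symbolic dims are looked up in the specialization; literal ints pass through.
    return [int(spec[dim]) if isinstance(dim, str) else dim for dim in shapes]


def as_shape_string(lst):
    return str(lst).replace("[", "(").replace("]", ")")


prefill = as_shape_string(resolve(["batch_size", "seq_len"], specializations[0]))
decode = as_shape_string(resolve(["batch_size", "seq_len"], specializations[1]))
print(prefill + ", " + decode)  # -> "(1, 128), (1, 1)"
```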