2 changes: 1 addition & 1 deletion QEfficient/base/pytorch_transforms.py
@@ -177,4 +177,4 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
return model, transformed


VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration"}
VLM_SPLIT_GATE_UP_WEIGHTS = {"QEffLlama4ForConditionalGeneration", "QEffLlama4ForCausalLM"}
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/gemma3/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
778 changes: 778 additions & 0 deletions QEfficient/transformers/models/gemma3/modeling_gemma3.py

Large diffs are not rendered by default.
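The new 778-line Gemma3 modeling file is not rendered here, but the QEff classes it defines are registered in the transform maps further down. A rough usage sketch follows, assuming the new architecture plugs into QEfficient's existing high-level flow; the model card and the compile/generate arguments are placeholders, not values taken from this PR:

```python
# Hedged sketch of exercising the new Gemma3 support through QEfficient's
# high-level API; model card and compile/generate values are illustrative.
from transformers import AutoTokenizer
from QEfficient import QEFFAutoModelForCausalLM

model_card = "google/gemma-3-4b-it"  # assumed Gemma3 checkpoint, not from this PR
tokenizer = AutoTokenizer.from_pretrained(model_card)

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_card)
qeff_model.compile(prefill_seq_len=128, ctx_len=1024, num_cores=16)  # placeholder values
qeff_model.generate(prompts=["What is on-device AI?"], tokenizer=tokenizer)
```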

24 changes: 10 additions & 14 deletions QEfficient/transformers/models/modeling_auto.py
@@ -769,12 +769,13 @@ def kv_offload_generate(
device_ids: List[int] = None,
generation_len: int = None,
):
if not self.vision_model.qpc_path or not self.lang_model.qpc_path:
raise TypeError("Please run compile API for vision and language model first!")
if not self.lang_model.qpc_path:
raise TypeError("Please run compile API for language model first!")

lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_ids, activate=False)

vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)
if self.vision_model.qpc_path:
vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)

batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path)

@@ -849,7 +850,8 @@ def kv_offload_generate(
if not_mllama:
lang_inputs["image_idx"] = np.array([[0]])

vision_session.deactivate()
if self.vision_model.qpc_path:
vision_session.deactivate()
lang_session.activate()

lang_session.set_buffers(vision_outputs)
@@ -859,6 +861,8 @@
prefill_start = perf_counter()

# Run prefill
chunk_inputs = lang_inputs.copy()
chunk_inputs["index"] = np.array([[0]])
for i in range(num_chunks):
chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
chunk_inputs["position_ids"] = lang_inputs["position_ids"][
@@ -1087,11 +1091,8 @@ def cloud_ai_100_generate(
qpc_session = QAICInferenceSession(
self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False
)

batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path)

pad_token_id = 1

# Skip inputs/outputs
qpc_session.skip_buffers(
[
@@ -1699,6 +1700,7 @@ def build_decode_specialization(
"ctx_len": ctx_len,
"num_logits_to_keep": (num_speculative_tokens + 1) if self.is_tlm else None,
}

if self.continuous_batching:
spec["full_batch_size"] = kv_cache_batch_size
else:
@@ -1785,7 +1787,6 @@ def compile(

# --- Specializations ---
specializations = []

if prefill_only is None or prefill_only or prefill_seq_len == 1:
specializations.append(
self.build_prefill_specialization(
@@ -1833,11 +1834,6 @@
**compiler_options,
)

if compiler_options.get("io_encrypt", None):
logger.warning(
"Compilation for IO-Encrypt has been successfully completed. However, Efficient-Transformers do not support IO-Encrypt execution. Please run the execution separately with QPC compiled without io-encrypt."
)

return qpc_path

# FIXME: Update this method to match with transformers AutoModelForCausalLM.generate
@@ -1890,7 +1886,7 @@ def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[
elif num_speculative_tokens is None:
raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` instance variable is True.")

if not isinstance(num_speculative_tokens, int) and num_speculative_tokens < 2:
if not isinstance(num_speculative_tokens, int) and num_speculative_tokens:
ValueError(
f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}"
)
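The kv_offload_generate changes above make the vision QPC optional (text-only prompts only need the language QPC) and seed chunk_inputs with an index buffer before the prefill loop. A condensed sketch of the resulting control flow; the import path is assumed and elided steps are marked with comments:

```python
# Condensed sketch of the guarded vision-session flow introduced above.
# Import path assumed; error messages mirror the diff.
from QEfficient.generation.cloud_infer import QAICInferenceSession


def kv_offload_generate_sketch(vision_model, lang_model, device_ids=None):
    if not lang_model.qpc_path:
        raise TypeError("Please run compile API for language model first!")

    lang_session = QAICInferenceSession(lang_model.qpc_path, device_ids, activate=False)

    vision_session = None
    if vision_model.qpc_path:  # text-only runs simply skip the vision graph
        vision_session = QAICInferenceSession(vision_model.qpc_path, device_ids)

    # ... run the vision graph (if any), build lang_inputs / vision_outputs ...

    if vision_session is not None:
        vision_session.deactivate()
    lang_session.activate()
    return lang_session
```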
23 changes: 23 additions & 0 deletions QEfficient/transformers/models/pytorch_transforms.py
@@ -36,6 +36,14 @@
Gemma2Model,
Gemma2RMSNorm,
)
from transformers.models.gemma3.modeling_gemma3 import (
Gemma3Attention,
Gemma3DecoderLayer,
Gemma3ForCausalLM,
Gemma3ForConditionalGeneration,
Gemma3RMSNorm,
Gemma3TextModel,
)
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention, GPT2Block, GPT2LMHeadModel, GPT2Model
from transformers.models.gpt_bigcode.modeling_gpt_bigcode import (
GPTBigCodeAttention,
@@ -171,6 +179,14 @@
QEffGemma2ForCausalLM,
QEffGemma2Model,
)
from QEfficient.transformers.models.gemma3.modeling_gemma3 import (
QEffGemma3Attention,
QEffGemma3CustomRMSNormAIC,
QEffGemma3DecoderLayer,
QEffGemma3ForCausalLMModel,
QEffGemma3ForConditionalGeneration,
QEffGemma3TextModel,
)
from QEfficient.transformers.models.gpt2.modeling_gpt2 import (
QEffGPT2Attention,
QEffGPT2Block,
@@ -319,6 +335,7 @@ class CustomOpsTransform(ModuleMappingTransform):
MllamaTextRMSNorm: CustomRMSNormAIC,
GraniteRMSNorm: CustomRMSNormAIC,
GraniteMoeRMSNorm: CustomRMSNormAIC,
Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC,
}
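The new Gemma3RMSNorm entry maps to a dedicated QEffGemma3CustomRMSNormAIC rather than the shared CustomRMSNormAIC because Gemma-family RMSNorm scales by (1 + weight) instead of weight. A reference formulation of that numerically distinct norm is sketched below; it is not the repository's QEffGemma3CustomRMSNormAIC implementation:

```python
# Hedged reference formulation of Gemma-style RMSNorm, illustrating why a
# dedicated custom-op mapping is needed for Gemma3.
import torch


def gemma_rmsnorm_reference(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    variance = x.pow(2).mean(-1, keepdim=True)
    x_normed = x * torch.rsqrt(variance + eps)
    return x_normed * (1.0 + weight)  # Gemma-specific (1 + weight) scaling
```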


@@ -373,6 +390,12 @@ class KVCacheTransform(ModuleMappingTransform):
Gemma2DecoderLayer: QEffGemma2DecoderLayer,
Gemma2Model: QEffGemma2Model,
Gemma2ForCausalLM: QEffGemma2ForCausalLM,
# Gemma3
Gemma3Attention: QEffGemma3Attention,
Gemma3DecoderLayer: QEffGemma3DecoderLayer,
Gemma3TextModel: QEffGemma3TextModel,
Gemma3ForCausalLM: QEffGemma3ForCausalLMModel,
Gemma3ForConditionalGeneration: QEffGemma3ForConditionalGeneration,
# Granite
GraniteModel: QEffGraniteModel,
GraniteForCausalLM: QEffGraniteForCausalLM,
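The Gemma3 entries above follow the same pattern as the other architectures: each mapping pairs a Hugging Face module class with its QEfficient replacement. A simplified restatement of how such a ModuleMappingTransform-style mapping is applied (not the repository's exact implementation):

```python
# Hedged sketch: every module whose class appears in the mapping has its
# __class__ swapped for the QEfficient counterpart.
import torch.nn as nn


def apply_module_mapping(model: nn.Module, module_mapping: dict) -> tuple[nn.Module, bool]:
    transformed = False
    for module in model.modules():
        if repl := module_mapping.get(type(module)):
            module.__class__ = repl  # e.g. Gemma3Attention -> QEffGemma3Attention
            transformed = True
    return model, transformed
```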
2 changes: 1 addition & 1 deletion QEfficient/utils/constants.py
@@ -138,7 +138,7 @@ class QnnConstants:
# Converter Arguments
FLOAT_BITWIDTH = 16
FLOAT_BIAS_BITWIDTH = 32
CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification "
CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "

# Context-Binary-Generator Arguments
LOG_LEVEL = "error"
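For reference, one way the updated converter defaults could be turned into an actual command line; the converter binary name and the input/output flags below are assumptions for illustration and are not part of this change:

```python
# Hedged sketch: assembling CONVERTER_DEFAULT_ARGS into a command list.
# Binary name and input/output flags are assumed, not taken from this PR.
import shlex

CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "

cmd = [
    "qnn-onnx-converter",                 # assumed converter entry point
    *shlex.split(CONVERTER_DEFAULT_ARGS),
    "--input_network", "model.onnx",      # assumed flags / paths
    "--output_path", "model.cpp",
]
print(" ".join(cmd))
```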
67 changes: 28 additions & 39 deletions QEfficient/utils/generate_qnn_network_specialization_config.py
@@ -66,55 +66,44 @@ def generate_qnn_specialization(
raise AttributeError(f"ERROR: {input_shape} Shape not Found")
shapes.append(shape)

# Filling shape value for nodes with shape size != 2, example: past_key / past_value nodes.
if len(shapes) != 2:
shape_list = []
prefill_decode_shapes = False
if len(specializations) > 1 and (node.name in ["input_ids", "position_ids"]):
prefill_decode_shapes = True
for input_shape in shapes:
# If shape contains the parameter string, its value is extracted from the specialization file.
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
if (
not prefill_decode_shapes
and len(specializations) > 1
and input_shape in specializations[1]
and specializations[0][input_shape] != specializations[1][input_shape]
):
prefill_decode_shapes = True
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
# If shape contains the value, then that value is used as it is.
else:
shape_list.append(input_shape)
# Calculated shape is now assigned to the input node.
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")

if prefill_decode_shapes:
shape_list = []
for input_shape in shapes:
# If shape contains the parameter string, its value is extracted from the specialization file.
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
if input_shape in specializations[1]:
shape_list.append(int(specializations[1][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
# If shape contains the value, then that value is used as it is.
else:
shape_list.append(input_shape)

# Calculated shape is now assigned to the input node.
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
# Filling shape value for nodes with shape size == 2, example: input_ids, position_ids, etc.
else:
shape_list = []
for input_shape in shapes:
if isinstance(input_shape, str):
if input_shape in specializations[0]:
shape_list.append(int(specializations[0][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
else:
shape_list.append(input_shape)
# If the specializations file contains more than one parameter list, the first list is used for the prefill graph and the second for the decode graph.
if len(specializations) > 1:
prefill_shape_list = shape_list
decode_shape_list = []
for input_shape in shapes:
if isinstance(input_shape, str):
if input_shape in specializations[1]:
decode_shape_list.append(int(specializations[1][input_shape]))
else:
raise AttributeError(f"ERROR: {input_shape} is required in specializations")
else:
decode_shape_list.append(input_shape)

input_info["Shape"] = (
str(prefill_shape_list).replace("[", "(").replace("]", ")")
+ ", "
+ str(decode_shape_list).replace("[", "(").replace("]", ")")
)

# If the specializations file contains only one parameter list, that list is used for the decode graph information.
else:
input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
input_info["Shape"] += ", " + str(shape_list).replace("[", "(").replace("]", ")")

# Finally, input node is created with its name, and desired model parameters {DataType, Shape}
input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info})
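A standalone restatement of the shape-string construction above, reduced to the two-specialization case: each symbolic dimension is resolved against the prefill list and the decode list, and the two resulting shapes are emitted as a comma-separated pair. The specialization values below are hypothetical:

```python
# Self-contained illustration of the prefill/decode shape formatting above;
# the specialization values are hypothetical.
specializations = [
    {"batch_size": "1", "seq_len": "128", "ctx_len": "1024"},  # prefill graph
    {"batch_size": "1", "seq_len": "1", "ctx_len": "1024"},    # decode graph
]


def resolve(shapes, spec):
    # Symbolic dims are looked up in the specialization; literal ints pass through.
    return [int(spec[dim]) if isinstance(dim, str) else dim for dim in shapes]


def as_shape_string(lst):
    return str(lst).replace("[", "(").replace("]", ")")


prefill = as_shape_string(resolve(["batch_size", "seq_len"], specializations[0]))
decode = as_shape_string(resolve(["batch_size", "seq_len"], specializations[1]))
print(prefill + ", " + decode)  # -> "(1, 128), (1, 1)"
```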