Commit ee1cc26
shubhagr-qc authored and qcdipankar committed
Fixes in QNN Compilation path. (#454)

1. Fix generate_qnn_specialization to generate correct custom IO for VLM multimodal tests.
2. Add --target_backend AIC as a default parameter in the QNN converter.
3. Add a QNN Multimodal Tests stage in Jenkins.

Signed-off-by: Shubham Agrawal <[email protected]>
1 parent 1fab2bc commit ee1cc26

File tree: 4 files changed, +125 −44 lines

QEfficient/utils/constants.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -138,7 +138,7 @@ class QnnConstants:
     # Converter Arguments
     FLOAT_BITWIDTH = 16
     FLOAT_BIAS_BITWIDTH = 32
-    CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification "
+    CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "
 
     # Context-Binary-Generator Arguments
     LOG_LEVEL = "error"
```
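For context, a minimal sketch of how the widened default might be consumed when a converter command is assembled. `build_converter_cmd`, the tool name, and the `--input_network`/`--output_path` flags are illustrative assumptions, not the repository's actual compile path:

```python
# Illustrative only: `build_converter_cmd` is a hypothetical helper; the tool
# name and I/O flags are assumptions, not QEfficient's real compile path.
CONVERTER_DEFAULT_ARGS = "--preserve_io_datatype --onnx_skip_simplification --target_backend AIC "


def build_converter_cmd(converter_tool: str, onnx_path: str, output_path: str) -> str:
    # With --target_backend AIC baked into the defaults, every conversion
    # targets the AIC backend unless the caller overrides the argument string.
    return f"{converter_tool} --input_network {onnx_path} --output_path {output_path} {CONVERTER_DEFAULT_ARGS}"


print(build_converter_cmd("qnn-onnx-converter", "model.onnx", "qnn/model.cpp"))
```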

QEfficient/utils/generate_qnn_network_specialization_config.py

Lines changed: 28 additions & 39 deletions
```diff
@@ -66,55 +66,44 @@ def generate_qnn_specialization(
                 raise AttributeError(f"ERROR: {input_shape} Shape not Found")
             shapes.append(shape)
 
-        # Filling shape value for nodes with shape size != 2, example: past_key / past_value nodes.
-        if len(shapes) != 2:
+        shape_list = []
+        prefill_decode_shapes = False
+        if len(specializations) > 1 and (node.name in ["input_ids", "position_ids"]):
+            prefill_decode_shapes = True
+        for input_shape in shapes:
+            # If shape contains the parameter string, it value is extracted from the specialization file.
+            if isinstance(input_shape, str):
+                if input_shape in specializations[0]:
+                    shape_list.append(int(specializations[0][input_shape]))
+                    if (
+                        not prefill_decode_shapes
+                        and len(specializations) > 1
+                        and input_shape in specializations[1]
+                        and specializations[0][input_shape] != specializations[1][input_shape]
+                    ):
+                        prefill_decode_shapes = True
+                else:
+                    raise AttributeError(f"ERROR: {input_shape} is required in specializations")
+            # If shape contains the value, then that value is used as it is.
+            else:
+                shape_list.append(input_shape)
+        # Calculated shape is now assigned to the input node.
+        input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
+
+        if prefill_decode_shapes:
             shape_list = []
             for input_shape in shapes:
                 # If shape contains the parameter string, it value is extracted from the specialization file.
                 if isinstance(input_shape, str):
-                    if input_shape in specializations[0]:
-                        shape_list.append(int(specializations[0][input_shape]))
+                    if input_shape in specializations[1]:
+                        shape_list.append(int(specializations[1][input_shape]))
                     else:
                         raise AttributeError(f"ERROR: {input_shape} is required in specializations")
                 # If shape contains the value, then that value is used as it is.
                 else:
                     shape_list.append(input_shape)
-
             # Calculated shape is now assigned to the input node.
-            input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
-        # If shape value for nodes is with shape size == 2, example: input_ids, position_ids, etc.
-        else:
-            shape_list = []
-            for input_shape in shapes:
-                if isinstance(input_shape, str):
-                    if input_shape in specializations[0]:
-                        shape_list.append(int(specializations[0][input_shape]))
-                    else:
-                        raise AttributeError(f"ERROR: {input_shape} is required in specializations")
-                else:
-                    shape_list.append(input_shape)
-            # If specializations file contains more than one parameters list, then first list is used for prefill and second one for decode graph.
-            if len(specializations) > 1:
-                prefill_shape_list = shape_list
-                decode_shape_list = []
-                for input_shape in shapes:
-                    if isinstance(input_shape, str):
-                        if input_shape in specializations[1]:
-                            decode_shape_list.append(int(specializations[1][input_shape]))
-                        else:
-                            raise AttributeError(f"ERROR: {input_shape} is required in specializations")
-                    else:
-                        decode_shape_list.append(input_shape)
-
-                input_info["Shape"] = (
-                    str(prefill_shape_list).replace("[", "(").replace("]", ")")
-                    + ", "
-                    + str(decode_shape_list).replace("[", "(").replace("]", ")")
-                )
-
-            # If specializations file contains only one parameters list, then that list is used for decode graph information.
-            else:
-                input_info["Shape"] = str(shape_list).replace("[", "(").replace("]", ")")
+            input_info["Shape"] += ", " + str(shape_list).replace("[", "(").replace("]", ")")
 
         # Finally, input node is created with its name, and desired model parameters {DataType, Shape}
         input_nodes_info.append({"Name": node.name, "Desired Model Parameters": input_info})
```
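The rewritten logic no longer keys on shape rank: it builds the prefill shape first, then appends a decode shape only when `prefill_decode_shapes` is set, either because the node is `input_ids`/`position_ids` or because some dimension parameter differs between the two specializations. A self-contained sketch of that logic with a worked example (the decode pass is condensed into a comprehension and assumes the parameter exists in `specializations[1]`):

```python
def resolve_shape(name, shapes, specializations):
    """Mirror of the new shape-resolution logic in generate_qnn_specialization."""
    shape_list = []
    prefill_decode_shapes = False
    # input_ids / position_ids always get both prefill and decode shapes.
    if len(specializations) > 1 and name in ["input_ids", "position_ids"]:
        prefill_decode_shapes = True
    for dim in shapes:
        if isinstance(dim, str):
            if dim not in specializations[0]:
                raise AttributeError(f"ERROR: {dim} is required in specializations")
            shape_list.append(int(specializations[0][dim]))
            # A parameter that differs between prefill and decode also forces
            # a second shape to be emitted.
            if (
                not prefill_decode_shapes
                and len(specializations) > 1
                and dim in specializations[1]
                and specializations[0][dim] != specializations[1][dim]
            ):
                prefill_decode_shapes = True
        else:
            shape_list.append(dim)
    shape = str(shape_list).replace("[", "(").replace("]", ")")
    if prefill_decode_shapes:
        decode = [int(specializations[1][d]) if isinstance(d, str) else d for d in shapes]
        shape += ", " + str(decode).replace("[", "(").replace("]", ")")
    return shape


prefill = {"batch_size": 1, "seq_len": 32, "ctx_len": 128}
decode = {"batch_size": 1, "seq_len": 1, "ctx_len": 128}
# past_key/past_value style node: all parameters agree -> one shape.
print(resolve_shape("past_key.0", [1, 8, "ctx_len", 64], [prefill, decode]))    # (1, 8, 128, 64)
# input_ids: prefill and decode shapes are emitted together.
print(resolve_shape("input_ids", ["batch_size", "seq_len"], [prefill, decode]))  # (1, 32), (1, 1)
```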

scripts/Jenkinsfile

Lines changed: 21 additions & 2 deletions
```diff
@@ -106,7 +106,7 @@ pipeline {
         stage('vLLM Tests') {
             steps {
                 catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
-                    build job: 'qefficient_vllm_upstream',
+                    build job: 'qefficient_vllm_upstream',
                     parameters: [string(name: 'NAME', value: "${BUILD_TAG}")],
                     propagate: true,
                     wait: true
@@ -144,13 +144,32 @@ pipeline {
                        mkdir -p $PWD/Qnn_non_cli &&
                        export TOKENIZERS_PARALLELISM=false &&
                        export QEFF_HOME=$PWD/Qnn_non_cli &&
-                       pytest tests -m '(not cli) and (qnn) and (on_qaic)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
+                       pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
                        junitparser merge tests/tests_log5.xml tests/tests_log.xml &&
                        deactivate"
                    '''
                }
            }
        }
+        stage('QNN MultiModal Tests') {
+            steps {
+                timeout(time: 60, unit: 'MINUTES') {
+                    sh '''
+                    sudo docker exec ${BUILD_TAG} bash -c "
+                    source /qnn_sdk/bin/envsetup.sh &&
+                    source /qnn_sdk/bin/envcheck -c &&
+                    cd /efficient-transformers &&
+                    . preflight_qeff/bin/activate &&
+                    mkdir -p $PWD/Non_cli_qnn_multimodal &&
+                    export TOKENIZERS_PARALLELISM=false &&
+                    export QEFF_HOME=$PWD/Non_cli_qnn_multimodal &&
+                    pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log7.xml &&
+                    junitparser merge tests/tests_log7.xml tests/tests_log.xml &&
+                    deactivate"
+                    '''
+                }
+            }
+        }
     }
 
     post {
```
tests/transformers/models/test_image_text_to_text_models.py

Lines changed: 75 additions & 2 deletions
```diff
@@ -5,8 +5,9 @@
 #
 # ----------------------------------------------------------------------------
 
+import os
 from io import BytesIO
-from typing import List
+from typing import List, Optional
 
 import pytest
 import requests
@@ -23,7 +24,8 @@
 
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM, QEFFAutoModelForImageTextToText
 from QEfficient.utils import hf_download
-from QEfficient.utils._utils import get_num_layers_vlm
+from QEfficient.utils._utils import create_json, get_num_layers_vlm
+from QEfficient.utils.constants import QnnConstants
 from QEfficient.utils.device_utils import get_available_device_id
 from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerVlm
 from QEfficient.utils.test_utils import InternProcessor
@@ -198,6 +200,8 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
     n_layer: int = 1,
     kv_offload: bool = False,
     num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
 ):
     model_config = {"model_name": model_name}
     model_config["img_size"] = img_size
@@ -259,6 +263,8 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         mxfp6=False,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
     )
     inputs = processor(images=image, text=prompt, return_tensors="pt")
     if "pixel_values" in inputs:
@@ -281,6 +287,8 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
     n_layer: int = 1,
     kv_offload: bool = False,
     num_devices: int = 1,
+    enable_qnn: Optional[bool] = False,
+    qnn_config: Optional[str] = None,
 ):
     model_config = {"model_name": model_name}
 
@@ -346,6 +354,8 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         mxfp6=False,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
     )
     print("QPC Outputs (QAIC):")
     output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer)
@@ -381,6 +391,42 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
     )
 
 
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config
+)
+def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer
+):
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    if model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct":
+        pytest.skip("QNN is not supported for Llama 4 Scout models")
+
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_size=img_size,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+        enable_qnn=True,
+        qnn_config=qnn_config_json_path,
+    )
+
+
 @pytest.mark.on_qaic
 @pytest.mark.multimodal
 @pytest.mark.parametrize(
@@ -400,3 +446,30 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100(
         batch_size=batch_size,
         kv_offload=kv_offload,
     )
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.multimodal
+@pytest.mark.parametrize(
+    "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config
+)
+def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn(
+    model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer
+):
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+    check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
+        model_name=model_name,
+        prompt_len=prompt_len,
+        ctx_len=ctx_len,
+        max_gen_len=NEW_GENERATION_TOKENS,
+        img_url=img_url,
+        query=query,
+        n_layer=n_layer,
+        batch_size=batch_size,
+        kv_offload=kv_offload,
+        enable_qnn=True,
+        qnn_config=qnn_config_json_path,
+    )
```
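Both new tests materialize a QNN config on disk before compiling. A minimal stand-in for the helper, assuming `create_json` simply serializes a dict to a JSON file; the real helper in `QEfficient.utils._utils` and the actual contents of `QnnConstants.QNN_SAMPLE_CONFIG` may differ:

```python
import json
import os

# Hypothetical placeholder for QnnConstants.QNN_SAMPLE_CONFIG; the real
# constant is defined in QEfficient/utils/constants.py.
QNN_SAMPLE_CONFIG = {"converter_args_extension": "", "context_binary_generator_args_extension": ""}


def create_json(file_path: str, data: dict) -> None:
    # Serialize the QNN compilation config so its path can be passed via qnn_config.
    with open(file_path, "w") as f:
        json.dump(data, f, indent=4)


qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
create_json(qnn_config_json_path, QNN_SAMPLE_CONFIG)
```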
