pytorch
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 3 additions & 2 deletions b/‎.ci/scripts/utils.sh
Lines changed: 3 additions & 2 deletions
diff --git a/‎.github/workflows/lint.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/lint.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/qualcomm/_passes/i64_to_i32.py
Lines changed: 2 additions & 0 deletions b/‎backends/qualcomm/_passes/i64_to_i32.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/qualcomm/_passes/lift_constant_scalar_operands.py
Lines changed: 2 additions & 1 deletion b/‎backends/qualcomm/_passes/lift_constant_scalar_operands.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/qualcomm/_passes/replace_inf_values.py
Lines changed: 6 additions & 0 deletions b/‎backends/qualcomm/_passes/replace_inf_values.py
Lines changed: 6 additions & 0 deletions
diff --git a/‎backends/qualcomm/builders/op_cum_sum.py
Lines changed: 2 additions & 0 deletions b/‎backends/qualcomm/builders/op_cum_sum.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/qualcomm/tests/models.py
Lines changed: 10 additions & 10 deletions b/‎backends/qualcomm/tests/models.py
Lines changed: 10 additions & 10 deletions
diff --git a/‎backends/qualcomm/tests/test_qnn_delegate.py
Lines changed: 64 additions & 3 deletions b/‎backends/qualcomm/tests/test_qnn_delegate.py
Lines changed: 64 additions & 3 deletions
diff --git a/‎backends/qualcomm/tests/utils.py
Lines changed: 1 addition & 0 deletions b/‎backends/qualcomm/tests/utils.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl
Lines changed: 47 additions & 53 deletions b/‎backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl
Lines changed: 47 additions & 53 deletions
@@ -156,13 +156,14 @@ build_executorch_runner() {
 }
 
 cmake_install_executorch_lib() {
+  build_type="${1:-Release}"  # optional $1: CMake build type, defaults to Release
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
   retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_BUILD_TYPE="${build_type}" \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
-  cmake --build cmake-out -j9 --target install --config Release
+  cmake --build cmake-out -j9 --target install --config "${build_type}"
 }
 
 download_stories_model_artifacts() {
 
@@ -46,7 +46,7 @@ jobs:
         fi
 
         # This has already been cached in the docker image
-        lintrunner init 2> /dev/null
+        lintrunner init
 
         RC=0
         # Run lintrunner on all files
 
@@ -28,8 +28,10 @@ class I64toI32(ExportPass):
     I64_OPS = {
         exir_ops.edge.aten.argmin.default,
         exir_ops.edge.aten.arange.start_step,
+        exir_ops.edge.aten.cumsum.default,
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.scalar_tensor.default,
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
     }
     # This dict is to ensure that the input of the OPs are int64 due to Pytorch restrictions.
     # For example, scatter op can only accept args[2], the index, as int64.
 
@@ -86,7 +86,8 @@ def _build_tensor_constant(
             dtype=(
                 node.args[0].meta["val"].dtype
                 if not is_float_tensor(node)
-                and not SCALAR_OPS.get(node.target).use_self_dtype
+                and (info := SCALAR_OPS.get(node.target))
+                and not info.use_self_dtype
                 else node.meta["val"].dtype
             ),
             device=node.meta["val"].device,
 
@@ -30,6 +30,12 @@ def call(self, graph_module: torch.fx.GraphModule):
                     arg_list[index] = torch.finfo(torch.float32).min
                 elif arg == float("inf"):
                     arg_list[index] = torch.finfo(torch.float32).max
+
+            if node.target == torch.ops.aten.masked_fill.Scalar:
+                if arg_list[2] == torch.finfo(torch.float32).max:
+                    arg_list[2] = 255
+                elif arg_list[2] == torch.finfo(torch.float32).min:
+                    arg_list[2] = -255
             node.args = tuple(arg_list)
 
         graph_module.recompile()
 
@@ -51,6 +51,8 @@ def define_node(
         dim = self.get_param(node, input_tensor)
 
         output_tensor = self.get_tensor(node, node)
+        if output_tensor.dtype == torch.int64:
+            output_tensor = output_tensor.to(torch.int32)
         output_tensor_wrapper = self.define_tensor(
             node,
             node,
 
@@ -1101,6 +1101,16 @@ def forward(self, x):
         return torch.mean(x, (-1, -2))
 
 
+class MaskedFill(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, attn_mask):
+        return attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
+            attn_mask == 0, float(0.0)
+        )
+
+
 class Maximum(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1751,16 +1761,6 @@ def forward(self, x):
         )
 
 
-class MaskedFill(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, attn_mask):
-        return attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
-            attn_mask == 0, float(0.0)
-        )
-
-
 # Mimi Decoder has 0D tensor which QNN cannot handle.
 class ZeroDimTensor(torch.nn.Module):
     def __init__(self):
 
@@ -272,9 +272,24 @@ def test_qnn_backend_cos(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_cumsum(self):
-        module = CumSum()  # noqa: F405
-        sample_input = (torch.randn(4),)
-        self.lower_module_and_test_output(module, sample_input)
+        sample_input = ()
+        test_comb = [
+            {
+                QCOM_MODULE: [CumSum()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(4),),
+                    (torch.randint(0, 10, size=(4,)),),
+                ],
+            }
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        self.lower_module_and_test_output(module, sample_input)
+                        index += 1
 
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
@@ -311,6 +326,12 @@ def test_qnn_backend_element_wise_add(self):
                 QCOM_MODULE: [AddConstantFloat()],  # noqa: F405
                 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)],
             },
+            {
+                QCOM_MODULE: [
+                    AddConstantLong(),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [(torch.randint(0, 10, size=(2, 3)),)],
+            },
         ]
 
         index = 0
@@ -4526,6 +4547,40 @@ def test_retinanet(self):
             else:
                 self.assertGreaterEqual(msg["mAP"], 0.6)
 
+    def test_roberta(self):
+        if not self.required_envs([self.sentence_dataset]):
+            self.skipTest("missing required envs")
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/roberta.py",
+            "--dataset",
+            self.sentence_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--device",
+            self.device,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+        ]
+        if self.host:
+            cmds.extend(["--host", self.host])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["accuracy"], 0.5)
+
     def test_squeezenet(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
@@ -5344,6 +5399,11 @@ def setup_environment():
         help="Location for imagenet dataset",
         type=str,
     )
+    parser.add_argument(
+        "--sentence_dataset",
+        help="Location for sentence dataset",
+        type=str,
+    )
     parser.add_argument(
         "-p",
         "--pretrained_weight",
@@ -5402,6 +5462,7 @@ def setup_environment():
     TestQNN.executorch_root = args.executorch_root
     TestQNN.artifact_dir = args.artifact_dir
     TestQNN.image_dataset = args.image_dataset
+    TestQNN.sentence_dataset = args.sentence_dataset
     TestQNN.pretrained_weight = args.pretrained_weight
     TestQNN.model_name = args.model_name
     TestQNN.online_prepare = args.online_prepare
 
@@ -183,6 +183,7 @@ class TestQNN(unittest.TestCase):
     executorch_root: str = ""
     artifact_dir: str = ""
     image_dataset: str = ""
+    sentence_dataset: str = ""
     pretrained_weight: str = ""
     enable_profile: bool = False
     op_package_dir: str = ""
 
@@ -13,14 +13,18 @@
 #define IN_T ${buffer_scalar_type(IN_DTYPE)}
 #define OUT_T ${buffer_scalar_type(OUT_DTYPE)}
 
+#define ${MODE}
+
 ${define_active_storage_type("buffer")}
 ${define_required_extensions(IN_DTYPE)}
 ${define_required_extensions(OUT_DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}
+#include "indexing_utils.h"
+
 ${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")}
 
 $if MODE == "per_tensor":
   layout(push_constant) uniform restrict Block {
@@ -29,7 +33,7 @@ $if MODE == "per_tensor":
     int quant_min;
     int quant_max;
   };
-$else:
+$if MODE == "per_token":
   ${layout_declare_tensor(B, "r", "t_scale", "float", "buffer")}
   ${layout_declare_tensor(B, "r", "t_zero_point", "int", "buffer")}
 
@@ -39,87 +43,77 @@ $else:
     int quant_max;
   };
 
+${layout_declare_ubo(B, "int", "out_numel")}
 ${layout_declare_ubo(B, "ivec4", "t_in_sizes")}
 ${layout_declare_ubo(B, "ivec4", "t_in_strides")}
 ${layout_declare_ubo(B, "ivec4", "t_out_sizes")}
 ${layout_declare_ubo(B, "ivec4", "t_out_strides")}
 
-#include "indexing_utils.h"
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+
 #include "dequantize.glslh"
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-void main() {
-$if MODE == "per_tensor":
-  const ivec4 pos = ivec4(
-      gl_GlobalInvocationID.x,
-      gl_GlobalInvocationID.y,
-      gl_GlobalInvocationID.z,
-      0);
-
-  const int t_in_idx = tidx_to_bufi(pos, t_in_strides);
-  const int t_out_idx = tidx_to_bufi(pos, t_out_strides);
-
-  IN_T qvalue = t_in[t_in_idx];
-  OUT_T value;
+const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+const lowp ivec4 in_dim_order = unhash_dim_order(in_layout);
 
-  value = dequantize_val(qvalue, scale, zero_point);
+#ifdef per_tensor
 
-  t_out[t_out_idx] = value;
+void dequantize_per_tensor() {  // one scale / zero_point applied to every element
+  const int out_bufi = int(gl_GlobalInvocationID.x);  // linear index into t_out
 
-$if MODE == "per_token":
-  const ivec4 pos = ivec4(
-      gl_GlobalInvocationID.x,
-      gl_GlobalInvocationID.y,
-      gl_GlobalInvocationID.z,
-      0);
-
-  const int t_in_idx = tidx_to_bufi(pos, t_in_strides);
-  const int t_out_idx = tidx_to_bufi(pos, t_out_strides);
-
-  // Skip if out of bounds
-  if (t_in_idx >= t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w) {
+  if (out_bufi >= out_numel) {  // presumably out_numel is the t_out element count
     return;
   }
 
-  IN_T qvalue = t_in[t_in_idx];
-  OUT_T value;
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order);
+  const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);  // same tensor index via input strides
+
+  IN_T qvalue = t_in[in_bufi];
+  OUT_T value = dequantize_val(qvalue, scale, zero_point);
+
+  t_out[out_bufi] = value;
+}
 
-  // Calculate logical position from linear index and strides
-  ivec4 logical_pos;
-  int remaining = t_in_idx;
+#else
 
-  logical_pos.x = remaining % t_in_sizes.x;
-  remaining /= t_in_sizes.x;
+void dequantize_per_token() {  // scale / zero point looked up per token index
+  const int out_bufi = int(gl_GlobalInvocationID.x);  // linear index into t_out
 
-  logical_pos.y = remaining % t_in_sizes.y;
-  remaining /= t_in_sizes.y;
+  if (out_bufi >= out_numel) {  // presumably out_numel is the t_out element count
+    return;
+  }
 
-  logical_pos.z = remaining % t_in_sizes.z;
-  remaining /= t_in_sizes.z;
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order);
+  const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides);  // same tensor index via input strides
 
-  logical_pos.w = remaining;
+  IN_T qvalue = t_in[in_bufi];
 
-  // Calculate token index based on logical position
   int token_idx = 0;
 
-  // Check dimensions to determine how to calculate token_idx
-  if (t_in_sizes.w > 1) {
+  if (t_out_sizes.w > 1) {  // token index flattens every dim except x
     // 4D tensor
-    token_idx = logical_pos.w * (t_in_sizes.z * t_in_sizes.y) + logical_pos.z * t_in_sizes.y + logical_pos.y;
-  } else if (t_in_sizes.z > 1) {
+    token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y;
+  } else if (t_out_sizes.z > 1) {
     // 3D tensor
-    token_idx = logical_pos.z * t_in_sizes.y + logical_pos.y;
-  } else if (t_in_sizes.y > 1) {
+    token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y;
+  } else if (t_out_sizes.y > 1) {
     // 2D tensor
-    token_idx = logical_pos.y;
+    token_idx = out_tidx.y;
   }
   // For 1D tensor, token_idx remains 0
 
-  // Make sure token_idx is within bounds
   token_idx = min(token_idx, num_tokens - 1);
 
-  value = dequantize_val(qvalue, t_scale[token_idx], t_zero_point[token_idx]);
+  OUT_T value = dequantize_val(qvalue, t_scale[token_idx], t_zero_point[token_idx]);  // per-token parameters
+
+  t_out[out_bufi] = value;
+}
+
+#endif
 
-  t_out[t_out_idx] = value;
+void main() {
+  dequantize_${MODE}();  // ${MODE} is filled in by the shader template; calls the variant compiled above
 }