
Commit 8737301

Merge branch 'main' into dev1/chenweng/roberta
2 parents 1be93d7 + af0a246

File tree: 83 files changed, +1311 / -673 lines (only a subset of the changed files is shown below)


.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ cmake --build cmake-out -j16 --target install --config Release
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \

.ci/scripts/test_model.sh

Lines changed: 19 additions & 23 deletions
@@ -49,14 +49,24 @@ prepare_artifacts_upload() {
 }
 
 build_cmake_executor_runner() {
+  local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
-  cmake -DCMAKE_BUILD_TYPE=Debug \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-    -B${CMAKE_OUTPUT_DIR} .
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  mkdir ${CMAKE_OUTPUT_DIR}
+  if [[ "$backend_string_select" == "XNNPACK" ]]; then
+    echo "Backend $backend_string_select selected"
+    (cd ${CMAKE_OUTPUT_DIR} \
+      && cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  else
+    cmake -DCMAKE_BUILD_TYPE=Debug \
+      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  fi
 }
 
 run_portable_executor_runner() {
@@ -111,19 +121,6 @@ test_model() {
   run_portable_executor_runner
 }
 
-build_cmake_xnn_executor_runner() {
-  echo "Building xnn_executor_runner"
-
-  (rm -rf ${CMAKE_OUTPUT_DIR} \
-    && mkdir ${CMAKE_OUTPUT_DIR} \
-    && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DCMAKE_BUILD_TYPE=Release \
-      -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4
-}
-
 test_model_with_xnnpack() {
   WITH_QUANTIZATION=$1
   WITH_DELEGATION=$2
@@ -148,12 +145,11 @@ test_model_with_xnnpack() {
 
   # Run test model
   if [[ "${BUILD_TOOL}" == "buck2" ]]; then
+    # TODO eventually buck should also use consolidated executor runners
    buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}"
  elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then
-      build_cmake_xnn_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}"
+    build_cmake_executor_runner "XNNPACK"
+    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}"
  else
    echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
    exit 1

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Release \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
 
 #################
 ## linear_qcnw ##
@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             )
 
         graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        dead_code_elimination_pass(graph_module)
 
+        # Re-trace the graph since new nodes were (potentially) inserted
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
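
The new ordering follows the usual torch.fx pass recipe: mutate the graph, recompile, drop dead nodes, then re-trace. A minimal sketch of that recipe, with torch.fx's built-in eliminate_dead_code standing in for ExecuTorch's dead_code_elimination_pass (the toy module below is illustrative):

import torch
import torch.fx


class Toy(torch.nn.Module):
    def forward(self, x):
        dead = x * 2  # traced into the graph, but never used
        return x + 1


gm = torch.fx.symbolic_trace(Toy())

# Drop nodes with no users, analogous to dead_code_elimination_pass.
gm.graph.eliminate_dead_code()

# Recompile so the generated forward() matches the mutated graph.
gm.recompile()
print(gm.code)  # the dead multiply is gone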

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import torch
 import torch.nn.functional as F
 
-from torchao.quantization.GPTQ import _check_linear_int4_k
+from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
 

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from copy import deepcopy
 from typing import Any, Optional, Set
 
 import executorch.backends.vulkan.utils as utils
@@ -22,6 +21,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.tensor import TensorSpec
 
 logger: logging.Logger = logging.getLogger("")
 logger.setLevel(logging.INFO)
@@ -52,7 +52,7 @@ def insert_transition_node(
         (arg,),
     )
     clone_node.meta["val"] = arg.meta["val"]
-    clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+    clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
     clone_node.meta["spec"].const = False
     set_memory_metadata(clone_node, storage, layout)
     arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         # Symbolic integer ops
         torch.ops.aten.sym_size.int,
+        operator.add,
+        operator.lt,
+        operator.gt,
+        operator.ge,
+        operator.le,
+        # Guard and assert ops
+        torch.ops.aten._assert_scalar.default,
+        torch.ops.aten.sym_constrain_range_for_size.default,
     ]
 )
 def register_ephemeral_op(features: OpFeatures):
@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
     return features
 
 
-@update_features(["llama::update_cache", "llama::custom_sdpa"])
+@update_features(
+    [
+        "llama::update_cache",
+        "llama::custom_sdpa",
+    ]
+)
 def register_sdpa_ops(features: OpFeatures):
     features.resize_fn = False
     features.buffer_impl = False
@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
     return features
 
 
-@update_features(exir_ops.edge.aten.view_copy.default)
-def register_view_op(features: OpFeatures):
+@update_features(
+    [
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.permute.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.select_copy.int,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.view_copy.default,
+    ]
+)
+def register_view_ops(features: OpFeatures):
     features.texture_impl = TextureImplFeatures(
         valid_packed_dims=all_packed_dims,
     )
@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
     # Indexing and lookup
     exir_ops.edge.aten.flip.default,
     exir_ops.edge.aten.index_select.default,
-    exir_ops.edge.aten.select_copy.int,
     # Tensor creation
     exir_ops.edge.aten.arange.start_step,
-    exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.constant_pad_nd.default,
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.full_like.default,
@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions
 @update_features(
     [
-        # Indexing and lookup
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Shape Manipulation
         exir_ops.edge.aten.squeeze_copy.dims,
         exir_ops.edge.aten.unsqueeze_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.repeat.default,
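
The @update_features decorator used throughout this registry maps each op (or list of ops) to an OpFeatures record built by the decorated function. A simplified sketch of the pattern, not ExecuTorch's actual machinery (the names mirror the diff, but the classes here are stand-ins):

from typing import Callable, Dict, List, Union


class OpFeatures:
    def __init__(self) -> None:
        self.resize_fn = True
        self.buffer_impl = True


vulkan_supported_ops: Dict[object, "OpFeatures"] = {}


def update_features(ops: Union[object, List[object]]):
    # Register one feature-updating function for a single op or a list of ops.
    def decorator(fn: Callable[[OpFeatures], OpFeatures]):
        for op in ops if isinstance(ops, list) else [ops]:
            vulkan_supported_ops[op] = fn(OpFeatures())
        return fn
    return decorator


@update_features(["llama::update_cache", "llama::custom_sdpa"])
def register_sdpa_ops(features: OpFeatures) -> OpFeatures:
    features.resize_fn = False
    features.buffer_impl = False
    return features


assert "llama::custom_sdpa" in vulkan_supported_ops

This also shows why the partitioner change below can treat `node.target in vulkan_supported_ops` as a sufficient registration check for non-tensor nodes.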

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
@@ -146,10 +146,11 @@ def op_node_is_compatible(  # noqa: C901: Function is too complex
     def node_is_compatible(
         self, node: torch.fx.Node, features: Optional[OpFeatures] = None
     ) -> Tuple[bool, str]:
-        if utils.is_symint_node(node):
-            return node.target in vulkan_supported_ops, "Op is compatible"
-        elif utils.is_tensor_node(node):
+        if utils.is_tensor_node(node):
             return self.op_node_is_compatible(node, features=features)
+        # For non-tensor nodes, just check if the op is registered
+        elif hasattr(node, "target"):
+            return node.target in vulkan_supported_ops, "Op is compatible"
 
         return False, f"Unsupported node type: {node.format_node()}"

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 9 additions & 0 deletions
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
   return idx;
 }
 
+ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
+  for (int i = 0; i < values_.size(); ++i) {
+    if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
+      return i;
+    }
+  }
+  return add_scalar(val);
+}
+
 ValueRef ComputeGraph::set_input_tensor(
     const ValueRef idx,
     const bool use_staging) {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 7 additions & 0 deletions
@@ -604,6 +604,13 @@ class ComputeGraph final {
 
   ValueRef add_symint(const int32_t val);
 
+  /*
+   * Searches the graph's value list for an Int value with the specified value.
+   * If one is found, returns the index of the value. Otherwise, add a new value
+   * and return the index of the new value.
+   */
+  ValueRef get_or_add_value_for_int(const int64_t val);
+
   ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
   ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
 
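
get_or_add_value_for_int is a linear-scan get-or-create helper: reuse an existing Int value when one matches, otherwise append a new one, so repeated integer constants share a single ValueRef. The same pattern sketched in Python, with a plain list standing in for the graph's value list (names are illustrative):

from typing import List


class ToyGraph:
    def __init__(self) -> None:
        self.values: List[int] = []

    def add_scalar(self, val: int) -> int:
        self.values.append(val)
        return len(self.values) - 1  # the index plays the role of ValueRef

    def get_or_add_value_for_int(self, val: int) -> int:
        for i, v in enumerate(self.values):
            if v == val:
                return i  # reuse the existing entry
        return self.add_scalar(val)


g = ToyGraph()
assert g.get_or_add_value_for_int(4) == g.get_or_add_value_for_int(4)
assert len(g.values) == 1  # deduplicated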

backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          vkapi::ShaderInfo(),
-          {1u, 1u, 1u},
+          pick_shader_fn(&graph, args, resize_args),
           {1u, 1u, 1u},
+          {8u, 8u, 1u},
           args,
           params,
           push_constants,
@@ -37,7 +37,6 @@ DynamicDispatchNode::DynamicDispatchNode(
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {
-  shader_ = pick_shader_fn(&graph, args, resize_args);
   global_workgroup_size_ =
       pick_global_wg_fn(&graph, shader_, args, resize_args);
   local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
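
The constructor change passes the real shader (via pick_shader_fn) and a non-trivial default local workgroup size to the base DispatchNode constructor, instead of constructing with placeholders and patching shader_ afterwards. A Python analogue of moving that initialization into the base-constructor call (class and function names are illustrative):

class DispatchNode:
    def __init__(self, shader: str, local_wg: tuple) -> None:
        self.shader = shader
        self.local_wg = local_wg


class DynamicDispatchNode(DispatchNode):
    def __init__(self, pick_shader_fn, pick_global_wg_fn) -> None:
        # Hand the base class the real shader up front rather than a
        # placeholder that gets overwritten in the constructor body.
        super().__init__(pick_shader_fn(), (8, 8, 1))
        # Anything computed afterwards can rely on self.shader being final.
        self.global_wg = pick_global_wg_fn(self.shader)


node = DynamicDispatchNode(
    pick_shader_fn=lambda: "nchw_to_image",
    pick_global_wg_fn=lambda shader: (64, 64, 1),
)
assert node.shader == "nchw_to_image" and node.local_wg == (8, 8, 1)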

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl

Lines changed: 7 additions & 1 deletion
@@ -22,7 +22,13 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
-${layout_declare_ubo(B, "ivec4", "sizes")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer:
   parameter_names_with_default_values:
     STORAGE: texture3d
     DTYPE: int8
+    USE_PUSH_CONST: True
   generate_variant_forall:
     STORAGE:
       - VALUE: texture2d
@@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_bitw8_image_nobitw8buffer
+    - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc
+      USE_PUSH_CONST: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl

Lines changed: 11 additions & 3 deletions
@@ -12,9 +12,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "out_strides")}
-${layout_declare_ubo(4, "int", "numel")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 out_sizes;
+    ivec4 out_strides;
+    int numel;
+  };
+$else:
+  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(3, "ivec4", "out_strides")}
+  ${layout_declare_ubo(4, "int", "numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@ nchw_to_buffer:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: buffer
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -17,3 +18,5 @@ nchw_to_buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_buffer
+    - NAME: nchw_to_buffer_no_pc
+      USE_PUSH_CONST: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 11 additions & 3 deletions
@@ -21,9 +21,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
-${layout_declare_ubo(B, "ivec4", "sizes")}
-$if not FROM_STAGING:
-  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+    $if not FROM_STAGING:
+      ivec4 buf_strides;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
+  $if not FROM_STAGING:
+    ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 9 additions & 0 deletions
@@ -9,6 +9,7 @@ nchw_to_image:
     STORAGE: texture3d
     DTYPE: float
     FROM_STAGING: True
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -22,3 +23,11 @@ nchw_to_image:
       STORAGE: texture2d
     - NAME: clone_buffer_to_image
       FROM_STAGING: False
+    - NAME: nchw_to_image_no_pc_texture3d
+      USE_PUSH_CONST: False
+    - NAME: nchw_to_image_no_pc_texture2d
+      STORAGE: texture2d
+      USE_PUSH_CONST: False
+    - NAME: clone_buffer_to_image_no_pc
+      FROM_STAGING: False
+      USE_PUSH_CONST: False
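
All four shader changes share one mechanism: the YAML gains a USE_PUSH_CONST parameter defaulting to True plus *_no_pc variants that set it to False, and the GLSL template branches on the flag to declare its parameters either as one push-constant block or as individual UBOs. A rough Python sketch of how such a flag can drive codegen — illustrative only, not ExecuTorch's actual shader generator:

def declare_params(use_push_const: bool, params: list) -> str:
    # params: list of (glsl_type, name) pairs, e.g. ("ivec4", "out_sizes").
    if use_push_const:
        fields = "\n".join(f"  {ty} {name};" for ty, name in params)
        return "layout(push_constant) uniform restrict Block {\n" + fields + "\n};"
    return "\n".join(
        f"layout(set = 0, binding = {i}) uniform U{i} {{ {ty} {name}; }};"
        for i, (ty, name) in enumerate(params)
    )


params = [("ivec4", "out_sizes"), ("ivec4", "out_strides"), ("int", "numel")]
print(declare_params(True, params))   # default variant: push constants
print(declare_params(False, params))  # *_no_pc variant: UBO declarations

Push constants avoid per-dispatch descriptor and UBO updates for small parameters, which is presumably why they become the default here, while the *_no_pc variants keep a UBO fallback.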
