
Commit 8737301

Merge branch 'main' into dev1/chenweng/roberta
2 parents 1be93d7 + af0a246

File tree: 83 files changed, +1311 / -673 lines (only a subset of the changed files is shown below)


.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ cmake --build cmake-out -j16 --target install --config Release
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \

.ci/scripts/test_model.sh

Lines changed: 19 additions & 23 deletions
@@ -49,14 +49,24 @@ prepare_artifacts_upload() {
 }
 
 build_cmake_executor_runner() {
+  local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
-  cmake -DCMAKE_BUILD_TYPE=Debug \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-    -B${CMAKE_OUTPUT_DIR} .
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  mkdir ${CMAKE_OUTPUT_DIR}
+  if [[ "$backend_string_select" == "XNNPACK" ]]; then
+    echo "Backend $backend_string_select selected"
+    (cd ${CMAKE_OUTPUT_DIR} \
+      && cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  else
+    cmake -DCMAKE_BUILD_TYPE=Debug \
+      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  fi
 }
 
 run_portable_executor_runner() {
@@ -111,19 +121,6 @@ test_model() {
   run_portable_executor_runner
 }
 
-build_cmake_xnn_executor_runner() {
-  echo "Building xnn_executor_runner"
-
-  (rm -rf ${CMAKE_OUTPUT_DIR} \
-    && mkdir ${CMAKE_OUTPUT_DIR} \
-    && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DCMAKE_BUILD_TYPE=Release \
-      -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4
-}
-
 test_model_with_xnnpack() {
   WITH_QUANTIZATION=$1
   WITH_DELEGATION=$2
@@ -148,12 +145,11 @@ test_model_with_xnnpack() {
 
   # Run test model
   if [[ "${BUILD_TOOL}" == "buck2" ]]; then
+    # TODO eventually buck should also use consolidated executor runners
    buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}"
  elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then
-      build_cmake_xnn_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}"
+    build_cmake_executor_runner "XNNPACK"
+    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}"
  else
    echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
    exit 1

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE=Release \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass
 
 #################
 ## linear_qcnw ##
@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             )
 
         graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        dead_code_elimination_pass(graph_module)
 
+        # Re-trace the graph since new nodes were (potentially) inserted
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
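
The new ordering follows the usual torch.fx pass recipe: mutate the graph, recompile, drop dead nodes, then re-trace. A minimal sketch of that recipe, with torch.fx's built-in eliminate_dead_code standing in for ExecuTorch's dead_code_elimination_pass (the toy module below is illustrative):

import torch
import torch.fx


class Toy(torch.nn.Module):
    def forward(self, x):
        dead = x * 2  # traced into the graph, but never used
        return x + 1


gm = torch.fx.symbolic_trace(Toy())

# Drop nodes with no users, analogous to dead_code_elimination_pass.
gm.graph.eliminate_dead_code()

# Recompile so the generated forward() matches the mutated graph.
gm.recompile()
print(gm.code)  # the dead multiply is gone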

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import torch
 import torch.nn.functional as F
 
-from torchao.quantization.GPTQ import _check_linear_int4_k
+from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
 

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from copy import deepcopy
 from typing import Any, Optional, Set
 
 import executorch.backends.vulkan.utils as utils
@@ -22,6 +21,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.tensor import TensorSpec
 
 logger: logging.Logger = logging.getLogger("")
 logger.setLevel(logging.INFO)
@@ -52,7 +52,7 @@ def insert_transition_node(
         (arg,),
     )
     clone_node.meta["val"] = arg.meta["val"]
-    clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+    clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
     clone_node.meta["spec"].const = False
     set_memory_metadata(clone_node, storage, layout)
     arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         # Symbolic integer ops
         torch.ops.aten.sym_size.int,
+        operator.add,
+        operator.lt,
+        operator.gt,
+        operator.ge,
+        operator.le,
+        # Guard and assert ops
+        torch.ops.aten._assert_scalar.default,
+        torch.ops.aten.sym_constrain_range_for_size.default,
     ]
 )
 def register_ephemeral_op(features: OpFeatures):
@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
     return features
 
 
-@update_features(["llama::update_cache", "llama::custom_sdpa"])
+@update_features(
+    [
+        "llama::update_cache",
+        "llama::custom_sdpa",
+    ]
+)
 def register_sdpa_ops(features: OpFeatures):
     features.resize_fn = False
     features.buffer_impl = False
@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
     return features
 
 
-@update_features(exir_ops.edge.aten.view_copy.default)
-def register_view_op(features: OpFeatures):
+@update_features(
+    [
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.permute.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.select_copy.int,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.view_copy.default,
+    ]
+)
+def register_view_ops(features: OpFeatures):
     features.texture_impl = TextureImplFeatures(
         valid_packed_dims=all_packed_dims,
     )
@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
     # Indexing and lookup
     exir_ops.edge.aten.flip.default,
     exir_ops.edge.aten.index_select.default,
-    exir_ops.edge.aten.select_copy.int,
     # Tensor creation
     exir_ops.edge.aten.arange.start_step,
-    exir_ops.edge.aten.clone.default,
     exir_ops.edge.aten.constant_pad_nd.default,
     exir_ops.edge.aten.full.default,
     exir_ops.edge.aten.full_like.default,
@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions
 @update_features(
     [
-        # Indexing and lookup
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Shape Manipulation
         exir_ops.edge.aten.squeeze_copy.dims,
         exir_ops.edge.aten.unsqueeze_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.repeat.default,
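
The @update_features decorator used throughout this registry maps each op (or list of ops) to an OpFeatures record built by the decorated function. A simplified sketch of the pattern, not ExecuTorch's actual machinery (the names mirror the diff, but the classes here are stand-ins):

from typing import Callable, Dict, List, Union


class OpFeatures:
    def __init__(self) -> None:
        self.resize_fn = True
        self.buffer_impl = True


vulkan_supported_ops: Dict[object, "OpFeatures"] = {}


def update_features(ops: Union[object, List[object]]):
    # Register one feature-updating function for a single op or a list of ops.
    def decorator(fn: Callable[[OpFeatures], OpFeatures]):
        for op in ops if isinstance(ops, list) else [ops]:
            vulkan_supported_ops[op] = fn(OpFeatures())
        return fn
    return decorator


@update_features(["llama::update_cache", "llama::custom_sdpa"])
def register_sdpa_ops(features: OpFeatures) -> OpFeatures:
    features.resize_fn = False
    features.buffer_impl = False
    return features


assert "llama::custom_sdpa" in vulkan_supported_ops

This also shows why the partitioner change below can treat `node.target in vulkan_supported_ops` as a sufficient registration check for non-tensor nodes.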

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
@@ -146,10 +146,11 @@ def op_node_is_compatible(  # noqa: C901: Function is too complex
     def node_is_compatible(
         self, node: torch.fx.Node, features: Optional[OpFeatures] = None
     ) -> Tuple[bool, str]:
-        if utils.is_symint_node(node):
-            return node.target in vulkan_supported_ops, "Op is compatible"
-        elif utils.is_tensor_node(node):
+        if utils.is_tensor_node(node):
             return self.op_node_is_compatible(node, features=features)
+        # For non-tensor nodes, just check if the op is registered
+        elif hasattr(node, "target"):
+            return node.target in vulkan_supported_ops, "Op is compatible"
 
         return False, f"Unsupported node type: {node.format_node()}"

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 9 additions & 0 deletions
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
   return idx;
 }
 
+ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
+  for (int i = 0; i < values_.size(); ++i) {
+    if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
+      return i;
+    }
+  }
+  return add_scalar(val);
+}
+
 ValueRef ComputeGraph::set_input_tensor(
     const ValueRef idx,
     const bool use_staging) {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 7 additions & 0 deletions
@@ -604,6 +604,13 @@ class ComputeGraph final {
 
   ValueRef add_symint(const int32_t val);
 
+  /*
+   * Searches the graph's value list for an Int value with the specified value.
+   * If one is found, returns the index of the value. Otherwise, add a new value
+   * and return the index of the new value.
+   */
+  ValueRef get_or_add_value_for_int(const int64_t val);
+
   ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
   ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
 
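
get_or_add_value_for_int is a linear-scan get-or-create helper: reuse an existing Int value when one matches, otherwise append a new one, so repeated integer constants share a single ValueRef. The same pattern sketched in Python, with a plain list standing in for the graph's value list (names are illustrative):

from typing import List


class ToyGraph:
    def __init__(self) -> None:
        self.values: List[int] = []

    def add_scalar(self, val: int) -> int:
        self.values.append(val)
        return len(self.values) - 1  # the index plays the role of ValueRef

    def get_or_add_value_for_int(self, val: int) -> int:
        for i, v in enumerate(self.values):
            if v == val:
                return i  # reuse the existing entry
        return self.add_scalar(val)


g = ToyGraph()
assert g.get_or_add_value_for_int(4) == g.get_or_add_value_for_int(4)
assert len(g.values) == 1  # deduplicated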

backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp

Lines changed: 2 additions & 3 deletions
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          vkapi::ShaderInfo(),
-          {1u, 1u, 1u},
+          pick_shader_fn(&graph, args, resize_args),
           {1u, 1u, 1u},
+          {8u, 8u, 1u},
           args,
           params,
           push_constants,
@@ -37,7 +37,6 @@ DynamicDispatchNode::DynamicDispatchNode(
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {
-  shader_ = pick_shader_fn(&graph, args, resize_args);
   global_workgroup_size_ =
       pick_global_wg_fn(&graph, shader_, args, resize_args);
   local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
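
The constructor change passes the real shader (via pick_shader_fn) and a non-trivial default local workgroup size to the base DispatchNode constructor, instead of constructing with placeholders and patching shader_ afterwards. A Python analogue of moving that initialization into the base-constructor call (class and function names are illustrative):

class DispatchNode:
    def __init__(self, shader: str, local_wg: tuple) -> None:
        self.shader = shader
        self.local_wg = local_wg


class DynamicDispatchNode(DispatchNode):
    def __init__(self, pick_shader_fn, pick_global_wg_fn) -> None:
        # Hand the base class the real shader up front rather than a
        # placeholder that gets overwritten in the constructor body.
        super().__init__(pick_shader_fn(), (8, 8, 1))
        # Anything computed afterwards can rely on self.shader being final.
        self.global_wg = pick_global_wg_fn(self.shader)


node = DynamicDispatchNode(
    pick_shader_fn=lambda: "nchw_to_image",
    pick_global_wg_fn=lambda shader: (64, 64, 1),
)
assert node.shader == "nchw_to_image" and node.local_wg == (8, 8, 1)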

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl

Lines changed: 7 additions & 1 deletion
@@ -22,7 +22,13 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "nchw_in", "int")}
-${layout_declare_ubo(B, "ivec4", "sizes")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@ nchw_to_bitw8_image_nobitw8buffer:
   parameter_names_with_default_values:
     STORAGE: texture3d
     DTYPE: int8
+    USE_PUSH_CONST: True
   generate_variant_forall:
     STORAGE:
       - VALUE: texture2d
@@ -17,3 +18,5 @@ nchw_to_bitw8_image_nobitw8buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_bitw8_image_nobitw8buffer
+    - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc
+      USE_PUSH_CONST: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl

Lines changed: 11 additions & 3 deletions
@@ -12,9 +12,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "out_strides")}
-${layout_declare_ubo(4, "int", "numel")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 out_sizes;
+    ivec4 out_strides;
+    int numel;
+  };
+$else:
+  ${layout_declare_ubo(2, "ivec4", "out_sizes")}
+  ${layout_declare_ubo(3, "ivec4", "out_strides")}
+  ${layout_declare_ubo(4, "int", "numel")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml

Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@ nchw_to_buffer:
   parameter_names_with_default_values:
     DTYPE: float
     STORAGE: buffer
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -17,3 +18,5 @@ nchw_to_buffer:
       - VALUE: uint8
   shader_variants:
     - NAME: nchw_to_buffer
+    - NAME: nchw_to_buffer_no_pc
+      USE_PUSH_CONST: False

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl

Lines changed: 11 additions & 3 deletions
@@ -21,9 +21,17 @@ layout(std430) buffer;
 
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
 ${layout_declare_buffer(B, "r", "buf_in", DTYPE)}
-${layout_declare_ubo(B, "ivec4", "sizes")}
-$if not FROM_STAGING:
-  ${layout_declare_ubo(B, "ivec4", "buf_strides")}
+
+$if USE_PUSH_CONST:
+  layout(push_constant) uniform restrict Block {
+    ivec4 sizes;
+    $if not FROM_STAGING:
+      ivec4 buf_strides;
+  };
+$else:
+  ${layout_declare_ubo(B, "ivec4", "sizes")}
+  $if not FROM_STAGING:
+    ${layout_declare_ubo(B, "ivec4", "buf_strides")}
 
 #include "indexing_utils.h"
 

backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml

Lines changed: 9 additions & 0 deletions
@@ -9,6 +9,7 @@ nchw_to_image:
     STORAGE: texture3d
     DTYPE: float
     FROM_STAGING: True
+    USE_PUSH_CONST: True
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -22,3 +23,11 @@ nchw_to_image:
       STORAGE: texture2d
     - NAME: clone_buffer_to_image
       FROM_STAGING: False
+    - NAME: nchw_to_image_no_pc_texture3d
+      USE_PUSH_CONST: False
+    - NAME: nchw_to_image_no_pc_texture2d
+      STORAGE: texture2d
+      USE_PUSH_CONST: False
+    - NAME: clone_buffer_to_image_no_pc
+      FROM_STAGING: False
+      USE_PUSH_CONST: False
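
All four shader changes share one mechanism: the YAML gains a USE_PUSH_CONST parameter defaulting to True plus *_no_pc variants that set it to False, and the GLSL template branches on the flag to declare its parameters either as one push-constant block or as individual UBOs. A rough Python sketch of how such a flag can drive codegen — illustrative only, not ExecuTorch's actual shader generator:

def declare_params(use_push_const: bool, params: list) -> str:
    # params: list of (glsl_type, name) pairs, e.g. ("ivec4", "out_sizes").
    if use_push_const:
        fields = "\n".join(f"  {ty} {name};" for ty, name in params)
        return "layout(push_constant) uniform restrict Block {\n" + fields + "\n};"
    return "\n".join(
        f"layout(set = 0, binding = {i}) uniform U{i} {{ {ty} {name}; }};"
        for i, (ty, name) in enumerate(params)
    )


params = [("ivec4", "out_sizes"), ("ivec4", "out_strides"), ("int", "numel")]
print(declare_params(True, params))   # default variant: push constants
print(declare_params(False, params))  # *_no_pc variant: UBO declarations

Push constants avoid per-dispatch descriptor and UBO updates for small parameters, which is presumably why they become the default here, while the *_no_pc variants keep a UBO fallback.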
