pytorch · SS-JIA · Sep 8, 2025 · Sep 7, 2025
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -929,7 +929,9 @@ jobs:
         CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" \
         .ci/scripts/setup-linux.sh --build-tool "cmake"
 
+        # Custom operator tests
         PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh add
+        ./cmake-out/backends/vulkan/test/custom_ops/q8csw_linear
 
   nxp-build-test:
     name: nxp-build-test

@@ -308,6 +308,10 @@ class ComputeGraph final {
     return idx == kDummyValueRef ? true : values_.at(idx).isNone();
   }
 
+  // Convenience inverse of val_is_none(): returns true exactly when
+  // val_is_none(idx) returns false.
+  inline bool val_is_not_none(const ValueRef idx) {
+    const bool is_none = val_is_none(idx);
+    return !is_none;
+  }
+
   inline TypeTag get_val_type(const ValueRef idx) {
     return values_.at(idx).type();
   }

@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef COMMON_GLSLH
+#define COMMON_GLSLH
+
+// Power-of-two integer helpers implemented with shifts and masks.
+// NOTE(review): the shift/mask forms presumably assume non-negative integer
+// arguments - confirm at call sites.
+
+// Multiply by 2 / 4 / 8.
+#define mul_2(x) ((x) << 1)
+#define mul_4(x) ((x) << 2)
+#define mul_8(x) ((x) << 3)
+
+// Divide by 2 / 4 / 8 (right shift).
+#define div_2(x) ((x) >> 1)
+#define div_4(x) ((x) >> 2)
+#define div_8(x) ((x) >> 3)
+
+// Divide by 2 / 4 / 8, rounding up.
+#define div_up_2(x) (((x) + 1) >> 1)
+#define div_up_4(x) (((x) + 3) >> 2)
+#define div_up_8(x) (((x) + 7) >> 3)
+
+// Round up to the next multiple of 2 / 4 / 8. The argument is parenthesized
+// like in the other macros so that expression arguments (e.g. `a | b`) are
+// fully evaluated before the addition.
+#define align_up_2(x) (((x) + 1) & -2)
+#define align_up_4(x) (((x) + 3) & -4)
+#define align_up_8(x) (((x) + 7) & -8)
+
+// Remainder modulo 2 / 4 / 8 (bit mask).
+#define mod_2(x) ((x) & 1)
+#define mod_4(x) ((x) & 3)
+#define mod_8(x) ((x) & 7)
+
+struct TensorIndex4D {
+  ivec4 data; // the four integer components of the index (x, y, z, w)
+};
+
+#ifdef DEBUG_MODE
+
+#extension GL_EXT_debug_printf : require
+
+// Debug helper: prints the four components of `index` through debugPrintfEXT.
+void printTensorIndex4D(const TensorIndex4D index) {
+  ivec4 idx = index.data;
+  debugPrintfEXT(
+      "tensor_idx: %d, %d, %d, %d\\n", idx.x, idx.y, idx.z, idx.w);
+}
+
+#endif // DEBUG_MODE
+
+#endif // COMMON_GLSLH
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Defines common functions and structs to be used across matrix multiplication
+ * operators.
+ */
+
+#ifndef LINEAR_COMMON_GLSLH
+#define LINEAR_COMMON_GLSLH
+
+#include "common.glslh"
+
+// Treats bit 7 of `val` as a sign bit: when it is set, every bit above the low
+// 8 bits is set to 1; otherwise `val` is returned unchanged.
+int sign_extend_8bit(const int val) {
+  bool sign_bit_set = (val & 0x80) != 0;
+  return sign_bit_set ? (val | (~0xFF)) : val;
+}
+
+// Returns byte `i` of `packed` as a sign-extended integer. Byte 0 is the least
+// significant byte (little endian).
+int extract_8bit_from_packed_int_le(const int packed, const int i) {
+  int shift = 8 * i;
+  int raw_byte = (packed >> shift) & 0xFF;
+  return sign_extend_8bit(raw_byte);
+}
+
+#endif // LINEAR_COMMON_GLSLH
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef LINEAR_FP_BIAS_LOAD_GLSLH
+#define LINEAR_FP_BIAS_LOAD_GLSLH
+
+#include "linear_fp_per_out_channel_params.glslh"
+
+// Returns the bias texel stored at index `n4` of t_bias.
+VEC4_T load_bias_x4(const int n4) {
+  VEC4_T bias_texel = t_bias[n4];
+  return bias_texel;
+}
+
+// Fills `bias` with TILE_N4 consecutive bias texels starting at `n4_start`.
+void load_bias_tile(out FPPerOutChannelParams bias, const int n4_start) {
+#if TILE_N4 == 1
+  // Single-texel tile: no loop needed.
+  bias.data[0] = load_bias_x4(n4_start);
+#else
+  [[unroll]] for (int i = 0; i < TILE_N4; i++) {
+    int n4 = n4_start + i;
+    bias.data[i] = load_bias_x4(n4);
+  }
+#endif
+}
+
+#endif // LINEAR_FP_BIAS_LOAD_GLSLH
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef LINEAR_FP_INPUT_TILE_GLSLH
+#define LINEAR_FP_INPUT_TILE_GLSLH
+
+/*
+ * Defines the FPInputTile struct, which is used to represent a tile of the
+ * input matrix of a matrix multiplication operation.
+ *
+ * Settings:
+ * - TILE_M: number of rows in the tile
+ * - TILE_K4: number of (groups of 4) columns in the tile
+ */
+
+#extension GL_EXT_control_flow_attributes : require
+
+struct FPInputTile {
+  VEC4_T data[TILE_M][TILE_K4]; // one VEC4_T per (row m, column group k4)
+};
+
+#ifdef DEBUG_MODE
+
+// Debug helper: prints the four components of every texel of `in_tile`
+// through debugPrintfEXT, iterating rows first and column groups second.
+void printFPInputTile(const FPInputTile in_tile) {
+  debugPrintfEXT("input_tile: \\n");
+  [[unroll]] for (int row = 0; row < TILE_M; row++) {
+    [[unroll]] for (int col4 = 0; col4 < TILE_K4; col4++) {
+      VEC4_T texel = in_tile.data[row][col4];
+      debugPrintfEXT(
+          "  %f, %f, %f, %f, \\n", texel.x, texel.y, texel.z, texel.w);
+    }
+  }
+}
+
+#endif // DEBUG_MODE
+
+#endif // LINEAR_FP_INPUT_TILE_GLSLH
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Defines functions to load a FPInputTile from input buffer/texture.
+ *
+ * Requires:
+ * - t_input to be declared in the shader layout (input buffer/texture)
+ *
+ * Settings:
+ * - INPUT_BUFFER to indicate input resource is a buffer, otherwise texture is
+ *   assumed.
+ */
+
+#ifndef LINEAR_FP_INPUT_TILE_LOAD_GLSLH
+#define LINEAR_FP_INPUT_TILE_LOAD_GLSLH
+
+#extension GL_EXT_control_flow_attributes : require
+
+#include "linear_fp_input_tile.glslh"
+
+#ifdef INPUT_BUFFER
+
+// Buffer variant: reads element (m * ntexels_k) + k4 of t_input, i.e. texel
+// column `k4` of row `m` when each row holds `ntexels_k` texels.
+VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) {
+  int texel_idx = (m * ntexels_k) + k4;
+  return t_input[texel_idx];
+}
+
+#else
+
+// Texture variant: fetches the texel at position (k4, m, 0) of t_input.
+// `ntexels_k` is not used by this variant.
+VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) {
+  ivec3 pos = ivec3(k4, m, 0);
+  return texelFetch(t_input, pos, 0);
+}
+
+#endif // INPUT_BUFFER
+
+// Loads a TILE_M x TILE_K4 tile of input texels starting at row `m_start` and
+// texel column `k4_start`, without any bounds checks.
+//
+// To be used only if (M - m_start >= TILE_M) && (K4 - k4_start >= TILE_K4):
+// every row and every texel column of the tile is read unconditionally, so
+// both conditions have to hold.
+//
+// `K4` is forwarded to load_input_x4 as the number of texels per row. `M` is
+// not read here (the parameter list mirrors load_input_tile_with_checks).
+void load_input_tile_no_checks(
+    out FPInputTile in_tile,
+    const int k4_start,
+    const int m_start,
+    const int K4,
+    const int M) {
+#if TILE_K4 == 1
+  // Single column group per row: only data[m][0] exists.
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4);
+  }
+
+#else
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
+      in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4);
+    }
+  }
+#endif
+}
+
+// Loads a TILE_M x TILE_K4 tile of input texels starting at row `m_start` and
+// texel column `k4_start`. To be used near tensor boundaries: positions whose
+// row is >= M or whose texel column is >= K4 are filled with zeros instead of
+// being read.
+//
+// `K4` is also forwarded to load_input_x4 as the number of texels per row.
+void load_input_tile_with_checks(
+    out FPInputTile in_tile,
+    const int k4_start,
+    const int m_start,
+    const int K4,
+    const int M) {
+#if TILE_K4 == 1
+  // Apply the same row and column bounds as the general branch below, so both
+  // branches agree when k4_start lies past the last texel column.
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    if (m_start + m < M && k4_start < K4) {
+      in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4);
+    } else {
+      in_tile.data[m][0] = VEC4_T(0.0);
+    }
+  }
+
+#else
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
+      if (m_start + m < M && k4_start + k4 < K4) {
+        in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4);
+      } else {
+        in_tile.data[m][k4] = VEC4_T(0.0);
+      }
+    }
+  }
+#endif
+}
+
+#endif // LINEAR_FP_INPUT_TILE_LOAD_GLSLH
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * Defines the FPOutTile struct, which is used to represent a tile of the output
+ * matrix of a matrix multiplication operation.
+ *
+ * Settings:
+ * - TILE_M: number of rows in the output tile
+ * - TILE_N4: number of (groups of 4) columns in the output tile
+ */
+
+#ifndef LINEAR_FP_OUTPUT_TILE_GLSLH
+#define LINEAR_FP_OUTPUT_TILE_GLSLH
+
+#extension GL_EXT_control_flow_attributes : require
+
+struct FPOutTile {
+  VEC4_T data[TILE_M][TILE_N4]; // one VEC4_T per (row m, column group n4)
+};
+
+// Sets every texel of `out_tile` to VEC4_T(0).
+void initialize(out FPOutTile out_tile) {
+#if TILE_N4 == 1
+  // One texel per row: only column group 0 exists.
+  [[unroll]] for (int row = 0; row < TILE_M; row++) {
+    out_tile.data[row][0] = VEC4_T(0);
+  }
+#else
+  [[unroll]] for (int row = 0; row < TILE_M; row++) {
+    [[unroll]] for (int col4 = 0; col4 < TILE_N4; col4++) {
+      out_tile.data[row][col4] = VEC4_T(0);
+    }
+  }
+#endif
+}
+
+#ifdef DEBUG_MODE
+
+// Debug helper: prints the four components of every texel of `tile` through
+// debugPrintfEXT, row by row, with a separate terminator print after each row.
+void printFPOutTile(const FPOutTile tile) {
+  debugPrintfEXT("output_tile: \\n");
+  [[unroll]] for (int row = 0; row < TILE_M; row++) {
+    [[unroll]] for (int col4 = 0; col4 < TILE_N4; col4++) {
+      VEC4_T texel = tile.data[row][col4];
+      debugPrintfEXT(
+          "  %f, %f, %f, %f,", texel.x, texel.y, texel.z, texel.w);
+    }
+    debugPrintfEXT("\\n");
+  }
+}
+
+#endif // DEBUG_MODE
+
+#endif // LINEAR_FP_OUTPUT_TILE_GLSLH