[ET-VK][Ops] enabling double support for quantization and dequantization ops

morelos · morelos · commit 01e4a06f23c2 · 2025-06-12T12:49:12.000-07:00
Pull Request resolved: #11553 With the added double support in the layout template, this diff is enabling it as input/output for dequantization. Since there are limitations with how 64bit can be supported, the expectation is that IO be downgraded to 32bit ghstack-source-id: 290041469 @exported-using-ghexport Differential Revision: [D76289197](https://our.internmc.facebook.com/intern/diff/D76289197/)
diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml
@@ -11,6 +11,7 @@ dequantize_buffer:
     OUT_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
   shader_variants:
     - NAME: dequantize_per_tensor_buffer
       MODE: per_tensor
diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl
@@ -67,7 +67,10 @@ $if MODE == "per_tensor":
   [[unroll]] for (int i = 0; i < 4; ++i) {
     IN_T qvalue = IN_T(intex[i]);
     OUT_T value = dequantize_val(qvalue, scale, zero_point);
-    outtex[i] = value;
+    $if OUT_DTYPE == "double":
+      outtex[i] = float(value);
+    $else:
+      outtex[i] = value;
   }
   write_texel(t_out, pos, outtex);
 
@@ -110,7 +113,10 @@ $if MODE == "per_token":
   [[unroll]] for (int i = 0; i < 4; ++i) {
     IN_T qvalue = IN_T(intex[i]);
     OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val);
-    outtex[i] = value;
+    $if OUT_DTYPE == "double":
+      outtex[i] = float(value);
+    $else:
+      outtex[i] = value;
   }
 
   write_texel(t_out, pos, outtex);
diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml
@@ -11,6 +11,7 @@ dequantize_texture:
     OUT_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
   shader_variants:
     - NAME: dequantize_per_tensor_texture3d
       MODE: per_tensor
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml
@@ -7,6 +7,7 @@ quantize_buffer:
     IN_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
     OUT_DTYPE:
       - VALUE: uint8
       - VALUE: int8
diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml
@@ -7,6 +7,7 @@ quantize_texture:
     IN_DTYPE:
       - VALUE: half
       - VALUE: float
+      - VALUE: double
     OUT_DTYPE:
       - VALUE: uint8
       - VALUE: int8
diff --git a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp
@@ -162,6 +162,7 @@ void quantize_per_tensor_impl(
 
   // Verify input is a floating point type
   VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kDouble ||
       graph.dtype_of(input) == vkapi::kFloat ||
       graph.dtype_of(input) == vkapi::kHalf);
 
@@ -185,6 +186,7 @@ void quantize_per_token_impl(
 
   // Verify input is a floating point type
   VK_CHECK_COND(
+      graph.dtype_of(input) == vkapi::kDouble ||
       graph.dtype_of(input) == vkapi::kFloat ||
       graph.dtype_of(input) == vkapi::kHalf);
 
diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp
@@ -364,6 +364,12 @@ void test_vulkan_dequantize_per_tensor(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // Telling the system to expect a float instead of a double
+  // since the shader can only return 32bit anyways
+  if (out_dtype == at::kDouble) {
+    out_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_dequantize_per_tensor_impl(
       input_sizes,
@@ -398,6 +404,12 @@ void test_vulkan_dequantize_per_token(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // Telling the system to expect a float instead of a double
+  // since the shader can only return 32bit anyways
+  if (out_dtype == at::kDouble) {
+    out_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_dequantize_per_token_impl(
       input_sizes,
@@ -767,6 +779,19 @@ TEST(
       at::kHalf); // output dtype
 }
 
+TEST(
+    VulkanDequantizePerTensorTest,
+    test_vulkan_dequantize_per_tensor_int32_to_double) {
+  test_vulkan_dequantize_per_tensor(
+      {2, 4, 3}, // input sizes
+      0.0001, // scale
+      100, // zero_point
+      -2147483648, // quant_min
+      2147483647, // quant_max
+      at::kInt, // input dtype
+      at::kDouble); // output dtype
+}
+
 void test_reference_dequantize_per_token(
     const std::vector<int>& input_sizes,
     const std::vector<float>& scales,
@@ -1232,3 +1257,19 @@ TEST(
       at::kInt, // input dtype
       at::kHalf); // output dtype
 }
+
+TEST(
+    VulkanDequantizePerTokenTest,
+    test_vulkan_dequantize_per_token_int32_to_double) {
+  std::vector<float> scales = {0.0001, 0.0002, 0.0003, 0.0};
+  std::vector<int> zero_points = {100, -100, 50, -50};
+
+  test_vulkan_dequantize_per_token(
+      {2, 2, 8}, // input sizes (2*2=4 tokens)
+      scales,
+      zero_points,
+      -2147483648, // quant_min
+      2147483647, // quant_max
+      at::kInt, // input dtype
+      at::kDouble); // output dtype
+}
diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp
@@ -314,6 +314,12 @@ void test_vulkan_quantize_per_tensor(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // If the in_dtype is a double, convert to float for texture implementation
+  // since they don't support 64bit as inputs
+  if (in_dtype == at::kDouble) {
+    in_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_quantize_per_tensor_impl(
       input_sizes,
@@ -348,6 +354,12 @@ void test_vulkan_quantize_per_token(
       vkcompute::utils::kBuffer,
       vkcompute::utils::kBuffer);
 
+  // If the in_dtype is a double, convert to float for texture implementation
+  // since they don't support 64bit as inputs
+  if (in_dtype == at::kDouble) {
+    in_dtype = at::kFloat;
+  }
+
   // Test with texture storage
   test_vulkan_quantize_per_token_impl(
       input_sizes,
@@ -639,6 +651,19 @@ TEST(
       at::kChar); // output dtype
 }
 
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_tensor_double_to_int8) {
+  test_vulkan_quantize_per_tensor(
+      {2, 3}, // input sizes
+      0.01, // scale
+      1, // zero_point
+      -128, // quant_min
+      127, // quant_max
+      at::kDouble, // input dtype
+      at::kChar); // output dtype
+}
+
 void test_reference_quantize_per_token(
     const std::vector<int>& input_sizes,
     const std::vector<float>& pre_scales,
@@ -1033,3 +1058,19 @@ TEST(VulkanQuantizePerTensorTest, test_vulkan_quantize_per_token_half_to_int8) {
       at::kHalf, // input dtype
       at::kChar); // output dtype
 }
+
+TEST(
+    VulkanQuantizePerTensorTest,
+    test_vulkan_quantize_per_token_double_to_int8) {
+  std::vector<float> scales = {0.1, 0.2};
+  std::vector<int> zero_points = {0, 5};
+
+  test_vulkan_quantize_per_token(
+      {2, 2}, // input sizes (2*2=4 tokens)
+      scales,
+      zero_points,
+      -128, // quant_min
+      127, // quant_max
+      at::kDouble, // input dtype
+      at::kChar); // output dtype
+}