Skip to content

[ET-VK][Ops] quantize_per_tensor.default test setup #11368

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: gh/ahmtox/10/base
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
252 changes: 252 additions & 0 deletions backends/vulkan/test/op_tests/quantize_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,56 @@ void check_quantize_args(
" actual quant_max: ",
quant_max);
}

//
// Reference Implementation
//

/*
 * Reference implementation of quantize_per_tensor
 *
 * Quantizes every element of `input` as
 *   q = clamp(zero_point + round(value / scale), quant_min, quant_max)
 * and returns a tensor of the requested integer dtype with the same sizes
 * as `input`. Unsupported dtypes leave the output element unwritten.
 */
at::Tensor quantize_per_tensor_reference_impl(
    const at::Tensor& input,
    double scale,
    int64_t zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype) {
  // Allocate a freshly contiguous output so the flattened view below is
  // guaranteed to alias the same storage. (flatten() on a non-contiguous
  // tensor returns a copy, which would silently discard the writes.)
  at::Tensor out =
      at::empty(input.sizes(), at::device(input.device()).dtype(dtype));

  // Keep the reciprocal in double precision; narrowing to float here would
  // let the reference drift from implementations that use the double scale.
  const double inv_scale = 1.0 / scale;

  // Work on a flat float copy of the input for simple linear iteration.
  at::Tensor float_values = input.to(at::kFloat).flatten();

  // view({-1}) (unlike flatten()) never copies — it errors instead — so
  // writes through out_flat are guaranteed to land in `out`.
  auto out_flat = out.view({-1});

  for (int64_t i = 0; i < float_values.numel(); i++) {
    const float value = float_values[i].item<float>();
    // nearbyint rounds half-to-even under the default FP rounding mode.
    int64_t qvalue =
        zero_point + static_cast<int64_t>(std::nearbyint(inv_scale * value));

    qvalue = std::max<int64_t>(qvalue, quant_min);
    qvalue = std::min<int64_t>(qvalue, quant_max);

    if (dtype == at::kByte) {
      out_flat[i] = static_cast<uint8_t>(qvalue);
    } else if (dtype == at::kChar) {
      out_flat[i] = static_cast<int8_t>(qvalue);
    } else if (dtype == at::kShort) {
      out_flat[i] = static_cast<int16_t>(qvalue);
    } else if (dtype == at::kInt) {
      out_flat[i] = static_cast<int32_t>(qvalue);
    } else if (dtype == at::kLong) {
      out_flat[i] = static_cast<int64_t>(qvalue);
    }
  }

  // `out` was created with input.sizes(), so no reshape is needed.
  return out;
}

/*
* Reference implementation of quantize_per_token
*/
Expand Down Expand Up @@ -337,6 +387,17 @@ at::Tensor quantize_per_token_reference_impl(
return out;
}

// Forward declaration of implementation functions
// (defined further below; declared here so the storage-type wrapper
// can dispatch to it for both buffer and texture storage).
void test_vulkan_quantize_per_tensor_impl(
    const std::vector<int>& input_sizes,
    float scale,
    int zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage,
    const vkcompute::utils::StorageType out_storage);

void test_vulkan_quantize_per_token_impl(
const std::vector<int>& input_sizes,
const std::vector<float>& scales,
Expand All @@ -347,6 +408,37 @@ void test_vulkan_quantize_per_token_impl(
const vkcompute::utils::StorageType in_storage,
const vkcompute::utils::StorageType out_storage);

// Wrapper function to test both buffer and texture storage types
void test_vulkan_quantize_per_tensor(
const std::vector<int>& input_sizes,
float scale,
int zero_point,
int64_t quant_min,
int64_t quant_max,
at::ScalarType dtype) {
// Test with buffer storage
test_vulkan_quantize_per_tensor_impl(
input_sizes,
scale,
zero_point,
quant_min,
quant_max,
dtype,
vkcompute::utils::kBuffer,
vkcompute::utils::kBuffer);

// Test with texture storage
test_vulkan_quantize_per_tensor_impl(
input_sizes,
scale,
zero_point,
quant_min,
quant_max,
dtype,
vkcompute::utils::kTexture3D,
vkcompute::utils::kTexture3D);
}

// Wrapper function to test both buffer and texture storage types
void test_vulkan_quantize_per_token(
const std::vector<int>& input_sizes,
Expand Down Expand Up @@ -378,6 +470,166 @@ void test_vulkan_quantize_per_token(
vkcompute::utils::kTexture3D);
}

void test_reference_quantize_per_tensor(
const std::vector<int>& input_sizes,
float scale,
int zero_point,
int64_t quant_min,
int64_t quant_max,
at::ScalarType dtype) {
check_quantize_args(quant_min, quant_max, dtype);
std::vector<int64_t> input_sizes_int64(
input_sizes.begin(), input_sizes.end());
at::Tensor input =
at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));

// Fill with a simple pattern: values from 0 to 1 in steps
float step = 1.0f / (input.numel() - 1);
auto flat_input = input.flatten();
for (int i = 0; i < flat_input.numel(); i++) {
flat_input[i] = i * step;
}

// Reshape back to original dimensions
input = flat_input.reshape(input_sizes_int64);

// Get reference output
at::Tensor reference_out = quantize_per_tensor_reference_impl(
input, scale, zero_point, quant_min, quant_max, dtype);

// Get implementation output
at::Tensor impl_out = torch::executor::native::quantize_per_tensor_aten(
input, scale, zero_point, quant_min, quant_max, dtype);

// Convert to int for consistent display regardless of underlying type
at::Tensor reference_int = reference_out.to(at::kInt);
at::Tensor impl_int = impl_out.to(at::kInt);

const bool output_correct = at::equal(reference_int, impl_int);
if (!output_correct) {
at::Tensor diffs = at::abs(reference_int - impl_int);

std::cout << "\n"
<< "Failed with parameters: " << std::endl;
std::cout << " scale: " << scale << std::endl;
std::cout << " zero_point: " << zero_point << std::endl;
std::cout << " quant_min: " << quant_min << std::endl;
std::cout << " quant_max: " << quant_max << std::endl;

std::cout << "input:" << std::endl;
std::cout << input << std::endl;
std::cout << "reference:" << std::endl;
std::cout << reference_int << std::endl;
std::cout << "my_reference:" << std::endl;
std::cout << impl_int << std::endl;
}

ASSERT_TRUE(output_correct);
}

// Builds and executes the Vulkan quantize_per_tensor compute graph for one
// (input storage, output storage) configuration and compares the result
// element-for-element against the ExecuTorch ATen reference kernel.
// Defaults (specified here on the definition, complementing the earlier
// declaration) run both sides with 3D-texture storage; the wrapper above
// also covers buffer storage.
void test_vulkan_quantize_per_tensor_impl(
    const std::vector<int>& input_sizes,
    float scale,
    int zero_point,
    int64_t quant_min,
    int64_t quant_max,
    at::ScalarType dtype,
    const vkcompute::utils::StorageType in_storage =
        vkcompute::utils::kTexture3D,
    const vkcompute::utils::StorageType out_storage =
        vkcompute::utils::kTexture3D) {
  check_quantize_args(quant_min, quant_max, dtype);
  std::vector<int64_t> input_sizes_int64(
      input_sizes.begin(), input_sizes.end());
  // Random [0, 1) float input on CPU.
  at::Tensor input =
      at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat));

  // Get reference output
  at::Tensor reference_out = torch::executor::native::quantize_per_tensor_aten(
      input, scale, zero_point, quant_min, quant_max, dtype);

  // Build Vulkan quantize_per_tensor graph
  using namespace vkcompute;

  GraphConfig config;
  config.set_storage_type_override(in_storage);
  ComputeGraph graph(config);

  IOValueRef r_input = graph.add_input_tensor(
      input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage);

  // Scalar arguments become graph values; scale is widened to double to
  // match the op schema.
  const ValueRef r_scale = graph.add_scalar<double>(scale);
  const ValueRef r_zero_point = graph.add_scalar<int64_t>(zero_point);
  const ValueRef r_quant_min = graph.add_scalar<int64_t>(quant_min);
  const ValueRef r_quant_max = graph.add_scalar<int64_t>(quant_max);

  // Output tensor carries the target quantized dtype and storage type.
  const ValueRef r_out = graph.add_tensor(
      input.sizes().vec(), from_at_scalartype(dtype), out_storage);

  // Look up the op's resolve function by registry name and record it into
  // the graph; argument order must match the op schema.
  VK_GET_OP_FN("quantize_per_tensor.default")
  (graph,
   {
       r_input.value,
       r_scale,
       r_zero_point,
       r_quant_min,
       r_quant_max,
       r_out,
   });

  ValueRef staging_out = graph.set_output_tensor(r_out);

  // Compile and record the graph. The sequence is order-sensitive:
  // prepare resources, record + run prepacking, then record execution.
  graph.prepare();
  graph.encode_prepack();
  graph.prepack();
  graph.encode_execute();

  // Run Vulkan quantize_per_tensor
  graph.copy_into_staging(
      r_input.staging, input.const_data_ptr(), input.numel());

  graph.execute();

  // Read the result back through the staging buffer into a host tensor
  // shaped/typed like the reference output.
  at::Tensor vk_out = at::empty_like(reference_out).contiguous();
  graph.copy_from_staging(
      staging_out, vk_out.mutable_data_ptr(), vk_out.numel());

  // Compare outputs
  // For quantized types, we need to compare the actual integer values
  at::Tensor reference_int = reference_out.to(at::kInt);
  at::Tensor vk_int = vk_out.to(at::kInt);

  const bool output_correct = at::equal(reference_int, vk_int);
  if (!output_correct) {
    at::Tensor diffs = at::abs(reference_int - vk_int);

    std::cout << "\n"
              << "Failed with parameters: " << std::endl;
    std::cout << " scale: " << scale << std::endl;
    std::cout << " zero_point: " << zero_point << std::endl;
    std::cout << " quant_min: " << quant_min << std::endl;
    std::cout << " quant_max: " << quant_max << std::endl;

    std::cout << "input:" << std::endl;
    std::cout << input << std::endl;
    std::cout << "reference:" << std::endl;
    std::cout << reference_int << std::endl;
    std::cout << "vulkan:" << std::endl;
    std::cout << vk_int << std::endl;
  }

  ASSERT_TRUE(output_correct);
}

// Sanity-check the local reference implementation against the ATen kernel
// for int8 quantization over the full [-128, 127] range.
TEST(VulkanQuantizePerTensorTest, test_reference_quantize_per_tensor_int8) {
  test_reference_quantize_per_tensor(
      {2, 3, 4}, // input sizes
      0.1, // scale
      0, // zero_point
      -128, // quant_min
      127, // quant_max
      at::kChar);
}
void test_reference_quantize_per_token(
const std::vector<int>& input_sizes,
const std::vector<float>& scales,
Expand Down
Loading