diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
index ef9e1b0cad490..43507809eff41 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc
@@ -410,6 +410,70 @@ void SplitSelector::UpdateBuilder(NodesToOptimizeIndicesBuilder& builder) const
   builder.num_output_defs = 1;  // set to 1 as the first output is variadic
 }
 
+// Validates that the bias DQ's scale matches input_scale * weight_scale[i] for each output channel.
+// ONNX QLinearConv requires the bias to be int32 with scale = x_scale * w_scale[i].
+// If this condition is violated, the fused output would be silently incorrect.
+// Returns false (conservative) if any scale initializer is not a constant or the types are non-conformant.
+static bool CheckConvBiasScale(const GraphViewer& graph_viewer,
+                               const Node& input_dq, const Node& weight_dq, const Node& bias_dq) {
+  const auto* x_scale_arg = input_dq.InputDefs()[QDQ::InputIndex::SCALE_ID];
+  const auto* w_scale_arg = weight_dq.InputDefs()[QDQ::InputIndex::SCALE_ID];
+  const auto* b_scale_arg = bias_dq.InputDefs()[QDQ::InputIndex::SCALE_ID];
+
+  const auto* x_scale_proto = graph_viewer.GetConstantInitializer(x_scale_arg->Name(), true);
+  const auto* w_scale_proto = graph_viewer.GetConstantInitializer(w_scale_arg->Name(), true);
+  const auto* b_scale_proto = graph_viewer.GetConstantInitializer(b_scale_arg->Name(), true);
+
+  if (!x_scale_proto || !w_scale_proto || !b_scale_proto) {
+    return false;  // conservative: cannot verify
+  }
+
+  // Input scale must be a scalar (rank 0 or one-element rank 1).
+  if (x_scale_proto->dims_size() != 0 &&
+      !(x_scale_proto->dims_size() == 1 && x_scale_proto->dims(0) == 1)) {
+    return false;
+  }
+
+  const Initializer x_scale_init{graph_viewer.GetGraph(), *x_scale_proto, graph_viewer.ModelPath()};
+  const Initializer w_scale_init{graph_viewer.GetGraph(), *w_scale_proto, graph_viewer.ModelPath()};
+  const Initializer b_scale_init{graph_viewer.GetGraph(), *b_scale_proto, graph_viewer.ModelPath()};
+
+  // All scales must be float32 for standard QLinearConv.
+  if (x_scale_init.data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT ||
+      w_scale_init.data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT ||
+      b_scale_init.data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+    return false;
+  }
+
+  const auto x_scales = x_scale_init.DataAsSpan<float>();
+  const auto w_scales = w_scale_init.DataAsSpan<float>();
+  const auto b_scales = b_scale_init.DataAsSpan<float>();
+
+  const float x_scale = x_scales[0];
+  const size_t num_channels = w_scales.size();  // 1 for per-tensor, C_out for per-channel
+  const size_t b_num = b_scales.size();
+
+  // b_scale must be a scalar or match num_channels.
+  if (b_num != 1 && b_num != num_channels) {
+    return false;
+  }
+
+  // Tolerance values matching the convention in optimizer/utils.cc.
+  constexpr float atol = 1e-6f;
+  constexpr float rtol = 1e-2f;
+
+  for (size_t i = 0; i < num_channels; ++i) {
+    const float w_scale = w_scales[i];
+    const float b_scale = (b_num == 1) ? b_scales[0] : b_scales[i];
+    const float expected = x_scale * w_scale;
+    if (std::abs(b_scale - expected) > (atol + rtol * std::abs(expected))) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 bool ConvNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node,
                                   const Node* redundant_clip_node,
                                   const std::vector<const Node*>& dq_nodes, const std::vector<const Node*>& q_nodes) const {
@@ -440,6 +504,12 @@ bool ConvNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& n
     if (dt_bias != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) {
       return false;
     }
+
+    // Verify bias scale == input_scale * weight_scale[i] per the ONNX QLinearConv spec.
+    // If the scales don't match within tolerance, skip fusion to avoid silent numerical errors.
+    if (!CheckConvBiasScale(graph_viewer, *dq_nodes[0], *dq_nodes[1], *dq_nodes[2])) {
+      return false;
+    }
   }
 
   // 16-bit int types must be explicitly allowed.
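The arithmetic behind CheckConvBiasScale, as a minimal standalone sketch (plain
Python; the concrete numbers mirror the 2x mismatch exercised by the test below
and are otherwise illustrative): QLinearConv adds the int32 bias directly into
the int32 accumulator, whose implicit unit is x_scale * w_scale, so a bias
quantized with any other scale is misread by exactly the ratio
b_scale / (x_scale * w_scale).

    x_scale, w_scale = 0.05, 0.02
    bias_f = 1.0  # real-valued bias the model author intended

    correct_b_scale = x_scale * w_scale    # 0.001, per the ONNX QLinearConv spec
    wrong_b_scale = 2.0 * correct_b_scale  # non-conformant: 2x too large

    bias_q = round(bias_f / wrong_b_scale)  # 500: bias quantized with the wrong scale
    decoded = bias_q * correct_b_scale      # 0.5: what a fused kernel would reconstruct
    assert abs(decoded - bias_f) > 0.4      # bias silently halved; fusion must be skipped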
diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py
index f54b40bb98b56..e1d1903e0e656 100644
--- a/onnxruntime/test/python/quantization/test_qdq.py
+++ b/onnxruntime/test/python/quantization/test_qdq.py
@@ -23,6 +23,7 @@
     get_tensor_consumers_and_producers,
 )
 
+import onnxruntime as ort
 from onnxruntime.quantization import QDQQuantizer, QuantFormat, QuantType, quantize_static, write_calibration_table
 from onnxruntime.quantization.calibrate import CalibrationMethod, TensorData, TensorsData
 from onnxruntime.quantization.quant_utils import quantize_nparray
@@ -2224,5 +2225,182 @@ def test_quantize_with_prequantized_input(self):
         self.assertEqual(zp_val, zp_val_original)
 
 
+class TestConvBiasScaleValidation(unittest.TestCase):
+    """Tests that QDQ Conv -> QLinearConv fusion is skipped when the bias scale does not match
+    input_scale * weight_scale, as required by the ONNX QLinearConv specification."""
+
+    @classmethod
+    def setUpClass(cls):
+        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.bias_scale_validation_")
+        # Note: swap with the commented line if you want to see the models in the local test dir.
+        cls._tmp_dir_path = cls._tmp_model_dir.name
+        # cls._tmp_dir_path = "."
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._tmp_model_dir.cleanup()
+
+    def build_qdq_conv_model(
+        self,
+        inp_shape: list[int],
+        weight_shape: list[int],
+        x_scale: float,
+        w_scale: float,
+        b_scale: float,
+        x_zp: int = 128,
+        w_zp: int = 0,
+    ) -> onnx.ModelProto:
+        """Builds a QDQ Conv model (DQ->Conv->Q) with the given quantization scales.
+
+        The bias is encoded as int32 with the provided b_scale. When b_scale != x_scale * w_scale,
+        the model is non-conformant and fusion must be skipped.
+ """ + num_out_channels = weight_shape[0] + np_x_scale = np.float32(x_scale) + np_w_scale = np.float32(w_scale) + np_b_scale = np.float32(b_scale) + + # --- Quantized input (graph input: uint8) --- + input_q = onnx.helper.make_tensor_value_info("input_q", onnx.TensorProto.UINT8, inp_shape) + + # --- Weight: random int8 constant --- + rng = np.random.default_rng(42) + weight_quant_data = rng.integers(-5, 6, size=weight_shape, dtype=np.int8) + weight_quant = onnx.numpy_helper.from_array(weight_quant_data, "weight_q") + + # --- Scales / zero-points as scalar constants --- + x_scale_tensor = onnx.numpy_helper.from_array(np.array(np_x_scale, dtype=np.float32), "x_scale") + x_zp_tensor = onnx.numpy_helper.from_array(np.array(x_zp, dtype=np.uint8), "x_zp") + w_scale_tensor = onnx.numpy_helper.from_array(np.array(np_w_scale, dtype=np.float32), "w_scale") + w_zp_tensor = onnx.numpy_helper.from_array(np.array(w_zp, dtype=np.int8), "w_zp") + b_scale_tensor = onnx.numpy_helper.from_array(np.array(np_b_scale, dtype=np.float32), "b_scale") + b_zp_tensor = onnx.numpy_helper.from_array(np.array(0, dtype=np.int32), "b_zp") + + # Bias as int32 (quantized with b_scale) + bias_float = np.ones(num_out_channels, dtype=np.float32) + bias_q_data = np.round(bias_float / np_b_scale).astype(np.int32) + bias_q = onnx.numpy_helper.from_array(bias_q_data, "bias_q") + + # Output scale/zp for Q node + out_scale_val = np.float32(x_scale) + out_zp_val = np.uint8(128) + out_scale_tensor = onnx.numpy_helper.from_array(np.array(out_scale_val, dtype=np.float32), "out_scale") + out_zp_tensor = onnx.numpy_helper.from_array(np.array(out_zp_val, dtype=np.uint8), "out_zp") + + # Nodes: DQ(input) -> | -> Conv -> Q(output) + # DQ(weight) -/ + # DQ(bias) -/ + dq_input = onnx.helper.make_node( + "DequantizeLinear", ["input_q", "x_scale", "x_zp"], ["input_f"], name="DQ_input" + ) + dq_weight = onnx.helper.make_node( + "DequantizeLinear", ["weight_q", "w_scale", "w_zp"], ["weight_f"], name="DQ_weight" + ) + dq_bias = onnx.helper.make_node("DequantizeLinear", ["bias_q", "b_scale", "b_zp"], ["bias_f"], name="DQ_bias") + conv_node = onnx.helper.make_node("Conv", ["input_f", "weight_f", "bias_f"], ["conv_out"], name="Conv0") + q_output = onnx.helper.make_node( + "QuantizeLinear", ["conv_out", "out_scale", "out_zp"], ["output_q"], name="Q_output" + ) + + output_q = onnx.helper.make_tensor_value_info("output_q", onnx.TensorProto.UINT8, None) + + graph = onnx.helper.make_graph( + [dq_input, dq_weight, dq_bias, conv_node, q_output], + "QDQConvBiasScaleTest", + [input_q], + [output_q], + initializer=[ + weight_quant, + x_scale_tensor, + x_zp_tensor, + w_scale_tensor, + w_zp_tensor, + b_scale_tensor, + b_zp_tensor, + bias_q, + out_scale_tensor, + out_zp_tensor, + ], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + return model + + def _run_model(self, model_path: str, inputs: dict, optimize: bool) -> list: + """Run model with the given optimization level and return outputs.""" + sess_opts = ort.SessionOptions() + if optimize: + sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + else: + sess_opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL + sess = ort.InferenceSession(model_path, sess_options=sess_opts, providers=["CPUExecutionProvider"]) + return sess.run(None, inputs) + + def test_mismatched_bias_scale_skips_fusion(self): + """When bias_scale != input_scale * 
+
+        The outputs of the optimized and unoptimized sessions should match, proving
+        that no silent numerical corruption occurred.
+        """
+        inp_shape = [1, 1, 4, 4]
+        weight_shape = [1, 1, 1, 1]
+
+        x_scale = 0.05
+        w_scale = 0.02
+        # Intentionally wrong bias scale: 2x the correct value
+        correct_b_scale = x_scale * w_scale
+        wrong_b_scale = 2.0 * correct_b_scale
+
+        model = self.build_qdq_conv_model(inp_shape, weight_shape, x_scale, w_scale, wrong_b_scale)
+        model_path = os.path.join(self._tmp_dir_path, "conv_mismatched_bias_scale.qdq.onnx")
+        onnx.save_model(model, model_path)
+
+        # Build a representative input
+        rng = np.random.default_rng(0)
+        input_data = rng.integers(100, 200, size=inp_shape, dtype=np.uint8)
+        inputs = {"input_q": input_data}
+
+        out_optimized = self._run_model(model_path, inputs, optimize=True)
+        out_unoptimized = self._run_model(model_path, inputs, optimize=False)
+
+        # Both should produce the same uint8 output (fusion was skipped or the fallback path is correct).
+        np.testing.assert_array_equal(
+            out_optimized[0],
+            out_unoptimized[0],
+            err_msg="Mismatched bias scale: optimized and unoptimized outputs differ, "
+            "indicating silent numerical corruption from incorrect fusion.",
+        )
+
+    def test_matching_bias_scale_allows_fusion(self):
+        """When bias_scale == input_scale * weight_scale, fusion should be allowed.
+
+        Both optimized and unoptimized sessions must produce identical outputs.
+        """
+        inp_shape = [1, 1, 4, 4]
+        weight_shape = [1, 1, 1, 1]
+
+        x_scale = 0.05
+        w_scale = 0.02
+        correct_b_scale = x_scale * w_scale  # exactly matching
+
+        model = self.build_qdq_conv_model(inp_shape, weight_shape, x_scale, w_scale, correct_b_scale)
+        model_path = os.path.join(self._tmp_dir_path, "conv_matching_bias_scale.qdq.onnx")
+        onnx.save_model(model, model_path)
+
+        rng = np.random.default_rng(0)
+        input_data = rng.integers(100, 200, size=inp_shape, dtype=np.uint8)
+        inputs = {"input_q": input_data}
+
+        out_optimized = self._run_model(model_path, inputs, optimize=True)
+        out_unoptimized = self._run_model(model_path, inputs, optimize=False)
+
+        np.testing.assert_array_equal(
+            out_optimized[0],
+            out_unoptimized[0],
+            err_msg="Matching bias scale: optimized and unoptimized outputs differ.",
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
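The tests above assert only that the optimized and unoptimized outputs agree. To
confirm directly that the selector declined the fusion, one can additionally dump
the optimized graph and look for a QLinearConv node. A sketch, assuming the
mismatched-scale model produced by the test exists on disk under the file name
used above:

    import onnx
    import onnxruntime as ort

    so = ort.SessionOptions()
    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    so.optimized_model_filepath = "optimized.onnx"  # ORT serializes the post-optimization graph here
    ort.InferenceSession("conv_mismatched_bias_scale.qdq.onnx", sess_options=so,
                         providers=["CPUExecutionProvider"])

    ops = [node.op_type for node in onnx.load("optimized.onnx").graph.node]
    # With the new bias-scale check, the DQ/Conv/Q pattern should survive intact,
    # so no QLinearConv node is expected for the mismatched-scale model.
    print("QLinearConv" in ops)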