diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index e3303dac6c8c5..58eac46acb5ab 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -104,6 +104,7 @@ def __init__( # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()` self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None) self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False) + self.is_activation_restricted_asymmetric = self.extra_options.get("ActivationRestrictedAsymmetric", False) self.min_real_range = self.extra_options.get("MinimumRealRange") self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType) diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index 148e4c06a8051..2c3a428501820 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -30,6 +30,7 @@ ms_domain, quantize_onnx_initializer, save_and_reload_model_with_shape_infer, + snap_zero_point_to_uint8, tensor_proto_to_array, ) from .registry import CreateOpQuantizer @@ -1157,6 +1158,8 @@ def calculate_quantization_params(self): reduce_range = quant_overrides.get("reduce_range", False) qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric) zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range) + if self.is_activation_restricted_asymmetric and quant_type == onnx.TensorProto.UINT8 and not symmetric: + zero, scale = snap_zero_point_to_uint8(rmin, rmax) quantization_params[tensor_name] = QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py 
def snap_zero_point_to_uint8(rmin, rmax, min_real_range=None):
    """Compute uint8 quantization parameters with the zero-point restricted to 0 or 128.

    Used by the ActivationRestrictedAsymmetric quantization option. The zero-point is
    0 when rmin >= 0 and 128 when rmin < 0; the scale is then chosen so that the
    dequantized range covers [min(rmin, 0), max(rmax, 0)] without clipping.

    A constant tensor (rmin == rmax != 0) is a valid range once 0 is included, so it is
    quantized normally instead of being treated as degenerate.

    :parameter rmin: calibrated minimum activation value (scalar or single-element array)
    :parameter rmax: calibrated maximum activation value (scalar or single-element array)
    :parameter min_real_range: optional minimum width of the real range; when given,
        rmax is raised to at least rmin + min_real_range before the scale is computed.
    :return: (zero_point, scale) with zero_point dtype uint8 and scale dtype float32.
        Falls back to (0, 1.0) when rmax < rmin or when the resulting scale is zero,
        subnormal or NaN in float32.
    """
    neutral_zero_point = numpy.array(0, dtype=numpy.uint8)
    neutral_scale = numpy.array(1.0, dtype=numpy.float32)

    rmin = float(numpy.squeeze(rmin))
    rmax = float(numpy.squeeze(rmax))
    if rmax < rmin:
        # Inverted range: calibration data is unusable, return neutral values.
        return neutral_zero_point, neutral_scale

    # Real 0.0 must be representable, so extend the range to include it.
    low = min(rmin, 0.0)
    high = max(rmax, 0.0)
    if min_real_range is not None:
        high = max(high, low + float(min_real_range))

    if rmin >= 0.0:
        # q=0 maps to 0.0 and q=255 maps to `high`.
        zero_point = 0
        scale = high / 255.0
    else:
        # 128 steps are available below the zero-point and 127 above it; take the
        # larger step size so that neither end of the range clips.
        zero_point = 128
        scale = max(-low / 128.0, high / 127.0)

    scale = numpy.array(scale, dtype=numpy.float32)
    if not scale >= numpy.finfo(numpy.float32).tiny:
        # An all-zero range (or one that underflows float32) would give a scale of 0,
        # which cannot be used as a divisor during quantization.
        return neutral_zero_point, neutral_scale
    return numpy.array(zero_point, dtype=numpy.uint8), scale
Dyanmic mode currently is supported. Will support more in future. @@ -419,6 +421,8 @@ def __init__( extra_options: key value pair dictionary for various options in different case. Current used: extra.Sigmoid.nnapi = True/False (Default is False) ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False). + ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0 + (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False). WeightSymmetric = True/False: symmetrize calibration data for weights (default is True). EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will @@ -544,6 +548,8 @@ def quantize_static( key value pair dictionary for various options in different case. Current used: extra.Sigmoid.nnapi = True/False (Default is False) ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False). + ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0 + (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False). WeightSymmetric = True/False: symmetrize calibration data for weights (default is True). EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized. Dyanmic mode currently is supported. Will support more in the future. @@ -834,6 +840,8 @@ def quantize_dynamic( key value pair dictionary for various options in different case. Current used: extra.Sigmoid.nnapi = True/False (Default is False) ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False). + ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0 + (rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False). WeightSymmetric = True/False: symmetrize calibration data for weights (default is True). EnableSubgraph = True/False : Default is False. 
class TestRestrictedAsymmetricFlag(unittest.TestCase):
    """Tests for the ActivationRestrictedAsymmetric extra option (uint8 zero-point snapping)."""

    def setUp(self):
        # Calibration samples whose minimum is 0: the zero-point is expected to be 0.
        self.positive_activations = [
            np.zeros([1, 2, 32, 32], dtype="float32"),
            np.ones([1, 2, 32, 32], dtype="float32") * 2.0,
        ]
        # Calibration samples spanning [-1, 2]: the zero-point is expected to be 128.
        self.signed_activations = [
            -1.0 * np.ones([1, 2, 32, 32], dtype="float32"),
            +2.0 * np.ones([1, 2, 32, 32], dtype="float32"),
        ]

        self.weights = np.concatenate(
            (
                -1 * np.ones([1, 1, 2, 2], dtype="float32"),
                +1 * np.ones([1, 1, 2, 2], dtype="float32"),
            ),
            axis=1,
        )

    def _quantize(self, activations, extra_options):
        """Statically quantize a single-Conv model and return (zero_point, scale) of input ACT.

        :parameter activations: list of float32 arrays fed as calibration data for "ACT"
        :parameter extra_options: extra_options dictionary forwarded to quantize_static
        :return: (zero_point, scale) of the "ACT" input as a Python int and float
        """
        # Local imports keep this block self-contained.
        import tempfile
        from pathlib import Path

        act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
        res = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None])
        wgt_init = numpy_helper.from_array(self.weights, "WGT")
        conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"])
        graph = helper.make_graph([conv_node], "test", [act], [res], initializer=[wgt_init])
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])

        class _ListDataReader(quantization.CalibrationDataReader):
            """Feeds each calibration sample once, then signals exhaustion with None."""

            def __init__(self, samples):
                self._feeds = iter([{"ACT": sample} for sample in samples])

            def get_next(self):
                return next(self._feeds, None)

        # Write the models into a throw-away directory so that test runs do not leave
        # artifacts in (or collide inside) the current working directory.
        with tempfile.TemporaryDirectory(prefix="ort_restricted_asym_") as tmp_dir:
            float_model_path = str(Path(tmp_dir) / "model_restricted.onnx")
            quantized_model_path = str(Path(tmp_dir) / "quantized_restricted.onnx")
            onnx.save(model, float_model_path)

            quantization.quantize_static(
                model_input=float_model_path,
                model_output=quantized_model_path,
                calibration_data_reader=_ListDataReader(activations),
                quant_format=quantization.QuantFormat.QOperator,
                activation_type=quantization.QuantType.QUInt8,
                weight_type=quantization.QuantType.QUInt8,
                op_types_to_quantize=["Conv", "MatMul"],
                extra_options=extra_options,
            )
            quantized = onnx.load(quantized_model_path)

        # Decode through numpy_helper so the result does not depend on which
        # TensorProto field (int32_data / float_data / raw_data) holds the value.
        initializers = {init.name: init for init in quantized.graph.initializer}
        act_zp = int(numpy_helper.to_array(initializers["ACT_zero_point"]).item())
        act_sc = float(numpy_helper.to_array(initializers["ACT_scale"]).item())
        return act_zp, act_sc

    def test_positive_activations_zp_is_zero(self):
        """All-positive range (rmin >= 0): zero-point must snap to 0."""
        act_zp, act_sc = self._quantize(
            self.positive_activations,
            extra_options={"ActivationRestrictedAsymmetric": True},
        )
        self.assertEqual(act_zp, 0, f"Expected zp=0 for rmin>=0, got {act_zp}")
        # With zp=0 the range [0, 2] is spread over the 255 steps above the zero-point.
        self.assertAlmostEqual(act_sc, 2.0 / 255.0, places=6)

    def test_signed_activations_zp_is_128(self):
        """Signed range (rmin < 0): zero-point must snap to 128."""
        act_zp, act_sc = self._quantize(
            self.signed_activations,
            extra_options={"ActivationRestrictedAsymmetric": True},
        )
        self.assertEqual(act_zp, 128, f"Expected zp=128 for rmin<0, got {act_zp}")
        # For [-1, 2] the positive half dominates: max(1 / 128, 2 / 127) == 2 / 127.
        self.assertAlmostEqual(act_sc, 2.0 / 127.0, places=6)

    def test_option_false_does_not_snap(self):
        """When ActivationRestrictedAsymmetric is False, behavior matches standard asymmetric (zp != 128 for signed)."""
        act_zp, _ = self._quantize(
            self.signed_activations,
            extra_options={"ActivationRestrictedAsymmetric": False},
        )
        # Standard asymmetric uint8 with rmin=-1, rmax=2 should give non-128 zp (it's ~85)
        self.assertNotEqual(act_zp, 128, f"Option=False should not snap to 128, got {act_zp}")


# The entry-point guard must come after every TestCase in the module; otherwise
# unittest.main() runs (and exits) before later classes are defined.
if __name__ == "__main__":
    unittest.main()