Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions onnxruntime/python/tools/quantization/base_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def __init__(
# the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()`
self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None)
self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False)
self.is_activation_restricted_asymmetric = self.extra_options.get("ActivationRestrictedAsymmetric", False)
self.min_real_range = self.extra_options.get("MinimumRealRange")

self.activation_qType = getattr(activation_qType, "tensor_type", activation_qType)
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/python/tools/quantization/onnx_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ms_domain,
quantize_onnx_initializer,
save_and_reload_model_with_shape_infer,
snap_zero_point_to_uint8,
tensor_proto_to_array,
)
from .registry import CreateOpQuantizer
Expand Down Expand Up @@ -1157,6 +1158,8 @@ def calculate_quantization_params(self):
reduce_range = quant_overrides.get("reduce_range", False)
qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)
if self.is_activation_restricted_asymmetric and quant_type == onnx.TensorProto.UINT8 and not symmetric:
zero, scale = snap_zero_point_to_uint8(rmin, rmax)

quantization_params[tensor_name] = QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type)

Expand Down
7 changes: 7 additions & 0 deletions onnxruntime/python/tools/quantization/qdq_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
ms_domain,
normalize_axis,
quantize_onnx_initializer,
snap_zero_point_to_uint8,
tensor_proto_to_array,
)
from .registry import CreateQDQQuantizer
Expand Down Expand Up @@ -1320,6 +1321,12 @@ def calc_quant_params(self, tensor_data: TensorData, quant_overrides: dict[str,
reduce_range = quant_overrides.get("reduce_range", False)
qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric)
zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, self.min_real_range)
if (
self.is_activation_restricted_asymmetric
and quant_type == onnx.TensorProto.UINT8
and not symmetric
):
zero, scale = snap_zero_point_to_uint8(rmin, rmax)

return QuantizationParams(zero_point=zero.squeeze(), scale=scale.squeeze(), quant_type=quant_type)

Expand Down
27 changes: 27 additions & 0 deletions onnxruntime/python/tools/quantization/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,33 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=Non
return [zero_point, scale]


def snap_zero_point_to_uint8(rmin, rmax):
    """Compute uint8 quantization parameters with the zero-point restricted to 0 or 128.

    Used by the ActivationRestrictedAsymmetric quantization option. The zero-point is 0 when the
    range has no negative part and 128 otherwise; the scale is recomputed so that the dequantized
    range still covers [rmin, rmax] without clipping.

    The real range is first widened to contain 0.0 so that a constant, non-zero activation
    (rmin == rmax != 0) still receives a scale that can represent it instead of being treated as
    an empty range.

    :parameter rmin: calibrated minimum activation value (numpy scalar or python float)
    :parameter rmax: calibrated maximum activation value (numpy scalar or python float)
    :return: (zero_point, scale) with zero_point dtype uint8 and scale dtype float32
    """
    neutral_params = (numpy.array(0, dtype=numpy.uint8), numpy.array(1.0, dtype=numpy.float32))

    rmin = float(numpy.squeeze(rmin))
    rmax = float(numpy.squeeze(rmax))
    if rmax < rmin:
        # Inverted range: nothing sensible can be derived from it, return neutral values.
        return neutral_params

    # Real 0.0 must be exactly representable, so make sure the range contains it.
    rmin = min(rmin, 0.0)
    rmax = max(rmax, 0.0)

    if rmin < 0.0:
        snapped_zero_point = 128
        # q=0 has to reach rmin (128 steps below the zero-point) and q=255 has to reach rmax
        # (127 steps above it); the larger step size satisfies both without clipping.
        step = max(-rmin / 128.0, rmax / 127.0)
    else:
        snapped_zero_point = 0
        step = rmax / 255.0

    scale = numpy.array(step, dtype=numpy.float32)
    if scale < numpy.finfo(numpy.float32).tiny:
        # Range is empty (or too small for a normal float32 scale); a zero/subnormal scale would
        # break the division performed during quantization, so return neutral values.
        return neutral_params

    return numpy.array(snapped_zero_point, dtype=numpy.uint8), scale


def compute_scale_zp_float8(element_type, std):
"""Calculate the scale s for a float8 type (E4M3FN).
The function assumes the coefficient distribution and the float 8
Expand Down
8 changes: 8 additions & 0 deletions onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def __init__(
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
(rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
Dynamic mode currently is supported. Will support more in future.
Expand Down Expand Up @@ -419,6 +421,8 @@ def __init__(
extra_options: key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
(rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
Expand Down Expand Up @@ -544,6 +548,8 @@ def quantize_static(
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
(rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
Dynamic mode currently is supported. Will support more in the future.
Expand Down Expand Up @@ -834,6 +840,8 @@ def quantize_dynamic(
key value pair dictionary for various options in different case. Current used:
extra.Sigmoid.nnapi = True/False (Default is False)
ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
ActivationRestrictedAsymmetric = True/False: (uint8 activations only) snap zero-point to 0
(rmin>=0) or 128 (rmin<0); recompute scale accordingly (default is False).
WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
EnableSubgraph = True/False :
Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
Expand Down
81 changes: 81 additions & 0 deletions onnxruntime/test/python/quantization/test_symmetric_flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,84 @@ def test_3(self):

class TestRestrictedAsymmetricFlag(unittest.TestCase):
    """Tests for the ActivationRestrictedAsymmetric extra-option (uint8 zero-point snapping)."""

    def setUp(self):
        # All-positive activations (post-ReLU-like): rmin >= 0, expect zp == 0
        self.positive_activations = [
            np.zeros([1, 2, 32, 32], dtype="float32"),
            np.ones([1, 2, 32, 32], dtype="float32") * 2.0,
        ]
        # Signed-range activations: rmin < 0, expect zp == 128
        self.signed_activations = [
            -1.0 * np.ones([1, 2, 32, 32], dtype="float32"),
            +2.0 * np.ones([1, 2, 32, 32], dtype="float32"),
        ]

        self.weights = np.concatenate(
            (
                -1 * np.ones([1, 1, 2, 2], dtype="float32"),
                +1 * np.ones([1, 1, 2, 2], dtype="float32"),
            ),
            axis=1,
        )

    def _quantize(self, activations, extra_options):
        """Statically quantize a single-Conv model and return (zero_point, scale) of its input "ACT".

        :parameter activations: list of float32 arrays fed one by one as calibration data for "ACT"
        :parameter extra_options: dictionary forwarded to `quantize_static` as `extra_options`
        :return: tuple (ACT zero-point, ACT scale) read from the quantized model's initializers
        """
        # Local imports: only this helper needs them.
        import tempfile
        from pathlib import Path

        act_info = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape)
        res_info = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None])
        wgt_init = numpy_helper.from_array(self.weights, "WGT")
        conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"])
        graph = helper.make_graph([conv_node], "test", [act_info], [res_info], initializer=[wgt_init])
        float_model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 11)])

        class DummyDataReader(quantization.CalibrationDataReader):
            """Feeds each calibration array once, then signals exhaustion with None."""

            def __init__(self):
                self.iterator = ({"ACT": sample} for sample in activations)

            def get_next(self):
                return next(self.iterator, None)

        # Keep model files out of the working directory and remove them when the test is done.
        with tempfile.TemporaryDirectory(prefix="ort.quant.") as tmp_dir:
            float_path = str(Path(tmp_dir) / "model_restricted.onnx")
            quant_path = str(Path(tmp_dir) / "quantized_restricted.onnx")
            onnx.save(float_model, float_path)

            quantization.quantize_static(
                model_input=float_path,
                model_output=quant_path,
                calibration_data_reader=DummyDataReader(),
                quant_format=quantization.QuantFormat.QOperator,
                activation_type=quantization.QuantType.QUInt8,
                weight_type=quantization.QuantType.QUInt8,
                op_types_to_quantize=["Conv", "MatMul"],
                extra_options=extra_options,
            )

            quantized_model = onnx.load(quant_path)

        def read_scalar(name):
            # to_array decodes the tensor regardless of which TensorProto field stores the data.
            tensor = next(init for init in quantized_model.graph.initializer if init.name == name)
            return numpy_helper.to_array(tensor).reshape(-1)[0].item()

        return read_scalar("ACT_zero_point"), read_scalar("ACT_scale")

    def test_positive_activations_zp_is_zero(self):
        """All-positive range (rmin >= 0): zero-point must snap to 0."""
        act_zp, act_sc = self._quantize(
            self.positive_activations,
            extra_options={"ActivationRestrictedAsymmetric": True},
        )
        self.assertEqual(act_zp, 0, f"Expected zp=0 for rmin>=0, got {act_zp}")
        # Calibrated range is [0, 2]; with zp=0 the full uint8 span maps onto it.
        self.assertAlmostEqual(act_sc, 2.0 / 255.0, places=6)

    def test_signed_activations_zp_is_128(self):
        """Signed range (rmin < 0): zero-point must snap to 128."""
        act_zp, act_sc = self._quantize(
            self.signed_activations,
            extra_options={"ActivationRestrictedAsymmetric": True},
        )
        self.assertEqual(act_zp, 128, f"Expected zp=128 for rmin<0, got {act_zp}")
        # Calibrated range is [-1, 2]; the positive half (127 steps) dictates the scale.
        self.assertAlmostEqual(act_sc, 2.0 / 127.0, places=6)

    def test_option_false_does_not_snap(self):
        """When ActivationRestrictedAsymmetric is False, behavior matches standard asymmetric (zp != 128 for signed)."""
        act_zp, _ = self._quantize(
            self.signed_activations,
            extra_options={"ActivationRestrictedAsymmetric": False},
        )
        # Standard asymmetric uint8 with rmin=-1, rmax=2 should give non-128 zp (it's ~85)
        self.assertNotEqual(act_zp, 128, f"Option=False should not snap to 128, got {act_zp}")


# Keep the entry point last: unittest.main() exits the interpreter, so any test class defined
# after it would never be collected when this file is run as a script.
if __name__ == "__main__":
    unittest.main()