Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions onnxruntime/python/tools/quantization/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
CalibrationDataReader,
CalibrationMethod,
MinMaxCalibrater,
TensorData,
TensorsData,
create_calibrator,
load_tensors_data,
save_tensors_data,
)
from .qdq_quantizer import QDQQuantizer # noqa: F401
from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401
Expand Down
66 changes: 66 additions & 0 deletions onnxruntime/python/tools/quantization/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import abc
import copy
import itertools
import json
import os
import uuid
from collections.abc import Sequence
Expand Down Expand Up @@ -98,6 +99,19 @@ def to_dict(self):
data["CLS"] = self.__class__.__name__
return data

@classmethod
def from_dict(cls, d: dict) -> "TensorData":
    """Build a TensorData instance from a to_dict() representation.

    The "CLS" marker key is dropped; any value that is itself a serialized
    numpy array (a dict tagged ``CLS == "numpy.array"``) is restored to an
    ndarray with its recorded dtype. All other values pass through as-is.
    """
    restored = {}
    for key in d:
        if key == "CLS":
            continue
        item = d[key]
        if isinstance(item, dict) and item.get("CLS") == "numpy.array":
            item = np.array(item["data"], dtype=np.dtype(item["dtype"]))
        restored[key] = item
    return cls(**restored)


class TensorsData:
def __init__(self, calibration_method, data: dict[str, TensorData | tuple]):
Expand Down Expand Up @@ -150,6 +164,18 @@ def to_dict(self):
}
return data

@classmethod
def from_dict(cls, d: dict) -> "TensorsData":
    """Rebuild a TensorsData instance from a to_dict() representation.

    The stored calibration method may be either a tagged dict
    (``CLS == "CalibrationMethod"`` holding a value such as
    "CalibrationMethod.MinMax") or an already-usable object, in which
    case it passes through unchanged. Each per-tensor entry is restored
    via TensorData.from_dict().
    """
    raw_method = d["calibration_method"]
    if isinstance(raw_method, dict) and raw_method.get("CLS") == "CalibrationMethod":
        # "CalibrationMethod.MinMax" -> enum member named "MinMax"
        raw_method = CalibrationMethod[raw_method["value"].rpartition(".")[-1]]
    tensor_data = {name: TensorData.from_dict(entry) for name, entry in d["data"].items()}
    return cls(raw_method, tensor_data)


class CalibrationMethod(Enum):
MinMax = 0
Expand Down Expand Up @@ -184,6 +210,46 @@ def set_range(self, start_index: int, end_index: int):
raise NotImplementedError


class _CalibrationCacheEncoder(json.JSONEncoder):
"""JSON encoder for calibration cache serialization."""

def default(self, obj):
if isinstance(obj, (TensorData, TensorsData)):
return obj.to_dict()
if isinstance(obj, np.ndarray):
return {"data": obj.tolist(), "dtype": str(obj.dtype), "CLS": "numpy.array"}
if isinstance(obj, CalibrationMethod):
return {"CLS": obj.__class__.__name__, "value": str(obj)}
if isinstance(obj, np.integer):
return int(obj)
if isinstance(obj, np.floating):
return float(obj)
return json.JSONEncoder.default(self, obj)


def save_tensors_data(tensors_data: "TensorsData", path: "str | Path") -> None:
    """Serialize calibration tensor ranges to a JSON file at *path*.

    The write is atomic: data is first written to a ``<path>.tmp`` sibling
    and then moved into place with os.replace, so a concurrent reader never
    observes a partially written cache. If serialization fails, the stale
    temporary file is removed instead of being left behind.

    Args:
        tensors_data: the TensorsData produced by a calibrator.
        path: destination file; parent directories are created as needed.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        # JSON interchange is UTF-8; do not depend on the locale encoding.
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(tensors_data, f, cls=_CalibrationCacheEncoder)
            f.flush()
        os.replace(tmp, path)
    except BaseException:
        # Don't leak a half-written temp file when dump/replace fails.
        tmp.unlink(missing_ok=True)
        raise


def load_tensors_data(path: "str | Path") -> "TensorsData":
    """Load calibration tensor ranges from a JSON file written by save_tensors_data().

    Args:
        path: path to the JSON calibration cache.

    Returns:
        The reconstructed TensorsData.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: if *path* exists but is not a regular file.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Calibration cache not found: {path}")
    if not path.is_file():
        raise ValueError(f"Calibration cache path is not a file: {path}")
    # JSON interchange is UTF-8; do not depend on the locale encoding.
    with path.open("r", encoding="utf-8") as f:
        d = json.load(f)
    return TensorsData.from_dict(d)


class CalibraterBase:
def __init__(
self,
Expand Down
123 changes: 81 additions & 42 deletions onnxruntime/python/tools/quantization/quantize.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,14 @@

import onnx

from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .calibrate import (
CalibrationDataReader,
CalibrationMethod,
TensorsData,
create_calibrator,
load_tensors_data,
save_tensors_data,
)
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
Expand Down Expand Up @@ -479,7 +486,7 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua
def quantize_static(
model_input: str | Path | onnx.ModelProto,
model_output: str | Path,
calibration_data_reader: CalibrationDataReader,
calibration_data_reader: CalibrationDataReader | None = None,
quant_format=QuantFormat.QDQ,
op_types_to_quantize=None,
per_channel=False,
Expand All @@ -492,6 +499,7 @@ def quantize_static(
calibrate_method=CalibrationMethod.MinMax,
calibration_providers=None,
extra_options=None,
calibration_cache_path: str | Path | None = None,
):
Comment thread
Rishi-Dave marked this conversation as resolved.
"""
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
Expand All @@ -506,7 +514,13 @@ def quantize_static(
model_output: file path of quantized model
calibration_data_reader: a calibration data reader. It
enumerates calibration data and generates inputs for the
original model.
original model. May be None if calibration_cache_path points to an
existing cache file.
calibration_cache_path: optional path to a JSON calibration cache. If
the file already exists, calibration inference is skipped and the
cached tensor ranges are loaded instead. If the file does not yet
exist, calibration runs normally and the result is saved to this
path for future reuse.
quant_format: QuantFormat{QOperator, QDQ}.
QOperator format quantizes the model with quantized operators directly.
QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
Expand Down Expand Up @@ -673,6 +687,11 @@ def quantize_static(
}

if extra_options.get("SmoothQuant", False):
if calibration_data_reader is None:
raise ValueError(
"SmoothQuant requires a non-None calibration_data_reader; the calibration cache "
"stores per-tensor ranges only and cannot drive the SmoothQuant transform."
)
import importlib # noqa: PLC0415

try:
Expand Down Expand Up @@ -704,48 +723,68 @@ def inc_dataloader():
if is_model_updated:
model = updated_model

with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if is_model_updated:
# Update model_input and avoid to use the original one
model_input = copy.deepcopy(model)

if isinstance(model_input, onnx.ModelProto):
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
_cache_path = Path(calibration_cache_path) if calibration_cache_path is not None else None
if _cache_path is not None and _cache_path.exists() and not _cache_path.is_file():
raise ValueError(f"calibration_cache_path is not a file: {_cache_path}")
_cache_hit = _cache_path is not None and _cache_path.is_file()

if _cache_hit:
tensors_range = load_tensors_data(_cache_path)
if tensors_range.calibration_method != calibrate_method:
Comment thread
Rishi-Dave marked this conversation as resolved.
raise ValueError(
f"Calibration cache at {_cache_path} was produced with "
f"{tensors_range.calibration_method}, but quantize_static was called "
f"with calibrate_method={calibrate_method}. Delete the cache or "
f"pass a matching calibrate_method."
)
else:
if calibration_data_reader is None:
raise ValueError("Either calibration_data_reader or an existing calibration_cache_path must be provided.")
with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
if is_model_updated:
# Update model_input and avoid to use the original one
model_input = copy.deepcopy(model)

if isinstance(model_input, onnx.ModelProto):
output_path = Path(quant_tmp_dir).joinpath("model_input.onnx").as_posix()
onnx.save_model(
model_input,
output_path,
save_as_external_data=True,
)
model_input = output_path

calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
providers=calibration_providers,
extra_options=calib_extra_options,
)
model_input = output_path

calibrator = create_calibrator(
Path(model_input),
op_types_to_quantize,
augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
calibrate_method=calibrate_method,
use_external_data_format=use_external_data_format,
providers=calibration_providers,
extra_options=calib_extra_options,
)

stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")

for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
stride = extra_options.get("CalibStridedMinMax", None)
if stride:
total_data_size = len(calibration_data_reader)
if total_data_size % stride != 0:
raise ValueError(f"Total data size ({total_data_size}) is not divisible by stride size ({stride}).")

for start in range(0, total_data_size, stride):
end_index = start + stride
calibration_data_reader.set_range(start_index=start, end_index=end_index)
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
else:
calibrator.collect_data(calibration_data_reader)
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator
tensors_range = calibrator.compute_data()
if not isinstance(tensors_range, TensorsData):
raise TypeError(
f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
)
del calibrator

if _cache_path is not None:
save_tensors_data(tensors_range, _cache_path)

check_static_quant_arguments(quant_format, activation_type, weight_type)

Expand Down
Loading