Description
I am trying to quantize PyTorch and ONNX models. By default, the quantization API performs mixed-precision tuning with fp32 + int8 + bf16 + fp16 data types. However, I want to obtain the baseline int8 accuracy of the model, just as we get a baseline fp32 accuracy. I tried 'excluded_precisions = ["bf16", "fp16", "fp32"]' in the PostTrainingQuantConfig (sketched just below), but it did not work.
I am using the settings below to quantize an ONNX model. Please let me know what I can do to get an int8-only quantized model. Any help is appreciated.
Thanks
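For reference, this is roughly how I tried to force an int8-only result (a sketch abridged from the full script below; it did not change the mixed-precision behaviour):

from neural_compressor import PostTrainingQuantConfig

# What I tried: exclude every non-int8 precision so that tuning only produces an int8 model
conf = PostTrainingQuantConfig(
    device="gpu",
    backend="onnxrt_cuda_ep",
    excluded_precisions=["bf16", "fp16", "fp32"],  # did not give an int8-only model
)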
# Imports used by this snippet (args and val_loader are defined earlier in my script)
import os
import sys
import tempfile

import onnx
import onnxruntime as ort
import torch


def eval_func(model_input):
    # Handle both a file path and an in-memory ModelProto
    if isinstance(model_input, onnx.onnx_ml_pb2.ModelProto):
        # Save the ModelProto to a temporary file so ONNX Runtime can load it
        with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmp_file:
            onnx.save(model_input, tmp_file.name)
            model_path = tmp_file.name
    else:
        # Assume model_input is already a file path
        model_path = model_input

    # Configure ONNX Runtime session options to avoid thread-affinity issues
    sess_opts = ort.SessionOptions()
    cpu_count = int(os.getenv('SLURM_CPUS_PER_TASK', 8))  # fall back to 8 if undefined
    sess_opts.inter_op_num_threads = cpu_count
    sess_opts.intra_op_num_threads = cpu_count

    # Initialize the ONNX Runtime session with the GPU provider (CPU fallback)
    session = ort.InferenceSession(
        model_path,
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        sess_options=sess_opts
    )
    input_name = session.get_inputs()[0].name

    correct = 0
    total = 0
    for images, labels in val_loader:
        images = images.numpy()  # convert to NumPy for ONNX Runtime
        outputs = session.run(None, {input_name: images})[0]
        _, predicted = torch.max(torch.tensor(outputs), 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Clean up the temporary file if one was created
    if isinstance(model_input, onnx.onnx_ml_pb2.ModelProto):
        os.unlink(model_path)

    accuracy = 100 * correct / total
    return accuracy
if args.tune:
    if not args.onnx_model:
        print("Error: --onnx-model must be specified for ONNX mixed precision tuning")
        sys.exit(1)

    from neural_compressor import PostTrainingQuantConfig
    from neural_compressor import quantization

    conf = PostTrainingQuantConfig(
        # framework="onnxruntime",
        device="gpu",
        backend="onnxrt_cuda_ep",
        inputs=["input"],    # replace with your model's input names
        outputs=["output"],  # replace with your model's output names
        excluded_precisions=['fp16']
    )
    output_model = quantization.fit(args.onnx_model, conf, eval_func=eval_func, calib_dataloader=val_loader)
    output_model.save(args.tuned_checkpoint)
    return
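For context, this is how I get the fp32 baseline number that I would like the int8 equivalent of (a sketch using the same eval_func; args.onnx_model and args.tuned_checkpoint are the paths from my argparse setup, and I am assuming the saved checkpoint is a plain .onnx file):

# Sketch: baseline accuracy of the original fp32 ONNX model
fp32_acc = eval_func(args.onnx_model)
print(f"fp32 baseline accuracy: {fp32_acc:.2f}%")

# Sketch: accuracy of the model produced by quantization.fit, assuming
# output_model.save() wrote a loadable .onnx file to args.tuned_checkpoint
quant_acc = eval_func(args.tuned_checkpoint)
print(f"quantized model accuracy: {quant_acc:.2f}%")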