Description
I am trying to quantize PyTorch and ONNX models. By default, the quantization API performs mixed-precision tuning with fp32 + int8 + bf16 + fp16 data types. However, I want to obtain the baseline int8 accuracy of the model, just as we get a baseline fp32 accuracy. I tried 'excluded_precisions = ["bf16", "fp16", "fp32"]' in the PostTrainingQuantConfig (sketched just below), but it did not work.
I am using the settings below to quantize an ONNX model. Please let me know what I can do to get an int8-only quantized model. Any help is appreciated.
Thanks
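For reference, this is roughly how I tried to force an int8-only result (a sketch abridged from the full script below; it did not change the mixed-precision behaviour):

from neural_compressor import PostTrainingQuantConfig

# What I tried: exclude every non-int8 precision so that tuning only produces an int8 model
conf = PostTrainingQuantConfig(
    device="gpu",
    backend="onnxrt_cuda_ep",
    excluded_precisions=["bf16", "fp16", "fp32"],  # did not give an int8-only model
)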
# Imports used by this snippet (args and val_loader are defined earlier in my script)
import os
import sys
import tempfile

import onnx
import onnxruntime as ort
import torch


def eval_func(model_input):
    # Handle both a file path and an in-memory ModelProto
    if isinstance(model_input, onnx.onnx_ml_pb2.ModelProto):
        # Save the ModelProto to a temporary file so ONNX Runtime can load it
        with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmp_file:
            onnx.save(model_input, tmp_file.name)
            model_path = tmp_file.name
    else:
        # Assume model_input is already a file path
        model_path = model_input

    # Configure ONNX Runtime session options to avoid thread-affinity issues
    sess_opts = ort.SessionOptions()
    cpu_count = int(os.getenv('SLURM_CPUS_PER_TASK', 8))  # fall back to 8 if undefined
    sess_opts.inter_op_num_threads = cpu_count
    sess_opts.intra_op_num_threads = cpu_count

    # Initialize the ONNX Runtime session with the GPU provider (CPU fallback)
    session = ort.InferenceSession(
        model_path,
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        sess_options=sess_opts
    )
    input_name = session.get_inputs()[0].name

    correct = 0
    total = 0
    for images, labels in val_loader:
        images = images.numpy()  # convert to NumPy for ONNX Runtime
        outputs = session.run(None, {input_name: images})[0]
        _, predicted = torch.max(torch.tensor(outputs), 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Clean up the temporary file if one was created
    if isinstance(model_input, onnx.onnx_ml_pb2.ModelProto):
        os.unlink(model_path)

    accuracy = 100 * correct / total
    return accuracy
if args.tune:
    if not args.onnx_model:
        print("Error: --onnx-model must be specified for ONNX mixed precision tuning")
        sys.exit(1)

    from neural_compressor import PostTrainingQuantConfig
    from neural_compressor import quantization

    conf = PostTrainingQuantConfig(
        # framework="onnxruntime",
        device="gpu",
        backend="onnxrt_cuda_ep",
        inputs=["input"],    # replace with your model's input names
        outputs=["output"],  # replace with your model's output names
        excluded_precisions=['fp16']
    )
    output_model = quantization.fit(args.onnx_model, conf, eval_func=eval_func, calib_dataloader=val_loader)
    output_model.save(args.tuned_checkpoint)
    return
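For context, this is how I get the fp32 baseline number that I would like the int8 equivalent of (a sketch using the same eval_func; args.onnx_model and args.tuned_checkpoint are the paths from my argparse setup, and I am assuming the saved checkpoint is a plain .onnx file):

# Sketch: baseline accuracy of the original fp32 ONNX model
fp32_acc = eval_func(args.onnx_model)
print(f"fp32 baseline accuracy: {fp32_acc:.2f}%")

# Sketch: accuracy of the model produced by quantization.fit, assuming
# output_model.save() wrote a loadable .onnx file to args.tuned_checkpoint
quant_acc = eval_func(args.tuned_checkpoint)
print(f"quantized model accuracy: {quant_acc:.2f}%")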