Commit df1fd96

jambayk authored and guschmue committed
Quantization tool: Use nanmin, nanmax, nanmean in calibrator (#23749)
### Description

- The calibrator uses `np.max`/`np.min` to get min/max values from the collected data. However, these functions return `nan` if any of the array values is `nan`, which subsequently leads to an invalid scale and a failure during quantization at https://github.com/microsoft/onnxruntime/blob/93689c5995dcacbb99c3afa9ec477b305c71159f/onnxruntime/python/tools/quantization/quant_utils.py#L293.
- When quantizing models with `GroupQueryAttention`, the intermediate activations corresponding to padded tokens can become `nan`. We can safely ignore such values because they do not contribute to the final model output.
- Using `np.nanmax`/`np.nanmin` ensures that the calibrator can handle `nan` values. If all values are `nan`, numpy raises a `RuntimeWarning: All-NaN slice encountered`, which can help debug the eventual scale failure.

```python
import numpy as np

no_nans = np.array([1, 2, 3], dtype=np.float32)
some_nans = np.array([np.nan, 1, 2, 3, np.nan, np.nan], dtype=np.float32)
all_nans = np.array([np.nan, np.nan], dtype=np.float32)

for array in [no_nans, some_nans, all_nans]:
    print("np.max/np.min:", np.max(array), np.min(array))
    print("np.nanmax/np.nanmin:", np.nanmax(array), np.nanmin(array))
```

Output

```bash
np.max/np.min: 3.0 1.0
np.nanmax/np.nanmin: 3.0 1.0
np.max/np.min: nan nan
np.nanmax/np.nanmin: 3.0 1.0
np.max/np.min: nan nan
np.nanmax/np.nanmin: nan nan
RuntimeWarning: All-NaN slice encountered
  print("np.nanmax/np.nanmin:", np.nanmax(array), np.nanmin(array))
```

### Motivation and Context

<!--
- Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here.
-->
1 parent fcca2ca commit df1fd96

File tree

2 files changed: +11 −11 lines changed

onnxruntime/python/tools/quantization/calibrate.py

+10 −10
```diff
@@ -496,14 +496,14 @@ def compute_data(self) -> TensorsData:
         pairs = []
         for i in range(0, len(added_output_names), 2):
             if self.moving_average:
-                min_value_array = np.mean(merged_added_output_dict[added_output_names[i]], axis=0)
-                max_value_array = np.mean(merged_added_output_dict[added_output_names[i + 1]], axis=0)
+                min_value_array = np.nanmean(merged_added_output_dict[added_output_names[i]], axis=0)
+                max_value_array = np.nanmean(merged_added_output_dict[added_output_names[i + 1]], axis=0)
             else:
-                min_value_array = np.min(merged_added_output_dict[added_output_names[i]], axis=0)
-                max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)
+                min_value_array = np.nanmin(merged_added_output_dict[added_output_names[i]], axis=0)
+                max_value_array = np.nanmax(merged_added_output_dict[added_output_names[i + 1]], axis=0)

             if self.symmetric:
-                max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
+                max_absolute_value = np.nanmax([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
                 pairs.append((-max_absolute_value, max_absolute_value))
             else:
                 pairs.append((min_value_array, max_value_array))
```
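For context on the `moving_average` branch, a minimal sketch (illustrative values, not part of the commit): `np.nanmean` averages across calibration batches while skipping `nan` entries per element, whereas `np.mean` lets a single `nan` poison that element's average.

```python
import numpy as np

# Hypothetical per-batch min arrays, one row per calibration batch;
# a padded-token activation produced the nan in the second batch.
batch_mins = np.array([[0.1, -2.0, 0.3],
                       [np.nan, -1.5, 0.2]], dtype=np.float32)

print(np.mean(batch_mins, axis=0))     # [ nan  -1.75  0.25]
print(np.nanmean(batch_mins, axis=0))  # [ 0.1  -1.75  0.25]
```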
```diff
@@ -834,8 +834,8 @@ def collect_absolute_value(self, name_to_arr):
             data_arr_np = data_arr
             data_arr_np = data_arr_np.flatten()
             if data_arr_np.size > 0:
-                min_value = np.min(data_arr_np)
-                max_value = np.max(data_arr_np)
+                min_value = np.nanmin(data_arr_np)
+                max_value = np.nanmax(data_arr_np)
             else:
                 min_value = np.array(0, dtype=data_arr_np.dtype)
                 max_value = np.array(0, dtype=data_arr_np.dtype)
```
```diff
@@ -858,7 +858,7 @@ def collect_absolute_value(self, name_to_arr):
                 assert hasattr(old_max, "dtype"), f"old_min should be a numpy array but is {type(old_max)}"
                 old_hist = old_histogram[0]
                 old_hist_edges = old_histogram[1]
-                temp_amax = np.max(data_arr_np)
+                temp_amax = np.nanmax(data_arr_np)
                 if temp_amax > old_hist_edges[-1]:
                     # increase the number of bins
                     width = old_hist_edges[1] - old_hist_edges[0]
```
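The `temp_amax` change also fixes a silent failure mode; a small sketch with illustrative values (not part of the commit): with `np.max`, a single `nan` makes the bin-expansion comparison evaluate to False even when real data exceeds the current histogram range.

```python
import numpy as np

data_arr_np = np.array([np.nan, 5.0], dtype=np.float32)
old_hist_edges = np.array([0.0, 1.0, 2.0])  # hypothetical existing bin edges

print(np.max(data_arr_np) > old_hist_edges[-1])     # False: nan > 2.0 is False, bins never grow
print(np.nanmax(data_arr_np) > old_hist_edges[-1])  # True: 5.0 > 2.0, bins expand as intended
```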
```diff
@@ -882,8 +882,8 @@ def collect_value(self, name_to_arr):
             data_arr = data_arr.flatten()  # noqa: PLW2901

             if data_arr.size > 0:
-                min_value = np.min(data_arr)
-                max_value = np.max(data_arr)
+                min_value = np.nanmin(data_arr)
+                max_value = np.nanmax(data_arr)
             else:
                 min_value = np.array(0, dtype=data_arr.dtype)
                 max_value = np.array(0, dtype=data_arr.dtype)
```
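A sketch of the downstream effect in these histogram collectors (illustrative, not part of the commit): `np.nanmin`/`np.nanmax` yield a finite range despite `nan` samples, and `np.histogram` with an explicit finite range leaves the `nan` samples uncounted rather than failing.

```python
import numpy as np

data_arr = np.array([np.nan, 0.5, -1.0, 2.0], dtype=np.float32).flatten()
min_value = np.nanmin(data_arr)  # -1.0 (np.min would return nan)
max_value = np.nanmax(data_arr)  #  2.0 (np.max would return nan)

# Values outside the explicit range -- including nan -- are simply ignored.
hist, edges = np.histogram(data_arr, bins=4, range=(float(min_value), float(max_value)))
print(hist)  # [1 0 1 1]: the nan sample lands in no bin
```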

onnxruntime/python/tools/quantization/quant_utils.py

+1 −1
```diff
@@ -290,7 +290,7 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=None):
     dr = numpy.array(rmax - rmin, dtype=numpy.float64)
     dq = numpy.array(qmax, dtype=numpy.float64) - numpy.array(qmin, dtype=numpy.float64)
     scale = numpy.array(dr / dq)
-    assert scale >= 0, "scale isse"
+    assert scale >= 0, "scale issue"
     if scale < numpy.finfo(rmax.dtype).tiny:
         scale = numpy.array(1.0, dtype=rmax.dtype)
         zero_point = numpy.array(0, dtype=qmin.dtype)
```
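To make the failure referenced in the description concrete, a minimal sketch (illustrative values, not part of the commit) of how a `nan` range value propagates to `scale` and trips the assertion above; the variable names follow `compute_scale_zp`.

```python
import numpy as np

# Illustrative values: rmin is nan because np.min saw a nan activation.
rmin = np.array(np.nan, dtype=np.float32)
rmax = np.array(3.0, dtype=np.float32)
qmin = np.array(-128, dtype=np.int32)
qmax = np.array(127, dtype=np.int32)

dr = np.array(rmax - rmin, dtype=np.float64)  # nan - x -> nan
dq = np.array(qmax, dtype=np.float64) - np.array(qmin, dtype=np.float64)
scale = np.array(dr / dq)  # nan / 255.0 -> nan

try:
    # `nan >= 0` evaluates to False, so this assertion fails.
    assert scale >= 0, "scale issue"
except AssertionError as err:
    print("AssertionError:", err)
```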
