cleanup

yuanyao-nv · yuanyao-nv · commit f4ca510aee16 · 2024-08-07T06:17:22.000Z
Signed-off-by: Yuan Yao <yuanyao@nvidia.com>
diff --git a/docs/Operators.md b/docs/Operators.md
@@ -3747,7 +3747,9 @@ for from_type, to_type in test_cases:
             raise ValueError(
                 f"Conversion from {from_type} to {to_type} is not tested."
             )
-        expected = vect_evaluate_float4e2m1_from_bits(subbyte.float32_to_float4e2m1_unpacked(np_fp32))
+        expected = vect_evaluate_float4e2m1_from_bits(
+            subbyte.float32_to_float4e2m1_unpacked(np_fp32)
+        )
         output = make_tensor(
             "y", getattr(TensorProto, to_type), input_shape, expected.tolist()
         )
@@ -20767,45 +20769,33 @@ expect(
 
 
 <details>
-<summary>e2m1</summary>
+<summary>e4m3fn</summary>
 
 ```python
 node = onnx.helper.make_node(
     "QuantizeLinear",
     inputs=["x", "y_scale", "y_zero_point"],
     outputs=["y"],
-    axis=0,
 )
 
-x = np.array(
-    [
-        [0.0, 2.5, 4.8, 8.6],
-        [-30, -20, 6, 9],
-        [-0.0, -2.5, -4.8, -8.6],
-    ]
-).astype(np.float32)
-
-y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32)
-y_zero_point = make_tensor(
-    "y_zero_point", TensorProto.FLOAT4E2M1, y_scale.shape, np.zeros_like(y_scale)
-)
-y = make_tensor(
-    "y", TensorProto.FLOAT4E2M1, x.shape, [0, 1, 2, 4, -6, -6, 2, 3, 0, -0.5, -1, -2]
-)
+x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
+y_scale = np.float32(2)
+y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E4M3FN, [1], [0])
+y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96])
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_float4e2m1",
+    name="test_quantizelinear_e4m3fn",
 )
 ```
 
 </details>
 
 
 <details>
-<summary>e4m3fn</summary>
+<summary>e5m2</summary>
 
 ```python
 node = onnx.helper.make_node(
@@ -20816,40 +20806,58 @@ node = onnx.helper.make_node(
 
 x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
 y_scale = np.float32(2)
-y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E4M3FN, [1], [0])
-y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96])
+y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E5M2, [1], [0.0])
+y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96])
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_e4m3fn",
+    name="test_quantizelinear_e5m2",
 )
 ```
 
 </details>
 
 
 <details>
-<summary>e5m2</summary>
+<summary>float4e2m1</summary>
 
 ```python
 node = onnx.helper.make_node(
     "QuantizeLinear",
     inputs=["x", "y_scale", "y_zero_point"],
     outputs=["y"],
+    axis=0,
 )
 
-x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
-y_scale = np.float32(2)
-y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E5M2, [1], [0.0])
-y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96])
+x = np.array(
+    [
+        [0.0, 2.5, 4.8, 8.6],
+        [-30, -20, 6, 9],
+        [-0.0, -2.5, -4.8, -8.6],
+    ]
+).astype(np.float32)
+
+y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32)
+y_zero_point = make_tensor(
+    "y_zero_point",
+    TensorProto.FLOAT4E2M1,
+    y_scale.shape,
+    np.zeros_like(y_scale),
+)
+y = make_tensor(
+    "y",
+    TensorProto.FLOAT4E2M1,
+    x.shape,
+    [0, 1, 2, 4, -6, -6, 2, 3, 0, -0.5, -1, -2],
+)
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_e5m2",
+    name="test_quantizelinear_float4e2m1",
 )
 ```
 
diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md
@@ -2616,7 +2616,9 @@ for from_type, to_type in test_cases:
             raise ValueError(
                 f"Conversion from {from_type} to {to_type} is not tested."
             )
-        expected = vect_evaluate_float4e2m1_from_bits(subbyte.float32_to_float4e2m1_unpacked(np_fp32))
+        expected = vect_evaluate_float4e2m1_from_bits(
+            subbyte.float32_to_float4e2m1_unpacked(np_fp32)
+        )
         output = make_tensor(
             "y", getattr(TensorProto, to_type), input_shape, expected.tolist()
         )
@@ -14184,43 +14186,31 @@ expect(
 
 </details>
 <details>
-<summary>e2m1</summary>
+<summary>e4m3fn</summary>
 
 ```python
 node = onnx.helper.make_node(
     "QuantizeLinear",
     inputs=["x", "y_scale", "y_zero_point"],
     outputs=["y"],
-    axis=0,
 )
 
-x = np.array(
-    [
-        [0.0, 2.5, 4.8, 8.6],
-        [-30, -20, 6, 9],
-        [-0.0, -2.5, -4.8, -8.6],
-    ]
-).astype(np.float32)
-
-y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32)
-y_zero_point = make_tensor(
-    "y_zero_point", TensorProto.FLOAT4E2M1, y_scale.shape, np.zeros_like(y_scale)
-)
-y = make_tensor(
-    "y", TensorProto.FLOAT4E2M1, x.shape, [0, 1, 2, 4, -6, -6, 2, 3, 0, -0.5, -1, -2]
-)
+x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
+y_scale = np.float32(2)
+y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E4M3FN, [1], [0])
+y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96])
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_float4e2m1",
+    name="test_quantizelinear_e4m3fn",
 )
 ```
 
 </details>
 <details>
-<summary>e4m3fn</summary>
+<summary>e5m2</summary>
 
 ```python
 node = onnx.helper.make_node(
@@ -14231,38 +14221,56 @@ node = onnx.helper.make_node(
 
 x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
 y_scale = np.float32(2)
-y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E4M3FN, [1], [0])
-y = make_tensor("y", TensorProto.FLOAT8E4M3FN, [5], [0, 0.5, 1, 448, 96])
+y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E5M2, [1], [0.0])
+y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96])
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_e4m3fn",
+    name="test_quantizelinear_e5m2",
 )
 ```
 
 </details>
 <details>
-<summary>e5m2</summary>
+<summary>float4e2m1</summary>
 
 ```python
 node = onnx.helper.make_node(
     "QuantizeLinear",
     inputs=["x", "y_scale", "y_zero_point"],
     outputs=["y"],
+    axis=0,
 )
 
-x = np.array([0.0, 1.0, 2.0, 100000.0, 200.0]).astype(np.float32)
-y_scale = np.float32(2)
-y_zero_point = make_tensor("y_zero_point", TensorProto.FLOAT8E5M2, [1], [0.0])
-y = make_tensor("y", TensorProto.FLOAT8E5M2, [5], [0, 0.5, 1, 49152, 96])
+x = np.array(
+    [
+        [0.0, 2.5, 4.8, 8.6],
+        [-30, -20, 6, 9],
+        [-0.0, -2.5, -4.8, -8.6],
+    ]
+).astype(np.float32)
+
+y_scale = np.asarray([2.0, 3.0, 4.0], dtype=np.float32)
+y_zero_point = make_tensor(
+    "y_zero_point",
+    TensorProto.FLOAT4E2M1,
+    y_scale.shape,
+    np.zeros_like(y_scale),
+)
+y = make_tensor(
+    "y",
+    TensorProto.FLOAT4E2M1,
+    x.shape,
+    [0, 1, 2, 4, -6, -6, 2, 3, 0, -0.5, -1, -2],
+)
 
 expect(
     node,
     inputs=[x, y_scale, y_zero_point],
     outputs=[y],
-    name="test_quantizelinear_e5m2",
+    name="test_quantizelinear_float4e2m1",
 )
 ```
 
diff --git a/docs/docsgen/source/technical/float4.md b/docs/docsgen/source/technical/float4.md
@@ -57,7 +57,7 @@ The float value is defined by the following expressions:
    :header-rows: 1
 
    * -
-     - E4M3FN
+     - E2M1
    * - exponent :math:`\neq` 0
      - :math:`(-1)^S 2^{\sum_{i=1}^2 b_i 2^{i-1} - 1} \left( 1 + b_0 2^{-1} \right)`
    * - exponent :math:`=` 0
@@ -108,8 +108,8 @@ The behavior for downcasting to float 4 is summarized below
 ## Packing and Unpacking
 
 Float4 is stored as 2x4bit in a single byte.
-The first element is stored in the 4 LSB and the second element is stored in the 4 MSB.
-i.e. for elements x, y, that are consecutive elements in the array:
+The first element is stored in the 4 LSB and the second element is stored in the 4 MSB,
+i.e. for elements `x` and `y` that are consecutive elements in the array:
 ```
 pack(x,y): y << 4 | x & 0x0F
 unpack(z): x = z & 0x0F, y = z >> 4
diff --git a/onnx/_custom_element_types.py b/onnx/_custom_element_types.py
@@ -52,6 +52,9 @@
 #: than its onnx size.
 int4 = np.dtype((np.int8, {"int4": (np.int8, 0)}))
 
+#: Defines the float4 e2m1 type, see :ref:`onnx-detail-float4` for technical details.
+#: Note that each element is stored in a full byte and therefore takes twice
+#: its onnx size.
 float4e2m1 = np.dtype((np.uint8, {"float4e2m1": (np.uint8, 0)}))
 
 mapping_name_to_data_type = {
diff --git a/onnx/backend/test/case/node/quantizelinear.py b/onnx/backend/test/case/node/quantizelinear.py
@@ -277,7 +277,7 @@ def export_int4() -> None:
         )
 
     @staticmethod
-    def export_e2m1() -> None:
+    def export_float4e2m1() -> None:
         node = onnx.helper.make_node(
             "QuantizeLinear",
             inputs=["x", "y_scale", "y_zero_point"],
diff --git a/onnx/helper.py b/onnx/helper.py
@@ -673,15 +673,14 @@ def pack_float32_to_4bit(array: np.ndarray | Sequence, signed: bool) -> np.ndarr
 
 
 def pack_float32_to_float4e2m1(array: np.ndarray | Sequence) -> np.ndarray:
-    """Convert an array of float32 value to a 4bit data-type and pack every two concecutive elements in a byte.
-    See :ref:`onnx-detail-int4` for technical details.
+    """Convert an array of float32 values to float4e2m1 and pack every two consecutive elements in a byte.
+    See :ref:`onnx-detail-float4` for technical details.
 
     Args:
         array: array of float to convert and pack
-        signed: Whether the 4 bit variant is signed or unsigned
 
     Returns:
-        Packed array with size `ceil(farray.size/2)` (single dimension).
+        Packed array of float4e2m1 (as uint8) with size `ceil(farray.size/2)` (single dimension).
     """
     if not isinstance(array, np.ndarray):
         array = np.asarray(array, dtype=np.float32)
@@ -757,7 +756,6 @@ def make_tensor(
             data_type in (TensorProto.UINT4, TensorProto.INT4, TensorProto.FLOAT4E2M1)
             and len(vals) == expected_size + 0.5
         ):
-            print("$$$$$$", data_type, vals, len(vals), expected_size, dims)
             raise ValueError(
                 f"Number of values does not match tensor's size. Expected {expected_size}, but it is {len(vals)}. "
             )
diff --git a/onnx/numpy_helper.py b/onnx/numpy_helper.py
@@ -221,7 +221,7 @@ def unpack_int4(
     return res
 
 
-def evaluate_float4e2m1_from_bits(x):
+def evaluate_float4e2m1_from_bits(x: np.uint8) -> np.float32:
     """Evaluate the numerical value of a single float4e2m1 element represented as uint8
     See :ref:`onnx-detail-float4` for technical details.
 
@@ -232,7 +232,6 @@ def evaluate_float4e2m1_from_bits(x):
         Packed array with size `ceil(farray.size/2)` (single dimension).
     """
     # x is stored in 4 LSB of int
-    # assert(isinstance(x, np.uint8))
     S = -1 if bool(x & 0x08) else 1
     M = x & 0x01
     E = (x & 0x06) >> 1
@@ -509,8 +508,8 @@ def to_array(tensor: TensorProto, base_dir: str = "") -> np.ndarray:
             data = tensor.int32_data
         shape = tuple(tensor.dims)
 
-        # 2 packed fp4e2m1 elements must be represented as a single uint8 value.
-        # Therefore, y is np.uint8 (not the dtype to which the int4 maps)
+        # 2 packed float4e2m1 elements must be represented as a single uint8 value.
+        # Therefore, y is np.uint8.
         y = np.empty(len(data), dtype=custom_np_types.float4e2m1).ravel()  # type: ignore[assignment]
         for i, d in enumerate(data):
             y[i] = d
diff --git a/onnx/reference/ops/op_cast.py b/onnx/reference/ops/op_cast.py
@@ -139,12 +139,8 @@ def cast_to(x, to, saturate):  # noqa: PLR0911
             return res.astype(np.float16)
 
     if to == TensorProto.FLOAT4E2M1:
-        xf = x.astype(np.float32).ravel()
-        y = np.empty(xf.shape, dtype=float4e2m1).ravel()
-        for i in range(y.shape[0]):
-            el = subbyte.float32_to_float4e2m1_unpacked(xf[i])
-            y[i] = el
-        # This operator preduces a tensor with the same shape for INT4.
+        xf = x.astype(np.float32)
+        y = subbyte.float32_to_float4e2m1_unpacked(xf)
         return y.reshape(x.shape)
 
     if to == TensorProto.STRING:
diff --git a/onnx/reference/ops/op_dequantize_linear.py b/onnx/reference/ops/op_dequantize_linear.py
@@ -93,12 +93,8 @@ def _run(
             elif x_type == TensorProto.FLOAT8E5M2FNUZ:
                 dx = float8e5m2_to_float32(x, fn=True, uz=True)
             elif x_type == TensorProto.FLOAT4E2M1:
-                x_shape = x.shape
-                dx = np.empty(x.shape, dtype=np.float32).ravel()
-                xr = x.ravel()
-                for i in range(x.flatten().size):
-                    dx[i] = evaluate_float4e2m1_from_bits(xr[i])
-                dx.reshape(x_shape)
+                evaluate_func = np.vectorize(evaluate_float4e2m1_from_bits)
+                dx = evaluate_func(x)
             else:
                 dx = x.astype(np.float32)
         y = dx * reshape_input(x_scale, x.shape, axis, block_size)
diff --git a/onnx/reference/ops/op_quantize_linear.py b/onnx/reference/ops/op_quantize_linear.py
@@ -207,11 +207,8 @@ def _run(
             return (i4,)  # type: ignore[attr-defined]
 
         if tensor_type == TensorProto.FLOAT4E2M1:
-            # x += zero_point
-            def single_func(x):
-                return subbyte.float32_to_float4e2m1_unpacked(x)
-
-            func = np.vectorize(single_func)
+            x += zero_point
+            func = np.vectorize(subbyte.float32_to_float4e2m1_unpacked)
             f4 = func(x)
             return (f4,)  # type: ignore[attr-defined]
 
diff --git a/onnx/subbyte.py b/onnx/subbyte.py
diff --git a/onnx/test/test_backend_reference.py b/onnx/test/test_backend_reference.py

Original file line number	Diff line number	Diff line change
`@@ -277,7 +277,7 @@ def export_int4() -> None:`
`277`	`277`	`)`
`278`	`278`
`279`	`279`	`@staticmethod`
`280`		`- def export_e2m1() -> None:`
	`280`	`+ def export_float4e2m1() -> None:`
`281`	`281`	`node = onnx.helper.make_node(`
`282`	`282`	`"QuantizeLinear",`
`283`	`283`	`inputs=["x", "y_scale", "y_zero_point"],`