Arm backend: Move rescales from SUB visitor to pass

Martin Lindström · oscarandersson8218 · Martin Lindström · commit 07d87384aa10 · 2025-10-24T14:14:55.000+02:00
Move the insertion of INT8/INT32 RESCALE ops from the SUB node visitor to the pass InsertRescaleInt32Pass. In practice this is a refactoring patch, but the resulting TOSA output still differs enough to cause an Ethos-U55 test in test_var.py to fail. The underlying issue has already been fixed in https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/commit/642f7517d3a6bd053032e1942822f6e38ccd546f, so we temporarily mark the failing test as xfail until the Ethos-U Vela compiler dependency is bumped to a version that includes the fix. Signed-off-by: Martin Lindstroem <Martin.Lindstroem@arm.com> Co-authored-by: Oscar Andersson <Oscar.Andersson@arm.com> Change-Id: I38d63015e03e59c267338c84d64731b050854d06
diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py
@@ -101,6 +101,7 @@ class InsertRescaleInt32Pass(ArmPass):
         exir_ops.edge.aten.maximum.default,
         exir_ops.edge.aten.minimum.default,
         exir_ops.edge.aten.mul.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.sum.dim_IntList,
     ]
 
@@ -144,6 +145,7 @@ def _get_inputs_rescaled_qparams(
             }
         elif target in [
             exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.sub.Tensor,
         ]:
             if input_qparams[0].dtype != input_qparams[1].dtype:
                 raise ValueError(
@@ -196,6 +198,7 @@ def _get_output_qparams(
             exir_ops.edge.aten.minimum.default,
             exir_ops.edge.aten.sum.dim_IntList,
             exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.sub.Tensor,
         ]:
             # The op has not altered the scale; the output scale is equal to
             # the operands' scales.
diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py
@@ -7,8 +7,6 @@
 
 from typing import Any, List
 
-import executorch.backends.arm.tosa.quant_utils as tqutils
-import executorch.backends.arm.tosa.utils as tutils
 import tosa_serializer as ts
 
 from executorch.backends.arm.operators.node_visitor import (
@@ -20,22 +18,20 @@
     validate_same_dtype,
     validate_valid_dtype,
 )
-from executorch.backends.arm.tosa import TosaSpecification
 from executorch.backends.arm.tosa.mapping import TosaArg
+from executorch.backends.arm.tosa.specification import TosaSpecification
 from torch.fx import Node
 
 
 @register_node_visitor
-class SubVisitor_INT(NodeVisitor):
+class SubVisitor(NodeVisitor):
     target = "aten.sub.Tensor"
 
     tosa_specs = [
         TosaSpecification.create_from_string("TOSA-1.0+INT"),
+        TosaSpecification.create_from_string("TOSA-1.0+FP"),
     ]
 
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def define_node(
         self,
         node: Node,
@@ -48,106 +44,21 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [*inputs, output],
-            [ts.DType.INT8, ts.DType.INT16, ts.DType.INT32],
+            [ts.DType.INT32, ts.DType.FP32],
             output.tosa_spec,
         )
 
-        scale_back = 1.0
-        if inputs[0].dtype == ts.DType.INT8:
-            rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32_maxscale(
-                tosa_graph, inputs, node, self.tosa_spec
-            )
-        elif inputs[0].dtype == ts.DType.INT16:
-            rescaled_inputs, scale_back = (
-                tqutils.insert_rescale_ops_int16_to_int32_maxscale(
-                    tosa_graph, inputs, node, self.tosa_spec
-                )
-            )
-        else:
-            # input[0].dtype == ts.DType.INT32
-            # Non quantized input, natively support by TOSA.SUB
-            rescaled_inputs = inputs
-
-        if output.dtype in [ts.DType.INT8, ts.DType.INT16]:
-            broadcasted_shape = tutils.tosa_shape(output.shape, output.dim_order)
-            sub_output = tosa_graph.addIntermediate(broadcasted_shape, ts.DType.INT32)
-        else:
-            # output.dtype == ts.DType.INT32
-            sub_output = output
-
-        # Do the INT32 Sub
         attr = ts.TosaSerializerAttribute()
         attr.SubAttribute()
+
         self._serialize_operator(
             node,
             tosa_graph,
             ts.Op.SUB,
             [
-                rescaled_inputs[0].name,
-                rescaled_inputs[1].name,
+                inputs[0].name,
+                inputs[1].name,
             ],
-            [sub_output.name],
+            [output.name],
             attr,
         )
-
-        if output.dtype == ts.DType.INT8:
-            # Scale output back to 8 bit
-            # pyre-ignore
-            tqutils.insert_rescale_op_to_int8(
-                tosa_graph,
-                sub_output,
-                scale_back,
-                node,
-                compute_rescale=False,
-                tosa_spec=self.tosa_spec,
-            )  # type: ignore[possibly-undefined]
-        elif output.dtype == ts.DType.INT16:
-            tqutils.insert_rescale_op_to_int16(
-                tosa_graph,
-                sub_output,
-                scale_back,
-                node,
-                compute_rescale=False,
-                tosa_spec=self.tosa_spec,
-            )  # type: ignore[possibly-undefined]
-
-
-@register_node_visitor
-class SubVisitor_FP(SubVisitor_INT):
-    # inheriting 'target' from INT class
-
-    tosa_specs = [TosaSpecification.create_from_string("TOSA-1.0+FP")]
-
-    def __init__(self, *args):
-        super().__init__(*args)
-
-    def define_node(
-        self,
-        node: Node,
-        tosa_graph: Any,
-        inputs: List[TosaArg],
-        output: TosaArg,
-    ) -> None:
-        validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output], ts)
-
-        if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
-            # Call the inherited define_node for handling integers
-            super().define_node(node, tosa_graph, inputs, output)
-        else:
-            # FP32 Sub lowering
-            validate_valid_dtype(
-                self.target, [*inputs, output], ts.DType.FP32, output.tosa_spec
-            )
-
-            # MI lowering
-            attr = ts.TosaSerializerAttribute()
-            attr.SubAttribute()
-            self._serialize_operator(
-                node,
-                tosa_graph,
-                ts.Op.SUB,
-                [inputs[0].name, inputs[1].name],
-                [output.name],
-                attr,
-            )
diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py
@@ -344,7 +344,17 @@ def test_var_dim_tosa_INT_correction(test_data: Tuple):
     pipeline.run()
 
 
-@common.parametrize("test_data", VarCorrection.test_parameters)
+# TODO: Xfail "var_3d_dims_keep_dim_0_correction" until the Ethos-U Vela compiler ships commit
+# 642f7517d3a6bd053032e1942822f6e38ccd546f. That patch fixes the bug that causes the test to fail.
+@common.parametrize(
+    "test_data",
+    VarCorrection.test_parameters,
+    xfails={
+        "var_3d_dims_keep_dim_0_correction": (
+            "Blocked by Vela commit 642f7517d3a6bd053032e1942822f6e38ccd546f"
+        ),
+    },
+)
 @common.XfailIfNoCorstone300
 def test_var_dim_u55_INT_correction(test_data: Tuple):
     test_data, dim, keepdim, correction = test_data()
diff --git a/backends/arm/test/passes/test_insert_rescale_i32_pass.py b/backends/arm/test/passes/test_insert_rescale_i32_pass.py
@@ -19,7 +19,7 @@ class MultipleOpsModel(torch.nn.Module):
     input_t = Tuple[torch.Tensor, torch.Tensor]
 
     def forward(self, x, y):
-        a = x + y
+        a = x - y
         b = x * a
         c = torch.maximum(a, b)
         d = torch.abs(b)
diff --git a/backends/arm/tosa/quant_utils.py b/backends/arm/tosa/quant_utils.py