diff --git a/docker/Dockerfile.onnx-mlir b/docker/Dockerfile.onnx-mlir
index f028ed29b3..170f3d63b8 100644
--- a/docker/Dockerfile.onnx-mlir
+++ b/docker/Dockerfile.onnx-mlir
@@ -26,7 +26,7 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \
ARG NPROC=4
ARG ACCEL=NNPA
ARG TEST_NOFLOAT16
-ARG TEST_MCPU
+ARG TEST_MARCH
ARG KEEPSRC
RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
@@ -53,21 +53,21 @@ RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
([ "$(uname -m)" = "x86_64" ] && echo true || \
([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
# User image is built with SIMD (currently on s390x only)
- && TEST_MCPU=${TEST_MCPU:-$([ "$(uname -m)" = "s390x" ] && echo z16 || \
+ && TEST_MARCH=${TEST_MARCH:-$([ "$(uname -m)" = "s390x" ] && echo z16 || \
([ "$(uname -m)" = "x86_64" ] && echo || \
([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
- && TEST_ARGS="-mcpu=${TEST_MCPU}" \
+ && TEST_ARGS="-march=${TEST_MARCH}" \
&& make check-docs \
&& make check-unittest \
&& make check-multiple-models \
&& make NPROC=${NPROC} \
CTEST_PARALLEL_LEVEL=${NPROC} \
TEST_NOFLOAT16=${TEST_NOFLOAT16} \
- TEST_MCPU=${TEST_MCPU} \
+ TEST_MARCH=${TEST_MARCH} \
TEST_ARGS="${TEST_ARGS}" \
-j${NPROC} \
check-onnx-backend-numerical \
- && if [ "${TEST_MCPU}" = "z16" ]; then \
+ && if [ "${TEST_MARCH}" = "z16" ]; then \
make NPROC=${NPROC} \
CTEST_PARALLEL_LEVEL=${NPROC} \
-j${NPROC} \
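For reference, a hedged sketch of how the renamed `TEST_MARCH` build argument might be passed when building the user image; the image tag, `NPROC` value, and build context are placeholders, and any other build arguments the image may need are omitted:

```
# Hypothetical invocation: build the user image with SIMD tests targeting z16.
# TEST_MARCH maps to the ARG introduced above; tag, NPROC, and context are placeholders.
docker build -f docker/Dockerfile.onnx-mlir \
  --build-arg NPROC=8 \
  --build-arg TEST_MARCH=z16 \
  -t onnx-mlir:z16 .
```

The dev image (`docker/Dockerfile.onnx-mlir-dev`) accepts the same `TEST_MARCH` build argument.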
diff --git a/docker/Dockerfile.onnx-mlir-dev b/docker/Dockerfile.onnx-mlir-dev
index 574737c1a9..344fa273b5 100644
--- a/docker/Dockerfile.onnx-mlir-dev
+++ b/docker/Dockerfile.onnx-mlir-dev
@@ -20,7 +20,7 @@ RUN ONNX_ROOT=${WORK_DIR}/onnx-mlir/third_party/onnx \
ARG NPROC=4
ARG ACCEL=NNPA
ARG TEST_NOFLOAT16
-ARG TEST_MCPU
+ARG TEST_MARCH
RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
&& ONNX_MLIR_ROOT=${WORK_DIR}/onnx-mlir \
@@ -51,10 +51,10 @@ RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
([ "$(uname -m)" = "x86_64" ] && echo true || \
([ "$(uname -m)" = "ppc64le" ] && echo || echo)))} \
# Dev image is built without SIMD, placeholder for easy SIMD enablement
- && TEST_MCPU=$([ "$(uname -m)" = "s390x" ] && echo || \
+ && TEST_MARCH=$([ "$(uname -m)" = "s390x" ] && echo || \
([ "$(uname -m)" = "x86_64" ] && echo || \
([ "$(uname -m)" = "ppc64le" ] && echo || echo))) \
- && TEST_ARGS="-mcpu=${TEST_MCPU}" \
+ && TEST_ARGS="-march=${TEST_MARCH}" \
&& TEST_OPTLEVEL=0 \
&& make check-docs \
&& make check-unittest \
@@ -62,7 +62,7 @@ RUN LLVM_PROJECT_ROOT=${WORK_DIR}/llvm-project \
&& make NPROC=${NPROC} \
CTEST_PARALLEL_LEVEL=${NPROC} \
TEST_NOFLOAT16=${TEST_NOFLOAT16} \
- TEST_MCPU=${TEST_MCPU} \
+ TEST_MARCH=${TEST_MARCH} \
TEST_ARGS="${TEST_ARGS}" \
TEST_OPTLEVEL=${TEST_OPTLEVEL} \
-j${NPROC} \
diff --git a/docs/DebuggingNumericalError.md b/docs/DebuggingNumericalError.md
index 62b513ff33..0eeabcb505 100644
--- a/docs/DebuggingNumericalError.md
+++ b/docs/DebuggingNumericalError.md
@@ -65,7 +65,7 @@ optional arguments:
## Helper script to compare a model under two distinct compile option.
Based on the above `utils/runONNXModel.py`, the `utils/checkONNXModel.py` allows a user to run a given model twice, under two distinct compile options, and compare its results.
-This let a user simply test a new option, comparing the safe version of the compiler (e.g. `-O0` or `-O3`) with a more advanced version (e.g. `-O3` or `-O3 -march=x86-64`). Simply specify the compile options using the `--ref-compile-args` and `--test-compile-args` flags, a model using the `--model` flag, and possibly a `--shape-info` in presence of dynamic shape inputs.
+This lets a user easily test a new option by comparing a safe version of the compiler (e.g. `-O0` or `-O3`) with a more advanced version (e.g. `-O3` or `-O3 --march=x86-64`). Simply specify the compile options using the `--ref-compile-args` and `--test-compile-args` flags, a model using the `--model` flag, and possibly a `--shape-info` in the presence of dynamic shape inputs.
Full options are listed under the `--help` flag.
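For illustration, a hedged example of the comparison described above; the model path and `--shape-info` value are placeholders rather than values taken from this patch:

```
# Minimal sketch: compare a reference -O3 build against -O3 --march=x86-64.
# The model file and shape-info string are hypothetical; see --help for all options.
python utils/checkONNXModel.py \
  --model my_model.onnx \
  --ref-compile-args "-O3" \
  --test-compile-args "-O3 --march=x86-64" \
  --shape-info "0:1x3x224x224"
```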
## Debugging the Code Generated for an Operator.
diff --git a/docs/Dialects/zhigh.md b/docs/Dialects/zhigh.md
index 4780cbe551..dd87eeecf5 100644
--- a/docs/Dialects/zhigh.md
+++ b/docs/Dialects/zhigh.md
@@ -337,6 +337,61 @@ Effects: `MemoryEffects::Effect{}`
| :----: | ----------- |
| `hn_output` | unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS
+### `zhigh.Gelu` (::onnx_mlir::zhigh::ZHighGeluOp)
+
+_ZHigh Gelu operation_
+
+"ZHigh operation to perform a Gelu."
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+approximate | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+### `zhigh.InvSqrt` (::onnx_mlir::zhigh::ZHighInvSqrtOp)
+
+_ZHigh InvSqrt operation_
+
+ZHigh operation to perform an InvSqrt.
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
### `zhigh.LSTM` (::onnx_mlir::zhigh::ZHighLSTMOp)
_ZHigh LSTM operation_
@@ -389,6 +444,37 @@ Effects: `MemoryEffects::Effect{}`
| `hn_output` | unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS
| `cf_output` | unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS
+### `zhigh.LeakyRelu` (::onnx_mlir::zhigh::ZHighLeakyReluOp)
+
+_ZHigh LeakyRelu operation_
+
+"ZHigh operation to perform a LeakyRelu."
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+alpha | ::mlir::FloatAttr | 32-bit float attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
### `zhigh.Log` (::onnx_mlir::zhigh::ZHighLogOp)
_ZHigh Log operation_
@@ -425,6 +511,14 @@ Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterfac
Effects: `MemoryEffects::Effect{}`
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+transposeA | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+transposeB | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+
+
#### Operands:
| Operand | Description |
@@ -577,6 +671,168 @@ Effects: `MemoryEffects::Effect{}`
| :----: | ----------- |
| `Out` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+### `zhigh.QuantizedMatMul` (::onnx_mlir::zhigh::ZHighQuantizedMatMulOp)
+
+_ZHigh QuantizedMatMul operation_
+
+ZHigh operation to perform a quantized MatMul.
+
+`OutRecScaleIn` and `OutOffsetIn` are recscale and offset for the output.
+If `OutRecScaleIn` is given, it will be passed to `OutRecScale`. If it is
+None, `OutRecScale` is set to 1.0.
+If `OutOffsetIn` is given, it will be passed to `OutOffset`. If it is
+None, `OutOffset` is set to 0.0.
+
+* PreComputedBias: -1 bias is pre-computed, 0: bias is not pre-computed.
+
+`DequantizeOutput` indicates if the output
+is dequantized to real dlfloat16 or not. If not, the output is int8 but stored in dlfloat (int8-as-dlfloat).
+* DequantizeOutput: -1 output is dequantized, 0: output is not dequantized.
+
+Traits: `AlwaysSpeculatableImplTrait`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+PreComputedBias | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+DisableClipping | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+DequantizeOutput | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2D or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3DS
+| `XRecScale` | 0D tensor of 32-bit float values
+| `XOffset` | 0D tensor of 32-bit float values
+| `Y` | unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2D or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3DS
+| `YRecScale` | 0D tensor of 32-bit float values
+| `YOffset` | 0D tensor of 32-bit float values
+| `B` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 8-bit signless integer or 16-bit float values or 1D tensor of 8-bit signless integer or 16-bit float values with layout _1D or unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2DS or none type
+| `BRecScale` | 0D tensor of 32-bit float values or none type
+| `BOffset` | 0D tensor of 32-bit float values or none type
+| `OutRecScaleIn` | 0D tensor of 32-bit float values or none type
+| `OutOffsetIn` | 0D tensor of 32-bit float values or none type
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2D or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS
+| `OutRecScale` | 0D tensor of 32-bit float values
+| `OutOffset` | 0D tensor of 32-bit float values
+
+### `zhigh.QuantizedStick` (::onnx_mlir::zhigh::ZHighQuantizedStickOp)
+
+_ZHigh QuantizedStick operation_
+
+ZHigh operation to perform a quantized Stick.
+Type is one of the values: dlfloat16, int8, and weights.
+`sym_mode` indicates whether symmetric quantization is used to compute the output rescale and offset.
+`sym_mode` is only effective when the input rescale and offset are None.
+By default, asymmetric quantization is used.
+
+Traits: `AlwaysSpeculatableImplTrait`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+quantized_type | ::mlir::StringAttr | string attribute |
+sym_mode | ::mlir::IntegerAttr | 64-bit signless integer attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `In` | tensor of 32-bit float values or tensor of 8-bit signless integer values or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS
+| `InRecScale` | 0D tensor of 32-bit float values or none type
+| `InOffset` | 0D tensor of 32-bit float values or none type
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 8-bit signless integer or 16-bit float values or 1D tensor of 8-bit signless integer or 16-bit float values with layout _1D or unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2D or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3D or unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2DS or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3DS or none type
+| `RecScale` | 0D tensor of 32-bit float values
+| `Offset` | 0D tensor of 32-bit float values
+
+### `zhigh.ReduceMax` (::onnx_mlir::zhigh::ZHighReduceMaxOp)
+
+_ZHigh ReduceMax operation_
+
+ZHigh operation to perform a ReduceMax.
+op_type: REDUCE_OP_MAXIMUM or REDUCE_OP_MINIMUM.
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+op_type | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `data` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `output` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+### `zhigh.ReduceMin` (::onnx_mlir::zhigh::ZHighReduceMinOp)
+
+_ZHigh ReduceMin operation_
+
+ZHigh operation to perform a ReduceMin.
+op_type: REDUCE_OP_MAXIMUM or REDUCE_OP_MINIMUM.
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeHelperOpInterface`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+op_type | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `data` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `output` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
### `zhigh.Relu` (::onnx_mlir::zhigh::ZHighReluOp)
_ZHigh Relu operation_
@@ -657,6 +913,30 @@ Effects: `MemoryEffects::Effect{}`
| :----: | ----------- |
| `Out` | unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS
+### `zhigh.Sqrt` (::onnx_mlir::zhigh::ZHighSqrtOp)
+
+_ZHigh Sqrt operation_
+
+ZHigh operation to perform a Sqrt.
+
+Traits: `AlwaysSpeculatableImplTrait`, `SameOperandsAndResultLayout`
+
+Interfaces: `ConditionallySpeculatable`, `NoMemoryEffect (MemoryEffectOpInterface)`, `ShapeInferenceOpInterface`
+
+Effects: `MemoryEffects::Effect{}`
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
+#### Results:
+
+| Result | Description |
+| :----: | ----------- |
+| `Out` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+
### `zhigh.StickForGRU` (::onnx_mlir::zhigh::ZHighStickForGRUOp)
_ZHigh stick operation for GRU_
@@ -815,7 +1095,7 @@ Effects: `MemoryEffects::Effect{}`
| Result | Description |
| :----: | ----------- |
-| `output` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH
+| `output` | unranked tensor of 16-bit float values or 1D tensor of 16-bit float values with layout _1D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2D or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3D or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4D or unranked tensor of 16-bit float values or 2D tensor of 16-bit float values with layout _2DS or unranked tensor of 16-bit float values or 3D tensor of 16-bit float values with layout _3DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout _4DS or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NCHW or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout NHWC or unranked tensor of 16-bit float values or 4D tensor of 16-bit float values with layout HWCK or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout FICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout ZRH or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BFICO or unranked tensor of 16-bit float values or 2D/3D tensor of 16-bit float values with layout BZRH or unranked tensor of 8-bit signless integer or 16-bit float values or 1D tensor of 8-bit signless integer or 16-bit float values with layout _1D or unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2D or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3D or unranked tensor of 8-bit signless integer or 16-bit float values or 2D tensor of 8-bit signless integer or 16-bit float values with layout _2DS or unranked tensor of 8-bit signless integer or 16-bit float values or 3D tensor of 8-bit signless integer or 16-bit float values with layout _3DS
### `zhigh.Sub` (::onnx_mlir::zhigh::ZHighSubOp)
diff --git a/docs/Dialects/zlow.md b/docs/Dialects/zlow.md
index ba6907fced..7be1c6457b 100644
--- a/docs/Dialects/zlow.md
+++ b/docs/Dialects/zlow.md
@@ -342,6 +342,52 @@ Interfaces: `MemoryEffectOpInterface`
| `shape` | memref of 64-bit signless integer values
| `hn_output` | memref of dlfloat16 type values
+### `zlow.gelu` (::onnx_mlir::zlow::ZLowGeluOp)
+
+_ZLow gelu operation_
+
+ZLow operation to perform a gelu.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
+### `zlow.invsqrt` (::onnx_mlir::zlow::ZLowInvSqrtOp)
+
+_ZLow invsqrt operation_
+
+ZLow operation to perform an invsqrt.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
### `zlow.lstm` (::onnx_mlir::zlow::ZLowLSTMOp)
_ZLow lstm operation_
@@ -387,6 +433,30 @@ Interfaces: `MemoryEffectOpInterface`
| `hn_output` | memref of dlfloat16 type values
| `cf_output` | memref of dlfloat16 type values
+### `zlow.leakyrelu` (::onnx_mlir::zlow::ZLowLeakyReluOp)
+
+_ZLow leakyrelu operation_
+
+ZLow operation to perform a leakyrelu.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+alpha | ::mlir::FloatAttr | 32-bit float attribute |
+layout | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
### `zlow.log` (::onnx_mlir::zlow::ZLowLogOp)
_ZLow log operation_
@@ -423,14 +493,18 @@ shape is a 1D MemRef (memref<3xi64>) whose items are:
* 2nd item: n
* 3rd item: p
* In case of stacked: X(s, m, n) * Y(s, n, p) + Bias(s, p)
- or broadcasting: X(s, m, n) * Y(n, p) + Bias(p)
+ or broadcasting1: X(m, n) * Y(s, n, p) + Bias(s, p)
+ or broadcasting23: X(s, m, n) * Y(n, p) + Bias(p)
shape is a 1D MemRef (memref<4xi64>) whose items are:
* 1st item: s
* 2nd item: m
* 3rd item: n
* 4th item: p
-* is_bcast: -1 broadcasting, 0: no broadcasting.
+* is_bcast1: -1 broadcasting1, 0: no broadcasting1.
+* is_bcast23: -1 broadcasting23, 0: no broadcasting23.
* is_stacked: -1 stacked, 0: unstacked.
+* transposeA: !0 transpose A, 0: do not transpose A.
+* transposeB: !0 transpose B, 0: do not transpose B.
Traits: `MemRefsNormalizable`
@@ -440,8 +514,11 @@ Interfaces: `MemoryEffectOpInterface`
Attribute | MLIR Type | Description |
-is_bcast | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+is_bcast1 | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+is_bcast23 | ::mlir::IntegerAttr | 64-bit signed integer attribute |
is_stacked | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+transposeA | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+transposeB | ::mlir::IntegerAttr | 64-bit signed integer attribute |
#### Operands:
@@ -592,6 +669,144 @@ Interfaces: `MemoryEffectOpInterface`
| `shape` | memref of 64-bit signless integer values
| `Out` | memref of dlfloat16 type values
+### `zlow.quantizedMatmul` (::onnx_mlir::zlow::ZLowQuantizedMatMulOp)
+
+_ZLow quantized matmul operation_
+
+ZLow operation to perform a quantized matmul.
+work_area: a 4K-aligned buffer having the same layout as bias but dlfloat16 type.
+* In case of unstacked: X(m, n) * Y(n, p) + Bias(p)
+shape is a 1D MemRef (memref<3xi64>) whose items are:
+ * 1st item: m
+ * 2nd item: n
+ * 3rd item: p
+* In case of stacked: X(s, m, n) * Y(s, n, p) + Bias(s, p)
+ or broadcasting: X(s, m, n) * Y(n, p) + Bias(p)
+shape is a 1D MemRef (memref<4xi64>) whose items are:
+ * 1st item: s
+ * 2nd item: m
+ * 3rd item: n
+ * 4th item: p
+* is_bcast: -1 broadcasting, 0: no broadcasting.
+* is_stacked: -1 stacked, 0: unstacked.
+* DequantizeOutput: -1 output is dequantized, 0: output is not dequantized.
+* PreComputedBias: -1 bias is pre-computed, 0: bias is not pre-computed.
+
+Values for `q_type` are "DLFLOAT16", "INT8", "WEIGHTS", "UNDEFINED".
+
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+x_q_type | ::mlir::StringAttr | string attribute |
+y_q_type | ::mlir::StringAttr | string attribute |
+bias_q_type | ::mlir::StringAttr | string attribute |
+out_q_type | ::mlir::StringAttr | string attribute |
+is_bcast | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+is_stacked | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+pre_computed_bias | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+disable_clipping | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+dequantize_output | ::mlir::IntegerAttr | 64-bit signed integer attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type or 8-bit signless integer values
+| `x_rec_scale` | 0D memref of 32-bit float values
+| `x_offset` | 0D memref of 32-bit float values
+| `Y` | memref of dlfloat16 type or 8-bit signless integer values
+| `y_rec_scale` | 0D memref of 32-bit float values
+| `y_offset` | 0D memref of 32-bit float values
+| `Bias` | memref of dlfloat16 type or 8-bit signless integer values
+| `bias_rec_scale` | 0D memref of 32-bit float values
+| `bias_offset` | 0D memref of 32-bit float values
+| `work_area` | memref of dlfloat16 type or 8-bit signless integer values or none type
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type or 8-bit signless integer values
+| `out_rec_scale` | 0D memref of 32-bit float values
+| `out_offset` | 0D memref of 32-bit float values
+
+### `zlow.quantizedStick` (::onnx_mlir::zlow::ZLowQuantizedStickOp)
+
+_ZLow stick operation for quantization_
+
+"ZLow operation to perform a quantization stick."
+"Type is one of values: dlfloat16, int8, and weights."
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+q_type | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of 8-bit signless integer or 32-bit float values
+| `rec_scale` | 0D memref of 32-bit float values
+| `offset` | 0D memref of 32-bit float values
+| `out` | memref of dlfloat16 type or 8-bit signless integer values
+
+### `zlow.reducemax` (::onnx_mlir::zlow::ZLowReduceMaxOp)
+
+_ZLow reducemax operation_
+
+ZLow operation to perform a reducemax.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+op_type | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `work_area` | memref of 8-bit signless integer values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
+### `zlow.reducemin` (::onnx_mlir::zlow::ZLowReduceMinOp)
+
+_ZLow reducemin operation_
+
+ZLow operation to perform a reducemin.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+op_type | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `work_area` | memref of 8-bit signless integer values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
### `zlow.relu` (::onnx_mlir::zlow::ZLowReluOp)
_ZLow relu operation_
@@ -670,6 +885,29 @@ Interfaces: `MemoryEffectOpInterface`
| `shape` | memref of 64-bit signless integer values
| `Out` | memref of dlfloat16 type values
+### `zlow.sqrt` (::onnx_mlir::zlow::ZLowSqrtOp)
+
+_ZLow sqrt operation_
+
+ZLow operation to perform a sqrt.
+
+Traits: `MemRefsNormalizable`
+
+#### Attributes:
+
+
+Attribute | MLIR Type | Description |
+layout | ::mlir::StringAttr | string attribute |
+
+
+#### Operands:
+
+| Operand | Description |
+| :-----: | ----------- |
+| `X` | memref of dlfloat16 type values
+| `shape` | memref of 64-bit signless integer values
+| `Out` | memref of dlfloat16 type values
+
### `zlow.stickForGRU` (::onnx_mlir::zlow::ZLowStickForGRUOp)
_ZLow stick operation for GRU_
diff --git a/docs/Instrumentation.md b/docs/Instrumentation.md
index 31969ff15e..25b77153b6 100644
--- a/docs/Instrumentation.md
+++ b/docs/Instrumentation.md
@@ -61,11 +61,11 @@ The output for the memory measurement is explained here.
Other example for NNPA
- Performance profiling for onnx ops before lowering to zhigh ops:
- `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=Onnx --instrument-ops=onnx.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+ `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=Onnx --instrument-ops=onnx.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
- Performance profiling for onnx and zhigh ops:
- `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=ZHigh --instrument-ops=onnx.*,zhigh.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+ `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=ZHigh --instrument-ops=onnx.*,zhigh.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
- Performance profiling for zlow ops:
- `onnx-mlir --mcpu=z16 --maccel=NNPA --instrument-stage=ZLow --instrument-ops=zlow.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
+ `onnx-mlir --march=z16 --maccel=NNPA --instrument-stage=ZLow --instrument-ops=zlow.* --InstrumentBeforeOp --InstrumentAfterOp --InstrumentReportTime mymodel.onnx`
## Control instrument at runtime
By providing certain env variable at runtime, you can disable reports from instrument library.
diff --git a/docs/SupportedONNXOps-NNPA.md b/docs/SupportedONNXOps-NNPA.md
index 80fa3287cf..a0f85aef41 100644
--- a/docs/SupportedONNXOps-NNPA.md
+++ b/docs/SupportedONNXOps-NNPA.md
@@ -8,38 +8,38 @@ Onnx-mlir currently supports ONNX operations targeting up to opset 21. Limitatio
* Operations are defined by the [ONNX Standard](https://github.com/onnx/onnx/blob/main/docs/Operators.md).
* **Supported Opsets** indicates the lowest and highest opset a model may have for onnx-mlir to support compiling a model with the operator.
* A * indicates onnx-mlir is compatible with the latest version of that operator available as of opset 21.
+ * A ^ indicates onnx-mlir is compatible with the latest level of the NNPA Architecture, which is z16.
-NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, CPU is used instead of NNPA. NNPA currently only support DLFLOAT16 as its data type. Common data formats like FP32, FP16, BFLOAT need to undergo data conversions to the NNPA internal format DLFLOAT16. Hence ONNX ops which updated their tensors to BFLOAT16 will not be natively supported on NNPA.
+NNPA has hardware limitations in dimension index size and tensor size, which are described in [NNPALimit.hpp](../src/Accelerators/NNPA/Support/NNPALimit.hpp). They are large enough for normal use cases, but if your model exceeds the limitations, the CPU is used instead of NNPA. NNPA currently supports only DLFLOAT16 as its data type. Common data formats like FP32, FP16, and BFLOAT16 need to undergo data conversions to the NNPA internal format DLFLOAT16. Hence ONNX ops whose tensors were updated to BFLOAT16 will not be natively supported on NNPA. Onnx-mlir with NNPA utilizes the hardware when possible. To accomplish this, the compiler converts ONNX ops to [ZHigh](Dialects/zhigh.md) ops and then [ZLow](Dialects/zlow.md) ops, which are processed by the [IBM Z Deep Neural Network Library (zDNN)](https://github.com/IBM/zDNN).
-| Op |Supported Opsets (inclusive) |Limitations |Notes |
-| --- |--- |--- |--- |
-| **Add** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **AveragePool** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- `ceil_mode` must be default value(0)<br>- Input and output tensors must be 4D tensors (N x C x H x W).<br>- `kernel_shape` must be static.<br>- `count_include_pad` must be default value(0).<br>- `ceil_mode` must be default value(0). | |
-| **BatchNormalization** |6 - * |Input and output tensor must be 4D(N x C x H x W). | |
-| **Conv** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- Dimension in Height and weight must be static.<br>- `group` must be default value(1).<br>- `dilations` must be default value(1).<br>- Input and output tensors must have 4D (N x C x H x W).<br>- `kernel_shape` must be static. | |
-| **ConvTranspose** |6 - * |- 1D and 3D not supported because Conv1D and Conv3D not supported in zDNN. non-default `dilations` not supported because dilated convolution not supported in zDNN. | |
-| **Div** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Exp** |6 - * |Input tensor must have 4 dimensions. | |
-| **GRU** |7 - * |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- If `B` and `initial_h` are given, they must have static dimensions.<br>- `sequence_lens` is not supported for bidirectional GRU.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `linear_before_reset` must be 1.<br>- `layout` is not supported. | |
-| **Gemm** |6 - * |- `alpha` and `beta` must be default value(1).<br>- Rank of `C` must be 1 or 2. If the rank is 1, the dimension of `C` must be the same with the seconde dimension of `B`. | |
-| **GlobalAveragePool** |6 - * |- Input shape must be 4D tensor(NCHW).<br>- Dimensions in `H` and `W` must be static. | |
-| **LSTM** |7 - * |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- `B` and `initial_h` have static dimensions if given. `B`'s direction dim must be 1 or 2.<br>- `P`(peepholes), `activation_alpha`, and `activation_beta` are not supported.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `input_forget` must be default value(0).<br>- `layout` is not supported. | |
-| **LeakyRelu** |6 - * |The operations immediately before and after the LeakyRelu operation must be executed on the NNPA. Otherwise, LeakyRelu is executed on the CPU. This limitation is set to avoid performance degradation. | |
-| **Log** |6 - * |Input tensor must have 4 dimensions. | |
-| **LogSoftmax** |6 - * | | |
-| **MatMul** |6 - * |Ranks of input tensors must be (Rank of A, Rank of B) = (M, N), where M >= 2 and N >= 2. | |
-| **Max** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **MaxPool** |6 - * |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding valid type or same upper.<br>- `ceil_mode` must be default value(0)<br>- Input and output tensors must be 4D tensors(N x C x H x W).<br>- `kernel_shape` must be static.<br>- `ceil_mode` must be default value(0).<br>- `dilations` must be default value(1). | |
-| **Min** |6 - * |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Mul** |6 - * |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Pow** |7 - * |- Exponent should be a scalar integer and less or equal to 64. | |
-| **ReduceMean** |6 - * |- `keepdims` must be 1.<br>- Input tensor must be 4D tensors and `axis` must be [2, 3]. | |
-| **Relu** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
-| **Sigmoid** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
-| **Softmax** |6 - * |- `axis` must be the last dimension, i.e. `rank - 1` or -1. | |
-| **Softplus** |6 - * |The operations immediately before and after the Softplus operation must be executed on the NNPA. Otherwise, Softplus is executed on the CPU. This limitation is set to avoid performance degradation. | |
-| **Sub** |6 - * |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
-| **Sum** |6 - * |- All inputs must have the same static shape (Broadcasting not supported.)<br>- Single input not supported. | |
-| **Tanh** |6 - * |Input tensor must be less than or equal to 4 dimensions. | |
+| Op |Supported Opsets (inclusive) |Minimum NNPA Level (Inclusive) |Limitations |Notes |
+| --- |--- |--- |--- |--- |
+| **Add** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **AveragePool** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is valid type or same upper.<br>- `ceil_mode` must be default value(0)<br>- Input and output tensors must be 4D tensors (N x C x H x W).<br>- `kernel_shape` must be static.<br>- `count_include_pad` must be default value(0).<br>- `ceil_mode` must be default value(0). | |
+| **BatchNormalization** |6 - * |z16 |Input and output tensor must be 4D(N x C x H x W). | |
+| **Conv** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is valid type or same upper.<br>- Dimension in Height and weight must be static.<br>- `group` must be default value(1).<br>- `dilations` must be default value(1).<br>- Input and output tensors must have 4D (N x C x H x W).<br>- `kernel_shape` must be static. | |
+| **ConvTranspose** |6 - * |z16 |- 1D and 3D not supported because Conv1D and Conv3D not supported in zDNN. non-default `dilations` not supported because dilated convolution not supported in zDNN. | |
+| **Div** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Exp** |6 - * |z16 |Input tensor must have 4 dimensions. | |
+| **GRU** |7 - * |z16 |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- If `B` and `initial_h` are given, they must have static dimensions.<br>- `sequence_lens` is not supported for bidirectional GRU.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `linear_before_reset` must be 1.<br>- `layout` is not supported. | |
+| **Gemm** |6 - * |z16 |- `alpha` and `beta` must be default value(1).<br>- Rank of `C` must be 1 or 2. If the rank is 1, the dimension of `C` must be the same as the second dimension of `B`. | |
+| **GlobalAveragePool** |6 - * |z16 |- Input shape must be 4D tensor(NCHW).<br>- Dimensions in `H` and `W` must be static. | |
+| **LSTM** |7 - * |z16 |- `direction` and `hidden_size` in `W` must have static dimensions.<br>- `R` must have static dimensions.<br>- `B` and `initial_h` have static dimensions if given. `B`'s direction dim must be 1 or 2.<br>- `P`(peepholes), `activation_alpha`, and `activation_beta` are not supported.<br>- `activations` must be `["Sigmoid", "Tanh", "Tanh"]`.<br>- `clip` is not supported.<br>- `input_forget` must be default value(0).<br>- `layout` is not supported. | |
+| **Log** |6 - * |z16 |Input tensor must have 4 dimensions. | |
+| **LogSoftmax** |6 - * |z16 | | |
+| **MatMul** |6 - * |z16 |Ranks of input tensors must be (Rank of A, Rank of B) = (M, N), where M >= 2 and N >= 2. | |
+| **Max** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **MaxPool** |6 - * |z16 |- `auto_pad` must be `NOTSET`, `VALID`, and `SAME_UPPER`. If `NOTSET` is used, `pads` must be set so that the padding is valid type or same upper.<br>- `ceil_mode` must be default value(0)<br>- Input and output tensors must be 4D tensors(N x C x H x W).<br>- `kernel_shape` must be static.<br>- `ceil_mode` must be default value(0).<br>- `dilations` must be default value(1). | |
+| **Min** |6 - * |z16 |- Shape of input tensors must be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Mul** |6 - * |z16 |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Pow** |7 - * |z16 |- Exponent should be a scalar integer and less or equal to 64. | |
+| **ReduceMean** |6 - * |z16 |- `keepdims` must be 1.<br>- Input tensor must be 4D tensors and `axis` must be [2, 3]. | |
+| **Relu** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
+| **Sigmoid** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
+| **Softmax** |6 - * |z16 |- `axis` must be the last dimension, i.e. `rank - 1` or -1. | |
+| **Softplus** |6 - * |z16 |The operations immediately before and after the Softplus operation must be executed on the NNPA. Otherwise, Softplus is executed on the CPU. This limitation is set to avoid performance degradation. | |
+| **Sub** |6 - * |z16 |- Shape of input tensors should be the same since broadcasting is not supported.<br>- Input tensors must have static dimensions. | |
+| **Sum** |6 - * |z16 |- All inputs must have the same static shape (Broadcasting not supported.)<br>- Single input not supported. | |
+| **Tanh** |6 - * |z16 |Input tensor must be less than or equal to 4 dimensions. | |
diff --git a/docs/Testing.md b/docs/Testing.md
index 9fafc75876..c5029f01a5 100644
--- a/docs/Testing.md
+++ b/docs/Testing.md
@@ -122,9 +122,9 @@ cmake --build . --config Release --target check-onnx-backend-signature
### Enable SIMD instructions
-On supported platforms, currently s390x only, backend tests can generate SIMD instructions for the compiled models. To enable SIMD, set the TEST_MCPU environment variable, e.g.,
+On supported platforms (currently s390x z14 and up, x86, and arm), backend tests can generate SIMD instructions for the compiled models. To enable SIMD, set the TEST_MARCH environment variable, e.g.,
```
-TEST_MCPU=z14 cmake --build . --config Release --target check-onnx-backend[-jni]
+TEST_MARCH=z16 cmake --build . --config Release --target check-onnx-backend[-jni]
```
### Execution of backend tests
@@ -294,9 +294,9 @@ If you need to change ATOL and RTOL for accuracy checks, set the environment var
### Enable SIMD instructions
-On supported platforms, currently s390x only, numerical tests can generate SIMD instructions for the compiled models. To enable SIMD, set the `TEST_ARGS` environment variable, e.g.,
+On supported platforms (currently s390x z14 and up, x86, and arm), numerical tests can generate SIMD instructions for the compiled models. To enable SIMD, set the `TEST_ARGS` environment variable, e.g.,
```
-TEST_ARGS="-mcpu=z14" CTEST_PARALLEL_LEVEL=$(nproc) cmake --build . --config Release --target check-onnx-numerical
+TEST_ARGS="-march=z16" CTEST_PARALLEL_LEVEL=$(nproc) cmake --build . --config Release --target check-onnx-numerical
```
### Testing of specific accelerators
@@ -395,7 +395,7 @@ Without specifying a model using `-m`, the script will check all models in the O
If you want to gather performance info about a model zoo (or any models, for that matter), simplest is to request the desired statistic at compile time (using `-profile-ir` flag), divert the output statistic to a file, and then analyze it using `make-report.py`. For example:
```
-> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 -march=arm64 --profile-ir=Onnx" -m bertsquad-10
+> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 --march=arm64 --profile-ir=Onnx" -m bertsquad-10
...
> make-report.py -r run.log
...
@@ -408,7 +408,7 @@ Statistics start (all ops).
The runtime profiling info can be combined with specific compile-time statistics as well. Let's say that we are interested in SIMD statistics. We inform the compiler of the compile-time statistic to emit using `-opt-report` option, and inform `RunONNXModelZoo.py` that we want to preserve the compiler output using the `--log-to-file` option. For example
```
-> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 -march=arm64 -opt-report=Simd --profile-ir=Onnx" -m bertsquad-10 --log-to-file compile.log
+> ONNX_MLIR_INSTRUMENT_FILE=run.log RunONNXModelZoo.py -c "-O3 --march=arm64 -opt-report=Simd --profile-ir=Onnx" -m bertsquad-10 --log-to-file compile.log
...
> make-report.py -c compile.log -r run.log
...
diff --git a/src/Accelerators/NNPA/CMakeLists.txt b/src/Accelerators/NNPA/CMakeLists.txt
index 51625e984b..d3687aabc9 100644
--- a/src/Accelerators/NNPA/CMakeLists.txt
+++ b/src/Accelerators/NNPA/CMakeLists.txt
@@ -33,7 +33,7 @@ else()
endif()
include(zdnn.cmake)
-setup_zdnn(v1.0.1)
+setup_zdnn(v1.1.1)
add_subdirectory(Dialect)
add_subdirectory(Conversion)
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
index 91b6aab183..52d7933888 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
@@ -101,4 +101,27 @@ llvm::cl::opt nnpaEnableSaturation("nnpa-saturation",
"Default is false."),
llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));
+llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPU("nnpa-cpu-dql",
+ llvm::cl::desc("Use dynamic quantized linear on CPU. Default is false"),
+ llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));
+
+llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPUForScaleOffset(
+ "nnpa-cpu-dql-scale",
+ llvm::cl::desc("Use dynamic quantized linear computation of "
+ " scale and offset on CPU. Default is false"),
+ llvm::cl::init(false), llvm::cl::cat(OnnxMlirCommonOptions));
+
+llvm::cl::opt<NNPAQuantType> nnpaQuantization("nnpa-quantization",
+ llvm::cl::desc("Enable quantization with a specific type. Only "
+ "MatMul whose weight is a constant is supported."),
+ llvm::cl::values(
+ clEnumVal(DynSymI8,
+ "Dynamic Quantization to signed integer 8. Asymmetric "
+ "quant for activations and symmetric quant for weights."),
+ clEnumVal(SymSymI8,
+ "Dynamic Quantization to signed integer 8. Symmetric "
+ "quant for activations and symmetric quant for weights."),
+ clEnumVal(QNONE, "No quantization (default).")),
+ llvm::cl::init(QNONE), llvm::cl::cat(OnnxMlirOptions));
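+// Example usage (assuming the standard onnx-mlir driver flags): compiling with
+// `--maccel=NNPA --nnpa-quantization=DynSymI8` enables dynamic i8 quantization
+// for MatMul ops whose weights are constants.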
+
} // namespace onnx_mlir
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
index 2b0343295c..366efee3fe 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp
@@ -55,6 +55,15 @@ typedef enum {
MuchFasterOpsWSU, /* FasterOpsWSU only if significantly faster. */
} NNPAPlacementHeuristic;
+// Quantization type
+typedef enum {
+ DynSymI8, /* Dynamic quantization to signed integer 8. Asymmetric quant for
+ activations and symmetric quant for weights.*/
+ SymSymI8, /* Dynamic quantization to signed integer 8. Symmetric quant for
+ activations and symmetric quant for weights.*/
+  QNONE, /* No quantization (default). */
+} NNPAQuantType;
+
extern llvm::cl::OptionCategory OnnxMlirOptions;
extern llvm::cl::OptionCategory OnnxMlirCommonOptions;
extern llvm::cl::opt nnpaEmissionTarget;
@@ -68,6 +77,9 @@ extern llvm::cl::opt profileZHighIR;
extern llvm::cl::opt nnpaLoadDevicePlacementFile;
extern llvm::cl::opt nnpaSaveDevicePlacementFile;
extern llvm::cl::opt nnpaEnableSaturation;
+extern llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPU;
+extern llvm::cl::opt<bool> nnpaUseDynamicQuantizeLinearOnCPUForScaleOffset;
+extern llvm::cl::opt<NNPAQuantType> nnpaQuantization;
} // namespace onnx_mlir
#endif
diff --git a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
index eefe6b9a15..d7c5cfcac0 100644
--- a/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
+++ b/src/Accelerators/NNPA/Compiler/NNPACompilerUtils.cpp
@@ -36,6 +36,7 @@
#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps.hpp"
#include "src/Accelerators/NNPA/Dialect/ZLow/ZLowOps.hpp"
#include "src/Accelerators/NNPA/Pass/NNPAPasses.hpp"
+#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Compiler/CompilerOptions.hpp"
#include "src/Compiler/CompilerPasses.hpp"
#include "src/Pass/Passes.hpp"
@@ -49,9 +50,9 @@ namespace onnx_mlir {
void configurePassesNNPA() {
configureOnnxToZHighLoweringPass(optReport == OptReport::NNPAUnsupportedOps);
- // Compiler generated sticks supports saturation, so force its usage.
- // TODO: remove this if zDNN adds support for saturation.
- if (nnpaEnableSaturation)
+  // z16 does not support hardware saturation,
+  // so force the use of compiler-generated sticks.
+ if (nnpaEnableSaturation && isLessEqualNNPALevel(NNPALevel::M14))
nnpaEnableCompilerStickUnstick = true;
}
@@ -84,7 +85,7 @@ void addONNXToZHighPasses(mlir::PassManager &pm) {
pm.addNestedPass(
onnx_mlir::createInstrumentPass(instrumentOps, instrumentActions));
- pm.addPass(onnx_mlir::createONNXToZHighPass());
+ pm.addPass(onnx_mlir::createONNXToZHighPass(nnpaQuantization));
pm.addNestedPass(onnx_mlir::createShapeInferencePass());
// There are more opportunities for const propagation once all zhigh ops were
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
index 58e6439897..47724d8d3e 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
@@ -200,13 +200,13 @@ void DevicePlacementPass::runOnOperation() {
// Call ONNXToZHigh pass for lowering multiple ONNX ops at once to ZHigh.
// E.g. `onnx.ReLu (onnx.Conv)` to zhigh.Conv.
RewritePatternSet Patterns2(context);
- getONNXToZHighMultipleOpPatterns(Patterns2);
+ getONNXToZHighMultipleOpPatterns(Patterns2, nnpaQuantization);
(void)applyAnalysisConversion(module, target, std::move(Patterns2),
ConversionConfig{.legalizableOps = &legalizedOps2});
// Call ONNXToZHigh pass for lowering a single ONNX op to ZHigh.
RewritePatternSet Patterns3(context);
- getONNXToZHighOneOpPatterns(Patterns3);
+ getONNXToZHighOneOpPatterns(Patterns3, nnpaQuantization);
getONNXToZHighOneOpDynamicallyLegal(&target, &dimAnalysis);
(void)applyAnalysisConversion(module, target, std::move(Patterns3),
ConversionConfig{.legalizableOps = &legalizedOps3});
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.cpp
index 621c8ffcbf..aa161a9f9e 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.cpp
@@ -15,6 +15,7 @@
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
+#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps/ShapeHelper.hpp"
#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Compiler/CompilerOptions.hpp"
#include "src/Conversion/ONNXToKrnl/RNN/RNNBase.hpp"
@@ -38,12 +39,16 @@ bool onnxToZHighUnsupportedReport(Operation *op, const std::string &message) {
/// Report incompatibility with NNPA Level.
bool onnxToZHighInCompatibilityReport(
- Operation *op, std::string inputNNPALevel) {
- std::string message =
- "onnx-mlir NNPA level (" + inputNNPALevel +
- ") is not compatible with NNPA level specified by '-mcpu'(" + mcpu +
- ").";
- return onnxToZHighUnsupportedReport(op, message);
+ Operation *op, const std::string &message) {
+ std::string compilerNNPALevelStr = getNNPAString(getNNPAFromFlags());
+ std::string errorMessage =
+ "onnx-mlir NNPA level \"" + message + "\" is not compatible with " +
+ "NNPA level specified by \"" + compilerNNPALevelStr + "\".";
+ return onnxToZHighUnsupportedReport(op, errorMessage);
+}
+
+bool onnxToZHighInCompatibilityReport(Operation *op, NNPALevel level) {
+ return onnxToZHighInCompatibilityReport(op, getNNPAString(level));
}
/// A function to check whether a value's element type is valid for zAIU or not.
@@ -357,8 +362,8 @@ template <>
bool isSuitableForZDNN(
ONNXAddOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16)) {
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14)) {
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
}
if (!isValidElementTypeAndRank(op.getOperation(), op.getA()))
return false;
@@ -376,8 +381,8 @@ template <>
bool isSuitableForZDNN(
ONNXSubOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getA()))
return false;
if (!isValidElementTypeAndRank(op.getOperation(), op.getB()))
@@ -394,8 +399,8 @@ template <>
bool isSuitableForZDNN(
ONNXMulOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getA()))
return false;
if (!isValidElementTypeAndRank(op.getOperation(), op.getB()))
@@ -414,8 +419,8 @@ bool isSuitableForZDNN(
Value A = op.getA();
Value B = op.getB();
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Broadcast with a scalar operand.
if (isEnableScalarBcastBinary()) {
if (isF32ScalarConstantTensor(A) &&
@@ -442,8 +447,8 @@ template <>
bool isSuitableForZDNN(
ONNXSumOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Do not support a single input.
if (op.getData_0().size() < 2)
return onnxToZHighUnsupportedReport(op.getOperation(),
@@ -473,8 +478,8 @@ template <>
bool isSuitableForZDNN(
ONNXMinOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
int64_t opnum = op.getNumOperands();
if (opnum != 2)
return onnxToZHighUnsupportedReport(op.getOperation(),
@@ -491,13 +496,13 @@ bool isSuitableForZDNN(
}
/// Check legality for ONNXMax.
-/// zDNN Min/Max do not support boradcasting, and getNumOperands != 2.
+/// zDNN Min/Max do not support broadcasting, and getNumOperands != 2.
template <>
bool isSuitableForZDNN(
ONNXMaxOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
int64_t opnum = op.getNumOperands();
if (opnum != 2)
return onnxToZHighUnsupportedReport(op.getOperation(),
@@ -520,8 +525,8 @@ template <>
bool isSuitableForZDNN(
ONNXSoftmaxOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getInput()))
return false;
ShapedType inputType = mlir::cast(op.getType());
@@ -541,13 +546,37 @@ bool isSuitableForZDNN(
return true;
}
+/// Check legality for ONNXLeakyRelu.
+template <>
+bool isSuitableForZDNN<ONNXLeakyReluOp>(
+ ONNXLeakyReluOp op, const DimAnalysis *dimAnalysis) {
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
+ if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
+ return false;
+ return true;
+}
+
/// Check legality for ONNXRelu.
template <>
bool isSuitableForZDNN(
ONNXReluOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
+ if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
+ return false;
+ return true;
+}
+
+/// Check legality for ONNXGelu.
+template <>
+bool isSuitableForZDNN<ONNXGeluOp>(
+ ONNXGeluOp op, const DimAnalysis *dimAnalysis) {
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
return false;
return true;
@@ -558,8 +587,8 @@ template <>
bool isSuitableForZDNN(
ONNXTanhOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getInput()))
return false;
return true;
@@ -570,8 +599,20 @@ template <>
bool isSuitableForZDNN(
ONNXSigmoidOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
+ if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
+ return false;
+ return true;
+}
+
+/// Check legality for ONNXSqrt.
+template <>
+bool isSuitableForZDNN<ONNXSqrtOp>(
+ ONNXSqrtOp op, const DimAnalysis *dimAnalysis) {
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
return false;
return true;
@@ -582,8 +623,8 @@ template <>
bool isSuitableForZDNN(
ONNXLogOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getInput()))
return false;
return true;
@@ -594,8 +635,8 @@ template <>
bool isSuitableForZDNN(
ONNXExpOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
if (!isValidElementTypeAndRank(op.getOperation(), op.getInput()))
return false;
return true;
@@ -606,8 +647,8 @@ template <>
bool isSuitableForZDNN(
ONNXMatMulOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
int64_t opnum = op.getNumOperands();
if (opnum != 2)
return onnxToZHighUnsupportedReport(op.getOperation(),
@@ -663,10 +704,10 @@ bool isSuitableForZDNN(
}
return true;
} else if ((shapeA.size() == 3) && (shapeB.size() == 2)) {
- // stacked w/ bcast
+ // stacked w/ bcast23 case
if (aType.hasStaticShape() && bType.hasStaticShape()) {
if (shapeA[2] != shapeB[0]) {
- std::string message = "Stacked w/ bcast case: the 3rd dim of A (" +
+ std::string message = "Stacked w/ bcast23 case: the 3rd dim of A (" +
std::to_string(shapeA[2]) +
") and the 1st dim of B (" +
std::to_string(shapeB[0]) + ") are not the same.";
@@ -674,6 +715,21 @@ bool isSuitableForZDNN(
}
}
return true;
+ } else if ((shapeA.size() == 2) && (shapeB.size() == 3)) {
+ // stacked w/ bcast1 case
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(
+ op.getOperation(), NNPALevel::M15);
+ if (aType.hasStaticShape() && bType.hasStaticShape()) {
+ if (shapeA[1] != shapeB[1]) {
+ std::string message = "Stacked w/ bcast1 case: the 2nd dim of A (" +
+ std::to_string(shapeA[1]) +
+ ") and the 2nd dim of B (" +
+ std::to_string(shapeB[1]) + ") are not the same.";
+ return onnxToZHighUnsupportedReport(op.getOperation(), message);
+ }
+ }
+ return true;
}
std::string message = "Dim size of A(" + std::to_string(shapeA.size()) +
") and B(" + std::to_string(shapeB.size()) +
@@ -681,6 +737,141 @@ bool isSuitableForZDNN(
return onnxToZHighUnsupportedReport(op.getOperation(), message);
}
+/// Check legality for ONNXMatMulInteger.
+template <>
+bool isSuitableForZDNN<ONNXMatMulIntegerOp>(
+ ONNXMatMulIntegerOp op, const DimAnalysis *dimAnalysis) {
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
+
+ // Only support per-tensor quantization.
+ Value AZeroPoint = op.getAZeroPoint();
+ Value BZeroPoint = op.getBZeroPoint();
+ if (!isScalarTensor(AZeroPoint))
+ return onnxToZHighInCompatibilityReport(
+ op.getOperation(), "A's zeropoint is not scalar");
+ if (!isScalarTensor(BZeroPoint))
+ return onnxToZHighInCompatibilityReport(
+ op.getOperation(), "B's zeropoint is not scalar");
+
+  ShapedType aType = mlir::cast<ShapedType>(op.getA().getType());
+  ShapedType bType = mlir::cast<ShapedType>(op.getB().getType());
+
+ // Illegal if A or B is unranked.
+ if (!aType.hasRank() || !bType.hasRank())
+ return false;
+
+ auto shapeA = aType.getShape();
+ auto shapeB = bType.getShape();
+
+ // In case of Tensors with unknown dimension, check only size of matrices.
+ // Actual shape is not checked. If actual shape does not meet, get error at
+ // runtime.
+ // TODO: Support other cases
+ // (https://github.com/onnx/onnx/blob/main/docs/Operators.md#MatMul) on zDNN
+ // by using broadcasting etc.
+ if ((shapeA.size() == 2) && (shapeB.size() == 2)) {
+ // unstacked case
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return (shapeA[1] == shapeB[0]);
+ else
+ return true;
+ } else if ((shapeA.size() == 3) && (shapeB.size() == 3)) {
+ // stacked w/o bcast case
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return ((shapeA[0] == shapeB[0]) && (shapeA[2] == shapeB[1]));
+ else
+ return true;
+ } else if ((shapeA.size() == 3) && (shapeB.size() == 2)) {
+ // stacked w/ bcast
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return (shapeA[2] == shapeB[0]);
+ else
+ return true;
+ }
+
+ return false; // unsupported case
+}
+
+/// Check legality for ONNXQLinearMatMul.
+template <>
+bool isSuitableForZDNN<ONNXQLinearMatMulOp>(
+ ONNXQLinearMatMulOp op, const DimAnalysis *dimAnalysis) {
+ Value A = op.getA();
+ Value AScale = op.getAScale();
+ Value AZeroPoint = op.getAZeroPoint();
+ Value B = op.getB();
+ Value BScale = op.getBScale();
+ Value BZeroPoint = op.getBZeroPoint();
+ Value Y = op.getY();
+ Value YScale = op.getYScale();
+
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
+
+ // Only support float32 <-> int8/uint8.
+ Type elemTyA = getElementType(A.getType());
+ Type elemTyAScale = getElementType(AScale.getType());
+ Type elemTyB = getElementType(B.getType());
+ Type elemTyBScale = getElementType(BScale.getType());
+ Type elemTyY = getElementType(Y.getType());
+ Type elemTyYScale = getElementType(YScale.getType());
+
+ if (!elemTyAScale.isF32() || !elemTyBScale.isF32() || !elemTyYScale.isF32())
+ return false;
+ if (!(elemTyA.isInteger(8) || elemTyA.isUnsignedInteger(8)))
+ return false;
+ if (!(elemTyB.isInteger(8) || elemTyB.isUnsignedInteger(8)))
+ return false;
+ if (!(elemTyY.isInteger(8) || elemTyY.isUnsignedInteger(8)))
+ return false;
+
+ // Only support per-tensor quantization.
+ if (!isScalarTensor(AScale) || !isScalarTensor(BScale) ||
+ !isScalarTensor(AZeroPoint) || !isScalarTensor(BZeroPoint))
+ return false;
+
+  ShapedType aType = mlir::cast<ShapedType>(A.getType());
+  ShapedType bType = mlir::cast<ShapedType>(B.getType());
+
+ // Illegal if A or B is unranked.
+ if (!aType.hasRank() || !bType.hasRank())
+ return false;
+
+ auto shapeA = aType.getShape();
+ auto shapeB = bType.getShape();
+
+ // In case of Tensors with unknown dimension, check only size of matrices.
+ // Actual shape is not checked. If actual shape does not meet, get error at
+ // runtime.
+ // TODO: Support other cases
+ // (https://github.com/onnx/onnx/blob/main/docs/Operators.md#MatMul) on zDNN
+ // by using broadcasting etc.
+ if ((shapeA.size() == 2) && (shapeB.size() == 2)) {
+ // unstacked case
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return (shapeA[1] == shapeB[0]);
+ else
+ return true;
+ } else if ((shapeA.size() == 3) && (shapeB.size() == 3)) {
+ // stacked w/o bcast case
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return ((shapeA[0] == shapeB[0]) && (shapeA[2] == shapeB[1]));
+ else
+ return true;
+ } else if ((shapeA.size() == 3) && (shapeB.size() == 2)) {
+ // stacked w/ bcast
+ if (aType.hasStaticShape() && bType.hasStaticShape())
+ return (shapeA[2] == shapeB[0]);
+ else
+ return true;
+ }
+
+ return false; // unsupported case
+}
+
/// Check legality for ONNXGemm.
template <>
bool isSuitableForZDNN(
@@ -690,8 +881,8 @@ bool isSuitableForZDNN(
Value C = op.getC();
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check data type.
if (!isValidElementTypeAndRank(op.getOperation(), A))
@@ -759,13 +950,99 @@ bool isSuitableForZDNN(
return true;
}
+// Common function for ReduceMax and ReduceMin
+template <typename OP_TYPE>
+static bool checkReduceParam(OP_TYPE op) {
+ IndexExprBuilderForAnalysis createIE(op.getLoc());
+
+ // Check NNPA level.
+ if (!isCompatibleWithNNPALevel(NNPALevel::M15))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M15);
+
+ // Check data type.
+ Value data = op.getData();
+ if (!isValidElementTypeAndRank(op.getOperation(), data))
+ return false;
+
+ // Check axes value
+ Value axesVal = op.getAxes();
+ if (!isDenseONNXConstant(axesVal))
+ return false;
+
+ ONNXConstantOp axesConstant =
+      mlir::cast<ONNXConstantOp>(axesVal.getDefiningOp());
+  int64_t axesInt = getScalarValue<int64_t>(axesConstant);
+
+ int64_t keepdims = op.getKeepdims();
+ int64_t noop_with_empty_axes = op.getNoopWithEmptyAxes();
+ int64_t rank = createIE.getShapedTypeRank(data);
+
+ // Check if axes (int64) is exactly a size of one
+ if (floor(log10(axesInt)) + 1 == 1) {
+ int64_t axis = axesInt;
+ // Accepted range is [-r, r-1] where r = rank(data)
+ if (axis < -rank || axis > rank - 1) {
+ std::string message =
+ "The `axis` is out of the accepted range which is [-r, r-1]";
+ return onnxToZHighUnsupportedReport(op, message);
+ }
+ if ((axis != -1) && (axis != rank - 1)) {
+ std::string message = "The `axis` must be the innermost dimension. ";
+ return onnxToZHighUnsupportedReport(op, message);
+ }
+ } else {
+ std::string message = "Axes can only be a scalar size of one. ";
+ return onnxToZHighUnsupportedReport(op, message);
+ }
+
+ // REMINDER: Should we check the input tensor rank.
+
+ // Check keepdims and noop_with_empty_axes, we only support the default
+ // value. Attributes: keepdims (default is 1) and noop_with_empty_axes
+ // (default is 0)
+ if ((noop_with_empty_axes == 1) || (keepdims == 0)) {
+ std::string message = "`noop_with_empty_axes` (" +
+ std::to_string(noop_with_empty_axes) +
+ ") must be 0 and `keepdims` (" +
+ std::to_string(keepdims) + ") must be 1.";
+ return onnxToZHighUnsupportedReport(op, message);
+ }
+ return true;
+}
+
+/// Check legality for ONNXReduceMax.
+template <>
+bool isSuitableForZDNN<ONNXReduceMaxOp>(
+ ONNXReduceMaxOp op, const DimAnalysis *dimAnalysis) {
+
+ // Check parameter restrictions for ReduceMax
+ bool isReduceMax = checkReduceParam(op);
+ if (!isReduceMax)
+ return false;
+
+ return true;
+}
+
+/// Check legality for ONNXReduceMin.
+template <>
+bool isSuitableForZDNN<ONNXReduceMinOp>(
+ ONNXReduceMinOp op, const DimAnalysis *dimAnalysis) {
+
+ // Check parameter restrictions for ReduceMin
+ bool isReduceMin = checkReduceParam(op);
+ if (!isReduceMin)
+ return false;
+
+ return true;
+}
+
/// Check legality for ONNXReduceMeanV13.
template <>
bool isSuitableForZDNN(
ONNXReduceMeanV13Op op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check data type.
if (!isValidElementTypeAndRank(op.getOperation(), op.getData()))
@@ -826,7 +1103,7 @@ template <>
bool isSuitableForZDNN(
ONNXSoftplusOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
return false;
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
return false;
@@ -844,8 +1121,8 @@ bool isSuitableForZDNN(
Value B = op.getB();
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check direction.
if ((direction != FORWARD) && (direction != REVERSE) &&
@@ -869,7 +1146,8 @@ bool isSuitableForZDNN(
std::string message =
"The first dimension of weight tensor `W` for `num_directions` (" +
std::to_string(wShape[0]) +
- ") must be 1 or 2, and the second dimension of it for `hidden_size` (" +
+ ") must be 1 or 2, and the second dimension of it for `hidden_size` "
+ "(" +
std::to_string(wShape[1]) + ") must be static.";
return onnxToZHighUnsupportedReport(op.getOperation(), message);
}
@@ -877,9 +1155,9 @@ bool isSuitableForZDNN(
ArrayRef rShape = mlir::cast(R.getType()).getShape();
if (!mlir::cast(R.getType()).hasStaticShape() ||
(rShape[0] != 1 && rShape[0] != 2)) {
- std::string message =
- "The recurrence weight tensor `R` must have static dimension, and the "
- "first dimension of it must be 1 or 2.";
+ std::string message = "The recurrence weight tensor `R` must have static "
+ "dimension, and the "
+ "first dimension of it must be 1 or 2.";
return onnxToZHighUnsupportedReport(op.getOperation(), message);
}
// Check hidden_size.
@@ -957,8 +1235,8 @@ bool isSuitableForZDNN(
Value B = op.getB();
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check direction.
if ((direction != FORWARD) && (direction != REVERSE) &&
@@ -982,7 +1260,8 @@ bool isSuitableForZDNN(
std::string message =
"The first dimension of weight tensor `W` for `num_directions` (" +
std::to_string(wShape[0]) +
- ") must be 1 or 2, and the second dimension of it for `hidden_size` (" +
+ ") must be 1 or 2, and the second dimension of it for `hidden_size` "
+ "(" +
std::to_string(wShape[1]) + ") must be static.";
return onnxToZHighUnsupportedReport(op.getOperation(), message);
}
@@ -1062,8 +1341,8 @@ template <>
bool isSuitableForZDNN(
ONNXMaxPoolSingleOutOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check data type.
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
@@ -1094,8 +1373,8 @@ template <>
bool isSuitableForZDNN(
ONNXAveragePoolOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check data type.
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
@@ -1111,9 +1390,9 @@ bool isSuitableForZDNN(
ONNXAveragePoolOpShapeHelper>(op, op.getY(), dimAnalysis);
}
-/// Check if input, output, kernel, strides, and paddingType for each axis meet
-/// parameter restrictions for conv2d. See "Conv2D Parameter Restrictions"
-/// in "zDNN API Reference"
+/// Check if input, output, kernel, strides, and paddingType for each axis
+/// meet parameter restrictions for conv2d. See "Conv2D Parameter
+/// Restrictions" in "zDNN API Reference"
static bool checkConv2DParamRestrictions(Operation *op, int64_t inputDim,
int64_t kernelDim, int64_t stride, int64_t outputDim,
StringRef paddingType) {
@@ -1218,8 +1497,8 @@ template <>
bool isSuitableForZDNN(
ONNXConvOp op, const DimAnalysis *dimAnalysis) {
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// Check data type.
if (!isValidElementTypeAndRank(op.getOperation(), op.getX()))
@@ -1255,7 +1534,8 @@ bool isSuitableForZDNN(
ShapedType::isDynamic(shapeOutput[2]) ||
ShapedType::isDynamic(shapeOutput[3]))
return onnxToZHighUnsupportedReport(op,
- "Height and/or width have dynamic dimensions. They are not supported.");
+ "Height and/or width have dynamic dimensions. They are not "
+ "supported.");
// Do not support group.
if (operandAdaptor.getGroup() != 1)
@@ -1271,7 +1551,8 @@ bool isSuitableForZDNN(
}
// `getStrPaddingType` returns `SAME_PADDING`, `VALID_PADDING`, or empty.
- // `zdnn_conv2d` only support padding for `SAME_PADDING` and `VALID_PADDING`.
+ // `zdnn_conv2d` only support padding for `SAME_PADDING` and
+ // `VALID_PADDING`.
StringRef paddingType =
getStrPaddingType(
op);
@@ -1324,8 +1605,8 @@ bool isSuitableForZDNN(
ArrayRef shapeOutput = outputType.getShape();
// Check NNPA level.
- if (!isCompatibleWithNNPALevel(NNPA_Z16))
- return onnxToZHighInCompatibilityReport(op.getOperation(), NNPA_Z16);
+ if (!isCompatibleWithNNPALevel(NNPALevel::M14))
+ return onnxToZHighInCompatibilityReport(op.getOperation(), NNPALevel::M14);
// 4D tensors(N x C x H x W) are supported as input and output.
if (shapeInput.size() != 4 || shapeOutput.size() != 4)
@@ -1344,3 +1625,19 @@ bool isSuitableForZDNN(
// Noop Reshape is suitable for zAIU as this pass removes such reshape ops.
return isIdentityReshape(op, dimAnalysis);
}
+
+/// Check legality for ONNXDequantizeLinearOp.
+template <>
+bool isSuitableForZDNN<ONNXDequantizeLinearOp>(
+ ONNXDequantizeLinearOp op, const DimAnalysis *dimAnalysis) {
+ // The pass rewrite-onnx-for-zhigh has a rule to rewrite the pattern
+ // `DequantizeLinear (QLinearMatMul inputs)` where ND inputs are reshaped
+ // into 3D inputs. This rule uses the function template
+ // `addDynamicallyLegalOpFor` to define legality using a custom lambda
+ // function instead of `isSuitableForZDNN`. Hence, the legality here should
+ // not be used/called. This legality is here to complete the function
+ // template `addDynamicallyLegalOpFor` so that it's not failed when building
+ // the compiler.
+ llvm_unreachable("Not used");
+ return false;
+}
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp
index f9c36372c4..09bfa6f4f6 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXLegalityCheck.hpp
@@ -17,6 +17,7 @@
#ifndef ONNX_MLIR_LEGALITY_H
#define ONNX_MLIR_LEGALITY_H
+#include "src/Accelerators/NNPA/Support/NNPALimit.hpp"
#include "src/Conversion/ONNXToKrnl/ONNXToKrnlCommon.hpp"
#include "src/Dialect/ONNX/ONNXDimAnalysis.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
@@ -53,6 +54,8 @@ bool onnxToZHighUnsupportedReport(
mlir::Operation *op, const std::string &message);
bool onnxToZHighInCompatibilityReport(
- mlir::Operation *op, std::string inputNNPALevel);
+ mlir::Operation *op, const std::string &message);
+
+bool onnxToZHighInCompatibilityReport(mlir::Operation *op, NNPALevel level);
#endif
diff --git a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.cpp b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.cpp
index 2bfa450691..78e94a6a2a 100644
--- a/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.cpp
+++ b/src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.cpp
@@ -4,7 +4,7 @@
//====------ ONNXToZHigh.cpp - ONNX dialect to ZHigh lowering -------------===//
//
-// Copyright 2019-2022 The IBM Research Authors.
+// Copyright 2019-2024 The IBM Research Authors.
//
// =============================================================================
//
@@ -13,6 +13,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Debug.h"
+
+#include "src/Accelerators/NNPA/Compiler/NNPACompilerOptions.hpp"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHigh.hpp"
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXToZHighCommon.hpp"
#include "src/Accelerators/NNPA/Dialect/ZHigh/ZHighOps.hpp"
@@ -25,6 +28,8 @@
#include "src/Dialect/ONNX/ONNXOps/ShapeHelper.hpp"
#include "src/Dialect/ONNX/Transforms/ShapeInference.hpp"
+#define DEBUG_TYPE "onnx-to-zhigh"
+
using namespace mlir;
//
@@ -33,6 +38,17 @@ using namespace mlir;
namespace onnx_mlir {
+using namespace zhigh;
+
+#define QUANT_PATTERN_BENEFIT 1000
+
+/// Checks whether a constant tensor's elements are of type FloatType.
+bool isFloatType(Value constValue) {
+ ElementsAttr constElements = getElementAttributeFromONNXValue(constValue);
+ Type elemType = constElements.getElementType();
+  return mlir::isa<FloatType>(elemType);
+}
+
ArrayAttr getLSTMGRUBiasSplitShape(
Location loc, PatternRewriter &rewriter, ArrayRef shapeR) {
int64_t hiddenSize = shapeR[2];
@@ -253,6 +269,349 @@ SmallVector getArrayStrides(OP op) {
return shapeHelper.strides;
}
+/// Get approximate
+template <typename OP>
+StringRef getStrApproximateType(OP op) {
+ return op.getApproximate();
+}
+
+// Computes the folded bias to be passed to quantized matmul call when
+// operation is MATMUL_OP_ADDITION. Zb should be equal to 0, meaning the
+// correction term for input_a is also equal to 0. This allows the
+// correction term for input_b to be folded into qc_tilde, which removes the
+// need for correction being applied after the quantized matmul call.
+//
+// The original equation for qc_tilde is:
+// M = (Sa * Sb) / Sy
+// qc_tilde = Zy - (Sc / Sy) * Zc + (Sc / Sy) * input_c[j] + M*N*Za*Zb
+//
+// Given Zb = 0, the equation becomes:
+// M = (Sa * Sb) / Sy
+// qc_tilde = Zy - (Sc / Sy) * Zc + (Sc / Sy) * input_c[j]
+//
+// Given scales are stored as the reciprocal in zTensor, the modified equation
+// becomes:
+// M = RSy / (RSa * RSb)
+// qc_tilde = Zy - (RSy / RSc) * Zc + (RSy / RSc) * input_c[j]
+//
+// where RS = 1/S.
+//
+// We can reorder this to:
+// M = RSy / (RSa * RSb)
+// qc_tilde = input_c[j] * (RSy / RSc) + Zy - (RSy / RSc) * Zc
+//
+// This allows us to pre-compute a scale and offset to apply to input_c[j]:
+// M = RSy / (RSa * RSb).
+// scale = (RSy / RSc)
+// offset = Zy - scale * Zc
+// qc_tilde[j] = input_c[j] * scale + offset
+//
+// The original equation for the correction term for input_b is:
+// M = (RSa * RSb) / RSy
+// term_b = M * Za * sum(input_b[:,j])
+//
+// Given scales are stored as the reciprocal, the modified equation becomes:
+// M = RSy / (RSa * RSb)
+// term_b = M * Za * sum(input_b[:,j])
+//
+// This gives us the equation:
+// M = RSy / (RSa * RSb)
+// MZa = M * Za
+// scale = (RSy / RSc)
+// offset = Zy - scale * Zc
+// qc_tilde[j] = input_c[j] * scale + offset - MZa * sum(input_b[:,j])
+//
+// In case of MatMulInteger, input_c = 0, RSc = 1, Zc = 0, the final equation
+// is:
+// M = RSy / (RSa * RSb)
+// MZa = M * Za
+// scale = RSy
+// offset = Zy
+// qc_tilde[j] = offset - Za * (RSy / RSa / RSb) * sum(input_b[:,j])
+//
+// When Zy = 0, qc_tilde[j] = -Za * (RSy / RSa / RSb) * sum(input_b[:,j])
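+//
+// As a concrete numeric illustration of the last formula (values chosen
+// arbitrarily): with RSa = 2, RSb = 4, RSy = 1, Za = 3, Zy = 0 and
+// sum(input_b[:,j]) = 10 for some column j,
+//   M = 1 / (2 * 4) = 0.125, M * Za = 0.375,
+//   qc_tilde[j] = 0 - 0.375 * 10 = -3.75.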
+static void preComputeBias(MultiDialectBuilder<OnnxBuilder> &create, Value RSa,
+ Value Za, Value BI8, Value RSb, Value RSy, Value Zy, Value &qcTilde,
+ Value &RSqctilde, Value &Zqctilde) {
+ OpBuilder rewriter = create.getBuilder();
+ Location loc = create.getLoc();
+
+ Type i64Ty = rewriter.getI64Type();
+ Type f32Ty = rewriter.getF32Type();
+  auto cstMinus2Attr = DenseElementsAttr::get(
+      RankedTensorType::get({}, i64Ty), static_cast<int64_t>(-2));
+  auto cst0Attr = DenseElementsAttr::get(
+      RankedTensorType::get({}, f32Ty), static_cast<float>(0));
+  auto cst1Attr = DenseElementsAttr::get(
+      RankedTensorType::get({}, f32Ty), static_cast<float>(1));
+
+ Value cst0 = create.onnx.constant(cst0Attr);
+ Value cst1 = create.onnx.constant(cst1Attr);
+
+ // Can be optimized further when Zy is zero.
+ bool ZyIsZero = isDenseONNXConstant(Zy) && isConstOf(Zy, 0.);
+
+ Value qcF32;
+ Value B = create.onnx.cast(BI8, f32Ty);
+ Value lastSecondAxis = create.onnx.constant(cstMinus2Attr);
+ // Emit: sum(input_b[:,j])
+ Value BSum = create.onnx.reduceSum(
+ UnrankedTensorType::get(f32Ty), B, lastSecondAxis, false);
+ // RSy, RSa, RSb, Za are scalar, do scalar computation.
+ // Emit: Za * (RSy / RSa / RSb)
+ Value RSyRSa = create.onnx.div(RSy, RSa);
+ Value RSyRSaRSb = create.onnx.div(RSyRSa, RSb);
+ Value MZa = create.onnx.mul(RSyRSaRSb, Za);
+ // Negate ZaRSyRSa to avoid broadcasting Sub:
+ // `Zy - Za * (RSy / RSa / RSb) * ...`
+ MZa = create.onnx.sub(cst0, MZa);
+ // Broadcast ops.
+ // Emit: - Za * (RSy / RSa / RSb) * sum(input_b[:,j])
+ Value MZaBSum = create.onnx.mul(MZa, BSum);
+ // Emit: Zy - Za * (RSy / RSa / RSb) * sum(input_b[:,j])
+ if (ZyIsZero) {
+ qcF32 = MZaBSum;
+ } else {
+ qcF32 = create.onnx.add(Zy, MZaBSum);
+ }
+
+ // Use 1 for recscale and 0 for offset. This is a dlfloat16 stickification.
+ int64_t rank = getRank(qcF32.getType());
+ StringAttr layoutAttr =
+ rewriter.getStringAttr((rank == 1) ? LAYOUT_1D : LAYOUT_2DS);
+  ZHighQuantizedStickOp qcOp = rewriter.create<ZHighQuantizedStickOp>(loc,
+ qcF32, cst1, cst0, layoutAttr, rewriter.getStringAttr(QTYPE_DLFLOAT16));
+ qcTilde = qcOp.getResult(0);
+ RSqctilde = qcOp.getResult(1);
+ Zqctilde = qcOp.getResult(2);
+}
+
+static Value getOrCastToI8(Value val, MultiDialectBuilder<OnnxBuilder> &create,
+ bool simpleCast = false) {
+ if (!getElementType(val.getType()).isUnsignedInteger())
+ return val;
+
+ Type i8Ty = create.getBuilder().getI8Type();
+ if (simpleCast)
+ return create.onnx.cast(val, i8Ty);
+
+ // Use int16 to avoid integer overflow.
+ Type i16Ty = create.getBuilder().getI16Type();
+ auto cst128Attr = DenseElementsAttr::get(
+      RankedTensorType::get({}, i16Ty), static_cast<int16_t>(128));
+ Value valI16 = create.onnx.cast(val, i16Ty);
+ valI16 = create.onnx.sub(valI16, create.onnx.constant(cst128Attr));
+ Value valI8 = create.onnx.cast(valI16, i8Ty);
+ return valI8;
+}
+
+// Dynamic quantization helper to match and rewrite values A, B, C of A*B+C.
+class DynQuantI8PatternHelper {
+public:
+ DynQuantI8PatternHelper(PatternRewriter &rewriter, Location loc,
+ Operation *op, Value A, Value B, Value C, bool symForA)
+ : rewriter(rewriter), loc(loc), op(op), A(A), B(B), C(C),
+ symForA(symForA) {}
+
+ // Check the inputs A, B, C of `A*B+C` to see if they are suitable for doing
+ // dynamic quantization on NNPA.
+ LogicalResult match() {
+ // A is of f32.
+    if (!mlir::isa<FloatType>(getElementType(A.getType())))
+ return rewriter.notifyMatchFailure(op, "MatMul's A is not of f32.");
+
+ // Weight is a constant.
+ if (!isDenseONNXConstant(B))
+ return rewriter.notifyMatchFailure(op, "MatMul's B is not a constant.");
+
+ if (C) {
+ // Bias is a constant.
+ if (!isDenseONNXConstant(C))
+ return rewriter.notifyMatchFailure(op, "MatMul's C is not a constant");
+ // B and C shapes must be consistent. The reduction shape of B on the
+ // second dim from the last is the same as the shape of B, e.g. If B is
+ // [2x3x4], C must be [2x4].
+      ArrayRef<int64_t> bShape = getShape(B.getType());
+      ArrayRef<int64_t> cShape = getShape(C.getType());
+ int64_t bRank = bShape.size();
+ int64_t cRank = cShape.size();
+ if (bRank - 1 != cRank)
+ return rewriter.notifyMatchFailure(
+            op, "The ranks of B and C are incompatible.");
+ if (bShape[bRank - 1] != cShape[cRank - 1])
+ return rewriter.notifyMatchFailure(
+ op, "The last dimensions of B and C are not the same.");
+ if (bShape.drop_back(2) != cShape.drop_back(1))
+ return rewriter.notifyMatchFailure(
+            op, "The shapes of B and C are incompatible.");
+ }
+
+ return success();
+ }
+
+ // clang-format off
+ /*
+ * Emit the following code to compute `A*B+C` using i8 dynamic quantization.
+ * A can be quantized using asymmetric or symmetric quantization depending on
+ * the flag `symForA`, while B is always quantized using symmetric quantization.
+ * (Note that: If C is given, it will be added into the pre_computed_bias)
+ *
+ * ```
+ * (Quantize A using asymmetric/symmetric quant by setting `sym_mode` attr to the `symForA` flag)
+ * %qa, %a_recscale, %a_offset = zhigh.QuantizedStick(%A, none, none) { quantized_type = QUANTIZED_DLFLOAT16, sym_mode = 1/0}
+ *
+ * (Quantize B using symmetric quant)
+   * %b_offset = 0 // Symmetric quant mode for i8. Offset is always zero, qmin = -127, qmax = 127.
+ * %absmax = onnx.ReduceMax(onnx.Abs(%B))
+ * %b_rescale = onnx.Div(127, absmax)
+ * %qb = onnx.cast(onnx.Clip(onnx.Round(onnx.Mul(%B, %b_rescale)), qmin, qmax))
+ * %qb, %b_recscale, %b_offset = zhigh.QuantizedStick(%qb, %b_recscale, %b_offset) { quantized_type = QUANTIZED_WEIGHTS_INT8 }
+ *
+ * (Pre computed bias, %C is added)
+ * %qc = emit_ops_for_pre_computed_bias_at_compile_time
+ * %qc = zhigh.Add(%qc, zhigh.Stick(%C)) // only done if C is given.
+ * %qc_recscale = 1
+ * %qc_offset = 0
+ *
+ * %Y_recscale = 1
+ * %Y_offset = 0
+ * %Y, %Y_recscale, %Y_offset = zhigh.QuantizedMatMul (%qa, %a_recscale, %a_offset,
+ * %qb, %b_recscale, %b_offset,
+ * %qc, %c_recscale, %c_offset,
+ * %Y_recscale, %Y_offset) {
+ * PreComputedBias = true, DisableClipping = true, DequantizeOutput = false
+ * }
+ * ```
+ *
+ * where the computation of `%qb` and `%qb_recscale` are expected to be folded by constant
+ * propagation so that they become constants.
+ *
+ * For more information about dynamic quantization, see https://www.maartengrootendorst.com/blog/quantization
+ */
+ // clang-format on
+ Value rewriteSym() {
+    MultiDialectBuilder<OnnxBuilder> create(rewriter, loc);
+
+ Type i8Ty = rewriter.getIntegerType(8);
+ Type si64Ty = rewriter.getIntegerType(64, true);
+ Type f16Ty = rewriter.getF16Type();
+ Type f32Ty = rewriter.getF32Type();
+ RankedTensorType scalarTy = RankedTensorType::get({}, f32Ty);
+
+ IntegerAttr trueAttr = rewriter.getIntegerAttr(si64Ty, -1);
+ IntegerAttr falseAttr = rewriter.getIntegerAttr(si64Ty, 0);
+
+ Value none = create.onnx.none();
+    Value cst0 = create.onnx.constant(
+        DenseElementsAttr::get(scalarTy, static_cast<float>(0)));
+    Value cst1 = create.onnx.constant(
+        DenseElementsAttr::get(scalarTy, static_cast<float>(1)));
+    Value cst127 = create.onnx.constant(
+        DenseElementsAttr::get(scalarTy, static_cast<float>(127)));
+    Value cstNeg127 = create.onnx.constant(
+        DenseElementsAttr::get(scalarTy, static_cast<float>(-127)));
+
+ int64_t rankA = getRank(A.getType());
+ int64_t rankB = getRank(B.getType());
+ StringAttr aLayoutAttr =
+ rewriter.getStringAttr((rankA == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ StringAttr bLayoutAttr =
+ rewriter.getStringAttr((rankB == 2) ? LAYOUT_2D : LAYOUT_3DS);
+
+ // Quantize and stickify A.
+ IntegerAttr symModeAttr =
+ rewriter.getIntegerAttr(rewriter.getI64Type(), symForA ? 1 : 0);
+ ZHighQuantizedStickOp qAOp =
+        rewriter.create<ZHighQuantizedStickOp>(loc, A, none, none, aLayoutAttr,
+ rewriter.getStringAttr(QTYPE_DLFLOAT16), symModeAttr);
+ Value AI8 = qAOp.getResult(0);
+ Value ARecScale = qAOp.getResult(1);
+ Value AOffset = qAOp.getResult(2);
+
+ // Quantize B. All computations here would be folded by constprop.
+ // Though computation here can be generalized for other integer types by
+ // changing qmin and qmax, we optimize it for i8 since NNPA supports i8 only
+ // at this moment.
+ // Symmetric mode for i8, meaning offset = 0, qmin = -127, qmax = 127.
+ Value BOffset = cst0;
+ Value qmin = cstNeg127;
+ Value qmax = cst127;
+ // %absmax = onnx.ReduceMax(onnx.Abs(%B))
+ // %b_rescale = onnx.Div(127, absmax)
+ Value absMax =
+ create.onnx.reduceMax(scalarTy, create.onnx.abs(B), none, false, false);
+ Value BRecScale = create.onnx.div(cst127, absMax);
+ // %qb = onnx.Cast(
+ // onnx.Clip(onnx.Round(onnx.Mul(%B, %b_rescale)), qmin, qmax))
+ Value BI8 = create.onnx.cast(
+ create.onnx.clip(
+ create.onnx.round(create.onnx.mul(B, BRecScale)), qmin, qmax),
+ i8Ty);
+ // Stickify B.
+ ZHighQuantizedStickOp qBOp =
+        rewriter.create<ZHighQuantizedStickOp>(loc, BI8, BRecScale, BOffset,
+ bLayoutAttr, rewriter.getStringAttr(QTYPE_WEIGHTS));
+
+ // Output information.
+ Value YRecScale = cst1;
+ Value YOffset = cst0;
+
+ // When A is also quantized using symmetric mode, both correction terms for
+ // A and B are canceled out. Thus, no precomputation is needed.
+ Value qcTilde = none, qcTildeRecScale = cst1, qcTildeOffset = cst0;
+ if (!symForA) {
+ // When only B is quantized using symmetric mode, precompute the
+ // correction term for B only.
+ preComputeBias(create, ARecScale, AOffset, BI8, BRecScale, YRecScale,
+ YOffset, qcTilde, qcTildeRecScale, qcTildeOffset);
+ }
+ // Add up C into bias if C is given.
+ if (C) {
+ int64_t rankC = getRank(C.getType());
+ assert((rankC == rankB - 1) &&
+ "C has a wrong shape to be added into pre_computed_bias");
+ assert((rankC == 1 || rankC == 2) && "Wrong rank for C");
+ StringAttr cLayoutAttr =
+ rewriter.getStringAttr((rankC == 1) ? LAYOUT_1D : LAYOUT_2DS);
+      Value stickC = rewriter.create<ZHighStickOp>(loc, C, cLayoutAttr);
+ if (symForA)
+ qcTilde = stickC;
+ else
+        qcTilde = rewriter.create<ZHighAddOp>(
+ loc, qcTilde.getType(), qcTilde, stickC);
+ }
+
+ // Emit zhigh.QuantizedMatMul.
+ // No need to dequantize since Y's rescale is 1.
+ // Do not clip the output values to i8, keep i32.
+    SmallVector<Type> resTypes;
+ resTypes.emplace_back(UnrankedTensorType::get(f16Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ ZHighQuantizedMatMulOp zhighQuantizedMatMulOp =
+        rewriter.create<ZHighQuantizedMatMulOp>(loc, resTypes, AI8, ARecScale,
+ AOffset, qBOp.getResult(0), BRecScale, BOffset, qcTilde,
+ qcTildeRecScale, qcTildeOffset,
+ /*OutRecScale*/ YRecScale, /*OutOffset*/ YOffset,
+ /*PreComputedBias*/ trueAttr, /*DisableClipping*/ trueAttr,
+ /*DequantizeOutput*/ falseAttr);
+ (void)zhighQuantizedMatMulOp.inferShapes([](Region ®ion) {});
+
+ // Unstickify the matmul result that is int8-as-float.
+    Value res = rewriter.create<ZHighUnstickOp>(
+ loc, zhighQuantizedMatMulOp.getResult(0));
+ return res;
+ }
+
+private:
+ PatternRewriter &rewriter;
+ Location loc;
+ Operation *op;
+ Value A, B, C;
+ // Whether do symmetric quant for activation input A or not.
+ bool symForA = false;
+};
+
//===----------------------------------------------------------------------===//
// ONNX to ZHigh Lowering Pass
//===----------------------------------------------------------------------===//
@@ -262,9 +621,9 @@ namespace {
#include "src/Accelerators/NNPA/Conversion/ONNXToZHigh/ONNXONNXToZHigh.inc"
// Enhance 'replaceONNXSumOpPatternRecursion' to allow operating recursively.
-struct ONNXSumOpPatternEnhancedRecursion
+struct replaceONNXSumOpPatternEnhancedRecursion
: public replaceONNXSumOpPatternRecursion {
- ONNXSumOpPatternEnhancedRecursion(MLIRContext *context)
+ replaceONNXSumOpPatternEnhancedRecursion(MLIRContext *context)
: replaceONNXSumOpPatternRecursion(context) {}
void initialize() {
// This pattern recursively unpacks one variadic operand at a time. The
@@ -274,6 +633,892 @@ struct ONNXSumOpPatternEnhancedRecursion
}
};
+/**
+ * This is a pattern for doing i8 dynamic quantization (symmetric mode) for
+ * onnx.MatMul(%A, %B), where %B is a constant.
+ */
+
+class replaceONNXMatMulByDynQuantI8Pattern
+    : public OpRewritePattern<ONNXMatMulOp> {
+public:
+  using OpRewritePattern<ONNXMatMulOp>::OpRewritePattern;
+
+  replaceONNXMatMulByDynQuantI8Pattern(
+      MLIRContext *context, PatternBenefit benefit = 1, bool symForA = false)
+      : OpRewritePattern<ONNXMatMulOp>(context, benefit), symForA(symForA) {}
+
+ LogicalResult matchAndRewrite(
+ ONNXMatMulOp mmOp, PatternRewriter &rewriter) const override {
+ Location loc = mmOp.getLoc();
+ Operation *op = mmOp.getOperation();
+ Value A = mmOp.getA();
+ Value B = mmOp.getB();
+
+ // Dynamic quantization helper.
+ DynQuantI8PatternHelper dqHelper(rewriter, loc, op, A, B, nullptr, symForA);
+
+ // Match
+    if (!isSuitableForZDNN<ONNXMatMulOp>(mmOp) || failed(dqHelper.match()))
+ return rewriter.notifyMatchFailure(op, "MatMul is not suitable for zDNN");
+
+ // Rewrite
+ Value res = dqHelper.rewriteSym();
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+private:
+ bool symForA = false;
+};
+
+/**
+ * This is a pattern for doing i8 dynamic quantization (symmetric mode) for
+ * `onnx.Add(onnx.MatMul(%A, %B), %C)`. where
+ * - %B and %C are a constant and
+ * - %B and %C must have compatible shape, i.e. the reduction shape on the last
+ * second dim of %B is the same as %C's shape.
+ */
+class replaceONNXMatMulAddByDynQuantI8Pattern
+    : public OpRewritePattern<ONNXAddOp> {
+public:
+  using OpRewritePattern<ONNXAddOp>::OpRewritePattern;
+
+  replaceONNXMatMulAddByDynQuantI8Pattern(
+      MLIRContext *context, PatternBenefit benefit = 1, bool symForA = false)
+      : OpRewritePattern<ONNXAddOp>(context, benefit), symForA(symForA) {}
+
+ LogicalResult matchAndRewrite(
+ ONNXAddOp addOp, PatternRewriter &rewriter) const override {
+ Location loc = addOp.getLoc();
+ Operation *op = addOp.getOperation();
+ Value lhs = addOp.getOperand(0);
+ Value rhs = addOp.getOperand(1);
+
+ // Match A*B+C and C+A*B where B and C are constants, and then rewrite.
+ Value AB, C;
+    if (!areDefinedBy<ONNXMatMulOp>(lhs, rhs, AB, C))
+ return rewriter.notifyMatchFailure(
+ op, "MatMulAdd is not suitable for zDNN.");
+    ONNXMatMulOp mmOp = AB.getDefiningOp<ONNXMatMulOp>();
+ Value A = mmOp.getA();
+ Value B = mmOp.getB();
+
+ // Match A, B, C.
+ DynQuantI8PatternHelper dqHelper(rewriter, loc, op, A, B, C, symForA);
+ if (succeeded(dqHelper.match())) {
+ Value res = dqHelper.rewriteSym();
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+ return failure();
+ }
+
+private:
+ bool symForA = false;
+};
+
+/**
+ * This is a pattern for doing i8 dynamic quantization (symmetric mode) for
+ * onnx.Gemm(%A, %B, %C), where %B and %C are constants.
+ *
+ * This pattern is applied only when the compiler option
+ * `--nnpa-quantization={DynSymI8|SymSymI8}` is specified.
+ *
+ */
+
+class replaceONNXGemmByDynQuantI8Pattern : public OpRewritePattern<ONNXGemmOp> {
+public:
+  using OpRewritePattern<ONNXGemmOp>::OpRewritePattern;
+
+  replaceONNXGemmByDynQuantI8Pattern(
+      MLIRContext *context, PatternBenefit benefit = 1, bool symForA = false)
+      : OpRewritePattern<ONNXGemmOp>(context, benefit), symForA(symForA) {}
+
+ LogicalResult matchAndRewrite(
+ ONNXGemmOp gemmOp, PatternRewriter &rewriter) const override {
+ Location loc = gemmOp.getLoc();
+ Operation *op = gemmOp.getOperation();
+
+ Value A = gemmOp.getA();
+ Value B = gemmOp.getB();
+ Value C = gemmOp.getC();
+ bool transA = (gemmOp.getTransA() != 0);
+ bool transB = (gemmOp.getTransB() != 0);
+
+ // Dynamic quantization helper.
+ DynQuantI8PatternHelper dqHelper(
+ rewriter, loc, op, A, B, isNoneValue(C) ? nullptr : C, symForA);
+
+ // Match
+ // TODO: if B is a constant and it is transposed, we can do transpose
+ // explicitly.
+ if (transA || transB)
+ return rewriter.notifyMatchFailure(op, "Gemm is with transpose");
+ if (!isSuitableForZDNN(gemmOp))
+ return rewriter.notifyMatchFailure(op, "Gemm is not suitable for zDNN");
+ if (failed(dqHelper.match()))
+ return failure();
+
+ // Rewrite
+ Value res = dqHelper.rewriteSym();
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+private:
+ bool symForA = false;
+};
+
+class replaceONNXMatMulIntegerPattern
+ : public OpRewritePattern<ONNXMatMulIntegerOp> {
+public:
+ using OpRewritePattern<ONNXMatMulIntegerOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(
+ ONNXMatMulIntegerOp mmiOp, PatternRewriter &rewriter) const override {
+ Location loc = mmiOp.getLoc();
+ Operation *op = mmiOp.getOperation();
+ MultiDialectBuilder<OnnxBuilder> create(rewriter, loc);
+
+ // Match
+ if (failed(canBeRewritten(rewriter, mmiOp)))
+ return failure();
+
+ Type si64Ty = rewriter.getIntegerType(64, true);
+ Type f16Ty = rewriter.getF16Type();
+ Type f32Ty = rewriter.getF32Type();
+ Type outElemTy = getElementType(mmiOp.getY().getType());
+ IntegerAttr trueAttr = rewriter.getIntegerAttr(si64Ty, -1);
+ IntegerAttr falseAttr = rewriter.getIntegerAttr(si64Ty, 0);
+
+ auto cst0Attr = DenseElementsAttr::get(
+ RankedTensorType::get({}, f32Ty), static_cast<float>(0));
+ auto cst1Attr = DenseElementsAttr::get(
+ RankedTensorType::get({}, f32Ty), static_cast<float>(1));
+ Value none = create.onnx.none();
+ Value zero = create.onnx.constant(cst0Attr);
+ Value zeroI64 = create.onnx.constantInt64({0});
+ Value one = create.onnx.constant(cst1Attr);
+
+ // Prepare inputs for zhigh QuantizedMatMul.
+
+ // I8 tensors
+ Value AI8 = getOrCastToI8(mmiOp.getA(), create, true);
+ Value BI8 = getOrCastToI8(mmiOp.getB(), create, true);
+
+ // Zero points in f32.
+ Value AZeroPointI8 = mmiOp.getAZeroPoint();
+ if (getRank(AZeroPointI8.getType()) == 1) {
+ // Normalize the 1-D zeropoint tensor to a scalar tensor.
+ AZeroPointI8 = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(AZeroPointI8.getType())),
+ AZeroPointI8, {zeroI64});
+ }
+ AZeroPointI8 = getOrCastToI8(AZeroPointI8, create, true);
+ Value AZeroPointF32 = create.onnx.cast(AZeroPointI8, f32Ty);
+ // TESTING: minus zeropoint in advance to cancel out the software part of
+ // zdnn quantized matmul.
+ // AI8 = create.onnx.sub(AI8, AZeroPointI8);
+ // Value AZeroPointF32 = zero;
+ Value BZeroPointI8 = mmiOp.getBZeroPoint();
+ if (getRank(BZeroPointI8.getType()) == 1) {
+ // Normalize the 1-D zeropoint tensor to a scalar tensor.
+ BZeroPointI8 = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(BZeroPointI8.getType())),
+ BZeroPointI8, {zeroI64});
+ }
+ BZeroPointI8 = getOrCastToI8(BZeroPointI8, create, true);
+ Value BZeroPointF32 = create.onnx.cast(BZeroPointI8, f32Ty);
+ // TESTING: minus zeropoint in advance to cancel out the software part of
+ // zdnn quantized matmul.
+ // BI8 = create.onnx.sub(BI8, BZeroPointI8);
+ // Value BZeroPointF32 = zero;
+ Value YZeroPointF32 = zero;
+
+ // Recscale in f32.
+ // Set recscale of A and B to 1. In dynamic quantization the output of
+ // MatMulInteger is scaled later outside the op.
+ Value ARecScale = one;
+ Value BRecScale = one;
+ Value YRecScale = one;
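+ // With unit recscales and a zero output offset, the quantized matmul keeps
+ // the integer scale of MatMulInteger's i32 output; any rescaling (typically
+ // onnx.Mul by a_scale * b_scale, as in the pattern_in_bert example below)
+ // stays in the graph and is applied afterwards.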
+
+ // Only pre-compute bias when B is a constant and BZeroPoint is zero.
+ bool canPreComputeBias = isDenseONNXConstant(BI8) &&
+ isDenseONNXConstant(BZeroPointI8) &&
+ isConstOf(BZeroPointI8, 0.0);
+
+ // Stickify AI8: transform AI8 into zTensor format.
+ int64_t rankA = getRank(AI8.getType());
+ StringAttr aLayoutAttr =
+ rewriter.getStringAttr((rankA == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ ZHighQuantizedStickOp qAOp =
+ rewriter.create<ZHighQuantizedStickOp>(loc, AI8, ARecScale,
+ AZeroPointF32, aLayoutAttr, rewriter.getStringAttr(QTYPE_INT8));
+
+ // Stickify BI8. It is potentially folded at compile time.
+ int64_t rankB = getRank(BI8.getType());
+ StringAttr bLayoutAttr =
+ rewriter.getStringAttr((rankB == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ ZHighQuantizedStickOp qBOp =
+ rewriter.create<ZHighQuantizedStickOp>(loc, BI8, BRecScale,
+ BZeroPointF32, bLayoutAttr, rewriter.getStringAttr(QTYPE_WEIGHTS));
+
+ // Bias is none or precomputed.
+ Value qcTilde, qcTildeRecScale, qcTildeZeroPointF32;
+ if (canPreComputeBias)
+ preComputeBias(create, ARecScale, AZeroPointF32, BI8, BRecScale,
+ YRecScale, YZeroPointF32, qcTilde, qcTildeRecScale,
+ qcTildeZeroPointF32);
+
+ // Emit zhigh.QuantizedMatMul. Bias is none.
+ // Do not dequantize, we want to keep the integer values that will be scaled
+ // outside this op.
+ // Do not clip the output values to i8, keep i32.
+ SmallVector<Type> resTypes;
+ resTypes.emplace_back(UnrankedTensorType::get(f16Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ ZHighQuantizedMatMulOp zhighQuantizedMatMulOp =
+ rewriter.create<ZHighQuantizedMatMulOp>(loc, resTypes,
+ qAOp.getResult(0), qAOp.getResult(1), qAOp.getResult(2),
+ qBOp.getResult(0), qBOp.getResult(1), qBOp.getResult(2),
+ /*Bias*/ canPreComputeBias ? qcTilde : none,
+ /*BiasRecScale*/ canPreComputeBias ? qcTildeRecScale : none,
+ /*BiasOffset*/ canPreComputeBias ? qcTildeZeroPointF32 : none,
+ /*OutRecScale*/ YRecScale, /*OutOffset*/ YZeroPointF32,
+ /*PreComputedBias*/ canPreComputeBias ? trueAttr : falseAttr,
+ /*DisableClipping*/ trueAttr,
+ /*DequantizeOutput*/ falseAttr);
+ (void)zhighQuantizedMatMulOp.inferShapes([](Region &region) {});
+
+ // Unstickify the matmul result that is int8-as-float.
+ Value resI8F32 = rewriter.create<ZHighUnstickOp>(
+ loc, zhighQuantizedMatMulOp.getResult(0));
+ Value res = create.onnx.cast(resI8F32, outElemTy);
+
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+ static mlir::LogicalResult canBeRewritten(
+ PatternRewriter &rewriter, ONNXMatMulIntegerOp mmiOp) {
+ if (!isSuitableForZDNN(mmiOp))
+ return rewriter.notifyMatchFailure(
+ mmiOp, "MatMulInteger is not suitable for zDNN");
+ return success();
+ }
+};
+
+// Replace by zhigh ops the following pattern:
+// clang-format off
+// func.func @pattern_in_bert(%X: tensor<?x?x768xf32>) -> tensor<?x?x768xf32> {
+// %y = onnx.Constant dense_resource<__elided__> : tensor<768x768xi8>
+// %y_scale = onnx.Constant dense<0.00656270096> : tensor<f32>
+// %y_zero_point = onnx.Constant dense<0> : tensor<i8>
+//
+// %x, %x_scale, %x_zero_point = "onnx.DynamicQuantizeLinear"(%X) : (tensor<?x?x768xf32>) -> (tensor<?x?x768xui8>, tensor<f32>, tensor<ui8>)
+//
+// %matmul = "onnx.MatMulInteger"(%x, %y, %x_zero_point, %y_zero_point) : (tensor<?x?x768xui8>, tensor<768x768xi8>, tensor<ui8>, tensor<i8>) -> tensor<?x?x768xi32>
+// %cast = "onnx.Cast"(%matmul) {saturate = 1 : si64, to = f32} : (tensor<?x?x768xi32>) -> tensor<?x?x768xf32>
+// %mul_1 = "onnx.Mul"(%cast, %x_scale) : (tensor<?x?x768xf32>, tensor<f32>) -> tensor<?x?x768xf32>
+// %mul_2 = "onnx.Mul"(%mul_1, %y_scale) : (tensor<?x?x768xf32>, tensor<f32>) -> tensor<?x?x768xf32>
+//
+// return %mul_2 : tensor<?x?x768xf32>
+// }
+// clang-format on
+class replaceMatMulIntegerSubGraphFromMulPattern
+ : public OpRewritePattern<ONNXMulOp> {
+public:
+ using OpRewritePattern<ONNXMulOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(
+ ONNXMulOp mulOp, PatternRewriter &rewriter) const override {
+ Location loc = mulOp.getLoc();
+ Operation *op = mulOp.getOperation();
+ MultiDialectBuilder<OnnxBuilder> create(rewriter, loc);
+
+ // Match
+ Value A, AI8, AScale, AZeroPointI8, BI8, BScale, BZeroPointI8;
+ if (failed(canBeRewritten(rewriter, mulOp, A, AI8, AScale, AZeroPointI8,
+ BI8, BScale, BZeroPointI8)))
+ return failure();
+
+ Type si64Ty = rewriter.getIntegerType(64, true);
+ Type f16Ty = rewriter.getF16Type();
+ Type f32Ty = rewriter.getF32Type();
+ IntegerAttr trueAttr = rewriter.getIntegerAttr(si64Ty, -1);
+ IntegerAttr falseAttr = rewriter.getIntegerAttr(si64Ty, 0);
+ Value none = create.onnx.none();
+
+ // Only pre-compute bias when BZeroPoint is zero.
+ bool canPreComputeBias = isDenseONNXConstant(BI8) &&
+ isDenseONNXConstant(BZeroPointI8) &&
+ isConstOf(BZeroPointI8, 0.0);
+
+ // Stickify A.
+ int64_t rankA = getRank(A.getType());
+ StringAttr aLayoutAttr =
+ rewriter.getStringAttr((rankA == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ ZHighQuantizedStickOp qAOp;
+ if (nnpaUseDynamicQuantizeLinearOnCPU) {
+ Value zeroI64 = create.onnx.constantInt64({0});
+ // Input A was quantized on CPU by onnx.DynamicQuantizeLinear: f32 to ui8.
+ if (getRank(AZeroPointI8.getType()) == 1) {
+ // Normalize the 1-D zeropoint tensor to a scalar tensor.
+ AZeroPointI8 = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(AZeroPointI8.getType())),
+ AZeroPointI8, {zeroI64});
+ }
+ AZeroPointI8 = getOrCastToI8(AZeroPointI8, create, true);
+ Value AZeroPointF32 = create.onnx.cast(AZeroPointI8, f32Ty);
+ Value ARecScale = create.onnx.reciprocal(AScale);
+ AI8 = getOrCastToI8(AI8, create, true);
+ // Stickify the quantized input A to ztensor format.
+ qAOp = rewriter.create<ZHighQuantizedStickOp>(loc, AI8, ARecScale,
+ AZeroPointF32, aLayoutAttr, rewriter.getStringAttr(QTYPE_INT8));
+ } else {
+ // Stickify input A to dlfloat16, and it will be quantized internally by
+ // the NNPA quantized matmul.
+ qAOp = rewriter.create<ZHighQuantizedStickOp>(loc, A, none, none,
+ aLayoutAttr, rewriter.getStringAttr(QTYPE_DLFLOAT16));
+ }
+ Value qA = qAOp.getResult(0);
+ Value ARecScale = qAOp.getResult(1);
+ Value AZeroPoint = qAOp.getResult(2);
+
+ // Stickify B. It is potentially folded at compile time.
+ int64_t rankB = getRank(BI8.getType());
+ StringAttr bLayoutAttr =
+ rewriter.getStringAttr((rankB == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ Value BRecScale = create.onnx.reciprocal(BScale);
+ Value BZeroPoint = create.onnx.cast(BZeroPointI8, f32Ty);
+ ZHighQuantizedStickOp qBOp =
+ rewriter.create<ZHighQuantizedStickOp>(loc, BI8, BRecScale, BZeroPoint,
+ bLayoutAttr, rewriter.getStringAttr(QTYPE_WEIGHTS));
+ Value qB = qBOp.getResult(0);
+
+ // Output's rescale and zeropoint
+ auto cst0Attr =
+ DenseElementsAttr::get(RankedTensorType::get({}, f32Ty), (float)0);
+ auto cst1Attr =
+ DenseElementsAttr::get(RankedTensorType::get({}, f32Ty), (float)1);
+ Value OutRecScale = create.onnx.constant(cst1Attr);
+ Value OutZeroPoint = create.onnx.constant(cst0Attr);
+
+ // Bias is none or precomputed.
+ Value qcTilde, qcTildeRecScale, qcTildeZeroPoint;
+ if (canPreComputeBias)
+ preComputeBias(create, ARecScale, AZeroPoint, BI8, BRecScale, OutRecScale,
+ OutZeroPoint, qcTilde, qcTildeRecScale, qcTildeZeroPoint);
+
+ // Emit zhigh.QuantizedMatMul.
+ SmallVector<Type> resTypes;
+ resTypes.emplace_back(UnrankedTensorType::get(f16Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ ZHighQuantizedMatMulOp zhighQuantizedMatMulOp =
+ rewriter.create<ZHighQuantizedMatMulOp>(loc, resTypes, qA, ARecScale,
+ AZeroPoint, qB, BRecScale, BZeroPoint,
+ /*Bias*/ canPreComputeBias ? qcTilde : none,
+ /*BiasRecScale*/ canPreComputeBias ? qcTildeRecScale : none,
+ /*BiasOffset*/ canPreComputeBias ? qcTildeZeroPoint : none,
+ /*OutRecScale*/ OutRecScale, /*OutOffset*/ OutZeroPoint,
+ /*PreComputedBias*/ canPreComputeBias ? trueAttr : falseAttr,
+ /*DequantizeOutput*/ trueAttr);
+ (void)zhighQuantizedMatMulOp.inferShapes([](Region &region) {});
+
+ // Unstickify the matmul result.
+ Value res = rewriter.create<ZHighUnstickOp>(
+ loc, zhighQuantizedMatMulOp.getResult(0));
+
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+ // clang-format off
+ // func.func @pattern_in_bert(%A) {
+ // // A is dynamically quantized.
+ // %a, %a_scale, %a_zero_point = "onnx.DynamicQuantizeLinear"(%A)
+ //
+ // // B is a constant and already quantized.
+ // %b = onnx.Constant
+ // %b_scale = onnx.Constant
+ // %b_zero_point = onnx.Constant
+ //
+ //
+ // %matmul = "onnx.MatMulInteger"(%a, %b, %a_zero_point, %b_zero_point)
+ //
+ // // Scale the output.
+ // %mm_f32 = "onnx.Cast"(%matmul) {to = f32}
+ // %mm_a_scale = "onnx.Mul"(%mm_f32, %a_scale)
+ // %mm_ab_scale = "onnx.Mul"(%mm_a_scale, %b_scale)
+ //
+ // return %mm_ab_scale
+ // }
+ // clang-format on
+ static mlir::LogicalResult canBeRewritten(PatternRewriter &rewriter,
+ ONNXMulOp mulOp, Value &A, Value &AI8, Value &AScale, Value &AZeroPoint,
+ Value &BI8, Value &BScale, Value &BZeroPoint) {
+
+ // Match `cast(mm_out) * a_scale * b_scale` to find the two scales; at this
+ // point we do not yet know which scale belongs to A and which to B.
+ Value scale1, scale2;
+ ONNXCastOp castOp;
+ ONNXMulOp mulScaleOp;
+
+ Value opr1 = mulOp.getOperand(0);
+ Value opr2 = mulOp.getOperand(1);
+
+ // Match cast(mm_out) * (a_scale * b_scale)
+ castOp = opr1.getDefiningOp<ONNXCastOp>();
+ mulScaleOp = opr2.getDefiningOp<ONNXMulOp>();
+ bool foundScales = false;
+ if (castOp && mulScaleOp && isScalarTensor(opr2)) {
+ Value lhs = mulScaleOp.getOperand(0);
+ Value rhs = mulScaleOp.getOperand(1);
+ if (isScalarTensor(lhs) && isScalarTensor(rhs)) {
+ // mulScaleOp is a_scale * b_scale;
+ foundScales = true;
+ scale1 = lhs;
+ scale2 = rhs;
+ }
+ }
+ // Match (a_scale * b_scale) * cast(mm_out)
+ if (!foundScales) {
+ mulScaleOp = opr1.getDefiningOp<ONNXMulOp>();
+ castOp = opr2.getDefiningOp<ONNXCastOp>();
+ if (mulScaleOp && isScalarTensor(opr1) && castOp) {
+ Value lhs = mulScaleOp.getOperand(0);
+ Value rhs = mulScaleOp.getOperand(1);
+ if (isScalarTensor(lhs) && isScalarTensor(rhs)) {
+ // mulScaleOp is a_scale * b_scale;
+ foundScales = true;
+ scale1 = lhs;
+ scale2 = rhs;
+ }
+ }
+ }
+ // Match [cast(mm_out) * a_scale] * b_scale
+ if (!foundScales && isScalarTensor(opr2)) {
+ scale1 = opr2;
+ mulScaleOp = opr1.getDefiningOp<ONNXMulOp>();
+ if (mulScaleOp) {
+ Value lhs = mulScaleOp.getOperand(0);
+ Value rhs = mulScaleOp.getOperand(1);
+ castOp = lhs.getDefiningOp<ONNXCastOp>();
+ if (castOp && isScalarTensor(rhs)) {
+ // Match cast(mm_out) * a_scale
+ scale2 = rhs;
+ foundScales = true;
+ }
+ if (!foundScales) {
+ // Match a_scale * cast(mm_out)
+ castOp = rhs.getDefiningOp<ONNXCastOp>();
+ if (isScalarTensor(lhs) && castOp) {
+ scale2 = lhs;
+ foundScales = true;
+ }
+ }
+ }
+ // Match b_scale * [cast(mm_out) * a_scale]
+ if (!foundScales && isScalarTensor(opr1)) {
+ scale1 = opr1;
+ mulScaleOp = opr2.getDefiningOp<ONNXMulOp>();
+ if (mulScaleOp) {
+ Value lhs = mulScaleOp.getOperand(0);
+ Value rhs = mulScaleOp.getOperand(1);
+ castOp = lhs.getDefiningOp<ONNXCastOp>();
+ if (castOp && isScalarTensor(rhs)) {
+ // Match cast(mm_out) * a_scale
+ scale2 = rhs;
+ foundScales = true;
+ }
+ if (!foundScales) {
+ // Match a_scale * cast(mm_out)
+ castOp = rhs.getDefiningOp<ONNXCastOp>();
+ if (isScalarTensor(lhs) && castOp) {
+ scale2 = lhs;
+ foundScales = true;
+ }
+ }
+ }
+ }
+ }
+ if (!foundScales)
+ return rewriter.notifyMatchFailure(mulOp, "Not found scale values");
+
+ // Identify a_scale and b_scale.
+ // a_scale is from DynamicQuantizeLinear.
+ if (scale1.getDefiningOp<ONNXDynamicQuantizeLinearOp>()) {
+ AScale = scale1;
+ BScale = scale2;
+ } else if (scale2.getDefiningOp<ONNXDynamicQuantizeLinearOp>()) {
+ AScale = scale2;
+ BScale = scale1;
+ } else {
+ return rewriter.notifyMatchFailure(
+ mulOp, "Could not identify a_scale and b_scale");
+ }
+
+ // Match cast.
+ // %cast = "onnx.Cast"(%matmul) {saturate = 1 : si64, to = f32}
+ Type castOutputType = castOp.getOutput().getType();
+ Type castInputType = castOp.getInput().getType();
+ if (isRankedShapedType(castInputType) &&
+ isRankedShapedType(castOutputType)) {
+ if (!getElementType(castInputType).isInteger(32))
+ return rewriter.notifyMatchFailure(
+ mulOp, "ONNXCast is not casting from i32");
+ if (!getElementType(castOutputType).isF32())
+ return rewriter.notifyMatchFailure(
+ mulOp, "ONNXCast is not casting to f32");
+ } else {
+ return rewriter.notifyMatchFailure(mulOp, "ONNXCast is unranked");
+ }
+
+ // Match matmul to get BI8 and BZeroPoint.
+ ONNXMatMulIntegerOp matmulOp =
+ castOp.getInput().getDefiningOp<ONNXMatMulIntegerOp>();
+ if (!matmulOp)
+ return rewriter.notifyMatchFailure(
+ mulOp, "The input of the CastOp is not defined by MatMulIntegerOp");
+ if (!isSuitableForZDNN(matmulOp))
+ return rewriter.notifyMatchFailure(
+ mulOp, "MatMulInteger is not suitable for zDNN");
+
+ AI8 = matmulOp->getOperand(0);
+ BI8 = matmulOp->getOperand(1);
+ AZeroPoint = matmulOp->getOperand(2);
+ BZeroPoint = matmulOp->getOperand(3);
+ if (!isDenseONNXConstant(BI8))
+ return rewriter.notifyMatchFailure(mulOp, "Quantized Y is not constant");
+ if (!isDenseONNXConstant(BZeroPoint))
+ return rewriter.notifyMatchFailure(mulOp, "BZeroPoint is not constant");
+ if (!(getElementType(BI8.getType()).isUnsignedInteger(8) ||
+ getElementType(BI8.getType()).isSignlessInteger(8)))
+ return rewriter.notifyMatchFailure(
+ mulOp, "Quantized Y is not signed int8");
+
+ // Match dynamic quantize linear to get A.
+ if (auto dqlOp =
+ llvm::dyn_cast<ONNXDynamicQuantizeLinearOp>(AI8.getDefiningOp())) {
+ if (AScale != dqlOp.getResult(1))
+ return rewriter.notifyMatchFailure(mulOp, "AScale is not used");
+ if (AZeroPoint != dqlOp.getResult(2))
+ return rewriter.notifyMatchFailure(mulOp, "AZeroPoint is not used");
+ // return A.
+ A = dqlOp.getOperand();
+ } else {
+ return rewriter.notifyMatchFailure(
+ mulOp, "Quantized A is not defined by DynamicQuantizeLinearOp");
+ }
+
+ return success();
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Fuse ZHighQuantizedMatMul and ONNXAdd
+//===----------------------------------------------------------------------===//
+// Rewrite this pattern:
+// (ONNXAddOp
+// $x,
+// (ZHighUnstickOp
+// (ZHighQuantizedMatMulOp:$mm_res
+// $a, $Sa, $Za,
+// $b, $Sb, $Zb,
+// (ZHighQuantizedStick $c), $Sc, $Zc,
+// $So, $Zo,
+// $preComputed, $disableClipping, $dequantized))),
+//
+// into this pattern where $x is added to $c:
+//
+// (ZHighUnstickOp
+// (ZHighQuantizedMatMulOp
+// $a, $Sa, $Za,
+// $b, $Sb, $Zb,
+// (ZHighQuantizedStick (ONNXAddOp $x, $c)), $Sc, $Zc,
+// $So, $Zo,
+// $preComputed, $disableClipping, $dequantized)),
+//
+// Requirement: `preComputed` is true.
+
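+// Informal justification (a sketch of the algebra, not a statement from the
+// DAG above): with a pre-computed bias the matmul result is matmul(a, b) + c,
+// so x + Unstick(QMatMul(a, b, Stick(c))) == Unstick(QMatMul(a, b, Stick(x + c))),
+// provided x broadcasts like c (hence the rank check in canBeRewritten).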
+class fuseZHighQuantizedMatMulONNXAddPattern
+ : public OpRewritePattern<ONNXAddOp> {
+public:
+ using OpRewritePattern<ONNXAddOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(
+ ONNXAddOp addOp, PatternRewriter &rewriter) const override {
+ Location loc = addOp.getLoc();
+ Operation *op = addOp.getOperation();
+ MultiDialectBuilder<OnnxBuilder> create(rewriter, loc);
+
+ ZHighUnstickOp unstickOp;
+ ZHighQuantizedMatMulOp mmOp;
+ ZHighQuantizedStickOp qstickOp;
+ Value addInput;
+
+ // match
+ if (failed(canBeRewritten(
+ rewriter, addOp, unstickOp, mmOp, qstickOp, addInput)))
+ return failure();
+
+ // rewrite
+ Value newBias = create.onnx.add(addInput, qstickOp.getIn());
+ ZHighQuantizedStickOp newQStickOp = rewriter.create<ZHighQuantizedStickOp>(
+ loc, newBias, qstickOp.getInRecScale(), qstickOp.getInOffset(),
+ qstickOp.getLayoutAttr(), qstickOp.getQuantizedTypeAttr());
+
+ SmallVector<Type> resTypes;
+ resTypes.emplace_back(mmOp.getResult(0).getType());
+ resTypes.emplace_back(mmOp.getResult(1).getType());
+ resTypes.emplace_back(mmOp.getResult(2).getType());
+ ZHighQuantizedMatMulOp newQMMOp = rewriter.create<ZHighQuantizedMatMulOp>(
+ loc, resTypes, mmOp.getX(), mmOp.getXRecScale(), mmOp.getXOffset(),
+ mmOp.getY(), mmOp.getYRecScale(), mmOp.getYOffset(),
+ newQStickOp.getResult(0), newQStickOp.getResult(1),
+ newQStickOp.getResult(2), mmOp.getOutRecScaleIn(),
+ mmOp.getOutOffsetIn(), mmOp.getPreComputedBiasAttr(),
+ mmOp.getDisableClippingAttr(), mmOp.getDequantizeOutputAttr());
+ ZHighUnstickOp newUnstickOp =
+ rewriter.create<ZHighUnstickOp>(loc, newQMMOp.getResult(0));
+
+ rewriter.replaceOp(op, newUnstickOp);
+ return success();
+ }
+
+ static mlir::LogicalResult canBeRewritten(PatternRewriter &rewriter,
+ ONNXAddOp addOp, ZHighUnstickOp &unstickOp, ZHighQuantizedMatMulOp &mmOp,
+ ZHighQuantizedStickOp &qstickOp, Value &addInput) {
+ Value lhs = addOp.getOperand(0);
+ Value rhs = addOp.getOperand(1);
+ bool found = false;
+ if (auto op1 = lhs.getDefiningOp<ZHighUnstickOp>()) {
+ addInput = rhs;
+ unstickOp = op1;
+ Value mmOutput = unstickOp.getIn();
+ if (auto op2 = mmOutput.getDefiningOp<ZHighQuantizedMatMulOp>()) {
+ mmOp = op2;
+ bool precomputed = (mmOp.getPreComputedBias() == -1);
+ if (!precomputed)
+ return rewriter.notifyMatchFailure(
+ addOp, "not precomputed quantized matmul");
+ Value qBias = mmOp.getB();
+ if (auto op3 = qBias.getDefiningOp<ZHighQuantizedStickOp>()) {
+ qstickOp = op3;
+ Value bias = qstickOp.getIn();
+ // Check rank.
+ if (getRank(bias.getType()) != getRank(addInput.getType()))
+ return rewriter.notifyMatchFailure(addOp, "rank mismatched");
+ found = true;
+ }
+ }
+ }
+ if (found)
+ return success();
+
+ if (auto op1 = rhs.getDefiningOp<ZHighUnstickOp>()) {
+ addInput = lhs;
+ unstickOp = op1;
+ Value mmOutput = unstickOp.getIn();
+ if (auto op2 = mmOutput.getDefiningOp<ZHighQuantizedMatMulOp>()) {
+ mmOp = op2;
+ bool precomputed = (mmOp.getPreComputedBias() == -1);
+ if (!precomputed)
+ return rewriter.notifyMatchFailure(
+ addOp, "not precomputed quantized matmul");
+ Value qBias = mmOp.getB();
+ if (auto op3 = qBias.getDefiningOp<ZHighQuantizedStickOp>()) {
+ qstickOp = op3;
+ Value bias = qstickOp.getIn();
+ // Check rank.
+ if (getRank(bias.getType()) != getRank(addInput.getType()))
+ return rewriter.notifyMatchFailure(addOp, "rank mismatched");
+ found = true;
+ }
+ }
+ }
+ if (found)
+ return success();
+
+ return rewriter.notifyMatchFailure(addOp, "unstick not found");
+ }
+};
+
+class replaceONNXQLinearMatMulPattern
+ : public OpRewritePattern<ONNXQLinearMatMulOp> {
+public:
+ using OpRewritePattern<ONNXQLinearMatMulOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(
+ ONNXQLinearMatMulOp qmmOp, PatternRewriter &rewriter) const override {
+ Location loc = qmmOp.getLoc();
+ Operation *op = qmmOp.getOperation();
+ MultiDialectBuilder<OnnxBuilder> create(rewriter, loc);
+
+ // Match
+ if (failed(canBeRewritten(rewriter, qmmOp)))
+ return failure();
+
+ Type si64Ty = rewriter.getIntegerType(64, true);
+ Type f16Ty = rewriter.getF16Type();
+ Type f32Ty = rewriter.getF32Type();
+ IntegerAttr trueAttr = rewriter.getIntegerAttr(si64Ty, -1);
+ IntegerAttr falseAttr = rewriter.getIntegerAttr(si64Ty, 0);
+
+ Value A = qmmOp.getA();
+ Value AScale = qmmOp.getAScale();
+ Value AZeroPoint = qmmOp.getAZeroPoint();
+ Value B = qmmOp.getB();
+ Value BScale = qmmOp.getBScale();
+ Value BZeroPoint = qmmOp.getBZeroPoint();
+ Value Y = qmmOp.getY();
+ Value YScale = qmmOp.getYScale();
+ Value YZeroPoint = qmmOp.getYZeroPoint();
+
+ // Only pre-compute bias when B is a constant and BZeroPoint is int8 zero.
+ bool canPreComputeBias = false;
+ if (isDenseONNXConstant(B) && isDenseONNXConstant(BZeroPoint)) {
+ if (getElementType(BZeroPoint.getType()).isUnsignedInteger())
+ canPreComputeBias = isConstOf(BZeroPoint, 128.0);
+ else
+ canPreComputeBias = isConstOf(BZeroPoint, 0.0);
+ }
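+ // Illustrative note (assuming getOrCastToI8 shifts unsigned values by -128):
+ // a ui8 zero point of 128 becomes 0 after the conversion below, which is
+ // exactly the symmetric case required for pre-computing the bias.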
+
+ // Emit some common values.
+ Value none = create.onnx.none();
+ Value zero = create.onnx.constantInt64({0});
+
+ // Normalize rank-1 scalar tensors to rank-0 tensors.
+ if (getRank(AScale.getType()) == 1) {
+ AScale = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(AScale.getType())), AScale,
+ {zero});
+ }
+ if (getRank(AZeroPoint.getType()) == 1) {
+ AZeroPoint = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(AZeroPoint.getType())),
+ AZeroPoint, {zero});
+ }
+ if (getRank(BScale.getType()) == 1) {
+ BScale = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(BScale.getType())), BScale,
+ {zero});
+ }
+ if (getRank(BZeroPoint.getType()) == 1) {
+ BZeroPoint = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(BZeroPoint.getType())),
+ BZeroPoint, {zero});
+ }
+ if (getRank(YScale.getType()) == 1) {
+ YScale = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(YScale.getType())), YScale,
+ {zero});
+ }
+ if (getRank(YZeroPoint.getType()) == 1) {
+ YZeroPoint = create.onnx.squeeze(
+ RankedTensorType::get({}, getElementType(YZeroPoint.getType())),
+ YZeroPoint, {zero});
+ }
+
+ // zdnn supports signed int8, convert unsigned int8 inputs to signed int8.
+ Value AI8 = getOrCastToI8(A, create);
+ Value BI8 = getOrCastToI8(B, create);
+
+ Value ARecScale = create.onnx.reciprocal(AScale);
+ Value AZeroPointI8 = getOrCastToI8(AZeroPoint, create);
+ Value AZeroPointF32 = create.onnx.cast(AZeroPointI8, f32Ty);
+
+ Value BRecScale = create.onnx.reciprocal(BScale);
+ Value BZeroPointI8 = getOrCastToI8(BZeroPoint, create);
+ Value BZeroPointF32 = create.onnx.cast(BZeroPointI8, f32Ty);
+
+ Value YRecScale = create.onnx.reciprocal(YScale);
+ Value YZeroPointI8 = getOrCastToI8(YZeroPoint, create);
+ Value YZeroPointF32 = create.onnx.cast(YZeroPointI8, f32Ty);
+
+ // Stickify AI8: transform AI8 into zTensor format.
+ int64_t rankA = getRank(AI8.getType());
+ StringAttr aLayoutAttr =
+ rewriter.getStringAttr((rankA == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ ZHighQuantizedStickOp qAOp =
+ rewriter.create<ZHighQuantizedStickOp>(loc, AI8, ARecScale,
+ AZeroPointF32, aLayoutAttr, rewriter.getStringAttr(QTYPE_INT8));
+
+ // Stickify BI8. It is potentially folded at compile time.
+ int64_t rankB = getRank(BI8.getType());
+ StringAttr bLayoutAttr =
+ rewriter.getStringAttr((rankB == 2) ? LAYOUT_2D : LAYOUT_3DS);
+ ZHighQuantizedStickOp qBOp =
+ rewriter.create<ZHighQuantizedStickOp>(loc, BI8, BRecScale,
+ BZeroPointF32, bLayoutAttr, rewriter.getStringAttr(QTYPE_WEIGHTS));
+
+ // Bias is none or precomputed.
+ Value qcTilde, qcTildeRecScale, qcTildeZeroPointF32;
+ if (canPreComputeBias)
+ preComputeBias(create, ARecScale, AZeroPointF32, BI8, BRecScale,
+ YRecScale, YZeroPointF32, qcTilde, qcTildeRecScale,
+ qcTildeZeroPointF32);
+
+ // Emit zhigh.QuantizedMatMul. Bias is none.
+ // DisableClipping gives the same output as the onnx backend test since the
+ // onnx backend test uses `astype` instead of `clipping` to cast the output
+ // to i8.
+ SmallVector<Type> resTypes;
+ resTypes.emplace_back(UnrankedTensorType::get(f16Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ resTypes.emplace_back(RankedTensorType::get({}, f32Ty));
+ ZHighQuantizedMatMulOp zhighQuantizedMatMulOp =
+ rewriter.create<ZHighQuantizedMatMulOp>(loc, resTypes,
+ qAOp.getResult(0), qAOp.getResult(1), qAOp.getResult(2),
+ qBOp.getResult(0), qBOp.getResult(1), qBOp.getResult(2),
+ /*Bias*/ canPreComputeBias ? qcTilde : none,
+ /*BiasRecScale*/ canPreComputeBias ? qcTildeRecScale : none,
+ /*BiasOffset*/ canPreComputeBias ? qcTildeZeroPointF32 : none,
+ /*OutRecScale*/ YRecScale, /*OutOffset*/ YZeroPointF32,
+ /*PreComputedBias*/ canPreComputeBias ? trueAttr : falseAttr,
+ /*DisableClipping*/ trueAttr,
+ /*DequantizeOutput*/ falseAttr);
+ (void)zhighQuantizedMatMulOp.inferShapes([](Region &region) {});
+
+ // Unstickify the matmul result that is int8-as-float.
+ Value resI8F32 = rewriter.create<ZHighUnstickOp>(
+ loc, zhighQuantizedMatMulOp.getResult(0));
+ Value res;
+ Type outElemTy = getElementType(Y.getType());
+ if (outElemTy.isUnsignedInteger(8)) {
+ // The zdnn output is int8. Convert int8 to uint8.
+ // Use int16 to avoid integer overflow.
+ Type i16Ty = rewriter.getI16Type();
+ Type ui16Ty = rewriter.getIntegerType(16, false);
+ auto cst128Attr = DenseElementsAttr::get(
+ RankedTensorType::get({}, i16Ty), static_cast<int16_t>(128));
+ // clang-format off
+ Value resUI16 =
+ create.onnx.cast(
+ create.onnx.add(create.onnx.cast(resI8F32, i16Ty),
+ create.onnx.constant(cst128Attr)),
+ ui16Ty);
+ // clang-format on
+ res = create.onnx.cast(resUI16, outElemTy);
+ } else {
+ res = create.onnx.cast(resI8F32, outElemTy);
+ }
+ rewriter.replaceOp(op, res);
+ return success();
+ }
+
+ static mlir::LogicalResult canBeRewritten(
+ PatternRewriter &rewriter, ONNXQLinearMatMulOp qmmOp) {
+ if (!isSuitableForZDNN(qmmOp))
+ return rewriter.notifyMatchFailure(
+ qmmOp, "QLinearMatMul is not suitable for zDNN");
+ return success();
+ }
+};
+
struct ONNXToZHighLoweringPass
 : public PassWrapper<ONNXToZHighLoweringPass, OperationPass<ModuleOp>> {
@@ -290,14 +1535,85 @@ struct ONNXToZHighLoweringPass
ONNXToZHighLoweringPass() = default;
ONNXToZHighLoweringPass(const ONNXToZHighLoweringPass &pass)
 : PassWrapper<ONNXToZHighLoweringPass, OperationPass<ModuleOp>>() {}
+ ONNXToZHighLoweringPass(NNPAQuantType quantMode) {
+ this->quantMode = quantMode;
+ }
void runOnOperation() final;
+
+public:
+ Option quantMode{*this, "quantization",
+ llvm::cl::desc("Enable quantization"),
+ llvm::cl::values(
+ clEnumVal(DynSymI8,
+ "Dynamic Quantization to signed integer 8. Asymmetric quant for "
+ "activations and symmetric quant for weights."),
+ clEnumVal(SymSymI8,
+ "Dynamic Quantization to signed integer 8. Symmetric quant for "
+ "activations and symmetric quant for weights."),
+ clEnumVal(QNONE, "No quantization (default).")),
+ llvm::cl::init(QNONE)};
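+ // Typical invocation (hypothetical command line; assumes the driver flag
+ // `--nnpa-quantization` described earlier maps onto this pass option):
+ // onnx-mlir --maccel=NNPA --nnpa-quantization=DynSymI8 model.onnx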
};
} // end anonymous namespace.
-void getONNXToZHighOneOpPatterns(RewritePatternSet &patterns) {
+void getONNXToZHighOneOpPatterns(
+ RewritePatternSet &patterns, NNPAQuantType quantMode) {
MLIRContext *context = patterns.getContext();
- populateWithGenerated(patterns);
- patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+ patterns.insert(context);
+
+ // Pattern for i8 dynamic quantization, symmetric mode.
+ if (isCompatibleWithNNPALevel(NNPALevel::M15) &&
+ (quantMode == NNPAQuantType::DynSymI8 ||
+ quantMode == NNPAQuantType::SymSymI8)) {
+ // Bump up the pattern benefit to run these before non-quantization
+ // patterns.
+ PatternBenefit quantPriority(QUANT_PATTERN_BENEFIT);
+ patterns.insert(
+ context, quantPriority, quantMode == NNPAQuantType::SymSymI8);
+ patterns.insert(
+ context, quantPriority, quantMode == NNPAQuantType::SymSymI8);
+ }
}
void getONNXToZHighOneOpDynamicallyLegal(
@@ -309,7 +1625,10 @@ void getONNXToZHighOneOpDynamicallyLegal(
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
@@ -319,18 +1638,42 @@ void getONNXToZHighOneOpDynamicallyLegal(
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
+ addDynamicallyLegalOpFor(target, dimAnalysis);
}
-void getONNXToZHighMultipleOpPatterns(RewritePatternSet &patterns) {
+void getONNXToZHighMultipleOpPatterns(
+ RewritePatternSet &patterns, NNPAQuantType quantMode) {
MLIRContext *context = patterns.getContext();
patterns.insert