From bb947822170a33324c1016bde7f5ec6661a23ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 11 Aug 2025 10:19:44 +0000 Subject: [PATCH 01/23] load w8a8 Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 108 +++++++++++++++++++- examples/load_w8a8.py | 136 ++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 examples/load_w8a8.py diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd6dde836..1fff106d5 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -472,6 +472,8 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): + if is_weight_fp8_activation_static_fp8(model.config.quantization_config): + return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False @@ -526,6 +528,108 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") + +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class FP8QDQLinear(torch.nn.Module): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, in_features: int, out_features: int, bias: bool = True, device=None + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), + requires_grad=True, + ) + self.weight_scale = nn.Parameter( + torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), + requires_grad=False, + ) + self.input_scale = nn.Parameter( + torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False + ) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + self.pre_dequantized = False + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale( + bf16_input, self.input_scale.data + ) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8QDQLinear layer from an original linear layer. 
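+
+        Only the shapes, bias presence and device of ``original_layer`` are
+        reused; the FP8 weight, weight_scale and input_scale tensors are left
+        uninitialized here and are expected to be filled in when the quantized
+        checkpoint is loaded.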
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias is not None, + ) + return qdq_linear + + +def _patching_mod( + mod, config, src_cls, dst_cls +): + named_children_list = list(mod.named_children()) + for name, layer in named_children_list: + if isinstance(layer, src_cls): + new_layer = dst_cls.from_original(config, layer) + setattr(mod, name, new_layer) + print(f"Patched {name} with {new_layer.__class__.__name__}") + elif isinstance(layer, nn.Module): + _patching_mod(layer, config, src_cls, dst_cls) + return mod + + +def patching_model(model): + model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) + return model + + +def is_weight_fp8_activation_static_fp8(quant_config): + return True + def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -547,7 +651,9 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - + if is_weight_fp8_activation_static_fp8(quantization_config): + model = patching_model(model) + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py new file mode 100644 index 000000000..df10b6c10 --- /dev/null +++ b/examples/load_w8a8.py @@ -0,0 +1,136 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "weight_scale" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 1.0 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} + +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +seed = 0 +import random + +random.seed(seed) +import torch + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +import numpy as np + +np.random.seed(seed) + + +# torch.use_deterministic_algorithms(True) +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +g = torch.Generator() +g.manual_seed(0) + + + + +def pre_dequantize(model): + """ + Pre-dequantize all FP8QDQLinear layers in the model. 
+ """ + for name, module in model.named_modules(): + if module.__class__.__name__ == "FP8QDQLinear": + logger.info(f"Pre-dequantizing {name}") + module.pre_dequantize() + else: + logger.debug(f"Skipping {name} as it is not FP8QDQLinear") + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + + + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + model.to("cuda") + import torch + + model = torch.compile(model) + # pre_dequantize(model) + with torch.device("cuda"): + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=100) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + # from auto_round.script.llm import eval_task_by_task + + # eval_task_by_task( + # model=model, + # device="cuda", + # tasks="gsm8k", + # batch_size=32, + # limit=128, + # # trust_remote_code=not args.disable_trust_remote_code, + # # eval_model_dtype=args.eval_model_dtype + # ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-m', "--qmodel_path", type=str, required=True) + parser.add_argument( + "--not_patch_lin", action="store_true", help="Measure float model" + ) + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) + + +""" +p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 +Running generate_until requests: 76%|███ | 97/128 [11:45<03: +Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| +| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| + +total eval time: 742.8823928833008 +""" \ No newline at end of file From 9bef8263328fe7ef152d828c1775d4aa385885cc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 02:44:17 -0400 Subject: [PATCH 02/23] refactor Signed-off-by: yiliu30 --- .../export_to_autoround/export_to_fp8_woq.py | 89 +++++++++++++ auto_round/inference/backend.py | 20 ++- auto_round/inference/convert_model.py | 120 ++---------------- examples/load_w8a8.py | 39 +++--- 4 files changed, 141 insertions(+), 127 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 5b6a4c400..8b357e090 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,6 +16,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Optional, Union import threadpoolctl as tctl import torch @@ -83,6 +84,94 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class 
WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8WOQLinear layer from an original linear layer. + """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. 
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a4f578726..4b259db0a 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -410,7 +410,18 @@ def check_compatible( return True -def dynamic_import_inference_linear(backend, bits, group_size, sym): +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. This function dynamically loads the correct `QuantLinear` class based on the backend and quantization @@ -435,6 +446,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): ImportError: If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq). """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] + + if is_weight_fp8_activation_static_fp8(config): + from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + + return WeightFP8ActFP8StaticQuantLinear + if "qbits" in backend: try: from intel_extension_for_transformers import qbits # pylint: disable=E0401 diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 1fff106d5..bbca26f4f 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -27,6 +27,7 @@ find_backend, get_highest_priority_backend, get_layer_backend, + is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.utils import ( @@ -61,7 +62,7 @@ def skip_not_convert_modules(model, quantization_config, layer_names, layer_conf try: # transformers new api modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert, add_default_skips=True) except: - modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert) + modules_to_not_convert = _get_modules_to_not_convert(model, modules_to_not_convert) if modules_to_not_convert: for layer_name in layer_names: if any([re.search(re.compile(n), layer_name) for n in modules_to_not_convert]): @@ -219,6 +220,7 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. + - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -231,13 +233,14 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. + - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym - + act_dynamic = getattr(quantization_config, "act_dynamic", False) # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -290,11 +293,11 @@ def get_layer_config(model, quantization_config): "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), "sym": extra_config.get(layer_name, {}).get("sym", sym), + "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), "clip": extra_config.get(layer_name, {}).get("clip", False), } for layer_name in layer_names } - return layer_configs @@ -415,7 +418,7 @@ def _import_exllamav2_kernels(): def _create_quant_layer(layer, layer_backend, config, in_features, out_features): """Creates a quantized layer using the appropriate class.""" - QuantLinear = dynamic_import_inference_linear(layer_backend, config["bits"], config["group_size"], config["sym"]) + QuantLinear = dynamic_import_inference_linear(layer_backend, config) bias = layer.bias is not None # Special handling for AWQ layers @@ -437,6 +440,8 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) + elif is_weight_fp8_activation_static_fp8(config): + return QuantLinear.from_original(config, layer) # Default quantized layer creation try: return QuantLinear( @@ -528,108 +533,6 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") - -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 - - -class FP8QDQLinear(torch.nn.Module): - dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, in_features: int, out_features: int, bias: bool = True, device=None - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = nn.Parameter( - torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), - requires_grad=True, - ) - self.weight_scale = nn.Parameter( - torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), - requires_grad=False, - ) - self.input_scale = nn.Parameter( - torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False - ) - if bias: - self.bias = nn.Parameter(torch.empty(out_features)) - else: - self.register_parameter("bias", None) - self.pre_dequantized = False - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale( - bf16_input, self.input_scale.data - ) - qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale - return qdq_input_bf16 - - def forward(self, bf16_input: 
torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an FP8QDQLinear layer from an original linear layer. - """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias is not None, - ) - return qdq_linear - - -def _patching_mod( - mod, config, src_cls, dst_cls -): - named_children_list = list(mod.named_children()) - for name, layer in named_children_list: - if isinstance(layer, src_cls): - new_layer = dst_cls.from_original(config, layer) - setattr(mod, name, new_layer) - print(f"Patched {name} with {new_layer.__class__.__name__}") - elif isinstance(layer, nn.Module): - _patching_mod(layer, config, src_cls, dst_cls) - return mod - - -def patching_model(model): - model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) - return model - - -def is_weight_fp8_activation_static_fp8(quant_config): - return True - def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -651,9 +554,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - if is_weight_fp8_activation_static_fp8(quantization_config): - model = patching_model(model) - + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( @@ -694,7 +595,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = backend[len("auto_round:") :] used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) - if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( quantization_config.bits, diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py index df10b6c10..ad6218f9b 100644 --- a/examples/load_w8a8.py +++ b/examples/load_w8a8.py @@ -1,12 +1,13 @@ +import json +import logging import os + +import safetensors import torch import tqdm from loguru import logger -import logging -import safetensors from safetensors import safe_open from safetensors.torch import save_file -import json logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) @@ -42,13 +43,13 @@ torch.cuda.manual_seed(seed) import numpy as np -np.random.seed(seed) +np.random.Generator(seed) # torch.use_deterministic_algorithms(True) def seed_worker(worker_id): worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) + np.random.Generator(worker_seed) random.seed(worker_seed) @@ -56,8 +57,6 @@ def seed_worker(worker_id): g.manual_seed(0) - - def pre_dequantize(model): """ Pre-dequantize all FP8QDQLinear layers in the model. 
@@ -70,10 +69,15 @@ def pre_dequantize(model): logger.debug(f"Skipping {name} as it is not FP8QDQLinear") +import torch + + +@torch.no_grad() def qdq_eval(model_path, not_patch_lin=False): + import transformers - from transformers.modeling_utils import no_init_weights + # from transformers.modeling_utils import no_init_weights model = transformers.AutoModelForCausalLM.from_pretrained( model_path, @@ -86,14 +90,19 @@ def qdq_eval(model_path, not_patch_lin=False): model.to("cuda") import torch - model = torch.compile(model) - # pre_dequantize(model) with torch.device("cuda"): + from transformers import GenerationConfig + + gen_config = GenerationConfig(use_cache=True, cache_implementation="static") tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) prompt = "Hi, who" encode = tokenizer.encode(prompt, return_tensors="pt") with torch.no_grad(): - output_tokens = model.generate(encode, max_length=100) + output_tokens = model.generate( + encode, + max_length=10, + # generation_config=gen_config + ) output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) logger.info(f"Prompt: {prompt}") logger.info(f"Output: {output}") @@ -115,10 +124,8 @@ def qdq_eval(model_path, not_patch_lin=False): import argparse parser = argparse.ArgumentParser() - parser.add_argument('-m', "--qmodel_path", type=str, required=True) - parser.add_argument( - "--not_patch_lin", action="store_true", help="Measure float model" - ) + parser.add_argument("-m", "--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") args = parser.parse_args() qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) @@ -133,4 +140,4 @@ def qdq_eval(model_path, not_patch_lin=False): | | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| total eval time: 742.8823928833008 -""" \ No newline at end of file +""" From b30a126fed56bd07473d2bba53d1dcbe9ed9bd7b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:01:56 -0400 Subject: [PATCH 03/23] add ut Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- test/test_cpu/test_export.py | 28 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bbca26f4f..bd8b4621d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -477,8 +477,6 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): - if is_weight_fp8_activation_static_fp8(model.config.quantization_config): - return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index bbce4036b..367d20c5d 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -199,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_static_afp8_export(self): + def test_static_afp8_export_and_load(self): import os from safetensors import safe_open @@ -226,6 +226,32 @@ def test_static_afp8_export(self): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1, 1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + with torch.no_grad(): + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" + shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) From eaad3a6e150d8830c96460b333ed557c04e165ae Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:02:38 -0400 Subject: [PATCH 04/23] remove example Signed-off-by: yiliu30 --- examples/load_w8a8.py | 143 ------------------------------------------ 1 file changed, 143 deletions(-) delete mode 100644 examples/load_w8a8.py diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py deleted file mode 100644 index ad6218f9b..000000000 --- a/examples/load_w8a8.py +++ /dev/null @@ -1,143 +0,0 @@ -import json -import logging -import os - -import safetensors -import torch -import tqdm -from loguru import logger -from safetensors import safe_open -from safetensors.torch import save_file - -logging.basicConfig(level=logging.DEBUG) -torch.set_grad_enabled(False) - -# CONSTANTS -SAFETENSORS = "safetensors" -WEIGHT_SCALE_NAME = "weight_scale" -INPUT_SCALE_NAME = "scale_input" -SCALE_DTYPE = torch.bfloat16 -SCALE_FILE_NAME = f"scales.{SAFETENSORS}" -FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max -WEIGHT_BACKOFF = 1.0 -QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = { - "model.norm", - "layernorm", - "e_score_correction_bias", - # "lm_head.weight", - "embed_tokens", - "mlp.gate.weight", # mlp.gate is not linear -} - -MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" - - -seed = 0 -import random - -random.seed(seed) -import torch - -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -import numpy as np - -np.random.Generator(seed) - - -# torch.use_deterministic_algorithms(True) -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.Generator(worker_seed) - random.seed(worker_seed) - - -g = torch.Generator() -g.manual_seed(0) - - -def pre_dequantize(model): - """ - Pre-dequantize all FP8QDQLinear layers in the model. 
- """ - for name, module in model.named_modules(): - if module.__class__.__name__ == "FP8QDQLinear": - logger.info(f"Pre-dequantizing {name}") - module.pre_dequantize() - else: - logger.debug(f"Skipping {name} as it is not FP8QDQLinear") - - -import torch - - -@torch.no_grad() -def qdq_eval(model_path, not_patch_lin=False): - - import transformers - - # from transformers.modeling_utils import no_init_weights - - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - logger.info(f"Patched model: {model}") - model.eval() - model.to("cuda") - import torch - - with torch.device("cuda"): - from transformers import GenerationConfig - - gen_config = GenerationConfig(use_cache=True, cache_implementation="static") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) - prompt = "Hi, who" - encode = tokenizer.encode(prompt, return_tensors="pt") - with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, - # generation_config=gen_config - ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - logger.info(f"Prompt: {prompt}") - logger.info(f"Output: {output}") - - # from auto_round.script.llm import eval_task_by_task - - # eval_task_by_task( - # model=model, - # device="cuda", - # tasks="gsm8k", - # batch_size=32, - # limit=128, - # # trust_remote_code=not args.disable_trust_remote_code, - # # eval_model_dtype=args.eval_model_dtype - # ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-m", "--qmodel_path", type=str, required=True) - parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") - args = parser.parse_args() - qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) - - -""" -p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 -Running generate_until requests: 76%|███ | 97/128 [11:45<03: -Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| -| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| - -total eval time: 742.8823928833008 -""" From c411ca5f86fdc2f84a5fa301ceab34d98ddf2bcb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:04:26 -0400 Subject: [PATCH 05/23] fix typo Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 8b357e090..1b2d7c222 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -133,7 +133,7 @@ def __init__( @classmethod def from_original(cls, config, original_layer): """ - Create an FP8WOQLinear layer from an original linear layer. + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
""" device = original_layer.weight.device with torch.device(device): @@ -165,6 +165,7 @@ def qdq_input(self, bf16_input: torch.Tensor): qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale return qdq_input_bf16 + @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() From 6597d5ca36d084848f76cde2a972bc684f888d4c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:39:45 +0800 Subject: [PATCH 06/23] Update auto_round/export/export_to_autoround/export_to_fp8_woq.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 1b2d7c222..09af9e270 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -89,7 +89,9 @@ def quant_tensor_with_scale(tensor, scale): qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): From 9b0f32ffdd0cb4aac2c36922588c8cdd56296346 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:40:45 +0800 Subject: [PATCH 07/23] Update export_to_fp8_woq.py --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 09af9e270..4d2b924d1 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -87,8 +87,6 @@ def __init__( def quant_tensor_with_scale(tensor, scale): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) return scale, clipped_qtensor_fp8 From 5ebca24b6ee300f4205ae3798c5568ac419cf134 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 24 Aug 2025 05:00:23 -0400 Subject: [PATCH 08/23] update shape Signed-off-by: yiliu30 --- .../export/export_to_autoround/export_to_fp8_woq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 7bcfb8011..e7b473593 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -94,7 +94,7 @@ def __init__( super().__init__() self.in_features = in_features self.out_features = out_features - init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight self.weight = torch.nn.Parameter(init_weight, requires_grad=False) self.dtype = dtype if bias 
is not None: @@ -103,14 +103,14 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp if weight_zp: self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -132,7 +132,7 @@ def dequant_weight_online(self): if self.pre_dequantized: return self.weight fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): From 03cb21711a34b22fc002ebca399d9a58b7d07ec9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 02:54:16 -0400 Subject: [PATCH 09/23] refactor Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 28 +++++ .../experimental/qmodules/fp8_static.py | 108 ++++++++++++++++++ .../export_to_autoround/export_to_fp8_woq.py | 89 --------------- 3 files changed, 136 insertions(+), 89 deletions(-) create mode 100644 auto_round/experimental/qmodules/base.py create mode 100644 auto_round/experimental/qmodules/fp8_static.py diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py new file mode 100644 index 000000000..860e66836 --- /dev/null +++ b/auto_round/experimental/qmodules/base.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional, Union + +import torch + + +class QModuleBase(torch.nn.Module): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def from_original(cls, config, original_layer): + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py new file mode 100644 index 000000000..8d58480d3 --- /dev/null +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch + +from auto_round.experimental.qmodules.base import QModuleBase + + +def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 + + +class WeightFP8ActFP8StaticQuantLinear(QModuleBase): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
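+
+        Only the shape, device and bias of ``original_layer`` are reused; the
+        FP8 weight, weight_scale and input_scale tensors are created as
+        placeholders and are expected to be overwritten when the quantized
+        checkpoint is loaded.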
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = _quant_tensor_to_fp8_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + @torch.no_grad() + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index e7b473593..214e5046e 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -68,95 +68,6 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) - return scale, clipped_qtensor_fp8 - - -class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): - hp_dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, - in_features, - out_features, - weight: Optional[torch.Tensor] = None, - weight_scale: Optional[torch.Tensor] = None, - bias: Union[torch.Tensor, bool, None] = None, - weight_zp: Optional[torch.Tensor] = None, - input_scale: Optional[torch.Tensor] = None, - dtype=torch.bfloat16, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight - self.weight = torch.nn.Parameter(init_weight, requires_grad=False) - self.dtype = dtype - if bias is not None: - if isinstance(bias, bool): - bias = torch.zeros((out_features,), dtype=dtype) - self.bias = torch.nn.Parameter(bias, requires_grad=False) - else: - self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale - self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale - self.register_buffer("input_scale", init_input_scale.to(dtype)) - self.pre_dequantized = False - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
- """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias, - ) - return qdq_linear - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) - qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale - return qdq_input_bf16 - - @torch.no_grad() - def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. From 66388e5360173de4e4b6340a4e075bdd1749c46c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 05:23:48 -0400 Subject: [PATCH 10/23] tmp add bk Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 12 ++++++++++++ auto_round/inference/convert_model.py | 3 +-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4e3f42861..867d9f398 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,17 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( + device=["cuda", "cpu"], + packing_format="", + sym=[True], + bits=[8], + priority=0, + feature_checks=[], + alias=["auto_round", "torch"], + requirements=["auto-round>=0.6.1"], +) + BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( device=["cuda", "xpu"], sym=[True], ## asym has accuracys @@ -732,6 +743,7 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos + # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd8b4621d..fbdfb8804 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -566,7 +566,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = quantization_config.backend else: backend = "auto" - ##target_backend could be None _, backend = parse_target_device_and_backend(backend) @@ -591,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - + # breakpoint() used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 17ddd2d0d22d42a990a1dafcc47d47f14e45f0a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:00:54 -0400 Subject: [PATCH 11/23] refactor code Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- auto_round/inference/convert_model.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 867d9f398..3e4c8a7f2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -176,11 +176,12 @@ def feature_multiply_checker_group_size( device=["cuda", "cpu"], packing_format="", sym=[True], + dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1"], + requirements=["auto-round>=0.6.1.dev0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( @@ -463,7 +464,7 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if is_weight_fp8_activation_static_fp8(config): - from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear @@ -743,7 +744,6 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos - # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -855,6 +855,7 @@ def build_pip_commands(gptq_req, other_reqs): # Instructional messages install_instructions = [] + for cmd in pip_cmds: if "intel-extension-for-pytorch" in cmd and target_device == "xpu": install_instructions.append( diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index fbdfb8804..df8b52c07 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -590,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - # breakpoint() + used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 808449d71e0d004298c183d76a417a3df83f3528 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:12:52 -0400 Subject: [PATCH 12/23] refine code Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 24 +++++++++++++++++++ .../experimental/qmodules/fp8_static.py | 12 ++++++++++ auto_round/inference/backend.py | 4 ++++ 3 files changed, 40 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 860e66836..affc7552d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -19,6 +19,14 @@ class QModuleBase(torch.nn.Module): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Auto-Round. + The design is inspired by vLLM's CompressedTensorsScheme: + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py + + """ + def __init__(self): super().__init__() @@ -26,3 +34,19 @@ def __init__(self): @abstractmethod def from_original(cls, config, original_layer): raise NotImplementedError + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 8d58480d3..3774da810 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from abc import abstractmethod from typing import Optional, Union import torch @@ -106,3 +107,14 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out + + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. 
+ """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 3e4c8a7f2..0ca0d4726 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,10 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +# FP8 static quant +# Weight: FP8, per-channel, may be extended to per-tensor in future +# Activation: FP8, per-tensor + BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( device=["cuda", "cpu"], packing_format="", From f74ed6f6ffd7c40b55ce2886a9882f55b5f96bce Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:17:49 -0400 Subject: [PATCH 13/23] fix device list Signed-off-by: yiliu30 --- .../experimental/qmodules/fp8_static.py | 22 +++++++++---------- auto_round/inference/backend.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 3774da810..074cf34e7 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -66,6 +66,17 @@ def __init__( self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass + @classmethod def from_original(cls, config, original_layer): """ @@ -107,14 +118,3 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out - - @classmethod - def get_min_capability(cls) -> int: - """ - Get minimum device capability. 
- """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet - return 0 - - def process_weights_after_loading(self, layer: torch.nn.Module): - pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0ca0d4726..f74f22b75 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -177,7 +177,7 @@ def feature_multiply_checker_group_size( # Activation: FP8, per-tensor BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( - device=["cuda", "cpu"], + device=["xpu", "cuda", "cpu"], packing_format="", sym=[True], dtype=["float32", "float16", "bfloat16"], From 632cf8a91046608bb26afedf63c81e0920a3d822 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:25:13 -0400 Subject: [PATCH 14/23] fix Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 9dbbca5ab..b8a32896f 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,7 +16,6 @@ import json import os from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Union import threadpoolctl as tctl import torch From 5b8b29d4a2e315b9656eb90c8b3948015bcb4a20 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:14:04 -0400 Subject: [PATCH 15/23] refactor code Signed-off-by: yiliu30 --- auto_round/autoround.py | 19 +++++++++++--- .../export/export_to_autoround/export.py | 8 +++++- auto_round/inference/backend.py | 13 +--------- auto_round/utils.py | 26 +++++++++++++++++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index fed33df34..85ea75e60 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -19,6 +19,7 @@ import sys import time import traceback +from enum import Enum from typing import Any, Union import accelerate @@ -74,6 +75,7 @@ is_optimum_habana_available, is_standard_fp, is_static_afp8, + is_torch_fp8_static, llm_load_model, logger, mv_module_from_gpu, @@ -87,6 +89,12 @@ from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +class AutoRoundFormat(str, Enum): + # Weight: FP8, per-channel, may be extended to per-tensor in future + # Activation: FP8, per-tensor + TORCH_FP8_STATIC = "torch_fp8_static" + + class AutoRound(object): """Automatic weight rounding (Signed Gradient Descent) for LLM quantization @@ -663,9 +671,14 @@ def _parse_format_to_list(self, format: str) -> list: ) if enable_awq: formats[index] = format.replace("auto_round", "auto_round:auto_awq") - if is_nv_fp(self.data_type) or is_mx_fp(self.data_type) or is_standard_fp(self.data_type): + if is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = format.replace("auto_round", f"auto_round:{self.data_type}") formats[index] = format + if is_torch_fp8_static(self): + format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") + formats[index] = format + # if is_torch_fp8_static(self): + # formats[index] = "auto_round:torch_fp8_static" elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported @@ -731,10 +744,10 @@ def _check_supported_format(self, format: str) -> bool: ) format = "fake" else: - if not (format == "auto_round" or format == 
"auto_round:fp8"): + if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"): logger.warning( f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model," - " change format to auto_round" + f" change format {format} to auto_round" ) format = "auto_round" if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8": diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 1640528b6..38b815eb1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,6 +263,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. """ + # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp @@ -273,9 +274,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex from auto_round.export.export_to_autoround.export_to_fp8_woq import save_quantized_as_autoround return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs) + from auto_round.autoround import AutoRoundFormat ##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source - if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend): + if ( + (kwargs.get("sym") is None or kwargs.get("sym")) + and ("gptq" not in backend and "awq" not in backend) + and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend) + ): backend = backend.replace("auto_round", "auto_round:auto_gptq") model = kwargs["model"] diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index f74f22b75..739ff4e89 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,7 +19,7 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin -from auto_round.utils import get_library_version, logger +from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -429,17 +429,6 @@ def check_compatible( return True -def is_weight_fp8_activation_static_fp8(config): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic - - def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. 
diff --git a/auto_round/utils.py b/auto_round/utils.py index 74999c624..c13556827 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2516,3 +2516,29 @@ def is_nv_fp(backend): def is_static_afp8(ar): return not ar.act_dynamic and "fp8" in ar.act_data_type + + +def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): + return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) + + +def is_torch_fp8_static(ar): + bits, group_size, sym, data_type, act_dynamic = ( + ar.bits, + ar.group_size, + ar.sym, + ar.data_type, + ar.act_dynamic, + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) From 57b4c19913c442434144e8ba50df1dfb6f5ba7df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:18:02 -0400 Subject: [PATCH 16/23] fix Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 6 +++--- auto_round/export/export_to_autoround/export.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index affc7552d..c069f5151 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Optional, Union import torch -class QModuleBase(torch.nn.Module): +class QModuleBase(ABC): """ Abstract class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. @@ -32,7 +32,7 @@ def __init__(self): @classmethod @abstractmethod - def from_original(cls, config, original_layer): + def from_original(cls, config, original_layer: torch.nn.Module): raise NotImplementedError @classmethod diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 38b815eb1..48a59f5e5 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,7 +263,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. 
""" - # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp From bdf5f3e554da100b337f327257fa2308b90811f5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:19:06 -0400 Subject: [PATCH 17/23] update Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 2 ++ auto_round/experimental/qmodules/fp8_static.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index c069f5151..2a74a470d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -17,6 +17,8 @@ import torch +__all__ = ["QModuleBase"] + class QModuleBase(ABC): """ diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 074cf34e7..b5c7d2dd2 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -19,6 +19,8 @@ from auto_round.experimental.qmodules.base import QModuleBase +__all__ = ["WeightFP8ActFP8StaticQuantLinear"] + def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max From ce3384f33ec861f00e4c704f032dc99b907c8536 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:26:05 -0400 Subject: [PATCH 18/23] fix ut Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 4 +- test/test_cpu/test_export.py | 48 ++++++++++++------------ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 2a74a470d..8b7a9c138 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -20,9 +20,9 @@ __all__ = ["QModuleBase"] -class QModuleBase(ABC): +class QModuleBase(torch.nn.Module): """ - Abstract class used to describe the weight creation and forward pass + Base class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. 
The design is inspired by vLLM's CompressedTensorsScheme: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 24498c780..d648fd721 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -230,31 +230,33 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) - with torch.no_grad(): - import transformers - - model = transformers.AutoModelForCausalLM.from_pretrained( - quantized_model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - model.eval() - assert ( - model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" - ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" - tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) - prompt = "AI is " - encode = tokenizer.encode(prompt, return_tensors="pt") + if static_kv_dtype is None: with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - print(f"Prompt: {prompt}") - print(f"Output: {output}") - assert output is not None, "Output should not be None" + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ + == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) From 22d11de19ce77a04b29f28c5c19e6639a7130298 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:04:39 -0400 Subject: [PATCH 19/23] correct Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 13 +++++-------- auto_round/utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index b5c7d2dd2..90ee09357 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -18,6 +18,7 @@ import torch from auto_round.experimental.qmodules.base import QModuleBase +from auto_round.utils import logger __all__ = ["WeightFP8ActFP8StaticQuantLinear"] @@ -41,7 +42,6 @@ def __init__( weight: Optional[torch.Tensor] = None, weight_scale: Optional[torch.Tensor] = None, bias: Union[torch.Tensor, bool, 
None] = None, - weight_zp: Optional[torch.Tensor] = None, input_scale: Optional[torch.Tensor] = None, dtype=torch.bfloat16, ): @@ -57,14 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -73,7 +69,8 @@ def get_min_capability(cls) -> int: """ Get minimum device capability. """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet + # TODO: correct that config once we add fp8 op support. + logger.warning_once("FP8 ops are not yet supported. Using capability 0.") return 0 def process_weights_after_loading(self, layer: torch.nn.Module): diff --git a/auto_round/utils.py b/auto_round/utils.py index c13556827..2fd78f7a0 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -108,9 +108,17 @@ def infer_bits_by_data_type(data_type: str): return None -@lru_cache(None) -def warning_once(self, msg: str): - self.warning(msg) +@lru_cache(maxsize=None) +def warning_once(self, msg, *args, **kwargs): + """ + Log a warning message only once per unique message/arguments combination. 
+ + Args: + msg: The warning message format string + *args: Variable positional arguments for message formatting + **kwargs: Variable keyword arguments for message formatting and logging options + """ + self.warning(msg, *args, **kwargs) class AutoRoundFormatter(logging.Formatter): From 90826139a8ddfb53a983ad2e87b2ef978fcbe3fb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:05:28 -0400 Subject: [PATCH 20/23] clean Signed-off-by: yiliu30 --- auto_round/autoround.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 85ea75e60..2af8df95e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -677,8 +677,7 @@ def _parse_format_to_list(self, format: str) -> list: if is_torch_fp8_static(self): format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") formats[index] = format - # if is_torch_fp8_static(self): - # formats[index] = "auto_round:torch_fp8_static" + elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported From 2202856fabc8abe2f8ad7a964899450621fbd598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 03:34:11 -0400 Subject: [PATCH 21/23] fix shape Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 90ee09357..a6798f53d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -57,10 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False From d0b99a8f1c493d8484e10871b3a533705c8f1401 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 20:59:33 -0400 Subject: [PATCH 22/23] fix check Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 934486c5a..6ef3884a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -687,7 +687,7 @@ def _parse_format_to_list(self, format: str) -> list: format = "auto_round:auto_awq" elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = f"auto_round:{self.data_type}" - elif is_wfp8afp8(self): # staic wfp8afp8 + elif is_static_wfp8afp8(self): # staic wfp8afp8 format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}" elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8 format = "auto_round:fp8" From 31845d0d025db8b24e4676192a5b998c56188c8e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 21:02:34 -0400 Subject: [PATCH 23/23] clean code Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 -- auto_round/utils.py | 15 --------------- 2 files changed, 17 deletions(-) diff --git 
a/auto_round/autoround.py b/auto_round/autoround.py index 6ef3884a9..49e3984a7 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -73,9 +73,7 @@ is_nv_fp, is_optimum_habana_available, is_standard_fp, - is_static_afp8, is_static_wfp8afp8, - is_torch_fp8_static, is_wfp8afp8, llm_load_model, logger, diff --git a/auto_round/utils.py b/auto_round/utils.py index 9886a5337..21363688b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2527,10 +2527,6 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def is_static_afp8(ar): - return not ar.act_dynamic and "fp8" in ar.act_data_type - - def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic @@ -2546,17 +2542,6 @@ def is_weight_fp8_activation_static_fp8(config): return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) -def is_torch_fp8_static(ar): - bits, group_size, sym, data_type, act_dynamic = ( - ar.bits, - ar.group_size, - ar.sym, - ar.data_type, - ar.act_dynamic, - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)
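
End-to-end usage (illustrative sketch, not taken from the patches themselves): the snippet below shows how the static W8A8 FP8 path wired up in this series could be exercised — quantize with fp8 weights and static fp8 activations so that is_static_wfp8afp8() routes export to the auto_round:torch_fp8_static format, then reload the checkpoint and confirm that Linear layers were swapped for WeightFP8ActFP8StaticQuantLinear, mirroring the assertion in test_static_afp8_export. The model name, output directory, and the exact constructor arguments are assumptions based on the public AutoRound API, not part of this patch series.

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound  # importing auto_round should also expose its quantization config to transformers

model_name = "facebook/opt-125m"      # placeholder model
output_dir = "./opt-125m-w8a8-fp8"    # placeholder output path

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 8-bit fp8 weights, per-channel scales (group_size=-1), symmetric, with static
# (act_dynamic=False) fp8 activations: the combination detected by
# is_static_wfp8afp8(), so "auto_round" is rewritten to
# "auto_round:torch_fp8_static" in _parse_format_to_list().
ar = AutoRound(
    model,
    tokenizer,
    bits=8,
    group_size=-1,
    sym=True,
    data_type="fp8",
    act_bits=8,
    act_data_type="fp8",
    act_dynamic=False,
)
ar.quantize_and_save(output_dir=output_dir, format="auto_round")

# Reloading should map each torch.nn.Linear to WeightFP8ActFP8StaticQuantLinear,
# as asserted in test_static_afp8_export above (OPT-style module path assumed).
qmodel = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype="auto", trust_remote_code=True)
print(qmodel.model.decoder.layers[0].self_attn.k_proj.__class__.__name__)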