From bb947822170a33324c1016bde7f5ec6661a23ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 11 Aug 2025 10:19:44 +0000 Subject: [PATCH 01/23] load w8a8 Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 108 +++++++++++++++++++- examples/load_w8a8.py | 136 ++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 examples/load_w8a8.py diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd6dde836..1fff106d5 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -472,6 +472,8 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): + if is_weight_fp8_activation_static_fp8(model.config.quantization_config): + return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False @@ -526,6 +528,108 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") + +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class FP8QDQLinear(torch.nn.Module): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, in_features: int, out_features: int, bias: bool = True, device=None + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), + requires_grad=True, + ) + self.weight_scale = nn.Parameter( + torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), + requires_grad=False, + ) + self.input_scale = nn.Parameter( + torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False + ) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + self.pre_dequantized = False + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale( + bf16_input, self.input_scale.data + ) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8QDQLinear layer from an original linear layer. 
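+
+        Only the shapes, bias presence and device of ``original_layer`` are
+        reused; the FP8 weight, weight_scale and input_scale tensors are left
+        uninitialized here and are expected to be filled in when the quantized
+        checkpoint is loaded.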
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias is not None, + ) + return qdq_linear + + +def _patching_mod( + mod, config, src_cls, dst_cls +): + named_children_list = list(mod.named_children()) + for name, layer in named_children_list: + if isinstance(layer, src_cls): + new_layer = dst_cls.from_original(config, layer) + setattr(mod, name, new_layer) + print(f"Patched {name} with {new_layer.__class__.__name__}") + elif isinstance(layer, nn.Module): + _patching_mod(layer, config, src_cls, dst_cls) + return mod + + +def patching_model(model): + model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) + return model + + +def is_weight_fp8_activation_static_fp8(quant_config): + return True + def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -547,7 +651,9 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - + if is_weight_fp8_activation_static_fp8(quantization_config): + model = patching_model(model) + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py new file mode 100644 index 000000000..df10b6c10 --- /dev/null +++ b/examples/load_w8a8.py @@ -0,0 +1,136 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "weight_scale" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 1.0 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} + +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +seed = 0 +import random + +random.seed(seed) +import torch + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +import numpy as np + +np.random.seed(seed) + + +# torch.use_deterministic_algorithms(True) +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +g = torch.Generator() +g.manual_seed(0) + + + + +def pre_dequantize(model): + """ + Pre-dequantize all FP8QDQLinear layers in the model. 
+ """ + for name, module in model.named_modules(): + if module.__class__.__name__ == "FP8QDQLinear": + logger.info(f"Pre-dequantizing {name}") + module.pre_dequantize() + else: + logger.debug(f"Skipping {name} as it is not FP8QDQLinear") + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + + + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + model.to("cuda") + import torch + + model = torch.compile(model) + # pre_dequantize(model) + with torch.device("cuda"): + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=100) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + # from auto_round.script.llm import eval_task_by_task + + # eval_task_by_task( + # model=model, + # device="cuda", + # tasks="gsm8k", + # batch_size=32, + # limit=128, + # # trust_remote_code=not args.disable_trust_remote_code, + # # eval_model_dtype=args.eval_model_dtype + # ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-m', "--qmodel_path", type=str, required=True) + parser.add_argument( + "--not_patch_lin", action="store_true", help="Measure float model" + ) + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) + + +""" +p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 +Running generate_until requests: 76%|███ | 97/128 [11:45<03: +Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| +| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| + +total eval time: 742.8823928833008 +""" \ No newline at end of file From 9bef8263328fe7ef152d828c1775d4aa385885cc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 02:44:17 -0400 Subject: [PATCH 02/23] refactor Signed-off-by: yiliu30 --- .../export_to_autoround/export_to_fp8_woq.py | 89 +++++++++++++ auto_round/inference/backend.py | 20 ++- auto_round/inference/convert_model.py | 120 ++---------------- examples/load_w8a8.py | 39 +++--- 4 files changed, 141 insertions(+), 127 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 5b6a4c400..8b357e090 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,6 +16,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Optional, Union import threadpoolctl as tctl import torch @@ -83,6 +84,94 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class 
WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8WOQLinear layer from an original linear layer. + """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. 
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a4f578726..4b259db0a 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -410,7 +410,18 @@ def check_compatible( return True -def dynamic_import_inference_linear(backend, bits, group_size, sym): +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. This function dynamically loads the correct `QuantLinear` class based on the backend and quantization @@ -435,6 +446,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): ImportError: If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq). """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] + + if is_weight_fp8_activation_static_fp8(config): + from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + + return WeightFP8ActFP8StaticQuantLinear + if "qbits" in backend: try: from intel_extension_for_transformers import qbits # pylint: disable=E0401 diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 1fff106d5..bbca26f4f 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -27,6 +27,7 @@ find_backend, get_highest_priority_backend, get_layer_backend, + is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.utils import ( @@ -61,7 +62,7 @@ def skip_not_convert_modules(model, quantization_config, layer_names, layer_conf try: # transformers new api modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert, add_default_skips=True) except: - modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert) + modules_to_not_convert = _get_modules_to_not_convert(model, modules_to_not_convert) if modules_to_not_convert: for layer_name in layer_names: if any([re.search(re.compile(n), layer_name) for n in modules_to_not_convert]): @@ -219,6 +220,7 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. + - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -231,13 +233,14 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. + - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym - + act_dynamic = getattr(quantization_config, "act_dynamic", False) # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -290,11 +293,11 @@ def get_layer_config(model, quantization_config): "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), "sym": extra_config.get(layer_name, {}).get("sym", sym), + "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), "clip": extra_config.get(layer_name, {}).get("clip", False), } for layer_name in layer_names } - return layer_configs @@ -415,7 +418,7 @@ def _import_exllamav2_kernels(): def _create_quant_layer(layer, layer_backend, config, in_features, out_features): """Creates a quantized layer using the appropriate class.""" - QuantLinear = dynamic_import_inference_linear(layer_backend, config["bits"], config["group_size"], config["sym"]) + QuantLinear = dynamic_import_inference_linear(layer_backend, config) bias = layer.bias is not None # Special handling for AWQ layers @@ -437,6 +440,8 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) + elif is_weight_fp8_activation_static_fp8(config): + return QuantLinear.from_original(config, layer) # Default quantized layer creation try: return QuantLinear( @@ -528,108 +533,6 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") - -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 - - -class FP8QDQLinear(torch.nn.Module): - dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, in_features: int, out_features: int, bias: bool = True, device=None - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = nn.Parameter( - torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), - requires_grad=True, - ) - self.weight_scale = nn.Parameter( - torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), - requires_grad=False, - ) - self.input_scale = nn.Parameter( - torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False - ) - if bias: - self.bias = nn.Parameter(torch.empty(out_features)) - else: - self.register_parameter("bias", None) - self.pre_dequantized = False - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale( - bf16_input, self.input_scale.data - ) - qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale - return qdq_input_bf16 - - def forward(self, bf16_input: 
torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an FP8QDQLinear layer from an original linear layer. - """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias is not None, - ) - return qdq_linear - - -def _patching_mod( - mod, config, src_cls, dst_cls -): - named_children_list = list(mod.named_children()) - for name, layer in named_children_list: - if isinstance(layer, src_cls): - new_layer = dst_cls.from_original(config, layer) - setattr(mod, name, new_layer) - print(f"Patched {name} with {new_layer.__class__.__name__}") - elif isinstance(layer, nn.Module): - _patching_mod(layer, config, src_cls, dst_cls) - return mod - - -def patching_model(model): - model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) - return model - - -def is_weight_fp8_activation_static_fp8(quant_config): - return True - def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -651,9 +554,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - if is_weight_fp8_activation_static_fp8(quantization_config): - model = patching_model(model) - + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( @@ -694,7 +595,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = backend[len("auto_round:") :] used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) - if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( quantization_config.bits, diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py index df10b6c10..ad6218f9b 100644 --- a/examples/load_w8a8.py +++ b/examples/load_w8a8.py @@ -1,12 +1,13 @@ +import json +import logging import os + +import safetensors import torch import tqdm from loguru import logger -import logging -import safetensors from safetensors import safe_open from safetensors.torch import save_file -import json logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) @@ -42,13 +43,13 @@ torch.cuda.manual_seed(seed) import numpy as np -np.random.seed(seed) +np.random.Generator(seed) # torch.use_deterministic_algorithms(True) def seed_worker(worker_id): worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) + np.random.Generator(worker_seed) random.seed(worker_seed) @@ -56,8 +57,6 @@ def seed_worker(worker_id): g.manual_seed(0) - - def pre_dequantize(model): """ Pre-dequantize all FP8QDQLinear layers in the model. 
@@ -70,10 +69,15 @@ def pre_dequantize(model): logger.debug(f"Skipping {name} as it is not FP8QDQLinear") +import torch + + +@torch.no_grad() def qdq_eval(model_path, not_patch_lin=False): + import transformers - from transformers.modeling_utils import no_init_weights + # from transformers.modeling_utils import no_init_weights model = transformers.AutoModelForCausalLM.from_pretrained( model_path, @@ -86,14 +90,19 @@ def qdq_eval(model_path, not_patch_lin=False): model.to("cuda") import torch - model = torch.compile(model) - # pre_dequantize(model) with torch.device("cuda"): + from transformers import GenerationConfig + + gen_config = GenerationConfig(use_cache=True, cache_implementation="static") tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) prompt = "Hi, who" encode = tokenizer.encode(prompt, return_tensors="pt") with torch.no_grad(): - output_tokens = model.generate(encode, max_length=100) + output_tokens = model.generate( + encode, + max_length=10, + # generation_config=gen_config + ) output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) logger.info(f"Prompt: {prompt}") logger.info(f"Output: {output}") @@ -115,10 +124,8 @@ def qdq_eval(model_path, not_patch_lin=False): import argparse parser = argparse.ArgumentParser() - parser.add_argument('-m', "--qmodel_path", type=str, required=True) - parser.add_argument( - "--not_patch_lin", action="store_true", help="Measure float model" - ) + parser.add_argument("-m", "--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") args = parser.parse_args() qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) @@ -133,4 +140,4 @@ def qdq_eval(model_path, not_patch_lin=False): | | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| total eval time: 742.8823928833008 -""" \ No newline at end of file +""" From b30a126fed56bd07473d2bba53d1dcbe9ed9bd7b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:01:56 -0400 Subject: [PATCH 03/23] add ut Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- test/test_cpu/test_export.py | 28 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bbca26f4f..bd8b4621d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -477,8 +477,6 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): - if is_weight_fp8_activation_static_fp8(model.config.quantization_config): - return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index bbce4036b..367d20c5d 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -199,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_static_afp8_export(self): + def test_static_afp8_export_and_load(self): import os from safetensors import safe_open @@ -226,6 +226,32 @@ def test_static_afp8_export(self): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1, 1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + with torch.no_grad(): + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" + shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) From eaad3a6e150d8830c96460b333ed557c04e165ae Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:02:38 -0400 Subject: [PATCH 04/23] remove example Signed-off-by: yiliu30 --- examples/load_w8a8.py | 143 ------------------------------------------ 1 file changed, 143 deletions(-) delete mode 100644 examples/load_w8a8.py diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py deleted file mode 100644 index ad6218f9b..000000000 --- a/examples/load_w8a8.py +++ /dev/null @@ -1,143 +0,0 @@ -import json -import logging -import os - -import safetensors -import torch -import tqdm -from loguru import logger -from safetensors import safe_open -from safetensors.torch import save_file - -logging.basicConfig(level=logging.DEBUG) -torch.set_grad_enabled(False) - -# CONSTANTS -SAFETENSORS = "safetensors" -WEIGHT_SCALE_NAME = "weight_scale" -INPUT_SCALE_NAME = "scale_input" -SCALE_DTYPE = torch.bfloat16 -SCALE_FILE_NAME = f"scales.{SAFETENSORS}" -FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max -WEIGHT_BACKOFF = 1.0 -QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = { - "model.norm", - "layernorm", - "e_score_correction_bias", - # "lm_head.weight", - "embed_tokens", - "mlp.gate.weight", # mlp.gate is not linear -} - -MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" - - -seed = 0 -import random - -random.seed(seed) -import torch - -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -import numpy as np - -np.random.Generator(seed) - - -# torch.use_deterministic_algorithms(True) -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.Generator(worker_seed) - random.seed(worker_seed) - - -g = torch.Generator() -g.manual_seed(0) - - -def pre_dequantize(model): - """ - Pre-dequantize all FP8QDQLinear layers in the model. 
- """ - for name, module in model.named_modules(): - if module.__class__.__name__ == "FP8QDQLinear": - logger.info(f"Pre-dequantizing {name}") - module.pre_dequantize() - else: - logger.debug(f"Skipping {name} as it is not FP8QDQLinear") - - -import torch - - -@torch.no_grad() -def qdq_eval(model_path, not_patch_lin=False): - - import transformers - - # from transformers.modeling_utils import no_init_weights - - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - logger.info(f"Patched model: {model}") - model.eval() - model.to("cuda") - import torch - - with torch.device("cuda"): - from transformers import GenerationConfig - - gen_config = GenerationConfig(use_cache=True, cache_implementation="static") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) - prompt = "Hi, who" - encode = tokenizer.encode(prompt, return_tensors="pt") - with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, - # generation_config=gen_config - ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - logger.info(f"Prompt: {prompt}") - logger.info(f"Output: {output}") - - # from auto_round.script.llm import eval_task_by_task - - # eval_task_by_task( - # model=model, - # device="cuda", - # tasks="gsm8k", - # batch_size=32, - # limit=128, - # # trust_remote_code=not args.disable_trust_remote_code, - # # eval_model_dtype=args.eval_model_dtype - # ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-m", "--qmodel_path", type=str, required=True) - parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") - args = parser.parse_args() - qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) - - -""" -p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 -Running generate_until requests: 76%|███ | 97/128 [11:45<03: -Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| -| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| - -total eval time: 742.8823928833008 -""" From c411ca5f86fdc2f84a5fa301ceab34d98ddf2bcb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:04:26 -0400 Subject: [PATCH 05/23] fix typo Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 8b357e090..1b2d7c222 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -133,7 +133,7 @@ def __init__( @classmethod def from_original(cls, config, original_layer): """ - Create an FP8WOQLinear layer from an original linear layer. + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
""" device = original_layer.weight.device with torch.device(device): @@ -165,6 +165,7 @@ def qdq_input(self, bf16_input: torch.Tensor): qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale return qdq_input_bf16 + @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() From 6597d5ca36d084848f76cde2a972bc684f888d4c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:39:45 +0800 Subject: [PATCH 06/23] Update auto_round/export/export_to_autoround/export_to_fp8_woq.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 1b2d7c222..09af9e270 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -89,7 +89,9 @@ def quant_tensor_with_scale(tensor, scale): qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): From 9b0f32ffdd0cb4aac2c36922588c8cdd56296346 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:40:45 +0800 Subject: [PATCH 07/23] Update export_to_fp8_woq.py --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 09af9e270..4d2b924d1 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -87,8 +87,6 @@ def __init__( def quant_tensor_with_scale(tensor, scale): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) return scale, clipped_qtensor_fp8 From 5ebca24b6ee300f4205ae3798c5568ac419cf134 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 24 Aug 2025 05:00:23 -0400 Subject: [PATCH 08/23] update shape Signed-off-by: yiliu30 --- .../export/export_to_autoround/export_to_fp8_woq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 7bcfb8011..e7b473593 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -94,7 +94,7 @@ def __init__( super().__init__() self.in_features = in_features self.out_features = out_features - init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight self.weight = torch.nn.Parameter(init_weight, requires_grad=False) self.dtype = dtype if bias 
is not None: @@ -103,14 +103,14 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp if weight_zp: self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -132,7 +132,7 @@ def dequant_weight_online(self): if self.pre_dequantized: return self.weight fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): From 03cb21711a34b22fc002ebca399d9a58b7d07ec9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 02:54:16 -0400 Subject: [PATCH 09/23] refactor Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 28 +++++ .../experimental/qmodules/fp8_static.py | 108 ++++++++++++++++++ .../export_to_autoround/export_to_fp8_woq.py | 89 --------------- 3 files changed, 136 insertions(+), 89 deletions(-) create mode 100644 auto_round/experimental/qmodules/base.py create mode 100644 auto_round/experimental/qmodules/fp8_static.py diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py new file mode 100644 index 000000000..860e66836 --- /dev/null +++ b/auto_round/experimental/qmodules/base.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional, Union + +import torch + + +class QModuleBase(torch.nn.Module): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def from_original(cls, config, original_layer): + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py new file mode 100644 index 000000000..8d58480d3 --- /dev/null +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch + +from auto_round.experimental.qmodules.base import QModuleBase + + +def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 + + +class WeightFP8ActFP8StaticQuantLinear(QModuleBase): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
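+
+        Only the shape, device and bias of ``original_layer`` are reused; the
+        FP8 weight, weight_scale and input_scale tensors are created as
+        placeholders and are expected to be overwritten when the quantized
+        checkpoint is loaded.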
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = _quant_tensor_to_fp8_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + @torch.no_grad() + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index e7b473593..214e5046e 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -68,95 +68,6 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) - return scale, clipped_qtensor_fp8 - - -class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): - hp_dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, - in_features, - out_features, - weight: Optional[torch.Tensor] = None, - weight_scale: Optional[torch.Tensor] = None, - bias: Union[torch.Tensor, bool, None] = None, - weight_zp: Optional[torch.Tensor] = None, - input_scale: Optional[torch.Tensor] = None, - dtype=torch.bfloat16, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight - self.weight = torch.nn.Parameter(init_weight, requires_grad=False) - self.dtype = dtype - if bias is not None: - if isinstance(bias, bool): - bias = torch.zeros((out_features,), dtype=dtype) - self.bias = torch.nn.Parameter(bias, requires_grad=False) - else: - self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale - self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale - self.register_buffer("input_scale", init_input_scale.to(dtype)) - self.pre_dequantized = False - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
- """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias, - ) - return qdq_linear - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) - qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale - return qdq_input_bf16 - - @torch.no_grad() - def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. From 66388e5360173de4e4b6340a4e075bdd1749c46c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 05:23:48 -0400 Subject: [PATCH 10/23] tmp add bk Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 12 ++++++++++++ auto_round/inference/convert_model.py | 3 +-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4e3f42861..867d9f398 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,17 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( + device=["cuda", "cpu"], + packing_format="", + sym=[True], + bits=[8], + priority=0, + feature_checks=[], + alias=["auto_round", "torch"], + requirements=["auto-round>=0.6.1"], +) + BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( device=["cuda", "xpu"], sym=[True], ## asym has accuracys @@ -732,6 +743,7 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos + # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd8b4621d..fbdfb8804 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -566,7 +566,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = quantization_config.backend else: backend = "auto" - ##target_backend could be None _, backend = parse_target_device_and_backend(backend) @@ -591,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - + # breakpoint() used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 17ddd2d0d22d42a990a1dafcc47d47f14e45f0a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:00:54 -0400 Subject: [PATCH 11/23] refactor code Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- auto_round/inference/convert_model.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 867d9f398..3e4c8a7f2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -176,11 +176,12 @@ def feature_multiply_checker_group_size( device=["cuda", "cpu"], packing_format="", sym=[True], + dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1"], + requirements=["auto-round>=0.6.1.dev0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( @@ -463,7 +464,7 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if is_weight_fp8_activation_static_fp8(config): - from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear @@ -743,7 +744,6 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos - # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -855,6 +855,7 @@ def build_pip_commands(gptq_req, other_reqs): # Instructional messages install_instructions = [] + for cmd in pip_cmds: if "intel-extension-for-pytorch" in cmd and target_device == "xpu": install_instructions.append( diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index fbdfb8804..df8b52c07 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -590,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - # breakpoint() + used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 808449d71e0d004298c183d76a417a3df83f3528 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:12:52 -0400 Subject: [PATCH 12/23] refine code Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 24 +++++++++++++++++++ .../experimental/qmodules/fp8_static.py | 12 ++++++++++ auto_round/inference/backend.py | 4 ++++ 3 files changed, 40 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 860e66836..affc7552d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -19,6 +19,14 @@ class QModuleBase(torch.nn.Module): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Auto-Round. + The design is inspired by vLLM's CompressedTensorsScheme: + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py + + """ + def __init__(self): super().__init__() @@ -26,3 +34,19 @@ def __init__(self): @abstractmethod def from_original(cls, config, original_layer): raise NotImplementedError + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 8d58480d3..3774da810 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from abc import abstractmethod from typing import Optional, Union import torch @@ -106,3 +107,14 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out + + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. 
+ """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 3e4c8a7f2..0ca0d4726 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,10 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +# FP8 static quant +# Weight: FP8, per-channel, may be extended to per-tensor in future +# Activation: FP8, per-tensor + BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( device=["cuda", "cpu"], packing_format="", From f74ed6f6ffd7c40b55ce2886a9882f55b5f96bce Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:17:49 -0400 Subject: [PATCH 13/23] fix device list Signed-off-by: yiliu30 --- .../experimental/qmodules/fp8_static.py | 22 +++++++++---------- auto_round/inference/backend.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 3774da810..074cf34e7 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -66,6 +66,17 @@ def __init__( self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass + @classmethod def from_original(cls, config, original_layer): """ @@ -107,14 +118,3 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out - - @classmethod - def get_min_capability(cls) -> int: - """ - Get minimum device capability. 
- """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet - return 0 - - def process_weights_after_loading(self, layer: torch.nn.Module): - pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0ca0d4726..f74f22b75 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -177,7 +177,7 @@ def feature_multiply_checker_group_size( # Activation: FP8, per-tensor BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( - device=["cuda", "cpu"], + device=["xpu", "cuda", "cpu"], packing_format="", sym=[True], dtype=["float32", "float16", "bfloat16"], From 632cf8a91046608bb26afedf63c81e0920a3d822 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:25:13 -0400 Subject: [PATCH 14/23] fix Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 9dbbca5ab..b8a32896f 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,7 +16,6 @@ import json import os from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Union import threadpoolctl as tctl import torch From 5b8b29d4a2e315b9656eb90c8b3948015bcb4a20 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:14:04 -0400 Subject: [PATCH 15/23] refactor code Signed-off-by: yiliu30 --- auto_round/autoround.py | 19 +++++++++++--- .../export/export_to_autoround/export.py | 8 +++++- auto_round/inference/backend.py | 13 +--------- auto_round/utils.py | 26 +++++++++++++++++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index fed33df34..85ea75e60 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -19,6 +19,7 @@ import sys import time import traceback +from enum import Enum from typing import Any, Union import accelerate @@ -74,6 +75,7 @@ is_optimum_habana_available, is_standard_fp, is_static_afp8, + is_torch_fp8_static, llm_load_model, logger, mv_module_from_gpu, @@ -87,6 +89,12 @@ from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +class AutoRoundFormat(str, Enum): + # Weight: FP8, per-channel, may be extended to per-tensor in future + # Activation: FP8, per-tensor + TORCH_FP8_STATIC = "torch_fp8_static" + + class AutoRound(object): """Automatic weight rounding (Signed Gradient Descent) for LLM quantization @@ -663,9 +671,14 @@ def _parse_format_to_list(self, format: str) -> list: ) if enable_awq: formats[index] = format.replace("auto_round", "auto_round:auto_awq") - if is_nv_fp(self.data_type) or is_mx_fp(self.data_type) or is_standard_fp(self.data_type): + if is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = format.replace("auto_round", f"auto_round:{self.data_type}") formats[index] = format + if is_torch_fp8_static(self): + format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") + formats[index] = format + # if is_torch_fp8_static(self): + # formats[index] = "auto_round:torch_fp8_static" elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported @@ -731,10 +744,10 @@ def _check_supported_format(self, format: str) -> bool: ) format = "fake" else: - if not (format == "auto_round" or format == 
"auto_round:fp8"): + if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"): logger.warning( f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model," - " change format to auto_round" + f" change format {format} to auto_round" ) format = "auto_round" if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8": diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 1640528b6..38b815eb1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,6 +263,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. """ + # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp @@ -273,9 +274,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex from auto_round.export.export_to_autoround.export_to_fp8_woq import save_quantized_as_autoround return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs) + from auto_round.autoround import AutoRoundFormat ##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source - if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend): + if ( + (kwargs.get("sym") is None or kwargs.get("sym")) + and ("gptq" not in backend and "awq" not in backend) + and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend) + ): backend = backend.replace("auto_round", "auto_round:auto_gptq") model = kwargs["model"] diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index f74f22b75..739ff4e89 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,7 +19,7 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin -from auto_round.utils import get_library_version, logger +from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -429,17 +429,6 @@ def check_compatible( return True -def is_weight_fp8_activation_static_fp8(config): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic - - def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. 
diff --git a/auto_round/utils.py b/auto_round/utils.py index 74999c624..c13556827 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2516,3 +2516,29 @@ def is_nv_fp(backend): def is_static_afp8(ar): return not ar.act_dynamic and "fp8" in ar.act_data_type + + +def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): + return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) + + +def is_torch_fp8_static(ar): + bits, group_size, sym, data_type, act_dynamic = ( + ar.bits, + ar.group_size, + ar.sym, + ar.data_type, + ar.act_dynamic, + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) From 57b4c19913c442434144e8ba50df1dfb6f5ba7df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:18:02 -0400 Subject: [PATCH 16/23] fix Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 6 +++--- auto_round/export/export_to_autoround/export.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index affc7552d..c069f5151 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Optional, Union import torch -class QModuleBase(torch.nn.Module): +class QModuleBase(ABC): """ Abstract class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. @@ -32,7 +32,7 @@ def __init__(self): @classmethod @abstractmethod - def from_original(cls, config, original_layer): + def from_original(cls, config, original_layer: torch.nn.Module): raise NotImplementedError @classmethod diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 38b815eb1..48a59f5e5 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,7 +263,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. 
""" - # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp From bdf5f3e554da100b337f327257fa2308b90811f5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:19:06 -0400 Subject: [PATCH 17/23] update Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 2 ++ auto_round/experimental/qmodules/fp8_static.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index c069f5151..2a74a470d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -17,6 +17,8 @@ import torch +__all__ = ["QModuleBase"] + class QModuleBase(ABC): """ diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 074cf34e7..b5c7d2dd2 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -19,6 +19,8 @@ from auto_round.experimental.qmodules.base import QModuleBase +__all__ = ["WeightFP8ActFP8StaticQuantLinear"] + def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max From ce3384f33ec861f00e4c704f032dc99b907c8536 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:26:05 -0400 Subject: [PATCH 18/23] fix ut Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 4 +- test/test_cpu/test_export.py | 48 ++++++++++++------------ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 2a74a470d..8b7a9c138 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -20,9 +20,9 @@ __all__ = ["QModuleBase"] -class QModuleBase(ABC): +class QModuleBase(torch.nn.Module): """ - Abstract class used to describe the weight creation and forward pass + Base class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. 
The design is inspired by vLLM's CompressedTensorsScheme: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 24498c780..d648fd721 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -230,31 +230,33 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) - with torch.no_grad(): - import transformers - - model = transformers.AutoModelForCausalLM.from_pretrained( - quantized_model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - model.eval() - assert ( - model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" - ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" - tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) - prompt = "AI is " - encode = tokenizer.encode(prompt, return_tensors="pt") + if static_kv_dtype is None: with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - print(f"Prompt: {prompt}") - print(f"Output: {output}") - assert output is not None, "Output should not be None" + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ + == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) From 22d11de19ce77a04b29f28c5c19e6639a7130298 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:04:39 -0400 Subject: [PATCH 19/23] correct Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 13 +++++-------- auto_round/utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index b5c7d2dd2..90ee09357 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -18,6 +18,7 @@ import torch from auto_round.experimental.qmodules.base import QModuleBase +from auto_round.utils import logger __all__ = ["WeightFP8ActFP8StaticQuantLinear"] @@ -41,7 +42,6 @@ def __init__( weight: Optional[torch.Tensor] = None, weight_scale: Optional[torch.Tensor] = None, bias: Union[torch.Tensor, bool, 
None] = None, - weight_zp: Optional[torch.Tensor] = None, input_scale: Optional[torch.Tensor] = None, dtype=torch.bfloat16, ): @@ -57,14 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -73,7 +69,8 @@ def get_min_capability(cls) -> int: """ Get minimum device capability. """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet + # TODO: correct that config once we add fp8 op support. + logger.warning_once("FP8 ops are not yet supported. Using capability 0.") return 0 def process_weights_after_loading(self, layer: torch.nn.Module): diff --git a/auto_round/utils.py b/auto_round/utils.py index c13556827..2fd78f7a0 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -108,9 +108,17 @@ def infer_bits_by_data_type(data_type: str): return None -@lru_cache(None) -def warning_once(self, msg: str): - self.warning(msg) +@lru_cache(maxsize=None) +def warning_once(self, msg, *args, **kwargs): + """ + Log a warning message only once per unique message/arguments combination. 
+ + Args: + msg: The warning message format string + *args: Variable positional arguments for message formatting + **kwargs: Variable keyword arguments for message formatting and logging options + """ + self.warning(msg, *args, **kwargs) class AutoRoundFormatter(logging.Formatter): From 90826139a8ddfb53a983ad2e87b2ef978fcbe3fb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:05:28 -0400 Subject: [PATCH 20/23] clean Signed-off-by: yiliu30 --- auto_round/autoround.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 85ea75e60..2af8df95e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -677,8 +677,7 @@ def _parse_format_to_list(self, format: str) -> list: if is_torch_fp8_static(self): format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") formats[index] = format - # if is_torch_fp8_static(self): - # formats[index] = "auto_round:torch_fp8_static" + elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported From 2202856fabc8abe2f8ad7a964899450621fbd598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 03:34:11 -0400 Subject: [PATCH 21/23] fix shape Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 90ee09357..a6798f53d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -57,10 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False From d0b99a8f1c493d8484e10871b3a533705c8f1401 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 20:59:33 -0400 Subject: [PATCH 22/23] fix check Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 934486c5a..6ef3884a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -687,7 +687,7 @@ def _parse_format_to_list(self, format: str) -> list: format = "auto_round:auto_awq" elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = f"auto_round:{self.data_type}" - elif is_wfp8afp8(self): # staic wfp8afp8 + elif is_static_wfp8afp8(self): # staic wfp8afp8 format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}" elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8 format = "auto_round:fp8" From 31845d0d025db8b24e4676192a5b998c56188c8e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 21:02:34 -0400 Subject: [PATCH 23/23] clean code Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 -- auto_round/utils.py | 15 --------------- 2 files changed, 17 deletions(-) diff --git 
a/auto_round/autoround.py b/auto_round/autoround.py index 6ef3884a9..49e3984a7 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -73,9 +73,7 @@ is_nv_fp, is_optimum_habana_available, is_standard_fp, - is_static_afp8, is_static_wfp8afp8, - is_torch_fp8_static, is_wfp8afp8, llm_load_model, logger, diff --git a/auto_round/utils.py b/auto_round/utils.py index 9886a5337..21363688b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2527,10 +2527,6 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def is_static_afp8(ar): - return not ar.act_dynamic and "fp8" in ar.act_data_type - - def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic @@ -2546,17 +2542,6 @@ def is_weight_fp8_activation_static_fp8(config): return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) -def is_torch_fp8_static(ar): - bits, group_size, sym, data_type, act_dynamic = ( - ar.bits, - ar.group_size, - ar.sym, - ar.data_type, - ar.act_dynamic, - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)
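
End-to-end usage (illustrative sketch, not taken from the patches themselves): the snippet below shows how the static W8A8 FP8 path wired up in this series could be exercised — quantize with fp8 weights and static fp8 activations so that is_static_wfp8afp8() routes export to the auto_round:torch_fp8_static format, then reload the checkpoint and confirm that Linear layers were swapped for WeightFP8ActFP8StaticQuantLinear, mirroring the assertion in test_static_afp8_export. The model name, output directory, and the exact constructor arguments are assumptions based on the public AutoRound API, not part of this patch series.

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound  # importing auto_round should also expose its quantization config to transformers

model_name = "facebook/opt-125m"      # placeholder model
output_dir = "./opt-125m-w8a8-fp8"    # placeholder output path

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 8-bit fp8 weights, per-channel scales (group_size=-1), symmetric, with static
# (act_dynamic=False) fp8 activations: the combination detected by
# is_static_wfp8afp8(), so "auto_round" is rewritten to
# "auto_round:torch_fp8_static" in _parse_format_to_list().
ar = AutoRound(
    model,
    tokenizer,
    bits=8,
    group_size=-1,
    sym=True,
    data_type="fp8",
    act_bits=8,
    act_data_type="fp8",
    act_dynamic=False,
)
ar.quantize_and_save(output_dir=output_dir, format="auto_round")

# Reloading should map each torch.nn.Linear to WeightFP8ActFP8StaticQuantLinear,
# as asserted in test_static_afp8_export above (OPT-style module path assumed).
qmodel = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype="auto", trust_remote_code=True)
print(qmodel.model.decoder.layers[0].self_attn.k_proj.__class__.__name__)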