diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 02073e958..8580f760d 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -46,6 +46,7 @@ function run_unit_test() { CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' uv pip install -r requirements.txt + uv pip install -r requirements_diffusion.txt uv pip list export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 15bbc373d..a3a3b24bc 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion from auto_round.utils import LazyImport diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 07bc3f273..b806b583b 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -230,6 +230,25 @@ def __init__(self, *args, **kwargs): help="the template for building training dataset. It can be a custom one.", ) + ## ===================== diffusion model ================== + self.add_argument( + "--guidance_scale", + default=7.5, + type=float, + ) + + self.add_argument( + "--num_inference_steps", + default=50, + type=int, + ) + + self.add_argument( + "--generator_seed", + default=None, + type=int, + ) + ## ======================= eval ======================= self.add_argument( "--tasks", @@ -258,6 +277,22 @@ def __init__(self, *args, **kwargs): "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation." 
) + ## ======================= diffusion model eval ======================= + self.add_argument("--prompt_file", default=None, type=str, help="the prompt file to load prompts from.") + + self.add_argument("--prompt", default=None, type=str, help="the prompt used for a quick generation test.") + + self.add_argument( + "--metrics", + "--metric", + default="clip", + help="supported metrics: clip, clip-iqa, imagereward", + ) + + self.add_argument( + "--image_save_dir", default="./tmp_image_save", type=str, help="path to save generated images" + ) + def setup_parser(): parser = BasicArgumentParser() @@ -427,6 +462,7 @@ def tune(args): ) from auto_round.compressors import ( + DiffusionExtraConfig, ExtraConfig, MLLMExtraConfig, SchemeExtraConfig, @@ -466,9 +502,15 @@ def tune(args): mllm_config = MLLMExtraConfig( quant_nontext_module=args.quant_nontext_module, extra_data_dir=args.extra_data_dir, template=args.template ) + diffusion_config = DiffusionExtraConfig( + guidance_scale=args.guidance_scale, + num_inference_steps=args.num_inference_steps, + generator_seed=args.generator_seed, + ) extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + extra_config.diffusion_config = diffusion_config autoround: BaseCompressor = AutoRound( model=model_name, @@ -524,6 +566,45 @@ def tune(args): model.eval() clear_memory() + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + + # diffusion model has different evaluation path + if getattr(autoround, "diffusion", False): + pipe = autoround.pipe + pipe.to(model.dtype) + pipe.transformer = model + device_str = detect_device(device_str) + pipe = pipe.to(device_str) + if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto": + pipe.to(getattr(torch, eval_model_dtype)) + + gen_kwargs = { + "guidance_scale": args.guidance_scale, + "output_type": "pil", + "num_inference_steps": args.num_inference_steps, + "generator": ( + None + if args.generator_seed is None + else torch.Generator(device=pipe.device).manual_seed(args.generator_seed) + ), + } + if not os.path.exists(args.image_save_dir): + os.makedirs(args.image_save_dir) + + if args.prompt is not None: + outputs = pipe(prompt=args.prompt, **gen_kwargs) + outputs.images[0].save(os.path.join(args.image_save_dir, "img.png")) + logger.info( + f"Image generated with prompt {args.prompt} is saved as {os.path.join(args.image_save_dir, 'img.png')}" + ) + + if args.prompt_file is not None: + from auto_round.compressors.diffusion import diffusion_eval + + metrics = args.metrics.split(",") + diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs) + return + lm_eval_version = get_library_version("lm-eval") eval_folder = folders[-1] @@ -545,8 +626,6 @@ def tune(args): import time - eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model: if eval_gguf_model: # for file in os.listdir(eval_folder): diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4074213a9..68420d65c 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -20,13 +20,14 @@ from auto_round.compressors import ( AdamCompressor, BaseCompressor, + DiffusionCompressor, ExtraConfig, LLMCompressor, MLLMCompressor, ) from auto_round.logger import deprecated, logger from auto_round.schemes import QuantizationScheme -from auto_round.utils import is_mllm_model +from auto_round.utils import is_diffusion_model, is_mllm_model class AutoRound: @@ -77,7 +78,7 @@ def __new__( seed: int =
42, # for adam enable_adam: bool = False, - # for MLLM + # for MLLM and Diffusion extra_config: ExtraConfig = None, **kwargs, ) -> BaseCompressor: @@ -145,9 +146,17 @@ def __new__( if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) + if extra_config: + extra_config.diffusion_config = None + elif (extra_config and not extra_config.diffusion_config.is_default()) or is_diffusion_model(model): + logger.info("using Diffusion mode for diffusion model.") + model_cls.append(DiffusionCompressor) + if extra_config: + extra_config.mllm_config = None else: if extra_config: extra_config.mllm_config = None + extra_config.diffusion_config = None model_cls.append(LLMCompressor) if enable_adam: @@ -540,3 +549,83 @@ def __init__( seed=seed, **kwargs, ) + + +@deprecated("AutoRound") +class AutoRoundDiffusion(DiffusionCompressor): + """Class for automatic rounding-based quantization with Diffusion models. + + Args: + model: The PyTorch model to be quantized. + tokenizer: An optional tokenizer for processing input data; it is not used for diffusion models. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise from which an image is generated (default is None). + scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. + layer_config (dict): Configuration for weight quantization (default is None). + dataset: The path or name of the calib dataset. + iters (int): Number of iterations (default is 200). + seqlen (int): Length of the sequence. + nsamples (int): Number of samples (default is 128). + batch_size (int): Batch size for training (default is 8). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). + device_map (str | dict | int | torch.device, optional): Device placement map. Defaults to 0. + enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer (default is False). + **kwargs: Additional keyword arguments.
+ """ + + bits: int | None + group_size: int | None + sym: bool | None + data_type: str | None + act_bits: int | None + act_group_size: int | None + act_sym: bool | None + act_data_type: str | None + act_dynamic: bool | None + super_bits: int | None + super_group_size: int | None + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + **kwargs, + ): + super().__init__( + model=model, + tokenizer=None, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + **kwargs, + ) diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py index 35bbc5666..dbf47b9c2 100644 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -14,7 +14,9 @@ from auto_round.compressors.base import * from auto_round.compressors.mllm.compressor import MLLMCompressor +from auto_round.compressors.diffusion.compressor import DiffusionCompressor from auto_round.compressors.config import ( + DiffusionExtraConfig, ExtraConfig, MLLMExtraConfig, SchemeExtraConfig, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 01546034d..39f1b6575 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -18,6 +18,7 @@ import sys import time import traceback +from collections import defaultdict from dataclasses import asdict, fields from enum import Enum from typing import Any, Callable, Union @@ -229,6 +230,7 @@ def __init__( device = kwargs.pop("device", None) self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False + self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False # Scale factor for RAM usage per parameter. 
self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) fp_layers = kwargs.pop("fp_layers", None) @@ -276,7 +278,7 @@ def __init__( device="cpu", low_cpu_mem_mode=low_cpu_mem_usage, # always load cpu first ) - elif tokenizer is None and iters > 0: + elif tokenizer is None and not self.diffusion and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") self.low_cpu_mem_usage = bool(low_cpu_mem_usage) if unsupport_meta_device(model): @@ -342,7 +344,7 @@ def __init__( model, tokenizer, low_cpu_mem_usage = llm_load_model( model, device=device, low_cpu_mem_mode=low_cpu_mem_usage ) - elif tokenizer is None and iters > 0: + elif tokenizer is None and not self.diffusion and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") self.low_cpu_mem_usage = bool(low_cpu_mem_usage) if unsupport_meta_device(model): @@ -1697,6 +1699,19 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) cnt = 1 cnt += 1 + def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: + keys = inputs.keys() + input_id_str = [key for key in keys if key.startswith("hidden_state")] + if len(input_id_str) != 1: + raise RuntimeError( + "hidden_states arg mismatch error," + "please raise an issue in https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_id_str[0], None) + if q_inputs is not None: + q_inputs = q_inputs.pop(input_id_str[0], None) + return inputs, q_inputs + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. Returns: @@ -1784,7 +1799,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if len(all_blocks) > 1: pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) else: - pbar = None # move the alg warning outside pbar + pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar for block_names in all_blocks: inputs = all_inputs[block_names[0]] @@ -1793,16 +1808,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if all_q_inputs is not None: q_inputs = all_q_inputs[block_names[0]] all_q_inputs.pop(block_names[0]) - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs["input_ids"] = q_inputs.pop(input_id_str[0], None) + + inputs, q_inputs = self._update_inputs(inputs, q_inputs) clear_memory(self.inputs) @@ -1816,7 +1823,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model, inputs, block_names, - q_input=q_inputs["input_ids"] if q_inputs is not None else None, + q_input=q_inputs if q_inputs is not None else None, nblocks=self.nblocks, device=self.device, pbar=pbar, @@ -1826,6 +1833,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: f"Expected exactly one packing format when 'is_packing_immediate' is True, " f"but got {len(self.formats)} formats." 
) + pbar.set_description("Quantizing done") + pbar.close() self._quantize_layers(layer_names, all_inputs) ##TODO pack layer immediately @@ -2686,6 +2695,30 @@ def get_act_max_hook(module, input, output): continue return hook_handles + def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output + + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids: list[torch.Tensor], + input_others: dict, + indices: list[int], + device: str, + ) -> torch.Tensor: + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.shared_cache_keys, + ) + output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device) + return output_q + def _quantize_block( self, block: torch.nn.Module, @@ -2831,22 +2864,12 @@ def _quantize_block( for tmp_step in range(self.gradient_accumulate_steps): indices = whole_indices[tmp_step * self.batch_size : (tmp_step + 1) * self.batch_size] - current_input_ids, current_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - seqlen=self.seqlen, - batch_dim=self.batch_dim, - share_cache_keys=self.shared_cache_keys, - ) - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) + current_output = self._get_current_output(output, indices) + current_output = to_device(current_output, device) - output_q = block_forward( - block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device - ) + output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) if self.amp: with autocast(device_type=device.split(":")[0], dtype=self.amp_dtype): loss = mse_loss(output_q, current_output) # pylint: disable=not-callable @@ -2925,6 +2948,12 @@ def _quantize_block( clear_memory(input_ids) return None, output + def _split_inputs(self, inputs: dict) -> tuple[torch.Tensor, dict]: + input_ids = inputs["input_ids"] + inputs.pop("input_ids", None) + input_others = inputs + return input_ids, input_others + def _quantize_blocks( self, model: torch.nn.Module, @@ -2950,16 +2979,14 @@ def _quantize_blocks( clear_memory() for n, m in model.named_parameters(): m.requires_grad_(False) - input_ids = inputs["input_ids"] - inputs.pop("input_ids", None) - input_others = inputs + + input_ids, input_others = self._split_inputs(inputs) clear_memory() input_ids = to_device(input_ids, self.cache_device) input_others = to_device(input_others, self.cache_device) # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage tmp_dtype = self.amp_dtype if self.amp else torch.float32 - for i in range(len(input_ids)): - input_ids[i] = input_ids[i].to(tmp_dtype) + input_ids = to_dtype(input_ids, tmp_dtype) for key in input_others.keys(): if isinstance(input_others[key], torch.Tensor) and ( @@ -3057,9 +3084,9 @@ def _quantize_blocks( PACKING_LAYER_WITH_FORMAT[target_backend]( tmp_m.tmp_name, self.model, self.formats[0], device=self.device ) - pbar.set_description("Quantizing done") - pbar.update(1) - pbar.close() + if pbar is not None: + pbar.update(1) + self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage) for n, m in self.model.named_modules(): if hasattr(m, "name"): @@ -3299,7 +3326,7 @@ def _step(self, scaler: Any, optimizer: 
Any, lr_schedule: Any): @torch.no_grad() def _sampling_inputs( cls, - input_ids: list[torch.Tensor], + input_ids: Union[list[torch.Tensor], dict], input_others: dict, indices: list[int], seqlen: int, @@ -3318,9 +3345,14 @@ def _sampling_inputs( current_input_ids: The sampled input IDs. current_input_others: The sampled other input data. """ - current_input_ids = [input_ids[i] for i in indices] - - current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + if isinstance(input_ids, list): + current_input_ids = [input_ids[i] for i in indices] + current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + elif isinstance(input_ids, dict): + current_input_ids = defaultdict(list) + for k in input_ids.keys(): + current_input_ids[k].extend([input_ids[k][i] for i in indices]) + current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) current_input_others = {"positional_inputs": input_others["positional_inputs"]} for key in input_others.keys(): diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py index 5b40735ff..d42e13427 100644 --- a/auto_round/compressors/config.py +++ b/auto_round/compressors/config.py @@ -26,6 +26,7 @@ class ExtraConfig: _scheme_config = None _tuning_config = None _mllm_config = None + _diffusion_config = None def __init__( self, @@ -65,6 +66,10 @@ def __init__( quant_nontext_module: bool = False, extra_data_dir: str = None, template: str = None, + # diffusion + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, ): """Initialize @@ -103,6 +108,10 @@ def __init__( quant_nontext_module: Whether to quantize nontext module. extra_data_dir: The path of extra data such as images, audio and videos. template: The path or name of template used to specify process for different MLLMs. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise for image generation (default is None).
""" self.tuning_config = TuningExtraConfig( amp=amp, @@ -143,6 +152,11 @@ def __init__( extra_data_dir=extra_data_dir, template=template, ) + self.diffusion_config = DiffusionExtraConfig( + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + ) @property def tuning_config(self): @@ -180,6 +194,20 @@ def mllm_config(self, config: MLLMExtraConfig): ), f"mllm_config should be MLLMExtraConfig, but got {config.__class__.__name__}" self._mllm_config = config + @property + def diffusion_config(self): + return self._diffusion_config + + @diffusion_config.setter + def diffusion_config(self, config: DiffusionExtraConfig): + if config is None: + self._diffusion_config = None + else: + assert isinstance( + config, DiffusionExtraConfig + ), f"diffusion_config should be DiffusionExtraConfig, but got {config.__class__.__name__}" + self._diffusion_config = config + def to_dict(self): output_dict = {} for config in self.__dict__.values(): @@ -263,3 +291,10 @@ class MLLMExtraConfig(BaseExtraConfig): quant_nontext_module: bool = False extra_data_dir: str = None template: str = None + + +@dataclass +class DiffusionExtraConfig(BaseExtraConfig): + guidance_scale: float = 7.5 + num_inference_steps: int = 50 + generator_seed: int = None diff --git a/auto_round/compressors/diffusion/__init__.py b/auto_round/compressors/diffusion/__init__.py new file mode 100644 index 000000000..b084e94f2 --- /dev/null +++ b/auto_round/compressors/diffusion/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.compressors.diffusion.compressor import DiffusionCompressor +from auto_round.compressors.diffusion.eval import diffusion_eval diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py new file mode 100644 index 000000000..4c4b9a00e --- /dev/null +++ b/auto_round/compressors/diffusion/compressor.py @@ -0,0 +1,367 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import defaultdict +from copy import deepcopy +from typing import Union + +import torch +from tqdm import tqdm + +from auto_round.compressors.base import BaseCompressor +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.logger import logger +from auto_round.low_cpu_mem.utils import get_layers_before_block +from auto_round.schemes import QuantizationScheme +from auto_round.utils import ( + LazyImport, + block_forward, + clear_memory, + diffusion_load_model, + extract_block_names_to_str, + find_matching_blocks, + get_block_names, +) + +pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + +output_configs = { + "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], +} + + +class DiffusionCompressor(BaseCompressor): + """Class for automatic rounding-based quantization with Diffusion models. + + Args: + model: The PyTorch model to be quantized. + tokenizer: An optional tokenizer for processing input data; it is not used for diffusion models. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise from which an image is generated (default is None). + scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. + layer_config (dict): Configuration for weight quantization (default is None). + dataset: The path or name of the calib dataset. + iters (int): Number of iterations (default is 200). + seqlen (int): Length of the sequence. + nsamples (int): Number of samples (default is 128). + batch_size (int): Batch size for training (default is 8). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). + device_map (str | dict | int | torch.device, optional): Device placement map. Defaults to 0. + enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer (default is False). + **kwargs: Additional keyword arguments.
+ """ + + bits: int | None + group_size: int | None + sym: bool | None + data_type: str | None + act_bits: int | None + act_group_size: int | None + act_sym: bool | None + act_data_type: str | None + act_dynamic: bool | None + super_bits: int | None + super_group_size: int | None + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + **kwargs, + ): + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed + + to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) + if device_map is None: + device_map = 0 + self._set_device(device_map) + + if isinstance(model, str): + pipe, model = diffusion_load_model(model, device=self.device) + elif isinstance(model, pipeline_utils.DiffusionPipeline): + pipe = model + model = pipe.transformer + else: + raise ValueError(f"Only support str or DiffusionPipeline class for model, but get {type(model)}") + + self.model = model + self.pipe = pipe + + all_blocks = get_block_names(model) + self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names) + if to_quant_block_names is None: + to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + + if iters > 0 and batch_size != 1: + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps({gradient_accumulate_steps}) " + f"to {batch_size * gradient_accumulate_steps}, " + f"because batch_size={batch_size} cannot be used for calibrating non-text modules." 
+ ) + gradient_accumulate_steps = batch_size * gradient_accumulate_steps + batch_size = 1 + + seqlen = 2048 if seqlen is None else seqlen + + if nsamples % batch_size != 0: + nsamples = (nsamples // batch_size + 1) * batch_size + logger.warning(f"'nsamples' is not divisible by 'batch_size', will be adjusted to {nsamples}") + + kwargs["diffusion"] = True + super(DiffusionCompressor, self).__init__( + model=model, + tokenizer=None, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + to_quant_block_names=to_quant_block_names, + **kwargs, + ) + + def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: + # flux transformer model's blocks will update hidden_states and encoder_hidden_states + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + if q_inputs is not None: + q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} + return inputs, q_inputs + + def _split_inputs(self, inputs: dict) -> tuple[dict, dict]: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + input_ids = {k: inputs.pop(k, None) for k in input_id_str} + input_others = inputs + return input_ids, input_others + + def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output + + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids: dict, + input_others: dict, + indices: list[int], + device: str, + ) -> torch.Tensor: + output_config = output_configs.get(block.__class__.__name__, []) + idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.shared_cache_keys, + ) + if isinstance(current_input_ids, dict): + hidden_states = current_input_ids.pop("hidden_states") + current_input_others.update(current_input_ids) + current_input_ids = hidden_states + output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) + return output_q + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids: torch.Tensor, + input_others: torch.Tensor, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The cached hidden-state inputs for the block. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + save_output: Whether to store and return the block outputs (default is True). + + Returns: + A dict mapping output names to the cached outputs of the block.
+ """ + + output = defaultdict(list) + nsamples = len(input_ids) + output_config = output_configs.get(block.__class__.__name__, []) + + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, input_others, indices, self.seqlen, self.batch_dim, share_cache_keys=self.shared_cache_keys + ) + if isinstance(tmp_input_ids, dict): + hidden_states = tmp_input_ids.pop("hidden_states") + tmp_input_others.update(tmp_input_ids) + tmp_input_ids = hidden_states + + tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + assert len(output_config) == len(tmp_output) + tmp_output = dict(zip(output_config, tmp_output)) + + if save_output: + for name, out in tmp_output.items(): + if self.batch_size == 1: + output[name].append(out.to(cache_device)) + else: + output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) + if self.low_gpu_mem_usage: + clear_memory() + + return output + + def calib(self, nsamples, bs): + """Perform calibration for quantization. + + This method calibrates the model for quantization by processing a specified + number of samples from the calibration dataset. It ensures that the data is + properly formatted and feeds it to the model. If the number of samples processed + is less than the specified number, it logs a warning. If no samples are processed, + it logs an error and exits. + Args: + nsamples (int): The number of samples to use for calibration. + bs (int): The number of samples to use for calibration + """ + logger.warning( + "Diffusion model will catch nsamples * num_inference_steps inputs, " + "you can reduce nsamples or num_inference_steps if OOM or take too much time." + ) + if isinstance(self.dataset, str): + dataset = self.dataset.replace(" ", "") + self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( + dataset=dataset, + bs=self.batch_size, + seed=self.seed, + nsamples=self.nsamples, + gradient_accumulate_steps=self.gradient_accumulate_steps, + ) + else: + self.dataloader = self.dataset + total_cnt = 0 + + if self.low_cpu_mem_usage: + embed_layers = get_layers_before_block(self.model) + for n, m in embed_layers: + m = m.to(self.device) + + total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) + if self.pipe.dtype != self.model.dtype: + self.pipe.to(self.model.dtype) + if self.pipe.device != self.model.device: + self.pipe.to(self.model.device) + with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: + for ids, prompts in self.dataloader: + if isinstance(prompts, tuple): + prompts = list(prompts) + try: + self.pipe( + prompt=prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=( + None + if self.generator_seed is None + else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + ), + ) + except NotImplementedError: + pass + except Exception as error: + raise error + step = len(prompts) + total_cnt += step + pbar.update(step) + if total_cnt >= nsamples: + break + if total_cnt == 0: + logger.error( + f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " + f"dataset or decease the sequence length" + ) + exit(-1) + elif total_cnt < nsamples: + logger.warning( + f"Insufficient number of samples collected may affect the quantization. 
" + f"target samples count is {nsamples}, while valid samples count is {total_cnt}" + ) + if total_cnt < self.batch_size: + raise ValueError( + f"valid samples is less than batch_size({self.batch_size})," + " please adjust self.batch_size or seqlen." + ) + max_len = (total_cnt // self.batch_size) * self.batch_size + for k, v in self.inputs.items(): + for key in v: + if isinstance(v[key], list) and len(v[key]) == total_cnt: + self.inputs[k][key] = v[key][:max_len] + + # clean embed weight to save memory + if self.low_cpu_mem_usage: + for n, m in embed_layers: + m = m.to("meta") + # torch.cuda.empty_cache() + + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): + """Save the quantized model to the specified output directory in the specified format. + + Args: + output_dir (str, optional): The directory to save the quantized model. Defaults to None. + format (str, optional): The format in which to save the model. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place. Defaults to True. + **kwargs: Additional keyword arguments specific to the export format. + + Returns: + object: The compressed model object. + """ + compressed_model = super().save_quantized(output_dir=output_dir, format=format, inplace=inplace, **kwargs) + return compressed_model diff --git a/auto_round/compressors/diffusion/dataset.py b/auto_round/compressors/diffusion/dataset.py new file mode 100644 index 000000000..a716a8a58 --- /dev/null +++ b/auto_round/compressors/diffusion/dataset.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict + +import pandas as pd +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import set_seed + +from auto_round.utils import logger + +DIFFUSION_DATASET: Dict[str, Dataset] = {} + + +COCO_URL = { + "coco2014": ( + "https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/" + "coco2014/captions/captions_source.tsv" + ) +} + + +def register_dataset(name_list): + """Class decorator to register a DATASET subclass to the registry. + + Decorator function used before a Pattern subclass. + + Args: + name: A string. Define the dataset type. + + Returns: + cls: The class of register. 
+ """ + + def register(dataset): + for name in name_list.replace(" ", "").split(","): + DIFFUSION_DATASET[name] = dataset + + return register + + +@register_dataset("local") +class Text2ImgDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + dataset_path, + nsamples=128, + ) -> None: + super().__init__() + self.captions = [] + self.caption_ids = [] + + logger.info(f"use dataset {dataset_path}, loading from disk...") + df = pd.read_csv(dataset_path, sep="\t") + + for index, row in df.iterrows(): + if nsamples > 0 and index + 1 > nsamples: + break + assert "id" in row and "caption" in row + caption_id = row["id"] + caption_text = row["caption"] + self.caption_ids.append(caption_id) + self.captions.append(caption_text) + + def __len__(self): + return len(self.captions) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return self.caption_ids[i], self.captions[i] + + +def get_diffusion_dataloader( + dataset="coco2014", + bs=1, + seed=42, + nsamples=128, + gradient_accumulate_steps=1, +): + """Generate a DataLoader for calibration using specified parameters. + Args: + Dataset_name (str): The name or path of the dataset. + bs (int, optional): The batch size. Defaults to 1. + Returns: + DataLoader: The DataLoader for the calibrated datasets. + """ + if dataset in COCO_URL: + import requests + + logger.info(f"use dataset {dataset}, downloading ...") + text_data = requests.get(COCO_URL[dataset]).text + with open("captions_source.tsv", "w") as f: + f.write(text_data) + dataset = "captions_source.tsv" + + if isinstance(dataset, str) and os.path.exists(dataset): + dataset = DIFFUSION_DATASET["local"](dataset, nsamples) + else: + raise ValueError("Only support coco2014 dataset or loading local tsv file now.") + set_seed(seed) + dataloader_params = {"batch_size": bs, "shuffle": True} + + return DataLoader(dataset, **dataloader_params), bs, gradient_accumulate_steps diff --git a/auto_round/compressors/diffusion/eval.py b/auto_round/compressors/diffusion/eval.py new file mode 100644 index 000000000..5baed978b --- /dev/null +++ b/auto_round/compressors/diffusion/eval.py @@ -0,0 +1,112 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import os + +import numpy as np +import torch +from PIL import Image +from tqdm import tqdm + +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.utils import LazyImport + +metrics = LazyImport("torchmetrics.multimodal") +reward = LazyImport("ImageReward") + + +def compute_clip(prompts, images, device: str = "cuda"): + clip_model = metrics.CLIPScore(model_name_or_path="openai/clip-vit-large-patch14").to(device) + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing CLIP score"): + image_data = Image.open(img_path).convert("RGB") + image_tensor = torch.from_numpy(np.array(image_data)).permute(2, 0, 1) + clip_model.update(image_tensor.to(torch.float32).to(device).unsqueeze(0), prompt) + result = clip_model.compute().mean().item() + return {"CLIP": result} + + +def compute_clip_iqa(prompts, images, device: str = "cuda"): + clip_model = metrics.CLIPImageQualityAssessment(model_name_or_path="openai/clip-vit-large-patch14").to(device) + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing CLIP-IQA score"): + image_data = Image.open(img_path).convert("RGB") + image_tensor = torch.from_numpy(np.array(image_data)).permute(2, 0, 1) + clip_model.update(image_tensor.to(torch.float32).to(device).unsqueeze(0)) + result = clip_model.compute().mean().item() + return {"CLIP-IQA": result} + + +def compute_image_reward_metrics(prompts, images, device="cuda"): + image_reward_model = reward.load("ImageReward-v1.0", device=device) + scores = [] + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing image reward metrics"): + score = image_reward_model.score(prompt, img_path) + scores.append(score) + return {"ImageReward": np.mean(scores)} + + +metric_map = { + "clip": compute_clip, + "clip-iqa": compute_clip_iqa, + "imagereward": compute_image_reward_metrics, +} + + +def diffusion_eval( + pipe, + prompt_file, + metrics, + image_save_dir, + batch_size, + gen_kwargs, +): + if ( + not importlib.util.find_spec("clip") + or not importlib.util.find_spec("ImageReward") + or not importlib.util.find_spec("torchmetrics") + ): + raise ImportError( + "Please make sure clip, image-reward and torchmetrics are installed for diffusion model evaluation." 
) + dataloader, _, _ = get_diffusion_dataloader(prompt_file, nsamples=-1, bs=batch_size) + prompt_list = [] + image_list = [] + for image_ids, prompts in dataloader: + prompt_list.extend(prompts) + + new_ids = [] + new_prompts = [] + for idx, image_id in enumerate(image_ids): + image_id = image_id.item() + image_list.append(os.path.join(image_save_dir, str(image_id) + ".png")) + + if os.path.exists(os.path.join(image_save_dir, str(image_id) + ".png")): + continue + new_ids.append(image_id) + new_prompts.append(prompts[idx]) + + if len(new_prompts) == 0: + continue + + output = pipe(prompt=new_prompts, **gen_kwargs) + for idx, image_id in enumerate(new_ids): + output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) + + result = {} + for metric in metrics: + result.update(metric_map[metric](prompt_list, image_list, pipe.device)) + + import tabulate + + print(tabulate.tabulate(result.items(), tablefmt="grid")) diff --git a/auto_round/utils.py b/auto_round/utils.py index 9af09758e..d39520f42 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -406,13 +406,21 @@ def get_block_names(model, quant_vision=False): """ from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK - def _get_llm_block_names(model): - block_names = [] + def _search_block(name, module): + if hasattr(type(module), "__name__") and "ModuleList" in type(module).__name__: + return [(name, module)] target_modules = [] - for n, m in model.named_modules(): + for n, m in module.named_children(): if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - target_modules.append((n, m)) - break ## only find the first modulelist, may be not robust + target_modules.append((".".join(filter(None, (name, n))), m)) + else: + target_modules.extend(_search_block(".".join(filter(None, (name, n))), m)) + return target_modules + + def _get_llm_block_names(model): + block_names = [] + target_modules = _search_block("", model) + for i, target_m in enumerate(target_modules): block_names.append([]) for n, m in target_m[1].named_children(): @@ -459,7 +467,15 @@ def collect_best_params(block): return params -def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.float16, device=torch.device("cpu")): +def block_forward( + block, + input_ids, + input_others, + amp=False, + amp_dtype=torch.float16, + device=torch.device("cpu"), + output_return_id=0, +): """Performs a forward pass through a block with the given inputs. Args: @@ -469,6 +485,7 @@ def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.flo amp: A boolean indicating whether to use automatic mixed precision. amp_dtype: The data type for automatic mixed precision. device: The target device. + output_return_id: if the output contains more than one tensor, return the tensor at this index. Returns: output: The output of the forward pass.
@@ -485,8 +502,8 @@ def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.flo output = block(input_ids, *input_tuple, **input_others) else: output = block(input_ids, *input_tuple, **input_others) - if isinstance(output, list) or isinstance(output, tuple): - output = output[0] + if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)): + output = output[output_return_id] return output @@ -1614,6 +1631,30 @@ def mllm_load_model( return model, processor, tokenizer, image_processor +def diffusion_load_model( + pretrained_model_name_or_path, + device="cpu", + torch_dtype="auto", + use_auto_mapping=False, + trust_remote_code=True, + model_dtype=None, + **kwargs, +): + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + + pipelines = LazyImport("diffusers.pipelines") + + pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( + pretrained_model_name_or_path, torch_dtype=torch_dtype + ) + pipe = _to_model_dtype(pipe, model_dtype) + model = pipe.transformer + return pipe, model.to(device) + + def is_pure_text_model(model): """verify on: phi-3.5, Mistral-Small-3.1, gemma-3, qwen2-vl,""" if hasattr(model, "config") and hasattr(model.config, "vision_config"): @@ -2743,11 +2784,12 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True if os.path.exists(os.path.join(model_path, "processor_config.json")): return True - with open(os.path.join(model_path, "config.json")) as f: - config = json.load(f) - for key in config.keys(): - if any([k in key for k in MM_KEYS]): - return True + if os.path.exists(os.path.join(model_path, "config.json")): + with open(os.path.join(model_path, "config.json")) as f: + config = json.load(f) + for key in config.keys(): + if any([k in key for k in MM_KEYS]): + return True if isinstance(model_or_path, torch.nn.Module): for name, module in model_or_path.named_modules(): @@ -2755,3 +2797,24 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True return False + + +def is_diffusion_model(model_or_path: Union[str, object]): + if isinstance(model_or_path, str): + index_file = None + if not os.path.isdir(model_or_path): + try: + from huggingface_hub import hf_hub_download + + index_file = hf_hub_download(model_or_path, "model_index.json") + except: + index_file = None + + elif os.path.exists(os.path.join(model_or_path, "model_index.json")): + index_file = os.path.join(model_or_path, "model_index.json") + return index_file is not None + elif not isinstance(model_or_path, torch.nn.Module): + pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) + else: + return False diff --git a/test/test_cuda/requirements_diffusion.txt b/test/test_cuda/requirements_diffusion.txt new file mode 100644 index 000000000..55908f6ac --- /dev/null +++ b/test/test_cuda/requirements_diffusion.txt @@ -0,0 +1,3 @@ +diffusers +image-reward +clip \ No newline at end of file diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py new file mode 100644 index 000000000..c5fa63471 --- /dev/null +++ b/test/test_cuda/test_diffusion.py @@ -0,0 +1,86 @@ +import copy +import os +import re +import shutil +import sys +import unittest + +import requests + +sys.path.insert(0, "../..") + +from PIL import Image + +from auto_round import AutoRoundConfig +from auto_round.testing_utils 
import require_gptqmodel, require_optimum, require_vlm_env + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.save_dir = "./saved" + self.model_name = "/dataset/FLUX.1-dev" + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.save_dir, ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + @require_optimum + def test_diffusion_tune(self): + + from diffusers import AutoPipelineForText2Image + + from auto_round import AutoRoundDiffusion + + ## load the model + pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) + model = pipe.transformer + + layer_config = {} + # skip some layers since it takes much time + for n, m in model.named_modules(): + if m.__class__.__name__ != "Linear": + continue + match = re.search(r"blocks\.(\d+)", n) + if match and int(match.group(1)) > 0: + layer_config[n] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} + + ## quantize the model + autoround = AutoRoundDiffusion( + pipe, + tokenizer=None, + scheme="MXFP4", + iters=1, + nsamples=1, + num_inference_steps=2, + layer_config=layer_config, + dataset="/dataset/captions_source.tsv", + ) + # skip model saving since it takes much time + autoround.quantize() + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_block_name(self): + from diffusers import AutoPipelineForText2Image + + from auto_round.utils import get_block_names + + pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) + model = pipe.transformer + + block_name = get_block_names(model) + self.assertTrue(len(block_name) == 2) + self.assertTrue(any(["context_embedder" not in n for n in block_name])) + + def test_diffusion_model_checker(self): + from auto_round.utils import is_diffusion_model + + self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) + self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) + self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) + self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + + +if __name__ == "__main__": + unittest.main()
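
Reviewer note: for reference, a minimal end-to-end sketch of the Python API this PR adds. The model id, scheme, iteration counts, and step counts below are illustrative placeholders (not values mandated by the PR); wiring the quantized transformer back into the pipeline and moving it to GPU mirrors what the evaluation path in `__main__.py` does.

```python
# Usage sketch (assumptions: a FLUX checkpoint is reachable at the given id and a CUDA device exists).
from diffusers import AutoPipelineForText2Image

from auto_round import AutoRoundDiffusion

# AutoRoundDiffusion accepts either a model path string or a DiffusionPipeline; it quantizes pipe.transformer.
pipe = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev")

autoround = AutoRoundDiffusion(
    pipe,
    tokenizer=None,           # tokenizer is ignored for diffusion models
    scheme="W4A16",
    dataset="coco2014",       # downloads the MLCommons captions tsv for calibration
    nsamples=8,               # small values for a quick run; defaults are 128 samples / 200 iters
    iters=10,
    num_inference_steps=4,    # fewer denoising steps -> fewer cached block inputs
    generator_seed=42,
)
model, layer_config = autoround.quantize()

# Wire the quantized transformer back into the pipeline before generating,
# as the CLI evaluation branch does, then generate a sample image.
pipe.transformer = model
pipe = pipe.to("cuda")
image = pipe(prompt="a photo of an astronaut riding a horse", num_inference_steps=4).images[0]
image.save("img.png")
```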
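A similar sketch of the standalone evaluation helper added under `auto_round/compressors/diffusion/eval.py`. It assumes `clip`, `image-reward`, and `torchmetrics` are installed (see `requirements_diffusion.txt`), a local captions tsv with `id` and `caption` columns, and a pipeline already loaded on GPU; all paths and the model id are placeholders.

```python
# Evaluation sketch: generates one image per prompt into image_save_dir and prints metric scores as a table.
import os

import torch
from diffusers import AutoPipelineForText2Image

from auto_round.compressors.diffusion import diffusion_eval

pipe = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev").to("cuda")

gen_kwargs = {
    "guidance_scale": 7.5,
    "num_inference_steps": 20,
    "output_type": "pil",
    "generator": torch.Generator(device=pipe.device).manual_seed(42),
}

# diffusion_eval saves images into this directory, so create it first (the CLI path does the same).
os.makedirs("./tmp_image_save", exist_ok=True)
diffusion_eval(pipe, "captions_source.tsv", ["clip", "clip-iqa"], "./tmp_image_save", 1, gen_kwargs)
```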