diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 02073e958..8580f760d 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -46,6 +46,7 @@ function run_unit_test() { CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' uv pip install -r requirements.txt + uv pip install -r requirements_diffusion.txt uv pip list export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 15bbc373d..a3a3b24bc 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -14,7 +14,7 @@ from auto_round.autoround import AutoRound # support for old api -from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam +from auto_round.autoround import AutoRoundLLM, AutoRoundMLLM, AutoRoundAdam, AutoRoundDiffusion from auto_round.utils import LazyImport diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 07bc3f273..b806b583b 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -230,6 +230,25 @@ def __init__(self, *args, **kwargs): help="the template for building training dataset. It can be a custom one.", ) + ## ===================== diffusion model ================== + self.add_argument( + "--guidance_scale", + default=7.5, + type=float, + ) + + self.add_argument( + "--num_inference_steps", + default=50, + type=int, + ) + + self.add_argument( + "--generator_seed", + default=None, + type=int, + ) + ## ======================= eval ======================= self.add_argument( "--tasks", @@ -258,6 +277,22 @@ def __init__(self, *args, **kwargs): "--eval_model_dtype", default=None, type=str, help="the torch_dytpe to load the model for evaluation." 
) + ## ======================= diffusion model eval ======================= + self.add_argument("--prompt_file", default=None, type=str, help="the prompt file to load prompts from.") + + self.add_argument("--prompt", default=None, type=str, help="the prompt used for a quick generation test.") + + self.add_argument( + "--metrics", + "--metric", + default="clip", + help="supported metrics: clip, clip-iqa, imagereward", + ) + + self.add_argument( + "--image_save_dir", default="./tmp_image_save", type=str, help="path to save generated images" + ) + def setup_parser(): parser = BasicArgumentParser() @@ -427,6 +462,7 @@ def tune(args): ) from auto_round.compressors import ( + DiffusionExtraConfig, ExtraConfig, MLLMExtraConfig, SchemeExtraConfig, @@ -466,9 +502,15 @@ def tune(args): mllm_config = MLLMExtraConfig( quant_nontext_module=args.quant_nontext_module, extra_data_dir=args.extra_data_dir, template=args.template ) + diffusion_config = DiffusionExtraConfig( + guidance_scale=args.guidance_scale, + num_inference_steps=args.num_inference_steps, + generator_seed=args.generator_seed, + ) extra_config.tuning_config = tuning_config extra_config.scheme_config = scheme_config extra_config.mllm_config = mllm_config + extra_config.diffusion_config = diffusion_config autoround: BaseCompressor = AutoRound( model=model_name, @@ -524,6 +566,45 @@ def tune(args): model.eval() clear_memory() + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + + # diffusion model has different evaluation path + if getattr(autoround, "diffusion", False): + pipe = autoround.pipe + pipe.to(model.dtype) + pipe.transformer = model + device_str = detect_device(device_str) + pipe = pipe.to(device_str) + if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto": + pipe.to(getattr(torch, eval_model_dtype)) + + gen_kwargs = { + "guidance_scale": args.guidance_scale, + "output_type": "pil", + "num_inference_steps": args.num_inference_steps, + "generator": ( + None + if args.generator_seed is None + else torch.Generator(device=pipe.device).manual_seed(args.generator_seed) + ), + } + if not os.path.exists(args.image_save_dir): + os.makedirs(args.image_save_dir) + + if args.prompt is not None: + outputs = pipe(prompt=args.prompt, **gen_kwargs) + outputs.images[0].save(os.path.join(args.image_save_dir, "img.png")) + logger.info( + f"Image generated with prompt {args.prompt} is saved as {os.path.join(args.image_save_dir, 'img.png')}" + ) + + if args.prompt_file is not None: + from auto_round.compressors.diffusion import diffusion_eval + + metrics = args.metrics.split(",") + diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs) + return + lm_eval_version = get_library_version("lm-eval") eval_folder = folders[-1] @@ -545,8 +626,6 @@ def tune(args): import time - eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model: if eval_gguf_model: # for file in os.listdir(eval_folder): diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 4074213a9..68420d65c 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -20,13 +20,14 @@ from auto_round.compressors import ( AdamCompressor, BaseCompressor, + DiffusionCompressor, ExtraConfig, LLMCompressor, MLLMCompressor, ) from auto_round.logger import deprecated, logger from auto_round.schemes import QuantizationScheme -from auto_round.utils import is_mllm_model +from auto_round.utils import is_diffusion_model, is_mllm_model class AutoRound: @@ -77,7 +78,7 @@ def __new__( seed: int =
42, # for adam enable_adam: bool = False, - # for MLLM + # for MLLM and Diffusion extra_config: ExtraConfig = None, **kwargs, ) -> BaseCompressor: @@ -145,9 +146,17 @@ def __new__( if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model): logger.info("using MLLM mode for multimodal model.") model_cls.append(MLLMCompressor) + if extra_config: + extra_config.diffusion_config = None + elif (extra_config and not extra_config.diffusion_config.is_default()) or is_diffusion_model(model): + logger.info("using Diffusion mode for diffusion model.") + model_cls.append(DiffusionCompressor) + if extra_config: + extra_config.mllm_config = None else: if extra_config: extra_config.mllm_config = None + extra_config.diffusion_config = None model_cls.append(LLMCompressor) if enable_adam: @@ -540,3 +549,83 @@ def __init__( seed=seed, **kwargs, ) + + +@deprecated("AutoRound") +class AutoRoundDiffusion(DiffusionCompressor): + """Class for automatic rounding-based quantization with Diffusion models. + + Args: + model: The PyTorch model to be quantized. + tokenizer: An optional tokenizer for processing input data; it is not used for diffusion models. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise from which an image is generated (default is None). + scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. + layer_config (dict): Configuration for weight quantization (default is None). + dataset: The path or name of the calib dataset. + iters (int): Number of iterations (default is 200). + seqlen (int): Length of the sequence. + nsamples (int): Number of samples (default is 128). + batch_size (int): Batch size for training (default is 8). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). + device_map (str | dict | int | torch.device, optional): Device placement map. Defaults to 0. + enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer (default is False). + **kwargs: Additional keyword arguments.
+ """ + + bits: int | None + group_size: int | None + sym: bool | None + data_type: str | None + act_bits: int | None + act_group_size: int | None + act_sym: bool | None + act_data_type: str | None + act_dynamic: bool | None + super_bits: int | None + super_group_size: int | None + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + **kwargs, + ): + super().__init__( + model=model, + tokenizer=None, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + **kwargs, + ) diff --git a/auto_round/compressors/__init__.py b/auto_round/compressors/__init__.py index 35bbc5666..dbf47b9c2 100644 --- a/auto_round/compressors/__init__.py +++ b/auto_round/compressors/__init__.py @@ -14,7 +14,9 @@ from auto_round.compressors.base import * from auto_round.compressors.mllm.compressor import MLLMCompressor +from auto_round.compressors.diffusion.compressor import DiffusionCompressor from auto_round.compressors.config import ( + DiffusionExtraConfig, ExtraConfig, MLLMExtraConfig, SchemeExtraConfig, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 01546034d..39f1b6575 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -18,6 +18,7 @@ import sys import time import traceback +from collections import defaultdict from dataclasses import asdict, fields from enum import Enum from typing import Any, Callable, Union @@ -229,6 +230,7 @@ def __init__( device = kwargs.pop("device", None) self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False + self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False # Scale factor for RAM usage per parameter. 
self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) fp_layers = kwargs.pop("fp_layers", None) @@ -276,7 +278,7 @@ def __init__( device="cpu", low_cpu_mem_mode=low_cpu_mem_usage, # always load cpu first ) - elif tokenizer is None and iters > 0: + elif tokenizer is None and not self.diffusion and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") self.low_cpu_mem_usage = bool(low_cpu_mem_usage) if unsupport_meta_device(model): @@ -342,7 +344,7 @@ def __init__( model, tokenizer, low_cpu_mem_usage = llm_load_model( model, device=device, low_cpu_mem_mode=low_cpu_mem_usage ) - elif tokenizer is None and iters > 0: + elif tokenizer is None and not self.diffusion and iters > 0: raise ValueError("A tokenizer must be set for non-str model input") self.low_cpu_mem_usage = bool(low_cpu_mem_usage) if unsupport_meta_device(model): @@ -1697,6 +1699,19 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) cnt = 1 cnt += 1 + def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, torch.Tensor]: + keys = inputs.keys() + input_id_str = [key for key in keys if key.startswith("hidden_state")] + if len(input_id_str) != 1: + raise RuntimeError( + "hidden_states arg mismatch error," + "please raise an issue in https://github.com/intel/auto-round/issues" + ) + inputs["input_ids"] = inputs.pop(input_id_str[0], None) + if q_inputs is not None: + q_inputs = q_inputs.pop(input_id_str[0], None) + return inputs, q_inputs + def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: """Quantize the model and return the quantized model along with layer configurations.The entry of AutoRound. Returns: @@ -1784,7 +1799,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if len(all_blocks) > 1: pbar = tqdm(range(0, sum([len(i) for i in all_blocks]), self.nblocks)) else: - pbar = None # move the alg warning outside pbar + pbar = tqdm(range(0, len(all_blocks[0]), self.nblocks)) # move the alg warning outside pbar for block_names in all_blocks: inputs = all_inputs[block_names[0]] @@ -1793,16 +1808,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: if all_q_inputs is not None: q_inputs = all_q_inputs[block_names[0]] all_q_inputs.pop(block_names[0]) - keys = inputs.keys() - input_id_str = [key for key in keys if key.startswith("hidden_state")] - if len(input_id_str) != 1: - raise RuntimeError( - "hidden_states arg mismatch error," - "please raise an issue in https://github.com/intel/auto-round/issues" - ) - inputs["input_ids"] = inputs.pop(input_id_str[0], None) - if q_inputs is not None: - q_inputs["input_ids"] = q_inputs.pop(input_id_str[0], None) + + inputs, q_inputs = self._update_inputs(inputs, q_inputs) clear_memory(self.inputs) @@ -1816,7 +1823,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: self.model, inputs, block_names, - q_input=q_inputs["input_ids"] if q_inputs is not None else None, + q_input=q_inputs if q_inputs is not None else None, nblocks=self.nblocks, device=self.device, pbar=pbar, @@ -1826,6 +1833,8 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]: f"Expected exactly one packing format when 'is_packing_immediate' is True, " f"but got {len(self.formats)} formats." 
) + pbar.set_description("Quantizing done") + pbar.close() self._quantize_layers(layer_names, all_inputs) ##TODO pack layer immediately @@ -2686,6 +2695,30 @@ def get_act_max_hook(module, input, output): continue return hook_handles + def _get_current_output(self, output: list[torch.Tensor], indices: list[int]) -> torch.Tensor: + current_output = [output[x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output + + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids: list[torch.Tensor], + input_others: dict, + indices: list[int], + device: str, + ) -> torch.Tensor: + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.shared_cache_keys, + ) + output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device) + return output_q + def _quantize_block( self, block: torch.nn.Module, @@ -2831,22 +2864,12 @@ def _quantize_block( for tmp_step in range(self.gradient_accumulate_steps): indices = whole_indices[tmp_step * self.batch_size : (tmp_step + 1) * self.batch_size] - current_input_ids, current_input_others = self._sampling_inputs( - input_ids, - input_others, - indices, - seqlen=self.seqlen, - batch_dim=self.batch_dim, - share_cache_keys=self.shared_cache_keys, - ) - current_output = [output[x] for x in indices] - current_output = torch.cat(current_output, dim=self.batch_dim) + current_output = self._get_current_output(output, indices) + current_output = to_device(current_output, device) - output_q = block_forward( - block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device - ) + output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) if self.amp: with autocast(device_type=device.split(":")[0], dtype=self.amp_dtype): loss = mse_loss(output_q, current_output) # pylint: disable=not-callable @@ -2925,6 +2948,12 @@ def _quantize_block( clear_memory(input_ids) return None, output + def _split_inputs(self, inputs: dict) -> tuple[torch.Tensor, dict]: + input_ids = inputs["input_ids"] + inputs.pop("input_ids", None) + input_others = inputs + return input_ids, input_others + def _quantize_blocks( self, model: torch.nn.Module, @@ -2950,16 +2979,14 @@ def _quantize_blocks( clear_memory() for n, m in model.named_parameters(): m.requires_grad_(False) - input_ids = inputs["input_ids"] - inputs.pop("input_ids", None) - input_others = inputs + + input_ids, input_others = self._split_inputs(inputs) clear_memory() input_ids = to_device(input_ids, self.cache_device) input_others = to_device(input_others, self.cache_device) # As in calibration phase, we may use bf16 for calibration due to low_gpu_memory usage tmp_dtype = self.amp_dtype if self.amp else torch.float32 - for i in range(len(input_ids)): - input_ids[i] = input_ids[i].to(tmp_dtype) + input_ids = to_dtype(input_ids, tmp_dtype) for key in input_others.keys(): if isinstance(input_others[key], torch.Tensor) and ( @@ -3057,9 +3084,9 @@ def _quantize_blocks( PACKING_LAYER_WITH_FORMAT[target_backend]( tmp_m.tmp_name, self.model, self.formats[0], device=self.device ) - pbar.set_description("Quantizing done") - pbar.update(1) - pbar.close() + if pbar is not None: + pbar.update(1) + self.model = mv_module_from_gpu(self.model, self.low_cpu_mem_usage) for n, m in self.model.named_modules(): if hasattr(m, "name"): @@ -3299,7 +3326,7 @@ def _step(self, scaler: Any, optimizer: 
Any, lr_schedule: Any): @torch.no_grad() def _sampling_inputs( cls, - input_ids: list[torch.Tensor], + input_ids: Union[list[torch.Tensor], dict], input_others: dict, indices: list[int], seqlen: int, @@ -3318,9 +3345,14 @@ def _sampling_inputs( current_input_ids: The sampled input IDs. current_input_others: The sampled other input data. """ - current_input_ids = [input_ids[i] for i in indices] - - current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + if isinstance(input_ids, list): + current_input_ids = [input_ids[i] for i in indices] + current_input_ids = torch.cat(current_input_ids, dim=batch_dim) + elif isinstance(input_ids, dict): + current_input_ids = defaultdict(list) + for k in input_ids.keys(): + current_input_ids[k].extend([input_ids[k][i] for i in indices]) + current_input_ids[k] = torch.cat(current_input_ids[k], dim=batch_dim) current_input_others = {"positional_inputs": input_others["positional_inputs"]} for key in input_others.keys(): diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py index 5b40735ff..d42e13427 100644 --- a/auto_round/compressors/config.py +++ b/auto_round/compressors/config.py @@ -26,6 +26,7 @@ class ExtraConfig: _scheme_config = None _tuning_config = None _mllm_config = None + _diffusion_config = None def __init__( self, @@ -65,6 +66,10 @@ def __init__( quant_nontext_module: bool = False, extra_data_dir: str = None, template: str = None, + # diffusion + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, ): """Initialize @@ -103,6 +108,10 @@ def __init__( quant_nontext_module: Whether to quantize nontext module. extra_data_dir: The path of extra data such as images, audio and videos. template: The path or name of template used to specify process for different MLLMs. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise for image generation (default is None).
""" self.tuning_config = TuningExtraConfig( amp=amp, @@ -143,6 +152,11 @@ def __init__( extra_data_dir=extra_data_dir, template=template, ) + self.diffusion_config = DiffusionExtraConfig( + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, + ) @property def tuning_config(self): @@ -180,6 +194,20 @@ def mllm_config(self, config: MLLMExtraConfig): ), f"mllm_config should be MLLMExtraConfig, but got {config.__class__.__name__}" self._mllm_config = config + @property + def diffusion_config(self): + return self._diffusion_config + + @diffusion_config.setter + def diffusion_config(self, config: DiffusionExtraConfig): + if config is None: + self._diffusion_config = None + else: + assert isinstance( + config, DiffusionExtraConfig + ), f"diffusion_config should be DiffusionExtraConfig, but got {config.__class__.__name__}" + self._diffusion_config = config + def to_dict(self): output_dict = {} for config in self.__dict__.values(): @@ -263,3 +291,10 @@ class MLLMExtraConfig(BaseExtraConfig): quant_nontext_module: bool = False extra_data_dir: str = None template: str = None + + +@dataclass +class DiffusionExtraConfig(BaseExtraConfig): + guidance_scale: float = 7.5 + num_inference_steps: int = 50 + generator_seed: int = None diff --git a/auto_round/compressors/diffusion/__init__.py b/auto_round/compressors/diffusion/__init__.py new file mode 100644 index 000000000..b084e94f2 --- /dev/null +++ b/auto_round/compressors/diffusion/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.compressors.diffusion.compressor import DiffusionCompressor +from auto_round.compressors.diffusion.eval import diffusion_eval diff --git a/auto_round/compressors/diffusion/compressor.py b/auto_round/compressors/diffusion/compressor.py new file mode 100644 index 000000000..4c4b9a00e --- /dev/null +++ b/auto_round/compressors/diffusion/compressor.py @@ -0,0 +1,367 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import defaultdict +from copy import deepcopy +from typing import Union + +import torch +from tqdm import tqdm + +from auto_round.compressors.base import BaseCompressor +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.logger import logger +from auto_round.low_cpu_mem.utils import get_layers_before_block +from auto_round.schemes import QuantizationScheme +from auto_round.utils import ( + LazyImport, + block_forward, + clear_memory, + diffusion_load_model, + extract_block_names_to_str, + find_matching_blocks, + get_block_names, +) + +pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + +output_configs = { + "FluxTransformerBlock": ["encoder_hidden_states", "hidden_states"], + "FluxSingleTransformerBlock": ["encoder_hidden_states", "hidden_states"], +} + + +class DiffusionCompressor(BaseCompressor): + """Class for automatic rounding-based quantization with Diffusion models. + + Args: + model: The PyTorch model to be quantized. + tokenizer: An optional tokenizer for processing input data; it is not used for diffusion models. + guidance_scale (float): Controls how closely the image generation process follows the text prompt. + Higher values follow the prompt more closely (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise from which an image is generated (default is None). + scheme: (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. + layer_config (dict): Configuration for weight quantization (default is None). + dataset: The path or name of the calib dataset. + iters (int): Number of iterations (default is 200). + seqlen (int): Length of the sequence. + nsamples (int): Number of samples (default is 128). + batch_size (int): Batch size for training (default is 8). + gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). + device_map (str | dict | int | torch.device, optional): Device placement map. Defaults to 0. + enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer (default is False). + **kwargs: Additional keyword arguments.
+ """ + + bits: int | None + group_size: int | None + sym: bool | None + data_type: str | None + act_bits: int | None + act_group_size: int | None + act_sym: bool | None + act_data_type: str | None + act_dynamic: bool | None + super_bits: int | None + super_group_size: int | None + + def __init__( + self, + model: Union[object, str], + tokenizer=None, + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, + scheme: Union[str, dict, QuantizationScheme] = "W4A16", + layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None, + dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "coco2014", + iters: int = 200, + seqlen: int = 2048, + nsamples: int = 128, + batch_size: int = 8, + gradient_accumulate_steps: int = 1, + low_gpu_mem_usage: bool = False, + device_map: Union[str, torch.device, int, dict] = 0, + enable_torch_compile: bool = False, + seed: int = 42, + **kwargs, + ): + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed + + to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None) + if device_map is None: + device_map = 0 + self._set_device(device_map) + + if isinstance(model, str): + pipe, model = diffusion_load_model(model, device=self.device) + elif isinstance(model, pipeline_utils.DiffusionPipeline): + pipe = model + model = pipe.transformer + else: + raise ValueError(f"Only support str or DiffusionPipeline class for model, but get {type(model)}") + + self.model = model + self.pipe = pipe + + all_blocks = get_block_names(model) + self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names) + if to_quant_block_names is None: + to_quant_block_names = extract_block_names_to_str(self.quant_block_list) + + if iters > 0 and batch_size != 1: + logger.warning( + f"reset batch_size({batch_size}) to 1 and " + f"gradient_accumulate_steps({gradient_accumulate_steps}) " + f"to {batch_size * gradient_accumulate_steps}, " + f"because batch_size={batch_size} cannot be used for calibrating non-text modules." 
+ ) + gradient_accumulate_steps = batch_size * gradient_accumulate_steps + batch_size = 1 + + seqlen = 2048 if seqlen is None else seqlen + + if nsamples % batch_size != 0: + nsamples = (nsamples // batch_size + 1) * batch_size + logger.warning(f"'nsamples' is not divisible by 'batch_size', will be adjusted to {nsamples}") + + kwargs["diffusion"] = True + super(DiffusionCompressor, self).__init__( + model=model, + tokenizer=None, + scheme=scheme, + layer_config=layer_config, + dataset=dataset, + iters=iters, + seqlen=seqlen, + nsamples=nsamples, + batch_size=batch_size, + gradient_accumulate_steps=gradient_accumulate_steps, + low_gpu_mem_usage=low_gpu_mem_usage, + device_map=device_map, + enable_torch_compile=enable_torch_compile, + seed=seed, + to_quant_block_names=to_quant_block_names, + **kwargs, + ) + + def _update_inputs(self, inputs: dict, q_inputs: dict) -> tuple[dict, dict]: + # flux transformer model's blocks will update hidden_states and encoder_hidden_states + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + if q_inputs is not None: + q_inputs = {k: q_inputs.pop(k, None) for k in input_id_str} + return inputs, q_inputs + + def _split_inputs(self, inputs: dict) -> tuple[dict, dict]: + input_id_str = [key for key in inputs.keys() if "hidden_state" in key] + input_ids = {k: inputs.pop(k, None) for k in input_id_str} + input_others = inputs + return input_ids, input_others + + def _get_current_output(self, output: dict, indices: list[int]) -> torch.Tensor: + assert "hidden_states" in output + current_output = [output["hidden_states"][x] for x in indices] + current_output = torch.cat(current_output, dim=self.batch_dim) + return current_output + + def _get_current_q_output( + self, + block: torch.nn.Module, + input_ids: dict, + input_others: dict, + indices: list[int], + device: str, + ) -> torch.Tensor: + output_config = output_configs.get(block.__class__.__name__, []) + idx = None if "hidden_states" not in output_config else output_config.index("hidden_states") + current_input_ids, current_input_others = self._sampling_inputs( + input_ids, + input_others, + indices, + seqlen=self.seqlen, + batch_dim=self.batch_dim, + share_cache_keys=self.shared_cache_keys, + ) + if isinstance(current_input_ids, dict): + hidden_states = current_input_ids.pop("hidden_states") + current_input_others.update(current_input_ids) + current_input_ids = hidden_states + output_q = block_forward(block, current_input_ids, current_input_others, self.amp, self.amp_dtype, device, idx) + return output_q + + @torch.no_grad() + def _get_block_outputs( + self, + block: torch.nn.Module, + input_ids: torch.Tensor, + input_others: torch.Tensor, + bs: int, + device: Union[str, torch.device], + cache_device: Union[str, torch.device], + save_output: bool = True, + ): + """Compute the output of a given block of the model for a given input. + + Args: + block: The block of the model. + input_ids: The cached hidden-state inputs for the block. + input_others: A dictionary containing additional input data. + bs: The batch size for computing the output. + device: The device for computation. + cache_device: The device for storing the output. + save_output: Whether to store and return the block outputs (default is True). + + Returns: + A dict mapping output names to the cached outputs of the block.
+ """ + + output = defaultdict(list) + nsamples = len(input_ids) + output_config = output_configs.get(block.__class__.__name__, []) + + for i in range(0, nsamples, bs): + end_index = min(nsamples, i + bs) + indices = torch.arange(i, end_index).to(torch.long) + tmp_input_ids, tmp_input_others = self._sampling_inputs( + input_ids, input_others, indices, self.seqlen, self.batch_dim, share_cache_keys=self.shared_cache_keys + ) + if isinstance(tmp_input_ids, dict): + hidden_states = tmp_input_ids.pop("hidden_states") + tmp_input_others.update(tmp_input_ids) + tmp_input_ids = hidden_states + + tmp_output = block_forward(block, tmp_input_ids, tmp_input_others, self.amp, self.amp_dtype, device, None) + assert len(output_config) == len(tmp_output) + tmp_output = dict(zip(output_config, tmp_output)) + + if save_output: + for name, out in tmp_output.items(): + if self.batch_size == 1: + output[name].append(out.to(cache_device)) + else: + output[name].extend(list(torch.split(out.to(cache_device), 1, dim=self.batch_dim))) + if self.low_gpu_mem_usage: + clear_memory() + + return output + + def calib(self, nsamples, bs): + """Perform calibration for quantization. + + This method calibrates the model for quantization by processing a specified + number of samples from the calibration dataset. It ensures that the data is + properly formatted and feeds it to the model. If the number of samples processed + is less than the specified number, it logs a warning. If no samples are processed, + it logs an error and exits. + Args: + nsamples (int): The number of samples to use for calibration. + bs (int): The number of samples to use for calibration + """ + logger.warning( + "Diffusion model will catch nsamples * num_inference_steps inputs, " + "you can reduce nsamples or num_inference_steps if OOM or take too much time." + ) + if isinstance(self.dataset, str): + dataset = self.dataset.replace(" ", "") + self.dataloader, self.batch_size, self.gradient_accumulate_steps = get_diffusion_dataloader( + dataset=dataset, + bs=self.batch_size, + seed=self.seed, + nsamples=self.nsamples, + gradient_accumulate_steps=self.gradient_accumulate_steps, + ) + else: + self.dataloader = self.dataset + total_cnt = 0 + + if self.low_cpu_mem_usage: + embed_layers = get_layers_before_block(self.model) + for n, m in embed_layers: + m = m.to(self.device) + + total = nsamples if not hasattr(self.dataloader, "len") else min(nsamples, len(self.dataloader)) + if self.pipe.dtype != self.model.dtype: + self.pipe.to(self.model.dtype) + if self.pipe.device != self.model.device: + self.pipe.to(self.model.device) + with tqdm(range(1, total + 1), desc="cache block inputs") as pbar: + for ids, prompts in self.dataloader: + if isinstance(prompts, tuple): + prompts = list(prompts) + try: + self.pipe( + prompt=prompts, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator=( + None + if self.generator_seed is None + else torch.Generator(device=self.pipe.device).manual_seed(self.generator_seed) + ), + ) + except NotImplementedError: + pass + except Exception as error: + raise error + step = len(prompts) + total_cnt += step + pbar.update(step) + if total_cnt >= nsamples: + break + if total_cnt == 0: + logger.error( + f"no data has been cached, please provide more data with sequence length >={self.seqlen} in the " + f"dataset or decease the sequence length" + ) + exit(-1) + elif total_cnt < nsamples: + logger.warning( + f"Insufficient number of samples collected may affect the quantization. 
" + f"target samples count is {nsamples}, while valid samples count is {total_cnt}" + ) + if total_cnt < self.batch_size: + raise ValueError( + f"valid samples is less than batch_size({self.batch_size})," + " please adjust self.batch_size or seqlen." + ) + max_len = (total_cnt // self.batch_size) * self.batch_size + for k, v in self.inputs.items(): + for key in v: + if isinstance(v[key], list) and len(v[key]) == total_cnt: + self.inputs[k][key] = v[key][:max_len] + + # clean embed weight to save memory + if self.low_cpu_mem_usage: + for n, m in embed_layers: + m = m.to("meta") + # torch.cuda.empty_cache() + + def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs): + """Save the quantized model to the specified output directory in the specified format. + + Args: + output_dir (str, optional): The directory to save the quantized model. Defaults to None. + format (str, optional): The format in which to save the model. Defaults to "auto_round". + inplace (bool, optional): Whether to modify the model in place. Defaults to True. + **kwargs: Additional keyword arguments specific to the export format. + + Returns: + object: The compressed model object. + """ + compressed_model = super().save_quantized(output_dir=output_dir, format=format, inplace=inplace, **kwargs) + return compressed_model diff --git a/auto_round/compressors/diffusion/dataset.py b/auto_round/compressors/diffusion/dataset.py new file mode 100644 index 000000000..a716a8a58 --- /dev/null +++ b/auto_round/compressors/diffusion/dataset.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict + +import pandas as pd +import torch +from torch.utils.data import DataLoader, Dataset +from transformers import set_seed + +from auto_round.utils import logger + +DIFFUSION_DATASET: Dict[str, Dataset] = {} + + +COCO_URL = { + "coco2014": ( + "https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/" + "coco2014/captions/captions_source.tsv" + ) +} + + +def register_dataset(name_list): + """Class decorator to register a DATASET subclass to the registry. + + Decorator function used before a Pattern subclass. + + Args: + name: A string. Define the dataset type. + + Returns: + cls: The class of register. 
+ """ + + def register(dataset): + for name in name_list.replace(" ", "").split(","): + DIFFUSION_DATASET[name] = dataset + + return register + + +@register_dataset("local") +class Text2ImgDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + dataset_path, + nsamples=128, + ) -> None: + super().__init__() + self.captions = [] + self.caption_ids = [] + + logger.info(f"use dataset {dataset_path}, loading from disk...") + df = pd.read_csv(dataset_path, sep="\t") + + for index, row in df.iterrows(): + if nsamples > 0 and index + 1 > nsamples: + break + assert "id" in row and "caption" in row + caption_id = row["id"] + caption_text = row["caption"] + self.caption_ids.append(caption_id) + self.captions.append(caption_text) + + def __len__(self): + return len(self.captions) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + return self.caption_ids[i], self.captions[i] + + +def get_diffusion_dataloader( + dataset="coco2014", + bs=1, + seed=42, + nsamples=128, + gradient_accumulate_steps=1, +): + """Generate a DataLoader for calibration using specified parameters. + Args: + Dataset_name (str): The name or path of the dataset. + bs (int, optional): The batch size. Defaults to 1. + Returns: + DataLoader: The DataLoader for the calibrated datasets. + """ + if dataset in COCO_URL: + import requests + + logger.info(f"use dataset {dataset}, downloading ...") + text_data = requests.get(COCO_URL[dataset]).text + with open("captions_source.tsv", "w") as f: + f.write(text_data) + dataset = "captions_source.tsv" + + if isinstance(dataset, str) and os.path.exists(dataset): + dataset = DIFFUSION_DATASET["local"](dataset, nsamples) + else: + raise ValueError("Only support coco2014 dataset or loading local tsv file now.") + set_seed(seed) + dataloader_params = {"batch_size": bs, "shuffle": True} + + return DataLoader(dataset, **dataloader_params), bs, gradient_accumulate_steps diff --git a/auto_round/compressors/diffusion/eval.py b/auto_round/compressors/diffusion/eval.py new file mode 100644 index 000000000..5baed978b --- /dev/null +++ b/auto_round/compressors/diffusion/eval.py @@ -0,0 +1,112 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import os + +import numpy as np +import torch +from PIL import Image +from tqdm import tqdm + +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from auto_round.utils import LazyImport + +metrics = LazyImport("torchmetrics.multimodal") +reward = LazyImport("ImageReward") + + +def compute_clip(prompts, images, device: str = "cuda"): + clip_model = metrics.CLIPScore(model_name_or_path="openai/clip-vit-large-patch14").to(device) + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing CLIP score"): + image_data = Image.open(img_path).convert("RGB") + image_tensor = torch.from_numpy(np.array(image_data)).permute(2, 0, 1) + clip_model.update(image_tensor.to(torch.float32).to(device).unsqueeze(0), prompt) + result = clip_model.compute().mean().item() + return {"CLIP": result} + + +def compute_clip_iqa(prompts, images, device: str = "cuda"): + clip_model = metrics.CLIPImageQualityAssessment(model_name_or_path="openai/clip-vit-large-patch14").to(device) + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing CLIP-IQA score"): + image_data = Image.open(img_path).convert("RGB") + image_tensor = torch.from_numpy(np.array(image_data)).permute(2, 0, 1) + clip_model.update(image_tensor.to(torch.float32).to(device).unsqueeze(0)) + result = clip_model.compute().mean().item() + return {"CLIP-IQA": result} + + +def compute_image_reward_metrics(prompts, images, device="cuda"): + image_reward_model = reward.load("ImageReward-v1.0", device=device) + scores = [] + for prompt, img_path in tqdm(zip(prompts, images), desc="Computing image reward metrics"): + score = image_reward_model.score(prompt, img_path) + scores.append(score) + return {"ImageReward": np.mean(scores)} + + +metric_map = { + "clip": compute_clip, + "clip-iqa": compute_clip_iqa, + "imagereward": compute_image_reward_metrics, +} + + +def diffusion_eval( + pipe, + prompt_file, + metrics, + image_save_dir, + batch_size, + gen_kwargs, +): + if ( + not importlib.util.find_spec("clip") + or not importlib.util.find_spec("ImageReward") + or not importlib.util.find_spec("torchmetrics") + ): + raise ImportError( + "Please make sure clip, image-reward and torchmetrics are installed for diffusion model evaluation." 
) + dataloader, _, _ = get_diffusion_dataloader(prompt_file, nsamples=-1, bs=batch_size) + prompt_list = [] + image_list = [] + for image_ids, prompts in dataloader: + prompt_list.extend(prompts) + + new_ids = [] + new_prompts = [] + for idx, image_id in enumerate(image_ids): + image_id = image_id.item() + image_list.append(os.path.join(image_save_dir, str(image_id) + ".png")) + + if os.path.exists(os.path.join(image_save_dir, str(image_id) + ".png")): + continue + new_ids.append(image_id) + new_prompts.append(prompts[idx]) + + if len(new_prompts) == 0: + continue + + output = pipe(prompt=new_prompts, **gen_kwargs) + for idx, image_id in enumerate(new_ids): + output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) + + result = {} + for metric in metrics: + result.update(metric_map[metric](prompt_list, image_list, pipe.device)) + + import tabulate + + print(tabulate.tabulate(result.items(), tablefmt="grid")) diff --git a/auto_round/utils.py b/auto_round/utils.py index 9af09758e..d39520f42 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -406,13 +406,21 @@ def get_block_names(model, quant_vision=False): """ from auto_round.special_model_handler import SPECIAL_MULTIMODAL_BLOCK - def _get_llm_block_names(model): - block_names = [] + def _search_block(name, module): + if hasattr(type(module), "__name__") and "ModuleList" in type(module).__name__: + return [(name, module)] target_modules = [] - for n, m in model.named_modules(): + for n, m in module.named_children(): if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: - target_modules.append((n, m)) - break ## only find the first modulelist, may be not robust + target_modules.append((".".join(filter(None, (name, n))), m)) + else: + target_modules.extend(_search_block(".".join(filter(None, (name, n))), m)) + return target_modules + + def _get_llm_block_names(model): + block_names = [] + target_modules = _search_block("", model) + for i, target_m in enumerate(target_modules): block_names.append([]) for n, m in target_m[1].named_children(): @@ -459,7 +467,15 @@ def collect_best_params(block): return params -def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.float16, device=torch.device("cpu")): +def block_forward( + block, + input_ids, + input_others, + amp=False, + amp_dtype=torch.float16, + device=torch.device("cpu"), + output_return_id=0, +): """Performs a forward pass through a block with the given inputs. Args: @@ -469,6 +485,7 @@ def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.flo amp: A boolean indicating whether to use automatic mixed precision. amp_dtype: The data type for automatic mixed precision. device: The target device. + output_return_id: if the output contains more than one tensor, return the tensor at this index. Returns: output: The output of the forward pass.
@@ -485,8 +502,8 @@ def block_forward(block, input_ids, input_others, amp=False, amp_dtype=torch.flo output = block(input_ids, *input_tuple, **input_others) else: output = block(input_ids, *input_tuple, **input_others) - if isinstance(output, list) or isinstance(output, tuple): - output = output[0] + if isinstance(output_return_id, int) and (isinstance(output, list) or isinstance(output, tuple)): + output = output[output_return_id] return output @@ -1614,6 +1631,30 @@ def mllm_load_model( return model, processor, tokenizer, image_processor +def diffusion_load_model( + pretrained_model_name_or_path, + device="cpu", + torch_dtype="auto", + use_auto_mapping=False, + trust_remote_code=True, + model_dtype=None, + **kwargs, +): + device_str, use_auto_mapping = get_device_and_parallelism(device) + torch_dtype = "auto" + if device_str is not None and "hpu" in device_str: + torch_dtype = torch.bfloat16 + + pipelines = LazyImport("diffusers.pipelines") + + pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained( + pretrained_model_name_or_path, torch_dtype=torch_dtype + ) + pipe = _to_model_dtype(pipe, model_dtype) + model = pipe.transformer + return pipe, model.to(device) + + def is_pure_text_model(model): """verify on: phi-3.5, Mistral-Small-3.1, gemma-3, qwen2-vl,""" if hasattr(model, "config") and hasattr(model.config, "vision_config"): @@ -2743,11 +2784,12 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True if os.path.exists(os.path.join(model_path, "processor_config.json")): return True - with open(os.path.join(model_path, "config.json")) as f: - config = json.load(f) - for key in config.keys(): - if any([k in key for k in MM_KEYS]): - return True + if os.path.exists(os.path.join(model_path, "config.json")): + with open(os.path.join(model_path, "config.json")) as f: + config = json.load(f) + for key in config.keys(): + if any([k in key for k in MM_KEYS]): + return True if isinstance(model_or_path, torch.nn.Module): for name, module in model_or_path.named_modules(): @@ -2755,3 +2797,24 @@ def is_mllm_model(model_or_path: Union[str, torch.nn.Module]): return True return False + + +def is_diffusion_model(model_or_path: Union[str, object]): + if isinstance(model_or_path, str): + index_file = None + if not os.path.isdir(model_or_path): + try: + from huggingface_hub import hf_hub_download + + index_file = hf_hub_download(model_or_path, "model_index.json") + except: + index_file = None + + elif os.path.exists(os.path.join(model_or_path, "model_index.json")): + index_file = os.path.join(model_or_path, "model_index.json") + return index_file is not None + elif not isinstance(model_or_path, torch.nn.Module): + pipeline_utils = LazyImport("diffusers.pipelines.pipeline_utils") + return isinstance(model_or_path, pipeline_utils.DiffusionPipeline) + else: + return False diff --git a/test/test_cuda/requirements_diffusion.txt b/test/test_cuda/requirements_diffusion.txt new file mode 100644 index 000000000..55908f6ac --- /dev/null +++ b/test/test_cuda/requirements_diffusion.txt @@ -0,0 +1,3 @@ +diffusers +image-reward +clip \ No newline at end of file diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py new file mode 100644 index 000000000..c5fa63471 --- /dev/null +++ b/test/test_cuda/test_diffusion.py @@ -0,0 +1,86 @@ +import copy +import os +import re +import shutil +import sys +import unittest + +import requests + +sys.path.insert(0, "../..") + +from PIL import Image + +from auto_round import AutoRoundConfig +from auto_round.testing_utils 
import require_gptqmodel, require_optimum, require_vlm_env + + +class TestAutoRound(unittest.TestCase): + @classmethod + def setUpClass(self): + self.save_dir = "./saved" + self.model_name = "/dataset/FLUX.1-dev" + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.save_dir, ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + @require_optimum + def test_diffusion_tune(self): + + from diffusers import AutoPipelineForText2Image + + from auto_round import AutoRoundDiffusion + + ## load the model + pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) + model = pipe.transformer + + layer_config = {} + # skip some layers since it takes much time + for n, m in model.named_modules(): + if m.__class__.__name__ != "Linear": + continue + match = re.search(r"blocks\.(\d+)", n) + if match and int(match.group(1)) > 0: + layer_config[n] = {"bits": 16, "act_bits": 16, "data_type": "float", "act_data_type": "float"} + + ## quantize the model + autoround = AutoRoundDiffusion( + pipe, + tokenizer=None, + scheme="MXFP4", + iters=1, + nsamples=1, + num_inference_steps=2, + layer_config=layer_config, + dataset="/dataset/captions_source.tsv", + ) + # skip model saving since it takes much time + autoround.quantize() + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_block_name(self): + from diffusers import AutoPipelineForText2Image + + from auto_round.utils import get_block_names + + pipe = AutoPipelineForText2Image.from_pretrained(self.model_name) + model = pipe.transformer + + block_name = get_block_names(model) + self.assertTrue(len(block_name) == 2) + self.assertTrue(any(["context_embedder" not in n for n in block_name])) + + def test_diffusion_model_checker(self): + from auto_round.utils import is_diffusion_model + + self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) + self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) + self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) + self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + + +if __name__ == "__main__": + unittest.main()
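
Reviewer note: for reference, a minimal end-to-end sketch of the Python API this PR adds. The model id, scheme, iteration counts, and step counts below are illustrative placeholders (not values mandated by the PR); wiring the quantized transformer back into the pipeline and moving it to GPU mirrors what the evaluation path in `__main__.py` does.

```python
# Usage sketch (assumptions: a FLUX checkpoint is reachable at the given id and a CUDA device exists).
from diffusers import AutoPipelineForText2Image

from auto_round import AutoRoundDiffusion

# AutoRoundDiffusion accepts either a model path string or a DiffusionPipeline; it quantizes pipe.transformer.
pipe = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev")

autoround = AutoRoundDiffusion(
    pipe,
    tokenizer=None,           # tokenizer is ignored for diffusion models
    scheme="W4A16",
    dataset="coco2014",       # downloads the MLCommons captions tsv for calibration
    nsamples=8,               # small values for a quick run; defaults are 128 samples / 200 iters
    iters=10,
    num_inference_steps=4,    # fewer denoising steps -> fewer cached block inputs
    generator_seed=42,
)
model, layer_config = autoround.quantize()

# Wire the quantized transformer back into the pipeline before generating,
# as the CLI evaluation branch does, then generate a sample image.
pipe.transformer = model
pipe = pipe.to("cuda")
image = pipe(prompt="a photo of an astronaut riding a horse", num_inference_steps=4).images[0]
image.save("img.png")
```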
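A similar sketch of the standalone evaluation helper added under `auto_round/compressors/diffusion/eval.py`. It assumes `clip`, `image-reward`, and `torchmetrics` are installed (see `requirements_diffusion.txt`), a local captions tsv with `id` and `caption` columns, and a pipeline already loaded on GPU; all paths and the model id are placeholders.

```python
# Evaluation sketch: generates one image per prompt into image_save_dir and prints metric scores as a table.
import os

import torch
from diffusers import AutoPipelineForText2Image

from auto_round.compressors.diffusion import diffusion_eval

pipe = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev").to("cuda")

gen_kwargs = {
    "guidance_scale": 7.5,
    "num_inference_steps": 20,
    "output_type": "pil",
    "generator": torch.Generator(device=pipe.device).manual_seed(42),
}

# diffusion_eval saves images into this directory, so create it first (the CLI path does the same).
os.makedirs("./tmp_image_save", exist_ok=True)
diffusion_eval(pipe, "captions_source.tsv", ["clip", "clip-iqa"], "./tmp_image_save", 1, gen_kwargs)
```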