From bd404779210b57a5a914ec73b569119f10abad0a Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 17 May 2025 16:23:49 +0930
Subject: [PATCH 01/33] fix import error

---
 lite_llama/kernels/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lite_llama/kernels/__init__.py b/lite_llama/kernels/__init__.py
index 7effa2a..479f198 100644
--- a/lite_llama/kernels/__init__.py
+++ b/lite_llama/kernels/__init__.py
@@ -8,7 +8,7 @@
 
 from .skip_rmsnorm import skip_rmsnorm
 from .swiglu import swiglu_forward
-from .rope_emb import (rope_forward, rope_emb_forward)
+from .rope_emb import (rope_emb_forward)
 from .softmax_split import softmax_split
 from .update_kv_buffer import update_kv_buffer
 from .update_kv_index import update_kv_index

From 0f84e95e68ceb34756601cf581f4ddc1dbdbcba6 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 17 May 2025 16:24:11 +0930
Subject: [PATCH 02/33] add sentencepiece  into requirement

---
 requirement.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirement.txt b/requirement.txt
index fc036a1..3346643 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -13,4 +13,5 @@ rich==13.7.1
 termvisage==0.2.0
 accelerate==1.6.0
 sentence-transformers==4.0.2
-jsonargparse==4.38.0
\ No newline at end of file
+jsonargparse==4.38.0
+sentencepiece==0.2.0
\ No newline at end of file

From 5008cd3aadbb60947e4667146d98bf3b79a1fa9c Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 17 May 2025 16:24:37 +0930
Subject: [PATCH 03/33] add logs

---
 lite_llama/executor/mem_manager.py        | 14 +++++++-------
 lite_llama/executor/req_tokens_manager.py |  9 ++++-----
 lite_llama/executor/weight_convert.py     |  2 +-
 lite_llama/generate_stream.py             |  5 +----
 lite_llama/llava_generate_stream.py       |  5 +----
 lite_llama/models/RotaryEmbedding.py      |  5 ++---
 tests/test_available_blocks.py            |  7 ++++---
 7 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/lite_llama/executor/mem_manager.py b/lite_llama/executor/mem_manager.py
index 280ef35..b83cb52 100644
--- a/lite_llama/executor/mem_manager.py
+++ b/lite_llama/executor/mem_manager.py
@@ -1,8 +1,8 @@
 import torch
-import logging, gc
+import gc
 from typing import List
 
-logger = logging.getLogger(__name__)
+from utils.logger import log
 
 def get_dtype_size(dtype: torch.dtype) -> int:
     """Get the size of the data type in bytes."""
@@ -109,7 +109,7 @@ def compute_num_available_blocks(self, model_path=None, dummy_input = None, mode
         # 确保缓存块数量不为负数
         num_gpu_blocks = max(num_gpu_blocks, 0)
 
-        logger.info(
+        log.info(
                 " Memory profiling results: total_gpu_memory = %.2f GB \n"
                 "    initial_memory_usage = %.2f GB peak_torch_memory = %.2f GB \n"
                 "    memory_usage_post_profile = %.2f GB \n"
@@ -166,12 +166,12 @@ def init_kv_buffers(self,
         self.gpu_kv_buffer = [
             torch.empty((max_num_tokens, 2 * num_kv_heads, head_dim), dtype=dtype, device=device) for _ in range(num_layers)
         ]
-        logger.debug(f"gpu_kv_buffer per layer shape: {self.gpu_kv_buffer[0].shape}")
+        log.debug(f"gpu_kv_buffer per layer shape: {self.gpu_kv_buffer[0].shape}")
     
     @torch.no_grad()
     def alloc_kvcache(self, need_size):
         if need_size > self.can_use_mem_size:
-            logger.warning(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
+            log.warning(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
             return None
         
         can_use_pos_index = torch.nonzero(self.kv_mem_use_state == 0).view(-1)
@@ -183,7 +183,7 @@ def alloc_kvcache(self, need_size):
     @torch.no_grad()
     def alloc_contiguous_kvcache(self, need_size):
         if need_size > self.can_use_mem_size:
-            logger.warning(f"warn no enough contiguous cache need_size {need_size} left_size {self.can_use_mem_size}")
+            log.warning(f"warn no enough contiguous cache need_size {need_size} left_size {self.can_use_mem_size}")
             return None
 
         # 获取未使用的内存块索引
@@ -259,7 +259,7 @@ def free(self, free_index):
         free_index = free_index.long()
         self.release_ref(free_index)
         if self.can_use_mem_size == len(self.mem_state):
-            logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
+            log.debug(f"freed all gpu mem size {self.can_use_mem_size}")
         return
     
     # 释放所有内存
diff --git a/lite_llama/executor/req_tokens_manager.py b/lite_llama/executor/req_tokens_manager.py
index 21993fd..afa5478 100644
--- a/lite_llama/executor/req_tokens_manager.py
+++ b/lite_llama/executor/req_tokens_manager.py
@@ -1,7 +1,6 @@
 import torch
-import logging
+from utils.logger import log
 
-logger = logging.getLogger(__name__)
 
 class ReqTokensManager:
     """管理请求序列的 kv 内存 tokens 的类。
@@ -21,7 +20,7 @@ def __init__(self, max_request_num, max_seq_len, mem_manager=None, device="cuda"
     # 分配批次请求需要的内存空间
     def alloc_req(self, request_num):
         if request_num > self.can_use_req_size:
-            logger.error(f'Insufficient requested capacity, remaining {self.can_use_req_size}')
+            log.error(f'Insufficient requested capacity, remaining {self.can_use_req_size}')
             return None
 
         logical_select_index = torch.nonzero(self.req_state==0).reshape(-1)[:request_num]
@@ -34,13 +33,13 @@ def free_reqs(self, free_req_index, free_token_index):
         self.can_use_req_size += len(free_req_index)
         self.req_state[free_token_index] = 0 # 对应批次请求的索引重新置为 0
         if self.can_use_req_size == len(self.req_state):
-            logger.debug(f"freed all request size {self.can_use_req_size}")
+            log.debug(f"freed all request size {self.can_use_req_size}")
         # self.mem_manager.free(free_token_index)
 
     # 仅释放指定请求的索引
     def free_req(self, free_req_index):
         if free_req_index < 0 or free_req_index >= self.req_state.size(0):
-            logger.error(f"Invalid free_req_index: {free_req_index}")
+            log.error(f"Invalid free_req_index: {free_req_index}")
             return
         self.can_use_req_size += 1
         self.req_state[free_req_index] = 0
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
index ff13959..06b47c1 100644
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -17,7 +17,7 @@ def build_new_weight_dir(checkpoints_dir:str, new_sd):
     json_files = glob.glob(os.path.join(checkpoints_dir, "*.json"))
     for file_path in json_files:
         shutil.copy(file_path, my_weight_dir) # 复制 hf 权重目录的所有 json 文件到新的目录
-        print(f"已复制: {file_path} -> {my_weight_dir}")
+        print(f"Copy: {file_path} -> {my_weight_dir}")
 
 def convert_qwen2_hf_to_litellama(
     checkpoints_dir: str, 
diff --git a/lite_llama/generate_stream.py b/lite_llama/generate_stream.py
index a501870..aef5421 100644
--- a/lite_llama/generate_stream.py
+++ b/lite_llama/generate_stream.py
@@ -1,5 +1,5 @@
 from typing import Optional
-import torch, logging
+import torch
 from typing import List, Optional, Tuple, TypedDict, Generator
 from .executor.model_executor import ModelExecutor
 from .utils.file_interface import get_model_name_from_path
@@ -7,9 +7,6 @@
 
 from transformers import AutoTokenizer
 
-# 设置日志
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 
 
 class CompletionPrediction(TypedDict, total=False):
diff --git a/lite_llama/llava_generate_stream.py b/lite_llama/llava_generate_stream.py
index 6560641..162a85f 100644
--- a/lite_llama/llava_generate_stream.py
+++ b/lite_llama/llava_generate_stream.py
@@ -1,5 +1,5 @@
 from typing import Optional
-import torch, logging, re
+import torch, re
 from PIL import Image
 
 from typing import List, Optional, Tuple, TypedDict, Generator, Union
@@ -9,9 +9,6 @@
 
 from transformers import AutoTokenizer, AutoProcessor
 
-# 设置日志
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 
 class CompletionPrediction(TypedDict, total=False):
     generation: str
diff --git a/lite_llama/models/RotaryEmbedding.py b/lite_llama/models/RotaryEmbedding.py
index 6acaa94..111ffe8 100644
--- a/lite_llama/models/RotaryEmbedding.py
+++ b/lite_llama/models/RotaryEmbedding.py
@@ -1,10 +1,9 @@
 import torch, math
 import torch.nn as nn
 from typing import Optional, Tuple
-import logging
 from .model_config import LlamaConfig, Qwen2Config
+from utils.logger import log
 
-logger = logging.getLogger(__name__)
 
 def _compute_default_rope_parameters(
     config = None,
@@ -235,7 +234,7 @@ def __init__(
         # TODO (joao): remove the `if` below, only used for BC
         self.rope_kwargs = {}
         if config is None:
-            logger.warning_once(
+            log.warning(
                 "`Qwen2RotaryEmbedding` can now be fully parameterized by passing the model config through the "
                 "`config` argument. All other arguments will be removed in v4.46"
             )
diff --git a/tests/test_available_blocks.py b/tests/test_available_blocks.py
index a25735b..2c77049 100644
--- a/tests/test_available_blocks.py
+++ b/tests/test_available_blocks.py
@@ -1,13 +1,14 @@
 import torch, gc
 from typing import List, Tuple
 from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
-import logging, json,os,sys
+import json,os,sys
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from lite_llama.models.model_config import LlamaConfig
 
-logger = logging.getLogger(__name__)
+from utils.logger import log
+
 
 def load_config_from_json(json_file_path: str, device: str="cuda") -> LlamaConfig:
     with open(json_file_path, "r") as f:
@@ -78,7 +79,7 @@ def determine_num_available_blocks(model_config, gpu_memory_utilization = 0.9) -
     # 确保缓存块数量不为负数
     num_gpu_blocks = max(num_gpu_blocks, 0)
 
-    logger.info(
+    log.info(
             "Memory profiling results: total_gpu_memory=%.2fGiB \n"
             " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB \n"
             " memory_usage_post_profile=%.2fGib \n"

From 50de54d8f8f5cf177158b9ba142fb8ccf371338f Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 17 May 2025 16:24:55 +0930
Subject: [PATCH 04/33] merge llava into generate

---
 generate.py     | 141 ++++++++++++++++++++++++++++++++++++++++++------
 utils/common.py |   2 +-
 2 files changed, 127 insertions(+), 16 deletions(-)

diff --git a/generate.py b/generate.py
index 8a38d8b..764e6f9 100644
--- a/generate.py
+++ b/generate.py
@@ -1,11 +1,14 @@
 import torch
 from typing import Optional
-from lite_llama.utils.prompt_templates import get_prompter
-from lite_llama.generate_stream import GenerateStreamText  # 导入 GenerateText 类
+from lite_llama.utils.prompt_templates import get_prompter, get_image_token
+from lite_llama.generate_stream import GenerateStreamText  # import GenerateText
+from lite_llama.utils.image_process import vis_images
+
 import warnings
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 from utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
+from lite_llama.llava_generate_stream import LlavaGeneratorStream
 
 import sys, os, time
 from pathlib import Path
@@ -13,14 +16,16 @@
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 import psutil
+from utils.logger import log
+import argparse
+from argparse import RawTextHelpFormatter
 
 process = psutil.Process(os.getpid())
 
-
-def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
+def report_resource_usage(ram_before, vram_before) -> None:
     end_time = time.time()
     ram_after = process.memory_info().rss
-    vram_after = get_gpu_memory(gpu_type)
+    vram_after = get_gpu_memory(detect_device())
 
     ram_used = (ram_after - ram_before) / (1024 ** 3)  # Bytes to GB
 
@@ -30,11 +35,11 @@ def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
     else:
         vram_text = "Unavailable"
 
-    print(f"CPU RAM Used: {ram_used:.2f} GB")
-    print(f"GPU VRAM Used: {vram_text}")
+    log.info(f"CPU RAM Used: {ram_used:.2f} GB")
+    log.info(f"GPU VRAM Used: {vram_text}")
 
 
-def main(
+def generate_llama(
         prompt: str = "Hello, my name is",
         *,
         temperature: float = 0.6,
@@ -52,7 +57,6 @@ def main(
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
-
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -61,11 +65,10 @@ def main(
     # Start resource tracking
     ram_before = process.memory_info().rss
 
-    gpu_type = detect_device()
     vram_before = get_gpu_memory(gpu_type)
-    # Init LLM generator
     start = time.perf_counter()
 
+    # Init LLM generator
     generator = GenerateStreamText(
         checkpoints_dir=checkpoint_path,
         tokenizer_path=checkpoint_path,
@@ -77,6 +80,7 @@ def main(
         device=device,
     )
 
+
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
     # Call the generation function and start the stream generation
@@ -98,13 +102,120 @@ def main(
         text_msg +=new_text
 
     print("\n\n==================================\n")
-    print(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+
+    # Report resource usage
+    report_resource_usage(ram_before, vram_before)
+
+
+def generate_llava(
+        prompt: str = "Hello, my name is",
+        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+        figure_path: Path = Path("figures/lit-llama/"),
+        gpu_type: str = "nvidia",
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        max_seq_len: int = 2048,
+        max_gpu_num_blocks=None,
+        max_gen_len: Optional[int] = 512,
+        load_model: bool = True,
+        compiled_model: bool = False,
+        triton_weight: bool = True
+):
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    if max_seq_len <= 1024:
+        short_prompt = True
+    else:
+        short_prompt = False
+
+    if not os.path.isfile(figure_path):
+        log.error(f"'{figure_path}' Not a valid file path！")
+    else:
+        image_input = str(figure_path).strip()
+    image_items = [image_input]  # Prepare the image_items list
+    image_num = len(image_items)  # Calculate the number of input images
+    vis_images(image_items)  # Displaying images in the terminal
+    assert checkpoint_path.is_dir(), checkpoint_path
+    checkpoint_path = str(checkpoint_path)
+    model_prompter = get_prompter("llama", checkpoint_path, short_prompt)
+
+    # Start resource tracking
+    ram_before = process.memory_info().rss
+
+    vram_before = get_gpu_memory(gpu_type)
+    start = time.perf_counter()
+
+    # Initializing the Multimodal Model Text Generator
+    try:
+        generator = LlavaGeneratorStream(
+            checkpoints_dir=checkpoint_path,
+            tokenizer_path=checkpoint_path,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            load_model=load_model,
+            compiled_model=compiled_model,
+            triton_weight=triton_weight,
+            device=device,
+        )
+    except Exception as e:
+        log.error(f"Model loading failure: {e}")
+        sys.exit(1)
+
+    image_token = get_image_token()
+    model_prompter.insert_prompt(image_token * image_num + prompt)
+    prompts = [model_prompter.model_input]
+
+    try:
+        stream = generator.text_completion_stream(
+            prompts,
+            image_items,
+            temperature=temperature,
+            top_p=top_p,
+            max_gen_len=max_gen_len,
+        )
+    except Exception as e:
+        log.error(f"Text Generation Failure: {e}")
+    end = time.perf_counter()
 
+    completion = ''  # Initialization generates results
+    text_msg = ""
+
+    for batch_completions in stream:
+        next_text = batch_completions[0]['generation'][len(completion):]
+        completion = batch_completions[0]['generation']
+        print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
+        text_msg += next_text
+
+    print("\n\n==================================\n")
+    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
     # Report resource usage
-    report_resource_usage(ram_before, vram_before, gpu_type)
+    report_resource_usage(ram_before, vram_before)
 
 if __name__ == "__main__":
-    from jsonargparse import CLI
 
     torch.set_float32_matmul_precision("high")
-    CLI(main)
\ No newline at end of file
+
+
+    PARSER = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
+    PARSER.add_argument('-m', "--model_path", type=str,
+                        default='checkpoints/lit-llama/7B/',
+                        help='Path of the Model')
+    PARSER.add_argument('-q', "--quant_method", type=str,
+                        default='',
+                        help="Quantization method")
+
+    PARSER.add_argument('-p', "--prompt", type=str,
+                        default='Hello, my name is',
+                        help="String of prompt")
+    PARSER.add_argument('-f', "--figure_path", type=str,
+                        default=None,
+                        help="Path of the Figure")
+
+
+    gpu_type = detect_device()
+    args = PARSER.parse_args()
+    model_path = os.path.abspath(args.model_path)
+    if args.figure_path:
+        generate_llava(prompt=args.prompt, checkpoint_path=Path(model_path), figure_path=Path(args.figure_path), gpu_type=gpu_type)
+    else:
+        generate_llama(prompt=args.prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type)
diff --git a/utils/common.py b/utils/common.py
index 791b1fe..488567b 100644
--- a/utils/common.py
+++ b/utils/common.py
@@ -32,7 +32,7 @@ def getProjectPath():
     return os.path.abspath(os.path.join(script_path, ".."))
 
 
-def get_gpu_memory(gpu_type="amd", device_id="0"):
+def get_gpu_memory(gpu_type, device_id="0"):
     try:
         if gpu_type == "amd":
             result = subprocess.run(

From 522171aee90a72be1c985ead46bf9b523f1102c2 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 17 May 2025 16:53:40 +0930
Subject: [PATCH 05/33] fix readme and add interface for weight converter

---
 README.md               | 16 ++++----
 apply_weight_convert.py | 91 +++++++++++++++++++++++------------------
 2 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index cd412d5..680b718 100644
--- a/README.md
+++ b/README.md
@@ -74,8 +74,8 @@ conda activate lite_llama
 git clone https://github.com/harleyszhang/lite_llama.git
 cd lite_llama/
 pip install -r requirement.txt
-python test_weight_convert.py # model weight transformation
-python generate.py --prompt "What is large language model" --checkpoint_path /path/to/model/Llama-3.2-1B-Instruct/ # Run on the basis that the model has been downloaded and placed in the specified directory
+python apply_weight_convert.py -m /path/to/model/Llama-3.2-1B-Instruct/# model weight transformation
+python generate.py -p "What is large language model" -m /path/to/model/Llama-3.2-1B-Instruct/ -f /path/to/figure# Run on the basis that the model has been downloaded and placed in the specified directory
 ```
 
 ROCm version 5.7 and above is recommended.
@@ -92,21 +92,21 @@ conda activate lite_llama
 git clone https://github.com/harleyszhang/lite_llama.git
 cd lite_llama/
 pip install -r requirement.txt
-python test_weight_convert.py # model weight transformation
-python generate.py --prompt "What is large language model" --checkpoint_path /path/to/model/Llama-3.2-1B-Instruct/ # Run on the basis that the model has been downloaded and placed in the specified directory
+python apply_weight_convert.py -m /path/to/model/Llama-3.2-1B-Instruct/# model weight transformation
+python generate.py -p "What is large language model" -m /path/to/model/Llama-3.2-1B-Instruct/ -f /path/to/figure# Run on the basis that the model has been downloaded and placed in the specified directory
 ```
 
 
 ## Evaluation
 
-After `cli.py` runs successfully, the terminal displays the interface as shown below, and you can enter your question in the terminal.
-
-![cli](./images/cli_stream.png)
-
 After `generate.py` runs successfully, the terminal displays the interface as shown below, and you can enter your question in the terminal.
 
 ![generate](./images/generate_stream.png)
 
+After `cli.py` runs successfully, the terminal displays the interface as shown below, and you can enter your question in the terminal.
+
+![cli](./images/cli_stream.png)
+
 After `cli_llava.py` runs successfully, the terminal displays the interface as shown below, enter your picture and prompt word in the terminal, and then enter.
 
 ![llava model streaming output](./images/llava_output2.gif)
diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index 308696a..49399f1 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -4,46 +4,59 @@
 from lite_llama.executor.weight_convert import convert_llavallama_hf_to_litellama, \
                                         convert_llama_hf_to_litellama, convert_qwen2_hf_to_litellama
 
-checkpoints_dir = "/gemini/code/my_weight/Llama-3.2-1B-Instruct-hf"
-
-if "llava" in checkpoints_dir.lower():
-    model = LlavaForConditionalGeneration.from_pretrained( # LlavaForConditionalGeneration
-        checkpoints_dir, 
-        torch_dtype=torch.float16, 
-        low_cpu_mem_usage=True,
-    ).to("cuda")
-else:
-    model = AutoModelForCausalLM.from_pretrained( # LlavaForConditionalGeneration
-        checkpoints_dir, 
-        torch_dtype=torch.float16, 
-        low_cpu_mem_usage=True,
-    ).to("cuda")
-
-hf_sd = model.state_dict()
-
-# for name, parameters in hf_sd.items():
-#     print(name, parameters.shape)
-
-if "qwen2" in checkpoints_dir.lower():
-    llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-    num_layers = llm_config.num_hidden_layers
-    print("num_layers: ", num_layers)
-    convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-    
-elif "llama" in checkpoints_dir.lower():
-    llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-    num_layers = llm_config.num_hidden_layers
-    print("num_layers: ", num_layers)
-    convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-elif "llava" in checkpoints_dir.lower():
-    llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-    num_layers = llava_config.text_config.num_hidden_layers
-    print("num_layers: ", num_layers)
-    convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-else:
-    print("Error! Unsupported model type!")
+import argparse
+from argparse import RawTextHelpFormatter
 
+def main(checkpoints_dir: str):
+    if "llava" in checkpoints_dir.lower():
+        model = LlavaForConditionalGeneration.from_pretrained( # LlavaForConditionalGeneration
+            checkpoints_dir,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        ).to("cuda")
+    else:
+        model = AutoModelForCausalLM.from_pretrained( # LlavaForConditionalGeneration
+            checkpoints_dir,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        ).to("cuda")
+
+    hf_sd = model.state_dict()
+
+    # for name, parameters in hf_sd.items():
+    #     print(name, parameters.shape)
+
+    if "qwen2" in checkpoints_dir.lower():
+        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+        num_layers = llm_config.num_hidden_layers
+        print("num_layers: ", num_layers)
+        convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+    elif "llama" in checkpoints_dir.lower():
+        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+        num_layers = llm_config.num_hidden_layers
+        print("num_layers: ", num_layers)
+        convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+    elif "llava" in checkpoints_dir.lower():
+        llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
+        num_layers = llava_config.text_config.num_hidden_layers
+        print("num_layers: ", num_layers)
+        convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+    else:
+        print("Error! Unsupported model type!")
+
+
+if __name__ == '__main__':
+    PARSER = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
+    PARSER.add_argument('-m', "--model_path", type=str,
+                        default='checkpoints/lit-llama/7B/',
+                        help='Path of the Model')
+    args = PARSER.parse_args()
+
+    model_path = os.path.abspath(args.model_path)
+
+    main(str(model_path))
 # from transformers import LlavaNextConfig, LlavaNextForConditionalGeneration
 # from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 # from lite_llama.models.llava import LlavaLlama

From f42bf9f059efe16e0e796e7bb0dbe92578804468 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sun, 18 May 2025 21:11:55 +0930
Subject: [PATCH 06/33] fix latency calculation

---
 generate.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/generate.py b/generate.py
index 764e6f9..ede2f23 100644
--- a/generate.py
+++ b/generate.py
@@ -84,13 +84,13 @@ def generate_llama(
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
     # Call the generation function and start the stream generation
+    start = time.perf_counter()
     stream = generator.text_completion_stream(
         prompts,
         temperature=temperature,
         top_p=top_p,
         max_gen_len=max_gen_len,
     )
-    end = time.perf_counter()
 
     completion = ''  # Initialize to generate the result
     # NOTE: After creating a generator, it can be iterated through a for loop
@@ -100,6 +100,7 @@ def generate_llama(
         completion = batch_completions[0]['generation']
         print(new_text, end='', flush=True)
         text_msg +=new_text
+    end = time.perf_counter()
 
     print("\n\n==================================\n")
     log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
@@ -143,7 +144,6 @@ def generate_llava(
     ram_before = process.memory_info().rss
 
     vram_before = get_gpu_memory(gpu_type)
-    start = time.perf_counter()
 
     # Initializing the Multimodal Model Text Generator
     try:
@@ -164,7 +164,7 @@ def generate_llava(
     image_token = get_image_token()
     model_prompter.insert_prompt(image_token * image_num + prompt)
     prompts = [model_prompter.model_input]
-
+    start = time.perf_counter()
     try:
         stream = generator.text_completion_stream(
             prompts,
@@ -175,7 +175,6 @@ def generate_llava(
         )
     except Exception as e:
         log.error(f"Text Generation Failure: {e}")
-    end = time.perf_counter()
 
     completion = ''  # Initialization generates results
     text_msg = ""
@@ -185,6 +184,7 @@ def generate_llava(
         completion = batch_completions[0]['generation']
         print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
         text_msg += next_text
+    end = time.perf_counter()
 
     print("\n\n==================================\n")
     log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")

From 6801f53aaa497994a85d2e1af168811a2cff4465 Mon Sep 17 00:00:00 2001
From: "zhanghonggao.zhg" <zhanghonggao.zhg@alibaba-inc.com>
Date: Mon, 19 May 2025 21:34:36 +0800
Subject: [PATCH 07/33] refactor: folder and import bugs

---
 .gitignore                                    |   3 +-
 README.md                                     |   4 +-
 apply_weight_convert.py                       |  46 +-
 cli.py                                        |  43 +-
 cli_llava.py                                  |  62 +--
 examples/benchmark.py                         | 144 ++++--
 {evaluator => examples/evaluator}/__init__.py |   0
 {utils => examples/evaluator}/eval.py         | 125 ++---
 examples/evaluator/eval_acc.py                |  57 +++
 examples/example_chat.py                      | 101 ++--
 .../example_eval_acc.py                       |  38 +-
 examples/example_llava.py                     |  35 +-
 generate.py                                   |  57 ++-
 lite_llama/__init__.py                        |   2 +-
 lite_llama/executor/cuda_graph.py             | 113 +++--
 lite_llama/executor/executor_struct.py        |  10 +-
 lite_llama/executor/mem_manager.py            | 181 ++++---
 lite_llama/executor/model_executor.py         | 267 +++++++----
 lite_llama/executor/req_tokens_manager.py     |  38 +-
 lite_llama/executor/weight_convert.py         | 116 ++---
 lite_llama/generate.py                        | 140 +++---
 lite_llama/generate_stream.py                 | 126 +++--
 lite_llama/generete_with_probs.py             | 154 +++---
 .../inference.py                              |  39 +-
 lite_llama/kernels/__init__.py                |   7 +-
 lite_llama/kernels/activations.py             |  12 +-
 lite_llama/kernels/flashattention.py          | 219 +++++----
 lite_llama/kernels/flashattention2_nopad.py   | 214 ++++++---
 lite_llama/kernels/flashattentionv2.py        | 192 +++++---
 lite_llama/kernels/flashdecoding.py           | 444 +++++++++++-------
 .../kernels/others/activation_layers.py       |  39 +-
 .../others/context_flashattention_nopad.py    | 186 ++++++--
 lite_llama/kernels/others/fused_linear.py     |  59 ++-
 lite_llama/kernels/others/layernorm.py        |  55 +--
 lite_llama/kernels/others/rmsnorm_layer.py    |  42 +-
 lite_llama/kernels/others/rmsnorm_v1.py       | 103 ++--
 lite_llama/kernels/others/rope_orig.py        | 100 ++--
 lite_llama/kernels/others/rotary_emb_v1.py    |  93 +++-
 lite_llama/kernels/rope_emb.py                |  17 +-
 lite_llama/kernels/skip_rmsnorm.py            | 156 ++++--
 lite_llama/kernels/softmax_split.py           |  13 +-
 lite_llama/kernels/swiglu.py                  |   7 +-
 lite_llama/kernels/update_kv_buffer.py        |  57 ++-
 lite_llama/kernels/update_kv_index.py         |  56 ++-
 lite_llama/kernels/utils.py                   |   2 +
 lite_llama/llava_generate_stream.py           | 176 ++++---
 lite_llama/models/RotaryEmbedding.py          | 172 +++++--
 lite_llama/models/clip.py                     |  96 ++--
 lite_llama/models/llama.py                    | 222 ++++++---
 lite_llama/models/llava.py                    | 122 +++--
 lite_llama/models/model_config.py             | 144 +++---
 lite_llama/models/qwen2.py                    | 224 ++++++---
 lite_llama/models/utils.py                    | 285 +++++++----
 {utils => lite_llama/utils}/common.py         |  34 +-
 lite_llama/utils/config_convert.py            |  46 +-
 lite_llama/utils/constants.py                 |   2 +-
 lite_llama/utils/file_interface.py            |   6 +-
 lite_llama/utils/image_process.py             |  17 +-
 {utils => lite_llama/utils}/logger.py         |  65 ++-
 lite_llama/utils/prompt_templates.py          |  44 +-
 requirement.txt                               |   3 +-
 tests/fused_mlp_silu.py                       | 227 ++++++---
 tests/kernels_benchmark.py                    | 142 ++++--
 tests/kernels_test.py                         |  73 ++-
 tests/softmax_native.py                       |  55 ++-
 tests/softmax_split.py                        |  18 +-
 tests/test_LlamaConfig.py                     |   9 +-
 tests/test_LlamaForCausalLM.py                |  33 +-
 tests/test_LlamaModel.py                      | 152 ++++--
 tests/test_LlavaConfig.py                     |  27 +-
 tests/test_LlavaForConditionalGeneration.py   |  25 +-
 tests/test_LlavaLlama.py                      |   7 +-
 tests/test_Qwen2ForCausalLM.py                |  25 +-
 tests/test_attention.py                       |  93 +++-
 tests/test_available_blocks.py                |  69 +--
 tests/test_cuda_graph.py                      |  83 ++--
 tests/test_flashattentionv2.py                |  99 +++-
 tests/test_flashdecoding.py                   | 111 +++--
 tests/test_flashdecoding_stage1.py            | 117 +++--
 tests/test_flashdecoding_stage2.py            | 220 +++++----
 tests/test_get_model_name.py                  |   6 +-
 tests/test_gpt2.py                            |  50 +-
 tests/test_image_process.py                   |   8 +-
 tests/test_image_token.py                     | 203 ++++++--
 tests/test_llama_layer.py                     |  59 ++-
 tests/test_load_weight.py                     |  70 +--
 tests/test_mask.py                            |  29 +-
 tests/test_mem_manager.py                     |  43 +-
 tests/test_merge.py                           | 281 ++++++-----
 ...est_merge_input_ids_with_image_features.py | 186 ++++----
 tests/test_qwen2.py                           | 253 ++++++----
 tests/test_rope_forward.py                    |  34 +-
 tests/test_standard_mha.py                    |  96 ++--
 tests/test_torch_matmul.py                    |  41 +-
 tests/test_torch_rope.py                      | 141 ++++--
 tests/test_transformers.py                    |  23 +-
 utils/__init__.py                             |   0
 97 files changed, 5553 insertions(+), 3187 deletions(-)
 rename {evaluator => examples/evaluator}/__init__.py (100%)
 rename {utils => examples/evaluator}/eval.py (75%)
 create mode 100644 examples/evaluator/eval_acc.py
 rename evaluator/evaluate_accuracy.py => examples/example_eval_acc.py (63%)
 rename evaluator/lite_llama_inference.py => lite_llama/inference.py (81%)
 rename {utils => lite_llama/utils}/common.py (75%)
 rename {utils => lite_llama/utils}/logger.py (66%)
 delete mode 100644 utils/__init__.py

diff --git a/.gitignore b/.gitignore
index ea67596..f78c57b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ test/tmp
 test/debug
 my_weight
 .idea
-logs
\ No newline at end of file
+logs
+lite_llama/logs
\ No newline at end of file
diff --git a/README.md b/README.md
index cd412d5..69f0b47 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
 ## Setup and Installation
  
 ### Pre-requisites
-> If you don't have a physical server, you can try using [virtaicloud remote server](https://growthdata.virtaicloud.com/t/hK).
+> If you don't have a physical server, you can try using [virtal cloud remote server](https://growthdata.virtaicloud.com/t/hK).
 
 lite_llama framework requires the following dependencies:
 
@@ -58,10 +58,8 @@ pytorch-triton-rocm 3.2.0
 torch               2.6.0+rocm6.2.4
 torchaudio          2.6.0+rocm6.2.4
 torchvision         0.21.0+rocm6.2.4
-
 ```
 
-
 ## Getting Started
 
 Recommended cuda version 12.0 and above. Download [llama3.2-1B-Instruct Model](https://pan.quark.cn/s/f476119babb3) and place it in the specified `checkpoints_dir` directory. `python apply_weight_convert.py` needs to be run to convert the hf model weights to `lite_llama` weight format, before running `cli.py`.
diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index 308696a..c8901b0 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -1,21 +1,35 @@
-import os, sys, torch
-from transformers import LlavaForConditionalGeneration, AutoConfig, AutoModelForCausalLM, LlavaConfig
+import torch
+from transformers import (
+    LlavaForConditionalGeneration,
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlavaConfig,
+)
+
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-from lite_llama.executor.weight_convert import convert_llavallama_hf_to_litellama, \
-                                        convert_llama_hf_to_litellama, convert_qwen2_hf_to_litellama
+from lite_llama.executor.weight_convert import (
+    convert_llavallama_hf_to_litellama,
+    convert_llama_hf_to_litellama,
+    convert_qwen2_hf_to_litellama,
+)
+
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
-checkpoints_dir = "/gemini/code/my_weight/Llama-3.2-1B-Instruct-hf"
+checkpoints_dir = "/path/llm_weights/llava-v1.5-7b"
 
 if "llava" in checkpoints_dir.lower():
-    model = LlavaForConditionalGeneration.from_pretrained( # LlavaForConditionalGeneration
-        checkpoints_dir, 
-        torch_dtype=torch.float16, 
-        low_cpu_mem_usage=True,
-    ).to("cuda")
+    model = (
+        LlavaForConditionalGeneration.from_pretrained(  # LlavaForConditionalGeneration
+            checkpoints_dir,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        ).to("cuda")
+    )
 else:
-    model = AutoModelForCausalLM.from_pretrained( # LlavaForConditionalGeneration
-        checkpoints_dir, 
-        torch_dtype=torch.float16, 
+    model = AutoModelForCausalLM.from_pretrained(
+        checkpoints_dir,
+        torch_dtype=torch.float16,
         low_cpu_mem_usage=True,
     ).to("cuda")
 
@@ -29,7 +43,7 @@
     num_layers = llm_config.num_hidden_layers
     print("num_layers: ", num_layers)
     convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-    
+
 elif "llama" in checkpoints_dir.lower():
     llm_config = AutoConfig.from_pretrained(checkpoints_dir)
     num_layers = llm_config.num_hidden_layers
@@ -57,5 +71,5 @@
 # 使用 init_empty_weights 初始化空模型
 # with init_empty_weights():
 #     llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-#     model = LlavaLlama(llava_config)  
-#     llama_config = model.llama_config
\ No newline at end of file
+#     model = LlavaLlama(llava_config)
+#     llama_config = model.llama_config
diff --git a/cli.py b/cli.py
index 0e4e390..ec49fca 100644
--- a/cli.py
+++ b/cli.py
@@ -1,24 +1,24 @@
 import torch
 from typing import Optional
 from lite_llama.utils.prompt_templates import get_prompter
-from lite_llama.generate_stream import GenerateStreamText # 导入 GenerateText 类
+from lite_llama.generate_stream import GenerateStreamText  # 导入 GenerateText 类
+
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
-# checkpoints_dir = '/gemini/code/lite_llama/my_weight/Qwen2.5-3B-Instruct' # 改成自己的存放模型路径
-checkpoints_dir = "/gemini/code/my_weight/Llama-3.2-1B-Instruct"
+checkpoints_dir = "/path/lite_llama/my_weight/Qwen2.5-3B"
 
 def main(
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 2048,
-    max_gpu_num_blocks = 40960,
+    max_gpu_num_blocks=40960,
     max_gen_len: Optional[int] = 1024,
     load_model: bool = True,
     compiled_model: bool = False,
-    triton_weight: bool = True
+    triton_weight: bool = True,
 ):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -29,26 +29,26 @@ def main(
     generator = GenerateStreamText(
         checkpoints_dir=checkpoints_dir,
         tokenizer_path=checkpoints_dir,
-        max_gpu_num_blocks = max_gpu_num_blocks,
-        max_seq_len = max_seq_len,
-        load_model = load_model,
-        compiled_model = compiled_model,
-        triton_weight = triton_weight,
+        max_gpu_num_blocks=max_gpu_num_blocks,
+        max_seq_len=max_seq_len,
+        load_model=load_model,
+        compiled_model=compiled_model,
+        triton_weight=triton_weight,
         device=device,
     )
-    
+
     while True:
-        prompt = input("请输入您的提示（输入 'exit' 退出）：\n") # 提示用户输入
+        prompt = input("请输入您的提示（输入 'exit' 退出）：\n")  # 提示用户输入
         # NOTE: strip() 是字符串方法，用于移除字符串开头和结尾的指定字符（默认为空格或换行符）。
-        if prompt.strip().lower() == 'exit':
+        if prompt.strip().lower() == "exit":
             print("程序已退出。")
             break
 
-        print("\n生成结果: ", end='', flush=True)
+        print("\n生成结果: ", end="", flush=True)
 
         model_prompter.insert_prompt(prompt)
         prompts = [model_prompter.model_input]
-        
+
         # 调用生成函数，开始流式生成
         stream = generator.text_completion_stream(
             prompts,
@@ -57,13 +57,14 @@ def main(
             max_gen_len=max_gen_len,
         )
 
-        completion = '' # 初始化生成结果
+        completion = ""  # 初始化生成结果
         # NOTE: 创建了一个 generator 后，可以通过 for 循环来迭代它
         for batch_completions in stream:
-            new_text = batch_completions[0]['generation'][len(completion):]
-            completion = batch_completions[0]['generation']
-            print(new_text, end='', flush=True)
+            new_text = batch_completions[0]["generation"][len(completion) :]
+            completion = batch_completions[0]["generation"]
+            print(new_text, end="", flush=True)
         print("\n\n==================================\n")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/cli_llava.py b/cli_llava.py
index 480fd1a..7822925 100644
--- a/cli_llava.py
+++ b/cli_llava.py
@@ -1,26 +1,29 @@
 import torch
 from typing import Optional
-from lite_llama.llava_generate_stream import LlavaGeneratorStream
-from lite_llama.utils.image_process import vis_images
-from lite_llama.utils.prompt_templates import get_prompter, get_image_token
+
 from rich.console import Console
 from rich.prompt import Prompt
-import sys,os
+
+import sys, os
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
+from lite_llama.llava_generate_stream import LlavaGeneratorStream
+from lite_llama.utils.image_process import vis_images
+from lite_llama.utils.prompt_templates import get_prompter, get_image_token
+
 # 模型检查点目录，请根据实际情况修改
-checkpoints_dir = "/gemini/code/lite_llama/my_weight/llava-1.5-7b-hf"
+checkpoints_dir = "/path/Qwen/llava-v1.5-7b"
 
 def main(
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 2048,
-    max_gpu_num_blocks = None,
+    max_gpu_num_blocks=None,
     max_gen_len: Optional[int] = 512,
     load_model: bool = True,
     compiled_model: bool = False,
-    triton_weight: bool = True
+    triton_weight: bool = True,
 ):
     """
     主函数，处理用户输入并生成响应。
@@ -36,7 +39,7 @@ def main(
         triton_weight (bool, optional): 是否使用Triton权重。默认值为True。
     """
     console = Console()
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -61,35 +64,37 @@ def main(
         sys.exit(1)
 
     while True:
-        console.print("[bold green]请输入图片路径或URL (输入 'exit' 退出）：[/bold green]") # 获取用户输入的图片路径或URL
-        while True: # 循环判断输入图像路径是否成功, 成功则跳出循环
+        console.print(
+            "[bold green]请输入图片路径或URL (输入 'exit' 退出）：[/bold green]"
+        )  # 获取用户输入的图片路径或URL
+        while True:  # 循环判断输入图像路径是否成功, 成功则跳出循环
             image_input = Prompt.ask("图片")
             if os.path.isfile(image_input):
                 break
-            elif image_input.strip().lower() == 'exit':
+            elif image_input.strip().lower() == "exit":
                 break
             else:
                 print(f"错误：'{image_input}' 不是有效的文件路径！")
                 image_input = Prompt.ask("图片")
 
         image_input = image_input.strip()
-        if image_input.lower() == 'exit':
+        if image_input.lower() == "exit":
             break
-        
-        image_items = [image_input] # 准备image_items列表
-        image_num = len(image_items) # 计算输入图片数量
-        vis_images(image_items) # 在终端中显示图片
+
+        image_items = [image_input]  # 准备image_items列表
+        image_num = len(image_items)  # 计算输入图片数量
+        vis_images(image_items)  # 在终端中显示图片
 
         # console.print("\n[bold blue]请输入提示词（输入 'exit' 退出）：[/bold blue]") # 获取用户的提示词
         input_prompt = Prompt.ask("[bold green]提示词[/bold green]").strip()
-        if input_prompt.lower() == 'exit':
+        if input_prompt.lower() == "exit":
             break
 
         image_token = get_image_token()
         model_prompter.insert_prompt(image_token * image_num + input_prompt)
 
         # prompts = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
-        prompts = [model_prompter.model_input] # 准备提示词，替换<image>标记
+        prompts = [model_prompter.model_input]  # 准备提示词，替换<image>标记
 
         # 调用生成器生成文本
         try:
@@ -103,16 +108,17 @@ def main(
         except Exception as e:
             console.print(f"[red]文本生成失败: {e}[/red]")
             continue
-        
-        completion = ''  # 初始化生成结果
-        console.print("ASSISTANT: ", end='')
-        
+
+        completion = ""  # 初始化生成结果
+        console.print("ASSISTANT: ", end="")
+
         for batch_completions in stream:
-            next_text = batch_completions[0]['generation'][len(completion):]
-            completion = batch_completions[0]['generation']
-            print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
-        
+            next_text = batch_completions[0]["generation"][len(completion) :]
+            completion = batch_completions[0]["generation"]
+            print(f"\033[91m{next_text}\033[0m", end="", flush=True)  # 红色文本
+
         console.print("\n[bold green]==================================[/bold green]\n")
-    
+
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/examples/benchmark.py b/examples/benchmark.py
index 4e929f2..618358d 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -1,20 +1,26 @@
-from typing import List, Optional
+from typing import Optional
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, Qwen2ForCausalLM
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+)
 
 import sys, os, time
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.generate import GenerateText
 from lite_llama.utils.prompt_templates import get_prompter
 
 import warnings
+
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
+
 def load_lite_llama_generator(
     checkpoints_dir: str,
     max_seq_len: int,
-    max_gpu_num_blocks = None,
-    device: str = "cuda"
+    max_gpu_num_blocks=None,
+    device: str = "cuda",
 ) -> GenerateText:
     """
     初始化 lite-llama 的生成器
@@ -23,7 +29,7 @@ def load_lite_llama_generator(
         checkpoints_dir=checkpoints_dir,
         tokenizer_path=checkpoints_dir,
         max_seq_len=max_seq_len,
-        max_gpu_num_blocks = max_gpu_num_blocks,
+        max_gpu_num_blocks=max_gpu_num_blocks,
         load_model=True,
         compiled_model=True,
         triton_weight=True,
@@ -31,7 +37,8 @@ def load_lite_llama_generator(
     )
     return generator
 
-def count_tokens(texts: List[str], tokenizer) -> int:
+
+def count_tokens(texts: list[str], tokenizer) -> int:
     # 优化后的分词统计
     total_tokens = 0
     for t in texts:
@@ -39,13 +46,14 @@ def count_tokens(texts: List[str], tokenizer) -> int:
         total_tokens += len(ids)
     return total_tokens
 
+
 def lite_llama_inference(
     generator: GenerateText,
-    prompts: List[str],
+    prompts: list[str],
     temperature: float,
     top_p: float,
     max_gen_len: Optional[int],
-    device: str = "cuda"
+    device: str = "cuda",
 ):
     """
     使用 lite-llama 的 GenerateText 实例执行推理，并返回结果与耗时、输出 tokens 数量
@@ -73,13 +81,14 @@ def lite_llama_inference(
 
     return results, end_time - start_time, total_tokens
 
+
 def transformers_inference(
     hf_model_name: str,
-    prompts: List[str],
+    prompts: list[str],
     temperature: float,
     top_p: float,
     max_gen_len: int,
-    device: str = "cuda"
+    device: str = "cuda",
 ):
     """
     使用 Transformers 官方库对一组 prompts 进行批量推理, 返回结果与耗时、输出 tokens 数量。
@@ -88,51 +97,62 @@ def transformers_inference(
     tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
     # 确保分词器有 eos_token
     if tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-        
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
     model = AutoModelForCausalLM.from_pretrained(
-        hf_model_name,
-        torch_dtype=torch.float16,
-        device_map="auto"
+        hf_model_name, torch_dtype=torch.float16, device_map="auto"
     )
     model.resize_token_embeddings(len(tokenizer))
     model.eval()
 
     # 预热步骤：让模型先对一个非常简单的 prompt 做一次推理
     warm_up_prompt = ["Hello World"]
-    warm_up_inputs = tokenizer(warm_up_prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)
+    warm_up_inputs = tokenizer(
+        warm_up_prompt, return_tensors="pt", padding=True, truncation=True
+    ).to(model.device)
     with torch.no_grad():
-        _ = model.generate(**warm_up_inputs, max_new_tokens=10, temperature=temperature, top_p=top_p, do_sample=True)
+        _ = model.generate(
+            **warm_up_inputs,
+            max_new_tokens=10,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+        )
 
     start_time = time.time()
-    model_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
+    model_inputs = tokenizer(
+        prompts, return_tensors="pt", padding=True, truncation=True
+    ).to(model.device)
     input_ids = model_inputs.input_ids
     generation_kwargs = {
         "max_new_tokens": max_gen_len,
         "top_p": top_p,
         "temperature": temperature,
         "do_sample": True,
-        "eos_token_id": None   # 避免过早终止
+        "eos_token_id": None,  # 避免过早终止
     }
 
     # 一次性进行批量推理
     with torch.no_grad():
         outputs = model.generate(**model_inputs, **generation_kwargs)
-        generated_ids = outputs[:, input_ids.size(-1):] 
-        generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-    
+        generated_ids = outputs[:, input_ids.size(-1) :]
+        generated_texts = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+
     end_time = time.time()
     results = [{"generation": text} for text in generated_texts]
-    texts = [res['generation'] for res in results]
+    texts = [res["generation"] for res in results]
     total_tokens = count_tokens(texts, tokenizer)
     total_time = end_time - start_time
     prompts_tokens = input_ids.numel()
-    per_token_latency = total_time / total_tokens if total_tokens > 0 else float('inf')
+    per_token_latency = total_time / total_tokens if total_tokens > 0 else float("inf")
 
     return results, total_time, total_tokens, prompts_tokens, per_token_latency
 
+
 def compare_inference_speed(
-    prompts: List[str],
+    prompts: list[str],
     temperature: float,
     top_p: float,
     max_seq_len: int,
@@ -140,7 +160,7 @@ def compare_inference_speed(
     lite_llama_ckpt_dir: str,
     hf_model_name: str,
     print_result=False,
-    device: str = "cuda"
+    device: str = "cuda",
 ):
     """
     对比 lite-llama 与 transformers 官方模型在相同 prompts 下的推理速度和吞吐量。
@@ -161,16 +181,30 @@ def compare_inference_speed(
         update_prompts.append(model_prompter.model_input)
 
     # 1. lite-llama inference
-    lite_llama_generator = load_lite_llama_generator(lite_llama_ckpt_dir, max_seq_len, max_gpu_num_blocks = 40960, device=device)
+    lite_llama_generator = load_lite_llama_generator(
+        lite_llama_ckpt_dir, max_seq_len, max_gpu_num_blocks=40960, device=device
+    )
     lite_llama_results, lite_llama_time, lite_llama_tokens = lite_llama_inference(
-        lite_llama_generator, update_prompts, temperature, top_p, max_gen_len, device=device
+        lite_llama_generator,
+        update_prompts,
+        temperature,
+        top_p,
+        max_gen_len,
+        device=device,
     )
     del lite_llama_generator
-    torch.cuda.empty_cache() # 使用完成后释放 lite_llama_generator 占用的显存
+    torch.cuda.empty_cache()  # 使用完成后释放 lite_llama_generator 占用的显存
 
     # 2. transformers inference
-    hf_results, hf_time, hf_tokens, prompts_tokens, hf_pt_latency = transformers_inference(
-        hf_model_name, update_prompts, temperature, top_p, max_gen_len, device=device
+    hf_results, hf_time, hf_tokens, prompts_tokens, hf_pt_latency = (
+        transformers_inference(
+            hf_model_name,
+            update_prompts,
+            temperature,
+            top_p,
+            max_gen_len,
+            device=device,
+        )
     )
 
     lite_llama_pt_latency = lite_llama_time / (lite_llama_tokens)
@@ -183,10 +217,12 @@ def compare_inference_speed(
     print("Transformers inference output tokens number: {:2d}".format(hf_tokens))
 
     # 吞吐量计算
-    lite_llama_throughput = (lite_llama_tokens) / lite_llama_time if lite_llama_time > 0 else float('inf')
+    lite_llama_throughput = (
+        (lite_llama_tokens) / lite_llama_time if lite_llama_time > 0 else float("inf")
+    )
     print(f"lite_llama throughput: {lite_llama_throughput:.2f} tokens/s")
-    
-    hf_throughput = hf_tokens / hf_time if hf_time > 0 else float('inf')
+
+    hf_throughput = hf_tokens / hf_time if hf_time > 0 else float("inf")
     print(f"Transformers throughput: {hf_throughput:.2f} tokens/s")
 
     # 打印 per token latency
@@ -195,17 +231,20 @@ def compare_inference_speed(
 
     # 打印部分推理结果对比
     if print_result:
-        for i, (prompt, litellama_res, hf_res) in enumerate(zip(prompts, lite_llama_results, hf_results)):
+        for i, (prompt, litellama_res, hf_res) in enumerate(
+            zip(prompts, lite_llama_results, hf_results)
+        ):
             # print(f"\n[Prompt {i}]:\n{prompt}")
-            if i == 0: # 省略部分打印
+            if i == 0:  # 省略部分打印
                 print("\n[lite_llama]: {}".format(litellama_res))
-                print("\n[Transformers]: {}".format(hf_res['generation']))
-                print("\n" + "="*40 + "\n")
+                print("\n[Transformers]: {}".format(hf_res["generation"]))
+                print("\n" + "=" * 40 + "\n")
+
 
 def main():
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    
-    prompts: List[str] = [
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    prompts: list[str] = [
         "I believe the meaning of life is to find happiness in the simple things. but how to achieve the meaning of life?",
         "VGG is a very important cnn backbone, please introduce vgg architecture and give implement code ",
         "Can you introduce the History of the American Civil War. ",
@@ -221,22 +260,22 @@ def main():
         "A Complete Introduction to the History of the American Civil War",
         "Python is a good programming language, how tolearn it?",
         "Please introduce llama model architecture and give implement cuda code.",
-        "Please introduce Qwen2.5 model structure and give cuda implement code."
+        "Please introduce Qwen2.5 model structure and give cuda implement code.",
     ]
 
-    # prompts: List[str] = [
+    # prompts: list[str] = [
     #     "How to learn cnn, please introduce resnet architecture and give code ",
     #     "How to learn cuda programming, give me some code example.",
     # ]
 
-    # prompts: List[str] = [
+    # prompts: list[str] = [
     #     "How to learn cnn, please introduce resnet architecture and give code.",
     #     "How to learn cuda programming, give me some code example.",
     #     "How to learn rust, give me some code examples.",
     #     "How to learn c++, give me some code examples.",
     # ]
 
-    # prompts: List[str] = [
+    # prompts: list[str] = [
     #     "I believe the meaning of life is to find happiness in the simple things. This is a very subjective and personal perspective, and it may vary from person to person. However, I believe that the simple things can bring a sense of joy and fulfillment to our lives.",
     #     "VGG is a very important cnn backbone, please introduce vgg architecture and give implement code ",
     #     "A Complete Introduction to the History of the American Civil War",
@@ -247,7 +286,7 @@ def main():
     #     "How to learn cnn, please introduce resnet architecture and give code ",
     # ]
 
-    # prompts: List[str] = [
+    # prompts: list[str] = [
     #     "I believe the meaning of life is to find happiness in the simple things. This is a very subjective and personal perspective, and it may vary from person ",
     #     "Simply put, the theory of relativity states that 3D space is not fixed, but is relative to the observer's frame of reference. Time is also relative, and it appears to ",
     #     """A brief message congratulating the team on the launch:
@@ -258,7 +297,7 @@ def main():
     #     "Roosevelt was the 26th president of the United States, he has a lot of information on the early history of the ,",
     # ]
 
-    # prompts: List[str] = [
+    # prompts: list[str] = [
     #     "I believe the meaning of life is",
     #     "Simply put, the theory of relativity states that 3D space",
     #     """A brief message congratulating the team on the launch:
@@ -270,9 +309,11 @@ def main():
     # ]
 
     hf_model_name = "/gemini/code/my_weight/Llama-3.2-1B-Instruct-hf"
-    custom_checkpoints_dir = "/gemini/code/my_weight/Llama-3.2-1B-Instruct"  # 根据实际情况修改
+    custom_checkpoints_dir = (
+        "/gemini/code/my_weight/Llama-3.2-1B-Instruct"  # 根据实际情况修改
+    )
     # hf_model_name = "/gemini/code/my_weight/Qwen-hf/Qwen2.5-1.5B-Instruct"
-    # custom_checkpoints_dir = "/gemini/code/my_weight/Qwen2.5-1.5B-Instruct"    
+    # custom_checkpoints_dir = "/gemini/code/my_weight/Qwen2.5-1.5B-Instruct"
     compare_inference_speed(
         prompts=prompts,
         temperature=0.7,
@@ -282,8 +323,9 @@ def main():
         lite_llama_ckpt_dir=custom_checkpoints_dir,
         hf_model_name=hf_model_name,
         print_result=True,
-        device=device
+        device=device,
     )
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/evaluator/__init__.py b/examples/evaluator/__init__.py
similarity index 100%
rename from evaluator/__init__.py
rename to examples/evaluator/__init__.py
diff --git a/utils/eval.py b/examples/evaluator/eval.py
similarity index 75%
rename from utils/eval.py
rename to examples/evaluator/eval.py
index ba29a5a..5a16de9 100644
--- a/utils/eval.py
+++ b/examples/evaluator/eval.py
@@ -1,18 +1,21 @@
 import random
-import sys, os, time
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
-
-from utils.common import *
-from typing import List, Optional, Any
-import string, re
+from typing import Optional, Any
+import string, re, json
 import torch
 from sentence_transformers import SentenceTransformer, util
 
-embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+def read_jsonl(jsonl_path):
+    with open(jsonl_path, "r", encoding="utf-8") as f:
+        data = [json.loads(line) for line in f]
+    return data
+
 class HotpotQA(object):
     r"""
-        for testing hotpot wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json
+    for testing hotpot wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json
     """
+
     def __init__(self, data_path, data_batch=None):
         self.data_type = "qa"
         self.data_path = data_path
@@ -21,8 +24,8 @@ def __init__(self, data_path, data_batch=None):
         self.data_batch = data_batch
 
     def extract_supporting_context(self, data: dict):
-        context_dict = dict(data['context'])
-        supporting_facts = data['supporting_facts']
+        context_dict = dict(data["context"])
+        supporting_facts = data["supporting_facts"]
 
         support_text = []
         for title, sent_idx in supporting_facts:
@@ -30,29 +33,29 @@ def extract_supporting_context(self, data: dict):
                 sentences = context_dict[title]
                 if sent_idx < len(sentences):
                     support_text.append(sentences[sent_idx])
-        return '\n'.join(support_text)
+        return "\n".join(support_text)
 
     def build_prompt(self, data: dict) -> str:
         context = self.extract_supporting_context(data)
-        question = data['question']
+        question = data["question"]
         prompt = f"""
-    Context:
-    {context}
-    
-    Question:
-    {question}
-    
-    Answer:"""
+            Context:
+            {context}
+            
+            Question:
+            {question}
+            
+            Answer:
+        """
         return prompt
 
-
     def parse_data(self) -> tuple[Any, Any, list[Any]]:
         data = read_json(self.data_path)
         test_data = list()
         for hotpot_index, hotpot_content in enumerate(data):
             data_index = hotpot_content["_id"]
             prompt = self.build_prompt(hotpot_content)
-            answer = hotpot_content['answer'].strip().lower()
+            answer = hotpot_content["answer"].strip().lower()
 
             test_data.append({data_index: {"prompt": prompt, "answer": answer}})
 
@@ -62,8 +65,9 @@ def parse_data(self) -> tuple[Any, Any, list[Any]]:
         return unify_data(test_data, self.data_batch, "qa")
 
     def evaluate(self, predictions, ground_truth):
-
-        assert len(predictions) == len(ground_truth), "Prediction and Ground Truth list must be the same length."
+        assert len(predictions) == len(ground_truth), (
+            "Prediction and Ground Truth list must be the same length."
+        )
 
         total_em = 0.0
         total_f1 = 0.0
@@ -81,21 +85,23 @@ def evaluate(self, predictions, ground_truth):
             "EM": total_em / n,
             "F1 (penalized)": total_f1 / n,
             "Jaccard": total_jaccard / n,
-            "Embedding Sim": total_embed_sim / n
+            "Embedding Sim": total_embed_sim / n,
         }
 
-        print(f"The test result of lite_llama inference for {self.data_type} dataset: {scores}")
+        print(
+            f"The test result of lite_llama inference for {self.data_type} dataset: {scores}"
+        )
 
 
 class HellaSwag(object):
     r"""
-        for testing HellaSwag wget https://raw.githubusercontent.com/rowanz/hellaswag/refs/heads/master/data/hellaswag_val.jsonl
+    for testing HellaSwag wget https://raw.githubusercontent.com/rowanz/hellaswag/refs/heads/master/data/hellaswag_val.jsonl
     """
 
     def __init__(self, data_path, data_batch=None):
         self.data_path = data_path
         self.data_type = "mcq"
-        self.choices = ['A', 'B', 'C', 'D']
+        self.choices = ["A", "B", "C", "D"]
 
         # data_batch=none means testing all the data in the dataset
         self.data_batch = data_batch
@@ -108,16 +114,15 @@ def format_prompt(self, ctx, endings):
         return prompt
 
     def extract_choice(self, output_text):
-        for letter in ['A', 'B', 'C', 'D']:
+        for letter in ["A", "B", "C", "D"]:
             if letter in output_text:
-                return ['A', 'B', 'C', 'D'].index(letter)
+                return ["A", "B", "C", "D"].index(letter)
         return -1
 
     def convert_answer(self, answer) -> str:
         return self.choices[int(answer)]
 
     def parse_data(self) -> tuple[Any, Any, list[Any]]:
-
         data = read_jsonl(self.data_path)
         test_data = list()
         for index, content in enumerate(data):
@@ -131,15 +136,18 @@ def parse_data(self) -> tuple[Any, Any, list[Any]]:
                 ("D", content["endings"][3]),
             ]
 
-            test_data.append({index: {"prompt": prompt, "answer": answer, "options": option}})
+            test_data.append(
+                {index: {"prompt": prompt, "answer": answer, "options": option}}
+            )
 
         if self.data_batch is None:
             self.data_batch = len(test_data)
         return unify_data(test_data, self.data_batch, self.data_type)
 
     def evaluate(self, predictions, ground_truth, options):
-
-        assert len(predictions) == len(ground_truth), "Prediction and Ground Truth list must be the same length."
+        assert len(predictions) == len(ground_truth), (
+            "Prediction and Ground Truth list must be the same length."
+        )
 
         total_em = 0.0
         total_f1 = 0.0
@@ -163,10 +171,12 @@ def evaluate(self, predictions, ground_truth, options):
             "EM": total_em / n,
             "F1 (penalized)": total_f1 / n,
             "Jaccard": total_jaccard / n,
-            "Embedding Sim": total_embed_sim / n
+            "Embedding Sim": total_embed_sim / n,
         }
 
-        print(f"The test result of lite_llama inference for {self.data_type} dataset: {scores}")
+        print(
+            f"The test result of lite_llama inference for {self.data_type} dataset: {scores}"
+        )
 
 
 def matched_pairs(list1, list2, n):
@@ -197,27 +207,26 @@ def unify_data(test_data, data_batch, data_type: Optional[str]):
     for index, data in enumerate(test_data):
         key = next(iter(data))
 
-        ground_truth.append(data[key]['answer'])
-        prompts.append(data[key]['prompt'])
+        ground_truth.append(data[key]["answer"])
+        prompts.append(data[key]["prompt"])
         if data_type == "mcq":
-            options.append(data[key]['options'])
+            options.append(data[key]["options"])
     ground_truth, prompts = matched_pairs(ground_truth, prompts, data_batch)
 
     return ground_truth, prompts, options
 
 
-
 def normalize_answer(s):
     """Lower text and remove punctuation, articles and extra whitespace."""
 
     def remove_articles(text):
-        return re.sub(r'\b(a|an|the)\b', ' ', text)
+        return re.sub(r"\b(a|an|the)\b", " ", text)
 
     def white_space_fix(text):
-        return ' '.join(text.split())
+        return " ".join(text.split())
 
     def remove_punc(text):
-        return text.translate(str.maketrans('', '', string.punctuation))
+        return text.translate(str.maketrans("", "", string.punctuation))
 
     def lower(text):
         return text.lower()
@@ -229,9 +238,11 @@ def remove_consecutive_duplicates(text):
         for i in range(1, len(words)):
             if words[i] != words[i - 1]:
                 result.append(words[i])
-        return ' '.join(result)
+        return " ".join(result)
 
-    return remove_consecutive_duplicates(white_space_fix(remove_articles(remove_punc(lower(s)))))
+    return remove_consecutive_duplicates(
+        white_space_fix(remove_articles(remove_punc(lower(s))))
+    )
 
 
 def exact_match(pred, gt):
@@ -256,7 +267,6 @@ def penalized_f1(prediction, ground_truth, max_len_ratio=3, penalty_factor=0.5):
     return f1
 
 
-
 def jaccard_similarity(prediction, ground_truth):
     pred_tokens = set(normalize_answer(prediction).split())
     gt_tokens = set(normalize_answer(ground_truth).split())
@@ -268,8 +278,11 @@ def jaccard_similarity(prediction, ground_truth):
     union = pred_tokens | gt_tokens
     return len(intersection) / len(union)
 
+
 def embedding_similarity(prediction, ground_truth):
-    embeddings = embedding_model.encode([prediction, ground_truth], convert_to_tensor=True)
+    embeddings = embedding_model.encode(
+        [prediction, ground_truth], convert_to_tensor=True
+    )
     sim_score = util.cos_sim(embeddings[0], embeddings[1])
     return sim_score.item()
 
@@ -284,18 +297,17 @@ def extract_final_choice(text: str) -> Any | None:
     # Priority: explicit natural language conclusion
     # Pattern 1: match "answer: a", "correct answer is: b", etc.
     patterns = [
-        r'answer\s*[:\-]?\s*([a-dA-D])\b',
-        r'option\s*([a-dA-D])\b',
-        r'\b([a-dA-D])\b\s+is\s+(correct|the answer)',
-        r'\b([a-dA-D])[\).]',
-        r'choice\s*[:\-]?\s*([a-dA-D])\b'
+        r"answer\s*[:\-]?\s*([a-dA-D])\b",
+        r"option\s*([a-dA-D])\b",
+        r"\b([a-dA-D])\b\s+is\s+(correct|the answer)",
+        r"\b([a-dA-D])[\).]",
+        r"choice\s*[:\-]?\s*([a-dA-D])\b",
     ]
     for pat in patterns:
         match = re.search(pat, text, re.IGNORECASE)
         if match:
             return match.group(1).upper()
 
-
     return None
 
 
@@ -314,10 +326,11 @@ def match_mc_option(prediction, options):
     # Compute cosine similarity
     cos_sims = util.cos_sim(pred_emb, option_embs)[0]  # shape: (4,)
     best_idx = int(torch.argmax(cos_sims).item())
-    return options[best_idx][0], cos_sims.tolist()  # Returns the matching option ID and all similarities
-
-if __name__ == '__main__':
+    return options[best_idx][
+        0
+    ], cos_sims.tolist()  # Returns the matching option ID and all similarities
 
 
+if __name__ == "__main__":
     hw = HellaSwag("/path_to/hellaswag_val.jsonl")
-    hw.process()
\ No newline at end of file
+    hw.process()
diff --git a/examples/evaluator/eval_acc.py b/examples/evaluator/eval_acc.py
new file mode 100644
index 0000000..4ac65e2
--- /dev/null
+++ b/examples/evaluator/eval_acc.py
@@ -0,0 +1,57 @@
+import warnings
+
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
+import torch
+
+from eval import *
+import sys, os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from lite_llama.inference import Inference
+
+class EvaluatorAccuracy(object):
+    def __init__(self, test_data_path, custom_checkpoints_dir, data_batch=10):
+        self.custom_checkpoints_dir = custom_checkpoints_dir
+        self.test_data_path = test_data_path
+        self.data_batch = data_batch
+
+        # init inference
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.model_inference = Inference(
+            temperature=0.7,
+            top_p=0.8,
+            max_seq_len=2048,
+            max_gen_len=1900,
+            lite_llama_ckpt_dir=self.custom_checkpoints_dir,
+            device=self.device,
+        )
+
+    def process(
+        self,
+    ):
+        if "hotpot" in self.test_data_path.lower():
+            data_obj = HotpotQA(self.test_data_path, self.data_batch)
+
+        elif "hellaswag" in self.test_data_path.lower():
+            data_obj = HellaSwag(self.test_data_path, self.data_batch)
+
+        try:
+            assert data_obj is not None, "data_obj has not been created"
+        except NameError:
+            raise AssertionError("Dataset may not be supported")
+
+        ground_truth, prompts, options = data_obj.parse_data()
+
+        predictions = self.model_inference.process(prompts)
+
+        if data_obj.data_type == "mcq":
+            data_obj.evaluate(predictions, ground_truth, options)
+        else:
+            data_obj.evaluate(predictions, ground_truth)
+
+
+if __name__ == "__main__":
+    ea = EvaluatorAccuracy(
+        "/path_to/hotpot_dev_distractor_v1.json", "/path_to/Llama-3.2-3B-Instruct"
+    )
+    ea.process()
diff --git a/examples/example_chat.py b/examples/example_chat.py
index 3def89e..0e9b542 100644
--- a/examples/example_chat.py
+++ b/examples/example_chat.py
@@ -2,19 +2,24 @@
 import torch
 
 import sys, os, time
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.generate import GenerateText
 from lite_llama.generate_stream import GenerateStreamText
 import warnings
+
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
-checkpoints_dir = "/gemini/code/lite_llama/my_weight/Qwen2.5-3B" # 改成自己的存放模型路径
+checkpoints_dir = (
+    "/homg/honggao/lite_llama/my_weight/Qwen2.5-3B"  # 改成自己的存放模型路径
+)
+
 
 def cli_generate_stream(
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 512,
-    max_gpu_num_blocks = None,
+    max_gpu_num_blocks=None,
     max_gen_len: Optional[int] = 128,
 ):
     """
@@ -28,12 +33,12 @@ def cli_generate_stream(
         max_gen_len (int): 生成序列的最大长度。
 
     """
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     generator = GenerateStreamText(
         checkpoints_dir=checkpoints_dir,
         tokenizer_path=checkpoints_dir,
-        max_gpu_num_blocks = max_gpu_num_blocks, 
+        max_gpu_num_blocks=max_gpu_num_blocks,
         max_seq_len=max_seq_len,
         load_model=True,
         compiled_model=True,
@@ -50,13 +55,12 @@ def cli_generate_stream(
 
         I just """,
         "Roosevelt was the first president of the United States, he has",
-
-        "Here are some tips and resources to help you get started:"
+        "Here are some tips and resources to help you get started:",
     ]
 
     for idx, prompt in enumerate(prompts):
         print(f"Prompt {idx}: {prompt}")
-        print("Generated output:", end='', flush=True)
+        print("Generated output:", end="", flush=True)
 
         stream = generator.text_completion_stream(
             [prompt],
@@ -66,20 +70,21 @@ def cli_generate_stream(
         )
 
         # 初始化生成结果
-        completion = ''
+        completion = ""
         for batch_completions in stream:
-            new_text = batch_completions[0]['generation'][len(completion):]
-            completion = batch_completions[0]['generation']
-            print(new_text, end='', flush=True)
+            new_text = batch_completions[0]["generation"][len(completion) :]
+            completion = batch_completions[0]["generation"]
+            print(new_text, end="", flush=True)
         print("\n\n==================================\n")
 
+
 def cli_generate(
-	temperature: float = 0.6,
+    temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 512,
     max_gen_len: Optional[int] = 64,
 ):
-	"""
+    """
     Entry point of the program for generating text using a pretrained model.
 
     Args:
@@ -94,44 +99,46 @@ def cli_generate(
         max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be
             set to the model's max sequence length. Defaults to None.
     """
-	device = 'cuda' if torch.cuda.is_available() else 'cpu'
-	
-	generator = GenerateText(
-		checkpoints_dir = checkpoints_dir,
-		tokenizer_path = checkpoints_dir,
-		max_seq_len = max_seq_len,
-		load_model = True,
-		compiled_model = True,
-		triton_weight = True,
-		device = device,
-	)
-
-	prompts: List[str] = [
-		# For these prompts, the expected answer is the natural continuation of the prompt
-		"I believe the meaning of life is",
-		"Simply put, the theory of relativity states that ",
-		"""A brief message congratulating the team on the launch:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    generator = GenerateText(
+        checkpoints_dir=checkpoints_dir,
+        tokenizer_path=checkpoints_dir,
+        max_seq_len=max_seq_len,
+        load_model=True,
+        compiled_model=True,
+        triton_weight=True,
+        device=device,
+    )
+
+    prompts: List[str] = [
+        # For these prompts, the expected answer is the natural continuation of the prompt
+        "I believe the meaning of life is",
+        "Simply put, the theory of relativity states that ",
+        """A brief message congratulating the team on the launch:
 
 		Hi everyone,
 		
 		I just """,
-		"Roosevelt was the first president of the United States, he has",
-	]
-
-	results = generator.text_completion(
-		prompts,
-		temperature=temperature,
-		top_p=top_p,
-		max_gen_len=max_gen_len,
-	)
-
-	for prompt, result in zip(prompts, results):
-		print(prompt)
-		print(f"> {result['generation']}")
-		print("\n==================================\n")
-
-def main(stream_flag = False):
+        "Roosevelt was the first president of the United States, he has",
+    ]
+
+    results = generator.text_completion(
+        prompts,
+        temperature=temperature,
+        top_p=top_p,
+        max_gen_len=max_gen_len,
+    )
+
+    for prompt, result in zip(prompts, results):
+        print(prompt)
+        print(f"> {result['generation']}")
+        print("\n==================================\n")
+
+
+def main(stream_flag=False):
     cli_generate_stream() if stream_flag else cli_generate()
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/evaluator/evaluate_accuracy.py b/examples/example_eval_acc.py
similarity index 63%
rename from evaluator/evaluate_accuracy.py
rename to examples/example_eval_acc.py
index ba611f7..7ea940e 100644
--- a/evaluator/evaluate_accuracy.py
+++ b/examples/example_eval_acc.py
@@ -1,18 +1,13 @@
 import warnings
-import string
-
-import sys, os, time
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
-
-
-from lite_llama_inference import LiteLlamaInference
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, Qwen2ForCausalLM
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
+import torch
 
-from utils.eval import *
+from evaluator.eval import *
 
+import sys, os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from lite_llama.inference import Inference
 
 class EvaluatorAccuracy(object):
     def __init__(self, test_data_path, custom_checkpoints_dir, data_batch=10):
@@ -21,18 +16,20 @@ def __init__(self, test_data_path, custom_checkpoints_dir, data_batch=10):
         self.data_batch = data_batch
 
         # init inference
-        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
-        self.lite_llama_inference = LiteLlamaInference(
+        self.model_inference = Inference(
             temperature=0.7,
             top_p=0.8,
             max_seq_len=2048,
             max_gen_len=1900,
             lite_llama_ckpt_dir=self.custom_checkpoints_dir,
-            device=self.device
+            device=self.device,
         )
-    def process(self,):
 
+    def process(
+        self,
+    ):
         if "hotpot" in self.test_data_path.lower():
             data_obj = HotpotQA(self.test_data_path, self.data_batch)
 
@@ -46,7 +43,7 @@ def process(self,):
 
         ground_truth, prompts, options = data_obj.parse_data()
 
-        predictions = self.lite_llama_inference.process(prompts)
+        predictions = self.model_inference.process(prompts)
 
         if data_obj.data_type == "mcq":
             data_obj.evaluate(predictions, ground_truth, options)
@@ -54,9 +51,8 @@ def process(self,):
             data_obj.evaluate(predictions, ground_truth)
 
 
-if __name__ == '__main__':
-    ea = EvaluatorAccuracy("/path_to/hotpot_dev_distractor_v1.json",
-                           "/path_to/Llama-3.2-3B-Instruct")
-    # ea = EvaluatorAccuracy("/path_to/hellaswag_val.jsonl",
-    #                        "/path_to/Llama-3.2-3B-Instruct")
-    ea.process()
\ No newline at end of file
+if __name__ == "__main__":
+    ea = EvaluatorAccuracy(
+        "/path_to/hotpot_dev_distractor_v1.json", "/path_to/Llama-3.2-3B-Instruct"
+    )
+    ea.process()
diff --git a/examples/example_llava.py b/examples/example_llava.py
index 6ad0286..428a2d1 100644
--- a/examples/example_llava.py
+++ b/examples/example_llava.py
@@ -2,31 +2,35 @@
 from typing import Optional
 
 import sys, os
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
-from lite_llama.llava_generate_stream import LlavaGeneratorStream # 导入 GenerateText 类\
+from lite_llama.llava_generate_stream import (
+    LlavaGeneratorStream,
+)  # 导入 GenerateText 类
 
 checkpoints_dir = "/gemini/code/lite_llama/my_weight/llava-1.5-7b-hf"
 
+
 def main(
     temperature: float = 0.6,
     top_p: float = 0.9,
     max_seq_len: int = 2048,
-    max_gpu_num_blocks = None,
+    max_gpu_num_blocks=None,
     max_gen_len: Optional[int] = 64,
     load_model: bool = True,
     compiled_model: bool = True,
-    triton_weight: bool = True
+    triton_weight: bool = True,
 ):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
     generator = LlavaGeneratorStream(
         checkpoints_dir=checkpoints_dir,
         tokenizer_path=checkpoints_dir,
-        max_gpu_num_blocks = max_gpu_num_blocks,
-        max_seq_len = max_seq_len,
-        load_model = load_model,
-        compiled_model = compiled_model,
-        triton_weight = triton_weight,
+        max_gpu_num_blocks=max_gpu_num_blocks,
+        max_seq_len=max_seq_len,
+        load_model=load_model,
+        compiled_model=compiled_model,
+        triton_weight=triton_weight,
         device=device,
     )
 
@@ -42,13 +46,14 @@ def main(
         max_gen_len=max_gen_len,
     )
 
-    completion = '' # 初始化生成结果
+    completion = ""  # 初始化生成结果
     # NOTE: 创建了一个 generator 后，可以通过 for 循环来迭代它
     for batch_completions in stream:
-        new_text = batch_completions[0]['generation'][len(completion):]
-        completion = batch_completions[0]['generation']
-        print(new_text, end=' ', flush=True)
+        new_text = batch_completions[0]["generation"][len(completion) :]
+        completion = batch_completions[0]["generation"]
+        print(new_text, end=" ", flush=True)
     print("\n\n==================================\n")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/generate.py b/generate.py
index 8a38d8b..6d359e0 100644
--- a/generate.py
+++ b/generate.py
@@ -1,14 +1,14 @@
 import torch
 from typing import Optional
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
+from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
 from lite_llama.utils.prompt_templates import get_prompter
 from lite_llama.generate_stream import GenerateStreamText  # 导入 GenerateText 类
 import warnings
 
-warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
-
 import sys, os, time
 from pathlib import Path
+
 # support running without installing as a package
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
@@ -22,7 +22,7 @@ def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
     ram_after = process.memory_info().rss
     vram_after = get_gpu_memory(gpu_type)
 
-    ram_used = (ram_after - ram_before) / (1024 ** 3)  # Bytes to GB
+    ram_used = (ram_after - ram_before) / (1024**3)  # Bytes to GB
 
     if vram_before is not None and vram_after is not None:
         vram_used = vram_after - vram_before
@@ -35,21 +35,21 @@ def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
 
 
 def main(
-        prompt: str = "Hello, my name is",
-        *,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_seq_len: int = 2048,
-        max_gpu_num_blocks=40960,
-        max_gen_len: Optional[int] = 1024,
-        load_model: bool = True,
-        compiled_model: bool = False,
-        triton_weight: bool = True,
-        gpu_type: str = "nvidia",
-        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-        quantize: Optional[str] = None,
+    prompt: str = "Hello, my name is",
+    *,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    max_seq_len: int = 2048,
+    max_gpu_num_blocks=40960,
+    max_gen_len: Optional[int] = 1024,
+    load_model: bool = True,
+    compiled_model: bool = False,
+    triton_weight: bool = True,
+    gpu_type: str = "nvidia",
+    checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+    quantize: Optional[str] = None,
 ):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
 
@@ -57,7 +57,9 @@ def main(
         short_prompt = True
     else:
         short_prompt = False
-    model_prompter = get_prompter(get_model_type(checkpoint_path), checkpoint_path, short_prompt)
+    model_prompter = get_prompter(
+        get_model_type(checkpoint_path), checkpoint_path, short_prompt
+    )
     # Start resource tracking
     ram_before = process.memory_info().rss
 
@@ -88,23 +90,26 @@ def main(
     )
     end = time.perf_counter()
 
-    completion = ''  # Initialize to generate the result
+    completion = ""  # Initialize to generate the result
     # NOTE: After creating a generator, it can be iterated through a for loop
     text_msg = ""
     for batch_completions in stream:
-        new_text = batch_completions[0]['generation'][len(completion):]
-        completion = batch_completions[0]['generation']
-        print(new_text, end='', flush=True)
-        text_msg +=new_text
+        new_text = batch_completions[0]["generation"][len(completion) :]
+        completion = batch_completions[0]["generation"]
+        print(new_text, end="", flush=True)
+        text_msg += new_text
 
     print("\n\n==================================\n")
-    print(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    print(
+        f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer) / (end - start):.2f} tokens/sec"
+    )
 
     # Report resource usage
     report_resource_usage(ram_before, vram_before, gpu_type)
 
+
 if __name__ == "__main__":
     from jsonargparse import CLI
 
     torch.set_float32_matmul_precision("high")
-    CLI(main)
\ No newline at end of file
+    CLI(main)
diff --git a/lite_llama/__init__.py b/lite_llama/__init__.py
index 6c82047..8226cf6 100644
--- a/lite_llama/__init__.py
+++ b/lite_llama/__init__.py
@@ -1,3 +1,3 @@
 from lite_llama.generate import GenerateText
 from lite_llama.generate_stream import GenerateStreamText
-from lite_llama.llava_generate_stream import LlavaGeneratorStream
\ No newline at end of file
+from lite_llama.llava_generate_stream import LlavaGeneratorStream
diff --git a/lite_llama/executor/cuda_graph.py b/lite_llama/executor/cuda_graph.py
index 50f6437..c454229 100644
--- a/lite_llama/executor/cuda_graph.py
+++ b/lite_llama/executor/cuda_graph.py
@@ -6,7 +6,10 @@
 from ..models.utils import weak_ref_tensor
 
 _BATCH_SIZE_ALIGNMENT = 8
-_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025)]
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
+    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025)
+]
+
 
 class CUDAGraphRunner:
     def __init__(self, model):
@@ -14,17 +17,17 @@ def __init__(self, model):
         self._cuda_graph = None
         self._graph_inputs: Dict[str, torch.Tensor] = {}
         self._graph_output = None
-    
+
     def capture(
-        self, 
-        input_ids: torch.Tensor, 
-        position_ids: torch.Tensor, 
-        atten_info: AttentionInfo
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        atten_info: AttentionInfo,
     ):
         assert self._cuda_graph is None, "Already compiled the model"
         # 用于捕获的占位符输入
         self._graph_inputs = [input_ids, position_ids, atten_info]
-        
+
         # Warm up
         graph_capture_stream = torch.cuda.Stream()
         graph_capture_stream.wait_stream(torch.cuda.current_stream())
@@ -44,7 +47,7 @@ def capture(
                 position_ids=position_ids,
                 atten_info=atten_info,
             )
-        
+
         # Save the input and output buffers.
         self._graph_inputs = {
             "input_ids": input_ids,
@@ -56,15 +59,17 @@ def capture(
         }
 
     def forward(
-        self, 
-        input_ids: torch.Tensor, 
-        position_ids: torch.Tensor, 
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
         atten_info: AttentionInfo,
     ):
-        del atten_info.kv_buffer # kv_buffer are fixed tensors, so we don't need to copy them.
+        del (
+            atten_info.kv_buffer
+        )  # kv_buffer are fixed tensors, so we don't need to copy them.
         del atten_info.b_req_tokens_table
         # 更新输入缓冲区
-        self._graph_inputs["input_ids"].copy_(input_ids) # 据填充 graph 的输入内存
+        self._graph_inputs["input_ids"].copy_(input_ids)  # 据填充 graph 的输入内存
         self._graph_inputs["position_ids"].copy_(position_ids)
 
         self._graph_inputs["cur_select_index"].copy_(atten_info.cur_select_index)
@@ -73,16 +78,21 @@ def forward(
         self._cuda_graph.replay()
 
         return self._graph_output
-    
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
-    
+
+
 class ModelRunner:
-    def __init__(self, model, model_config, 
-                max_gpu_num_blocks:int,
-                kv_mem_manager: KVCacheMemoryManager,
-                req_tokens_manager, 
-                seq_len: int=1, start_pos = 8
+    def __init__(
+        self,
+        model,
+        model_config,
+        max_gpu_num_blocks: int,
+        kv_mem_manager: KVCacheMemoryManager,
+        req_tokens_manager,
+        seq_len: int = 1,
+        start_pos=8,
     ):
         self.model = model
         self.model_config = model_config
@@ -91,7 +101,7 @@ def __init__(self, model, model_config,
         self.req_tokens_manager = req_tokens_manager
 
         self.vocab_size = self.model_config.vocab_size
-        self.graph_max_batch_size=self.model_config.max_batch_size
+        self.graph_max_batch_size = self.model_config.max_batch_size
         self.max_seq_len = model_config.max_seq_len
 
         # 随机参数定义
@@ -102,57 +112,74 @@ def __init__(self, model, model_config,
 
     def build_atten_info(self, batch_size, atten_info, device="cuda"):
         """针对 decode 阶段, 构建 attention 输入信息结构体"""
-        atten_info.kv_buffer = self.kv_mem_manager.gpu_kv_buffer # torch.Tensor
-        atten_info.b_req_tokens_table = self.req_tokens.manager.b_req_tokens_table # torch.Tensor
+        atten_info.kv_buffer = self.kv_mem_manager.gpu_kv_buffer  # torch.Tensor
+        atten_info.b_req_tokens_table = (
+            self.req_tokens.manager.b_req_tokens_table
+        )  # torch.Tensor
 
-        atten_info.b_req_idx = torch.arange(batch_size, device = device) # torch.Tensor
-        atten_info.b_seq_len = torch.ones(batch_size, dtype=torch.int32, device="cuda") # torch.Tensor
-        atten_info.cur_select_index, = self.kv_mem_manager.alloc_kvcache_index(batch_size) # torch.Tensor
+        atten_info.b_req_idx = torch.arange(batch_size, device=device)  # torch.Tensor
+        atten_info.b_seq_len = torch.ones(
+            batch_size, dtype=torch.int32, device="cuda"
+        )  # torch.Tensor
+        (atten_info.cur_select_index,) = self.kv_mem_manager.alloc_kvcache_index(
+            batch_size
+        )  # torch.Tensor
 
         return atten_info
-    
-    def capture_decode_graph(self, ):
+
+    def capture_decode_graph(
+        self,
+    ):
         """
         针对 decode 阶段捕获 CUDA 图
         """
         # 获取要捕获的批量大小列表，确保批量大小不超过最大批量大小
-        batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= self.graph_max_batch_size]
+        batch_size_capture_list = [
+            bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= self.graph_max_batch_size
+        ]
         atten_info = AttentionInfo
         print("cuda graph support batch list", batch_size_capture_list)
-        
+
         # NOTE: Capturing the largest batch size first may help reduce the memory usage of CUDA graph.
         for batch_size in reversed(batch_size_capture_list):
             # 构造输入 tokens id 张量
             input_ids = torch.randint(0, self.vocab_size, (batch_size, 1)).cuda()
             position_ids = (
-                torch.arange(self.start_pos, self.start_pos + 1, device=input_ids.device)
-                .unsqueeze(0)            # shape: [1, seq_len]
+                torch.arange(
+                    self.start_pos, self.start_pos + 1, device=input_ids.device
+                )
+                .unsqueeze(0)  # shape: [1, seq_len]
                 .expand(batch_size, -1)  # shape: [batch_size, seq_len], 不分配额外内存
             )
             atten_info = self.build_atten_info(batch_size, atten_info)
-            print("apply cuda grpah atten_info.decode_index shape ", atten_info.decode_index.shape)
-            
+            print(
+                "apply cuda grpah atten_info.decode_index shape ",
+                atten_info.decode_index.shape,
+            )
+
             graph_intput = (input_ids, position_ids, atten_info)
             graph_runner = CUDAGraphRunner(self.model)
-            
+
             # graph 图捕捉输入
             graph_runner.capture(*graph_intput)
             self.graph_runners[batch_size] = graph_runner
-            
+
             self.kv_mem_manager.free_all()
 
     def decode(
-        self, 
-        input_ids: torch.Tensor, 
-        position_ids: torch.Tensor, 
-        atten_info: AttentionInfo
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        atten_info: AttentionInfo,
     ):
         batch_size = input_ids.shape[0]
         if batch_size in self.graph_runners:
             model_executable = self.graph_runners[batch_size]
         else:
-            print("Warning: CUDA graph not captured for this batch size, falling back to original model.")
+            print(
+                "Warning: CUDA graph not captured for this batch size, falling back to original model."
+            )
             model_executable = self.model
-        
+
         logits = model_executable(input_ids, position_ids, atten_info)
-        return logits
\ No newline at end of file
+        return logits
diff --git a/lite_llama/executor/executor_struct.py b/lite_llama/executor/executor_struct.py
index e02ed8e..5f51bd3 100644
--- a/lite_llama/executor/executor_struct.py
+++ b/lite_llama/executor/executor_struct.py
@@ -1,19 +1,19 @@
 from dataclasses import dataclass
 import torch
-from typing import List
 
 @dataclass
 class ModelRunnerConfig:
     block_size = 1
     checkpoints_dir = "/gemini/code/Llama-3.2-1B-Instruct"
     max_batch_size = 16
-    gpu_memory_utilization=0.9
+    gpu_memory_utilization = 0.9
+
 
 @dataclass
 class AttentionInfo:
     # kv_cache = None # prefill 阶段的 context kv cache
-    kv_buffer = List[torch.tensor([])]
-    cur_select_index = torch.empty((0,),dtype=torch.int32)
+    kv_buffer = list[torch.tensor([])]
+    cur_select_index = torch.empty((0,), dtype=torch.int32)
     b_req_tokens_table = None
     b_start_loc = None
-    b_req_idx = None
\ No newline at end of file
+    b_req_idx = None
diff --git a/lite_llama/executor/mem_manager.py b/lite_llama/executor/mem_manager.py
index 280ef35..3a375df 100644
--- a/lite_llama/executor/mem_manager.py
+++ b/lite_llama/executor/mem_manager.py
@@ -4,25 +4,28 @@
 
 logger = logging.getLogger(__name__)
 
+
 def get_dtype_size(dtype: torch.dtype) -> int:
     """Get the size of the data type in bytes."""
     return torch.tensor([], dtype=dtype).element_size()
 
+
 class ComputeMaxAvailableBlocks:
     """A class that can execute a forward pass with dummy inputs to profile the memory usage of the model.
     and  calculate the maximum possible number of GPU blocks that can be allocated with the remaining free memory.
     if not execute dummy forward run, it should be run after cuda graph!
     """
+
     def __init__(
-        self, 
-        num_layers, 
-        hidden_size, 
-        num_heads, 
-        num_kv_heads, 
-        head_dim = None, 
-        gpu_memory_utilization=0.9, 
-        block_size=1, 
-        dtype="float16"
+        self,
+        num_layers,
+        hidden_size,
+        num_heads,
+        num_kv_heads,
+        head_dim=None,
+        gpu_memory_utilization=0.9,
+        block_size=1,
+        dtype="float16",
     ):
         self.hidden_size = hidden_size
         self.num_heads = num_heads
@@ -31,35 +34,40 @@ def __init__(
         self.head_dim = head_dim
 
         self.gpu_memory_utilization = gpu_memory_utilization
-        self.block_size = block_size # 一个 block 表示多少个 tokens
+        self.block_size = block_size  # 一个 block 表示多少个 tokens
         self.dtype = dtype
-        
+
         if self.dtype in ["float16", "bfloat16", "fp16", "bfp16"]:
             self.dtype_size = 2
         elif self.dtype in ["int8", "fp18"]:
-            self.dtype_size = 1 # byte
+            self.dtype_size = 1  # byte
         else:
             print(f"Unsupported dtype: {self.dtype_size}!")
-        
+
     def compute_cache_block_size_bytes(self):
-        """Get the size of the KV cache block size in bytes.
-        """
+        """Get the size of the KV cache block size in bytes."""
         if self.head_dim is None:
             head_size = self.hidden_size // self.num_heads
         else:
             head_size = self.head_dim
-        
+
         num_layers = self.num_layers
         num_kv_heads = self.num_kv_heads
         # num_heads * head_size = hidden_size
-        kv_cache_token_bytes_per_layer = (num_kv_heads * head_size) * 2 * self.dtype_size
+        kv_cache_token_bytes_per_layer = (
+            (num_kv_heads * head_size) * 2 * self.dtype_size
+        )
         transformer_kv_cache_token_bytes = kv_cache_token_bytes_per_layer * num_layers
 
-        transformer_kv_cache_blocks_bytes = transformer_kv_cache_token_bytes * self.block_size
+        transformer_kv_cache_blocks_bytes = (
+            transformer_kv_cache_token_bytes * self.block_size
+        )
 
         return transformer_kv_cache_blocks_bytes
 
-    def compute_num_available_blocks(self, model_path=None, dummy_input = None, model_byes=None):
+    def compute_num_available_blocks(
+        self, model_path=None, dummy_input=None, model_byes=None
+    ):
         """
         评估模型的峰值内存使用情况，以确定在不发生内存溢出的情况下可以分配的 KV（键值）缓存块的数量。
 
@@ -84,43 +92,47 @@ def compute_num_available_blocks(self, model_path=None, dummy_input = None, mode
         torch.cuda.synchronize()
         # 计算模型加载后的峰值内存使用量. Get the peak memory allocation recorded by torch
         peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
-        
+
         # 清理未使用的缓存，计算非 Torch 分配的内存. 检查是否有任何剩余内存可能已在“torch”之外的 gpu 上分配。例如，NCCL 操作在前向传递期间可能会使用几 GB
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"]
-        
-        total_allocated_bytes = torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
+
+        total_allocated_bytes = (
+            torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
+        )
         non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        
+
         if non_torch_allocations > 0:
             peak_memory += non_torch_allocations
 
         available_kv_cache_memory = (
-            total_gpu_memory * self.gpu_memory_utilization -
-            peak_memory)
-        
+            total_gpu_memory * self.gpu_memory_utilization - peak_memory
+        )
+
         # 计算每个缓存块的大小
         cache_block_size = self.compute_cache_block_size_bytes()
         # 计算在剩余可用内存下，最多可以分配的 GPU 缓存块数量
         num_gpu_blocks = int(
-            (total_gpu_memory * self.gpu_memory_utilization -
-            peak_memory) // cache_block_size
+            (total_gpu_memory * self.gpu_memory_utilization - peak_memory)
+            // cache_block_size
         )
         # 确保缓存块数量不为负数
         num_gpu_blocks = max(num_gpu_blocks, 0)
 
         logger.info(
-                " Memory profiling results: total_gpu_memory = %.2f GB \n"
-                "    initial_memory_usage = %.2f GB peak_torch_memory = %.2f GB \n"
-                "    memory_usage_post_profile = %.2f GB \n"
-                "    non_torch_memory = %.2f GB, kv_cache_size = %.2f GB \n"
-                "    gpu_memory_utilization = %.2f", total_gpu_memory / (1024**3),
-                (total_gpu_memory - free_memory_pre_profile) / (1024**3),
-                (peak_memory - non_torch_allocations) / (1024**3),
-                total_allocated_bytes / (1024**3),
-                non_torch_allocations / (1024**3),
-                available_kv_cache_memory / (1024**3),
-                self.gpu_memory_utilization)
+            " Memory profiling results: total_gpu_memory = %.2f GB \n"
+            "    initial_memory_usage = %.2f GB peak_torch_memory = %.2f GB \n"
+            "    memory_usage_post_profile = %.2f GB \n"
+            "    non_torch_memory = %.2f GB, kv_cache_size = %.2f GB \n"
+            "    gpu_memory_utilization = %.2f",
+            total_gpu_memory / (1024**3),
+            (total_gpu_memory - free_memory_pre_profile) / (1024**3),
+            (peak_memory - non_torch_allocations) / (1024**3),
+            total_allocated_bytes / (1024**3),
+            non_torch_allocations / (1024**3),
+            available_kv_cache_memory / (1024**3),
+            self.gpu_memory_utilization,
+        )
 
         # 进行垃圾回收，释放未使用的内存
         gc.collect()
@@ -129,72 +141,94 @@ def compute_num_available_blocks(self, model_path=None, dummy_input = None, mode
         # 返回可分配的 GPU 和 CPU 缓存块数量（此处 CPU 块数量为 0）
 
         return num_gpu_blocks
-    
+
 
 class KVCacheMemoryManager:
-    def __init__(self, num_layers, num_kv_heads, head_dim, gpu_num_blocks, block_size=1, dtype=torch.float16, device="cuda"):
+    def __init__(
+        self,
+        num_layers,
+        num_kv_heads,
+        head_dim,
+        gpu_num_blocks,
+        block_size=1,
+        dtype=torch.float16,
+        device="cuda",
+    ):
         self.num_layers = num_layers
         self.num_kv_heads = num_kv_heads
         self.head_dim = head_dim
-        self.gpu_num_blocks = gpu_num_blocks # 手动设定的给kv cache 内存管理分配的可用 blocks 数目:gpu_num_blocks
+        self.gpu_num_blocks = gpu_num_blocks  # 手动设定的给kv cache 内存管理分配的可用 blocks 数目:gpu_num_blocks
         self.block_size = block_size
         self.max_num_tokens = gpu_num_blocks * block_size
 
         self.dtype = dtype
         self.device = device
-        self.can_use_mem_size = gpu_num_blocks # 可用的 kv cache tokens 数量
+        self.can_use_mem_size = gpu_num_blocks  # 可用的 kv cache tokens 数量
 
         # 定义 kv 内存位置索引和内存使用状态变量
-        self.kv_mem_pos_indexs = torch.arange(0, self.max_num_tokens, dtype=torch.long, device="cuda")
-        self.kv_mem_use_state = torch.zeros(self.max_num_tokens, dtype = torch.int32, device="cuda")
+        self.kv_mem_pos_indexs = torch.arange(
+            0, self.max_num_tokens, dtype=torch.long, device="cuda"
+        )
+        self.kv_mem_use_state = torch.zeros(
+            self.max_num_tokens, dtype=torch.int32, device="cuda"
+        )
 
         # Initialize the gpu_kv_buffer
         self.init_kv_buffers(
-            self.max_num_tokens,
-            head_dim, num_kv_heads, num_layers, 
-            dtype, device)
+            self.max_num_tokens, head_dim, num_kv_heads, num_layers, dtype, device
+        )
 
-    def init_kv_buffers(self, 
+    def init_kv_buffers(
+        self,
         max_num_tokens,
-        head_dim, num_kv_heads, num_layers,
+        head_dim,
+        num_kv_heads,
+        num_layers,
         dtype,
-        device: str="cuda"
-    )-> List[torch.Tensor]:
+        device: str = "cuda",
+    ) -> List[torch.Tensor]:
         # kv cache shape: config.max_batch_size, config.max_seq_len, self.num_kv_heads, self.head_dim
         # max_num_tokens = max_num_blocks * self.block_size
         # TODO 修改 kv buffer 形状支持 PagedAttention
         self.gpu_kv_buffer = [
-            torch.empty((max_num_tokens, 2 * num_kv_heads, head_dim), dtype=dtype, device=device) for _ in range(num_layers)
+            torch.empty(
+                (max_num_tokens, 2 * num_kv_heads, head_dim), dtype=dtype, device=device
+            )
+            for _ in range(num_layers)
         ]
         logger.debug(f"gpu_kv_buffer per layer shape: {self.gpu_kv_buffer[0].shape}")
-    
+
     @torch.no_grad()
     def alloc_kvcache(self, need_size):
         if need_size > self.can_use_mem_size:
-            logger.warning(f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}")
+            logger.warning(
+                f"warn no enough cache need_size {need_size} left_size {self.can_use_mem_size}"
+            )
             return None
-        
+
         can_use_pos_index = torch.nonzero(self.kv_mem_use_state == 0).view(-1)
         select_index = can_use_pos_index[0:need_size]
         self.add_ref(select_index)
-        
+
         return select_index
-    
+
     @torch.no_grad()
     def alloc_contiguous_kvcache(self, need_size):
         if need_size > self.can_use_mem_size:
-            logger.warning(f"warn no enough contiguous cache need_size {need_size} left_size {self.can_use_mem_size}")
+            logger.warning(
+                f"warn no enough contiguous cache need_size {need_size} left_size {self.can_use_mem_size}"
+            )
             return None
 
         # 获取未使用的内存块索引
         can_use_pos_index = torch.nonzero(self.kv_mem_use_state == 0).view(-1)
         N = can_use_pos_index.numel()
         if N >= need_size:
-            # 正确地计算 start_indexs 和 end_indexs. 
+            # 正确地计算 start_indexs 和 end_indexs.
             # NOTE: 起始索引不能大于 N - need_size, 又因为 [: index] 切片操作是不包含 index 的, 所以需要将 N - need_size 加 1
-            start_indexs = can_use_pos_index[:N - need_size + 1]
+            start_indexs = can_use_pos_index[: N - need_size + 1]
             # NOTE: can_use_pos_index[3:], 将获取索引为 3 到 9 的元素。
-            end_indexs = can_use_pos_index[need_size - 1:]
+            end_indexs = can_use_pos_index[need_size - 1 :]
             diff = end_indexs - start_indexs
 
             # 寻找连续的块，差值应为 need_size - 1
@@ -208,7 +242,7 @@ def alloc_contiguous_kvcache(self, need_size):
                 return select_index, start_index, end_index
 
         return None
-    
+
     @torch.no_grad()
     def alloc_kvcache_index(self, need_size):
         alloc_mem = self.alloc_contiguous_kvcache(need_size)
@@ -222,9 +256,9 @@ def alloc_kvcache_index(self, need_size):
                 dtype=self.dtype,
                 device=self.device,
             )
-        
+
         return select_index.to(torch.int32), kv_cache
-    
+
     # 增加引用计数
     @torch.no_grad()
     def add_ref(self, token_index: torch.Tensor):
@@ -232,10 +266,10 @@ def add_ref(self, token_index: torch.Tensor):
         has_used_tokens = torch.count_nonzero(state).item()
         all_tokens = len(state)
         self.can_use_mem_size -= all_tokens - has_used_tokens
-  
+
         self.kv_mem_use_state[token_index] += 1
         return
-    
+
     # 减少引用计数
     @torch.no_grad()
     def release_ref(self, token_index: torch.Tensor):
@@ -248,11 +282,11 @@ def release_ref(self, token_index: torch.Tensor):
         all_tokens = len(state)
         self.can_use_mem_size += all_tokens - used_tokens
         return
-    
+
     # 释放键值缓存缓冲区
     def _free_buffers(self):
         self.gpu_kv_buffer = None
-    
+
     # 释放指定的kv cache 内存块索引
     @torch.no_grad()
     def free(self, free_index):
@@ -261,16 +295,19 @@ def free(self, free_index):
         if self.can_use_mem_size == len(self.mem_state):
             logger.debug(f"freed all gpu mem size {self.can_use_mem_size}")
         return
-    
+
     # 释放所有内存
     @torch.no_grad()
-    def free_all(self,):
+    def free_all(
+        self,
+    ):
         self.can_use_mem_size = len(self.kv_mem_use_state)
         self.kv_mem_use_state[:] = 0
 
+
 def indexs_convert(indexs: torch.tensor, batch_size: int):
     """
     prefill 阶段分配的kv cache 索引和 decode 阶段分配的索引合并在一起需要做变换
     TODO: 支持连续批处理开发时用上.
     """
-    pass
\ No newline at end of file
+    pass
diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 1b349fe..8f7931d 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -1,7 +1,9 @@
-import torch, json, time
-from pathlib import Path
+import torch
 import torch.nn as nn
 
+import json, time
+from pathlib import Path
+
 from transformers import LlavaConfig
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
@@ -11,16 +13,20 @@
 from .cuda_graph import ModelRunner
 from .executor_struct import AttentionInfo
 from ..models.model_config import LlamaConfig, Qwen2Config
-from .weight_convert import convert_llama_torch_to_litellama, \
-                            convert_llavallama_hf_to_litellama, \
-                            convert_qwen2_hf_to_litellama
+from .weight_convert import (
+    convert_llama_torch_to_litellama,
+    convert_llavallama_hf_to_litellama,
+    convert_qwen2_hf_to_litellama,
+)
 from ..kernels import update_kv_index
+
+
 import sys, os
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from utils.logger import log
 
-    
+
 def get_conversion_func(model_type: str):
     """
     根据模型类型获取相应的权重转换函数。
@@ -37,7 +43,8 @@ def get_conversion_func(model_type: str):
         "llava": convert_llavallama_hf_to_litellama,
     }
     return conversion_funcs.get(model_type.lower())
-    
+
+
 class ModelExecutor:
     # 定义类属性
     model_config = None
@@ -48,13 +55,13 @@ class ModelExecutor:
     # 通过静态方法 build 将类属性当作默认配置使用
     @staticmethod
     def build(
-        checkpoints_dir: str, 
+        checkpoints_dir: str,
         max_seq_len: int,
-        max_gpu_num_blocks: None, 
-        load_model: bool = True, 
+        max_gpu_num_blocks: None,
+        load_model: bool = True,
         triton_weight: bool = True,
-        compiled_model: bool = False, 
-        device: str = "cuda", 
+        compiled_model: bool = False,
+        device: str = "cuda",
     ):
         """
         构建 ModelExecutor 实例, 加载模型、分词器和初始化推理信息结构体 atten_info。
@@ -67,20 +74,34 @@ def build(
 
         返回:
             ModelExecutor: 初始化后的 ModelExecutor 实例。
-        """            
-        model_config = ModelExecutor._load_model_config(checkpoints_dir, max_seq_len, device=device)
+        """
+        model_config = ModelExecutor._load_model_config(
+            checkpoints_dir, max_seq_len, device=device
+        )
         # model = ModelExecutor._accelerate_load_weight(model_config, checkpoints_dir)
-        model = ModelExecutor._load_model_weight(model_config, checkpoints_dir, load_model, triton_weight, device=device) # 加载权重后的模型
+        model = ModelExecutor._load_model_weight(
+            model_config, checkpoints_dir, load_model, triton_weight, device=device
+        )  # 加载权重后的模型
 
-        return ModelExecutor(model_config, model, max_gpu_num_blocks, compiled_model, device)
+        return ModelExecutor(
+            model_config, model, max_gpu_num_blocks, compiled_model, device
+        )
 
     @staticmethod
-    def _accelerate_load_weight(model_config, checkpoints_dir, load_model = True, triton_weight=True, device="cuda"):
+    def _accelerate_load_weight(
+        model_config,
+        checkpoints_dir,
+        load_model=True,
+        triton_weight=True,
+        device="cuda",
+    ):
         with init_empty_weights():
             model = ModelExecutor._initialize_model(model_config, device=device)
 
         # 假设 model 是使用 init_empty_weights 初始化的空模型
-        model = load_checkpoint_and_dispatch(model, checkpoints_dir, device_map="auto", dtype=torch.float16 )
+        model = load_checkpoint_and_dispatch(
+            model, checkpoints_dir, device_map="auto", dtype=torch.float16
+        )
 
         # 将模型转换为半精度, 并验证抓换
         model.to(device)
@@ -90,34 +111,48 @@ def _accelerate_load_weight(model_config, checkpoints_dir, load_model = True, tr
         log.info("Converted model to half precision (FP16)")
 
         return model
-    
+
     @staticmethod
-    def _load_model_weight(model_config, checkpoints_dir, load_model = True, triton_weight=True, device="cuda"):
+    def _load_model_weight(
+        model_config,
+        checkpoints_dir,
+        load_model=True,
+        triton_weight=True,
+        device="cuda",
+    ):
         start_time = time.time()
         hf_sd = None
-            
+
         # 初始化模型
         with init_empty_weights():
             model = ModelExecutor._initialize_model(model_config, device=device)
             state_dict = None
-        
+
         if load_model:
             checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
-            assert len(checkpoints) > 0, f"no checkpoint files found in {checkpoints_dir}"
+            assert len(checkpoints) > 0, (
+                f"no checkpoint files found in {checkpoints_dir}"
+            )
             ckpt_path = str(checkpoints[0])
             log.debug("Type(ckpt_path) ", type(ckpt_path))
             log.info(f'Loading checkpoint "{ckpt_path}"')
             # 使用 torch.load 加载权重文件。torch.load 可以根据需要将权重加载到指定的设备上
-            state_dict = torch.load(ckpt_path, mmap=True, weights_only=True, map_location=device)
+            state_dict = torch.load(
+                ckpt_path, mmap=True, weights_only=True, map_location=device
+            )
         else:
             conversion_func = get_conversion_func(model_config.model_type)
             if conversion_func is None:
                 log.error(f"Unsupported model type: {model_config.model_type}")
                 raise ValueError(f"Unsupported model type: {model_config.model_type}")
             state_dict = conversion_func(checkpoints_dir, hf_sd, model_config)
-            log.info(f"Weight conversion completed. Time elapsed: {time.time() - start_time:.2f} sec")
-            
-        model.load_state_dict(state_dict, strict=True, assign=True) # 将加载的 state_dict 应用到模型实例中。
+            log.info(
+                f"Weight conversion completed. Time elapsed: {time.time() - start_time:.2f} sec"
+            )
+
+        model.load_state_dict(
+            state_dict, strict=True, assign=True
+        )  # 将加载的 state_dict 应用到模型实例中。
         model.eval()
         log.info(f"Loaded state dict in {time.time() - start_time:.2f}s")
 
@@ -126,9 +161,9 @@ def _load_model_weight(model_config, checkpoints_dir, load_model = True, triton_
         for param in model.parameters():
             assert param.dtype == torch.float16, "Model parameters are not in FP16"
         log.info("Converted model to half precision (FP16)")
-        
+
         return model
-    
+
     @staticmethod
     def _initialize_model(model_config, device: str) -> nn.Module:
         """
@@ -142,15 +177,20 @@ def _initialize_model(model_config, device: str) -> nn.Module:
             nn.Module: 初始化后的模型。
         """
         model_type = model_config.model_type.lower()
-        log.info(f"Initializing model of type '{model_type}' and moving it to device '{device}'...")
+        log.info(
+            f"Initializing model of type '{model_type}' and moving it to device '{device}'..."
+        )
         if model_type == "llama":
             from ..models.llama import LlamaModel
+
             model = LlamaModel(model_config)
         elif model_type == "qwen2":
             from ..models.qwen2 import Qwen2Model
+
             model = Qwen2Model(model_config)
         elif model_type == "llava":
             from ..models.llava import LlavaLlama
+
             model = LlavaLlama(model_config)
         else:
             raise ValueError(f"Unsupported model type: {model_type}")
@@ -160,30 +200,36 @@ def _initialize_model(model_config, device: str) -> nn.Module:
 
     @staticmethod
     def _load_model_config(checkpoints_dir, max_seq_len, device="cuda"):
-        
-        params_path = Path(checkpoints_dir) / "config.json" # 定义模型配置文件
+        params_path = Path(checkpoints_dir) / "config.json"  # 定义模型配置文件
         assert params_path.exists(), f"config.json not found in {checkpoints_dir}"
         try:
             with open(params_path, "r") as f:
                 params = json.load(f)
         except FileNotFoundError:
-            log.error(f"Configuration file '{params_path}' does not exist. Please check if the path is correct.")
+            log.error(
+                f"Configuration file '{params_path}' does not exist. Please check if the path is correct."
+            )
             raise
 
-        if params["model_type"]== "llama":
+        if params["model_type"] == "llama":
             model_config: LlamaConfig = LlamaConfig.from_dict(params)
         elif params["model_type"] == "qwen2":
             model_config: Qwen2Config = Qwen2Config(
-                params, 
-                max_seq_len = max_seq_len,
-                device=device
+                params, max_seq_len=max_seq_len, device=device
             )
         elif params["model_type"] == "llava":
             model_config = LlavaConfig.from_pretrained(checkpoints_dir)
 
         return model_config
 
-    def __init__(self, model_config, model, max_gpu_num_blocks=None, compiled_model=False, device="cuda"):
+    def __init__(
+        self,
+        model_config,
+        model,
+        max_gpu_num_blocks=None,
+        compiled_model=False,
+        device="cuda",
+    ):
         self.model_config = model_config
         self.device = device
         if isinstance(model_config, LlavaConfig):
@@ -191,76 +237,85 @@ def __init__(self, model_config, model, max_gpu_num_blocks=None, compiled_model=
             print(f"self.llm_config.max_seq_len: {self.llm_config.max_seq_len}")
         else:
             self.llm_config = model_config
-        
+
         self.max_seq_len = self.llm_config.max_seq_len
         self.model_type = model_config.model_type
-        self.model = model        
+        self.model = model
         self.model_runner = None
-        
+
         if max_gpu_num_blocks:
             self.kv_mem_manager = self._init_mem_manager(max_gpu_num_blocks)
             self.max_gpu_num_tokens = max_gpu_num_blocks
         else:
-            max_gpu_num_blocks, self.max_gpu_num_tokens = self._get_max_avaliable_tokens(gpu_memory_utilization=0.9, block_size=1)
-            self.kv_mem_manager = self._init_mem_manager(max_gpu_num_blocks, block_size=1)
-        
+            max_gpu_num_blocks, self.max_gpu_num_tokens = (
+                self._get_max_avaliable_tokens(gpu_memory_utilization=0.9, block_size=1)
+            )
+            self.kv_mem_manager = self._init_mem_manager(
+                max_gpu_num_blocks, block_size=1
+            )
+
         self.max_request_num = max_gpu_num_blocks // self.max_seq_len
 
-        self.req_tokens_manager = ReqTokensManager(self.max_request_num, self.max_seq_len)
-        self.atten_info = AttentionInfo() # 创建 AttentionInfo 实例
+        self.req_tokens_manager = ReqTokensManager(
+            self.max_request_num, self.max_seq_len
+        )
+        self.atten_info = AttentionInfo()  # 创建 AttentionInfo 实例
         self.atten_info.kv_buffer = self.kv_mem_manager.gpu_kv_buffer
         self.atten_info.b_req_tokens_table = self.req_tokens_manager.b_req_tokens_table
 
         # TODO apply_cuda_graph 新代码有 bug，已经删去，后续等待修复
         self.compiled_model = False
         if self.compiled_model:
-            self.apply_cuda_graph() # 调用 cuda graph 优化
+            self.apply_cuda_graph()  # 调用 cuda graph 优化
 
     def _get_max_avaliable_tokens(self, gpu_memory_utilization=0.9, block_size=1):
         avaliable_blocks = ComputeMaxAvailableBlocks(
-            num_layers = self.llm_config.num_layers, 
-            hidden_size = self.llm_config.hidden_size, 
-            num_heads = self.llm_config.num_heads, 
-            num_kv_heads = self.llm_config.num_kv_heads, 
-            gpu_memory_utilization = gpu_memory_utilization, 
-            block_size = block_size,
+            num_layers=self.llm_config.num_layers,
+            hidden_size=self.llm_config.hidden_size,
+            num_heads=self.llm_config.num_heads,
+            num_kv_heads=self.llm_config.num_kv_heads,
+            gpu_memory_utilization=gpu_memory_utilization,
+            block_size=block_size,
         )
         max_gpu_num_blocks = avaliable_blocks.compute_num_available_blocks()
         max_gpu_num_tokens = max_gpu_num_blocks * block_size
 
         return max_gpu_num_blocks, max_gpu_num_tokens
-    
-    def _init_mem_manager(self, gpu_num_blocks, block_size=1, dtype=torch.float16,  device="cuda"):
+
+    def _init_mem_manager(
+        self, gpu_num_blocks, block_size=1, dtype=torch.float16, device="cuda"
+    ):
         kv_mem_manager = KVCacheMemoryManager(
-            num_layers = self.llm_config.num_layers,
-            num_kv_heads = self.llm_config.num_kv_heads,
-            head_dim = self.llm_config.head_dim,
-
-            gpu_num_blocks = gpu_num_blocks,
-            block_size = block_size,
-            dtype = dtype,
-            device=device
+            num_layers=self.llm_config.num_layers,
+            num_kv_heads=self.llm_config.num_kv_heads,
+            head_dim=self.llm_config.head_dim,
+            gpu_num_blocks=gpu_num_blocks,
+            block_size=block_size,
+            dtype=dtype,
+            device=device,
         )
 
         return kv_mem_manager
 
-    def apply_cuda_graph(self, ):
+    def apply_cuda_graph(
+        self,
+    ):
         """应用 cuda graph 优化
         参数:
             - input_ids: 输入 tokens id 列表, shape: (batch_size, 1)
             - prev_pos: 当前处于第几轮迭代循环, 生成第几个 token
         """
         self.model_runner = ModelRunner(
-            self.model, 
-            self.llm_config, 
-            self.max_gpu_num_tokens, 
+            self.model,
+            self.llm_config,
+            self.max_gpu_num_tokens,
             self.kv_mem_manager,
-            self.req_tokens_manager
+            self.req_tokens_manager,
         )
         self.model_runner.capture_decode_graph()
-    
-    def init_req_to_tokens_table(self,
-        b_req_tokens_table, b_req_idx, b_seq_len, alloc_mem_index
+
+    def init_req_to_tokens_table(
+        self, b_req_tokens_table, b_req_idx, b_seq_len, alloc_mem_index
     ):
         """
         初始化 prefill 阶段已分配的批次请求项的 kv cache 所用 tokens 索引
@@ -275,13 +330,20 @@ def init_req_to_tokens_table(self,
             if i > 0:
                 b_start_loc[i] = start_index
             cur_seq_len = b_seq_len_numpy[i]
-            b_req_tokens_table[b_req_idx_numpy[i], :cur_seq_len] = alloc_mem_index[start_index : start_index + cur_seq_len]
+            b_req_tokens_table[b_req_idx_numpy[i], :cur_seq_len] = alloc_mem_index[
+                start_index : start_index + cur_seq_len
+            ]
             start_index += cur_seq_len
 
         return b_start_loc
 
-    def prefill_alloc_kv_cache(self, 
-        max_prompt_len, actual_prompt_lens, b_req_idx, image_batch_size = None, debug_mode=False,
+    def prefill_alloc_kv_cache(
+        self,
+        max_prompt_len,
+        actual_prompt_lens,
+        b_req_idx,
+        image_batch_size=None,
+        debug_mode=False,
     ):
         """
         start_index:        tensor([  0, 270, 540, 810], device='cuda:0', dtype=torch.int32)
@@ -302,48 +364,61 @@ def prefill_alloc_kv_cache(self,
             image_size = self.model_config.vision_config.image_size
             pathch_size = self.model_config.vision_config.patch_size
             number_patchs = image_size // pathch_size
-            num_patch_indexs = (number_patchs * number_patchs - 1)
+            num_patch_indexs = number_patchs * number_patchs - 1
             max_prompt_len += num_patch_indexs
             actual_prompt_lens += num_patch_indexs
             print(f"num_patch_indexs: {num_patch_indexs}")
-        
+
         context_num_tokens = max_prompt_len * batch_size
         # 一次性分配 bsz * seq_len + (number_patchs * number_patchs - 1) * img_batch_size 个索引
-        self.atten_info.cur_select_index, _ = self.kv_mem_manager.alloc_kvcache_index(context_num_tokens)        
+        self.atten_info.cur_select_index, _ = self.kv_mem_manager.alloc_kvcache_index(
+            context_num_tokens
+        )
         # 初始化每个批次项的实际提示词长度
         self.atten_info.b_seq_len = actual_prompt_lens  # 张量, 形状 [batch_size, 1]
         # 初始化批次请求的当前最大序列上下文长度(对应 kv cache 长度)
-        self.atten_info.max_actual_seq_len = max_prompt_len # int 类型
+        self.atten_info.max_actual_seq_len = max_prompt_len  # int 类型
 
         self.atten_info.b_start_loc = self.init_req_to_tokens_table(
-            self.atten_info.b_req_tokens_table, self.atten_info.b_req_idx, 
-            self.atten_info.b_seq_len, self.atten_info.cur_select_index
+            self.atten_info.b_req_tokens_table,
+            self.atten_info.b_req_idx,
+            self.atten_info.b_seq_len,
+            self.atten_info.cur_select_index,
         )
 
         if debug_mode:
-            print(f"context_num_tokens: {context_num_tokens}, max_prompt_len:{max_prompt_len}, \n \
+            print(
+                f"context_num_tokens: {context_num_tokens}, max_prompt_len:{max_prompt_len}, \n \
                     self.atten_info.cur_select_index: {self.atten_info.cur_select_index},\n \
                     self.atten_info.max_actual_seq_len: {self.atten_info.max_actual_seq_len},\n \
-                    self.atten_info.b_seq_len: { self.atten_info.b_seq_len}, \n \
-                    self.atten_info.b_start_loc: { self.atten_info.b_start_loc}, "
-                )
-        
+                    self.atten_info.b_seq_len: {self.atten_info.b_seq_len}, \n \
+                    self.atten_info.b_start_loc: {self.atten_info.b_start_loc}, "
+            )
+
         return self.atten_info.cur_select_index, num_patch_indexs
-    
+
     def decode_alloc_kv_cache(self, batch_size):
         # TODO: torch.empty 创建的临时张量, 保存分配的非连续 kv_cache 索引空间
-        self.atten_info.cur_select_index, _ = self.kv_mem_manager.alloc_kvcache_index(batch_size)
-        update_kv_index(self.atten_info.b_req_tokens_table, self.atten_info.b_req_idx, 
-                        self.atten_info.b_seq_len, self.atten_info.cur_select_index)
+        self.atten_info.cur_select_index, _ = self.kv_mem_manager.alloc_kvcache_index(
+            batch_size
+        )
+        update_kv_index(
+            self.atten_info.b_req_tokens_table,
+            self.atten_info.b_req_idx,
+            self.atten_info.b_seq_len,
+            self.atten_info.cur_select_index,
+        )
 
         self.atten_info.b_seq_len += 1
         self.atten_info.max_actual_seq_len += 1
-        
-        return self.atten_info.cur_select_index # shape [batch_size,]
-    
-    def forward(self, input_ids, position_ids, image_tensor=None):        
+
+        return self.atten_info.cur_select_index  # shape [batch_size,]
+
+    def forward(self, input_ids, position_ids, image_tensor=None):
         if self.model_type == "llava":
-            logits = self.model.forward(input_ids, position_ids, self.atten_info, image_tensor)
+            logits = self.model.forward(
+                input_ids, position_ids, self.atten_info, image_tensor
+            )
         else:
             logits = self.model.forward(input_ids, position_ids, self.atten_info)
-        return logits
\ No newline at end of file
+        return logits
diff --git a/lite_llama/executor/req_tokens_manager.py b/lite_llama/executor/req_tokens_manager.py
index 21993fd..c645ea9 100644
--- a/lite_llama/executor/req_tokens_manager.py
+++ b/lite_llama/executor/req_tokens_manager.py
@@ -3,36 +3,46 @@
 
 logger = logging.getLogger(__name__)
 
+
 class ReqTokensManager:
     """管理请求序列的 kv 内存 tokens 的类。
-    
+
     TokenTable 将一系列 kv tokens 映射到一组token 表中, 每个 token 表代表请求序列分配的 kv cache 内存空间。
     """
+
     def __init__(self, max_request_num, max_seq_len, mem_manager=None, device="cuda"):
         self.max_can_use_req_size = max_request_num
         self.can_use_req_size = max_request_num
         self.max_seq_len = max_seq_len
-        self.req_state = torch.zeros((max_request_num), dtype=torch.int32, device=device)
+        self.req_state = torch.zeros(
+            (max_request_num), dtype=torch.int32, device=device
+        )
         # 一个二维张量，形状为 [num_requests, max_seq_len]，用于存储每个请求的 Token 索引。
         # 每行表示一个请求，每列表示该请求在特定序列位置上的 Token 索引。
-        self.b_req_tokens_table = torch.zeros((max_request_num, max_seq_len), dtype=torch.int32, device=device)
+        self.b_req_tokens_table = torch.zeros(
+            (max_request_num, max_seq_len), dtype=torch.int32, device=device
+        )
         # self.mem_manager = mem_manager
 
     # 分配批次请求需要的内存空间
     def alloc_req(self, request_num):
         if request_num > self.can_use_req_size:
-            logger.error(f'Insufficient requested capacity, remaining {self.can_use_req_size}')
+            logger.error(
+                f"Insufficient requested capacity, remaining {self.can_use_req_size}"
+            )
             return None
 
-        logical_select_index = torch.nonzero(self.req_state==0).reshape(-1)[:request_num]
+        logical_select_index = torch.nonzero(self.req_state == 0).reshape(-1)[
+            :request_num
+        ]
         self.req_state[logical_select_index] = 1
         self.can_use_req_size -= len(logical_select_index)
         return logical_select_index
-    
+
     # 仅释放批次请求的索引
     def free_reqs(self, free_req_index, free_token_index):
         self.can_use_req_size += len(free_req_index)
-        self.req_state[free_token_index] = 0 # 对应批次请求的索引重新置为 0
+        self.req_state[free_token_index] = 0  # 对应批次请求的索引重新置为 0
         if self.can_use_req_size == len(self.req_state):
             logger.debug(f"freed all request size {self.can_use_req_size}")
         # self.mem_manager.free(free_token_index)
@@ -44,21 +54,28 @@ def free_req(self, free_req_index):
             return
         self.can_use_req_size += 1
         self.req_state[free_req_index] = 0
-        return 
-    
+        return
+
     # 释放所有请求的内存，将所有请求状态 req_state 重置为未分配（都归 0）。
     def free_all(self):
         self.can_use_req_size = self.max_can_use_req_size
         self.req_state[:] = 0
 
+
 import unittest
 import torch
 
+
 class TestReqTokensManager(unittest.TestCase):
     def setUp(self):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.mem_manager_mock = unittest.mock.MagicMock()
-        self.table = ReqTokensManager(max_request_num=10, max_seq_len=5, mem_manager=self.mem_manager_mock, device=self.device)
+        self.table = ReqTokensManager(
+            max_request_num=10,
+            max_seq_len=5,
+            mem_manager=self.mem_manager_mock,
+            device=self.device,
+        )
 
     def test_alloc_req(self):
         indices = self.table.alloc_req(3)
@@ -84,5 +101,6 @@ def test_invalid_free_req(self):
         self.table.free_req(-1)  # Should not raise an error
         self.table.free_req(100)  # Should not raise an error
 
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
index ff13959..67e3fed 100644
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -1,60 +1,65 @@
 from tqdm.auto import tqdm
 import torch, os, shutil, glob
+import os.path as osp
 from typing import Dict
-from ..models.qwen2 import Qwen2Config
 
-def build_new_weight_dir(checkpoints_dir:str, new_sd):
+
+def build_new_weight_dir(checkpoints_dir: str, new_sd):
     # 保存 lite_llama 模型权重并构建新的权重目录
-    model_id = os.path.basename(os.path.normpath(checkpoints_dir))
-    current_dir = os.path.dirname(os.path.abspath(__file__)) # 获取当前文件所在的目录
-    my_weight_dir = os.path.join(current_dir, "../../my_weight/" + model_id) # 项目所在根目录
-    os.makedirs(my_weight_dir, exist_ok=True) # 创建文件夹（如果不存在）
-    
+    model_id = osp.basename(osp.normpath(checkpoints_dir))
+    current_dir = osp.dirname(osp.abspath(__file__))  # 获取当前文件所在的目录
+    my_weight_dir = osp.join(
+        current_dir, "../../my_weight/" + model_id
+    )  # 项目所在根目录
+    os.makedirs(my_weight_dir, exist_ok=True)  # 创建文件夹（如果不存在）
+
     # 保存模型的状态字典。
-    torch.save(new_sd, os.path.join(my_weight_dir, model_id + ".pth"), _use_new_zipfile_serialization=True)
+    torch.save(
+        new_sd,
+        osp.join(my_weight_dir, model_id + ".pth"),
+        _use_new_zipfile_serialization=True,
+    )
 
     # 获取所有 JSON 文件
-    json_files = glob.glob(os.path.join(checkpoints_dir, "*.json"))
+    json_files = glob.glob(osp.join(checkpoints_dir, "*.json"))
     for file_path in json_files:
-        shutil.copy(file_path, my_weight_dir) # 复制 hf 权重目录的所有 json 文件到新的目录
+        shutil.copy(file_path, my_weight_dir)  # 复制 hf 权重目录的所有 json 文件到新的目录
         print(f"已复制: {file_path} -> {my_weight_dir}")
 
+    if osp.exists(osp.join(checkpoints_dir, "tokenizer.model")):
+        shutil.copy(osp.join(checkpoints_dir, "tokenizer.model"), my_weight_dir)
+
 def convert_qwen2_hf_to_litellama(
-    checkpoints_dir: str, 
-    hf_sd, 
-    num_layers, 
+    checkpoints_dir: str,
+    hf_sd,
+    num_layers,
     print_params: bool = True,
-    device: str = "cuda"
+    device: str = "cuda",
 ) -> Dict[str, torch.Tensor]:
     """
     将 Hugging Face 格式的预训练模型的权重字典转换为自定义模型的权重字典。
     """
     # 映射嵌入层、映射归一化层、映射模型最后的输出线性层
     mapping = {
-        "model.norm.weight": "norm_weight", 
+        "model.norm.weight": "norm_weight",
         "model.embed_tokens.weight": "embed_tokens.weight",
-        "lm_head.weight": "lm_head_weight", # 只支持 hf 格式模型权重
+        "lm_head.weight": "lm_head_weight",  # 只支持 hf 格式模型权重
     }
 
     # 映射层
     layers = {
-        'model.layers.{i}.self_attn.q_proj.weight': 'layers.{i}.self_attn.q_proj_weight',
-        'model.layers.{i}.self_attn.q_proj.bias': 'layers.{i}.self_attn.q_proj_bias',
-
-        'model.layers.{i}.self_attn.k_proj.weight': 'layers.{i}.self_attn.k_proj_weight',
-        'model.layers.{i}.self_attn.k_proj.bias': 'layers.{i}.self_attn.k_proj_bias',
-
-        'model.layers.{i}.self_attn.v_proj.weight': 'layers.{i}.self_attn.v_proj_weight',
-        'model.layers.{i}.self_attn.v_proj.bias': 'layers.{i}.self_attn.v_proj_bias',
-
-        'model.layers.{i}.self_attn.o_proj.weight': 'layers.{i}.self_attn.o_proj_weight',
-
-        'model.layers.{i}.mlp.gate_proj.weight': 'layers.{i}.mlp.gate_proj.weight',
-        'model.layers.{i}.mlp.up_proj.weight': 'layers.{i}.mlp.up_proj.weight',
-        'model.layers.{i}.mlp.down_proj.weight': 'layers.{i}.mlp.down_proj.weight',
-
-        'model.layers.{i}.input_layernorm.weight': 'layers.{i}.input_layernorm_weight',
-        'model.layers.{i}.post_attention_layernorm.weight': 'layers.{i}.post_attention_layernorm_weight',
+        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj_weight",
+        "model.layers.{i}.self_attn.q_proj.bias": "layers.{i}.self_attn.q_proj_bias",
+        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj_weight",
+        "model.layers.{i}.self_attn.k_proj.bias": "layers.{i}.self_attn.k_proj_bias",
+        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj_weight",
+        "model.layers.{i}.self_attn.v_proj.bias": "layers.{i}.self_attn.v_proj_bias",
+        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj_weight",
+        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
+        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
+        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
+        "model.layers.{i}.input_layernorm.weight": "layers.{i}.input_layernorm_weight",
+        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.post_attention_layernorm_weight",
     }
 
     # 根据 Transformer 层数量生成映射
@@ -70,12 +75,10 @@ def convert_qwen2_hf_to_litellama(
         print(f"key {hf_key}, contains bigger {bigger}")
         custom_key = mapping.get(hf_key, None)
         if custom_key is not None:
-            new_sd[custom_key] = tensor # 浅拷贝
+            new_sd[custom_key] = tensor  # 浅拷贝
         else:
             print(f"custom_key: {custom_key}, hf_key: {hf_key}")
             pass  # 忽略未映射的权重
-    
-    # del hf_sd
 
     # 进行 kv_proj 合并操作
     for i in range(num_layers):
@@ -83,8 +86,13 @@ def convert_qwen2_hf_to_litellama(
         v_key = f"layers.{i}.self_attn.v_proj_weight"
         k_bias_key = f"layers.{i}.self_attn.k_proj_bias"
         v_bias_key = f"layers.{i}.self_attn.v_proj_bias"
-        
-        if k_key in new_sd and v_key in new_sd and k_bias_key in new_sd and v_bias_key in new_sd:
+
+        if (
+            k_key in new_sd
+            and v_key in new_sd
+            and k_bias_key in new_sd
+            and v_bias_key in new_sd
+        ):
             # 1. kv weight 权重合并
             k_tensor = new_sd[k_key]
             v_tensor = new_sd[v_key]
@@ -113,7 +121,7 @@ def convert_qwen2_hf_to_litellama(
 
     # 保存转换好的自定义权重
     build_new_weight_dir(checkpoints_dir, new_sd)
-    
+
     if print_params:
         # 打印预训练模型的参数名称
         print("Pretrained model parameters:")
@@ -127,6 +135,7 @@ def convert_qwen2_hf_to_litellama(
 
     # return new_sd
 
+
 def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     将 pytorch bin 格式的模型的权重字典转换为自定义模型的权重字典。
@@ -140,7 +149,7 @@ def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     mapping = {
         "tok_embeddings.weight": "embed_tokens.weight",
-        "norm.weight": "norm_weight", 
+        "norm.weight": "norm_weight",
         "output.weight": "lm_head.weight",
     }
 
@@ -150,11 +159,9 @@ def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
         "layers.{i}.attention.wk.weight": "layers.{i}.attention.k_proj.weight",
         "layers.{i}.attention.wv.weight": "layers.{i}.attention.v_proj.weight",
         "layers.{i}.attention.wo.weight": "layers.{i}.attention.o_proj.weight",
-
         "layers.{i}.feed_forward.w1.weight": "layers.{i}.feed_forward.gate_proj.weight",
         "layers.{i}.feed_forward.w3.weight": "layers.{i}.feed_forward.up_proj.weight",
         "layers.{i}.feed_forward.w2.weight": "layers.{i}.feed_forward.down_proj.weight",
-
         "layers.{i}.attention_norm.weight": "layers.{i}.attention_norm_weight",
         "layers.{i}.ffn_norm.weight": "layers.{i}.ffn_norm_weight",
     }
@@ -173,12 +180,13 @@ def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
             new_sd[custom_key] = tensor
         else:
             print(f"Warning: Unmapped key {hf_key}")
-    
+
     del hf_sd
 
     build_new_weight_dir(checkpoints_dir, new_sd)
     return new_sd
 
+
 def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     将 hf 格式的模型的权重字典转换为自定义模型的权重字典。
@@ -192,8 +200,8 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     mapping = {
         "model.embed_tokens.weight": "embed_tokens.weight",
-        "model.norm.weight": "norm_weight", 
-        "lm_head.weight": "lm_head.weight"
+        "model.norm.weight": "norm_weight",
+        "lm_head.weight": "lm_head.weight",
     }
 
     layers = {
@@ -202,11 +210,9 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
         "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj.weight",
         "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj.weight",
         "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj.weight",
-
         "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
         "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
         "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
-
         "model.layers.{i}.input_layernorm.weight": "layers.{i}.attention_norm_weight",
         "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.ffn_norm_weight",
     }
@@ -225,7 +231,7 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
             new_sd[custom_key] = tensor
         else:
             print(f"Warning: Unmapped key {hf_key}")
-    
+
     # 进行 kv_proj 合并操作
     for i in range(num_layers):
         k_key = f"layers.{i}.self_attn.k_proj.weight"
@@ -236,11 +242,11 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
             # 假设 k_proj, v_proj 的 shape 都是 [hidden_size, hidden_size]
             # 按最后一维拼接后成为 [2 * hidden_size, hidden_size]
             kv_tensor = torch.cat([k_tensor, v_tensor], dim=0)
-            
+
             # 新增 kv_proj.weight
             kv_key = f"layers.{i}.self_attn.kv_proj_weight"
             new_sd[kv_key] = kv_tensor
-            
+
             # 删除原来的 k_proj, v_proj
             del new_sd[k_key]
             del new_sd[v_key]
@@ -250,7 +256,8 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
 
     # 将处理后的权重保存到指定目录
     build_new_weight_dir(checkpoints_dir, new_sd)
-    
+
+
 def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     将 Hugging Face 模型的权重字典转换为自定义模型的权重字典。
@@ -265,7 +272,7 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
     """
     mapping = {
         "language_model.model.embed_tokens.weight": "language_model.embed_tokens.weight",
-        "language_model.model.norm.weight": "language_model.norm_weight", 
+        "language_model.model.norm.weight": "language_model.norm_weight",
         "language_model.lm_head.weight": "language_model.lm_head.weight",
     }
 
@@ -275,11 +282,9 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
         "language_model.model.layers.{i}.self_attn.k_proj.weight": "language_model.layers.{i}.self_attn.k_proj.weight",
         "language_model.model.layers.{i}.self_attn.v_proj.weight": "language_model.layers.{i}.self_attn.v_proj.weight",
         "language_model.model.layers.{i}.self_attn.o_proj.weight": "language_model.layers.{i}.self_attn.o_proj.weight",
-
         "language_model.model.layers.{i}.mlp.gate_proj.weight": "language_model.layers.{i}.mlp.gate_proj.weight",
         "language_model.model.layers.{i}.mlp.up_proj.weight": "language_model.layers.{i}.mlp.up_proj.weight",
         "language_model.model.layers.{i}.mlp.down_proj.weight": "language_model.layers.{i}.mlp.down_proj.weight",
-
         "language_model.model.layers.{i}.input_layernorm.weight": "language_model.layers.{i}.attention_norm_weight",
         "language_model.model.layers.{i}.post_attention_layernorm.weight": "language_model.layers.{i}.ffn_norm_weight",
     }
@@ -299,7 +304,7 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
         else:
             new_sd[hf_key] = tensor
             print(f"Warning: Unmapped key {hf_key}")
-    
+
     # 进行 kv_proj 合并操作
     for i in tqdm(range(num_layers), desc="Mapping kv fusedweights"):
         k_key = f"language_model.layers.{i}.self_attn.k_proj.weight"
@@ -325,4 +330,3 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
         print(name, parameters.shape)
 
     build_new_weight_dir(checkpoints_dir, new_sd)
-
diff --git a/lite_llama/generate.py b/lite_llama/generate.py
index 378b1f7..fb69469 100644
--- a/lite_llama/generate.py
+++ b/lite_llama/generate.py
@@ -5,7 +5,7 @@
 
 from .executor.model_executor import ModelExecutor
 from .utils.file_interface import get_model_name_from_path
-from .kernels.softmax_split import softmax_split
+
 
 class CompletionPrediction(TypedDict, total=False):
     generation: str
@@ -16,7 +16,7 @@ class CompletionPrediction(TypedDict, total=False):
 def sample_top_p(probs, p):
     """
     执行 Top-p (Nucleus) 采样, 从概率分布中采样下一个词。
-    
+
     参数：
         probs (torch.Tensor): 概率分布张量，形状为 `[batch_size, vocab_size]`。
         p (float): 累积概率阈值，取值范围在 0 到 1 之间。
@@ -31,8 +31,10 @@ def sample_top_p(probs, p):
     # 计算排序后概率的累积和. 返回的 probs_sum 是累积概率分布。
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     # 保留累积概率未超过阈值 p 的词汇的概率，其余词汇的概率被置为 0.0。
-    mask = probs_sum - probs_sort > p # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
-    probs_sort[mask] = 0.0 # 将累积概率超过阈值 p 的词的概率置零。
+    mask = (
+        probs_sum - probs_sort > p
+    )  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
+    probs_sort[mask] = 0.0  # 将累积概率超过阈值 p 的词的概率置零。
 
     # 对剩余的概率重新归一化, 确保总和为 1。
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
@@ -40,23 +42,25 @@ def sample_top_p(probs, p):
     next_token_sorted_idx = torch.multinomial(probs_sort, num_samples=1)
     # 在 probs_idx 的最后一维（dim=-1）中，使用 next_token_sorted_idx 作为索引，提取对应的值。沿着 dim=1（列）进行索引提取
     # NOTE: torch.gather 函数按照给定的索引张量 index，从输入张量中收集 (获取) 数据，并返回一个与索引张量形状一致的张量。
-    next_token = torch.gather(probs_idx, -1, index = next_token_sorted_idx)
-    
-    return next_token # 返回采样得到的下一个词的索引
+    next_token = torch.gather(probs_idx, -1, index=next_token_sorted_idx)
+
+    return next_token  # 返回采样得到的下一个词的索引
+
 
 class GenerateText:
     """
     GenerateText 类用于加载LLaMA模型并执行迭代式生成式推理 (文本生成)。
     """
 
-    def __init__(self, 
+    def __init__(
+        self,
         checkpoints_dir: str,
         tokenizer_path: str,
-        max_seq_len = 1024,
-        max_gpu_num_blocks = None,
-        load_model = True,
-        triton_weight = True,
-        compiled_model = False,
+        max_seq_len=1024,
+        max_gpu_num_blocks=None,
+        load_model=True,
+        triton_weight=True,
+        compiled_model=False,
         device="cuda",
     ):
         self.checkpoints_dir = checkpoints_dir
@@ -64,27 +68,29 @@ def __init__(self,
         self.device = device
 
         self.model_executor = ModelExecutor.build(
-            checkpoints_dir = checkpoints_dir,
-            max_seq_len = max_seq_len,
-            max_gpu_num_blocks = max_gpu_num_blocks,
-            load_model = load_model,
-            triton_weight = triton_weight,
+            checkpoints_dir=checkpoints_dir,
+            max_seq_len=max_seq_len,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            load_model=load_model,
+            triton_weight=triton_weight,
             compiled_model=compiled_model,
-            device = device
+            device=device,
         )
         self.model_config = self.model_executor.model_config
         assert self.model_config.vocab_size != -1, "Vocab size must be set"
         self.tokenizer = self.load_tokenizer(tokenizer_path)
-    
+
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
         # 根据模型名称决定是否使用 fast tokenizer
         use_fast = True
-        if 'llava' in model_name.lower():
+        if "llava" in model_name.lower():
             use_fast = False
-        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=use_fast)
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, use_fast=use_fast
+        )
         return tokenizer
-    
+
     @torch.inference_mode()
     def generate(
         self,
@@ -96,7 +102,7 @@ def generate(
     ) -> Tuple[List[List[int]], Optional[List[List[float]]]]:
         """
         基于提供的提示词 (prompts) 使用语言生成模型生成文本序列。
-        
+
         参数：
             prompt_tokens (List[List[int]]): 提示词的 token 序列，每个提示词是一个整数列表, 即 input_ids。
             max_gen_len (int): 最大生成序列长度。
@@ -106,7 +112,7 @@ def generate(
         返回：
             Tuple[List[List[int]], Optional[List[List[float]]]]: 生成的 token 序列和（可选）对应的 log 概率。
         """
-        bsz = len(prompt_tokens) # 批量大小
+        bsz = len(prompt_tokens)  # 批量大小
         # min_prompt_len = min(len(t) for t in prompt_tokens)
         max_prompt_len = max(len(t) for t in prompt_tokens)
         total_len = min(self.model_config.max_seq_len, max_gen_len + max_prompt_len)
@@ -116,54 +122,72 @@ def generate(
             else self.tokenizer.eos_token_id
         )
         # 初始化每个批次项的序列长度
-        actual_prompt_lens = torch.tensor([len(t) for t in prompt_tokens], dtype=torch.int32, device=self.device)  
-        # 预分配 tokens 张量 # 整个 batch 的 tokens buffer: [bsz, total_len] 
-        tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device = self.device)
+        actual_prompt_lens = torch.tensor(
+            [len(t) for t in prompt_tokens], dtype=torch.int32, device=self.device
+        )
+        # 预分配 tokens 张量 # 整个 batch 的 tokens buffer: [bsz, total_len]
+        tokens = torch.full(
+            (bsz, total_len), pad_id, dtype=torch.long, device=self.device
+        )
 
         # 填充提示词到 tokens 张量
         for seq_id, token_ids in enumerate(prompt_tokens):
-            tokens[seq_id, : len(token_ids)] = torch.tensor(token_ids, dtype=torch.long, device = self.device)
-        
+            tokens[seq_id, : len(token_ids)] = torch.tensor(
+                token_ids, dtype=torch.long, device=self.device
+            )
+
         # 生成一个布尔张量，它的值为 True 的位置表示输入序列的实际内容（即非填充部分）, 形状为 (batch_size, total_len)
         input_text_mask = tokens != pad_id
-        b_req_idx = torch.arange(bsz, device = self.device)
+        b_req_idx = torch.arange(bsz, device=self.device)
         eos_reached = torch.zeros((bsz,), dtype=torch.bool, device=tokens.device)
 
-        all_select_index_list = [] # 预先分配 prefill 阶段的 KV 缓存索引
-        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(max_prompt_len, actual_prompt_lens, b_req_idx)
+        all_select_index_list = []  # 预先分配 prefill 阶段的 KV 缓存索引
+        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(
+            max_prompt_len, actual_prompt_lens, b_req_idx
+        )
         all_select_index_list.append(prefill_select_index)
 
         prev_pos = 0
-        input_ids = tokens[:, : max_prompt_len]  # [batch_size, seq_len]
+        input_ids = tokens[:, :max_prompt_len]  # [batch_size, seq_len]
         for cur_pos in range(max_prompt_len, total_len):
             batch_size, seq_len = input_ids.shape
             position_ids = (
                 torch.arange(prev_pos, prev_pos + seq_len, device=input_ids.device)
-                .unsqueeze(0)            # shape: [1, seq_len]
-                .repeat(batch_size, 1)   # shape: [batch_size, seq_len], 不分配额外内存
+                .unsqueeze(0)  # shape: [1, seq_len]
+                .repeat(batch_size, 1)  # shape: [batch_size, seq_len], 不分配额外内存
             )
 
-            logits = self.model_executor.forward(input_ids, position_ids)  # [batch_size, seq_len, vocab_size]
+            logits = self.model_executor.forward(
+                input_ids, position_ids
+            )  # [batch_size, seq_len, vocab_size]
             decode_select_index = self.model_executor.decode_alloc_kv_cache(bsz)
             all_select_index_list.append(decode_select_index)
-            
+
             last_logits = logits[:, -1, :]  # [batch_size, vocab_size]
-            probs = torch.softmax(last_logits / temperature, dim=-1)  # [batch_size, vocab_size]
+            probs = torch.softmax(
+                last_logits / temperature, dim=-1
+            )  # [batch_size, vocab_size]
             next_token = sample_top_p(probs, top_p)  # [batch_size]
-            input_ids = next_token   # [batch_size, 1]
-            
+            input_ids = next_token  # [batch_size, 1]
+
             mask = ~input_text_mask[:, cur_pos]  # [batch_size]
-            tokens[:, cur_pos] = torch.where(mask, next_token.reshape(-1) , tokens[:, cur_pos])
-            
-            eos_reached = eos_reached | (mask & (next_token == self.tokenizer.eos_token_id))
+            tokens[:, cur_pos] = torch.where(
+                mask, next_token.reshape(-1), tokens[:, cur_pos]
+            )
+
+            eos_reached = eos_reached | (
+                mask & (next_token == self.tokenizer.eos_token_id)
+            )
             prev_pos = cur_pos
-            
+
             if eos_reached.all():
                 break
-        
+
         # out_tokens = self.process_output_tokens(tokens, prompt_tokens, max_gen_len, echo, self.tokenizer.eos_token_id)
         all_select_indexs = torch.concat(all_select_index_list)
-        self.model_executor.kv_mem_manager.release_ref(all_select_indexs) # 减少 kv cache 内存管理器的引用计数
+        self.model_executor.kv_mem_manager.release_ref(
+            all_select_indexs
+        )  # 减少 kv cache 内存管理器的引用计数
 
         return tokens
 
@@ -178,18 +202,22 @@ def text_completion(
         """
         Perform text completion for a list of prompts using the language generation model.
         """
-        input_ids = self.tokenizer.batch_encode_plus(prompts, add_special_tokens=True).input_ids
+        input_ids = self.tokenizer.batch_encode_plus(
+            prompts, add_special_tokens=True
+        ).input_ids
         generated_ids = self.generate(
-            prompt_tokens = input_ids,
-            max_gen_len = max_gen_len,
-            temperature = temperature,
-            top_p = top_p,
-            echo = echo,
+            prompt_tokens=input_ids,
+            max_gen_len=max_gen_len,
+            temperature=temperature,
+            top_p=top_p,
+            echo=echo,
         )
 
-        generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        generated_texts = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
         return generated_texts
-    
+
     def process_output_tokens(
         self,
         tokens: torch.Tensor,
@@ -203,7 +231,7 @@ def process_output_tokens(
         """
         out_tokens = []
 
-        for i, seq_tokens in enumerate(tokens.tolist()): # 将 tokens 转换为列表
+        for i, seq_tokens in enumerate(tokens.tolist()):  # 将 tokens 转换为列表
             prompt_len = len(prompt_tokens[i])
             # 根据是否需要在输出中包含提示词，确定起始位置
             start_idx = 0 if echo else prompt_len
diff --git a/lite_llama/generate_stream.py b/lite_llama/generate_stream.py
index a501870..18b80ea 100644
--- a/lite_llama/generate_stream.py
+++ b/lite_llama/generate_stream.py
@@ -1,6 +1,6 @@
 from typing import Optional
 import torch, logging
-from typing import List, Optional, Tuple, TypedDict, Generator
+from typing import Optional, TypedDict, Generator
 from .executor.model_executor import ModelExecutor
 from .utils.file_interface import get_model_name_from_path
 from .kernels.softmax_split import softmax_split
@@ -14,8 +14,8 @@
 
 class CompletionPrediction(TypedDict, total=False):
     generation: str
-    tokens: List[str]  # not required
-    logprobs: List[float]  # not required
+    tokens: list[str]  # not required
+    logprobs: list[float]  # not required
 
 
 @torch.inference_mode()
@@ -37,7 +37,9 @@ def sample_top_p(probs, p):
     # 计算排序后概率的累积和. 返回的 probs_sum 是累积概率分布。
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     # 保留累积概率未超过阈值 p 的词汇的概率，其余词汇的概率被置为 0.0。
-    mask = probs_sum - probs_sort > p  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
+    mask = (
+        probs_sum - probs_sort > p
+    )  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
     probs_sort[mask] = 0.0  # 将累积概率超过阈值 p 的词的概率置零。
 
     # 对剩余的概率重新归一化, 确保总和为 1。
@@ -56,16 +58,17 @@ class GenerateStreamText:
     GenerateText 类用于加载LLaMA模型并执行迭代式生成式推理 (文本生成)。
     """
 
-    def __init__(self,
-                 checkpoints_dir: str,
-                 tokenizer_path: str,
-                 max_gpu_num_blocks=None,
-                 max_seq_len=1024,
-                 load_model=True,
-                 triton_weight=True,
-                 compiled_model=False,
-                 device="cuda",
-                 ):
+    def __init__(
+        self,
+        checkpoints_dir: str,
+        tokenizer_path: str,
+        max_gpu_num_blocks=None,
+        max_seq_len=1024,
+        load_model=True,
+        triton_weight=True,
+        compiled_model=False,
+        device="cuda",
+    ):
         self.checkpoints_dir = checkpoints_dir
 
         self.model_executor = ModelExecutor.build(
@@ -75,7 +78,7 @@ def __init__(self,
             max_seq_len=max_seq_len,
             triton_weight=triton_weight,
             compiled_model=compiled_model,
-            device=device
+            device=device,
         )
         self.tokenizer = self.load_tokenizer(tokenizer_path)
         self.model_config = self.model_executor.model_config
@@ -84,30 +87,32 @@ def __init__(self,
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
 
-        if 'llava' in model_name.lower():
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=False,
-                                                      trust_remote_code=True)
+        if "llava" in model_name.lower():
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path, use_fast=False, trust_remote_code=True
+            )
         else:
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=True,
-                                                      trust_remote_code=True)
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path, use_fast=True, trust_remote_code=True
+            )
 
         return tokenizer
 
     @torch.inference_mode()
     def generate_stream(
-            self,
-            prompt_tokens: List[List[int]],
-            max_gen_len: int,
-            temperature: float = 0.6,
-            top_p: float = 0.9,
-            echo: bool = False,
-            device="cuda",
-    ) -> Generator[Tuple[List[str], Optional[List[float]]], None, None]:
+        self,
+        prompt_tokens: list[list[int]],
+        max_gen_len: int,
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        echo: bool = False,
+        device="cuda",
+    ) -> Generator[tuple[list[str], Optional[list[float]]], None, None]:
         """
         基于提供的 prompt_tokens, 使用语言生成模型逐个生成 token, 并在生成时立即输出。
 
         参数：
-            prompt_tokens (List[List[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
+            prompt_tokens (list[list[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
             max_gen_len (int): 生成的最大长度。
             temperature (float, optional): 控制采样随机性的温度值。默认为 0.6。
             top_p (float, optional): 用于 nucleus sampling 的概率阈值。默认为 0.9。
@@ -115,7 +120,7 @@ def generate_stream(
             echo (bool, optional): 是否在输出中包含 prompt_tokens。默认为 False。
 
         generator 输出：
-            Tuple[List[str], Optional[List[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
+            tuple[list[str], Optional[list[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
         说明：
             该方法在生成循环中，每生成一个新 token, 就立即输出对应的文本和概率(如果需要）。
         """
@@ -124,15 +129,23 @@ def generate_stream(
         max_prompt_len = max(len(t) for t in prompt_tokens)
         assert max_prompt_len <= self.model_config.max_seq_len
         total_len = min(self.model_config.max_seq_len, max_gen_len + max_prompt_len)
-        actual_prompt_lens = torch.tensor([len(t) for t in prompt_tokens], dtype=torch.long, device=device)
-        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+        actual_prompt_lens = torch.tensor(
+            [len(t) for t in prompt_tokens], dtype=torch.long, device=device
+        )
+        pad_id = (
+            self.tokenizer.pad_token_id
+            if self.tokenizer.pad_token_id is not None
+            else self.tokenizer.eos_token_id
+        )
 
         # 预分配tokens张量
         tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device="cuda")
         input_text_mask = tokens != pad_id
         eos_reached = torch.tensor([False] * bsz, device="cuda")
         prev_pos = 0
-        last_yielded_pos = [len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)]  # 初始化每个样本已输出的位置
+        last_yielded_pos = [
+            len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)
+        ]  # 初始化每个样本已输出的位置
 
         # 填充提示词到 tokens 张量
         for k, t in enumerate(prompt_tokens):
@@ -140,13 +153,14 @@ def generate_stream(
 
         b_req_idx = torch.arange(bsz, device=self.device)
         all_select_index_list = []
-        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(max_prompt_len, actual_prompt_lens,
-                                                                             b_req_idx)
+        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(
+            max_prompt_len, actual_prompt_lens, b_req_idx
+        )
         all_select_index_list.append(prefill_select_index)
 
-        input_ids = tokens[:, : max_prompt_len]  # [batch_size, seq_len]
+        input_ids = tokens[:, :max_prompt_len]  # [batch_size, seq_len]
         for cur_pos in range(max_prompt_len, total_len):
-            input_ids = tokens[:, prev_pos: cur_pos]
+            input_ids = tokens[:, prev_pos:cur_pos]
             batch_size, seq_len = input_ids.shape
             position_ids = (
                 torch.arange(prev_pos, prev_pos + seq_len, device=input_ids.device)
@@ -173,9 +187,13 @@ def generate_stream(
             # NOTE: input_text_mask[:, cur_pos]：获取掩码中当前列的布尔值，表示每个序列在当前位置是否为实际输入词元。
             # NOTE: tokens[:, cur_pos]：获取 tokens 中当前列的值。next_token：包含当前生成的词元 ID。
             mask = ~input_text_mask[:, cur_pos]  # [batch_size]
-            tokens[:, cur_pos] = torch.where(mask, next_token.reshape(-1), tokens[:, cur_pos])
+            tokens[:, cur_pos] = torch.where(
+                mask, next_token.reshape(-1), tokens[:, cur_pos]
+            )
 
-            eos_reached = eos_reached | (mask & (next_token == self.tokenizer.eos_token_id))
+            eos_reached = eos_reached | (
+                mask & (next_token == self.tokenizer.eos_token_id)
+            )
             prev_pos = cur_pos
 
             # eos_reached 是一个布尔张量，记录每个序列是否到达了终止状态, 形状为 [batch_size, 1]。
@@ -191,11 +209,13 @@ def generate_stream(
                 end = cur_pos + 1
                 if start < end:
                     token = tokens[i, start:end].tolist()
-                    text = self.tokenizer.decode(token, skip_special_tokens=True)  # 解码时跳过特殊标记。
+                    text = self.tokenizer.decode(
+                        token, skip_special_tokens=True
+                    )  # 解码时跳过特殊标记。
                     batch_outputs.append(text)
                     last_yielded_pos[i] = end
                 else:
-                    batch_outputs.append('')  # 如果没有新生成的内容，添加空字符串
+                    batch_outputs.append("")  # 如果没有新生成的内容，添加空字符串
 
             # 将整个批次的输出一次性 yield
             yield batch_outputs
@@ -208,17 +228,19 @@ def generate_stream(
         self.model_executor.kv_mem_manager.release_ref(all_select_indexs)
 
     def text_completion_stream(
-            self,
-            prompts: List[str],
-            temperature: float = 0.6,
-            top_p: float = 0.9,
-            max_gen_len: Optional[int] = None,
-            echo: bool = False,
-    ) -> Generator[List[CompletionPrediction], None, None]:
+        self,
+        prompts: list[str],
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        max_gen_len: Optional[int] = None,
+        echo: bool = False,
+    ) -> Generator[list[CompletionPrediction], None, None]:
         if max_gen_len is None:
             max_gen_len = self.model_config.max_seq_len - 1
 
-        prompt_tokens = [self.tokenizer.encode(x, add_special_tokens=True) for x in prompts]
+        prompt_tokens = [
+            self.tokenizer.encode(x, add_special_tokens=True) for x in prompts
+        ]
 
         stream = self.generate_stream(
             prompt_tokens=prompt_tokens,
@@ -229,8 +251,8 @@ def text_completion_stream(
         )
 
         # 初始化每个样本的生成结果
-        completions = [{'generation': '', 'tokens': []} for _ in prompts]
+        completions = [{"generation": "", "tokens": []} for _ in prompts]
         for batch_outputs in stream:
             for i, text in enumerate(batch_outputs):
-                completions[i]['generation'] += text
-            yield completions.copy()
\ No newline at end of file
+                completions[i]["generation"] += text
+            yield completions.copy()
diff --git a/lite_llama/generete_with_probs.py b/lite_llama/generete_with_probs.py
index af93c70..c63fe2c 100644
--- a/lite_llama/generete_with_probs.py
+++ b/lite_llama/generete_with_probs.py
@@ -2,7 +2,7 @@
 import torch
 from utils.logger import log
 from typing import List, Literal, Optional, Tuple, TypedDict
-import torch.nn.functional as F 
+import torch.nn.functional as F
 from transformers import AutoTokenizer
 
 from .executor.model_executor import ModelExecutor
@@ -11,20 +11,24 @@
 
 Role = Literal["system", "user", "assistant"]
 
+
 class Message(TypedDict):
     role: Role
     content: str
 
+
 class CompletionPrediction(TypedDict, total=False):
     generation: str
     tokens: List[str]
     logprobs: List[float]
 
+
 class ChatPrediction(TypedDict, total=False):
     generation: Message
     tokens: List[str]
     logprobs: List[float]
 
+
 Dialog = List[Message]
 
 B_INST, E_INST = "[INST]", "[/INST]"
@@ -32,6 +36,7 @@ class ChatPrediction(TypedDict, total=False):
 SPECIAL_TAGS = [B_INST, E_INST, "<<SYS>>", "<</SYS>>"]
 UNSAFE_ERROR = "Error: special tags are not allowed as part of the prompt."
 
+
 @torch.inference_mode()
 def sample_top_p(probs, p: float):
     # 使用 in-place 操作减少内存分配
@@ -47,15 +52,17 @@ def sample_top_p(probs, p: float):
     next_token = torch.gather(probs_idx, -1, next_token_sorted_idx)
     return next_token
 
+
 class GenerateText:
-    def __init__(self, 
+    def __init__(
+        self,
         checkpoints_dir: str,
         tokenizer_path: str,
-        max_seq_len = 1024,
-        max_gpu_num_blocks = None,
-        load_model = True,
-        triton_weight = True,
-        compiled_model = False,
+        max_seq_len=1024,
+        max_gpu_num_blocks=None,
+        load_model=True,
+        triton_weight=True,
+        compiled_model=False,
         device="cuda",
     ):
         self.checkpoints_dir = checkpoints_dir
@@ -63,25 +70,27 @@ def __init__(self,
         self.device = device
 
         self.model_executor = ModelExecutor.build(
-            checkpoints_dir = checkpoints_dir,
-            load_model = load_model,
-            max_gpu_num_blocks = max_gpu_num_blocks,
-            max_seq_len = max_seq_len,
-            triton_weight = triton_weight,
-            device = device
+            checkpoints_dir=checkpoints_dir,
+            load_model=load_model,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            triton_weight=triton_weight,
+            device=device,
         )
         self.model_config = self.model_executor.model_config
         self.tokenizer = self.load_tokenizer(tokenizer_path)
-    
+
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
         # 根据模型名称决定是否使用 fast tokenizer
         use_fast = True
-        if 'llava' in model_name.lower():
+        if "llava" in model_name.lower():
             use_fast = False
-        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=use_fast)
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path, use_fast=use_fast
+        )
         return tokenizer
-    
+
     @torch.inference_mode()
     def generate(
         self,
@@ -91,7 +100,7 @@ def generate(
         top_p: float = 0.9,
         logprobs: bool = True,
         echo: bool = False,
-        device = "cuda"
+        device="cuda",
     ) -> Tuple[List[List[int]], Optional[List[List[float]]]]:
         """
         基于提供的提示词 (prompts) 使用语言生成模型生成文本序列。
@@ -100,29 +109,43 @@ def generate(
         max_prompt_len = max(len(t) for t in prompt_tokens)
         assert max_prompt_len <= self.model_config.max_seq_len
         total_len = min(self.model_config.max_seq_len, max_gen_len + max_prompt_len)
-        actual_prompt_lens = torch.tensor([len(t) for t in prompt_tokens], dtype=torch.long, device=device)
-        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+        actual_prompt_lens = torch.tensor(
+            [len(t) for t in prompt_tokens], dtype=torch.long, device=device
+        )
+        pad_id = (
+            self.tokenizer.pad_token_id
+            if self.tokenizer.pad_token_id is not None
+            else self.tokenizer.eos_token_id
+        )
         self.model_executor.atten_info.max_actual_seq_len = max_prompt_len
 
         # 预分配tokens张量
         tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=device)
-        
+
         # 填充提示词到 tokens 张量
         for seq_id, token_ids in enumerate(prompt_tokens):
             length = len(token_ids)
-            tokens[seq_id, :length] = torch.tensor(token_ids, dtype=torch.long, device=device)
+            tokens[seq_id, :length] = torch.tensor(
+                token_ids, dtype=torch.long, device=device
+            )
 
         # 生成一个布尔张量，它的值为 True 的位置表示输入序列的实际内容（即非填充部分）, 形状为 (batch_size, total_len)
         input_text_mask = tokens != pad_id
         eos_reached = torch.zeros(bsz, dtype=torch.bool, device=device)
-        token_logprobs = torch.zeros((bsz, total_len), dtype=torch.float, device=device) if logprobs else None
+        token_logprobs = (
+            torch.zeros((bsz, total_len), dtype=torch.float, device=device)
+            if logprobs
+            else None
+        )
 
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        b_req_idx = torch.arange(bsz, device = self.device)
+        b_req_idx = torch.arange(bsz, device=self.device)
         all_select_index_list = []
-        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(max_prompt_len, actual_prompt_lens, b_req_idx)
+        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(
+            max_prompt_len, actual_prompt_lens, b_req_idx
+        )
         all_select_index_list.append(prefill_select_index)
 
         token_count = 0
@@ -131,14 +154,14 @@ def generate(
         # 实际上可以统一从 max_prompt_len 开始生成，因为在 (min_prompt_len, max_prompt_len) 的位置上，有的样本还属于prompt部分
         # 这样减少复杂判断逻辑
         prev_pos = 0
-        input_ids = tokens[:, : max_prompt_len]
+        input_ids = tokens[:, :max_prompt_len]
         for cur_pos in range(max_prompt_len, total_len):
             batch_size, seq_len = input_ids.shape
             position_ids = (
                 torch.arange(prev_pos, prev_pos + seq_len, device=input_ids.device)
-                .unsqueeze(0)            # shape: [1, seq_len]
-                .repeat(batch_size, 1)   # shape: [batch_size, seq_len], 不分配额外内存
-            )  
+                .unsqueeze(0)  # shape: [1, seq_len]
+                .repeat(batch_size, 1)  # shape: [batch_size, seq_len], 不分配额外内存
+            )
             logits = self.model_executor.forward(input_ids, position_ids)
             decode_select_index = self.model_executor.decode_alloc_kv_cache(bsz)
             all_select_index_list.append(decode_select_index)
@@ -150,12 +173,14 @@ def generate(
                 next_token = sample_top_p(probs, top_p).reshape(-1)
             else:
                 next_token = torch.argmax(last_logits, dim=-1)
-            input_ids = next_token   # [batch_size, 1]
-            
+            input_ids = next_token  # [batch_size, 1]
+
             # 对仍在生成过程（非输入部分）的位置写入next_token
             # 对尚在prompt部分的位置保持原值不变
             to_generate = ~input_text_mask[:, cur_pos]
-            next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token)
+            next_token = torch.where(
+                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
+            )
             tokens[:, cur_pos] = next_token
 
             if logprobs:
@@ -166,7 +191,9 @@ def generate(
                 target = tokens[:, cur_pos]
                 # 使用 log_softmax 代替 cross_entropy，可以只提取相应token的logprob
                 log_probs = F.log_softmax(last_logits, dim=-1)
-                step_logprobs = torch.gather(log_probs, 1, target.unsqueeze(1)).squeeze(1)
+                step_logprobs = torch.gather(log_probs, 1, target.unsqueeze(1)).squeeze(
+                    1
+                )
 
                 # 将计算结果写入相应位置
                 token_logprobs[:, cur_pos] = step_logprobs
@@ -184,12 +211,20 @@ def generate(
         torch.cuda.synchronize()
 
         elapsed_time_sec = start_event.elapsed_time(end_event) / 1000.0
-        tokens_per_second = token_count / elapsed_time_sec if elapsed_time_sec > 0 else float('inf')
+        tokens_per_second = (
+            token_count / elapsed_time_sec if elapsed_time_sec > 0 else float("inf")
+        )
         log.info(f"Batch inference time, no decode: {elapsed_time_sec * 1000:.4f} ms")
         log.info(f"Tokens per second, no decode: {tokens_per_second:.2f} tokens/s")
 
         out_tokens, out_logprobs = self.process_output_tokens(
-            tokens, prompt_tokens, max_gen_len, logprobs, echo, self.tokenizer.eos_token_id, token_logprobs
+            tokens,
+            prompt_tokens,
+            max_gen_len,
+            logprobs,
+            echo,
+            self.tokenizer.eos_token_id,
+            token_logprobs,
         )
 
         # 减少 kv cache 内存管理器的引用计数
@@ -206,32 +241,38 @@ def text_completion(
         max_gen_len: Optional[int] = None,
         logprobs: bool = False,
         echo: bool = False,
-        device = "cuda",
+        device="cuda",
     ) -> List[CompletionPrediction]:
         if max_gen_len is None:
             max_gen_len = self.model_config.max_seq_len - 1
 
-        input_ids = self.tokenizer.batch_encode_plus(prompts, add_special_tokens=True).input_ids
+        input_ids = self.tokenizer.batch_encode_plus(
+            prompts, add_special_tokens=True
+        ).input_ids
         generated_ids, generation_logprobs = self.generate(
-            prompt_tokens = input_ids,
-            max_gen_len = max_gen_len,
-            temperature = temperature,
-            top_p = top_p,
-            logprobs = logprobs,
-            echo = echo,
-            device = device,
+            prompt_tokens=input_ids,
+            max_gen_len=max_gen_len,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=logprobs,
+            echo=echo,
+            device=device,
         )
 
         if logprobs:
             return [
                 {
                     "generation": self.tokenizer.decode(t, skip_special_tokens=True),
-                    "tokens": [self.tokenizer.decode([x], skip_special_tokens=True) for x in t],
+                    "tokens": [
+                        self.tokenizer.decode([x], skip_special_tokens=True) for x in t
+                    ],
                     "logprobs": logprobs_i,
                 }
                 for t, logprobs_i in zip(generated_ids, generation_logprobs)
             ]
-        generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+        generated_texts = self.tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
         return generated_texts
 
     def process_output_tokens(
@@ -242,9 +283,8 @@ def process_output_tokens(
         logprobs: bool,
         echo: bool,
         eos_token_id,
-        token_logprobs: Optional[torch.Tensor] = None
+        token_logprobs: Optional[torch.Tensor] = None,
     ) -> Tuple[List[List[int]], Optional[List[List[float]]]]:
-
         out_tokens = []
         out_logprobs = [] if logprobs else None
         tokens_list = tokens.tolist()  # 转为CPU列表，只在最终处理输出时进行
@@ -273,7 +313,7 @@ def process_output_tokens(
                 out_logprobs.append(seq_logprobs)
 
         return (out_tokens, out_logprobs if logprobs else None)
-    
+
     def chat_completion(
         self,
         dialogs: List[Dialog],
@@ -318,9 +358,9 @@ def chat_completion(
                 ],
                 [],
             )
-            assert (
-                dialog[-1]["role"] == "user"
-            ), f"Last message must be from user, got {dialog[-1]['role']}"
+            assert dialog[-1]["role"] == "user", (
+                f"Last message must be from user, got {dialog[-1]['role']}"
+            )
             dialog_tokens += self.tokenizer.encode(
                 f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}",
             )
@@ -342,7 +382,9 @@ def chat_completion(
                         if not unsafe
                         else UNSAFE_ERROR,
                     },
-                    "tokens": [self.tokenizer.decode([x], skip_special_tokens=True) for x in t],
+                    "tokens": [
+                        self.tokenizer.decode([x], skip_special_tokens=True) for x in t
+                    ],
                     "logprobs": logprobs_i,
                 }
                 for t, logprobs_i, unsafe in zip(
@@ -353,8 +395,10 @@ def chat_completion(
             {
                 "generation": {
                     "role": "assistant",
-                    "content": self.tokenizer.decode(t, skip_special_tokens=True) if not unsafe else UNSAFE_ERROR,
+                    "content": self.tokenizer.decode(t, skip_special_tokens=True)
+                    if not unsafe
+                    else UNSAFE_ERROR,
                 }
             }
             for t, unsafe in zip(generation_tokens, unsafe_requests)
-        ]
\ No newline at end of file
+        ]
diff --git a/evaluator/lite_llama_inference.py b/lite_llama/inference.py
similarity index 81%
rename from evaluator/lite_llama_inference.py
rename to lite_llama/inference.py
index a63b8f1..fe16b06 100644
--- a/evaluator/lite_llama_inference.py
+++ b/lite_llama/inference.py
@@ -1,16 +1,24 @@
-from typing import List, Optional
+from typing import Optional
+import torch
+
 import sys, os, time
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 
-import torch
 from lite_llama.utils.prompt_templates import get_prompter
-
 from lite_llama.generate import GenerateText
 
 
-class LiteLlamaInference(object):
-
-    def __init__(self, temperature: float, top_p: float, max_seq_len: int, max_gen_len: Optional[int], lite_llama_ckpt_dir: str, device: str = "cuda"):
+class Inference(object):
+    def __init__(
+        self,
+        temperature: float,
+        top_p: float,
+        max_seq_len: int,
+        max_gen_len: Optional[int],
+        lite_llama_ckpt_dir: str,
+        device: str = "cuda",
+    ):
         self.temperature = temperature
         self.top_p = top_p
         self.max_seq_len = max_seq_len
@@ -18,9 +26,7 @@ def __init__(self, temperature: float, top_p: float, max_seq_len: int, max_gen_l
         self.lite_llama_ckpt_dir = lite_llama_ckpt_dir
         self.device = device
 
-
-
-    def load_lite_llama_generator(self, max_gpu_num_blocks=None) -> GenerateText:
+    def load_generator(self, max_gpu_num_blocks=None) -> GenerateText:
         """
         Initializes the lite-llama generator
         """
@@ -36,7 +42,7 @@ def load_lite_llama_generator(self, max_gpu_num_blocks=None) -> GenerateText:
         )
         return generator
 
-    def count_tokens(self, texts: List[str], tokenizer) -> int:
+    def count_tokens(self, texts: list[str], tokenizer) -> int:
         # Optimized segmentation statistics
         total_tokens = 0
         for t in texts:
@@ -44,7 +50,7 @@ def count_tokens(self, texts: List[str], tokenizer) -> int:
             total_tokens += len(ids)
         return total_tokens
 
-    def lite_llama_inference(self, generator: GenerateText, prompts: List[str]):
+    def inference(self, generator: GenerateText, prompts: list[str]):
         """
         Inference is performed using lite-llama's GenerateText instance and returns the result with the time taken and the number of tokens output
         """
@@ -71,9 +77,7 @@ def lite_llama_inference(self, generator: GenerateText, prompts: List[str]):
 
         return results, end_time - start_time, total_tokens
 
-
     def process(self, prompts):
-
         if "qwen2" in self.lite_llama_ckpt_dir.lower():
             model_type = "qwen2"
         elif "llama" in self.lite_llama_ckpt_dir.lower():
@@ -90,10 +94,11 @@ def process(self, prompts):
             update_prompts.append(model_prompter.model_input)
 
         # 1. lite-llama inference
-        lite_llama_generator = self.load_lite_llama_generator(max_gpu_num_blocks=40960)
-        lite_llama_results, lite_llama_time, lite_llama_tokens = self.lite_llama_inference(
-            lite_llama_generator, update_prompts)
+        lite_llama_generator = self.load_generator(max_gpu_num_blocks=40960)
+        lite_llama_results, lite_llama_time, lite_llama_tokens = self.inference(
+            lite_llama_generator, update_prompts
+        )
         del lite_llama_generator
         torch.cuda.empty_cache()  # Release the memory used by lite_llama_generator after use.
 
-        return lite_llama_results
\ No newline at end of file
+        return lite_llama_results
diff --git a/lite_llama/kernels/__init__.py b/lite_llama/kernels/__init__.py
index 7effa2a..72727fb 100644
--- a/lite_llama/kernels/__init__.py
+++ b/lite_llama/kernels/__init__.py
@@ -1,5 +1,4 @@
-
-from .activations import (gelu, relu, leaky_relu, tanh)
+from .activations import gelu, relu, leaky_relu, tanh
 
 from .flashattention import flash_attention_v1
 from .flashattention2_nopad import flash_attention2_no_pad
@@ -8,7 +7,7 @@
 
 from .skip_rmsnorm import skip_rmsnorm
 from .swiglu import swiglu_forward
-from .rope_emb import (rope_forward, rope_emb_forward)
+from .rope_emb import rope_emb_forward
 from .softmax_split import softmax_split
 from .update_kv_buffer import update_kv_buffer
 from .update_kv_index import update_kv_index
@@ -20,4 +19,4 @@
 # from .others.layernorm import layernorm
 # from .others.rotary_emb_v1 import rotary_emb_fwd
 # from .others.context_flashattention_nopad import context_attention_fwd_no_prompt_cache
-# from .others.rmsnorm_layer import rmsnorm_fwd
\ No newline at end of file
+# from .others.rmsnorm_layer import rmsnorm_fwd
diff --git a/lite_llama/kernels/activations.py b/lite_llama/kernels/activations.py
index 15ce602..744d8ff 100644
--- a/lite_llama/kernels/activations.py
+++ b/lite_llama/kernels/activations.py
@@ -1,9 +1,10 @@
 import triton
-import triton.language as tl 
-import math 
+import triton.language as tl
+import math
 
 sqrt2 = math.sqrt(2.0)
 
+
 # 激活函数都是逐元素操作算子，所以无需指定维度参数
 @triton.jit
 def relu(x):
@@ -12,6 +13,7 @@ def relu(x):
     """
     return tl.maximum(0, x)
 
+
 # Leaky ReLU
 @triton.jit
 def leaky_relu(x):
@@ -31,13 +33,15 @@ def tanh(x):
     Tanh(双曲正切)函数也是一种 Sigmoid 型函数，可以看作放大并平移的 Sigmoid 函数, only support inference.
     2 / (1+e^{-2x}) -1
     """
-    return 2 / (1 + tl.exp(-2*x)) - 1
+    return 2 / (1 + tl.exp(-2 * x)) - 1
+
 
 @triton.jit
 def gelu(x):
     """Gaussian Error Linear Unit (GELU), only support inference."""
     return x * 0.5 * (1.0 + tl.libdevice.erf(x / sqrt2))
 
+
 @triton.jit
 def silu(x):
-    return x * tl.sigmoid(x)
\ No newline at end of file
+    return x * tl.sigmoid(x)
diff --git a/lite_llama/kernels/flashattention.py b/lite_llama/kernels/flashattention.py
index 82596df..5f17442 100644
--- a/lite_llama/kernels/flashattention.py
+++ b/lite_llama/kernels/flashattention.py
@@ -1,13 +1,14 @@
 # https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py
 # https://github.com/ELS-RD/kernl/blob/main/src/kernl/implementations/attention.py#L438
 
-import torch,math
+import torch, math
 import triton
 import triton.language as tl
 from torch.cuda.amp import custom_fwd
 from typing import List, Optional, Union
 import torch.nn.functional as F
 
+
 # TODO: integrating rope with flash-attn
 @triton.jit
 def flash_attention_v1_kernel(
@@ -15,38 +16,32 @@ def flash_attention_v1_kernel(
     k_ptr,
     v_ptr,
     o_ptr,
-
     q_batch_stride,
     q_heads_stride,
     q_seq_stride,
     q_dim_stride,
-
     k_batch_stride,
     k_heads_stride,
     k_seq_stride,
-    k_dim_stride, # matrix Q stride for columns, [seq_len, head_dim]
-
+    k_dim_stride,  # matrix Q stride for columns, [seq_len, head_dim]
     v_batch_stride,
     v_heads_stride,
     v_seq_stride,
     v_dim_stride,
-
     out_batch_stride,
     out_heads_stride,
     out_seq_stride,
     out_dim_stride,
-
-    num_kv_groups, # group of kv heads
-    n_heads,      # number of heads
+    num_kv_groups,  # group of kv heads
+    n_heads,  # number of heads
     m_size,
-    n_size,       # sequence length of k, also be rows of K matrix
-
-    BLOCK_DHEAD_SIZE: tl.constexpr, # head_dim dimension
-    BLOCK_M_SIZE: tl.constexpr, # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
-    BLOCK_N_SIZE: tl.constexpr, # n_size dimension
+    n_size,  # sequence length of k, also be rows of K matrix
+    BLOCK_DHEAD_SIZE: tl.constexpr,  # head_dim dimension
+    BLOCK_M_SIZE: tl.constexpr,  # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
+    BLOCK_N_SIZE: tl.constexpr,  # n_size dimension
     sm_scale,
-    causal_mask
-    ):
+    causal_mask,
+):
     """
     flashattention 内核实现
     """
@@ -59,32 +54,45 @@ def flash_attention_v1_kernel(
     cur_kv_head_idx = cur_head_idx // num_kv_groups
 
     m_range_offs = tl.arange(0, BLOCK_M_SIZE)
-    n_range_offs = tl.arange(0, BLOCK_N_SIZE) # head_dim 维度偏移
+    n_range_offs = tl.arange(0, BLOCK_N_SIZE)  # head_dim 维度偏移
     dhead_range_offs = tl.arange(0, BLOCK_DHEAD_SIZE)
 
     m_offs = block_m_idx * BLOCK_M_SIZE + m_range_offs
 
     # Compute offsets for the first block on matrix Q K V Output
-    q_offs = ( 
-        cur_batch_idx * q_batch_stride 
+    q_offs = (
+        cur_batch_idx * q_batch_stride
         + cur_head_idx * q_heads_stride
-        + (m_offs[:, None] * q_seq_stride + dhead_range_offs[None,:] * q_dim_stride))
+        + (m_offs[:, None] * q_seq_stride + dhead_range_offs[None, :] * q_dim_stride)
+    )
 
     k_offs = (
-        cur_batch_idx * k_batch_stride 
+        cur_batch_idx * k_batch_stride
         + cur_kv_head_idx * k_heads_stride
-        + (n_range_offs[:,None] * k_seq_stride + dhead_range_offs[None,:] * k_dim_stride))
-    
-    v_offs = ( 
-        cur_batch_idx * v_batch_stride 
+        + (
+            n_range_offs[:, None] * k_seq_stride
+            + dhead_range_offs[None, :] * k_dim_stride
+        )
+    )
+
+    v_offs = (
+        cur_batch_idx * v_batch_stride
         + cur_kv_head_idx * v_heads_stride
-        + (n_range_offs[:,None] * v_seq_stride + dhead_range_offs[None,:] * v_dim_stride))
+        + (
+            n_range_offs[:, None] * v_seq_stride
+            + dhead_range_offs[None, :] * v_dim_stride
+        )
+    )
 
-    o_offs = ( 
-        cur_batch_idx * out_batch_stride 
+    o_offs = (
+        cur_batch_idx * out_batch_stride
         + cur_head_idx * out_heads_stride
-        + (m_offs[:,None] * out_seq_stride + dhead_range_offs[None,:] * out_dim_stride))
-    
+        + (
+            m_offs[:, None] * out_seq_stride
+            + dhead_range_offs[None, :] * out_dim_stride
+        )
+    )
+
     q_ptrs = q_ptr + q_offs
     k_ptrs = k_ptr + k_offs
     v_ptrs = v_ptr + v_offs
@@ -94,7 +102,7 @@ def flash_attention_v1_kernel(
     l_i = tl.zeros((BLOCK_M_SIZE,), dtype=tl.float32) - float("inf")
     d_i = tl.zeros((BLOCK_M_SIZE,), dtype=tl.float32)
     acc = tl.zeros((BLOCK_M_SIZE, BLOCK_DHEAD_SIZE), dtype=tl.float32)
-    
+
     q_mask = m_offs[:, None] < m_size
     q = tl.load(q_ptrs, mask=q_mask, other=0.0)
 
@@ -102,10 +110,10 @@ def flash_attention_v1_kernel(
         block_n_offs = block_n_start_idx + n_range_offs
         k_mask = block_n_offs[:, None] < n_size
         k = tl.load(k_ptrs + block_n_start_idx * k_seq_stride, mask=k_mask, other=0.0)
-        
+
         qk = tl.zeros((BLOCK_M_SIZE, BLOCK_N_SIZE), dtype=tl.float32)
         qk += tl.dot(q, tl.trans(k))
-        
+
         # 应用因果遮罩
         if causal_mask:
             offs_k = block_n_offs
@@ -119,20 +127,20 @@ def flash_attention_v1_kernel(
 
         l_j = tl.max(qk, 1)
         numerators = tl.exp(qk - l_j[:, None])
-        d_j = tl.sum(numerators, 1) # 1d vector
+        d_j = tl.sum(numerators, 1)  # 1d vector
 
         l_new = tl.maximum(l_i, l_j)
         alpha = tl.exp(l_i - l_new)
         beta = tl.exp(l_j - l_new)
-        d_new = alpha * d_i  + beta * d_j
-        
+        d_new = alpha * d_i + beta * d_j
+
         # compute softmax(qk)
         p_scale = beta / d_new
         p = numerators * p_scale[:, None]
         # acc scaling
         sigma = d_i / d_new * alpha
         acc = acc * sigma[:, None]
-        
+
         # compute O = PV
         v = tl.load(v_ptrs + block_n_start_idx * v_seq_stride, mask=k_mask, other=0.0)
         p = p.to(q_ptr.dtype.element_ty)
@@ -142,97 +150,104 @@ def flash_attention_v1_kernel(
         # update the normalizer (l and d) for next iteration
         l_i = l_new
         d_i = d_new
-    
+
     out_mask = m_offs[:, None] < m_size
     tl.store(out_ptrs, acc, mask=out_mask)
 
+
 @torch.no_grad()
 @custom_fwd(cast_inputs=torch.float16)
 def flash_attention_v1(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
-    ):
+):
     """Compute Flash-attention, can't support fp32 input
     参数:
         q: Query tensor, shape: [bs, n_heads, m_size, head_dim], decode 阶段, q 的 seq_len 和 k v 不一致, 其值为 1
-        k: Key tensor,  shape: [bs, n_heads, n_size, head_dim]. 
-        v: Value tensor, shape is consistent with k. 
-        output: Attention ouput tensor, shape is consistent with q. 
+        k: Key tensor,  shape: [bs, n_heads, n_size, head_dim].
+        v: Value tensor, shape is consistent with k.
+        output: Attention ouput tensor, shape is consistent with q.
         attention_mask: Attention mask matrix broadcastable to (batch, head_size, m_size, n_size).
     """
-    num_kv_groups = q.shape[1] // k.shape[1] # num_q_heads // num_k_heads
+    num_kv_groups = q.shape[1] // k.shape[1]  # num_q_heads // num_k_heads
     output = torch.empty_like(q)
-    assert q.device.type == 'cuda', "Input tensor q must be on CUDA device"
-    assert k.device.type == 'cuda', "Input tensor keys must be on CUDA device"
+    assert q.device.type == "cuda", "Input tensor q must be on CUDA device"
+    assert k.device.type == "cuda", "Input tensor keys must be on CUDA device"
 
     assert q.shape[-1] == k.shape[-1] == v.shape[-1]
-    assert (
-            q.dtype == k.dtype == v.dtype == output.dtype
-        ), f"All tensors must have the same dtype: {q.dtype}, {k.dtype}, {v.dtype}, {output.dtype}"
-    
+    assert q.dtype == k.dtype == v.dtype == output.dtype, (
+        f"All tensors must have the same dtype: {q.dtype}, {k.dtype}, {v.dtype}, {output.dtype}"
+    )
+
     # sequence length of q, also be rows of Q matrix
     bs, n_heads, m_size, HEAD_DIM = q.size()
     causal_mask = False
     if m_size > 1:
         causal_mask: bool = True
-        
+
     n_size = k.shape[2]
     sm_scale = 1 / math.sqrt(HEAD_DIM)
     # BLOCK_M_SIZE = 128
-    grid = lambda meta: (triton.cdiv(m_size, meta["BLOCK_M_SIZE"]), bs*n_heads, 1) # 二维 grid
+    grid = lambda meta: (
+        triton.cdiv(m_size, meta["BLOCK_M_SIZE"]),
+        bs * n_heads,
+        1,
+    )  # 二维 grid
 
     flash_attention_v1_kernel[grid](
         q,
         k,
-        v, 
+        v,
         output,
         *q.stride(),  # (batch, heads, m_size, head_dim)
         *k.stride(),  # (batch, heads, n_size, head_dim)
         *v.stride(),  # (batch, heads, n_size, head_dim)
         *output.stride(),  # (batch, heads, m_size, n_size)
-
         num_kv_groups,
         n_heads,
         m_size,
         n_size,
-        
         HEAD_DIM,
         32,  # BLOCK_M_SIZE
         32,  # BLOCK_N_SIZE
         sm_scale,
-        causal_mask
+        causal_mask,
     )
     return output
 
+
 def standard_attention(Q, K, V, sm_scale, mask=None):
     """
     标准的 PyTorch 实现的自注意力机制。
-    
+
     Args:
         Q (torch.Tensor): 查询张量，形状 (batch_size, num_heads, seq_length, head_dim)
         K (torch.Tensor): 键张量，形状 (batch_size, num_heads, seq_length, head_dim)
         V (torch.Tensor): 值张量，形状 (batch_size, num_heads, seq_length, head_dim)
         sm_scale (float): Softmax 缩放因子
         mask (torch.Tensor, optional): 遮罩张量，形状 (batch_size, num_heads, seq_length, seq_length)
-    
+
     Returns:
         torch.Tensor: 注意力输出，形状与 Q 相同
     """
     # 计算 QK^T
-    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * sm_scale  # (batch_size, num_heads, seq_length, seq_length)
-    
+    attn_scores = (
+        torch.matmul(Q, K.transpose(-2, -1)) * sm_scale
+    )  # (batch_size, num_heads, seq_length, seq_length)
+
     if mask is not None:
-        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
-    
+        attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))
+
     # print("attn_scores", attn_scores)
     attn_weights = F.softmax(attn_scores, dim=-1)
-    
+
     # 计算注意力输出
     out = torch.matmul(attn_weights, V)  # (batch_size, num_heads, seq_length, head_dim)
-    
+
     return out
 
+
 def test_prefill_stage():
     # 设置测试参数
     batch_size = 2
@@ -244,9 +259,15 @@ def test_prefill_stage():
 
     # 生成固定的输入张量（使用固定随机种子以确保可重复性）
     torch.manual_seed(0)
-    q = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
-    k = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
-    v = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
+    q = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
+    k = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
+    v = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
 
     # 计算 Softmax 缩放因子
     sm_scale = 1.0 / math.sqrt(head_dim)  # 1 / sqrt(d_k) * 1/log(2)
@@ -256,17 +277,25 @@ def test_prefill_stage():
 
     # 使用标准 PyTorch 实现计算注意力输出
     # 创建下三角矩阵
-    mask = torch.tril(torch.ones((seq_length, seq_length))).unsqueeze(0).unsqueeze(0).type_as(q)  # (1, 1, seq, seq)
+    mask = (
+        torch.tril(torch.ones((seq_length, seq_length)))
+        .unsqueeze(0)
+        .unsqueeze(0)
+        .type_as(q)
+    )  # (1, 1, seq, seq)
     standard_o = standard_attention(q, k, v, sm_scale, mask)
 
     # 比较 Triton 内核输出与标准实现的输出
     if torch.allclose(out, standard_o, atol=1e-2):
-        print("Prefill Stage Test Passed: Triton output matches PyTorch standard implementation.")
+        print(
+            "Prefill Stage Test Passed: Triton output matches PyTorch standard implementation."
+        )
     else:
         max_diff = (out - standard_o).abs().max()
         print(f"Prefill Stage Test Failed: Maximum difference {max_diff}")
         # 可选择打印更多信息进行调试
 
+
 def test_decode_stage():
     # 设置测试参数
     batch_size = 1
@@ -279,11 +308,34 @@ def test_decode_stage():
 
     # 生成固定的初始输入张量
     torch.manual_seed(0)
-    q_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    k_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    v_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    o_initial = torch.zeros_like(q_initial, device='cuda', dtype=torch.float32)
-    new_token_q = torch.randn(batch_size, num_heads, 1, head_dim, device='cuda', dtype=torch.float32)
+    q_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    k_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    v_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    o_initial = torch.zeros_like(q_initial, device="cuda", dtype=torch.float32)
+    new_token_q = torch.randn(
+        batch_size, num_heads, 1, head_dim, device="cuda", dtype=torch.float32
+    )
 
     triton_k_extended = k_initial
     triton_v_extended = v_initial
@@ -296,7 +348,7 @@ def test_decode_stage():
         # 生成新的 token
         triton_k_extended = torch.cat([triton_k_extended, triton_new_token_q], dim=2)
         triton_v_extended = torch.cat([triton_v_extended, triton_new_token_q], dim=2)
-        
+
         torch_k_extended = torch.cat([torch_k_extended, torch_new_token_q], dim=2)
         torch_v_extended = torch.cat([torch_v_extended, torch_new_token_q], dim=2)
 
@@ -307,20 +359,29 @@ def test_decode_stage():
         sm_scale_extended = 1.0 / math.sqrt(head_dim)
 
         # 计算 Triton 内核输出
-        triton_new_token_q = flash_attention_v1(new_token_q, triton_k_extended, triton_v_extended)
+        triton_new_token_q = flash_attention_v1(
+            new_token_q, triton_k_extended, triton_v_extended
+        )
 
         # 使用标准 PyTorch 实现计算扩展后的注意力输出
-        torch_new_token_q = standard_attention(new_token_q, torch_k_extended, torch_v_extended, sm_scale_extended)
+        torch_new_token_q = standard_attention(
+            new_token_q, torch_k_extended, torch_v_extended, sm_scale_extended
+        )
 
         # 比较 Triton 内核输出与标准实现的输出
         if torch.allclose(triton_new_token_q, torch_new_token_q, atol=1e-1):
-            print(f"Decode Stage Step {step} Test Passed: Triton output matches PyTorch standard implementation.")
+            print(
+                f"Decode Stage Step {step} Test Passed: Triton output matches PyTorch standard implementation."
+            )
         else:
             max_diff = (triton_new_token_q - torch_new_token_q).abs().max()
-            print(f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}")
+            print(
+                f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}"
+            )
             # 可选择打印更多信息进行调试
             break  # 根据需要是否停止测试
 
+
 if __name__ == "__main__":
     print("Running Prefill Stage Test...")
     test_prefill_stage()
@@ -348,4 +409,4 @@ def test_decode_stage():
 Decode Stage Step 14 Test Passed: Triton output matches PyTorch standard implementation.
 Decode Stage Step 15 Test Passed: Triton output matches PyTorch standard implementation.
 Decode Stage Step 16 Test Passed: Triton output matches PyTorch standard implementation.
-"""
\ No newline at end of file
+"""
diff --git a/lite_llama/kernels/flashattention2_nopad.py b/lite_llama/kernels/flashattention2_nopad.py
index 152102c..81a369f 100644
--- a/lite_llama/kernels/flashattention2_nopad.py
+++ b/lite_llama/kernels/flashattention2_nopad.py
@@ -2,44 +2,66 @@
 # https://github.com/ELS-RD/kernl/blob/main/src/kernl/implementations/attention.py#L438
 # https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html
 
-import torch,math
+import torch, math
 import triton
 import triton.language as tl
 from torch.cuda.amp import custom_fwd
 
 
 configs_tma = [
-    triton.Config({'BLOCK_M_SIZE': BM, 'BLOCK_N_SIZE': BN}, num_stages=stages, num_warps=warps) \
-    for BM in [64, 128]\
-    for BN in [32, 64, 128]\
-    for warps in [4, 8, 16]\
-    for stages in [2, 3, 4, 6]\
+    triton.Config(
+        {"BLOCK_M_SIZE": BM, "BLOCK_N_SIZE": BN}, num_stages=stages, num_warps=warps
+    )
+    for BM in [64, 128]
+    for BN in [32, 64, 128]
+    for warps in [4, 8, 16]
+    for stages in [2, 3, 4, 6]
 ]
 
+
 def keep_tma(conf):
     BLOCK_M_SIZE = conf.kwargs["BLOCK_M_SIZE"]
     BLOCK_N_SIZE = conf.kwargs["BLOCK_N_SIZE"]
-    if (torch.cuda.get_device_capability()[0] == 9 and BLOCK_M_SIZE * BLOCK_N_SIZE < 128 * 128 and conf.num_warps == 8):
+    if (
+        torch.cuda.get_device_capability()[0] == 9
+        and BLOCK_M_SIZE * BLOCK_N_SIZE < 128 * 128
+        and conf.num_warps == 8
+    ):
         return False
     return True
 
+
 # key 参数列表(['B_Seqlen', 'HEAD_DIM'])的值会直接影响最佳配置的选择，因为不同的输入尺寸或问题规模可能需要不同的内核调度策略。
 # @triton.autotune(
-#     configs=list(filter(keep_tma, configs_tma)), 
+#     configs=list(filter(keep_tma, configs_tma)),
 #     key=['B_Seqlen', 'HEAD_DIM']
 # )
 @triton.jit
 def flash_attention2_nopad_kernel(
-    Q, K, V, O,
-    B_Start_Loc, B_Seqlen, 
-    sm_scale, heads, num_kv_groups,       # group of kv heads
-    stride_q_bs, stride_q_heads, stride_q_dim,  # Q 的 strides
-    stride_k_bs, stride_k_heads, stride_k_dim,  # K 的 strides
-    stride_v_bs, stride_v_heads, stride_v_dim,  # V 的 strides
-    stride_o_bs, stride_o_heads, stride_o_dim,
-    HEAD_DIM: tl.constexpr, # head_dim dimension
-    BLOCK_M_SIZE: tl.constexpr, # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
-    BLOCK_N_SIZE: tl.constexpr, # n_size dimension    
+    Q,
+    K,
+    V,
+    O,
+    B_Start_Loc,
+    B_Seqlen,
+    sm_scale,
+    heads,
+    num_kv_groups,  # group of kv heads
+    stride_q_bs,
+    stride_q_heads,
+    stride_q_dim,  # Q 的 strides
+    stride_k_bs,
+    stride_k_heads,
+    stride_k_dim,  # K 的 strides
+    stride_v_bs,
+    stride_v_heads,
+    stride_v_dim,  # V 的 strides
+    stride_o_bs,
+    stride_o_heads,
+    stride_o_dim,
+    HEAD_DIM: tl.constexpr,  # head_dim dimension
+    BLOCK_M_SIZE: tl.constexpr,  # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
+    BLOCK_N_SIZE: tl.constexpr,  # n_size dimension
 ):
     """
     flashattentionv1 内核实现, 支持 nopad 计算, 输入为 3 维张量
@@ -55,9 +77,9 @@ def flash_attention2_nopad_kernel(
     # cur_seq_start_loc = tl.load(b_req_tokens_table + cur_batch_idx * stride_req_to_tokens_b)
     cur_seq_start_loc = tl.load(B_Start_Loc + cur_batch_idx)
 
-    block_start_loc = block_m_idx * BLOCK_M_SIZE # 计算当前 block 的起始和结束索引
+    block_start_loc = block_m_idx * BLOCK_M_SIZE  # 计算当前 block 的起始和结束索引
 
-    offs_n = tl.arange(0, BLOCK_N_SIZE) # head_dim 维度偏移
+    offs_n = tl.arange(0, BLOCK_N_SIZE)  # head_dim 维度偏移
     offs_d = tl.arange(0, HEAD_DIM)
     offs_m = block_start_loc + tl.arange(0, BLOCK_M_SIZE)
 
@@ -69,9 +91,17 @@ def flash_attention2_nopad_kernel(
     )
     q = tl.load(Q + q_offs, mask=offs_m[:, None] < cur_seq_len, other=0.0)
 
-    k_offs = offs_n[None, :] * stride_k_bs + cur_kv_head_idx * stride_k_heads + offs_d[:, None] * stride_k_dim
-    v_offs = offs_n[:, None] * stride_v_bs + cur_kv_head_idx * stride_v_heads + offs_d[None, :] * stride_v_dim
-    
+    k_offs = (
+        offs_n[None, :] * stride_k_bs
+        + cur_kv_head_idx * stride_k_heads
+        + offs_d[:, None] * stride_k_dim
+    )
+    v_offs = (
+        offs_n[:, None] * stride_v_bs
+        + cur_kv_head_idx * stride_v_heads
+        + offs_d[None, :] * stride_v_dim
+    )
+
     k_ptrs = K + k_offs
     v_ptrs = V + v_offs
 
@@ -79,7 +109,7 @@ def flash_attention2_nopad_kernel(
     m_i = tl.zeros((BLOCK_M_SIZE,), dtype=tl.float32) - float("inf")
     d_i = tl.zeros((BLOCK_M_SIZE,), dtype=tl.float32)
     acc = tl.zeros((BLOCK_M_SIZE, HEAD_DIM), dtype=tl.float32)
-        
+
     block_mask = tl.where(block_start_loc < cur_seq_len, 1, 0)
     block_end_loc = tl.minimum(block_start_loc + BLOCK_M_SIZE, cur_seq_len)
 
@@ -89,26 +119,27 @@ def flash_attention2_nopad_kernel(
         # 计算 qk^t
         k = tl.load(
             k_ptrs + (cur_seq_start_loc + start_n) * stride_k_bs,
-            mask=(start_n + offs_n[None, :]) < block_end_loc, other = 0.0
+            mask=(start_n + offs_n[None, :]) < block_end_loc,
+            other=0.0,
         )
 
         qk = tl.dot(q, k)
-        
-        # 应用因果遮罩, 下三角矩阵 causal mask 
+
+        # 应用因果遮罩, 下三角矩阵 causal mask
         casual_mask = offs_m[:, None] >= (start_n + offs_n[None, :])
-        qk = tl.where(casual_mask, qk*sm_scale, -1.0e8)
+        qk = tl.where(casual_mask, qk * sm_scale, -1.0e8)
 
-        m_ij = tl.maximum(m_i, tl.max(qk, 1)) # 求 qk 的最大值
+        m_ij = tl.maximum(m_i, tl.max(qk, 1))  # 求 qk 的最大值
         qk -= m_ij[:, None]
         p = tl.math.exp2(qk)  # qk - m_ij[:, None]更新为安全的 qk 分子项
-        d_ij = tl.sum(p, 1) # 1d vector
+        d_ij = tl.sum(p, 1)  # 1d vector
 
         # -- 更新归一化项 d_new
         alpha = tl.math.exp2(m_i - m_ij)
         d_i = d_i * alpha + d_ij
-        
+
         # -- update output accumulator --
-        acc = acc * alpha[:, None] # acc scaling 
+        acc = acc * alpha[:, None]  # acc scaling
 
         # compute O = PV
         v = tl.load(
@@ -118,10 +149,10 @@ def flash_attention2_nopad_kernel(
         )
         p = p.to(v.dtype)
         acc = tl.dot(p, v, acc)
-        
+
         # update the normalizer (l and d) for next iteration
         m_i = m_ij
-    
+
     acc = acc / d_i[:, None]
     off_o = (
         (cur_seq_start_loc + offs_m[:, None]) * stride_o_bs
@@ -131,6 +162,7 @@ def flash_attention2_nopad_kernel(
     out_ptrs = O + off_o
     tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_seq_len)
 
+
 # --------------------------------------
 # Flashattention NoPad 实现（Triton 内核）
 # --------------------------------------
@@ -141,75 +173,94 @@ def flash_attention2_no_pad(
     k: torch.Tensor,
     v: torch.Tensor,
     sm_scale,
-    b_start_loc, 
-    b_seq_len, 
+    b_start_loc,
+    b_seq_len,
     max_seq_len,
-    ):
+):
     """Compute Flash-attention, can't support fp32 input
     参数:
         q: Query tensor, shape: [bs*m_size, n_heads, head_dim], decode 阶段, q 的 seq_len 和 k v 不一致, 其值为 1
-        k: Key tensor,  shape: [bs*n_size, n_heads, head_dim]. 
-        v: Value tensor, shape is consistent with k. 
+        k: Key tensor,  shape: [bs*n_size, n_heads, head_dim].
+        v: Value tensor, shape is consistent with k.
     """
     output = torch.empty_like(q)
     batchs = b_seq_len.shape[0]
     n_heads, HEAD_DIM = q.shape[1], q.shape[2]
 
-    BLOCK_SIZE = 64 # For Ampere Architecture, 3090ti, set 128
+    BLOCK_SIZE = 64  # For Ampere Architecture, 3090ti, set 128
     num_warps = 4 if HEAD_DIM <= 64 else 8
     num_stages = 1
 
-    num_kv_groups = q.shape[1] // k.shape[1] # num_q_heads // num_k_heads
+    num_kv_groups = q.shape[1] // k.shape[1]  # num_q_heads // num_k_heads
     grid = (triton.cdiv(max_seq_len, BLOCK_SIZE), batchs * n_heads, 1)
 
     flash_attention2_nopad_kernel[grid](
         q,
         k,
-        v, 
+        v,
         output,
         b_start_loc,
         b_seq_len,
         sm_scale,
-        n_heads, 
+        n_heads,
         num_kv_groups,
-        q.stride(0), q.stride(1), q.stride(2),
-        k.stride(0), k.stride(1), k.stride(2),
-        v.stride(0), v.stride(1), v.stride(2),
-        output.stride(0), output.stride(1), output.stride(2),
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        k.stride(0),
+        k.stride(1),
+        k.stride(2),
+        v.stride(0),
+        v.stride(1),
+        v.stride(2),
+        output.stride(0),
+        output.stride(1),
+        output.stride(2),
         HEAD_DIM=HEAD_DIM,
-        BLOCK_M_SIZE=BLOCK_SIZE, # 使用或者关闭 autotune 针对不同机器和上下文长度自动优化内核配置
+        BLOCK_M_SIZE=BLOCK_SIZE,  # 使用或者关闭 autotune 针对不同机器和上下文长度自动优化内核配置
         BLOCK_N_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
         num_stages=num_stages,
     )
     return output
 
+
 # --------------------------------------
 # 标准 Attention Prefill 实现（纯 PyTorch版）
 # --------------------------------------
-def _naive_attention(q, k ,v):
+def _naive_attention(q, k, v):
     import math
+
     bs, seqlen, num_head, head_dim = q.shape
     device = q.device
-    mask = 1.0 - torch.tril(torch.ones((seqlen, seqlen), device=device), diagonal=0).unsqueeze(0).unsqueeze(0)
+    mask = 1.0 - torch.tril(
+        torch.ones((seqlen, seqlen), device=device), diagonal=0
+    ).unsqueeze(0).unsqueeze(0)
     mask.masked_fill_(mask.to(torch.bool), -100000000.0)
-    q = q.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
-    k = k.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
-    v = v.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
+    q = q.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
+    k = k.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
+    v = v.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
     scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim)
     scores = torch.nn.functional.softmax(scores.float() + mask, dim=-1).to(q.dtype)
-    output = torch.matmul(scores, v).transpose(1, 2).contiguous().reshape(bs, seqlen, num_head, head_dim)
+    output = (
+        torch.matmul(scores, v)
+        .transpose(1, 2)
+        .contiguous()
+        .reshape(bs, seqlen, num_head, head_dim)
+    )
     return output
 
+
 def _sdpa(q, k, v):
     bs, seqlen, num_head, head_dim = q.shape
-    q = q.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
-    k = k.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
-    v = v.transpose(1, 2) #(bs, num_head, seqlen, head_dim)
+    q = q.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
+    k = k.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
+    v = v.transpose(1, 2)  # (bs, num_head, seqlen, head_dim)
     output = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
     output = output.transpose(1, 2).contiguous().reshape(bs, seqlen, num_head, head_dim)
     return output
 
+
 def standard_attention_prefill(q, k, v, b_start_loc, b_seq_len, sdpa=True):
     out = torch.empty_like(q)
     Z = b_start_loc.shape[0]
@@ -226,10 +277,13 @@ def standard_attention_prefill(q, k, v, b_start_loc, b_seq_len, sdpa=True):
         out[start:end] = oi.squeeze(0)
     return out
 
+
 # =============================================================================
 # 内核精度验证与性能比较函数封装
 # =============================================================================
-def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max_seq_len_list=[1024, 2048, 4096]):
+def run_flash_attention2_no_pad_benchmark(
+    batch=4, n_heads=32, head_dim=128, max_seq_len_list=[1024, 2048, 4096]
+):
     """
     构造输入 q/k/v 张量形状为 [batch, n_heads, head_dim] (q) 和 [max_seq_len, n_heads, head_dim] (k, v),
     验证 flash_attention2_no_pad 输出结果与标准 attention 对齐（允许一定误差），
@@ -237,13 +291,14 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
     返回一个字典，包含验证误差及各 max_seq_len 下的平均执行时间。
     """
     import matplotlib.pyplot as plt
+
     # =============================================================================
     # 1, 内核精度验证
     # =============================================================================
     device = "cuda"
     sm_scale = 1.0 / math.sqrt(head_dim) * 1.4426950408889634
     max_seq_len = max_seq_len_list[0]
-    
+
     # q 的 shape: [batch, n_heads, head_dim] (decode 阶段 q 的 seq_len=1)
     shape = (batch * max_seq_len, n_heads, head_dim)
     q = torch.randn(shape, device=device, dtype=torch.float16)
@@ -252,11 +307,17 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
     # 构造 b_start_loc 和 b_seq_len (假设每个 batch 从 0 开始，序列长度均为 max_seq_len)
     b_seq_len = torch.tensor([512, 1024, 512, 1024], dtype=torch.int32, device="cuda")
     b_start_loc = torch.tensor([0, 512, 1536, 2048], dtype=torch.int32, device="cuda")
-    
-    triton_output = flash_attention2_no_pad(q, k, v, sm_scale, b_start_loc, b_seq_len, max_seq_len)
-    torch_output = standard_attention_prefill(q, k, v, b_start_loc, b_seq_len, sdpa=False)
-    print(f'The maximum difference between torch and triton is {torch.max(torch.abs(torch_output - triton_output))}')
-    
+
+    triton_output = flash_attention2_no_pad(
+        q, k, v, sm_scale, b_start_loc, b_seq_len, max_seq_len
+    )
+    torch_output = standard_attention_prefill(
+        q, k, v, b_start_loc, b_seq_len, sdpa=False
+    )
+    print(
+        f"The maximum difference between torch and triton is {torch.max(torch.abs(torch_output - triton_output))}"
+    )
+
     # =============================================================================
     # 2, 内核运行速度性能比较
     # =============================================================================
@@ -266,17 +327,19 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
 
     for seq_len in max_seq_len_list:
         # q 的 shape: [batch, n_heads, head_dim] (decode 阶段 q 的 seq_len=1)
-        shape = (batch*seq_len, n_heads, head_dim)
+        shape = (batch * seq_len, n_heads, head_dim)
         q = torch.randn(shape, device=device, dtype=torch.float16)
         k = torch.randn(shape, device=device, dtype=torch.float16)
         v = torch.randn(shape, device=device, dtype=torch.float16)
-        
+
         # 构造 b_start_loc 和 b_seq_len (假设每个 batch 从 0 开始，序列长度均为 max_seq_len)
-        b_start_loc = torch.tensor([0, seq_len, 2*seq_len, 3*seq_len], dtype=torch.int32, device="cuda") # batch = 4
+        b_start_loc = torch.tensor(
+            [0, seq_len, 2 * seq_len, 3 * seq_len], dtype=torch.int32, device="cuda"
+        )  # batch = 4
         b_seq_len = torch.full((batch,), seq_len, device=device, dtype=torch.int32)
         # b_seq_len = torch.tensor([512, 1024, 512, 1024], dtype=torch.int32, device="cuda")
         # b_start_loc = torch.tensor([0, 512, 1536, 2048], dtype=torch.int32, device="cuda")
-        
+
         # 预热
         _ = flash_attention2_no_pad(q, k, v, sm_scale, b_start_loc, b_seq_len, seq_len)
         torch.cuda.synchronize()
@@ -284,7 +347,9 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
         for _ in range(iterations):
-            _ = flash_attention2_no_pad(q, k, v, sm_scale, b_start_loc, b_seq_len, seq_len)
+            _ = flash_attention2_no_pad(
+                q, k, v, sm_scale, b_start_loc, b_seq_len, seq_len
+            )
         end_event.record()
         torch.cuda.synchronize()
         flash_time = start_event.elapsed_time(end_event) / iterations
@@ -301,12 +366,14 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
         standard_time = start_event.elapsed_time(end_event) / iterations
         standard_times.append(standard_time)
 
-        print(f"max_seq_len = {seq_len:4d}: flash_attn = {flash_time:.3f} ms, standard_attn = {standard_time:.3f} ms")
+        print(
+            f"max_seq_len = {seq_len:4d}: flash_attn = {flash_time:.3f} ms, standard_attn = {standard_time:.3f} ms"
+        )
 
     # 绘制性能对比曲线
     plt.figure(figsize=(8, 5))
-    plt.plot(max_seq_len_list, flash_times, marker='o', label="Flash Attentionv2")
-    plt.plot(max_seq_len_list, standard_times, marker='s', label="Standard Attention")
+    plt.plot(max_seq_len_list, flash_times, marker="o", label="Flash Attentionv2")
+    plt.plot(max_seq_len_list, standard_times, marker="s", label="Standard Attention")
     plt.xlabel("max_seq_len (kv cache length)")
     plt.ylabel("Average execution time (ms)")
     plt.title("Prefill Stage Performance Comparison")
@@ -320,9 +387,10 @@ def run_flash_attention2_no_pad_benchmark(batch=4, n_heads=32, head_dim=128, max
         "standard_times": standard_times,
     }
 
+
 # =============================================================================
 # 如果直接运行该脚本，则执行验证与性能比较
 # =============================================================================
 if __name__ == "__main__":
     stats = run_flash_attention2_no_pad_benchmark()
-    print("Benchmark statistics:", stats)
\ No newline at end of file
+    print("Benchmark statistics:", stats)
diff --git a/lite_llama/kernels/flashattentionv2.py b/lite_llama/kernels/flashattentionv2.py
index 4c743fe..fc0c3fc 100644
--- a/lite_llama/kernels/flashattentionv2.py
+++ b/lite_llama/kernels/flashattentionv2.py
@@ -7,24 +7,31 @@
 
 # TESLA = "Tesla" in torch.cuda.get_device_name(0)
 
+
 @triton.jit
 def _attn_fwd_inner(
-	acc, m_i, d_i, q,
-	k_ptrs, v_ptrs, 
-	k_seq_stride, v_seq_stride,
-	offs_m,
-	qk_scale, 
-	n_size, # kv seq_len
-	BLOCK_M_SIZE: tl.constexpr, BLOCK_N_SIZE: tl.constexpr,
-    fp8_v: tl.constexpr
+    acc,
+    m_i,
+    d_i,
+    q,
+    k_ptrs,
+    v_ptrs,
+    k_seq_stride,
+    v_seq_stride,
+    offs_m,
+    qk_scale,
+    n_size,  # kv seq_len
+    BLOCK_M_SIZE: tl.constexpr,
+    BLOCK_N_SIZE: tl.constexpr,
+    fp8_v: tl.constexpr,
 ):
-    n_range_offs = tl.arange(0, BLOCK_N_SIZE) # head_dim 维度偏移
+    n_range_offs = tl.arange(0, BLOCK_N_SIZE)  # head_dim 维度偏移
 
     # 在 SRAM 上完成计算
     for block_n_start_idx in range(0, n_size, BLOCK_N_SIZE):
         block_n_start_idx = tl.multiple_of(block_n_start_idx, BLOCK_N_SIZE)
         block_n_offs = block_n_start_idx + n_range_offs
-        
+
         k_mask = block_n_offs[:, None] < n_size
         k = tl.load(k_ptrs + block_n_start_idx * k_seq_stride, mask=k_mask, other=0.0)
 
@@ -37,11 +44,11 @@ def _attn_fwd_inner(
         mask = offs_m[:, None] >= offs_k[None, :]
         qk = qk * qk_scale + tl.where(mask, 0, -1.0e8)
         # qk = tl.where(mask, qk * qk_scale, -1.0e8)
-        m_ij = tl.maximum(m_i, tl.max(qk, 1)) # 求 qk 的最大值
-        qk -= m_ij[:, None] # 更新为安全的 qk
-        
+        m_ij = tl.maximum(m_i, tl.max(qk, 1))  # 求 qk 的最大值
+        qk -= m_ij[:, None]  # 更新为安全的 qk
+
         p = tl.math.exp2(qk)
-        d_ij = tl.sum(p, 1) # 1d vector
+        d_ij = tl.sum(p, 1)  # 1d vector
 
         # -- 更新归一化项 d_new
         alpha = tl.math.exp2(m_i - m_ij)
@@ -60,142 +67,175 @@ def _attn_fwd_inner(
 
     return acc, d_i
 
+
 @triton.jit
 def flash_attention_v2_kernel(
     q_ptr,
     k_ptr,
     v_ptr,
     o_ptr,
-
     q_batch_stride,
     q_heads_stride,
     q_seq_stride,
     q_dim_stride,
-
     k_batch_stride,
     k_heads_stride,
     k_seq_stride,
-    k_dim_stride, # matrix Q stride for columns, [seq_len, head_dim]
-
+    k_dim_stride,  # matrix Q stride for columns, [seq_len, head_dim]
     v_batch_stride,
     v_heads_stride,
     v_seq_stride,
     v_dim_stride,
-
     out_batch_stride,
     out_heads_stride,
     out_seq_stride,
     out_dim_stride,
-
-    num_kv_groups, # group of kv heads
-    n_heads,      # number of heads
-    m_size,       # sequence length of q
-    n_size,       # sequence length of k, also be rows of K matrix
-    HEAD_DIM: tl.constexpr, # head_dim dimension
-    BLOCK_M_SIZE: tl.constexpr, # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
-    BLOCK_N_SIZE: tl.constexpr, # n_size dimension
+    num_kv_groups,  # group of kv heads
+    n_heads,  # number of heads
+    m_size,  # sequence length of q
+    n_size,  # sequence length of k, also be rows of K matrix
+    HEAD_DIM: tl.constexpr,  # head_dim dimension
+    BLOCK_M_SIZE: tl.constexpr,  # BLOCK size of m_size dimension，即 Q 矩阵行数分成了m_size // BLOCK_M_SIZE 块，块大小是 BLOCK_M_SIZE
+    BLOCK_N_SIZE: tl.constexpr,  # n_size dimension
     qk_scale,
-    ):
+):
     """
     flashattention2 内核实现
     """
     block_m_idx = tl.program_id(0)
-    head_idx = tl.program_id(1) # 获取当前 CUDA 块在第二个维度（通常是 blockIdx.y）上的索引。head_idx 表示当前块对应的头（head）的索引。
+    head_idx = tl.program_id(
+        1
+    )  # 获取当前 CUDA 块在第二个维度（通常是 blockIdx.y）上的索引。head_idx 表示当前块对应的头（head）的索引。
 
-    cur_batch_idx = head_idx // n_heads # 通过整数除法，将 head_idx 转换为当前批次（batch）的索引。
-    cur_head_idx = head_idx % n_heads # 通过取模操作，计算出当前头在其所属批次中的具体索引。
+    cur_batch_idx = (
+        head_idx // n_heads
+    )  # 通过整数除法，将 head_idx 转换为当前批次（batch）的索引。
+    cur_head_idx = (
+        head_idx % n_heads
+    )  # 通过取模操作，计算出当前头在其所属批次中的具体索引。
 
-    cur_kv_head_idx = cur_head_idx // num_kv_groups # 支持 GQA 模型直接获取 kv heads index, 也兼容非 GQA 模型
+    cur_kv_head_idx = (
+        cur_head_idx // num_kv_groups
+    )  # 支持 GQA 模型直接获取 kv heads index, 也兼容非 GQA 模型
 
-    m_range_offs = tl.arange(0, BLOCK_M_SIZE) # seq_dim 维度偏移
-    n_range_offs = tl.arange(0, BLOCK_N_SIZE) # bs*n_heads 维度偏移
-    dhead_range_offs = tl.arange(0, HEAD_DIM) # head_dim 维度偏移
+    m_range_offs = tl.arange(0, BLOCK_M_SIZE)  # seq_dim 维度偏移
+    n_range_offs = tl.arange(0, BLOCK_N_SIZE)  # bs*n_heads 维度偏移
+    dhead_range_offs = tl.arange(0, HEAD_DIM)  # head_dim 维度偏移
 
-    offs_m = block_m_idx * BLOCK_M_SIZE + m_range_offs # 计算当前块在 M(seq_dim) 维度上的实际偏移量。
+    offs_m = (
+        block_m_idx * BLOCK_M_SIZE + m_range_offs
+    )  # 计算当前块在 M(seq_dim) 维度上的实际偏移量。
 
     # 二维偏移, Compute offsets for the first block on matrix Q K V Output
-    offs_q = ( 
-        cur_batch_idx * q_batch_stride 
+    offs_q = (
+        cur_batch_idx * q_batch_stride
         + cur_head_idx * q_heads_stride
-        + (offs_m[:, None] * q_seq_stride + dhead_range_offs[None,:] * q_dim_stride))
+        + (offs_m[:, None] * q_seq_stride + dhead_range_offs[None, :] * q_dim_stride)
+    )
 
     offs_k = (
-        cur_batch_idx * k_batch_stride 
+        cur_batch_idx * k_batch_stride
         + cur_kv_head_idx * k_heads_stride
-        + (n_range_offs[:,None] * k_seq_stride + dhead_range_offs[None,:] * k_dim_stride))
+        + (
+            n_range_offs[:, None] * k_seq_stride
+            + dhead_range_offs[None, :] * k_dim_stride
+        )
+    )
 
-    offs_v = ( 
-        cur_batch_idx * v_batch_stride 
+    offs_v = (
+        cur_batch_idx * v_batch_stride
         + cur_kv_head_idx * v_heads_stride
-        + (n_range_offs[:,None] * v_seq_stride + dhead_range_offs[None,:] * v_dim_stride))
+        + (
+            n_range_offs[:, None] * v_seq_stride
+            + dhead_range_offs[None, :] * v_dim_stride
+        )
+    )
 
-    offs_o = ( 
-        cur_batch_idx * out_batch_stride 
+    offs_o = (
+        cur_batch_idx * out_batch_stride
         + cur_head_idx * out_heads_stride
-        + (offs_m[:,None] * out_seq_stride + dhead_range_offs[None,:] * out_dim_stride))
+        + (
+            offs_m[:, None] * out_seq_stride
+            + dhead_range_offs[None, :] * out_dim_stride
+        )
+    )
 
     q_ptrs = q_ptr + offs_q
     k_ptrs = k_ptr + offs_k
     v_ptrs = v_ptr + offs_v
     out_ptrs = o_ptr + offs_o
-    
+
     q_mask = offs_m[:, None] < m_size
     q = tl.load(q_ptrs, mask=q_mask, other=0.0)
 
     # 初始化用于计算 softmax 归一化项的 m 和 d, 意义见 online-softmax, 这里
-    m_i = tl.zeros([BLOCK_M_SIZE,], dtype=tl.float32) - float("inf")
-    d_i = tl.zeros([BLOCK_M_SIZE,], dtype=tl.float32)
+    m_i = tl.zeros(
+        [
+            BLOCK_M_SIZE,
+        ],
+        dtype=tl.float32,
+    ) - float("inf")
+    d_i = tl.zeros(
+        [
+            BLOCK_M_SIZE,
+        ],
+        dtype=tl.float32,
+    )
     acc = tl.zeros([BLOCK_M_SIZE, HEAD_DIM], dtype=tl.float32)
 
     # acc 是 attention 输出累加器, d_i 是 softmax 的归一化项（分母）, m_i 是最大值（分子）
-    acc, d_i = _attn_fwd_inner(acc, m_i, d_i, q,
-                                k_ptrs, v_ptrs,
-                                k_seq_stride, v_seq_stride,
-                                offs_m,
-                                qk_scale, 
-                                n_size, # kv seq_len
-                                BLOCK_M_SIZE, BLOCK_N_SIZE,
-                                v_ptr.dtype.element_ty == tl.float8e5)
+    acc, d_i = _attn_fwd_inner(
+        acc,
+        m_i,
+        d_i,
+        q,
+        k_ptrs,
+        v_ptrs,
+        k_seq_stride,
+        v_seq_stride,
+        offs_m,
+        qk_scale,
+        n_size,  # kv seq_len
+        BLOCK_M_SIZE,
+        BLOCK_N_SIZE,
+        v_ptr.dtype.element_ty == tl.float8e5,
+    )
 
     acc = acc / d_i[:, None]
     out_mask = offs_m[:, None] < m_size
     tl.store(out_ptrs, acc, mask=out_mask)
 
+
 @torch.no_grad()
-def flash_attention_v2(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-	qk_scale
-    ):
+def flash_attention_v2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, qk_scale):
     """Compute Flash-attention, can't support fp32 input
     参数:
         q: Query tensor, shape: [bs, n_heads, m_size, head_dim], decode 阶段, q 的 seq_len 和 k v 不一致, 其值为 1
-        k: Key tensor,  shape: [bs, n_heads, n_size, head_dim]. 
-        v: Value tensor, shape is consistent with k. 
-        output: Attention ouput tensor, shape is consistent with q. 
+        k: Key tensor,  shape: [bs, n_heads, n_size, head_dim].
+        v: Value tensor, shape is consistent with k.
+        output: Attention ouput tensor, shape is consistent with q.
         attention_mask: Attention mask matrix broadcastable to (batch, head_size, m_size, n_size).
     """
-    BLOCK_SIZE = 64 # default: BLOCK_M_SIZE = 64
-    num_kv_groups = q.shape[1] // k.shape[1] # num_q_heads // num_k_heads
+    BLOCK_SIZE = 64  # default: BLOCK_M_SIZE = 64
+    num_kv_groups = q.shape[1] // k.shape[1]  # num_q_heads // num_k_heads
     output = torch.empty_like(q)
 
     assert q.shape[-1] == k.shape[-1] == v.shape[-1]
-    assert (
-            q.dtype == k.dtype == v.dtype == output.dtype
-        ), f"All tensors must have the same dtype: {q.dtype}, {k.dtype}, {v.dtype}, {output.dtype}"
+    assert q.dtype == k.dtype == v.dtype == output.dtype, (
+        f"All tensors must have the same dtype: {q.dtype}, {k.dtype}, {v.dtype}, {output.dtype}"
+    )
 
     # sequence length of q, also be rows of Q matrix
     bs, n_heads, m_size, head_dim = q.size()
 
     n_size = k.shape[2]
-    
-    grid = lambda meta: (triton.cdiv(m_size, BLOCK_SIZE), bs*n_heads, 1) # 二维 grid
+
+    grid = lambda meta: (triton.cdiv(m_size, BLOCK_SIZE), bs * n_heads, 1)  # 二维 grid
 
     flash_attention_v2_kernel[grid](
         q,
         k,
-        v, 
+        v,
         output,
         *q.stride(),  # (batch, heads, m_size, head_dim)
         *k.stride(),  # (batch, heads, n_size, head_dim)
@@ -210,4 +250,4 @@ def flash_attention_v2(
         BLOCK_SIZE,  # BLOCK_N_SIZE
         qk_scale,
     )
-    return output
\ No newline at end of file
+    return output
diff --git a/lite_llama/kernels/flashdecoding.py b/lite_llama/kernels/flashdecoding.py
index c1b6fa2..e25f2be 100644
--- a/lite_llama/kernels/flashdecoding.py
+++ b/lite_llama/kernels/flashdecoding.py
@@ -5,18 +5,35 @@
 
 @triton.jit
 def _flash_decoding_stage1_kernel(
-    Q, K, V, qk_scale,
-    b_req_tokens_table, B_Seqlen, 
-	num_kv_groups, # group of kv heads
-    Mid_O, Mid_O_LogExpSum,
-	stride_req_to_tokens_b, stride_req_to_tokens_s,
-    q_bs_stride, q_heads_stride, q_dim_stride,  # Q 的 strides
-    k_bs_stride, k_heads_stride, k_dim_stride,  # K 的 strides
-    v_bs_stride, v_heads_stride, v_dim_stride,  # V 的 strides
-    mido_batch_stride, mido_heads_stride, mido_partitions_stride, mido_dim_stride,
-    mido_les_batch_stride, mido_les_heads_stride, mido_les_partitions_stride,
-    BLOCK_SEQ: tl.constexpr, # 默认 128
-    BLOCK_N: tl.constexpr,   # 默认 32
+    Q,
+    K,
+    V,
+    qk_scale,
+    b_req_tokens_table,
+    B_Seqlen,
+    num_kv_groups,  # group of kv heads
+    Mid_O,
+    Mid_O_LogExpSum,
+    stride_req_to_tokens_b,
+    stride_req_to_tokens_s,
+    q_bs_stride,
+    q_heads_stride,
+    q_dim_stride,  # Q 的 strides
+    k_bs_stride,
+    k_heads_stride,
+    k_dim_stride,  # K 的 strides
+    v_bs_stride,
+    v_heads_stride,
+    v_dim_stride,  # V 的 strides
+    mido_batch_stride,
+    mido_heads_stride,
+    mido_partitions_stride,
+    mido_dim_stride,
+    mido_les_batch_stride,
+    mido_les_heads_stride,
+    mido_les_partitions_stride,
+    BLOCK_SEQ: tl.constexpr,  # 默认 128
+    BLOCK_N: tl.constexpr,  # 默认 32
     BLOCK_DMODEL: tl.constexpr,
 ):
     """Flash Attention Stage1 Triton Kernel"""
@@ -32,25 +49,27 @@ def _flash_decoding_stage1_kernel(
 
     # 计算当前分区的起始和结束索引
     cur_batch_partition_start_index = seq_block_pid * BLOCK_SEQ
-    cur_batch_partition_end_index = tl.minimum(cur_batch_seq_len, cur_batch_partition_start_index + BLOCK_SEQ)
+    cur_batch_partition_end_index = tl.minimum(
+        cur_batch_seq_len, cur_batch_partition_start_index + BLOCK_SEQ
+    )
 
     # 计算需要处理的块数
-    num_blocks = tl.where(cur_batch_partition_end_index - cur_batch_partition_start_index <= 0, 
-                        0, (cur_batch_partition_end_index - cur_batch_partition_start_index + BLOCK_N - 1) // BLOCK_N)
+    num_blocks = tl.where(
+        cur_batch_partition_end_index - cur_batch_partition_start_index <= 0,
+        0,
+        (cur_batch_partition_end_index - cur_batch_partition_start_index + BLOCK_N - 1)
+        // BLOCK_N,
+    )
 
     # 初始化偏移向量
     offs_n = cur_batch_partition_start_index + tl.arange(0, BLOCK_N)  # [BLOCK_N]
     offs_d = tl.arange(0, BLOCK_DMODEL)  # [BLOCK_DMODEL]
 
     # 计算 Q K 的偏移量
-    q_offs = (
-        batch_pid * q_bs_stride 
-        + head_pid * q_heads_stride
-        + offs_d * q_dim_stride
-    )
-    k_offs = kv_head_pid * k_heads_stride + offs_d[None, :] * k_dim_stride 
+    q_offs = batch_pid * q_bs_stride + head_pid * q_heads_stride + offs_d * q_dim_stride
+    k_offs = kv_head_pid * k_heads_stride + offs_d[None, :] * k_dim_stride
 
-    q_ptrs = Q + q_offs # 获取 Q 指针
+    q_ptrs = Q + q_offs  # 获取 Q 指针
     q = tl.load(q_ptrs)  # # 加载 Q 向量 [BLOCK_DMODEL]
 
     # 初始化归一化项和累加器
@@ -62,14 +81,18 @@ def _flash_decoding_stage1_kernel(
     for start_n in range(0, num_blocks, 1):
         # k 位置索引计算
         offs_n_new = offs_n + start_n * BLOCK_N  # [BLOCK_N]
-        k_loc = tl.load(b_req_tokens_table + stride_req_to_tokens_b * batch_pid + offs_n_new, mask=offs_n_new < cur_batch_partition_end_index, other=0.0)
+        k_loc = tl.load(
+            b_req_tokens_table + stride_req_to_tokens_b * batch_pid + offs_n_new,
+            mask=offs_n_new < cur_batch_partition_end_index,
+            other=0.0,
+        )
         k_ptrs = k_loc[:, None] * k_bs_stride + k_offs
-        
+
         k_mask = offs_n_new < cur_batch_partition_end_index  # [BLOCK_N]
-        
+
         k = tl.load(K + k_ptrs, mask=k_mask[:, None], other=0.0)
         v = tl.load(V + k_ptrs, mask=k_mask[:, None], other=0.0)
-        
+
         # 计算 qk^T
         qk = tl.sum(q[None, :] * k, axis=1)  # [BLOCK_N]
         qk *= qk_scale
@@ -79,18 +102,18 @@ def _flash_decoding_stage1_kernel(
         current_max = tl.max(qk)  # 标量
         m_ij = tl.maximum(m_i, current_max)  # 标量
         p = tl.exp(qk - m_ij)  # [BLOCK_N]
-        
+
         # 更新归一化项
-        alpha = tl.exp(m_i - m_ij) 
+        alpha = tl.exp(m_i - m_ij)
         d_i = alpha * d_i + tl.sum(p, axis=0)
 
         # 更新 attention 输出累加器
         acc = alpha * acc + tl.sum(p[:, None] * v, axis=0)  # [BLOCK_DMODEL]
         # acc = acc * alpha + tl.dot(p, v)  # [BLOCK_DMODEL]
-        
+
         # 更新归一化器
         m_i = m_ij
-        
+
     # 计算是否需要存储
     need_store = num_blocks > 0  # 标量布尔值
 
@@ -114,74 +137,94 @@ def _flash_decoding_stage1_kernel(
         tl.store(Mid_O + off_mid_o, acc / d_i)
         tl.store(Mid_O_LogExpSum + off_mid_o_les, m_i + tl.log(d_i))
 
+
 @torch.no_grad()
 def flash_decode_stage1(
-    q, k, v,         		# Q: [batchs, num_heads, head_dim], K, V: [batchs * seq_len, num_heads, head_dim]
-    qk_scale, 
+    q,
+    k,
+    v,  # Q: [batchs, num_heads, head_dim], K, V: [batchs * seq_len, num_heads, head_dim]
+    qk_scale,
     b_req_tokens_table,
-	b_seq_len, 
-	max_actual_seq_len,     # 最大的实际序列长度
-    mid_o, mid_o_logexpsum, # Mid_O: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE), head_dim], Mid_O_LogExpSum: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE)]
+    b_seq_len,
+    max_actual_seq_len,  # 最大的实际序列长度
+    mid_o,
+    mid_o_logexpsum,  # Mid_O: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE), head_dim], Mid_O_LogExpSum: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE)]
     PARTITION_SIZE,
 ):
-	BLOCK_N_SIZE = 16
-
-	# BLOCK_DMODEL = q.shape[-1]
-	assert PARTITION_SIZE % BLOCK_N_SIZE == 0, "PARTITION_SIZE 必须是 BLOCK_N_SIZE 的倍数"
-
-	batchs, num_heads, head_dim = q.shape # decode 阶段 q 张量的 seq_len = 1, 这里的 batchs 实际就是 batch_size
-	
-	# grid 配置的并行度比 flashattention1-2 多了 kv cache seq 维度
-	grid = (batchs, num_heads, triton.cdiv(max_actual_seq_len + PARTITION_SIZE - 1, PARTITION_SIZE))
-	num_kv_groups = q.shape[1] // k.shape[1] # num_q_heads // num_k_heads
-
-	_flash_decoding_stage1_kernel[grid](
-		q, k, v, qk_scale,
-	   	b_req_tokens_table,
-        b_seq_len, 
-		num_kv_groups,   # kv 组数量
-		mid_o, mid_o_logexpsum,
-		*b_req_tokens_table.stride(),
-		*q.stride(),
-		*k.stride(),
-		*v.stride(),
-		*mid_o.stride(),
-		*mid_o_logexpsum.stride(),
-		BLOCK_SEQ = PARTITION_SIZE,
-		BLOCK_N = BLOCK_N_SIZE,
-		BLOCK_DMODEL = head_dim,
-		num_warps = 1, 
-		num_stages = 2,
-	)
+    BLOCK_N_SIZE = 16
+
+    # BLOCK_DMODEL = q.shape[-1]
+    assert PARTITION_SIZE % BLOCK_N_SIZE == 0, (
+        "PARTITION_SIZE 必须是 BLOCK_N_SIZE 的倍数"
+    )
+
+    batchs, num_heads, head_dim = (
+        q.shape
+    )  # decode 阶段 q 张量的 seq_len = 1, 这里的 batchs 实际就是 batch_size
+
+    # grid 配置的并行度比 flashattention1-2 多了 kv cache seq 维度
+    grid = (
+        batchs,
+        num_heads,
+        triton.cdiv(max_actual_seq_len + PARTITION_SIZE - 1, PARTITION_SIZE),
+    )
+    num_kv_groups = q.shape[1] // k.shape[1]  # num_q_heads // num_k_heads
+
+    _flash_decoding_stage1_kernel[grid](
+        q,
+        k,
+        v,
+        qk_scale,
+        b_req_tokens_table,
+        b_seq_len,
+        num_kv_groups,  # kv 组数量
+        mid_o,
+        mid_o_logexpsum,
+        *b_req_tokens_table.stride(),
+        *q.stride(),
+        *k.stride(),
+        *v.stride(),
+        *mid_o.stride(),
+        *mid_o_logexpsum.stride(),
+        BLOCK_SEQ=PARTITION_SIZE,
+        BLOCK_N=BLOCK_N_SIZE,
+        BLOCK_DMODEL=head_dim,
+        num_warps=1,
+        num_stages=2,
+    )
+
 
 @triton.jit
 def _flash_decoding_stage2_kernel(
-	Mid_O,  		  # [batch, head, seq_block_num, head_dim]
-	Mid_O_LogExpSum,  # [batch, head, seq_block_num]
-	Ouput,            # attention 输出首地址
-	mido_batch_stride, mido_heads_stride, mido_partitions_stride, mido_dim_stride,
-	mido_les_batch_stride, mido_les_heads_stride, mido_les_partitions_stride,
-	o_bs_stride, o_heads_stride, o_dim_stride,
-	B_Seqlen,   # TODO 支持 PagedAttention 和连续批处理
-	BLOCK_DMODEL: tl.constexpr,
-	BLOCK_SEQ: tl.constexpr, # type: ignore
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    Ouput,  # attention 输出首地址
+    mido_batch_stride,
+    mido_heads_stride,
+    mido_partitions_stride,
+    mido_dim_stride,
+    mido_les_batch_stride,
+    mido_les_heads_stride,
+    mido_les_partitions_stride,
+    o_bs_stride,
+    o_heads_stride,
+    o_dim_stride,
+    B_Seqlen,  # TODO 支持 PagedAttention 和连续批处理
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_SEQ: tl.constexpr,  # type: ignore
 ):
-    """Reduction (online softmax)
-    """
+    """Reduction (online softmax)"""
     batch_pid = tl.program_id(0)
     head_pid = tl.program_id(1)
     cur_batch_seq_len = tl.load(B_Seqlen + batch_pid)
-    
-    # 初始化偏移 
+
+    # 初始化偏移
     offs_d = tl.arange(0, BLOCK_DMODEL)
 
-	# 最后一个维度 stride 为 1 可省略, 如 mido_dim_stride
-    offs_part_v = batch_pid * mido_batch_stride \
-                + head_pid * mido_heads_stride \
-                + offs_d
+    # 最后一个维度 stride 为 1 可省略, 如 mido_dim_stride
+    offs_part_v = batch_pid * mido_batch_stride + head_pid * mido_heads_stride + offs_d
 
-    offs_part_max = batch_pid * mido_les_batch_stride \
-                + head_pid * mido_les_heads_stride
+    offs_part_max = batch_pid * mido_les_batch_stride + head_pid * mido_les_heads_stride
 
     part_v_ptrs = Mid_O + offs_part_v
     part_max_ptrs = Mid_O_LogExpSum + offs_part_max
@@ -190,12 +233,14 @@ def _flash_decoding_stage2_kernel(
     d_i = 0.0
     m_i = -float("inf")
     acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
-    
+
     num_partitions = (cur_batch_seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ
-    
-    for block_seq_n in range(0, num_partitions, 1): # TODO 有 bug 需要修复
+
+    for block_seq_n in range(0, num_partitions, 1):  # TODO 有 bug 需要修复
         part_v = tl.load(part_v_ptrs + block_seq_n * mido_partitions_stride)
-        part_max = tl.load(part_max_ptrs + block_seq_n) # mido_les_partitions_stride = 1
+        part_max = tl.load(
+            part_max_ptrs + block_seq_n
+        )  # mido_les_partitions_stride = 1
 
         # -- 更新局部最大值 -- #
         m_ij = tl.maximum(part_max, m_i)
@@ -213,94 +258,121 @@ def _flash_decoding_stage2_kernel(
         m_i = m_ij
 
     # -- 更新 attention 输出累加器 -- #
-    offs_out = batch_pid * o_bs_stride + head_pid * o_heads_stride + offs_d * o_dim_stride
+    offs_out = (
+        batch_pid * o_bs_stride + head_pid * o_heads_stride + offs_d * o_dim_stride
+    )
     tl.store(Ouput + offs_out, acc / d_i)
 
+
 @torch.no_grad()
 def flash_decode_stage2(
-    mid_o, mid_o_logexpsum, # 存储每个批次、每个头、每个分区的中间分数输出及 log(sum(exp(scores)))
-	atten_output,           # attention 输出首地址
-	b_seq_len,  	        # kv cache 在 seq_len 维度的长度向量
-    PARTITION_SIZE
-):	
-	batchs, num_heads, HEAD_DIM = mid_o.shape[0], mid_o.shape[1], mid_o.shape[-1]
-	grid = (batchs, num_heads)
-	
-	_flash_decoding_stage2_kernel[grid](
-		mid_o,  	     # [batch, head, seq_block_num, head_dim]
-		mid_o_logexpsum, # [batch, head, seq_block_num]
-		atten_output,           # attention 输出首地址
-		*mid_o.stride(),
-		*mid_o_logexpsum.stride(),
-		*atten_output.stride(),
-		b_seq_len,   # TODO 支持 PagedAttention 和连续批处理
-		BLOCK_DMODEL = HEAD_DIM,
-		BLOCK_SEQ = PARTITION_SIZE, # type: ignore	
-		num_warps = 4,
-		num_stages = 2,
-	)
+    mid_o,
+    mid_o_logexpsum,  # 存储每个批次、每个头、每个分区的中间分数输出及 log(sum(exp(scores)))
+    atten_output,  # attention 输出首地址
+    b_seq_len,  # kv cache 在 seq_len 维度的长度向量
+    PARTITION_SIZE,
+):
+    batchs, num_heads, HEAD_DIM = mid_o.shape[0], mid_o.shape[1], mid_o.shape[-1]
+    grid = (batchs, num_heads)
+
+    _flash_decoding_stage2_kernel[grid](
+        mid_o,  # [batch, head, seq_block_num, head_dim]
+        mid_o_logexpsum,  # [batch, head, seq_block_num]
+        atten_output,  # attention 输出首地址
+        *mid_o.stride(),
+        *mid_o_logexpsum.stride(),
+        *atten_output.stride(),
+        b_seq_len,  # TODO 支持 PagedAttention 和连续批处理
+        BLOCK_DMODEL=HEAD_DIM,
+        BLOCK_SEQ=PARTITION_SIZE,  # type: ignore
+        num_warps=4,
+        num_stages=2,
+    )
+
 
 @torch.no_grad()
 def flash_decoding(
-    q, 			 # q 查询向量，形状为 [bsz, num_head, head_dim]
-    k_cache, v_cache, 	     # 键/值向量缓存，形状为 [max_tokens, kv_num_head, head_dim]
+    q,  # q 查询向量，形状为 [bsz, num_head, head_dim]
+    k_cache,
+    v_cache,  # 键/值向量缓存，形状为 [max_tokens, kv_num_head, head_dim]
     qk_scale,
-    b_req_tokens_table, b_seq_len, # start locations and sequence lengths for kv cache in a batch
-    max_actual_seq_len
+    b_req_tokens_table,
+    b_seq_len,  # start locations and sequence lengths for kv cache in a batch
+    max_actual_seq_len,
 ):
-	# q.view(-1, num_heads, head_dim)
-	assert q.shape[-1] == k_cache.shape[-1] == v_cache.shape[-1]
-	PARTITION_SIZE = 128  # 3090ti 显卡以上可设置为 256
-	batchs, num_heads, head_dim = q.shape # decode 阶段 q 的 seq_len = 1, 
+    # q.view(-1, num_heads, head_dim)
+    assert q.shape[-1] == k_cache.shape[-1] == v_cache.shape[-1]
+    PARTITION_SIZE = 128  # 3090ti 显卡以上可设置为 256
+    batchs, num_heads, head_dim = q.shape  # decode 阶段 q 的 seq_len = 1,
+
+    # 最大可用分区数量计算
+    max_num_partitions = (max_actual_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
+
+    # mid_o: 存储每个批次、每个头、每个分区的中间输出
+    mid_o = torch.empty(
+        (batchs, num_heads, max_num_partitions, head_dim),
+        dtype=torch.float32,
+        device=q.device,
+    )
+    # 存储每个批次、每个头、每个分区的 log(sum(exp(scores)))，用于后续 decode_stage2 的归一化
+    mid_o_logexpsum = torch.empty(
+        (batchs, num_heads, max_num_partitions), dtype=torch.float32, device=q.device
+    )
 
-	# 最大可用分区数量计算
-	max_num_partitions = (max_actual_seq_len + PARTITION_SIZE -1) // PARTITION_SIZE
+    # decode stage 1: attention in partitions
+    flash_decode_stage1(
+        q,
+        k_cache,
+        v_cache,
+        qk_scale,
+        b_req_tokens_table,
+        b_seq_len,
+        max_actual_seq_len,
+        mid_o,
+        mid_o_logexpsum,
+        PARTITION_SIZE,
+    )
 
-	# mid_o: 存储每个批次、每个头、每个分区的中间输出
-	mid_o = torch.empty((batchs, num_heads, max_num_partitions, head_dim), dtype=torch.float32, device=q.device)
-	# 存储每个批次、每个头、每个分区的 log(sum(exp(scores)))，用于后续 decode_stage2 的归一化
-	mid_o_logexpsum = torch.empty((batchs, num_heads, max_num_partitions), dtype=torch.float32, device=q.device)
+    # decode stage 2: reduction among partitions
+    atten_output = torch.empty_like(q)
 
-	# decode stage 1: attention in partitions
-	flash_decode_stage1(q, k_cache, v_cache, qk_scale, 
-                        b_req_tokens_table, b_seq_len, max_actual_seq_len, 
-                        mid_o, mid_o_logexpsum, PARTITION_SIZE
-                    )
-	
-	# decode stage 2: reduction among partitions
-	atten_output = torch.empty_like(q)
+    flash_decode_stage2(mid_o, mid_o_logexpsum, atten_output, b_seq_len, PARTITION_SIZE)
 
-	flash_decode_stage2(mid_o, mid_o_logexpsum, atten_output, b_seq_len, PARTITION_SIZE)
+    return atten_output
 
-	return atten_output
 
 # --------------------------------------
 # 标准 Attention Decode 实现（纯 PyTorch版）
 # --------------------------------------
 def _naive_attention(q, k, v):
     import math
+
     head_dim = q.shape[-1]
-    q = q.transpose(0, 1)  #(nhead, 1, head_dim)
-    k = k.transpose(0, 1)  #(nhead, seqlen, head_dim)
-    v = v.transpose(0, 1)  #(nhead, seqlen, head_dim)
+    q = q.transpose(0, 1)  # (nhead, 1, head_dim)
+    k = k.transpose(0, 1)  # (nhead, seqlen, head_dim)
+    v = v.transpose(0, 1)  # (nhead, seqlen, head_dim)
     scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(head_dim)
     scores = torch.nn.functional.softmax(scores.float(), dim=-1).to(q.dtype)
-    output = torch.matmul(scores, v).transpose(0, 1).contiguous() #(1, nhead, head_dim)
+    output = (
+        torch.matmul(scores, v).transpose(0, 1).contiguous()
+    )  # (1, nhead, head_dim)
     return output
 
+
 def torch_attention_with_kvcache(q, k_cache, v_cache, b_start_loc, b_seq_len):
     out = torch.empty_like(q)
     Z = q.shape[0]
     for i in range(Z):
         start = b_start_loc[i]
         end = start + b_seq_len[i]
-        q_i = q[i:i+1]            #(1, nhead, head_dim)
-        k_i = k_cache[start:end]  #(seqlen, nhead, head_dim)
-        v_i = v_cache[start:end]  #(seqlen, nhead, head_dim)
+        q_i = q[i : i + 1]  # (1, nhead, head_dim)
+        k_i = k_cache[start:end]  # (seqlen, nhead, head_dim)
+        v_i = v_cache[start:end]  # (seqlen, nhead, head_dim)
         o_i = _naive_attention(q_i, k_i, v_i)
-        out[i:i+1] = o_i
+        out[i : i + 1] = o_i
     return out
 
+
 # ----------------------------------
 # 性能对比及曲线绘制函数封装（含 Warm up）
 # ----------------------------------
@@ -308,19 +380,20 @@ def plot_performance_comparison(token_sizes, warmup_iterations=10, test_iteratio
     """
     对不同 token size 下的 Flash Decoding 与标准 Attention 的性能进行测试，
     并绘制性能对比曲线。
-    
+
     参数:
       token_sizes: list[int]，不同的 kv cache 长度
       warmup_iterations: int, 预热迭代次数
       test_iterations: int, 正式测试迭代次数
     """
     import matplotlib.pyplot as plt
-    device = torch.device('cuda')
+
+    device = torch.device("cuda")
     batch = 4
     num_heads = 32
     head_dim = 64
-    qk_scale = 1.0 / (head_dim ** 0.5)
-    q = torch.randn(batch*1, num_heads, head_dim, device=device)
+    qk_scale = 1.0 / (head_dim**0.5)
+    q = torch.randn(batch * 1, num_heads, head_dim, device=device)
 
     flash_times = []
     standard_times = []
@@ -329,21 +402,41 @@ def plot_performance_comparison(token_sizes, warmup_iterations=10, test_iteratio
         print(f"\n测试 token size: {tokens}")
         k_cache = torch.randn(batch * tokens, num_heads, head_dim, device=device)
         v_cache = torch.randn(batch * tokens, num_heads, head_dim, device=device)
-        b_req_tokens_table = torch.arange(0, tokens, device=device, dtype=torch.int32).repeat(batch, 1)
-        b_start_loc = torch.tensor([0, tokens, 2*tokens, 3*tokens], dtype=torch.int32, device="cuda") # batch = 4
+        b_req_tokens_table = torch.arange(
+            0, tokens, device=device, dtype=torch.int32
+        ).repeat(batch, 1)
+        b_start_loc = torch.tensor(
+            [0, tokens, 2 * tokens, 3 * tokens], dtype=torch.int32, device="cuda"
+        )  # batch = 4
         b_seq_len = torch.full((batch,), tokens, device=device, dtype=torch.int32)
         max_actual_seq_len = tokens
 
         # Warm up Flash Decoding 内核
         for _ in range(warmup_iterations):
-            _ = flash_decoding(q, k_cache, v_cache, qk_scale, b_req_tokens_table, b_seq_len, max_actual_seq_len)
+            _ = flash_decoding(
+                q,
+                k_cache,
+                v_cache,
+                qk_scale,
+                b_req_tokens_table,
+                b_seq_len,
+                max_actual_seq_len,
+            )
         # 测试 Flash Decoding
         torch.cuda.synchronize()
         flash_start = torch.cuda.Event(enable_timing=True)
         flash_end = torch.cuda.Event(enable_timing=True)
         flash_start.record()
         for _ in range(test_iterations):
-            _ = flash_decoding(q, k_cache, v_cache, qk_scale, b_req_tokens_table, b_seq_len, max_actual_seq_len)
+            _ = flash_decoding(
+                q,
+                k_cache,
+                v_cache,
+                qk_scale,
+                b_req_tokens_table,
+                b_seq_len,
+                max_actual_seq_len,
+            )
         flash_end.record()
         torch.cuda.synchronize()
         flash_avg = flash_start.elapsed_time(flash_end) / test_iterations
@@ -352,14 +445,18 @@ def plot_performance_comparison(token_sizes, warmup_iterations=10, test_iteratio
 
         # Warm up 标准 Attention
         for _ in range(warmup_iterations):
-            _ = torch_attention_with_kvcache(q, k_cache, v_cache, b_start_loc, b_seq_len)
+            _ = torch_attention_with_kvcache(
+                q, k_cache, v_cache, b_start_loc, b_seq_len
+            )
         # 测试标准 Attention
         torch.cuda.synchronize()
         std_start = torch.cuda.Event(enable_timing=True)
         std_end = torch.cuda.Event(enable_timing=True)
         std_start.record()
         for _ in range(test_iterations):
-            _ = torch_attention_with_kvcache(q, k_cache, v_cache, b_start_loc, b_seq_len)
+            _ = torch_attention_with_kvcache(
+                q, k_cache, v_cache, b_start_loc, b_seq_len
+            )
         std_end.record()
         torch.cuda.synchronize()
         std_avg = std_start.elapsed_time(std_end) / test_iterations
@@ -368,28 +465,29 @@ def plot_performance_comparison(token_sizes, warmup_iterations=10, test_iteratio
 
     # 绘制性能对比曲线
     plt.figure(figsize=(8, 6))
-    plt.plot(token_sizes, flash_times, marker='o', label='Flash Decoding')
-    plt.plot(token_sizes, standard_times, marker='o', label='Standard Attention')
-    plt.xlabel('Token Size (kv cache length)')
-    plt.ylabel('Average Time (ms)')
-    plt.title('Performance Comparison: Flash Decoding vs Standard Attention')
+    plt.plot(token_sizes, flash_times, marker="o", label="Flash Decoding")
+    plt.plot(token_sizes, standard_times, marker="o", label="Standard Attention")
+    plt.xlabel("Token Size (kv cache length)")
+    plt.ylabel("Average Time (ms)")
+    plt.title("Performance Comparison: Flash Decoding vs Standard Attention")
     plt.legend()
     plt.grid(True)
     plt.savefig("./flashdecoding_benchamrk.png")
 
+
 # -------------------------------
 # 验证输出和调用性能对比函数
 # -------------------------------
 def main():
     torch.manual_seed(0)
-    device = torch.device('cuda')
-    
+    device = torch.device("cuda")
+
     # 测试参数
     batch = 4
     num_heads = 32
     head_dim = 64
-    max_tokens = 2048 # 每个请求序列的最大 tokens 长度
-    qk_scale = 1.0 / (head_dim ** 0.5)
+    max_tokens = 2048  # 每个请求序列的最大 tokens 长度
+    qk_scale = 1.0 / (head_dim**0.5)
 
     # 构造测试数据：固定 q，k_cache, v_cache, b_req_tokens_table, b_seq_len
     # 输入张量 q/k/v 的形状为 [batch * seq_len, num_heads, head_dim], 形状是三维的，为了兼容 flash_decoding 内核
@@ -397,13 +495,23 @@ def main():
     k_cache = torch.randn(batch * max_tokens, num_heads, head_dim, device=device)
     v_cache = torch.randn(batch * max_tokens, num_heads, head_dim, device=device)
     # 构造每个请求的 kv tokens 分配的显存空间对应的显存块索引
-    b_req_tokens_table = torch.arange(0, max_tokens*batch, device=device, dtype=torch.int32).view(batch, max_tokens)
+    b_req_tokens_table = torch.arange(
+        0, max_tokens * batch, device=device, dtype=torch.int32
+    ).view(batch, max_tokens)
     b_seq_len = torch.full((batch,), max_tokens, device=device, dtype=torch.int32)
-    b_start_loc = torch.tensor([0, max_tokens, 2*max_tokens, 3*max_tokens], dtype=torch.int32, device="cuda") # batch = 4
-    
+    b_start_loc = torch.tensor(
+        [0, max_tokens, 2 * max_tokens, 3 * max_tokens],
+        dtype=torch.int32,
+        device="cuda",
+    )  # batch = 4
+
     # 单次验证 flash_decoding 输出形状及数值（与标准 Attention 接近）
-    flash_out = flash_decoding(q, k_cache, v_cache, qk_scale, b_req_tokens_table, b_seq_len, max_tokens)
-    standard_out = torch_attention_with_kvcache(q, k_cache, v_cache, b_start_loc, b_seq_len)
+    flash_out = flash_decoding(
+        q, k_cache, v_cache, qk_scale, b_req_tokens_table, b_seq_len, max_tokens
+    )
+    standard_out = torch_attention_with_kvcache(
+        q, k_cache, v_cache, b_start_loc, b_seq_len
+    )
     print("Flash Decoding output shape:", flash_out.shape)
     print("Standard Attention output shape:", standard_out.shape)
     if torch.allclose(flash_out, standard_out, atol=1e-3, rtol=1e-3):
@@ -411,11 +519,11 @@ def main():
     else:
         diff = (flash_out - standard_out).abs().max().item()
         print(f"验证失败：最大误差为 {diff:.4f}")
-    
+
     # 封装的性能对比曲线函数
     token_numbers = [64, 128, 256, 512, 1024, max_tokens]
     plot_performance_comparison(token_numbers, warmup_iterations=10, test_iterations=50)
 
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/lite_llama/kernels/others/activation_layers.py b/lite_llama/kernels/others/activation_layers.py
index 43d7e32..fbb91d3 100644
--- a/lite_llama/kernels/others/activation_layers.py
+++ b/lite_llama/kernels/others/activation_layers.py
@@ -19,6 +19,7 @@
 from packaging import version
 from torch import Tensor, nn
 
+
 class PytorchGELUTanh(nn.Module):
     """
     A fast C implementation of the tanh approximation of the GeLU activation function. See
@@ -47,7 +48,17 @@ class NewGELUActivation(nn.Module):
     """
 
     def forward(self, input: Tensor) -> Tensor:
-        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))
+        return (
+            0.5
+            * input
+            * (
+                1.0
+                + torch.tanh(
+                    math.sqrt(2.0 / math.pi)
+                    * (input + 0.044715 * torch.pow(input, 3.0))
+                )
+            )
+        )
 
 
 class GELUActivation(nn.Module):
@@ -78,7 +89,14 @@ class FastGELUActivation(nn.Module):
     """
 
     def forward(self, input: Tensor) -> Tensor:
-        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
+        return (
+            0.5
+            * input
+            * (
+                1.0
+                + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))
+            )
+        )
 
 
 class QuickGELUActivation(nn.Module):
@@ -128,7 +146,16 @@ def __init__(self):
         self.precomputed_constant = math.sqrt(2 / math.pi)
 
     def forward(self, input: Tensor) -> Tensor:
-        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))
+        return (
+            0.5
+            * input
+            * (
+                1
+                + torch.tanh(
+                    self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))
+                )
+            )
+        )
 
 
 class MishActivation(nn.Module):
@@ -220,7 +247,9 @@ def get_activation(activation_string):
     if activation_string in ACT2FN:
         return ACT2FN[activation_string]
     else:
-        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
+        raise KeyError(
+            f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
+        )
 
 
 # For backwards compatibility with: from activations import gelu_python
@@ -231,4 +260,4 @@ def get_activation(activation_string):
 quick_gelu = get_activation("quick_gelu")
 silu = get_activation("silu")
 mish = get_activation("mish")
-linear_act = get_activation("linear")
\ No newline at end of file
+linear_act = get_activation("linear")
diff --git a/lite_llama/kernels/others/context_flashattention_nopad.py b/lite_llama/kernels/others/context_flashattention_nopad.py
index 8983891..e158459 100644
--- a/lite_llama/kernels/others/context_flashattention_nopad.py
+++ b/lite_llama/kernels/others/context_flashattention_nopad.py
@@ -74,19 +74,30 @@ def _fwd_kernel(
     acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
 
     block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
-    block_end_loc = tl.minimum(block_start_loc + BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len)
+    block_end_loc = tl.minimum(
+        block_start_loc + BLOCK_M + prompt_cache_len,
+        cur_batch_seq_len + prompt_cache_len,
+    )
 
     # causal mask
     for start_n in range(0, block_mask * block_end_loc, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         kv_loc = tl.load(
-            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + stride_req_to_tokens_s * (start_n + offs_n),
+            Req_to_tokens
+            + stride_req_to_tokens_b * cur_batch_req_idx
+            + stride_req_to_tokens_s * (start_n + offs_n),
             mask=(start_n + offs_n) < block_end_loc,
             other=0,
         )
-        off_k = kv_loc[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd
-        k = tl.load(K + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0)
+        off_k = (
+            kv_loc[None, :] * stride_kbs
+            + cur_kv_head * stride_kh
+            + offs_d[:, None] * stride_kd
+        )
+        k = tl.load(
+            K + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0
+        )
         qk = tl.dot(q, k)
 
         mask = offs_m[:, None] + prompt_cache_len >= (start_n + offs_n[None, :])
@@ -102,8 +113,14 @@ def _fwd_kernel(
         # -- update output accumulator --
         acc = acc * alpha[:, None]
         # update acc
-        off_v = kv_loc[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd
-        v = tl.load(V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0)
+        off_v = (
+            kv_loc[:, None] * stride_vbs
+            + cur_kv_head * stride_vh
+            + offs_d[None, :] * stride_vd
+        )
+        v = tl.load(
+            V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0
+        )
         p = p.to(v.dtype)
         acc = tl.dot(p, v, acc)
         # update m_i and l_i
@@ -121,7 +138,16 @@ def _fwd_kernel(
 
 @torch.no_grad()
 def context_attention_fwd(
-    q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
+    q,
+    k,
+    v,
+    o,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    b_prompt_cache_len,
+    max_input_len,
+    req_to_token_indexs,
 ):
     BLOCK_M = 128 if not TESLA else 64
     # shape constraints
@@ -131,7 +157,7 @@ def context_attention_fwd(
 
     # 计算scale系数, 并乘以 1/log(2) = 1.4426950408889634,
     # 算子内部使用 tl.math.exp2 来使计算与标准attention等价。
-    sm_scale = 1.0 / (Lq ** 0.5) * 1.4426950408889634
+    sm_scale = 1.0 / (Lq**0.5) * 1.4426950408889634
     batch, head = b_seq_len.shape[0], q.shape[1]
     kv_group_num = q.shape[1] // k.shape[1]
 
@@ -224,8 +250,16 @@ def _fwd_kernel_no_prompt_cache(
         + cur_head * stride_qh
         + offs_d[None, :] * stride_qd
     )
-    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] * stride_kd
-    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] * stride_vd
+    off_k = (
+        offs_n[None, :] * stride_kbs
+        + cur_kv_head * stride_kh
+        + offs_d[:, None] * stride_kd
+    )
+    off_v = (
+        offs_n[:, None] * stride_vbs
+        + cur_kv_head * stride_vh
+        + offs_d[None, :] * stride_vd
+    )
 
     q = tl.load(Q + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0)
 
@@ -285,7 +319,9 @@ def _fwd_kernel_no_prompt_cache(
 
 
 @torch.no_grad()
-def context_attention_fwd_no_prompt_cache(q, k, v, b_start_loc, b_seq_len, max_input_len):
+def context_attention_fwd_no_prompt_cache(
+    q, k, v, b_start_loc, b_seq_len, max_input_len
+):
     o = torch.empty_like(q)
     BLOCK_M = 128 if not TESLA else 64
     # shape constraints
@@ -295,7 +331,7 @@ def context_attention_fwd_no_prompt_cache(q, k, v, b_start_loc, b_seq_len, max_i
 
     # 计算scale系数, 并乘以 1/log(2) = 1.4426950408889634,
     # 算子内部使用 tl.math.exp2 来使计算与标准attention等价。
-    sm_scale = 1.0 / (Lq ** 0.5) * 1.4426950408889634
+    sm_scale = 1.0 / (Lq**0.5) * 1.4426950408889634
     batch, head = b_seq_len.shape[0], q.shape[1]
     kv_group_num = q.shape[1] // k.shape[1]
 
@@ -394,7 +430,10 @@ def _fwd_kernel_int8kv(
     acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
 
     block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
-    block_end_loc = tl.minimum(block_start_loc + BLOCK_M + prompt_cache_len, cur_batch_seq_len + prompt_cache_len)
+    block_end_loc = tl.minimum(
+        block_start_loc + BLOCK_M + prompt_cache_len,
+        cur_batch_seq_len + prompt_cache_len,
+    )
     # causal mask
     for start_n in range(0, block_mask * block_end_loc, BLOCK_N):
         start_n = tl.multiple_of(start_n, BLOCK_N)
@@ -410,7 +449,9 @@ def _fwd_kernel_int8kv(
             + cur_kv_head * stride_kh
             + offs_d[:, None] * stride_kd
         )
-        k = tl.load(K + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0)
+        k = tl.load(
+            K + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0
+        )
 
         qk = tl.dot(q, k)
         mask = (offs_m[:, None] + prompt_cache_len) >= (start_n + offs_n[None, :])
@@ -437,7 +478,9 @@ def _fwd_kernel_int8kv(
             + cur_kv_head * stride_vh
             + offs_d[None, :] * stride_vd
         )
-        v = tl.load(V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0)
+        v = tl.load(
+            V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0
+        )
 
         p = p.to(v.dtype)
         acc = tl.dot(p, v, acc)
@@ -455,7 +498,9 @@ def _fwd_kernel_int8kv(
 
 
 @torch.no_grad()
-def context_attention_fwd_ppl_int8kv(q, k, v, o, b_start_loc, b_seq_len, max_input_len, b_prompt_cache_len):
+def context_attention_fwd_ppl_int8kv(
+    q, k, v, o, b_start_loc, b_seq_len, max_input_len, b_prompt_cache_len
+):
     BLOCK_M = 128 if not TESLA else 64
     # shape constraints
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
@@ -464,7 +509,7 @@ def context_attention_fwd_ppl_int8kv(q, k, v, o, b_start_loc, b_seq_len, max_inp
 
     # 计算scale系数, 并乘以 1/log(2) = 1.4426950408889634,
     # 算子内部使用 tl.math.exp2 来使计算与标准attention等价。
-    sm_scale = 1.0 / (Lq ** 0.5) * 1.4426950408889634
+    sm_scale = 1.0 / (Lq**0.5) * 1.4426950408889634
     batch, head = b_seq_len.shape[0], q.shape[1]
     kv_group_num = q.shape[1] // k.shape[1]
 
@@ -506,8 +551,17 @@ def context_attention_fwd_ppl_int8kv(q, k, v, o, b_start_loc, b_seq_len, max_inp
     )
 
 
-def torch_context_attention_fwd(q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, req_to_token_indexs):
-
+def torch_context_attention_fwd(
+    q,
+    k,
+    v,
+    o,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    b_prompt_cache_len,
+    req_to_token_indexs,
+):
     batch = b_start_loc.shape[0]
     print(q.shape)
     for i in range(batch):
@@ -532,7 +586,9 @@ def torch_context_attention_fwd(q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b
 
         dk = cur_q.shape[-1]
 
-        p = torch.matmul(cur_q, cur_k.transpose(-2, -1)) / torch.sqrt(torch.tensor(dk, dtype=torch.float32))
+        p = torch.matmul(cur_q, cur_k.transpose(-2, -1)) / torch.sqrt(
+            torch.tensor(dk, dtype=torch.float32)
+        )
 
         q_index = torch.arange(cur_q.shape[1]).unsqueeze(-1).to(p.device)
         k_index = torch.arange(cur_k.shape[1]).unsqueeze(0).to(p.device)
@@ -543,7 +599,9 @@ def torch_context_attention_fwd(q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b
 
         s = F.softmax(p, dim=-1)
 
-        o[start_loc : start_loc + seq_len - prompt_cache_len, :, :] = torch.matmul(s, cur_v).transpose(0, 1)
+        o[start_loc : start_loc + seq_len - prompt_cache_len, :, :] = torch.matmul(
+            s, cur_v
+        ).transpose(0, 1)
 
 
 def test():
@@ -553,14 +611,24 @@ def test():
     Z, H, N_CTX, D_HEAD = 16, 16, 2048, 128
     dtype = torch.float16
     prompt_cache_len = 128
-    q = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    k = torch.empty((Z * N_CTX, H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2)
-    v = torch.empty((Z * N_CTX, H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
-    o = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
-    torch_o = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(
+    q = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.1, std=0.2)
+    k = torch.empty((Z * N_CTX, H, D_HEAD), dtype=dtype, device="cuda").normal_(
+        mean=0.4, std=0.2
+    )
+    v = torch.empty((Z * N_CTX, H, D_HEAD), dtype=dtype, device="cuda").normal_(
         mean=0.3, std=0.2
     )
-    req_to_token_indexs = torch.empty((1000, N_CTX + 7000), dtype=torch.int32, device="cuda")
+    o = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.3, std=0.2)
+    torch_o = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.3, std=0.2)
+    req_to_token_indexs = torch.empty(
+        (1000, N_CTX + 7000), dtype=torch.int32, device="cuda"
+    )
     max_input_len = N_CTX
     b_start_loc = torch.zeros((Z,), dtype=torch.int32, device="cuda")
     b_seq_len = torch.ones((Z,), dtype=torch.int32, device="cuda")
@@ -570,12 +638,22 @@ def test():
     for i in range(Z):
         b_seq_len[i] = N_CTX
         b_req_idx[i] = i
-        req_to_token_indexs[i][:N_CTX] = torch.tensor(np.arange(N_CTX), dtype=torch.int32).cuda() + N_CTX * i
+        req_to_token_indexs[i][:N_CTX] = (
+            torch.tensor(np.arange(N_CTX), dtype=torch.int32).cuda() + N_CTX * i
+        )
         if i != 0:
             b_start_loc[i] = b_start_loc[i - 1] + N_CTX - prompt_cache_len
         b_prompt_cache_len[i] = prompt_cache_len
     torch_context_attention_fwd(
-        q, k, v, torch_o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, req_to_token_indexs
+        q,
+        k,
+        v,
+        torch_o,
+        b_req_idx,
+        b_start_loc,
+        b_seq_len,
+        b_prompt_cache_len,
+        req_to_token_indexs,
     )
 
     import time
@@ -584,7 +662,16 @@ def test():
     a = time.time()
     for i in range(1000):
         context_attention_fwd(
-            q, k, v, o, b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, req_to_token_indexs
+            q,
+            k,
+            v,
+            o,
+            b_req_idx,
+            b_start_loc,
+            b_seq_len,
+            b_prompt_cache_len,
+            max_input_len,
+            req_to_token_indexs,
         )
     torch.cuda.synchronize()
     b = time.time()
@@ -596,8 +683,9 @@ def test():
     assert torch.allclose(torch_o, o, atol=1e-2, rtol=0)
 
 
-def torch_context_attention_fwd2(q, k, v, o, b_start_loc, b_seq_len, b_prompt_cache_len):
-
+def torch_context_attention_fwd2(
+    q, k, v, o, b_start_loc, b_seq_len, b_prompt_cache_len
+):
     batch = b_start_loc.shape[0]
     k = k.transpose(1, 2)
     v = v.transpose(1, 2)
@@ -618,7 +706,9 @@ def torch_context_attention_fwd2(q, k, v, o, b_start_loc, b_seq_len, b_prompt_ca
         cur_v = cur_v.transpose(0, 1)
         dk = cur_q.shape[-1]
 
-        p = torch.matmul(cur_q, cur_k.transpose(-2, -1)) / torch.sqrt(torch.tensor(dk, dtype=torch.float32))
+        p = torch.matmul(cur_q, cur_k.transpose(-2, -1)) / torch.sqrt(
+            torch.tensor(dk, dtype=torch.float32)
+        )
 
         q_index = torch.arange(cur_q.shape[1]).unsqueeze(-1).to(p.device)
         k_index = torch.arange(cur_k.shape[1]).unsqueeze(0).to(p.device)
@@ -629,7 +719,9 @@ def torch_context_attention_fwd2(q, k, v, o, b_start_loc, b_seq_len, b_prompt_ca
 
         s = F.softmax(p, dim=-1)
 
-        o[start_loc : start_loc + seq_len - prompt_cache_len, :, :] = torch.matmul(s, cur_v).transpose(0, 1)
+        o[start_loc : start_loc + seq_len - prompt_cache_len, :, :] = torch.matmul(
+            s, cur_v
+        ).transpose(0, 1)
 
 
 def test2():
@@ -639,15 +731,21 @@ def test2():
     Z, H, N_CTX, D_HEAD = 16, 16, 2048, 128
     dtype = torch.float16
     prompt_cache_len = 0
-    q = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-    kv = torch.empty((Z, 2 * H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2)
+    q = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.1, std=0.2)
+    kv = torch.empty((Z, 2 * H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(
+        mean=0.4, std=0.2
+    )
     k = kv[:, :H]
     v = kv[:, H:]
     # v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
-    o = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2)
-    torch_o = torch.empty((Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda").normal_(
-        mean=0.3, std=0.2
-    )
+    o = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.3, std=0.2)
+    torch_o = torch.empty(
+        (Z * (N_CTX - prompt_cache_len), H, D_HEAD), dtype=dtype, device="cuda"
+    ).normal_(mean=0.3, std=0.2)
     max_input_len = N_CTX
     b_start_loc = torch.zeros((Z,), dtype=torch.int32, device="cuda")
     b_seq_len = torch.ones((Z,), dtype=torch.int32, device="cuda")
@@ -658,14 +756,18 @@ def test2():
         if i != 0:
             b_start_loc[i] = b_start_loc[i - 1] + N_CTX - prompt_cache_len
         b_prompt_cache_len[i] = prompt_cache_len
-    torch_context_attention_fwd2(q, k, v, torch_o, b_start_loc, b_seq_len, b_prompt_cache_len)
+    torch_context_attention_fwd2(
+        q, k, v, torch_o, b_start_loc, b_seq_len, b_prompt_cache_len
+    )
 
     import time
 
     torch.cuda.synchronize()
     a = time.time()
     for i in range(1000):
-        context_attention_fwd_ppl_int8kv(q, k, v, o, b_start_loc, b_seq_len, max_input_len, b_prompt_cache_len)
+        context_attention_fwd_ppl_int8kv(
+            q, k, v, o, b_start_loc, b_seq_len, max_input_len, b_prompt_cache_len
+        )
     torch.cuda.synchronize()
     b = time.time()
     # print(o.shape, torch_out.shape)
diff --git a/lite_llama/kernels/others/fused_linear.py b/lite_llama/kernels/others/fused_linear.py
index 2b17475..3b3fb57 100644
--- a/lite_llama/kernels/others/fused_linear.py
+++ b/lite_llama/kernels/others/fused_linear.py
@@ -15,6 +15,7 @@
 def tanh(x):
     return 2 * tl.sigmoid(2 * x) - 1
 
+
 @triton.jit
 def gelu_new(x):
     pi = tl.constexpr(tl.float32(math.pi))
@@ -22,56 +23,64 @@ def gelu_new(x):
     b = x + 0.044715 * x * x * x
     return 0.5 * x * (1.0 + tanh(a * b))
 
+
 @triton.jit
 def silu(x):
     return x * tl.sigmoid(x)
 
+
 @triton.jit
 def _fused_linear_kernel_fwd(
-    x_ptr,   # 输入数据矩阵首元素指针
-    w_ptr,   # 权重矩阵首元素指针
-    z_ptr,   # 输出结果地址
-    M, N, K, # Matrix dimensions
+    x_ptr,  # 输入数据矩阵首元素指针
+    w_ptr,  # 权重矩阵首元素指针
+    z_ptr,  # 输出结果地址
+    M,
+    N,
+    K,  # Matrix dimensions
     b_ptr=None,
     r_ptr=None,
-    apply_silu=False, # gelu 激活和 dropout
+    apply_silu=False,  # gelu 激活和 dropout
     seed=1337,
     BLOCK_SIZE_M: tl.constexpr = 128,  # 块大小
-    BLOCK_SIZE_N: tl.constexpr = 128, 
+    BLOCK_SIZE_N: tl.constexpr = 128,
     BLOCK_SIZE_K: tl.constexpr = 64,
 ):
     # 当前 kernel 在 M/N 方向的程序 id
-    pid_m = tl.program_id(0) # 二维内核允许在行（M）和列（N）两个方向上并行计算，极大地提高了计算效率。
+    pid_m = tl.program_id(
+        0
+    )  # 二维内核允许在行（M）和列（N）两个方向上并行计算，极大地提高了计算效率。
     pid_n = tl.program_id(1)
-    
+
     # 计算行列索引偏移，offs_m: 当前块负责的行索引，形状为 (BLOCK_SIZE_M, 1)。
     offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)[:, None]
-    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)[None, :] # 形状为 (1, BLOCK_SIZE_N)。
-    
+    offs_n = (
+        pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)[None, :]
+    )  # 形状为 (1, BLOCK_SIZE_N)。
+
     # 子块的矩阵乘法
     z = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, K, BLOCK_SIZE_K):
-        x_k = tl.arange(0, BLOCK_SIZE_K)[None,:] + k
+        x_k = tl.arange(0, BLOCK_SIZE_K)[None, :] + k
         # (BLOCK_SIZE_M, BLOCK_SIZE_K)
         x = tl.load(x_ptr + offs_m * K + x_k, mask=(offs_m < M) & (x_k < K), other=0.0)
         x = x.to(tl.float16)
-        
+
         w_k = tl.arange(0, BLOCK_SIZE_K)[:, None] + k
         # (BLOCK_SIZE_K, BLOCK_SIZE_N)
         w = tl.load(w_ptr + w_k * N + offs_n, mask=(w_k < K) & (offs_n < N), other=0.0)
         w = w.to(tl.float16)
-        
+
         # (BLOCK_SIZE_M, BLOCK_SIZE_N)
         z = tl.dot(x, w, acc=z)
-    
+
     if b_ptr is not None:
         b = tl.load(b_ptr + offs_n, mask=(offs_n < N), other=0.0)
         z += b.to(tl.float32)
     # (1, BLOCK_SIZE_N)
-    
+
     z_offset = offs_m * N + offs_n
     z_mask = (offs_m < M) & (offs_n < N)
-    
+
     if apply_silu:
         z = silu(z)
 
@@ -81,12 +90,13 @@ def _fused_linear_kernel_fwd(
 
     tl.store(z_ptr + z_offset, z, mask=z_mask)
 
+
 @torch.no_grad()
 def fused_linear(
     x,
     weight,
     bias=None,
-    residual=None, # 残差输入项
+    residual=None,  # 残差输入项
     add_silu=False,
 ):
     """
@@ -101,7 +111,7 @@ def fused_linear(
     x = x.view((-1, x.shape[-1]))
     M, K = x.shape
     N = weight.shape[1]
-    
+
     # Allocates output.
     z = torch.empty((M, N), device=x.device, dtype=x.dtype)
 
@@ -115,18 +125,20 @@ def fused_linear(
     if residual is not None:
         residual = residual.view(z.shape)
         assert residual.is_contiguous()
-        
+
     BLOCK_SIZE_M = 64
     BLOCK_SIZE_N = 64
     BLOCK_SIZE_K = 32
-    
+
     # 2D launch kernel where each block gets its own program.
     grid = (triton.cdiv(M, BLOCK_SIZE_M), triton.cdiv(N, BLOCK_SIZE_N), 1)
     _fused_linear_kernel_fwd[grid](
-        x, 
-        weight, 
+        x,
+        weight,
         z,
-        M, N, K,
+        M,
+        N,
+        K,
         apply_silu=add_silu,
         b_ptr=bias,
         r_ptr=residual,
@@ -135,4 +147,3 @@ def fused_linear(
         BLOCK_SIZE_K=BLOCK_SIZE_K,
     )
     return z.view((*out_shape_0, N))
-   
\ No newline at end of file
diff --git a/lite_llama/kernels/others/layernorm.py b/lite_llama/kernels/others/layernorm.py
index 095b78b..9867449 100644
--- a/lite_llama/kernels/others/layernorm.py
+++ b/lite_llama/kernels/others/layernorm.py
@@ -13,72 +13,61 @@ def _layernorm_kernel_fwd(
     BLOCK_SIZE: tl.constexpr = 16,
 ):
     row_idx = tl.program_id(0)
-    x_row_ptr = x_ptr + row_idx * H # 一行 H 个元素，H 表示嵌入层大小
+    x_row_ptr = x_ptr + row_idx * H  # 一行 H 个元素，H 表示嵌入层大小
     z_row_ptr = z_ptr + row_idx * H
-    
+
     # 1, compute mean
     _sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
     for i in range(0, H, BLOCK_SIZE):
         col_offsets = i + tl.arange(0, BLOCK_SIZE)
-        x = tl.load(x_row_ptr + col_offsets, mask = col_offsets < H)
+        x = tl.load(x_row_ptr + col_offsets, mask=col_offsets < H)
         _sum += x.to(tl.float32)
-    
+
     mean = tl.sum(_sum, axis=0) / H
-    
+
     # 2, compute variance
     x_var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
     for i in range(0, H, BLOCK_SIZE):
         col_offsets = i + tl.arange(0, BLOCK_SIZE)
-        x = tl.load(x_row_ptr + col_offsets, mask = col_offsets < H).to(tl.float32)
-        x = tl.where(col_offsets < H, x - mean, 0.)
+        x = tl.load(x_row_ptr + col_offsets, mask=col_offsets < H).to(tl.float32)
+        x = tl.where(col_offsets < H, x - mean, 0.0)
         x_var += x * x
-    
+
     x_var = tl.sum(x_var, axis=0) / H
     rtsd = tl.sqrt(x_var + eps)
-    
+
     # 3, compute ln(x_i)
     for i in range(0, H, BLOCK_SIZE):
         col_offsets = i + tl.arange(0, BLOCK_SIZE)
         mask = col_offsets < H
-        x = tl.load(x_row_ptr + col_offsets, mask = mask)
-        w = tl.load(weight_ptr + col_offsets, mask = mask)
+        x = tl.load(x_row_ptr + col_offsets, mask=mask)
+        w = tl.load(weight_ptr + col_offsets, mask=mask)
         b = tl.load(bias_ptr + col_offsets)
-        
+
         x_hat = (x - mean) / rtsd
         z = x_hat * w + b
         tl.store(z_row_ptr + col_offsets, z, mask=mask)
-     
+
+
 @torch.no_grad()
-def layernorm(
-    x,
-    weight,
-    bias,
-    eps=1e-5
-):
+def layernorm(x, weight, bias, eps=1e-5):
     # 只针对 nlp 领域的 layernorm，省去了 normalized_shape 参数
     assert x.is_contiguous()
     assert weight.is_contiguous()
     assert bias.is_contiguous()
-    
+
     assert x.shape[-1] == weight.shape[0] == bias.shape[0]
     out_shape = x.shape
-    x = x.view(-1, x.shape[-1]) # if: [B, L, H] then -> [B*L, H]
+    x = x.view(-1, x.shape[-1])  # if: [B, L, H] then -> [B*L, H]
     BL, H = x.shape
     z = torch.empty(x.shape, device=x.device, dtype=x.dtype)
-    
+
     # Less than 64KB per feature: enqueue fused kernel
     MAX_FUSED_SIZE = 4096 // x.element_size()
     BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(H))
     num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
-    
+
     _layernorm_kernel_fwd[BL,](
-        x,
-        weight,
-        bias,
-        z,
-        H, 
-        eps,
-        BLOCK_SIZE,
-        num_warps=num_warps
-    )  
-    return z.view(out_shape)
\ No newline at end of file
+        x, weight, bias, z, H, eps, BLOCK_SIZE, num_warps=num_warps
+    )
+    return z.view(out_shape)
diff --git a/lite_llama/kernels/others/rmsnorm_layer.py b/lite_llama/kernels/others/rmsnorm_layer.py
index dc53271..3da3272 100644
--- a/lite_llama/kernels/others/rmsnorm_layer.py
+++ b/lite_llama/kernels/others/rmsnorm_layer.py
@@ -23,6 +23,7 @@
 else:
     from triton.language.math import rsqrt
 
+
 @triton.jit
 def _rms_norm_forward_kernel(
     Y_ptr,
@@ -55,7 +56,7 @@ def _rms_norm_forward_kernel(
     X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
     X_row_dtype = X_row.dtype
     W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)
-    X_row = X_row.to(tl.float32) # On Llama, only rstd is computed on fp32
+    X_row = X_row.to(tl.float32)  # On Llama, only rstd is computed on fp32
 
     mean_square = tl.sum(X_row * X_row, axis=0) / n_cols
     rstd = rsqrt(mean_square + eps)
@@ -69,7 +70,6 @@ def _rms_norm_forward_kernel(
 
 @torch.no_grad()
 def rmsnorm_fwd(X, W, eps=1e-5, offset=0.0):
-
     shape = X.shape
     X = X.view(-1, shape[-1])
     n_rows, n_cols = X.shape
@@ -78,9 +78,9 @@ def rmsnorm_fwd(X, W, eps=1e-5, offset=0.0):
     Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
 
     # Check constraints.
-    assert (
-        X.shape[1] == W.shape[0]
-    ), "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
+    assert X.shape[1] == W.shape[0], (
+        "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
+    )
 
     _rms_norm_forward_kernel[(n_rows,)](
         Y,
@@ -99,36 +99,42 @@ def rmsnorm_fwd(X, W, eps=1e-5, offset=0.0):
 
 
 def test_rms_layernorm(
-    dim = 1024, eps = 1e-5, dtype = torch.float16,
-    bsz = 21, random_state = 3407, seqlen = 3341,
+    dim=1024,
+    eps=1e-5,
+    dtype=torch.float16,
+    bsz=21,
+    random_state=3407,
+    seqlen=3341,
 ):
     from transformers.models.llama.modeling_llama import LlamaRMSNorm
-    layernorm = LlamaRMSNorm((dim,), eps = eps).to("cuda")
+
+    layernorm = LlamaRMSNorm((dim,), eps=eps).to("cuda")
     torch.cuda.manual_seed(random_state)
     torch.manual_seed(random_state)
     torch.nn.init.uniform_(layernorm.weight)
-    X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
+    X = torch.randn((bsz, seqlen, dim), dtype=dtype, device="cuda")
     Y = layernorm(X)
     Y2 = rmsnorm_fwd(X, layernorm.weight, eps)
 
-    assert(torch.amax(Y - Y2).item() <= 0.05)
+    assert torch.amax(Y - Y2).item() <= 0.05
     print("max delta:", torch.max(torch.abs(Y - Y2)))
 
 
 def testing_suite_layernorm():
     for dim in [512, 1024, 2048]:
         for dtype in [torch.float16, torch.bfloat16]:
-            with torch.autocast(device_type = "cuda", dtype = dtype):
+            with torch.autocast(device_type="cuda", dtype=dtype):
                 for seqlen in [3341, 2048, 349]:
                     for random_state in [3407, 42]:
                         test_rms_layernorm(
-                            dim = dim,
-                            eps = 1e-5,
-                            dtype = dtype,
-                            bsz = 21,
-                            random_state = random_state,
-                            seqlen = seqlen,
+                            dim=dim,
+                            eps=1e-5,
+                            dtype=dtype,
+                            bsz=21,
+                            random_state=random_state,
+                            seqlen=seqlen,
                         )
 
+
 if __name__ == "__main__":
-    testing_suite_layernorm()
\ No newline at end of file
+    testing_suite_layernorm()
diff --git a/lite_llama/kernels/others/rmsnorm_v1.py b/lite_llama/kernels/others/rmsnorm_v1.py
index 7bd89f9..293e0d0 100644
--- a/lite_llama/kernels/others/rmsnorm_v1.py
+++ b/lite_llama/kernels/others/rmsnorm_v1.py
@@ -1,107 +1,114 @@
 # modified from https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
-import triton,torch, os
-import triton.language as tl 
+import triton, torch, os
+import triton.language as tl
 from ..utils import calculate_settings
 
+
 @triton.jit
 def _rmsnorm_kernel_fwd(
-    x_ptr, # shape is [M, K]
-    w_ptr, # gamma 参数地址
-    z_ptr, # 输出结果首元素指针
-    K,     # 权重 W 大小, 也是输入 X 的第二维度大小
-    eps,   # epsilon to avoid division by zero
+    x_ptr,  # shape is [M, K]
+    w_ptr,  # gamma 参数地址
+    z_ptr,  # 输出结果首元素指针
+    K,  # 权重 W 大小, 也是输入 X 的第二维度大小
+    eps,  # epsilon to avoid division by zero
     BLOCK_SIZE: tl.constexpr = 8,
 ):
     """z = (x / (rms)) * w"""
     row_idx = tl.program_id(0)
-    x_row_ptr = x_ptr + row_idx * K # 一行有 K 个元素，K 是最后一维
+    x_row_ptr = x_ptr + row_idx * K  # 一行有 K 个元素，K 是最后一维
     z_row_ptr = z_ptr + row_idx * K
-    
+
     # Compute variance
     _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
     for col_index in range(0, K, BLOCK_SIZE):
         col_offsets = col_index + tl.arange(0, BLOCK_SIZE)
         x_ptrs = x_row_ptr + col_offsets
-        
-        x = tl.load(x_ptrs, mask = col_offsets < K, other=0.0).to(tl.float32)
-        _var += x*x
-    
+
+        x = tl.load(x_ptrs, mask=col_offsets < K, other=0.0).to(tl.float32)
+        _var += x * x
+
     var = tl.sum(_var, axis=0) / K
-    rsqrt =  1 / tl.sqrt(var + eps)
-    
+    rsqrt = 1 / tl.sqrt(var + eps)
+
     # Normalize and apply rmsnorm
     for col_index in range(0, K, BLOCK_SIZE):
         col_offsets = col_index + tl.arange(0, BLOCK_SIZE)
         mask = col_offsets < K
-        
-        x = tl.load(x_row_ptr + col_offsets, mask = mask, other=0.0).to(tl.float32)
-        w = tl.load(w_ptr + col_offsets, mask = mask).to(tl.float32)
-        
+
+        x = tl.load(x_row_ptr + col_offsets, mask=mask, other=0.0).to(tl.float32)
+        w = tl.load(w_ptr + col_offsets, mask=mask).to(tl.float32)
+
         normed = x * rsqrt
-        normed = normed.to(w.dtype) # Exact copy from HF
+        normed = normed.to(w.dtype)  # Exact copy from HF
         z = normed * w
-        tl.store(z_row_ptr + col_offsets, z.to(z.dtype), mask = mask)
-        
+        tl.store(z_row_ptr + col_offsets, z.to(z.dtype), mask=mask)
+
+
 @torch.no_grad()
-def rmsnorm(
-    x,
-    weight,
-    eps = 1e-5
-):
-    z = torch.empty_like(x) # z 是三维的, [B, L, K]
+def rmsnorm(x, weight, eps=1e-5):
+    z = torch.empty_like(x)  # z 是三维的, [B, L, K]
     out_shape = x.shape
-    x = x.view((-1, x.shape[-1])) # 将 x 的所有维度压缩为二维张量, [B, L, K] -> [M, K], K 是隐藏层的维度。
+    x = x.view(
+        (-1, x.shape[-1])
+    )  # 将 x 的所有维度压缩为二维张量, [B, L, K] -> [M, K], K 是隐藏层的维度。
     M, K = x.shape
-    
+
     # Less than 64KB per feature: enqueue fused kernel
-    # MAX_FUSED_SIZE = 65536 // x.element_size() # 用于返回张量中单个元素的大小（以字节为单位）。 
+    # MAX_FUSED_SIZE = 65536 // x.element_size() # 用于返回张量中单个元素的大小（以字节为单位）。
     BLOCK_SIZE, num_warps = calculate_settings(K)
     if K > BLOCK_SIZE:
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
     # heuristics for number of warps
     num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
-    _rmsnorm_kernel_fwd[M, ](
+    _rmsnorm_kernel_fwd[M,](
         x,
         weight,
         z,
-        K, 
+        K,
         eps=eps,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=num_warps, 
-    )  
+        num_warps=num_warps,
+    )
     return z.view(out_shape)
 
+
 def test_rms_layernorm(
-    dim = 1024, eps = 1e-5, dtype = torch.float16,
-    bsz = 21, random_state = 3407, seqlen = 3341,
+    dim=1024,
+    eps=1e-5,
+    dtype=torch.float16,
+    bsz=21,
+    random_state=3407,
+    seqlen=3341,
 ):
     from transformers.models.llama.modeling_llama import LlamaRMSNorm
-    layernorm = LlamaRMSNorm((dim,), eps = eps).to("cuda")
+
+    layernorm = LlamaRMSNorm((dim,), eps=eps).to("cuda")
     torch.cuda.manual_seed(random_state)
     torch.manual_seed(random_state)
     torch.nn.init.uniform_(layernorm.weight)
-    X = torch.randn((bsz, seqlen, dim), dtype = dtype, device = "cuda")
+    X = torch.randn((bsz, seqlen, dim), dtype=dtype, device="cuda")
     Y = layernorm(X)
     Y2 = rmsnorm(X, layernorm.weight, eps)
 
-    assert(torch.amax(Y - Y2).item() <= 0.05)
+    assert torch.amax(Y - Y2).item() <= 0.05
     print("max delta:", torch.max(torch.abs(Y - Y2)))
 
 
 def testing_suite_layernorm():
     for dim in [512, 1024, 2048]:
         for dtype in [torch.float16, torch.bfloat16]:
-            with torch.autocast(device_type = "cuda", dtype = dtype):
+            with torch.autocast(device_type="cuda", dtype=dtype):
                 for seqlen in [3341, 2048, 349]:
                     for random_state in [3407, 42]:
                         test_rms_layernorm(
-                            dim = dim,
-                            eps = 1e-5,
-                            dtype = dtype,
-                            bsz = 21,
-                            random_state = random_state,
-                            seqlen = seqlen,
+                            dim=dim,
+                            eps=1e-5,
+                            dtype=dtype,
+                            bsz=21,
+                            random_state=random_state,
+                            seqlen=seqlen,
                         )
 
+
 if __name__ == "__main__":
-    testing_suite_layernorm()
\ No newline at end of file
+    testing_suite_layernorm()
diff --git a/lite_llama/kernels/others/rope_orig.py b/lite_llama/kernels/others/rope_orig.py
index fcbd71e..e3c2517 100644
--- a/lite_llama/kernels/others/rope_orig.py
+++ b/lite_llama/kernels/others/rope_orig.py
@@ -5,11 +5,22 @@
 import triton.language as tl
 from typing import Tuple, Union
 
+
 @triton.jit
-def rope_kernel_fw(input_ptr, in_seq_len_stride, in_batch_stride,
-                   output_ptr, cos_ptr, sin_ptr, cos_stride, sin_stride,
-                   seq_len, head_dim,
-                   BLOCK_SIZE: tl.constexpr, BATCH_NUM: tl.constexpr):
+def rope_kernel_fw(
+    input_ptr,
+    in_seq_len_stride,
+    in_batch_stride,
+    output_ptr,
+    cos_ptr,
+    sin_ptr,
+    cos_stride,
+    sin_stride,
+    seq_len,
+    head_dim,
+    BLOCK_SIZE: tl.constexpr,
+    BATCH_NUM: tl.constexpr,
+):
     pid_seq = tl.program_id(axis=0)
     pid_head = tl.program_id(axis=1)
 
@@ -25,10 +36,19 @@ def rope_kernel_fw(input_ptr, in_seq_len_stride, in_batch_stride,
     sin = tl.load(sin_ptr + sin_offset, mask=mask, other=0.0)
 
     for batch_idx in tl.static_range(0, BATCH_NUM):
-        x1_offset = pid_seq * in_seq_len_stride + batch_idx * \
-            in_batch_stride + pid_head * head_dim + head_dim_offset
-        x2_offset = pid_seq * in_seq_len_stride + batch_idx * in_batch_stride + \
-            pid_head * head_dim + head_dim_mid + head_dim_offset
+        x1_offset = (
+            pid_seq * in_seq_len_stride
+            + batch_idx * in_batch_stride
+            + pid_head * head_dim
+            + head_dim_offset
+        )
+        x2_offset = (
+            pid_seq * in_seq_len_stride
+            + batch_idx * in_batch_stride
+            + pid_head * head_dim
+            + head_dim_mid
+            + head_dim_offset
+        )
 
         x1 = tl.load(input_ptr + x1_offset, mask=mask, other=0.0)
         x2 = tl.load(input_ptr + x2_offset, mask=mask, other=0.0)
@@ -54,10 +74,10 @@ def rope(
         raise ValueError(f"Unsupported tensor_format: {tensor_format}.")
 
     seq_len, batch_num, head_num, head_dim = t.shape
-    assert t.device.type == 'cuda', "Input tensor t must be on CUDA device"
-    assert freqs.device.type == 'cuda', "Input tensor freqs must be on CUDA device"
+    assert t.device.type == "cuda", "Input tensor t must be on CUDA device"
+    assert freqs.device.type == "cuda", "Input tensor freqs must be on CUDA device"
 
-    output = torch.empty_like(t, device='cuda')
+    output = torch.empty_like(t, device="cuda")
 
     BLOCK_SIZE = triton.next_power_of_2(head_dim // 2)
 
@@ -67,25 +87,30 @@ def rope(
     cos = torch.cos(freqs).to(t.dtype)
     sin = torch.sin(freqs).to(t.dtype)
 
-    rope_kernel_fw[grid](t,
-                        t.stride(0),
-                        t.stride(1),
-                        output,
-                        cos,
-                        sin,
-                        cos.stride(0),
-                        sin.stride(0),
-                        seq_len,
-                        head_dim,
-                        BLOCK_SIZE,
-                        batch_num)
+    rope_kernel_fw[grid](
+        t,
+        t.stride(0),
+        t.stride(1),
+        output,
+        cos,
+        sin,
+        cos.stride(0),
+        sin.stride(0),
+        seq_len,
+        head_dim,
+        BLOCK_SIZE,
+        batch_num,
+    )
 
     if tensor_format == "bshd":
         return output.transpose(0, 1)
-    
+
     return output.to("cuda")
 
-def compute_theta(dim: int, base: float = 10000.0, device: torch.device = torch.device('cuda')) -> torch.Tensor:
+
+def compute_theta(
+    dim: int, base: float = 10000.0, device: torch.device = torch.device("cuda")
+) -> torch.Tensor:
     """
     计算旋转位置编码中的 Theta 角度值。
 
@@ -99,19 +124,30 @@ def compute_theta(dim: int, base: float = 10000.0, device: torch.device = torch.
     """
     if dim % 2 != 0:
         print("嵌入维度 dim 必须为偶数")
-    i = torch.arange(1, (dim//2) + 1, dtype=torch.float32, device=device)
-    theta_i = base ** (-2*(i - 1) / dim)
+    i = torch.arange(1, (dim // 2) + 1, dtype=torch.float32, device=device)
+    theta_i = base ** (-2 * (i - 1) / dim)
 
     return theta_i
 
-def precompute_freqs_cis(dim: int, seq_len: int, base: float = 10000.0, device: torch.device = torch.device('cuda')):
-    theta = compute_theta(dim, base, device) # theta 角度值序列，向量, 大小为 dim // 2
-    m = torch.arange(seq_len, device=device) # token 位置值序列，向量，大小为 seq_len
-    m_theta = torch.outer(m, theta) # 所有 token 位置的所有 Theta 值范围, 矩阵，尺寸为 [seq_len, dim // 2]
-    freqs_cis = torch.polar(torch.ones_like(m_theta), m_theta) # e^{i*m*\theta}，本质上是旋转矩阵
+
+def precompute_freqs_cis(
+    dim: int,
+    seq_len: int,
+    base: float = 10000.0,
+    device: torch.device = torch.device("cuda"),
+):
+    theta = compute_theta(dim, base, device)  # theta 角度值序列，向量, 大小为 dim // 2
+    m = torch.arange(seq_len, device=device)  # token 位置值序列，向量，大小为 seq_len
+    m_theta = torch.outer(
+        m, theta
+    )  # 所有 token 位置的所有 Theta 值范围, 矩阵，尺寸为 [seq_len, dim // 2]
+    freqs_cis = torch.polar(
+        torch.ones_like(m_theta), m_theta
+    )  # e^{i*m*\theta}，本质上是旋转矩阵
 
     return freqs_cis
 
+
 def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """同一组的 kv cache 复制多份"""
     batch_size, seq_len, num_kv_heads, head_dim = x.shape
diff --git a/lite_llama/kernels/others/rotary_emb_v1.py b/lite_llama/kernels/others/rotary_emb_v1.py
index 3745776..6ff6765 100644
--- a/lite_llama/kernels/others/rotary_emb_v1.py
+++ b/lite_llama/kernels/others/rotary_emb_v1.py
@@ -2,6 +2,7 @@
 import triton
 import triton.language as tl
 
+
 @triton.jit
 def _rotary_kernel(
     Q,
@@ -45,30 +46,49 @@ def _rotary_kernel(
         + dim_range1[None, None, :] * stride_qd
     )
 
-    off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
+    off_dimcos_sin = (
+        cur_seq_range[:, None, None] * stride_cosbs
+        + dim_range0[None, None, :] * stride_cosd
+    )
 
     q0 = tl.load(
         Q + off_q0,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_Q),
         other=0.0,
     )
     q1 = tl.load(
         Q + off_q1,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_Q),
         other=0.0,
     )
 
-    cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
-    sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
+    cos = tl.load(
+        Cos + off_dimcos_sin,
+        mask=cur_seq_range[:, None, None] < max_total_len,
+        other=0.0,
+    )
+    sin = tl.load(
+        Sin + off_dimcos_sin,
+        mask=cur_seq_range[:, None, None] < max_total_len,
+        other=0.0,
+    )
 
     out0 = q0 * cos - q1 * sin
     out1 = q0 * sin + q1 * cos
 
     tl.store(
-        Q + off_q0, out0, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q)
+        Q + off_q0,
+        out0,
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_Q),
     )
     tl.store(
-        Q + off_q1, out1, mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_Q)
+        Q + off_q1,
+        out1,
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_Q),
     )
 
     off_k0 = (
@@ -82,20 +102,33 @@ def _rotary_kernel(
         + dim_range1[None, None, :] * stride_kd
     )
 
-    off_dimcos_sin = cur_seq_range[:, None, None] * stride_cosbs + dim_range0[None, None, :] * stride_cosd
+    off_dimcos_sin = (
+        cur_seq_range[:, None, None] * stride_cosbs
+        + dim_range0[None, None, :] * stride_cosd
+    )
 
     k0 = tl.load(
         K + off_k0,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_K),
         other=0.0,
     )
     k1 = tl.load(
         K + off_k1,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_K),
+        other=0.0,
+    )
+    cos = tl.load(
+        Cos + off_dimcos_sin,
+        mask=cur_seq_range[:, None, None] < max_total_len,
+        other=0.0,
+    )
+    sin = tl.load(
+        Sin + off_dimcos_sin,
+        mask=cur_seq_range[:, None, None] < max_total_len,
         other=0.0,
     )
-    cos = tl.load(Cos + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
-    sin = tl.load(Sin + off_dimcos_sin, mask=cur_seq_range[:, None, None] < max_total_len, other=0.0)
 
     out_k0 = k0 * cos - k1 * sin
     out_k1 = k0 * sin + k1 * cos
@@ -103,23 +136,29 @@ def _rotary_kernel(
     tl.store(
         K + off_k0,
         out_k0,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_K),
     )
     tl.store(
         K + off_k1,
         out_k1,
-        mask=(cur_seq_range[:, None, None] < max_total_len) & (cur_head_range[None, :, None] < HEAD_K),
+        mask=(cur_seq_range[:, None, None] < max_total_len)
+        & (cur_head_range[None, :, None] < HEAD_K),
     )
     return
 
 
 @torch.no_grad()
-def rotary_emb_fwd(q, k, cos, sin, partial_rotary_factor=1.):
+def rotary_emb_fwd(q, k, cos, sin, partial_rotary_factor=1.0):
     total_len = q.shape[0]
     head_num_q, head_num_k = q.shape[1], k.shape[1]
     head_dim = int(q.shape[2] * partial_rotary_factor)
-    assert q.shape[0] == cos.shape[0] and q.shape[0] == sin.shape[0], f"q shape {q.shape} cos shape {cos.shape}"
-    assert k.shape[0] == cos.shape[0] and k.shape[0] == sin.shape[0], f"k shape {k.shape} cos shape {cos.shape}"
+    assert q.shape[0] == cos.shape[0] and q.shape[0] == sin.shape[0], (
+        f"q shape {q.shape} cos shape {cos.shape}"
+    )
+    assert k.shape[0] == cos.shape[0] and k.shape[0] == sin.shape[0], (
+        f"k shape {k.shape} cos shape {cos.shape}"
+    )
 
     BLOCK_SEQ = 64
     BLOCK_HEAD = 4
@@ -155,12 +194,13 @@ def rotary_emb_fwd(q, k, cos, sin, partial_rotary_factor=1.):
     )
     return q, k
 
+
 def torch_rotary_emb(x, cos, sin):
     seq_len, h, d = x.shape
     # cos, sin 的形状为 (seq_len, d//2)
     half_dim = cos.shape[-1]
     x0 = x[:, :, :half_dim]
-    x1 = x[:, :, half_dim: 2*half_dim]
+    x1 = x[:, :, half_dim : 2 * half_dim]
 
     cos = cos.view(seq_len, 1, half_dim)
     sin = sin.view(seq_len, 1, half_dim)
@@ -169,23 +209,24 @@ def torch_rotary_emb(x, cos, sin):
     o1 = x0 * sin + x1 * cos
 
     if 2 * half_dim < d:
-        out = torch.cat([o0, o1, x[:, :, 2*half_dim:]], dim=-1)
+        out = torch.cat([o0, o1, x[:, :, 2 * half_dim :]], dim=-1)
     else:
         out = torch.cat([o0, o1], dim=-1)
 
     return out
 
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     batch_tokens = 24800
     x_shape = (batch_tokens, 32, 64)  # (seq_len, num_heads, head_dim)
     dtype = torch.float16
-    q = torch.randn(x_shape, dtype=dtype, device='cuda')
+    q = torch.randn(x_shape, dtype=dtype, device="cuda")
     k = torch.clone(q)
 
     # 生成 cos 和 sin，与 head_dim 对应，这里 head_dim=64，因此 cos, sin=(seq_len, head_dim//2)=(128,32)
-    cos_shape = (batch_tokens, 32)  
-    y = torch.randn(cos_shape, dtype=dtype, device='cuda')
+    cos_shape = (batch_tokens, 32)
+    y = torch.randn(cos_shape, dtype=dtype, device="cuda")
     cos = y.cos()
     sin = y.sin()
 
@@ -193,6 +234,8 @@ def torch_rotary_emb(x, cos, sin):
     q_out, k_out = rotary_emb_fwd(q, k, cos, sin)
     print(output_torch)
     print(q_out)
-    print(f'The maximum difference between torch and triton is {torch.max(torch.abs(output_torch - q_out))}')
-    print('torch:', triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
-    print('triton:', triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
\ No newline at end of file
+    print(
+        f"The maximum difference between torch and triton is {torch.max(torch.abs(output_torch - q_out))}"
+    )
+    print("torch:", triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
+    print("triton:", triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
diff --git a/lite_llama/kernels/rope_emb.py b/lite_llama/kernels/rope_emb.py
index e7fe21a..4e5e075 100644
--- a/lite_llama/kernels/rope_emb.py
+++ b/lite_llama/kernels/rope_emb.py
@@ -57,16 +57,24 @@ def _triton_rope_emb(
         tl.arange(0, pad_hd // 2)[None, :] < hd // 2
     )
 
-    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(sin_row.dtype)
-    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(sin_row.dtype)
+    q_tile_1 = tl.load(q_ptr + first_half_q_offsets, mask=first_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_1 = tl.load(k_ptr + first_half_k_offsets, mask=first_k_mask, other=0).to(
+        sin_row.dtype
+    )
 
     second_half_q_offsets = first_half_q_offsets + (hd // 2)
     second_half_k_offsets = first_half_k_offsets + (hd // 2)
     second_q_mask = first_q_mask
     second_k_mask = first_k_mask
 
-    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(sin_row.dtype)
-    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(sin_row.dtype)
+    q_tile_2 = tl.load(q_ptr + second_half_q_offsets, mask=second_q_mask, other=0).to(
+        sin_row.dtype
+    )
+    k_tile_2 = tl.load(k_ptr + second_half_k_offsets, mask=second_k_mask, other=0).to(
+        sin_row.dtype
+    )
 
     new_q_tile_1 = q_tile_1 * cos_row - q_tile_2 * sin_row
     tl.store(q_ptr + first_half_q_offsets, new_q_tile_1, mask=first_q_mask)
@@ -78,6 +86,7 @@ def _triton_rope_emb(
     new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
     tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
 
+
 def rope_emb_forward(q, k, cos, sin, batch_size, seq_len):
     """
     q: (batch_size * seq_len, n_q_heads, head_dim)
diff --git a/lite_llama/kernels/skip_rmsnorm.py b/lite_llama/kernels/skip_rmsnorm.py
index c44ac5c..7c9b5cf 100644
--- a/lite_llama/kernels/skip_rmsnorm.py
+++ b/lite_llama/kernels/skip_rmsnorm.py
@@ -7,13 +7,25 @@
 import triton.language as tl
 from .utils import calculate_settings
 
+
 @triton.jit
 def skip_rms_norm_kernel_no_view(
-    Y_ptr, X_ptr, R_ptr, W_ptr,
-    B, S, N,
-    x_stride_b, x_stride_s, x_stride_n,
-    r_stride_b, r_stride_s, r_stride_n,
-    y_stride_b, y_stride_s, y_stride_n,
+    Y_ptr,
+    X_ptr,
+    R_ptr,
+    W_ptr,
+    B,
+    S,
+    N,
+    x_stride_b,
+    x_stride_s,
+    x_stride_n,
+    r_stride_b,
+    r_stride_s,
+    r_stride_n,
+    y_stride_b,
+    y_stride_s,
+    y_stride_n,
     w_stride,
     eps,
     has_residual: tl.constexpr,
@@ -23,23 +35,23 @@ def skip_rms_norm_kernel_no_view(
     pid = tl.program_id(0)
     batch_idx = pid // S
     seq_idx = pid % S
-    
+
     X_ptr = X_ptr + batch_idx * x_stride_b + seq_idx * x_stride_s
     Y_ptr = Y_ptr + batch_idx * y_stride_b + seq_idx * y_stride_s
     # R_ptr只有在has_residual为True时才使用
-    
+
     cols = tl.arange(0, BLOCK_SIZE)
     mask = cols < N
 
     x = tl.load(X_ptr + cols * x_stride_n, mask=mask, other=0.0).to(tl.float32)
-    
+
     # 当有residual时，加载并加上r，然后回写r
     if has_residual:
         R_ptr = R_ptr + batch_idx * r_stride_b + seq_idx * r_stride_s
         r = tl.load(R_ptr + cols * r_stride_n, mask=mask, other=0.0).to(tl.float32)
         x = x + r
         tl.store(R_ptr + cols * r_stride_n, x, mask=mask)
-    
+
     var = tl.sum(x * x, axis=0) / N
     rrms = 1.0 / tl.sqrt(var + eps)
 
@@ -48,6 +60,7 @@ def skip_rms_norm_kernel_no_view(
 
     tl.store(Y_ptr + cols * y_stride_n, y, mask=mask)
 
+
 @torch.no_grad()
 def skip_rmsnorm_no_view(X, residual, weight, eps=1e-5):
     # 假设X: [B, S, N]
@@ -67,27 +80,40 @@ def skip_rmsnorm_no_view(X, residual, weight, eps=1e-5):
     else:
         # 如果 residual 是 None，则在kernel中不处理residual
         # 这里给r_stride_*赋默认值，但不会使用
-        r_stride_b, r_stride_s, r_stride_n = 0,0,0
+        r_stride_b, r_stride_s, r_stride_n = 0, 0, 0
         has_residual = False
 
     BLOCK_SIZE = triton.next_power_of_2(N)
     grid = (B * S,)
 
     skip_rms_norm_kernel_no_view[grid](
-        Y, X, residual if residual is not None else X,  # 若无residual，这里传X只是占位，kernel中不使用R_ptr
+        Y,
+        X,
+        residual
+        if residual is not None
+        else X,  # 若无residual，这里传X只是占位，kernel中不使用R_ptr
         weight,
-        B, S, N,
-        x_stride_b, x_stride_s, x_stride_n,
-        r_stride_b, r_stride_s, r_stride_n,
-        y_stride_b, y_stride_s, y_stride_n,
+        B,
+        S,
+        N,
+        x_stride_b,
+        x_stride_s,
+        x_stride_n,
+        r_stride_b,
+        r_stride_s,
+        r_stride_n,
+        y_stride_b,
+        y_stride_s,
+        y_stride_n,
         w_stride,
         eps,
         has_residual=has_residual,
-        BLOCK_SIZE=BLOCK_SIZE
+        BLOCK_SIZE=BLOCK_SIZE,
     )
 
     return (Y, residual) if residual is not None else (Y, X)
 
+
 @triton.jit()
 def rms_norm_kernel(
     Y,  # pointer to the output
@@ -116,6 +142,7 @@ def rms_norm_kernel(
     y = (x * rrms).to(Y.dtype.element_ty) * w
     tl.store(Y + cols * y_stride_c, y, mask=mask)
 
+
 @triton.jit()
 def skip_rms_norm_kernel(
     Y,  # pointer to the output
@@ -152,30 +179,46 @@ def skip_rms_norm_kernel(
     y = (x * rrms).to(Y.dtype.element_ty) * w
     tl.store(Y + cols * y_stride_c, y, mask=mask)
 
+
 @torch.no_grad()
 def skip_rmsnorm(X, residual, weight, eps=1e-5):
     orig_shape = X.shape
     X = X.view(-1, orig_shape[-1])
 
-    M, N = X.shape # n_rows, n_cols
+    M, N = X.shape  # n_rows, n_cols
     BLOCK_SIZE, num_warps = calculate_settings(N)
     Y = torch.empty_like(X)
 
     if residual is not None:
         residual = residual.view(-1, N)
         skip_rms_norm_kernel[M,](
-            Y, X, residual, weight, 
-            N, 1, N, 1, N, 1, N, 
-            eps, 
+            Y,
+            X,
+            residual,
+            weight,
+            N,
+            1,
+            N,
+            1,
+            N,
+            1,
+            N,
+            eps,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=num_warps,
         )
         return Y.view(orig_shape), residual.view(orig_shape)
     else:
         rms_norm_kernel[M,](
-            Y, X, weight, 
-            N, 1, N, 1, N, 
-            eps, 
+            Y,
+            X,
+            weight,
+            N,
+            1,
+            N,
+            1,
+            N,
+            eps,
             BLOCK_SIZE=BLOCK_SIZE,
             num_warps=num_warps,
         )
@@ -185,12 +228,14 @@ def skip_rmsnorm(X, residual, weight, eps=1e-5):
 import pytest
 import time
 
+
 def python_rmsnorm(x, w, eps=1e-5):
     # x: (B, N)
     var = x.pow(2).mean(dim=-1, keepdim=True)
     x_normed = x / torch.sqrt(var + eps)
     return x_normed * w
 
+
 def python_skip_rmsnorm(x, r, w, eps=1e-5):
     # x, r: (B, N)
     x = x + r
@@ -198,33 +243,47 @@ def python_skip_rmsnorm(x, r, w, eps=1e-5):
     x_normed = x / torch.sqrt(var + eps)
     return (x_normed * w).half(), x.half()
 
-@pytest.mark.parametrize("batch_size, N, hidden_size", [(4, 128, 4096), (2, 256, 4096), (8, 1024, 4096)])
+
+@pytest.mark.parametrize(
+    "batch_size, N, hidden_size", [(4, 128, 4096), (2, 256, 4096), (8, 1024, 4096)]
+)
 def test_rmsnorm(batch_size, N, hidden_size):
-    x = torch.randn(batch_size, N, hidden_size, device='cuda', dtype=torch.float16)
-    w = torch.randn(hidden_size, device='cuda', dtype=torch.float16)
+    x = torch.randn(batch_size, N, hidden_size, device="cuda", dtype=torch.float16)
+    w = torch.randn(hidden_size, device="cuda", dtype=torch.float16)
 
     y_ref = python_rmsnorm(x.float(), w.float()).half()
-    y_triton, triton_residual = skip_rmsnorm(x, None, w)  # 不传residual，就走rms_norm_kernel分支
+    y_triton, triton_residual = skip_rmsnorm(
+        x, None, w
+    )  # 不传residual，就走rms_norm_kernel分支
 
-    assert torch.allclose(y_ref, y_triton, atol=1e-3, rtol=1e-3), "RMSNorm results do not match"
+    assert torch.allclose(y_ref, y_triton, atol=1e-3, rtol=1e-3), (
+        "RMSNorm results do not match"
+    )
 
-@pytest.mark.parametrize("batch_size, N, hidden_size", [(4, 128, 4096), (2, 256, 4096), (8, 1024, 4096)])
+
+@pytest.mark.parametrize(
+    "batch_size, N, hidden_size", [(4, 128, 4096), (2, 256, 4096), (8, 1024, 4096)]
+)
 def test_skip_rmsnorm(batch_size, N, hidden_size):
-    x = torch.randn(batch_size, N, hidden_size, device='cuda', dtype=torch.float16)
-    r = torch.randn(batch_size, N, hidden_size, device='cuda', dtype=torch.float16)
-    w = torch.randn(hidden_size, device='cuda', dtype=torch.float16)
+    x = torch.randn(batch_size, N, hidden_size, device="cuda", dtype=torch.float16)
+    r = torch.randn(batch_size, N, hidden_size, device="cuda", dtype=torch.float16)
+    w = torch.randn(hidden_size, device="cuda", dtype=torch.float16)
 
     y_ref, py_residual = python_skip_rmsnorm(x.float(), r.float(), w.float())
     y_triton, triton_residual = skip_rmsnorm(x, r, w)
 
-    assert torch.allclose(y_ref, y_triton, atol=1e-3, rtol=1e-3), "Skip RMSNorm results do not match"
-    assert torch.allclose(py_residual, triton_residual, atol=1e-3, rtol=1e-3), "Skip RMSNorm residual results do not match"
+    assert torch.allclose(y_ref, y_triton, atol=1e-3, rtol=1e-3), (
+        "Skip RMSNorm results do not match"
+    )
+    assert torch.allclose(py_residual, triton_residual, atol=1e-3, rtol=1e-3), (
+        "Skip RMSNorm residual results do not match"
+    )
 
 
 def benchmark_skip_rmsnorm(batch_size, N, iters=1000):
-    x = torch.randn(batch_size, N, device='cuda', dtype=torch.float16)
-    r = torch.randn(batch_size, N, device='cuda', dtype=torch.float16)
-    w = torch.randn(N, device='cuda', dtype=torch.float16)
+    x = torch.randn(batch_size, N, device="cuda", dtype=torch.float16)
+    r = torch.randn(batch_size, N, device="cuda", dtype=torch.float16)
+    w = torch.randn(N, device="cuda", dtype=torch.float16)
     torch.cuda.synchronize()
     start = time.time()
     for _ in range(iters):
@@ -232,15 +291,16 @@ def benchmark_skip_rmsnorm(batch_size, N, iters=1000):
     torch.cuda.synchronize()
     end = time.time()
     avg_time = (end - start) / iters
-    print(f"skip_rmsnorm: B={batch_size}, N={N}, avg_time={avg_time*1e3:.3f} ms/iter")
+    print(f"skip_rmsnorm: B={batch_size}, N={N}, avg_time={avg_time * 1e3:.3f} ms/iter")
+
 
 # 假设原始函数名为 rmsnorm_original
 def benchmark(func, shapes, warmup=10, iters=50):
     times = []
     for shape in shapes:
-        X = torch.randn(shape, dtype=torch.float16, device='cuda')
-        R = torch.randn(shape, device='cuda', dtype=torch.float16)
-        W = torch.randn(shape[-1], dtype=torch.float16, device='cuda')
+        X = torch.randn(shape, dtype=torch.float16, device="cuda")
+        R = torch.randn(shape, device="cuda", dtype=torch.float16)
+        W = torch.randn(shape[-1], dtype=torch.float16, device="cuda")
         # warmup
         for _ in range(warmup):
             _ = func(X, R, W)
@@ -250,13 +310,15 @@ def benchmark(func, shapes, warmup=10, iters=50):
             _ = func(X, R, W)
         torch.cuda.synchronize()
         end = time.time()
-        avg_time = (end - start)/iters
+        avg_time = (end - start) / iters
         times.append(avg_time)
     return times
 
+
 if __name__ == "__main__":
     import time
     import matplotlib.pyplot as plt
+
     # 示例运行
     shapes = [(16, 2048, 4096), (32, 2048, 4096), (64, 2048, 4096), (256, 2048, 4096)]
     original_times = benchmark(skip_rmsnorm, shapes)
@@ -264,11 +326,11 @@ def benchmark(func, shapes, warmup=10, iters=50):
 
     plt.figure(figsize=(8, 5))
     x_axis = [s[0] * s[1] for s in shapes]
-    plt.plot(x_axis, original_times, color = "red", label='Original')
-    plt.plot(x_axis, optimized_times, color = "blue", label='Optimized')
-    plt.xlabel('Batch * Seq (M dimension)')
-    plt.ylabel('Time (s)')
-    plt.title('RMSNorm Kernel Performance Comparison')
+    plt.plot(x_axis, original_times, color="red", label="Original")
+    plt.plot(x_axis, optimized_times, color="blue", label="Optimized")
+    plt.xlabel("Batch * Seq (M dimension)")
+    plt.ylabel("Time (s)")
+    plt.title("RMSNorm Kernel Performance Comparison")
     plt.legend()
     plt.grid(True)
     plt.savefig("./skip_rmsnorm_benchmark.png")
diff --git a/lite_llama/kernels/softmax_split.py b/lite_llama/kernels/softmax_split.py
index 53a5dda..b17d06e 100644
--- a/lite_llama/kernels/softmax_split.py
+++ b/lite_llama/kernels/softmax_split.py
@@ -4,6 +4,7 @@
 from triton import language as tl
 import torch
 
+
 @triton.jit
 def logsumexp_kernel(
     out_ptr,
@@ -28,18 +29,22 @@ def logsumexp_kernel(
     output_ptrs = out_ptr + pid_m * num_programs_n + pid_n
     tl.store(output_ptrs, logz)
 
+
 @triton.jit
 def combine_logsumexp_kernel(out_ptr, inp_ptr, M, N, TILE_N: tl.constexpr):
     pid_m = tl.program_id(0)
     n_offsets = tl.arange(0, TILE_N)
     mask = n_offsets < N
-    logzs = tl.load(inp_ptr + pid_m * N + n_offsets, other=-float("inf"), mask=mask).to(out_ptr.dtype.element_ty)
+    logzs = tl.load(inp_ptr + pid_m * N + n_offsets, other=-float("inf"), mask=mask).to(
+        out_ptr.dtype.element_ty
+    )
     m = tl.max(logzs, 0)
     e = tl.exp(logzs - m)
     z = tl.sum(e, 0)
     logz = m + tl.log(z)
     tl.store(out_ptr + pid_m, logz)
 
+
 @triton.jit
 def softmax_kernel(out_ptr, in_ptr, logz_ptr, M, N, TILE_N: tl.constexpr):
     pid_n = tl.program_id(0)
@@ -47,7 +52,9 @@ def softmax_kernel(out_ptr, in_ptr, logz_ptr, M, N, TILE_N: tl.constexpr):
     n_offsets = pid_n * TILE_N + tl.arange(0, TILE_N)
     offset = pid_m * N + n_offsets
     mask = n_offsets < N
-    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(out_ptr.dtype.element_ty)
+    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(
+        out_ptr.dtype.element_ty
+    )
     logz = tl.load(logz_ptr + pid_m).to(tl.float32)
     out = tl.exp(inp - logz)
     tl.store(out_ptr + offset, out, mask=mask)
@@ -64,7 +71,7 @@ def softmax_split(x):
     grid = (num_tiles_n, M, 1)
     logsumexp_kernel[grid](logz, x, M, N, TILE_N)
 
-    combined_logz = torch.empty((M, ), dtype=x.dtype, device=x.device)
+    combined_logz = torch.empty((M,), dtype=x.dtype, device=x.device)
     TILE_N = triton.next_power_of_2(num_tiles_n)
     grid = (M, 1, 1)
     combine_logsumexp_kernel[grid](combined_logz, logz, M, num_tiles_n, TILE_N)
diff --git a/lite_llama/kernels/swiglu.py b/lite_llama/kernels/swiglu.py
index b19f9ce..6e386ac 100644
--- a/lite_llama/kernels/swiglu.py
+++ b/lite_llama/kernels/swiglu.py
@@ -5,9 +5,11 @@
 import triton.language as tl
 import functools
 
+
 def is_hip() -> bool:
     return torch.version.hip is not None
 
+
 def ensure_contiguous(fn):
     @functools.wraps(fn)
     def wrapper(ctx, *args, **kwargs):
@@ -41,6 +43,7 @@ def calculate_settings(n):
         num_warps = 8
     return BLOCK_SIZE, num_warps
 
+
 @triton.jit
 def silu(x):
     return x * tl.sigmoid(x)
@@ -68,7 +71,7 @@ def _swiglu_forward_kernel(
 
 
 def swiglu_forward(a, b):
-    ori_shape = a.shape # ori_shape is [batch_size, seq_len, hidden_size]
+    ori_shape = a.shape  # ori_shape is [batch_size, seq_len, hidden_size]
 
     n_cols = ori_shape[-1]
     a = a.view(-1, n_cols)
@@ -82,7 +85,7 @@ def swiglu_forward(a, b):
         a,
         b,
         c,
-        c.stride(-2), # c.stride(-2) = n_cols
+        c.stride(-2),  # c.stride(-2) = n_cols
         n_cols=n_cols,
         BLOCK_SIZE=BLOCK_SIZE,
         num_warps=num_warps,
diff --git a/lite_llama/kernels/update_kv_buffer.py b/lite_llama/kernels/update_kv_buffer.py
index 74ea1a0..e89f54c 100644
--- a/lite_llama/kernels/update_kv_buffer.py
+++ b/lite_llama/kernels/update_kv_buffer.py
@@ -6,13 +6,18 @@
 
 @triton.jit
 def _fwd_kernel_update_kv(
-    KV_Values, Select_Index,
+    KV_Values,
+    Select_Index,
     KV_Buffer,
-    stride_k_bs, stride_k_h, stride_k_d,
-    stride_o_bs, stride_o_h, stride_o_d,
+    stride_k_bs,
+    stride_k_h,
+    stride_k_d,
+    stride_o_bs,
+    stride_o_h,
+    stride_o_d,
     head_num,
     BLOCK_DMODEL: tl.constexpr,
-    BLOCK_HEAD: tl.constexpr
+    BLOCK_HEAD: tl.constexpr,
 ):
     cur_index = tl.program_id(0)
     offs_h = tl.arange(0, BLOCK_HEAD)
@@ -20,8 +25,18 @@ def _fwd_kernel_update_kv(
 
     dest_index = tl.load(Select_Index + cur_index)
 
-    k_ptrs = KV_Values + cur_index * stride_k_bs + stride_k_h * offs_h[:, None] + stride_k_d * offs_d[None, :]
-    o_ptrs = KV_Buffer + dest_index * stride_o_bs + stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
+    k_ptrs = (
+        KV_Values
+        + cur_index * stride_k_bs
+        + stride_k_h * offs_h[:, None]
+        + stride_k_d * offs_d[None, :]
+    )
+    o_ptrs = (
+        KV_Buffer
+        + dest_index * stride_o_bs
+        + stride_o_h * offs_h[:, None]
+        + stride_o_d * offs_d[None, :]
+    )
 
     kv_value = tl.load(k_ptrs, mask=offs_h[:, None] < head_num, other=0.0)
     tl.store(o_ptrs, kv_value, mask=offs_h[:, None] < head_num)
@@ -39,18 +54,27 @@ def update_kv_buffer(KV_Values, Select_Index, KV_Buffer):
     输出:
         KV_Buffer 张量被填, KV_Buffer[Select_Index[i], :, :] = K[i, :, :]。
     """
-    seq_len = Select_Index.shape[0] # number_tokens
-    head_num = KV_Values.shape[1] # num_kv_head * 2
+    seq_len = Select_Index.shape[0]  # number_tokens
+    head_num = KV_Values.shape[1]  # num_kv_head * 2
     head_dim = KV_Values.shape[2]
-    assert KV_Values.shape[1] == KV_Buffer.shape[1] and KV_Values.shape[2] == KV_Buffer.shape[2]
+    assert (
+        KV_Values.shape[1] == KV_Buffer.shape[1]
+        and KV_Values.shape[2] == KV_Buffer.shape[2]
+    )
     BLOCK_HEAD = triton.next_power_of_2(head_num)
     grid = (seq_len,)
     num_warps = 1
 
     _fwd_kernel_update_kv[grid](
-        KV_Values, Select_Index, KV_Buffer,
-        KV_Values.stride(0), KV_Values.stride(1), KV_Values.stride(2),
-        KV_Buffer.stride(0), KV_Buffer.stride(1), KV_Buffer.stride(2),
+        KV_Values,
+        Select_Index,
+        KV_Buffer,
+        KV_Values.stride(0),
+        KV_Values.stride(1),
+        KV_Values.stride(2),
+        KV_Buffer.stride(0),
+        KV_Buffer.stride(1),
+        KV_Buffer.stride(2),
         head_num,
         BLOCK_DMODEL=head_dim,
         BLOCK_HEAD=BLOCK_HEAD,
@@ -59,8 +83,10 @@ def update_kv_buffer(KV_Values, Select_Index, KV_Buffer):
     )
     return
 
+
 def test1():
     import time
+
     num_of_times = 1000
 
     B, Seq_Len, H, D = 32, 1024, 12, 128
@@ -68,7 +94,7 @@ def test1():
     src = torch.randn((B * Seq_Len, H, D), dtype=torch.float16).cuda()
     dest_loc = torch.arange(0, B * Seq_Len, dtype=torch.int32, device="cuda")
 
-    for _ in range(10): # Warm up
+    for _ in range(10):  # Warm up
         update_kv_buffer(src, dest_loc, dest)
     torch.cuda.synchronize()
 
@@ -89,5 +115,6 @@ def test1():
     print("mean ", torch.mean(torch.abs(dest - src)))
     assert torch.allclose(src, dest, atol=1e-2, rtol=0)
 
-if __name__ == '__main__':
-    test1()
\ No newline at end of file
+
+if __name__ == "__main__":
+    test1()
diff --git a/lite_llama/kernels/update_kv_index.py b/lite_llama/kernels/update_kv_index.py
index 2d422f3..1c4ca82 100644
--- a/lite_llama/kernels/update_kv_index.py
+++ b/lite_llama/kernels/update_kv_index.py
@@ -6,31 +6,35 @@
 @triton.jit
 def _fwd_kernel_update_kv_index(
     req_to_token_indexs,  # 输出张量的指针，形状为 (num_requests, max_seq_len)
-    b_req_idx,            # decode_batch 批次中每个请求的 ID，形状为 (num_tokens,)
-    b_seq_len,            # decode_batch 中每个请求的序列长度，形状为 (num_tokens,)
-    select_index,             # decode_batch 中每个 tokens的 KV 索引，形状为 (num_tokens,)
+    b_req_idx,  # decode_batch 批次中每个请求的 ID，形状为 (num_tokens,)
+    b_seq_len,  # decode_batch 中每个请求的序列长度，形状为 (num_tokens,)
+    select_index,  # decode_batch 中每个 tokens的 KV 索引，形状为 (num_tokens,)
     stride_req_to_token_b,  # req_to_token_indexs 在第一个维度（请求）的步幅
-    stride_req_to_token_s   # req_to_token_indexs 在第二个维度（序列长度）的步幅
+    stride_req_to_token_s,  # req_to_token_indexs 在第二个维度（序列长度）的步幅
 ):
     # 获取当前程序的 ID，即线程的索引
     cur_index = tl.program_id(0)
-    
+
     # 从 b_req_idx 张量加载当前请求的 ID
     cur_req_idx = tl.load(b_req_idx + cur_index)
-    
+
     # 从 select_index 张量加载当前令牌的 KV 索引
     cur_token_index = tl.load(select_index + cur_index)
-    
+
     # 从 b_seq_len 张量加载当前请求的序列长度
     cur_seq_len = tl.load(b_seq_len + cur_index)
-    
+
     # 计算目标位置的偏移量：
     # req_to_token_indexs[cur_req_idx][cur_seq_len - 1]
-    dest_offset = req_to_token_indexs + cur_req_idx * stride_req_to_token_b + (cur_seq_len - 1) * stride_req_to_token_s
-    
+    dest_offset = (
+        req_to_token_indexs
+        + cur_req_idx * stride_req_to_token_b
+        + (cur_seq_len - 1) * stride_req_to_token_s
+    )
+
     # 将当前令牌索引存储到目标位置
     tl.store(dest_offset, cur_token_index)
-    
+
     return
 
 
@@ -43,31 +47,33 @@ def update_kv_index(req_to_token_indexs, b_req_idx, b_seq_len, select_index):
         b_req_idx (torch.Tensor): 批次中每个请求的 ID, 形状为 (num_tokens,)。
         b_seq_len (torch.Tensor): 每个请求的序列长度，形状为 (num_tokens,)。
         select_index (torch.Tensor): 每个令牌的 KV 索引，形状为 (num_tokens,)。
-    
+
     该函数使用 Triton 内核来高效地执行复制操作。
     """
     # 获取序列长度，即令牌数量
     seq_len = b_seq_len.shape[0]
-    
+
     # 确保所有输入张量在第一个维度上的大小相同
-    assert b_seq_len.shape[0] == select_index.shape[0] and b_req_idx.shape[0] == b_seq_len.shape[0], \
-        "所有输入张量在第一个维度上的大小必须相同。"
-    
+    assert (
+        b_seq_len.shape[0] == select_index.shape[0]
+        and b_req_idx.shape[0] == b_seq_len.shape[0]
+    ), "所有输入张量在第一个维度上的大小必须相同。"
+
     # 定义 Triton 内核的网格大小（1D 网格）
     grid = (seq_len,)
-    
+
     # 定义每个 block 使用的 warp 数量
     num_warps = 1
-    
+
     # 启动 Triton 内核
     _fwd_kernel_update_kv_index[grid](
-        req_to_token_indexs,          # 输出张量的指针
-        b_req_idx,                    # 请求索引张量的指针
-        b_seq_len,                    # 序列长度张量的指针
-        select_index,                   # 令牌索引张量的指针
+        req_to_token_indexs,  # 输出张量的指针
+        b_req_idx,  # 请求索引张量的指针
+        b_seq_len,  # 序列长度张量的指针
+        select_index,  # 令牌索引张量的指针
         req_to_token_indexs.stride(0),  # req_to_token_indexs 在第一个维度上的步幅
         req_to_token_indexs.stride(1),  # req_to_token_indexs 在第二个维度上的步幅
-        num_warps=num_warps,          # 使用的 warp 数量
-        num_stages=1,                  # 使用的流水线阶段数量
+        num_warps=num_warps,  # 使用的 warp 数量
+        num_stages=1,  # 使用的流水线阶段数量
     )
-    return
\ No newline at end of file
+    return
diff --git a/lite_llama/kernels/utils.py b/lite_llama/kernels/utils.py
index 831014c..fdee91e 100644
--- a/lite_llama/kernels/utils.py
+++ b/lite_llama/kernels/utils.py
@@ -32,6 +32,7 @@ def keep(conf):
         return False
     return True
 
+
 def ensure_contiguous(fn):
     @functools.wraps(fn)
     def wrapper(ctx, *args, **kwargs):
@@ -74,6 +75,7 @@ def compare_version(package: str, operator: Callable, target: str):
     pkg_version = Version(pkg.__version__)
     return operator(pkg_version, Version(target))
 
+
 torch_to_triton_dtype = {
     torch.float32: tl.float32,
     torch.float16: tl.float16,
diff --git a/lite_llama/llava_generate_stream.py b/lite_llama/llava_generate_stream.py
index 6560641..45982bf 100644
--- a/lite_llama/llava_generate_stream.py
+++ b/lite_llama/llava_generate_stream.py
@@ -13,20 +13,19 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class CompletionPrediction(TypedDict, total=False):
     generation: str
     tokens: List[str]  # not required
     logprobs: List[float]  # not required
 
+
 def tokenizer_image_token(
-    prompt, 
-    tokenizer, 
-    image_token_index=IMAGE_TOKEN_INDEX, 
-    return_tensors=None
+    prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
 ):
     """
     处理包含特殊标记 <image> 的文本提示, 将其转换为相应的 token 序列，并在 <image> 位置插入指定的图像 token 索引。
-    
+
     "A cat <image> is sitting <image> on the mat."
     [65,32,99,97,116,32000,32,105,115,32,115,105,116,116,105,110,103,32000,32,111,110,32,116,104,101,32,109,97,116,46]
 
@@ -35,50 +34,59 @@ def tokenizer_image_token(
         tokenizer: 分词器对象，需支持调用 tokenizer(chunk).input_ids。
         image_token_index (int): 用于替换 <image> 标记的图像 token 索引。
         return_tensors (str, optional): 指定返回的张量类型，例如 'pt' 表示 PyTorch 张量。
-    
+
     返回:
         list 或 torch.Tensor: 生成的 token 序列。
     """
     # 使用正则表达式分割，移除 '<image>' 前的空格，但保留后的空格
-    prompt_chunks = re.split(r'\s?<image>', prompt)
+    prompt_chunks = re.split(r"\s?<image>", prompt)
     # 不过滤空片段，以处理多个连续的 '<image>' 标记
     token_chunks = [tokenizer(chunk).input_ids for chunk in prompt_chunks]
-    
+
     input_ids = []
     offset = 0
     # 检查第一个片段是否以 BOS token 开始
-    if len(token_chunks) > 0 and len(token_chunks[0]) > 0 and token_chunks[0][0] == tokenizer.bos_token_id:
+    if (
+        len(token_chunks) > 0
+        and len(token_chunks[0]) > 0
+        and token_chunks[0][0] == tokenizer.bos_token_id
+    ):
         offset = 1
         input_ids.append(token_chunks[0][0])
-    
+
     # 插入图像 token
     for i, chunk in enumerate(token_chunks):
-        input_ids.extend(chunk[offset:]) # 添加当前片段的 token，跳过 BOS token（如果已添加）
+        input_ids.extend(
+            chunk[offset:]
+        )  # 添加当前片段的 token，跳过 BOS token（如果已添加）
         offset = 0  # 仅适用于第一个片段
-        if i < len(token_chunks) - 1: # 如果不是最后一个片段，插入图像 token
+        if i < len(token_chunks) - 1:  # 如果不是最后一个片段，插入图像 token
             input_ids.append(image_token_index)
-    
+
     if return_tensors is not None:
-        if return_tensors == 'pt':
+        if return_tensors == "pt":
             return torch.tensor(input_ids, dtype=torch.long)
-        raise ValueError(f'Unsupported tensor type: {return_tensors}')
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
     """
     [1, 3148, 1001, 29901, 32000, 1, 29871, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901]
     """
     return input_ids
 
+
 class LlavaGeneratorStream:
     """
     GenerateText 类用于加载LLaMA模型并执行迭代式生成式推理 (文本生成)。
     """
-    def __init__(self, 
+
+    def __init__(
+        self,
         checkpoints_dir: str,
         tokenizer_path: str,
-        max_gpu_num_blocks = None,
-        max_seq_len = 2048,
-        load_model = True,
-        triton_weight = True,
-        compiled_model = False,
+        max_gpu_num_blocks=None,
+        max_seq_len=2048,
+        load_model=True,
+        triton_weight=True,
+        compiled_model=False,
         device="cuda",
     ):
         self.checkpoints_dir = checkpoints_dir
@@ -87,25 +95,29 @@ def __init__(self,
         self.device = device
 
         self.model_executor = ModelExecutor.build(
-            checkpoints_dir = checkpoints_dir,
-            load_model = load_model,
-            max_gpu_num_blocks =  max_gpu_num_blocks,
-            max_seq_len = max_seq_len,
-            triton_weight = triton_weight,
-            device = device
+            checkpoints_dir=checkpoints_dir,
+            load_model=load_model,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            triton_weight=triton_weight,
+            device=device,
         )
         self.tokenizer = self.load_tokenizer(tokenizer_path)
 
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
 
-        if 'llava' in model_name.lower():
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=False)
+        if "llava" in model_name.lower():
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path, use_fast=False
+            )
         else:
-            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=True)
-        
+            tokenizer = AutoTokenizer.from_pretrained(
+                pretrained_model_name_or_path, use_fast=True
+            )
+
         return tokenizer
-    
+
     def encode_images(self, image_items: List[Union[str, Image.Image]]):
         processor = AutoProcessor.from_pretrained(self.checkpoints_dir)
         self.image_processor = processor.image_processor
@@ -115,12 +127,15 @@ def encode_images(self, image_items: List[Union[str, Image.Image]]):
                 image = item
             elif item.startswith("http://") or item.startswith("https://"):
                 import requests
+
                 image = Image.open(requests.get(item, stream=True).raw)
             else:
                 image = Image.open(item)
             images.append(image.convert("RGB"))
 
-        image_tensors = self.image_processor.preprocess(images, return_tensors="pt")["pixel_values"]
+        image_tensors = self.image_processor.preprocess(images, return_tensors="pt")[
+            "pixel_values"
+        ]
         if type(image_tensors) is list:
             image_tensors = [
                 image.to(self.device, dtype=torch.float16) for image in image_tensors
@@ -130,7 +145,6 @@ def encode_images(self, image_items: List[Union[str, Image.Image]]):
 
         return image_tensors
 
-
     @torch.inference_mode()
     def generate_stream(
         self,
@@ -151,7 +165,7 @@ def generate_stream(
             top_p (float, optional): 用于 nucleus sampling 的概率阈值。默认为 0.9。
             logprobs (bool, optional): 是否计算生成 token 的对数概率。默认为 False。
             echo (bool, optional): 是否在输出中包含 prompt_tokens。默认为 False。
-            
+
         generator 输出：
             Tuple[List[str], Optional[List[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
         说明：
@@ -162,39 +176,55 @@ def generate_stream(
         max_prompt_len = max(len(t) for t in prompt_tokens)
         assert max_prompt_len <= self.max_seq_len
         total_seq_len = min(self.max_seq_len, max_gen_len + max_prompt_len)
-        actual_prompt_lens = torch.tensor([len(t) for t in prompt_tokens], dtype=torch.long, device=self.device)  
-        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
-        
+        actual_prompt_lens = torch.tensor(
+            [len(t) for t in prompt_tokens], dtype=torch.long, device=self.device
+        )
+        pad_id = (
+            self.tokenizer.pad_token_id
+            if self.tokenizer.pad_token_id is not None
+            else self.tokenizer.eos_token_id
+        )
+
         # 预分配 tokens 张量
-        tokens = torch.full((bsz, total_seq_len), pad_id, dtype=torch.long, device=self.device)
+        tokens = torch.full(
+            (bsz, total_seq_len), pad_id, dtype=torch.long, device=self.device
+        )
         # 生成一个布尔张量，它的值为 True 的位置表示输入序列的实际内容（即非填充部分）, 形状为 (batch_size, total_seq_len)
         input_text_mask = tokens != pad_id
         eos_reached = torch.tensor([False] * bsz, device=self.device)
-        last_yielded_pos = [len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)] # 初始化每个样本已输出的位置
+        last_yielded_pos = [
+            len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)
+        ]  # 初始化每个样本已输出的位置
 
         # 填充提示词到 tokens 张量
         for seq_id, token_ids in enumerate(prompt_tokens):
             # NOTE: torch.long 等同于 torch.int64
-            tokens[seq_id, : len(token_ids)] = token_ids.clone().detach().to(dtype=torch.long, device=self.device)
-        
+            tokens[seq_id, : len(token_ids)] = (
+                token_ids.clone().detach().to(dtype=torch.long, device=self.device)
+            )
+
         # 计算输入图像待分配空间
         img_batch_size, _, _, _ = image_tensors.shape
-        b_req_idx = torch.arange(bsz, device = self.device)
+        b_req_idx = torch.arange(bsz, device=self.device)
         all_select_index_list = []
-        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(max_prompt_len, actual_prompt_lens, b_req_idx, img_batch_size)
+        prefill_select_index, _ = self.model_executor.prefill_alloc_kv_cache(
+            max_prompt_len, actual_prompt_lens, b_req_idx, img_batch_size
+        )
         all_select_index_list.append(prefill_select_index)
-        
+
         position_ids = None
         start_pos = len(prefill_select_index)
-        input_ids = tokens[:, : max_prompt_len]  # [batch_size, seq_len]
+        input_ids = tokens[:, :max_prompt_len]  # [batch_size, seq_len]
         for cur_pos in range(max_prompt_len, total_seq_len):
             batch_size, _ = input_ids.shape
-            logits = self.model_executor.forward(input_ids, position_ids, image_tensors) # step 0: position_ids 由 llava 模型类给出
-            
+            logits = self.model_executor.forward(
+                input_ids, position_ids, image_tensors
+            )  # step 0: position_ids 由 llava 模型类给出
+
             start_pos += bsz
             position_ids = (
                 torch.arange(start_pos, start_pos + 1, device=input_ids.device)
-                .unsqueeze(0)            # shape: [1, seq_len]
+                .unsqueeze(0)  # shape: [1, seq_len]
                 .repeat(batch_size, 1)  # shape: [batch_size, seq_len], 不分配额外内存
             )
 
@@ -207,11 +237,15 @@ def generate_stream(
             else:
                 next_token = torch.argmax(logits[:, -1], dim=-1)
 
-            input_ids = next_token   # [batch_size, 1]
+            input_ids = next_token  # [batch_size, 1]
             mask = ~input_text_mask[:, cur_pos]  # [batch_size]
-            tokens[:, cur_pos] = torch.where(mask, next_token.reshape(-1) , tokens[:, cur_pos])
+            tokens[:, cur_pos] = torch.where(
+                mask, next_token.reshape(-1), tokens[:, cur_pos]
+            )
 
-            eos_reached = eos_reached | (mask & (next_token == self.tokenizer.eos_token_id))
+            eos_reached = eos_reached | (
+                mask & (next_token == self.tokenizer.eos_token_id)
+            )
 
             # 为整个批次收集输出
             batch_outputs = []
@@ -224,14 +258,14 @@ def generate_stream(
                     batch_outputs.append(text)
                     last_yielded_pos[i] = end
                 else:
-                    batch_outputs.append('') # 如果没有新生成的内容，添加空字符串
+                    batch_outputs.append("")  # 如果没有新生成的内容，添加空字符串
 
             # 将整个批次的输出一次性 yield
             yield batch_outputs
 
             if eos_reached.all():
                 break
-        
+
         # 减少 kv cache 内存管理器的引用计数
         all_select_indexs = torch.concat(all_select_index_list)
         self.model_executor.kv_mem_manager.release_ref(all_select_indexs)
@@ -246,12 +280,19 @@ def text_completion_stream(
         echo: bool = False,
     ) -> Generator[List[CompletionPrediction], None, None]:
         """每次迭代时，生成器返回一个包含多个 CompletionPrediction 字典的列表。"""
-        
+
         if max_gen_len is None:
             max_gen_len = self.max_seq_len - 1
 
-        prompt_tokens = [tokenizer_image_token(x, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for x in prompts] # torch.Size([1, 22])
-        image_tensors = self.encode_images(image_items) # image_tensors shape is torch.Size([1, 3, 336, 336])
+        prompt_tokens = [
+            tokenizer_image_token(
+                x, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+            )
+            for x in prompts
+        ]  # torch.Size([1, 22])
+        image_tensors = self.encode_images(
+            image_items
+        )  # image_tensors shape is torch.Size([1, 3, 336, 336])
         # print(f"prompt 0 shape: {prompt_tokens[0].shape}, image_tensors shape: {image_tensors.shape}")
 
         stream = self.generate_stream(
@@ -264,16 +305,17 @@ def text_completion_stream(
         )
 
         # 初始化每个样本的生成结果
-        completions = [{'generation': '', 'tokens': []} for _ in prompts]
+        completions = [{"generation": "", "tokens": []} for _ in prompts]
         for batch_outputs in stream:
             for i, text in enumerate(batch_outputs):
-                completions[i]['generation'] += text
+                completions[i]["generation"] += text
             yield completions.copy()
-    
+
+
 def sample_top_p(probs, p):
     """
     执行 Top-p (Nucleus) 采样, 从概率分布中采样下一个词。
-    
+
     参数：
         probs (torch.Tensor): 概率分布张量，形状为 `[batch_size, vocab_size]`。
         p (float): 累积概率阈值，取值范围在 0 到 1 之间。
@@ -288,8 +330,10 @@ def sample_top_p(probs, p):
     # 计算排序后概率的累积和. 返回的 probs_sum 是累积概率分布。
     probs_sum = torch.cumsum(probs_sort, dim=-1)
     # 保留累积概率未超过阈值 p 的词汇的概率，其余词汇的概率被置为 0.0。
-    mask = probs_sum - probs_sort > p # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
-    probs_sort[mask] = 0.0 # 将累积概率超过阈值 p 的词的概率置零。
+    mask = (
+        probs_sum - probs_sort > p
+    )  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
+    probs_sort[mask] = 0.0  # 将累积概率超过阈值 p 的词的概率置零。
 
     # 对剩余的概率重新归一化, 确保总和为 1。
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
@@ -297,6 +341,6 @@ def sample_top_p(probs, p):
     next_token_sorted_idx = torch.multinomial(probs_sort, num_samples=1)
     # 在 probs_idx 的最后一维（dim=-1）中，使用 next_token_sorted_idx 作为索引，提取对应的值。沿着 dim=1（列）进行索引提取
     # NOTE: torch.gather 函数按照给定的索引张量 index，从输入张量中收集 (获取) 数据，并返回一个与索引张量形状一致的张量。
-    next_token = torch.gather(probs_idx, -1, index = next_token_sorted_idx)
-    
-    return next_token # 返回采样得到的下一个词的索引
+    next_token = torch.gather(probs_idx, -1, index=next_token_sorted_idx)
+
+    return next_token  # 返回采样得到的下一个词的索引
diff --git a/lite_llama/models/RotaryEmbedding.py b/lite_llama/models/RotaryEmbedding.py
index 6acaa94..0eb2d31 100644
--- a/lite_llama/models/RotaryEmbedding.py
+++ b/lite_llama/models/RotaryEmbedding.py
@@ -6,15 +6,16 @@
 
 logger = logging.getLogger(__name__)
 
+
 def _compute_default_rope_parameters(
-    config = None,
+    config=None,
     device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
     **rope_kwargs,
 ) -> Tuple["torch.Tensor", float]:
     """
     根据原始 RoPE 实现计算逆频率。
-    
+
     参数:
         config (`~transformers.LlamaConfig` 可选):
             模型的配置。
@@ -24,7 +25,7 @@ def _compute_default_rope_parameters(
             当前序列长度。对于此类型的RoPE未使用。
         rope_kwargs (`Dict` 可选):
             向后兼容参数, 将在v4.45中移除。
-    
+
     返回:
         一个元组, 包含RoPE嵌入的逆频率 (`torch.Tensor`), 形状为 [head_dim//2] 和应用于cos/sin的后处理缩放因子 (`float`)。
     """
@@ -40,26 +41,30 @@ def _compute_default_rope_parameters(
     # 否则，从 config 中提取参数
     elif config is not None:
         base = config.rope_theta
-        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+        partial_rotary_factor = (
+            config.partial_rotary_factor
+            if hasattr(config, "partial_rotary_factor")
+            else 1.0
+        )
         head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
-        
+
         dim = int(head_dim * partial_rotary_factor)
 
     attention_factor = 1.0  # 注意力缩放因子，当前类型的RoPE未使用
 
     # Compute the inverse frequencies
-    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)
+    )
     return inv_freq, attention_factor
 
+
 def _compute_llama3_parameters(
-    config, 
-    device: "torch.device", 
-    seq_len: Optional[int] = None, 
-    **rope_kwargs
+    config, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
 ) -> Tuple["torch.Tensor", float]:
     """
     计算llama 3.1的逆频率。
-    
+
     参数:
         config (`~transformers.LlamaConfig`):
             模型的配置。
@@ -69,29 +74,43 @@ def _compute_llama3_parameters(
             当前序列长度。对于此类型的RoPE未使用。
         rope_kwargs (`Dict` 可选):
             向后兼容参数, 将在v4.45中移除。
-    
+
     返回:
         一个元组, 包含 RoPE 嵌入的逆频率 (`torch.Tensor`) , 形状为 [head_dim//2] 和应用于cos/sin的后处理缩放因子 (`float`)。
     """
     # 获取默认的 RoPE 参数
-    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+    inv_freq, attention_factor = _compute_default_rope_parameters(
+        config, device, seq_len, **rope_kwargs
+    )
 
     # 从配置中提取 RoPE 缩放参数
     factor = config.rope_scaling["factor"]  # llama3.2 原始实现中值为 `32`
-    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
-    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
-    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+    low_freq_factor = config.rope_scaling[
+        "low_freq_factor"
+    ]  # `1` in the original implementation
+    high_freq_factor = config.rope_scaling[
+        "high_freq_factor"
+    ]  # `4` in the original implementation
+    old_context_len = config.rope_scaling[
+        "original_max_position_embeddings"
+    ]  # `8192` in the original implementation
 
     low_freq_wavelen = old_context_len / low_freq_factor
     high_freq_wavelen = old_context_len / high_freq_factor
 
-    wavelen = 2 * math.pi / inv_freq # 计算波长
-    
+    wavelen = 2 * math.pi / inv_freq  # 计算波长
+
     # 对于波长大于低频波长的部分，逆频率除以因子
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+    inv_freq_llama = torch.where(
+        wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
+    )
     # 对于中频部分，进行平滑插值
-    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
+        high_freq_factor - low_freq_factor
+    )
+    smoothed_inv_freq = (
+        1 - smooth_factor
+    ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     # 标记中频部分
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
     # 使用平滑后的逆频率替换中频部分
@@ -99,12 +118,14 @@ def _compute_llama3_parameters(
 
     return inv_freq_llama, attention_factor
 
+
 # 定义 RoPE 初始化函数的映射字典
 ROPE_INIT_FUNCTIONS = {
     "default": _compute_default_rope_parameters,
     "llama3": _compute_llama3_parameters,
 }
 
+
 class LlamaRotaryEmbedding(nn.Module):
     def __init__(
         self,
@@ -117,8 +138,8 @@ def __init__(
         config: Optional[LlamaConfig] = None,
     ):
         super().__init__()
-        self.rope_kwargs = {} # 初始化rope_kwargs，用于向后兼容
-        if config is None:    # 如果未提供配置，使用传入的参数初始化rope_kwargs
+        self.rope_kwargs = {}  # 初始化rope_kwargs，用于向后兼容
+        if config is None:  # 如果未提供配置，使用传入的参数初始化rope_kwargs
             self.rope_kwargs = {
                 "rope_type": rope_type,
                 "factor": scaling_factor,
@@ -129,12 +150,14 @@ def __init__(
             self.rope_type = rope_type
             self.max_seq_len_cached = max_position_embeddings
             self.original_max_seq_len = max_position_embeddings
-        else: # 如果提供了 llama_config 配置，从中提取 rope_type
+        else:  # 如果提供了 llama_config 配置，从中提取 rope_type
             if config.rope_scaling is not None:
-                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+                self.rope_type = config.rope_scaling.get(
+                    "rope_type", config.rope_scaling.get("type")
+                )
             else:
                 self.rope_type = "default"
-            
+
             # 模型输入最大上下文长度赋值为 config.max_position_embeddings
             self.max_seq_len_cached = config.max_position_embeddings
             self.original_max_seq_len = config.max_position_embeddings
@@ -143,7 +166,9 @@ def __init__(
         # 从一个全局定义的字典 ROPE_INIT_FUNCTIONS 中，根据 rope_type 选择 rope 初始化函数
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         # 计算逆频率和注意力缩放因子
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        inv_freq, self.attention_scaling = self.rope_init_fn(
+            self.config, device, **self.rope_kwargs
+        )
         # 注册逆频率为 buffer（不会作为模型参数）
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
@@ -158,15 +183,20 @@ def _dynamic_frequency_update(self, position_ids, device):
             device (`torch.device`): 当前设备。
         """
         seq_len = torch.max(position_ids) + 1
-        if seq_len > self.max_seq_len_cached:  
+        if seq_len > self.max_seq_len_cached:
             # 如果序列长度增长且大于模型配置中的默认 max_seq_len_cached，则重新计算逆频率,并更新 max_seq_len_cached 为当前序列长度
             inv_freq, self.attention_scaling = self.rope_init_fn(
                 self.config, device, seq_len=seq_len, **self.rope_kwargs
             )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
             self.max_seq_len_cached = seq_len
 
-        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
             self.max_seq_len_cached = self.original_max_seq_len
 
@@ -174,17 +204,17 @@ def _dynamic_frequency_update(self, position_ids, device):
     def forward(self, x, position_ids):
         """
         LlamaRotaryEmbedding 前向传播, 生成RoPE的cos和sin编码。
-        
+
         参数:
             x (`torch.Tensor`):
                 输入张量。
             position_ids (`torch.Tensor`):
                 位置ID张量, 形状为 [batch_size, seq_length]。
-        
+
         返回:
             Tuple[`torch.Tensor`, `torch.Tensor`]: cos和sin编码, 形状为 [batch_size, seq_length, dim]。
         """
-        if "dynamic" in self.rope_type: # 如果使用动态 RoPE，则更新逆频率
+        if "dynamic" in self.rope_type:  # 如果使用动态 RoPE，则更新逆频率
             self._dynamic_frequency_update(position_ids, device=x.device)
 
         """
@@ -197,15 +227,23 @@ def forward(self, x, position_ids):
         6. 计算 cos 和 sin, 形状都为 [batch_size, seq_length, head_dim]。
         """
         # 扩展逆频率张量的形状为 [batch_size, head_dim/2, 1]
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        )
         # 扩展 position_ids 的形状为 [batch_size, 1, seq_length] 并转换为浮点型
         position_ids_expanded = position_ids[:, None, :].float()
         # 强制使用 float32 类型，避免精度问题
         device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        device_type = (
+            device_type
+            if isinstance(device_type, str) and device_type != "mps"
+            else "cpu"
+        )
         with torch.autocast(device_type=device_type, enabled=False):
             # 计算频率与位置的内积，结果形状为 [batch_size, head_dim//2, seq_length]
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (
+                inv_freq_expanded.float() @ position_ids_expanded.float()
+            ).transpose(1, 2)
             # 拼接两份频率，得到形状为 [batch_size, seq_length, head_dim]
             emb = torch.cat((freqs, freqs), dim=-1)
             # torch.sin() 和 torch.cos() 函数会对输入张量的每个元素进行逐元素操作，返回一个新的张量，其中包含对应的正弦或余弦值。
@@ -219,6 +257,7 @@ def forward(self, x, position_ids):
         # 返回与输入dtype相同的cos和sin编码
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
+
 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
 class Qwen2RotaryEmbedding(nn.Module):
     def __init__(
@@ -252,7 +291,9 @@ def __init__(
         else:
             # BC: "rope_type" was originally "type"
             if config.rope_scaling is not None:
-                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+                self.rope_type = config.rope_scaling.get(
+                    "rope_type", config.rope_scaling.get("type")
+                )
             else:
                 self.rope_type = "default"
             self.max_seq_len_cached = config.max_position_embeddings
@@ -261,7 +302,9 @@ def __init__(
         self.config = config
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        inv_freq, self.attention_scaling = self.rope_init_fn(
+            self.config, device, **self.rope_kwargs
+        )
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
 
@@ -276,10 +319,15 @@ def _dynamic_frequency_update(self, position_ids, device):
             inv_freq, self.attention_scaling = self.rope_init_fn(
                 self.config, device, seq_len=seq_len, **self.rope_kwargs
             )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
             self.max_seq_len_cached = seq_len
 
-        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
             self.max_seq_len_cached = self.original_max_seq_len
 
@@ -289,13 +337,21 @@ def forward(self, x, position_ids):
             self._dynamic_frequency_update(position_ids, device=x.device)
 
         # Core RoPE block
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        )
         position_ids_expanded = position_ids[:, None, :].float()
         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
         device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        device_type = (
+            device_type
+            if isinstance(device_type, str) and device_type != "mps"
+            else "cpu"
+        )
         with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (
+                inv_freq_expanded.float() @ position_ids_expanded.float()
+            ).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos()
             sin = emb.sin()
@@ -309,9 +365,11 @@ def forward(self, x, position_ids):
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
 
+
 import unittest
 import torch
 
+
 class TestLlamaRotaryEmbedding(unittest.TestCase):
     def setUp(self):
         # 创建一个自定义的 LlamaConfig 对象，设置较小的 original_max_position_embeddings 和 dim
@@ -326,7 +384,7 @@ def setUp(self):
             "low_freq_factor": 1,
             "high_freq_factor": 4,
             "original_max_position_embeddings": 100,  # 设置较小的 original_max_position_embeddings
-            "rope_type": "llama3"
+            "rope_type": "llama3",
         }
         self.config.max_position_embeddings = 50  # 设置较小的 max_position_embeddings
 
@@ -339,10 +397,14 @@ def test_default_rope_parameters(self):
             device=torch.device("cpu"),
             scaling_factor=1.0,
             rope_type="default",
-            config=None
+            config=None,
         )
-        inv_freq, attention_scaling = rotary_emb.rope_init_fn(None, torch.device("cpu"), **rotary_emb.rope_kwargs)
-        self.assertEqual(inv_freq.shape[0], self.config.head_dim // 2)  # dim=4, step=2 -> 2
+        inv_freq, attention_scaling = rotary_emb.rope_init_fn(
+            None, torch.device("cpu"), **rotary_emb.rope_kwargs
+        )
+        self.assertEqual(
+            inv_freq.shape[0], self.config.head_dim // 2
+        )  # dim=4, step=2 -> 2
         self.assertEqual(attention_scaling, 1.0)
 
     def test_llama3_rope_parameters(self):
@@ -351,10 +413,14 @@ def test_llama3_rope_parameters(self):
             config=self.config,
             device=torch.device("cpu"),
         )
-        inv_freq, attention_scaling = rotary_emb.rope_init_fn(self.config, torch.device("cpu"), **rotary_emb.rope_kwargs)
+        inv_freq, attention_scaling = rotary_emb.rope_init_fn(
+            self.config, torch.device("cpu"), **rotary_emb.rope_kwargs
+        )
         print("llama3 inv_freq shape: ", inv_freq.shape)
         # 根据配置计算 dim = head_dim * partial_rotary_factor = 2 * 1.0 = 2, step=2 -> 1
-        self.assertEqual(inv_freq.shape[0], self.config.head_dim // 2)  # dim=2, step=2 -> 1
+        self.assertEqual(
+            inv_freq.shape[0], self.config.head_dim // 2
+        )  # dim=2, step=2 -> 1
         self.assertEqual(attention_scaling, 1.0)
 
     def test_forward_output_shape(self):
@@ -371,11 +437,13 @@ def test_forward_output_shape(self):
         batch_size = 2
         seq_length = 10
         x = torch.randn(batch_size, seq_length, self.config.hidden_size)
-        position_ids = torch.arange(0, seq_length).unsqueeze(0).expand(batch_size, seq_length)
+        position_ids = (
+            torch.arange(0, seq_length).unsqueeze(0).expand(batch_size, seq_length)
+        )
         cos, sin = rotary_emb(x, position_ids)
         self.assertEqual(cos.shape, (batch_size, seq_length, self.config.head_dim))
         self.assertEqual(sin.shape, (batch_size, seq_length, self.config.head_dim))
 
-if __name__ == '__main__':
-    unittest.main()
 
+if __name__ == "__main__":
+    unittest.main()
diff --git a/lite_llama/models/clip.py b/lite_llama/models/clip.py
index 3fa6421..32b522c 100644
--- a/lite_llama/models/clip.py
+++ b/lite_llama/models/clip.py
@@ -8,18 +8,21 @@
 from .model_config import CLIPVisionConfig
 from ..kernels import *
 
+
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
     return image_size // patch_size
 
+
 def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:
-    grid_length = get_clip_patch_grid_length(image_size=image_size,
-                                             patch_size=patch_size)
+    grid_length = get_clip_patch_grid_length(
+        image_size=image_size, patch_size=patch_size
+    )
     return grid_length * grid_length
 
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
 class CLIPVisionEmbeddings(nn.Module):
-
     def __init__(self, config: CLIPVisionConfig):
         super().__init__()
         self.config = config
@@ -37,19 +40,23 @@ def __init__(self, config: CLIPVisionConfig):
             bias=False,
         )
 
-        self.num_patches = get_clip_num_patches(image_size=self.image_size,
-                                                patch_size=self.patch_size)
+        self.num_patches = get_clip_num_patches(
+            image_size=self.image_size, patch_size=self.patch_size
+        )
         self.num_positions = self.num_patches + 1
-        self.position_embedding = nn.Embedding(self.num_positions,
-                                               self.embed_dim)
-        self.register_buffer("position_ids",
-                             torch.arange(self.num_positions).expand((1, -1)),
-                             persistent=False)
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
 
     def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         batch_size = pixel_values.shape[0]
         target_dtype = self.patch_embedding.weight.dtype
-        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
         patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
 
         class_embeds = self.class_embedding.expand(batch_size, 1, -1)
@@ -57,7 +64,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
         embeddings = embeddings + self.position_embedding(self.position_ids)
 
         return embeddings
-    
+
+
 class CLIPAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -81,7 +89,11 @@ def __init__(self, config):
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+        return (
+            tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )
 
     def forward(
         self,
@@ -120,7 +132,10 @@ def forward(
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                     f" {causal_attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + causal_attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         if attention_mask is not None:
@@ -128,7 +143,10 @@ def forward(
                 raise ValueError(
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = (
+                attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+                + attention_mask
+            )
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
@@ -138,12 +156,18 @@ def forward(
             # make sure that attn_weights keeps its gradient.
             # In order to do so, attn_weights have to reshaped
             # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+            attn_weights_reshaped = attn_weights.view(
+                bsz, self.num_heads, tgt_len, src_len
+            )
+            attn_weights = attn_weights_reshaped.view(
+                bsz * self.num_heads, tgt_len, src_len
+            )
         else:
             attn_weights_reshaped = None
 
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(
+            attn_weights, p=self.dropout, training=self.training
+        )
 
         attn_output = torch.bmm(attn_probs, value_states)
 
@@ -160,7 +184,8 @@ def forward(
         attn_output = self.out_proj(attn_output)
 
         return attn_output, attn_weights_reshaped
-    
+
+
 class CLIPMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -174,7 +199,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = self.activation_fn(hidden_states)
         hidden_states = self.fc2(hidden_states)
         return hidden_states
-    
+
+
 class CLIPEncoderLayer(nn.Module):
     def __init__(self, config: CLIPVisionConfig):
         super().__init__()
@@ -201,8 +227,7 @@ def forward(
         residual = hidden_states
 
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states, attn_weights = self.self_attn(
-        )
+        hidden_states, attn_weights = self.self_attn()
         hidden_states = residual + hidden_states
 
         residual = hidden_states
@@ -211,7 +236,8 @@ def forward(
         hidden_states = residual + hidden_states
 
         return hidden_states
-    
+
+
 class CLIPEncoder(nn.Module):
     """
     Transformer encoder consisting of `config.num_hidden_layers` self
@@ -253,7 +279,8 @@ def forward(
         if return_all_hidden_states:
             return hidden_states_pool
         return hidden_states
-    
+
+
 def resolve_visual_encoder_outputs(
     encoder_outputs: Union[torch.Tensor, list[torch.Tensor]],
     feature_sample_layers: Optional[list[int]],
@@ -286,7 +313,8 @@ def resolve_visual_encoder_outputs(
     offset = max_possible_layers - len(encoder_outputs)
     hs_pool = [
         encoder_outputs[layer_idx]
-        if layer_idx >= 0 else encoder_outputs[layer_idx + offset]
+        if layer_idx >= 0
+        else encoder_outputs[layer_idx + offset]
         for layer_idx in feature_sample_layers
     ]
 
@@ -296,8 +324,8 @@ def resolve_visual_encoder_outputs(
         hs_pool[-1] = post_layer_norm(encoder_outputs)
     return torch.cat(hs_pool, dim=-1)
 
-class CLIPVisionTransformer(nn.Module):
 
+class CLIPVisionTransformer(nn.Module):
     def __init__(
         self,
         config: CLIPVisionConfig,
@@ -332,8 +360,7 @@ def __init__(
             require_post_norm = len(self.encoder.layers) == num_hidden_layers
 
         if require_post_norm:
-            self.post_layernorm = nn.LayerNorm(embed_dim,
-                                               eps=config.layer_norm_eps)
+            self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
         else:
             self.post_layernorm = None
 
@@ -342,7 +369,6 @@ def forward(
         pixel_values: torch.Tensor,
         feature_sample_layers: Optional[list[int]] = None,
     ) -> torch.Tensor:
-
         hidden_states = self.embeddings(pixel_values)
         hidden_states = self.pre_layrnorm(hidden_states)
 
@@ -352,11 +378,15 @@ def forward(
         # depending on if we have feature_sample_layers or not
         encoder_outputs = self.encoder(
             inputs_embeds=hidden_states,
-            return_all_hidden_states=return_all_hidden_states)
+            return_all_hidden_states=return_all_hidden_states,
+        )
 
         # Handle post-norm (if applicable) and stacks feature layers if needed
         encoder_outputs = resolve_visual_encoder_outputs(
-            encoder_outputs, feature_sample_layers, self.post_layernorm,
-            self.config.num_hidden_layers)
+            encoder_outputs,
+            feature_sample_layers,
+            self.post_layernorm,
+            self.config.num_hidden_layers,
+        )
 
-        return encoder_outputs
\ No newline at end of file
+        return encoder_outputs
diff --git a/lite_llama/models/llama.py b/lite_llama/models/llama.py
index 721edb7..9a7b648 100644
--- a/lite_llama/models/llama.py
+++ b/lite_llama/models/llama.py
@@ -7,36 +7,57 @@
 from .model_config import LlamaConfig
 from .RotaryEmbedding import LlamaRotaryEmbedding
 
+
 class FusedAttention(nn.Module):
-    def __init__(self,  config: LlamaConfig, cache_k=None, cache_v=None):
+    def __init__(self, config: LlamaConfig, cache_k=None, cache_v=None):
         super().__init__()
-        self.config= config
+        self.config = config
 
         # K V 头数相同，但和 Q 可能不同
-        self.num_kv_heads = config.num_heads if config.num_kv_heads is None else config.num_kv_heads
-        self.head_dim = config.head_dim if config.head_dim is not None else config.hidden_size // config.num_heads
-        
+        self.num_kv_heads = (
+            config.num_heads if config.num_kv_heads is None else config.num_kv_heads
+        )
+        self.head_dim = (
+            config.head_dim
+            if config.head_dim is not None
+            else config.hidden_size // config.num_heads
+        )
+
         self.num_q_heads = config.num_heads
         self.hidden_size = config.num_heads * self.head_dim
 
-        self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False, dtype=torch.float16)
-        self.kv_proj_weight = nn.Parameter(torch.rand(self.num_kv_heads * self.head_dim * 2, self.hidden_size, dtype=torch.float16))
-        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False, dtype=torch.float16)
+        self.q_proj = nn.Linear(
+            self.hidden_size, self.hidden_size, bias=False, dtype=torch.float16
+        )
+        self.kv_proj_weight = nn.Parameter(
+            torch.rand(
+                self.num_kv_heads * self.head_dim * 2,
+                self.hidden_size,
+                dtype=torch.float16,
+            )
+        )
+        self.o_proj = nn.Linear(
+            self.hidden_size, self.hidden_size, bias=False, dtype=torch.float16
+        )
 
     def context_forward(
         self,
         x: torch.Tensor,
         atten_info,
-        layer_index:int,
+        layer_index: int,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        qk_scale = None,
-    ):         
-        batch_size, seq_len, _ = x.shape  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
+        qk_scale=None,
+    ):
+        batch_size, seq_len, _ = (
+            x.shape
+        )  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
         x = x.view(-1, self.hidden_size)
-        
+
         # 1. 计算 Q K V 并且 reshape 它们尺寸, 方便后续做 self-attention
         xq = self.q_proj(x)
-        k_proj_weight, v_proj_weight = torch.split(self.kv_proj_weight, self.num_kv_heads * self.head_dim, dim=0)
+        k_proj_weight, v_proj_weight = torch.split(
+            self.kv_proj_weight, self.num_kv_heads * self.head_dim, dim=0
+        )
         xk = F.linear(x, k_proj_weight)
         xv = F.linear(x, v_proj_weight)
 
@@ -47,15 +68,19 @@ def context_forward(
 
         cos, sin = position_embeddings
         xq, xk = rope_emb_forward(xq, xk, cos, sin, batch_size, seq_len)
-        combined_kv = torch.cat([xk, xv], dim=-2) # (B*L, 2*num_kv_heads, head_dim)  
-        update_kv_buffer(combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index])
+        combined_kv = torch.cat([xk, xv], dim=-2)  # (B*L, 2*num_kv_heads, head_dim)
+        update_kv_buffer(
+            combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index]
+        )
 
         # 3. sel-attention. flashattention 计算: softmax(qk^t) * v
         output = flash_attention2_no_pad(
-            xq, xk, xv,
+            xq,
+            xk,
+            xv,
             qk_scale,
-            atten_info.b_start_loc, 
-            atten_info.b_seq_len, 
+            atten_info.b_start_loc,
+            atten_info.b_seq_len,
             seq_len,
         )
 
@@ -65,92 +90,122 @@ def context_forward(
         output = self.o_proj(output)
         return output
 
-    def token_forward(self, 
+    def token_forward(
+        self,
         x: torch.Tensor,
         atten_info,
-        layer_index:int,
+        layer_index: int,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        qk_scale = None, 
+        qk_scale=None,
     ):
-        batch_size, seq_len, _ = x.shape  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
+        batch_size, seq_len, _ = (
+            x.shape
+        )  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
         x = x.view(-1, self.hidden_size)
-        
+
         # 1. 计算 Q K V 并且 reshape 它们尺寸, 方便后续做 self-attention
         xq = self.q_proj(x)
-        xkv = F.linear(x, self.kv_proj_weight.data) # (B, L, 2 * num_kv_heads * head_dim)
-        
+        xkv = F.linear(
+            x, self.kv_proj_weight.data
+        )  # (B, L, 2 * num_kv_heads * head_dim)
+
         # 2. 应用旋转位置编码到 Q 和 K, 获取 kv 缓冲向量并更新 kv 向量
         xk, xv = torch.split(xkv, self.num_kv_heads * self.head_dim, dim=-1)
         xq = xq.view(batch_size, self.num_q_heads, self.head_dim)
         xk = xk.view(batch_size, self.num_kv_heads, self.head_dim)
         xv = xv.view(batch_size, self.num_kv_heads, self.head_dim)
-        
+
         cos, sin = position_embeddings
         xq, xk = rope_emb_forward(xq, xk, cos, sin, batch_size, seq_len)
 
         # 3. 完成形状变换, 并更新 kv_buffer, 即类似 torch.concat[past_kv_values, kv_values]
-        combined_kv = torch.cat([xk, xv], dim=-2) # (BS, 2*num_kv_heads, head_dim)
+        combined_kv = torch.cat([xk, xv], dim=-2)  # (BS, 2*num_kv_heads, head_dim)
         # 更新 kv_buffer, atten_info.kv_buffer[layer_index]
-        update_kv_buffer(combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index])
-        
+        update_kv_buffer(
+            combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index]
+        )
+
         # 4. flashattention 计算: softmax(qk^t) * v
         output = flash_decoding(
-            xq, 
-            atten_info.kv_buffer[layer_index][:, : self.num_kv_heads, :], 
-            atten_info.kv_buffer[layer_index][:, self.num_kv_heads:, :], 
+            xq,
+            atten_info.kv_buffer[layer_index][:, : self.num_kv_heads, :],
+            atten_info.kv_buffer[layer_index][:, self.num_kv_heads :, :],
             qk_scale,
-            atten_info.b_req_tokens_table, 
-            atten_info.b_seq_len, 
-            atten_info.max_actual_seq_len
-        ) # ouput shape is [batchs, num_heads, head_dim]; batchs = batch_size(seq_len = 1)
-        
+            atten_info.b_req_tokens_table,
+            atten_info.b_seq_len,
+            atten_info.max_actual_seq_len,
+        )  # ouput shape is [batchs, num_heads, head_dim]; batchs = batch_size(seq_len = 1)
+
         output = output.view(batch_size, seq_len, self.hidden_size)
         output = self.o_proj(output)
         return output
 
-class FusedMLP(nn.Module):
 
+class FusedMLP(nn.Module):
     def __init__(self, config: LlamaConfig):
         super().__init__()
-        
+
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
 
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16)
+        self.gate_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.up_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.down_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16
+        )
 
     def forward(self, x):
         return self.down_proj(swiglu_forward(self.gate_proj(x), self.up_proj(x)))
 
-class LlamaDecoderLayer(nn.Module):
 
+class LlamaDecoderLayer(nn.Module):
     def __init__(self, config: LlamaConfig):
         super().__init__()
-        self.config= config
+        self.config = config
         self.num_heads = config.num_heads
         self.hidden_size = config.hidden_size
-        self.head_dim = config.head_dim if config.head_dim is not None else config.hidden_size // config.num_heads
+        self.head_dim = (
+            config.head_dim
+            if config.head_dim is not None
+            else config.hidden_size // config.num_heads
+        )
         self.rmsnorm_eps = config.rms_norm_eps
 
-        self.attention_norm_weight = nn.Parameter(torch.ones(self.hidden_size,), requires_grad=False)
-        self.ffn_norm_weight = nn.Parameter(torch.ones(self.hidden_size,), requires_grad=False)
-        
+        self.attention_norm_weight = nn.Parameter(
+            torch.ones(
+                self.hidden_size,
+            ),
+            requires_grad=False,
+        )
+        self.ffn_norm_weight = nn.Parameter(
+            torch.ones(
+                self.hidden_size,
+            ),
+            requires_grad=False,
+        )
+
         self.self_attn = FusedAttention(config)
         self.mlp = FusedMLP(config)
 
-    def forward(self, 
-        hidden_states: torch.Tensor, 
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
         atten_info,
         layer_index: int,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        qk_scale = None,
-        residual: Optional[torch.Tensor] = None
+        qk_scale=None,
+        residual: Optional[torch.Tensor] = None,
     ):
         # Normalization before the attention block.
         _, seq_len, _ = hidden_states.shape
-        
-        hidden_states, residual = skip_rmsnorm(hidden_states, residual, self.attention_norm_weight.data, self.rmsnorm_eps)
+
+        hidden_states, residual = skip_rmsnorm(
+            hidden_states, residual, self.attention_norm_weight.data, self.rmsnorm_eps
+        )
 
         if seq_len > 1:
             hidden_states = self.self_attn.context_forward(
@@ -160,11 +215,13 @@ def forward(self,
             hidden_states = self.self_attn.token_forward(
                 hidden_states, atten_info, layer_index, position_embeddings, qk_scale
             )
-        
-        hidden_states, residual = skip_rmsnorm(hidden_states, residual, self.ffn_norm_weight.data, self.rmsnorm_eps)
+
+        hidden_states, residual = skip_rmsnorm(
+            hidden_states, residual, self.ffn_norm_weight.data, self.rmsnorm_eps
+        )
         hidden_states = self.mlp.forward(hidden_states)
         return hidden_states, residual
-        
+
 
 class LlamaModel(nn.Module):
     def __init__(self, config: LlamaConfig):
@@ -173,35 +230,48 @@ def __init__(self, config: LlamaConfig):
         self.config = config
         self.vocab_size = config.vocab_size
         self.num_layers = config.num_layers
-        self.head_dim = config.head_dim if config.head_dim is not None else config.hidden_size // config.num_heads
-        self.qk_scale = 1.0 / (self.head_dim ** 0.5)
+        self.head_dim = (
+            config.head_dim
+            if config.head_dim is not None
+            else config.hidden_size // config.num_heads
+        )
+        self.qk_scale = 1.0 / (self.head_dim**0.5)
         self.rmsnorm_eps = config.rms_norm_eps
 
         # self.hidden_states = []
 
         self.rotary_emb = LlamaRotaryEmbedding(config=config)
-        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, dtype=torch.float16)
-        self.norm_weight = nn.Parameter(torch.ones(config.hidden_size,), requires_grad=False)
+        self.embed_tokens = nn.Embedding(
+            self.vocab_size, config.hidden_size, dtype=torch.float16
+        )
+        self.norm_weight = nn.Parameter(
+            torch.ones(
+                config.hidden_size,
+            ),
+            requires_grad=False,
+        )
 
         # 使用 nn.Linear 层替代 lm_head_weight
-        self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False, dtype=torch.float16)
+        self.lm_head = nn.Linear(
+            config.hidden_size, self.vocab_size, bias=False, dtype=torch.float16
+        )
 
         self.layers = nn.ModuleList(
             [LlamaDecoderLayer(config) for _ in range(config.num_layers)]
         )
 
     def forward(
-        self, 
+        self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        atten_info, 
+        atten_info,
         inputs_embeds: Optional[torch.Tensor] = None,
     ):
         # self.hidden_states = []
         batch_size, seq_len = input_ids.shape
         residual = None
 
-        if inputs_embeds is not None: # To support Multi-model Model
+        if inputs_embeds is not None:  # To support Multi-model Model
             h = inputs_embeds
         else:
             h = self.get_input_embeddings(input_ids)
@@ -210,18 +280,24 @@ def forward(
             qk_scale = self.qk_scale * 1.4426950408889634
         else:
             qk_scale = self.qk_scale
-        
-        position_embeddings = self.rotary_emb(h, position_ids) # cos shape is [1, seq_len, head_dim] -> decode: [batch_size, seq_len, head_dim]
-        
-        for i, layer in enumerate(self.layers): # Consecutively apply all the encoder layers
+
+        position_embeddings = self.rotary_emb(
+            h, position_ids
+        )  # cos shape is [1, seq_len, head_dim] -> decode: [batch_size, seq_len, head_dim]
+
+        for i, layer in enumerate(
+            self.layers
+        ):  # Consecutively apply all the encoder layers
             # self.hidden_states.append(h)
-            h, residual = layer(h, atten_info, i, position_embeddings, qk_scale, residual)  # h.shape [batch_size, seq_len, hidden_dim]
+            h, residual = layer(
+                h, atten_info, i, position_embeddings, qk_scale, residual
+            )  # h.shape [batch_size, seq_len, hidden_dim]
 
         h, _ = skip_rmsnorm(h, residual, self.norm_weight.data, self.rmsnorm_eps)
         # self.hidden_states.append(h)
         output = self.lm_head(h)
 
         return output
-    
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
\ No newline at end of file
+        return self.embed_tokens(input_ids)
diff --git a/lite_llama/models/llava.py b/lite_llama/models/llava.py
index 6ec741d..bfb5410 100644
--- a/lite_llama/models/llava.py
+++ b/lite_llama/models/llava.py
@@ -12,20 +12,20 @@
 
 
 class LlavaMultiModalProjector(nn.Module):
-    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
-                 projector_hidden_act: str="gelu"):
+    def __init__(
+        self,
+        vision_hidden_size: int,
+        text_hidden_size: int,
+        projector_hidden_act: str = "gelu",
+    ):
         super().__init__()
 
-        self.linear_1 = nn.Linear(vision_hidden_size,
-                                  text_hidden_size,
-                                  bias=True)
-        self.linear_2 = nn.Linear(text_hidden_size,
-                                  text_hidden_size,
-                                  bias=True)
+        self.linear_1 = nn.Linear(vision_hidden_size, text_hidden_size, bias=True)
+        self.linear_2 = nn.Linear(text_hidden_size, text_hidden_size, bias=True)
 
     def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         hidden_states = self.linear_1(image_features)
-        hidden_states = F.gelu(hidden_states) # GELU 激活函数
+        hidden_states = F.gelu(hidden_states)  # GELU 激活函数
         hidden_states = self.linear_2(hidden_states)
         return hidden_states
 
@@ -35,9 +35,11 @@ def __init__(self, llava_config: LlavaConfig):
         super().__init__()
         self.device = "cuda"
         self.llava_config = llava_config
-        text_config = self.llava_config.text_config # TODO: 将 text_config 转换成 LlamaConfig 类型
+        text_config = (
+            self.llava_config.text_config
+        )  # TODO: 将 text_config 转换成 LlamaConfig 类型
         self.llama_config = LlamaConfig.from_dict(text_config.to_dict())
-        
+
         self.select_layer = llava_config.vision_feature_layer
         self.select_feature = llava_config.vision_feature_select_strategy
 
@@ -46,19 +48,22 @@ def __init__(self, llava_config: LlavaConfig):
 
         # 多模态投影器（multi_modal_projector）初始化
         self.multi_modal_projector = LlavaMultiModalProjector(
-            vision_hidden_size = llava_config.vision_config.hidden_size,
-            text_hidden_size = llava_config.text_config.hidden_size,
-            projector_hidden_act = llava_config.projector_hidden_act)
-        
+            vision_hidden_size=llava_config.vision_config.hidden_size,
+            text_hidden_size=llava_config.text_config.hidden_size,
+            projector_hidden_act=llava_config.projector_hidden_act,
+        )
+
         # 语言模型初始化
         self.language_model = LlamaModel(self.llama_config)
-        
-        self.pad_token_id = self.llava_config.pad_token_id if self.llava_config.pad_token_id is not None else -1
-    
+
+        self.pad_token_id = (
+            self.llava_config.pad_token_id
+            if self.llava_config.pad_token_id is not None
+            else -1
+        )
+
     def _select_image_features(
-        self, 
-        image_features: torch.Tensor,
-        strategy: str
+        self, image_features: torch.Tensor, strategy: str
     ) -> torch.Tensor:
         """根据策略选择图像特征"""
         # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
@@ -68,63 +73,76 @@ def _select_image_features(
             return image_features
 
         raise ValueError(f"Unexpected select feature strategy: {strategy}")
-    
+
     def vision_encode(self, image_tensor):
         x = image_tensor.half().to(device=self.device)
-        
+
         # 1. 通过视觉处理模块提取图像特征
-        x = self.vision_tower(x, output_hidden_states = True)
+        x = self.vision_tower(x, output_hidden_states=True)
         x = x.hidden_states[self.select_layer]
         x = self._select_image_features(x, self.select_feature)
-        
+
         # 2. 通过多模态投影器将图像特征转换为多模态嵌入
         image_features = self.multi_modal_projector(x)
 
-        assert not torch.isnan(image_features).any(), f"After vision_tower image_features tensor contains NaN values!"
+        assert not torch.isnan(image_features).any(), (
+            f"After vision_tower image_features tensor contains NaN values!"
+        )
         return image_features
-    
+
     def get_multi_modal_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        vision_embeddings = None,
+        vision_embeddings=None,
     ) -> torch.Tensor:
         """获取输入嵌入，包括文本和视觉嵌入的合并。"""
-        llm_inputs_embeds = self.language_model.get_input_embeddings(input_ids) # torch.Size([1, 22]) --> torch.Size([1, 22, 4096])
-        
+        llm_inputs_embeds = self.language_model.get_input_embeddings(
+            input_ids
+        )  # torch.Size([1, 22]) --> torch.Size([1, 22, 4096])
+
         # torch.Size([1, 576, 4096]) torch.Size([1, 22, 4096]) torch.Size([1, 22])
         # print("self.llava_config.image_token_index is ", self.llava_config.image_token_index)
         if vision_embeddings is not None:
             inputs_embeds, position_ids = merge_input_ids_with_image_features(
-                input_ids, llm_inputs_embeds, vision_embeddings, 
+                input_ids,
+                llm_inputs_embeds,
+                vision_embeddings,
                 self.llava_config.pad_token_id,
                 self.llava_config.image_token_index,
             )
-        
-        assert not torch.isnan(inputs_embeds).any(), f"After merge inputs_embeds tensor contains NaN values!"
+
+        assert not torch.isnan(inputs_embeds).any(), (
+            f"After merge inputs_embeds tensor contains NaN values!"
+        )
 
         return inputs_embeds, position_ids
-    
+
     def forward(
-        self, 
-        input_ids: torch.Tensor, 
-        position_ids: torch.Tensor,  
-        atten_info, 
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        atten_info,
         image_tensor: Optional[torch.FloatTensor] = None,
     ):
-        input_ids = input_ids.to(self.device) # 将 input_ids 移动到设备
-        if position_ids is not None: # 如果提供了 position_ids，将其移动到设备
+        input_ids = input_ids.to(self.device)  # 将 input_ids 移动到设备
+        if position_ids is not None:  # 如果提供了 position_ids，将其移动到设备
             position_ids = position_ids.to(self.device)
-            
-        if input_ids.shape[1] != 1: # 判断是不是首次 token 输出
-            vision_embeddings = self.vision_encode(image_tensor) #  torch.Size([1, 3, 336, 336]) --> torch.Size([1, 576, 4096])
-            inputs_embeds, position_ids = self.get_multi_modal_input_embeddings(input_ids, vision_embeddings)
-        else: # 进入 decode 阶段, 无需再做视觉编码
+
+        if input_ids.shape[1] != 1:  # 判断是不是首次 token 输出
+            vision_embeddings = self.vision_encode(
+                image_tensor
+            )  #  torch.Size([1, 3, 336, 336]) --> torch.Size([1, 576, 4096])
+            inputs_embeds, position_ids = self.get_multi_modal_input_embeddings(
+                input_ids, vision_embeddings
+            )
+        else:  # 进入 decode 阶段, 无需再做视觉编码
             inputs_embeds = None
-       
-        hidden_states = self.language_model(input_ids = input_ids,
-                                            position_ids = position_ids,
-                                            atten_info = atten_info,
-                                            inputs_embeds = inputs_embeds
-                                            )
-        
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            atten_info=atten_info,
+            inputs_embeds=inputs_embeds,
+        )
+
         return hidden_states
diff --git a/lite_llama/models/model_config.py b/lite_llama/models/model_config.py
index 4b167e0..e817453 100644
--- a/lite_llama/models/model_config.py
+++ b/lite_llama/models/model_config.py
@@ -1,4 +1,4 @@
-from dataclasses import dataclass,field, fields
+from dataclasses import dataclass, field, fields
 from typing import Any, Dict, List, Optional, Tuple, Union
 import os, json
 
@@ -12,9 +12,9 @@ class LlamaConfig:
     eos_token_id: Optional[int] = None
     head_dim: Optional[int] = None
     hidden_act: str = "silu"
-    
+
     initializer_range: float = 0.02
-    
+
     hidden_size: int = 2048  # 默认值调整为2048，保持一致性
     intermediate_size: int = 8192
     max_position_embeddings: Optional[int] = None
@@ -45,12 +45,12 @@ def __post_init__(self):
             self.head_dim = None  # 或者设置一个默认值，例如 64
 
     @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> 'LlamaConfig':
+    def from_dict(cls, data: Dict[str, Any]) -> "LlamaConfig":
         # 定义字段映射
         key_mappings = {
-            'num_attention_heads': 'num_heads',
-            'num_hidden_layers': 'num_layers',
-            'num_key_value_heads': 'num_kv_heads',
+            "num_attention_heads": "num_heads",
+            "num_hidden_layers": "num_layers",
+            "num_key_value_heads": "num_kv_heads",
         }
 
         # 创建一个复制的字典，以避免修改原始数据
@@ -69,34 +69,34 @@ def from_dict(cls, data: Dict[str, Any]) -> 'LlamaConfig':
 
         # 设置默认值，确保所有必要字段都有值
         defaults = {
-            'architectures': ["LlamaForCausalLM"],
-            'attention_bias': False,
-            'attention_dropout': 0.0,
-            'bos_token_id': None,
-            'eos_token_id': None,
-            'hidden_act': "silu",
-            'initializer_range': 0.02,
-            'hidden_size': 2048,
-            'intermediate_size': 8192,
-            'max_position_embeddings': None,
-            'mlp_bias': False,
-            'model_type': "llama",
-            'num_heads': 32,
-            'num_layers': 32,
-            'num_kv_heads': None,
-            'pretraining_tp': 1,
-            'rms_norm_eps': 1e-5,
-            'rope_scaling': None,
-            'rope_theta': 10000.0,
-            'tie_word_embeddings': True,
-            'torch_dtype': "bfloat16",
-            'transformers_version': None,
-            'use_cache': True,
-            'vocab_size': 32064,
-            '_name_or_path': None,
-            'max_batch_size': 64,
-            'max_seq_len': 2048,
-            'device': "cuda",
+            "architectures": ["LlamaForCausalLM"],
+            "attention_bias": False,
+            "attention_dropout": 0.0,
+            "bos_token_id": None,
+            "eos_token_id": None,
+            "hidden_act": "silu",
+            "initializer_range": 0.02,
+            "hidden_size": 2048,
+            "intermediate_size": 8192,
+            "max_position_embeddings": None,
+            "mlp_bias": False,
+            "model_type": "llama",
+            "num_heads": 32,
+            "num_layers": 32,
+            "num_kv_heads": None,
+            "pretraining_tp": 1,
+            "rms_norm_eps": 1e-5,
+            "rope_scaling": None,
+            "rope_theta": 10000.0,
+            "tie_word_embeddings": True,
+            "torch_dtype": "bfloat16",
+            "transformers_version": None,
+            "use_cache": True,
+            "vocab_size": 32064,
+            "_name_or_path": None,
+            "max_batch_size": 64,
+            "max_seq_len": 2048,
+            "device": "cuda",
         }
 
         # 更新缺失的字段
@@ -104,7 +104,8 @@ def from_dict(cls, data: Dict[str, Any]) -> 'LlamaConfig':
             data_filtered.setdefault(key, value)
 
         return cls(**data_filtered)
-    
+
+
 @dataclass
 class Qwen2Config:
     max_batch_size: int = 4
@@ -115,10 +116,10 @@ class Qwen2Config:
     bos_token_id: Optional[int] = 151643
     eos_token_id: Optional[int] = 151645
     hidden_act: str = "silu"
-    
+
     # dim: Optional[int] = None
     initializer_range: float = 0.02
-    
+
     # 模型隐藏层大小, Qwen2.5-1.5B-Instruct
     hidden_size: Optional[int] = 1536
     intermediate_size: Optional[int] = 8960
@@ -159,13 +160,13 @@ def __init__(self, config_dict: Optional[Dict[str, Any]] = None, **kwargs):
         if config_dict is not None:
             for key, value in config_dict.items():
                 # 处理名称映射
-                if key == 'num_attention_heads':
+                if key == "num_attention_heads":
                     self.num_heads = value
-                elif key == 'num_hidden_layers':
+                elif key == "num_hidden_layers":
                     self.num_layers = value
-                elif key == 'num_key_value_heads':
+                elif key == "num_key_value_heads":
                     self.num_kv_heads = value
-                elif key == 'max_length':
+                elif key == "max_length":
                     self.max_seq_len = value
                 else:
                     setattr(self, key, value)
@@ -178,28 +179,30 @@ def __init__(self, config_dict: Optional[Dict[str, Any]] = None, **kwargs):
                 # 如果属性不存在，可以选择存储在 extra_args 中，或者直接添加
                 setattr(self, key, value)
         self.head_dim = self.hidden_size // self.num_heads
-    
+
+
 @dataclass
-class CLIPVisionConfig():
+class CLIPVisionConfig:
     """
     This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
     CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
     [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
     """
-    hidden_size: int = 768,
-    intermediate_size: int = 3072,
-    projection_dim: int = 512,
-    num_layers: int = 12, # encoder_layer 层数
-    num_heads: int = 12,  # attention 模块的头数目
-    num_channels: int = 3,
-    image_size: int = 224,
-    patch_size: int = 32,
-    hidden_act: int = "quick_gelu",
-    layer_norm_eps: int = 1e-5,
-    attention_dropout: int = 0.0,
-    initializer_range: int = 0.02,
-    initializer_factor: int = 1.0,
+
+    hidden_size: int = (768,)
+    intermediate_size: int = (3072,)
+    projection_dim: int = (512,)
+    num_layers: int = (12,)  # encoder_layer 层数
+    num_heads: int = (12,)  # attention 模块的头数目
+    num_channels: int = (3,)
+    image_size: int = (224,)
+    patch_size: int = (32,)
+    hidden_act: int = ("quick_gelu",)
+    layer_norm_eps: int = (1e-5,)
+    attention_dropout: int = (0.0,)
+    initializer_range: int = (0.02,)
+    initializer_factor: int = (1.0,)
 
     model_type: str = "clip_vision_model"
 
@@ -220,11 +223,11 @@ def __init__(self, config_dict: Optional[Dict[str, Any]] = None, **kwargs):
 
             for key, value in config_dict.items():
                 # 处理名称映射
-                if key == 'num_attention_heads':
+                if key == "num_attention_heads":
                     self.num_heads = value
-                elif key == 'num_hidden_layers':
+                elif key == "num_hidden_layers":
                     self.num_layers = value
-                elif key == 'num_key_value_heads':
+                elif key == "num_key_value_heads":
                     self.num_kv_heads = value
                 else:
                     setattr(self, key, value)
@@ -252,7 +255,7 @@ class VisionConfig:
     vocab_size: int
 
     @staticmethod
-    def from_dict(data: Dict[str, Any]) -> 'VisionConfig':
+    def from_dict(data: Dict[str, Any]) -> "VisionConfig":
         return VisionConfig(
             hidden_size=data.get("hidden_size", 768),
             image_size=data.get("image_size", 224),
@@ -262,9 +265,10 @@ def from_dict(data: Dict[str, Any]) -> 'VisionConfig':
             num_hidden_layers=data.get("num_hidden_layers", 12),
             patch_size=data.get("patch_size", 16),
             projection_dim=data.get("projection_dim", 768),
-            vocab_size=data.get("vocab_size", 1000)
+            vocab_size=data.get("vocab_size", 1000),
         )
 
+
 @dataclass
 class LlavaConfig:
     architectures: List[str]
@@ -285,9 +289,9 @@ class LlavaConfig:
     max_batch_size: int = 64
     max_seq_len: int = 10240
     device: str = "cuda"
-    
+
     @staticmethod
-    def from_dict(data: Dict[str, Any]) -> 'LlavaConfig':
+    def from_dict(data: Dict[str, Any]) -> "LlavaConfig":
         text_cfg = LlamaConfig.from_dict(data.get("text_config", {}))
         vision_cfg = VisionConfig.from_dict(data.get("vision_config", {}))
         return LlavaConfig(
@@ -303,13 +307,15 @@ def from_dict(data: Dict[str, Any]) -> 'LlavaConfig':
             transformers_version=data.get("transformers_version", "4.36.0.dev0"),
             vision_config=vision_cfg,
             vision_feature_layer=data.get("vision_feature_layer", -2),
-            vision_feature_select_strategy=data.get("vision_feature_select_strategy", "default"),
-            vocab_size=data.get("vocab_size", 32064)
+            vision_feature_select_strategy=data.get(
+                "vision_feature_select_strategy", "default"
+            ),
+            vocab_size=data.get("vocab_size", 32064),
         )
 
     @classmethod
-    def from_json(cls, json_path: str) -> 'LlavaConfig':
-        with open(json_path, 'r') as f:
+    def from_json(cls, json_path: str) -> "LlavaConfig":
+        with open(json_path, "r") as f:
             data = json.load(f)
         return cls.from_dict(data)
 
@@ -318,6 +324,6 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
         """类方法，用于从指定的 JSON 文件中读取数据并将其解析为字典对象"""
         with open(json_file, "r", encoding="utf-8") as reader:
             text = reader.read()
-        
+
         # NOTE: 使用 json.loads 函数将读取到的 JSON 格式字符串解析为 Python 字典对象
         return json.loads(text)
diff --git a/lite_llama/models/qwen2.py b/lite_llama/models/qwen2.py
index e432f33..4c59cac 100644
--- a/lite_llama/models/qwen2.py
+++ b/lite_llama/models/qwen2.py
@@ -15,63 +15,72 @@ def __init__(self, num_q_heads: int, num_kv_heads: int, head_dim: int):
         self.num_kv_heads = num_kv_heads
         self.head_dim = head_dim
         self.hidden_size = num_q_heads * head_dim
-        
+
     def context_forward(
         self,
         xq: torch.Tensor,
         xk: torch.Tensor,
         xv: torch.Tensor,
         atten_info,
-        layer_index:int,
-        qk_scale = None,
+        layer_index: int,
+        qk_scale=None,
     ) -> torch.Tensor:
         # 1. 获取 prefill 阶段的 cur_select_index, 并更新 kv cache 张量
-        combined_kv = torch.cat([xk, xv], dim=-2) # (B, L, 2*num_kv_heads, head_dim)
+        combined_kv = torch.cat([xk, xv], dim=-2)  # (B, L, 2*num_kv_heads, head_dim)
         # 更新 kv_buffer, atten_info.kv_buffer[layer_index]
-        update_kv_buffer(combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index])
+        update_kv_buffer(
+            combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index]
+        )
 
         # 2. sel-attention. flashattention 计算: softmax(qk^t) * v
         output = flash_attention2_no_pad(
-            xq, xk, xv,
+            xq,
+            xk,
+            xv,
             qk_scale,
-            atten_info.b_start_loc, # 批次中每个请求的开始索引位置
-            atten_info.b_seq_len, 
+            atten_info.b_start_loc,  # 批次中每个请求的开始索引位置
+            atten_info.b_seq_len,
             atten_info.max_actual_seq_len,
         )
-        return output # shape is [batch_size*seq_len, num_heads, head_dim]
+        return output  # shape is [batch_size*seq_len, num_heads, head_dim]
 
-    def token_forward(self, 
+    def token_forward(
+        self,
         xq: torch.Tensor,
         xk: torch.Tensor,
         xv: torch.Tensor,
         atten_info,
-        layer_index:int,
-        qk_scale = None, # 计算 attention 分数缩放的系数
+        layer_index: int,
+        qk_scale=None,  # 计算 attention 分数缩放的系数
     ) -> torch.Tensor:
         # xq = xq.to(torch.float16)
         # 1. 先获取 kv 缓冲向量再更新 kv_buffer, atten_info.kv_buffer[layer_index]
-        combined_kv = torch.cat([xk, xv], dim=-2) # (B*L, 2*num_kv_heads, head_dim)
-        update_kv_buffer(combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index])
+        combined_kv = torch.cat([xk, xv], dim=-2)  # (B*L, 2*num_kv_heads, head_dim)
+        update_kv_buffer(
+            combined_kv, atten_info.cur_select_index, atten_info.kv_buffer[layer_index]
+        )
 
         # 2. flashattention 计算: softmax(qk^t) * v
         output = flash_decoding(
             xq,
-            atten_info.kv_buffer[layer_index][:, : self.num_kv_heads, :], 
-            atten_info.kv_buffer[layer_index][:, self.num_kv_heads:, :], 
+            atten_info.kv_buffer[layer_index][:, : self.num_kv_heads, :],
+            atten_info.kv_buffer[layer_index][:, self.num_kv_heads :, :],
             qk_scale,
-            atten_info.b_req_tokens_table, 
-            atten_info.b_seq_len, 
-            atten_info.max_actual_seq_len
-        ) # shape is [batch_size*seq_len, num_heads, head_dim]
+            atten_info.b_req_tokens_table,
+            atten_info.b_seq_len,
+            atten_info.max_actual_seq_len,
+        )  # shape is [batch_size*seq_len, num_heads, head_dim]
 
         return output
 
+
 class Qwen2Attention(nn.Module):
-    def __init__(self,  
+    def __init__(
+        self,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        dtype = torch.float16,
+        dtype=torch.float16,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -80,43 +89,57 @@ def __init__(self,
         self.num_heads = num_heads
         self.head_dim = hidden_size // num_heads
 
-        self.q_proj_weight = nn.Parameter(torch.rand(hidden_size, hidden_size, dtype=torch.float16))
+        self.q_proj_weight = nn.Parameter(
+            torch.rand(hidden_size, hidden_size, dtype=torch.float16)
+        )
         self.q_proj_bias = nn.Parameter(torch.rand(hidden_size, dtype=torch.float16))
 
-        self.kv_proj_weight = nn.Parameter(torch.rand(self.num_kv_heads * self.head_dim * 2, self.hidden_size, dtype=torch.float16))
-        self.kv_proj_bias = nn.Parameter(torch.rand(self.num_kv_heads * self.head_dim * 2, dtype=torch.float16))
-        self.o_proj_weight = nn.Parameter(torch.rand(hidden_size, hidden_size, dtype=torch.float16))
+        self.kv_proj_weight = nn.Parameter(
+            torch.rand(
+                self.num_kv_heads * self.head_dim * 2,
+                self.hidden_size,
+                dtype=torch.float16,
+            )
+        )
+        self.kv_proj_bias = nn.Parameter(
+            torch.rand(self.num_kv_heads * self.head_dim * 2, dtype=torch.float16)
+        )
+        self.o_proj_weight = nn.Parameter(
+            torch.rand(hidden_size, hidden_size, dtype=torch.float16)
+        )
 
         self.attn = Attention(num_heads, num_kv_heads, self.head_dim)
 
     def _get_qkv(
-        self, 
+        self,
         x: torch.Tensor,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> torch.Tensor:
-        batch_size, seq_len, _ = x.shape  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
+        batch_size, seq_len, _ = (
+            x.shape
+        )  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
         x = x.view(-1, self.hidden_size)
 
         xq = F.linear(x, self.q_proj_weight.data, bias=self.q_proj_bias.data)
         xkv = F.linear(x, self.kv_proj_weight.data, bias=self.kv_proj_bias.data)
         xk, xv = torch.split(xkv, self.num_kv_heads * self.head_dim, dim=-1)
 
-        xq = xq.view(batch_size*seq_len, self.num_heads, self.head_dim)
-        xk = xk.view(batch_size*seq_len, self.num_kv_heads, self.head_dim)
-        xv = xv.view(batch_size*seq_len, self.num_kv_heads, self.head_dim)
+        xq = xq.view(batch_size * seq_len, self.num_heads, self.head_dim)
+        xk = xk.view(batch_size * seq_len, self.num_kv_heads, self.head_dim)
+        xv = xv.view(batch_size * seq_len, self.num_kv_heads, self.head_dim)
 
         cos, sin = position_embeddings
         xq, xk = rope_emb_forward(xq, xk, cos, sin, batch_size, seq_len)
 
         return xq, xk, xv
-    
+
     def forward(
         self,
         x: torch.Tensor,
         atten_info,
-        layer_index:int,
+        layer_index: int,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        qk_scale = None,
+        qk_scale=None,
     ) -> torch.Tensor:
         batch_size, seq_len, _ = x.shape
 
@@ -126,78 +149,119 @@ def forward(
         # 根据输入张量 seq_len 长度选择 context_forward 还是 token_forward
         if seq_len > 1:
             attn_output = self.attn.context_forward(
-                xq, xk, xv,
-                atten_info, layer_index,
+                xq,
+                xk,
+                xv,
+                atten_info,
+                layer_index,
                 qk_scale,
             )
-            attn_output = attn_output.view(batch_size, seq_len, self.hidden_size) # 输出张量 seq_len = 1
+            attn_output = attn_output.view(
+                batch_size, seq_len, self.hidden_size
+            )  # 输出张量 seq_len = 1
             # if torch.isnan(attn_output).any(): # 检查 NaNs
-            #     raise ValueError(f"NaNs detected in context_forward output at layer {layer_index}")    
+            #     raise ValueError(f"NaNs detected in context_forward output at layer {layer_index}")
         else:
             attn_output = self.attn.token_forward(
-                xq, xk, xv, 
-                atten_info, layer_index,
+                xq,
+                xk,
+                xv,
+                atten_info,
+                layer_index,
                 qk_scale,
             )
-            attn_output = attn_output.view(batch_size, seq_len, self.hidden_size) # 输出张量 seq_len = 1
+            attn_output = attn_output.view(
+                batch_size, seq_len, self.hidden_size
+            )  # 输出张量 seq_len = 1
             # if torch.isnan(attn_output).any(): # 检查 NaNs
-            #     raise ValueError(f"NaNs detected in token_forward output at layer {layer_index}")    
+            #     raise ValueError(f"NaNs detected in token_forward output at layer {layer_index}")
 
         # 进行张量矩阵乘法, 需要对原始的 o_proj_weight 权重进行转置, attn_output shape is [batch_size, seq_len, hidden_size]
         output = F.linear(attn_output, self.o_proj_weight.data)
         return output
-    
+
+
 class FusedMLP(nn.Module):
     def __init__(self, config: Qwen2Config):
         super().__init__()
-        
+
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
 
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16) # torch.float32 cpu
+        self.gate_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.up_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.down_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16
+        )  # torch.float32 cpu
 
     def forward(self, x):
         return self.down_proj(swiglu_forward(self.gate_proj(x), self.up_proj(x)))
-        
+
+
 class Qwen2DecoderLayer(nn.Module):
     def __init__(self, config: Qwen2Config):
         super().__init__()
-        self.config= config
+        self.config = config
         self.num_heads = config.num_heads
         self.num_kv_heads = config.num_kv_heads
         self.hidden_size = config.hidden_size
-        self.head_dim = config.head_dim if config.head_dim is not None else config.hidden_size // config.num_heads
+        self.head_dim = (
+            config.head_dim
+            if config.head_dim is not None
+            else config.hidden_size // config.num_heads
+        )
 
         self.rmsnorm_eps = config.rms_norm_eps
 
         # 命名和 Qwen2ForCausalLM 一致
-        self.input_layernorm_weight = nn.Parameter(torch.ones(self.hidden_size, dtype=torch.float16))
-        self.post_attention_layernorm_weight = nn.Parameter(torch.ones(self.hidden_size, dtype=torch.float16))
-        
-        self.self_attn = Qwen2Attention(self.hidden_size, self.num_heads, self.num_kv_heads)
+        self.input_layernorm_weight = nn.Parameter(
+            torch.ones(self.hidden_size, dtype=torch.float16)
+        )
+        self.post_attention_layernorm_weight = nn.Parameter(
+            torch.ones(self.hidden_size, dtype=torch.float16)
+        )
+
+        self.self_attn = Qwen2Attention(
+            self.hidden_size, self.num_heads, self.num_kv_heads
+        )
         self.mlp = FusedMLP(config)
 
-    def forward(self, 
-        hidden_states: torch.Tensor, 
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
         atten_info,
         layer_index: int,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        qk_scale = None,
+        qk_scale=None,
         residual: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states, residual = skip_rmsnorm(hidden_states, residual, self.input_layernorm_weight.data, self.rmsnorm_eps)
+        hidden_states, residual = skip_rmsnorm(
+            hidden_states, residual, self.input_layernorm_weight.data, self.rmsnorm_eps
+        )
 
         # 调用 attention 模块
-        hidden_states = self.self_attn(hidden_states, atten_info, layer_index, position_embeddings, qk_scale)
-        
+        hidden_states = self.self_attn(
+            hidden_states, atten_info, layer_index, position_embeddings, qk_scale
+        )
+
         # 调用 mlp 模块
-        hidden_states, residual = skip_rmsnorm(hidden_states, residual, self.post_attention_layernorm_weight.data, self.rmsnorm_eps)
-        hidden_states = self.mlp.forward(hidden_states) # 调用 Feed Forward 模块并做残差连接
-        
+        hidden_states, residual = skip_rmsnorm(
+            hidden_states,
+            residual,
+            self.post_attention_layernorm_weight.data,
+            self.rmsnorm_eps,
+        )
+        hidden_states = self.mlp.forward(
+            hidden_states
+        )  # 调用 Feed Forward 模块并做残差连接
+
         return hidden_states, residual
 
+
 class Qwen2Model(nn.Module):
     def __init__(self, config: Qwen2Config):
         super().__init__()
@@ -208,19 +272,25 @@ def __init__(self, config: Qwen2Config):
         hidden_size = config.hidden_size
         vocab_size = config.vocab_size
         num_layers = config.num_layers
-        head_dim = config.head_dim if config.head_dim is not None else config.hidden_size // config.num_heads
+        head_dim = (
+            config.head_dim
+            if config.head_dim is not None
+            else config.hidden_size // config.num_heads
+        )
 
-        self.qk_scale = 1.0 / (head_dim ** 0.5)
+        self.qk_scale = 1.0 / (head_dim**0.5)
 
         self.rotary_emb = Qwen2RotaryEmbedding(config=config)
-        
+
         # Embedding 层权重的形状为 (vocab_size, hidden_size)
         self.embed_tokens = nn.Embedding(vocab_size, hidden_size, dtype=torch.float16)
         self.norm_weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.float16))
 
         # 使用 nn.Linear 层替代 lm_head_weight
-        self.lm_head_weight = nn.Parameter(torch.rand(vocab_size, hidden_size, dtype=torch.float16))
-        
+        self.lm_head_weight = nn.Parameter(
+            torch.rand(vocab_size, hidden_size, dtype=torch.float16)
+        )
+
         self.layers = nn.ModuleList(
             [Qwen2DecoderLayer(config) for _ in range(num_layers)]
         )
@@ -228,8 +298,9 @@ def __init__(self, config: Qwen2Config):
         # self.hidden_states = []
 
     def forward(
-        self, input_ids: torch.Tensor, 
-        position_ids: torch.Tensor, 
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
         atten_info,
         inputs_embeds: Optional[torch.Tensor] = None,
     ):
@@ -248,16 +319,17 @@ def forward(
             qk_scale = self.qk_scale
 
         position_embeddings = self.rotary_emb(h, position_ids)
-       
+
         # Consecutively apply all the encoder layers
-        for i, layer in enumerate(self.layers):            
+        for i, layer in enumerate(self.layers):
             # self.hidden_states.append(h)
-            h, residual = layer(h, atten_info, i, position_embeddings, qk_scale, residual)  # h.shape [batch_size, seq_len, hidden_dim]
+            h, residual = layer(
+                h, atten_info, i, position_embeddings, qk_scale, residual
+            )  # h.shape [batch_size, seq_len, hidden_dim]
 
-        h, _ = skip_rmsnorm(h, residual, self.norm_weight.data, self.rmsnorm_eps)        
+        h, _ = skip_rmsnorm(h, residual, self.norm_weight.data, self.rmsnorm_eps)
         output = F.linear(h, self.lm_head_weight.data)
         return output
-    
+
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
-    
\ No newline at end of file
diff --git a/lite_llama/models/utils.py b/lite_llama/models/utils.py
index 84c7e9f..6b045bf 100644
--- a/lite_llama/models/utils.py
+++ b/lite_llama/models/utils.py
@@ -1,7 +1,19 @@
 import itertools
 from dataclasses import dataclass, field
-from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping,
-                    Optional, Protocol, Tuple, Union, overload)
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Protocol,
+    Tuple,
+    Union,
+    overload,
+)
 
 import torch
 import torch.nn as nn
@@ -13,6 +25,7 @@
 Uses a list instead of a tensor if the dimensions of each element do not match.
 """
 
+
 def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor:
     """
     Create a weak reference to a tensor.
@@ -21,6 +34,7 @@ def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor:
     """
     return torch.ops._C.weak_ref_tensor(tensor)
 
+
 def _flatten_embeddings(embeddings: NestedTensors) -> torch.Tensor:
     """
     递归地将嵌套的张量结构 (NestedTensors) 在最后一个维度之外的所有维度展平, 并将它们连接成一个单一的二维张量。
@@ -41,8 +55,8 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str:
     if isinstance(embeddings, torch.Tensor):
         return " x ".join([str(dim) for dim in embeddings.shape[:-1]])
 
-    return " + ".join(
-        _embedding_count_expression(inner) for inner in embeddings)
+    return " + ".join(_embedding_count_expression(inner) for inner in embeddings)
+
 
 def _merge_multimodal_embeddings(
     inputs_embeds: torch.Tensor,
@@ -65,11 +79,13 @@ def _merge_multimodal_embeddings(
         expr = _embedding_count_expression(multimodal_embeddings)
         raise ValueError(
             f"Attempted to assign {expr} = {flattened.shape[0]} "
-            f"multimodal tokens to {num_expected_tokens} placeholders")
+            f"multimodal tokens to {num_expected_tokens} placeholders"
+        )
 
     inputs_embeds[is_multimodal] = flattened
     return inputs_embeds
 
+
 # def _merge_multimodal_embeddings(
 #     inputs_embeds: torch.Tensor,
 #     is_multimodal: torch.Tensor,
@@ -81,13 +97,13 @@ def _merge_multimodal_embeddings(
 #     flattened = _flatten_embeddings(multimodal_embeddings)
 #     print(f"Attempted to assign {num_expected_tokens} = {flattened.shape[0]} \
 #           multimodal tokens to {num_expected_tokens} placeholders")
-    
+
 #     if flattened.shape[0] != num_expected_tokens:
 #         expr = _embedding_count_expression(multimodal_embeddings)
 #         raise ValueError(
 #             f"Attempted to assign {expr} = {flattened.shape[0]} "
 #             f"multimodal tokens to {num_expected_tokens} placeholders")
-    
+
 #     # Ensure that the assignment is valid
 #     if flattened.shape[0] > num_expected_tokens:
 #         # Option 1: Truncate the embeddings
@@ -101,6 +117,7 @@ def _merge_multimodal_embeddings(
 #     inputs_embeds[is_multimodal] = flattened
 #     return inputs_embeds
 
+
 def merge_multimodal_embeddings(
     input_ids: torch.Tensor,
     inputs_embeds: torch.Tensor,
@@ -121,12 +138,14 @@ def merge_multimodal_embeddings(
         multimodal_embeddings,
     )
 
+
 def embed_multimodal(
     input_ids: torch.Tensor,
     multimodal_token_id: int,
     get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
-    get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor,
-                                                          List[torch.Tensor]]],
+    get_multimodal_embeds: Callable[
+        [torch.Tensor], Union[torch.Tensor, List[torch.Tensor]]
+    ],
 ) -> torch.Tensor:
     """
     Embed token IDs and multimodal inputs and combine their embeddings.
@@ -161,36 +180,44 @@ def embed_multimodal(
 
 
 def merge_input_ids_with_image_features2(
-    image_features, 
-    inputs_embeds, 
-    input_ids, 
+    image_features,
+    inputs_embeds,
+    input_ids,
     attention_mask,
     pad_token_id,
-    image_token_index
+    image_token_index,
 ):
     num_images, num_image_patches, embed_dim = image_features.shape
     batch_size, sequence_length = input_ids.shape
     # NOTE: 检查每个样本的最后一个 token 是否为填充 token
     # NOTE: 如果最后一个 token 不是填充 token，则为 True，表示存在左侧填充；否则为 False。
     left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(pad_token_id))
-    
+
     # 1. 创建图像 token 的掩码来获取特殊图像 token 的位置, 并计算新序列最大长度
     # NOTE: 一个布尔张量，标识 input_ids 中哪些位置是图像 token（即等于 image_token_index 的位置）
     special_image_token_mask = input_ids == image_token_index
     # NOTE: 每个样本中图像 token 的数量, 形状为 [batch_size, ]
     num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
-    
+
     # 计算合并图像特征后的新序列最大长度。
     # NOTE: 每个图像 token 位置会被替换为 (num_image_patches - 1) 个图像 paches embedding token。
-    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+    max_embed_dim = (
+        num_special_image_tokens.max() * (num_image_patches - 1)
+    ) + sequence_length
     # NOTE: 通过 torch.where 获取所有非图像 token 的位置索引。
     # NOTE: 当仅提供 condition 参数时，torch.where 等同于 torch.nonzero(condition, as_tuple=True)，返回满足条件的元素的索引。
-    batch_indices, non_image_indices = torch.where(input_ids != image_token_index) # 满足条件的样本索引和序列 token 索引
+    batch_indices, non_image_indices = torch.where(
+        input_ids != image_token_index
+    )  # 满足条件的样本索引和序列 token 索引
 
     # 2. 计算文本应写入的位置
     # NOTE: 每个图像 token 会增加 (num_image_patches - 1) 个位置。
-    new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
-    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1] # 计算需要的图像填充数量，以达到 max_embed_dim。
+    new_token_positions = (
+        torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+    )
+    nb_image_pad = (
+        max_embed_dim - 1 - new_token_positions[:, -1]
+    )  # 计算需要的图像填充数量，以达到 max_embed_dim。
     # 如果存在左侧填充 (left_padding 为 True)，则将 new_token_positions 进行偏移调整。
     if left_padding:
         new_token_positions += nb_image_pad[:, None]  # offset for left padding
@@ -199,12 +226,19 @@ def merge_input_ids_with_image_features2(
 
     # 3. 初始化最终的嵌入与注意力掩码
     final_embedding = torch.zeros(
-        batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        batch_size,
+        max_embed_dim,
+        embed_dim,
+        dtype=inputs_embeds.dtype,
+        device=inputs_embeds.device,
     )
     final_attention_mask = torch.zeros(
-        batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        batch_size,
+        max_embed_dim,
+        dtype=attention_mask.dtype,
+        device=inputs_embeds.device,
     )
-    
+
     # NOTE: 如果视觉模型或语言模型已卸载到 CPU，我们需要手动将相应的张量设置到正确的目标设备中。
     target_device = inputs_embeds.device
     batch_indices, non_image_indices, text_to_overwrite = (
@@ -214,27 +248,41 @@ def merge_input_ids_with_image_features2(
     )
     attention_mask = attention_mask.to(target_device)
 
-    # 4. 填充文本嵌入与注意力掩码. 
+    # 4. 填充文本嵌入与注意力掩码.
     # If we have ["hey" "<image>", "how", "are"]. we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
     # NOTE: 使用 batch_indices 和 text_to_overwrite 将 inputs_embeds 中的非图像 token 嵌入复制到 final_embedding 的相应位置。
-    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
-    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
+        batch_indices, non_image_indices
+    ]
+    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
+        batch_indices, non_image_indices
+    ]
 
     # 5. 填充图像特征与更新注意力掩码和位置 ID.
-    image_to_overwrite = torch.all(final_embedding == 0, dim=-1) # 找出 final_embedding 中所有维度为0的位置（即尚未填充的地方）。
+    image_to_overwrite = torch.all(
+        final_embedding == 0, dim=-1
+    )  # 找出 final_embedding 中所有维度为0的位置（即尚未填充的地方）。
     # NOTE: 使用 cumsum 计算累积和，确保这些位置在填充数量 (nb_image_pad) 之后。
-    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(
+        target_device
+    )
 
-    if image_to_overwrite.sum() != image_features.shape[:-1].numel():  # 如果需要填充的位置数量不等于 image_features 的数量，抛出错误。
-        raise ValueError(      
+    if (
+        image_to_overwrite.sum() != image_features.shape[:-1].numel()
+    ):  # 如果需要填充的位置数量不等于 image_features 的数量，抛出错误。
+        raise ValueError(
             f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
             f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
         )
 
     # NOTE: 将 image_features 重新排列为 (batch_size * num_image_patches, embed_dim)，并填充到 final_embedding 的相应位置。
-    final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+    final_embedding[image_to_overwrite] = (
+        image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+    )
     final_attention_mask |= image_to_overwrite
-    position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+    position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_(
+        (final_attention_mask == 0), 1
+    )
 
     # 6. 处理填充位置的嵌入, 将填充位置的嵌入设为0：
     batch_indices, pad_indices = torch.where(input_ids == pad_token_id)
@@ -244,65 +292,90 @@ def merge_input_ids_with_image_features2(
 
     return final_embedding, final_attention_mask, position_ids
 
+
 def merge_input_ids_with_image_features(
-    input_ids: torch.Tensor, 
-    inputs_embeds: torch.Tensor, 
+    input_ids: torch.Tensor,
+    inputs_embeds: torch.Tensor,
     image_features: torch.Tensor,
     pad_token_id: int,
-    image_token_index: int
+    image_token_index: int,
 ):
     """
     将 input_ids 与 image_features 合并，生成最终的嵌入和位置 ID。
-    
+
     Args:
         input_ids (torch.Tensor): 输入的 token IDs, 形状为 (batch_size, sequence_length)
         inputs_embeds (torch.Tensor): 文本嵌入，形状为 (batch_size, sequence_length, embed_dim)
         image_features (torch.Tensor): 视觉编码后的图像特征，形状为 (num_images, num_image_patches, embed_dim)
         pad_token_id (int): 填充 token 的 ID
         image_token_index (int): 图像 token 的 ID
-    
+
     Returns:
         final_embedding (torch.Tensor): 合并后的嵌入张量，形状为 (batch_size, max_embed_dim, embed_dim)
         position_ids (torch.Tensor): 位置 ID, 形状为 (batch_size, max_embed_dim)
     """
     target_device = input_ids.device
     # 1, 基础 shape 信息提取
-    num_images, num_image_patches, embed_dim = image_features.shape # torch.Size([1, 576, 4096])
-    batch_size, sequence_length = input_ids.shape # torch.Size([1, 22])
+    num_images, num_image_patches, embed_dim = (
+        image_features.shape
+    )  # torch.Size([1, 576, 4096])
+    batch_size, sequence_length = input_ids.shape  # torch.Size([1, 22])
 
     # 2, 掩码与填充处理
     attention_mask = (input_ids != pad_token_id).long()
     left_padding = not torch.sum(input_ids[:, -1] == pad_token_id).bool().any()
-    batch_image_token_mask  = input_ids == image_token_index
-    
+    batch_image_token_mask = input_ids == image_token_index
+
     # 3, 计算新序列长度
-    num_special_image_tokens = torch.sum(batch_image_token_mask , dim=-1) # 统计每个样本（batch 中每条序列）里出现了多少个“图像占位符” token。
-    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
-    batch_indices, non_image_indices = torch.where(input_ids != image_token_index) 
+    num_special_image_tokens = torch.sum(
+        batch_image_token_mask, dim=-1
+    )  # 统计每个样本（batch 中每条序列）里出现了多少个“图像占位符” token。
+    max_embed_dim = (
+        num_special_image_tokens.max() * (num_image_patches - 1)
+    ) + sequence_length
+    batch_indices, non_image_indices = torch.where(input_ids != image_token_index)
 
     # 4, 位置映射计算
     # 得到每个原始 token 在新序列中占据的开始位置索引。
-    new_token_positions = torch.cumsum((batch_image_token_mask  * (num_image_patches - 1) + 1), -1) - 1 
-    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1] 
+    new_token_positions = (
+        torch.cumsum((batch_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+    )
+    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
     if left_padding:
         new_token_positions += nb_image_pad[:, None]  # offset for left padding
     text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
-    
+
     # 5，构建融合张量
     final_embedding = torch.zeros(
-        batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        batch_size,
+        max_embed_dim,
+        embed_dim,
+        dtype=inputs_embeds.dtype,
+        device=inputs_embeds.device,
     )
-    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] # 填充文本嵌入
+    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
+        batch_indices, non_image_indices
+    ]  # 填充文本嵌入
 
     # 确定图像特征插入位置，通过找到 final_embedding 中所有全 0 的位置
-    image_to_overwrite = torch.all(final_embedding == 0, dim=-1)  # 找出 final_embedding 中所有维度为0的位置
-    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+    image_to_overwrite = torch.all(
+        final_embedding == 0, dim=-1
+    )  # 找出 final_embedding 中所有维度为0的位置
+    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(
+        target_device
+    )
 
     # 将 image_features 重新排列为 (num_images * num_image_patches, embed_dim)，并填充到 final_embedding 的相应位置。
-    final_embedding[image_to_overwrite] = image_features.contiguous().view(-1, embed_dim).to(target_device)
-    
+    final_embedding[image_to_overwrite] = (
+        image_features.contiguous().view(-1, embed_dim).to(target_device)
+    )
+
     # 6，生成新的 position_ids
-    position_ids = torch.arange(max_embed_dim, dtype=torch.long, device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1)
+    position_ids = (
+        torch.arange(max_embed_dim, dtype=torch.long, device=inputs_embeds.device)
+        .unsqueeze(0)
+        .expand(batch_size, -1)
+    )
 
     # 7，处理填充位置的嵌入, 将填充位置的嵌入设为0
     batch_indices_pad, pad_indices = torch.where(input_ids == pad_token_id)
@@ -312,6 +385,7 @@ def merge_input_ids_with_image_features(
 
     return final_embedding, position_ids
 
+
 def unit_test_merge_input_ids_with_image_features():
     """
     单元测试函数，测试 merge_input_ids_with_image_features 的各种场景。
@@ -324,10 +398,9 @@ def unit_test_merge_input_ids_with_image_features():
     print("=== 示例1: 统一尺寸的 image_features ===")
     batch_size = 2
     # 计算总图像 token 数量
-    input_ids = torch.tensor([
-        [101, 102, 999, 103, 104],
-        [201, 999, 202, 999, 203]
-    ], dtype=torch.long)
+    input_ids = torch.tensor(
+        [[101, 102, 999, 103, 104], [201, 999, 202, 999, 203]], dtype=torch.long
+    )
     num_image_tokens = torch.sum(input_ids == image_token_index).item()  # 3
 
     num_images = num_image_tokens  # 3
@@ -347,11 +420,11 @@ def unit_test_merge_input_ids_with_image_features():
         inputs_embeds=inputs_embeds,
         input_ids=input_ids,
         pad_token_id=pad_token_id,
-        image_token_index=image_token_index
+        image_token_index=image_token_index,
     )
 
     print("Final Embedding Shape:", final_embedding.shape)  # Expected: (2, 13, 768)
-    print("Position IDs Shape:", position_ids.shape)      # Expected: (2, 13)
+    print("Position IDs Shape:", position_ids.shape)  # Expected: (2, 13)
     print()
 
     # 示例2：没有图像输入
@@ -360,28 +433,33 @@ def unit_test_merge_input_ids_with_image_features():
     image_features_empty = torch.tensor([]).reshape(0, 0, embed_dim)
 
     # input_ids 不包含任何图像 token
-    input_ids_no_image = torch.tensor([
-        [101, 102, 103, 104, 105],
-        [201, 202, 203, 204, 205]
-    ], dtype=torch.long)
+    input_ids_no_image = torch.tensor(
+        [[101, 102, 103, 104, 105], [201, 202, 203, 204, 205]], dtype=torch.long
+    )
 
-    num_image_tokens_no_image = torch.sum(input_ids_no_image == image_token_index).item()  # 0
+    num_image_tokens_no_image = torch.sum(
+        input_ids_no_image == image_token_index
+    ).item()  # 0
 
     num_images_no_image = num_image_tokens_no_image  # 0
     sequence_length_no_image = 5
 
-    inputs_embeds_no_image = torch.randn(batch_size, sequence_length_no_image, embed_dim)
+    inputs_embeds_no_image = torch.randn(
+        batch_size, sequence_length_no_image, embed_dim
+    )
 
     final_embedding_empty, position_ids_empty = merge_input_ids_with_image_features(
         image_features=image_features_empty,
         inputs_embeds=inputs_embeds_no_image,
         input_ids=input_ids_no_image,
         pad_token_id=pad_token_id,
-        image_token_index=image_token_index
+        image_token_index=image_token_index,
     )
 
-    print("Final Embedding Shape (Empty):", final_embedding_empty.shape)  # Expected: (2, 5, 768)
-    print("Position IDs Shape (Empty):", position_ids_empty.shape)        # Expected: (2, 5)
+    print(
+        "Final Embedding Shape (Empty):", final_embedding_empty.shape
+    )  # Expected: (2, 5, 768)
+    print("Position IDs Shape (Empty):", position_ids_empty.shape)  # Expected: (2, 5)
     print()
 
     # 示例3：错误的 image_features 类型
@@ -389,12 +467,14 @@ def unit_test_merge_input_ids_with_image_features():
     try:
         # image_features 不是 tensor
         image_features_invalid = "invalid_image_features"
-        final_embedding_invalid, position_ids_invalid = merge_input_ids_with_image_features(
-            image_features=image_features_invalid,  # 传入字符串，应该是 tensor
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids,
-            pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+        final_embedding_invalid, position_ids_invalid = (
+            merge_input_ids_with_image_features(
+                image_features=image_features_invalid,  # 传入字符串，应该是 tensor
+                inputs_embeds=inputs_embeds,
+                input_ids=input_ids,
+                pad_token_id=pad_token_id,
+                image_token_index=image_token_index,
+            )
         )
     except Exception as e:
         print(f"Caught Exception: {e}")
@@ -404,24 +484,29 @@ def unit_test_merge_input_ids_with_image_features():
     print("=== 示例4：image_features 与图像 token 数量不匹配 ===")
     try:
         # input_ids_mismatch 中有 7 个 image tokens
-        input_ids_mismatch = torch.tensor([
-            [101, 999, 999, 999, 104],
-            [999, 999, 999, 999, 203]
-        ], dtype=torch.long)
-        num_image_tokens_mismatch = torch.sum(input_ids_mismatch == image_token_index).item()  # 7
+        input_ids_mismatch = torch.tensor(
+            [[101, 999, 999, 999, 104], [999, 999, 999, 999, 203]], dtype=torch.long
+        )
+        num_image_tokens_mismatch = torch.sum(
+            input_ids_mismatch == image_token_index
+        ).item()  # 7
 
         num_images_mismatch = num_image_tokens_mismatch  # 7
         num_image_patches_mismatch = 4
         embed_dim_mismatch = 768
 
-        image_features_mismatch = torch.randn(num_images_mismatch, num_image_patches_mismatch, embed_dim_mismatch)  # 正确
-
-        final_embedding_mismatch, position_ids_mismatch = merge_input_ids_with_image_features(
-            image_features=image_features_mismatch,
-            inputs_embeds=torch.randn(2, 5, embed_dim_mismatch),
-            input_ids=input_ids_mismatch,
-            pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+        image_features_mismatch = torch.randn(
+            num_images_mismatch, num_image_patches_mismatch, embed_dim_mismatch
+        )  # 正确
+
+        final_embedding_mismatch, position_ids_mismatch = (
+            merge_input_ids_with_image_features(
+                image_features=image_features_mismatch,
+                inputs_embeds=torch.randn(2, 5, embed_dim_mismatch),
+                input_ids=input_ids_mismatch,
+                pad_token_id=pad_token_id,
+                image_token_index=image_token_index,
+            )
         )
     except ValueError as e:
         print(f"Caught ValueError: {e}")
@@ -430,32 +515,38 @@ def unit_test_merge_input_ids_with_image_features():
     # 示例5：单个样本，单个图像 token
     print("=== 示例5：单个样本，单个图像 token ===")
     batch_size_single = 1
-    input_ids_single = torch.tensor([
-        [101, 999, 102, 103]
-    ], dtype=torch.long)
-    num_image_tokens_single = torch.sum(input_ids_single == image_token_index).item()  # 1
+    input_ids_single = torch.tensor([[101, 999, 102, 103]], dtype=torch.long)
+    num_image_tokens_single = torch.sum(
+        input_ids_single == image_token_index
+    ).item()  # 1
 
     num_images_single = num_image_tokens_single  # 1
     num_image_patches_single = 3
     embed_dim_single = 768
     sequence_length_single = 4
 
-    inputs_embeds_single = torch.randn(batch_size_single, sequence_length_single, embed_dim_single)
+    inputs_embeds_single = torch.randn(
+        batch_size_single, sequence_length_single, embed_dim_single
+    )
 
-    image_features_single = torch.randn(num_images_single, num_image_patches_single, embed_dim_single)
+    image_features_single = torch.randn(
+        num_images_single, num_image_patches_single, embed_dim_single
+    )
 
     final_embedding_single, position_ids_single = merge_input_ids_with_image_features(
         image_features=image_features_single,
         inputs_embeds=inputs_embeds_single,
         input_ids=input_ids_single,
         pad_token_id=pad_token_id,
-        image_token_index=image_token_index
+        image_token_index=image_token_index,
     )
 
-    print("Final Embedding Shape (Single):", final_embedding_single.shape)  # Expected: (1, 6, 768)
-    print("Position IDs Shape (Single):", position_ids_single.shape)      # Expected: (1, 6)
+    print(
+        "Final Embedding Shape (Single):", final_embedding_single.shape
+    )  # Expected: (1, 6, 768)
+    print("Position IDs Shape (Single):", position_ids_single.shape)  # Expected: (1, 6)
     print()
 
+
 if __name__ == "__main__":
     unit_test_merge_input_ids_with_image_features()
-
diff --git a/utils/common.py b/lite_llama/utils/common.py
similarity index 75%
rename from utils/common.py
rename to lite_llama/utils/common.py
index 791b1fe..55dbbc9 100644
--- a/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -3,30 +3,35 @@
 import subprocess
 from typing import List, Optional
 
+
 def read_json(json_path):
     with open(json_path, "r") as json_file:
         data = json.load(json_file)
     return data
 
+
 def read_jsonl(jsonl_path):
-    with open(jsonl_path, 'r', encoding='utf-8') as f:
+    with open(jsonl_path, "r", encoding="utf-8") as f:
         data = [json.loads(line) for line in f]
     return data
 
+
 def detect_device():
     try:
-        subprocess.check_output(['nvidia-smi'], stderr=subprocess.DEVNULL)
+        subprocess.check_output(["nvidia-smi"], stderr=subprocess.DEVNULL)
         return "nvidia"
     except:
         try:
-            subprocess.check_output(['rocm-smi'], stderr=subprocess.DEVNULL)
+            subprocess.check_output(["rocm-smi"], stderr=subprocess.DEVNULL)
             return "amd"
         except:
             return "cpu"
 
+
 def getTime():
     return str(time.strftime("%m-%d %H:%M:%S", time.localtime()))
 
+
 def getProjectPath():
     script_path = os.path.split(os.path.realpath(__file__))[0]
     return os.path.abspath(os.path.join(script_path, ".."))
@@ -37,35 +42,48 @@ def get_gpu_memory(gpu_type="amd", device_id="0"):
         if gpu_type == "amd":
             result = subprocess.run(
                 ["rocm-smi", "--showmeminfo", "vram", device_id],
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
             )
             for line in result.stdout.splitlines():
                 if "VRAM Total Used Memory" in line:
                     used = line.split(":")[-1].strip().split()[0]
-                    return float(used) / (10 ** 9) # Convert MiB to GiB
+                    return float(used) / (10**9)  # Convert MiB to GiB
         elif gpu_type == "nvidia":
             result = subprocess.run(
-                ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader", "-i", device_id],
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+                [
+                    "nvidia-smi",
+                    "--query-gpu=memory.used",
+                    "--format=csv,nounits,noheader",
+                    "-i",
+                    device_id,
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
             )
             return float(result.stdout.strip()) / 1024  # Convert MiB to GiB
         elif gpu_type == "cpu":
             return None
     except Exception as e:
         from utils.logger import log
+
         log.warning(f"Unable to fetch GPU memory: {e}")
         return None
 
-def count_tokens(texts: List[str], tokenizer) -> int:
 
+def count_tokens(texts: List[str], tokenizer) -> int:
     total_tokens = 0
     for t in texts:
         ids = tokenizer(t, add_special_tokens=False)["input_ids"]
         total_tokens += len(ids)
     return total_tokens
 
+
 def get_model_type(checkpoint_path: str) -> str | None:
     from utils.logger import log
+
     model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
 
     config_content = read_json(os.path.join(checkpoint_path, "config.json"))
diff --git a/lite_llama/utils/config_convert.py b/lite_llama/utils/config_convert.py
index ed9d284..2841433 100644
--- a/lite_llama/utils/config_convert.py
+++ b/lite_llama/utils/config_convert.py
@@ -1,46 +1,44 @@
-import sys, os
 import transformers
-from transformers import LlavaNextConfig, LlavaConfig
+from transformers import LlavaConfig
 
 # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from ..models.model_config import LlamaConfig
 
 
 def convert_transformers_to_custom_config(
-    transformers_config: transformers.LlamaConfig
+    transformers_config: transformers.LlamaConfig,
 ) -> LlamaConfig:
     # 将 transformers 配置转换为字典
     config_dict = transformers_config.to_dict()
     print("transformers.LlamaConfig dict: ", config_dict)
-    
+
     return LlamaConfig(
-            _name_or_path=config_dict.get("_name_or_path"),
-            architectures=config_dict.get("architectures", ["LlamaForCausalLM"]),
-            max_position_embeddings=config_dict.get("max_position_embeddings", 4096),
-            model_type=config_dict.get("model_type", "llama"),
-            rms_norm_eps=config_dict.get("rms_norm_eps", 1e-5),
-            torch_dtype=config_dict.get("torch_dtype", "float16"),
-            vocab_size=config_dict.get("vocab_size", 32064),
-            
-            hidden_size = config_dict.get("hidden_size", 4096),
-            intermediate_size = config_dict.get("intermediate_size", 11008),
-            num_hidden_layers = config_dict.get("num_hidden_layers", 32),
-            num_attention_heads = config_dict.get("num_attention_heads", 32),
-            num_key_value_heads = config_dict.get("num_key_value_heads", None),
-        )
-
-    # 创建自定义配置实例
+        _name_or_path=config_dict.get("_name_or_path"),
+        architectures=config_dict.get("architectures", ["LlamaForCausalLM"]),
+        max_position_embeddings=config_dict.get("max_position_embeddings", 4096),
+        model_type=config_dict.get("model_type", "llama"),
+        rms_norm_eps=config_dict.get("rms_norm_eps", 1e-5),
+        torch_dtype=config_dict.get("torch_dtype", "float16"),
+        vocab_size=config_dict.get("vocab_size", 32064),
+        hidden_size=config_dict.get("hidden_size", 4096),
+        intermediate_size=config_dict.get("intermediate_size", 11008),
+        num_hidden_layers=config_dict.get("num_hidden_layers", 32),
+        num_attention_heads=config_dict.get("num_attention_heads", 32),
+        num_key_value_heads=config_dict.get("num_key_value_heads", None),
+    )
     custom_config = LlamaConfig(config_dict=config_dict)
-
     return custom_config
 
+
 if __name__ == "__main__":
     # 加载 transformers 的 LlamaConfig（请替换为实际模型名称）
-    model_path = '/gemini/code/liuhaotian/llava-v1.5-7b'
+    model_path = "/gemini/code/liuhaotian/llava-v1.5-7b"
     transformers_config = LlavaConfig.from_pretrained(model_path)
 
     # 转换为自定义配置
-    custom_llama_config = convert_transformers_to_custom_config(transformers_config.text_config)
+    custom_llama_config = convert_transformers_to_custom_config(
+        transformers_config.text_config
+    )
 
     # 打印自定义配置
     # print(json.dumps(custom_llama_config, indent=4, ensure_ascii=False))
@@ -52,4 +50,4 @@ def convert_transformers_to_custom_config(
 num_heads=32, num_layers=32, num_kv_heads=32, pretraining_tp=1, rms_norm_eps=1e-06, rope_scaling=None, rope_theta=10000.0, 
 tie_word_embeddings=False, torch_dtype=None, transformers_version='4.40.2', use_cache=True, vocab_size=32000, max_batch_size=4,
 max_seq_len=2048, device='cuda')
-"""
\ No newline at end of file
+"""
diff --git a/lite_llama/utils/constants.py b/lite_llama/utils/constants.py
index f863747..cc565ab 100644
--- a/lite_llama/utils/constants.py
+++ b/lite_llama/utils/constants.py
@@ -18,4 +18,4 @@
 LLAVA_DEFAULT_IM_TOKEN_PLACE_HOLDER = "<image>"
 LLAVA_DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
 LLAVA_DEFAULT_IM_START_TOKEN = "<im_start>"
-LLAVA_DEFAULT_IM_END_TOKEN = "<im_end>"
\ No newline at end of file
+LLAVA_DEFAULT_IM_END_TOKEN = "<im_end>"
diff --git a/lite_llama/utils/file_interface.py b/lite_llama/utils/file_interface.py
index 9a3c51a..5ce3e03 100644
--- a/lite_llama/utils/file_interface.py
+++ b/lite_llama/utils/file_interface.py
@@ -1,8 +1,10 @@
+import os
+
 
 def get_model_name_from_path(model_path):
     model_path = model_path.strip("/")
     model_paths = model_path.split("/")
-    if model_paths[-1].startswith('checkpoint-'):
+    if model_paths[-1].startswith("checkpoint-"):
         return model_paths[-2] + "_" + model_paths[-1]
     else:
-        return model_paths[-1]
\ No newline at end of file
+        return model_paths[-1]
diff --git a/lite_llama/utils/image_process.py b/lite_llama/utils/image_process.py
index 0541978..ee42d1f 100644
--- a/lite_llama/utils/image_process.py
+++ b/lite_llama/utils/image_process.py
@@ -20,9 +20,11 @@
 import os
 import base64
 
+
 def load_image_from_base64(image):
     return Image.open(BytesIO(base64.b64decode(image)))
 
+
 def load_image(image_file):
     if image_file.startswith("http://") or image_file.startswith("https://"):
         response = requests.get(image_file)
@@ -39,16 +41,21 @@ def load_images(image_files):
         out.append(image)
     return out
 
+
 def vis_images(image_files):
     if len(image_files) == 1:
         image = image_files[0]
-        os.system(f"termvisage --query-timeout 1 -H left --height 40 --oversize {image}") # --height 50：设置图片高度为 500 行。
+        os.system(
+            f"termvisage --query-timeout 1 -H left --height 40 --oversize {image}"
+        )  # --height 50：设置图片高度为 500 行。
 
-    else:        
+    else:
         # Concat images
         system_inst = "convert "
         inst_template1 = " \\( {image} -background none -resize x{height} \\) "
-        inst_template2 = " \\( {image} -background none -resize x{height} -splice 50x0 \\) "
+        inst_template2 = (
+            " \\( {image} -background none -resize x{height} -splice 50x0 \\) "
+        )
         count = 0
         for image in image_files:
             with Image.open(image) as img:
@@ -59,12 +66,13 @@ def vis_images(image_files):
             if count == 1:
                 system_inst += inst_template1.format(image=image, height=height)
             else:
-                system_inst += inst_template2.format(image=image,height=height)
+                system_inst += inst_template2.format(image=image, height=height)
         system_inst += " +append .vis.jpg"
         os.system(system_inst)
 
         os.system(f"termvisage --query-timeout 1 .vis.jpg -H left")
 
+
 def expand2square(pil_img, background_color):
     """
     Copy from Llava codebase for image preprocessing.
@@ -81,6 +89,7 @@ def expand2square(pil_img, background_color):
         result.paste(pil_img, ((height - width) // 2, 0))
         return result
 
+
 def process_images(images, image_processor, model_cfg):
     """
     Copy from Llava codebase for image preprocessing.
diff --git a/utils/logger.py b/lite_llama/utils/logger.py
similarity index 66%
rename from utils/logger.py
rename to lite_llama/utils/logger.py
index 5681eee..1d210db 100644
--- a/utils/logger.py
+++ b/lite_llama/utils/logger.py
@@ -2,14 +2,15 @@
 
 import os
 import sys
-sys.path.append("..")
+import time
+
 import logging
-from utils.common import getTime
+sys.path.append("..")
 from utils.common import getProjectPath
 
-# reload(sys)
-# sys.setdefaultencoding('utf-8')
+__all__ = ["log", "logE", "logP", "logU"]
 
+# Set up the logger
 BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
 
 # These are the sequences need to get colored ouput
@@ -18,19 +19,19 @@
 BOLD_SEQ = "\033[1m"
 
 COLORS = {
-    'WARNING': YELLOW,
-    'INFO': GREEN,
-    'DEBUG': BLUE,
-    'CRITICAL': YELLOW,
-    'ERROR': RED
+    "WARNING": YELLOW,
+    "INFO": GREEN,
+    "DEBUG": BLUE,
+    "CRITICAL": YELLOW,
+    "ERROR": RED,
 }
 
 LEVEL_SIM = {
-    'WARNING': '[W]',
-    'INFO': '[I]',
-    'DEBUG': '[D]',
-    'CRITICAL': '[C]',
-    'ERROR': '[E]'
+    "WARNING": "[W]",
+    "INFO": "[I]",
+    "DEBUG": "[D]",
+    "CRITICAL": "[C]",
+    "ERROR": "[E]",
 }
 
 
@@ -51,14 +52,18 @@ def format(self, record):
         levelname = record.levelname
         if self.use_color and levelname in COLORS:
             simple_ln = LEVEL_SIM.get(levelname)
-            levelname_color = COLOR_SEQ % (30 + COLORS[levelname]) + simple_ln + RESET_SEQ
+            levelname_color = (
+                COLOR_SEQ % (30 + COLORS[levelname]) + simple_ln + RESET_SEQ
+            )
             record.levelname = levelname_color
         return logging.Formatter.format(self, record)
 
 
 # Custom logger class with multiple destinations
 class ColoredLogger(logging.Logger):
-    FORMAT = "%(asctime)s $RESET%(levelname)s %(filename)s$RESET:%(lineno)d %(message)s "
+    FORMAT = (
+        "%(asctime)s $RESET%(levelname)s %(filename)s$RESET:%(lineno)d %(message)s "
+    )
     COLOR_FORMAT = formatter_message(FORMAT, True)
 
     def __init__(self, name):
@@ -72,10 +77,6 @@ def __init__(self, name):
         self.addHandler(console)
         return
 
-
-project_path = getProjectPath()
-
-
 def loggerHandle():
     logging.setLoggerClass(ColoredLogger)
     logger = logging.getLogger(__name__)
@@ -84,15 +85,19 @@ def loggerHandle():
 
 
 def logfileHandle(log_name="logs/common.log"):
+    project_path = getProjectPath()
     log_file = os.path.join(project_path, log_name)
-    if not os.path.exists(os.path.join(project_path, 'logs')):
-        os.makedirs(os.path.join(project_path, 'logs'))
+    if not os.path.exists(os.path.join(project_path, "logs")):
+        os.makedirs(os.path.join(project_path, "logs"))
     if not os.path.exists(log_file):
         os.mknod(log_file)
     logfile = logging.getLogger()
     logfile.setLevel(logging.DEBUG)
-    handler = logging.FileHandler(log_file, encoding='UTF-8')
-    formatter = logging.Formatter('%(asctime)s %(levelname)s %(filename)s:%(lineno)d  %(message)s', datefmt="%m-%d %H:%M:%S")
+    handler = logging.FileHandler(log_file, encoding="UTF-8")
+    formatter = logging.Formatter(
+        "%(asctime)s %(levelname)s %(filename)s:%(lineno)d  %(message)s",
+        datefmt="%m-%d %H:%M:%S",
+    )
     handler.setFormatter(formatter)
     logfile.addHandler(handler)
     return logfile
@@ -103,8 +108,8 @@ def logfileHandle(log_name="logs/common.log"):
 logP = logfileHandle("logs/post.log")
 logU = logfileHandle("logs/upload_data.log")
 
-import time
-if __name__ == '__main__':
+if __name__ == "__main__":
+    
     logging.setLoggerClass(ColoredLogger)
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.DEBUG)
@@ -115,11 +120,3 @@ def logfileHandle(log_name="logs/common.log"):
     logger.error("test")
     time.sleep(10)
     logger.info("aaaaa")
-#
-#     logfile = logging.getLogger()
-#     logfile.setLevel(logging.DEBUG)
-#     handler = logging.FileHandler("Alibaba.log", encoding='UTF-8')
-#     formatter = logging.Formatter('%(asctime)s %(filename)s:%(lineno)d %(levelname)s %(message)s')
-#     handler.setFormatter(formatter)
-#     logfile.addHandler(handler)
-#     logfile.info("aaaaaaaaaaaaaaa")
diff --git a/lite_llama/utils/prompt_templates.py b/lite_llama/utils/prompt_templates.py
index 01e5b79..4669bf6 100644
--- a/lite_llama/utils/prompt_templates.py
+++ b/lite_llama/utils/prompt_templates.py
@@ -12,12 +12,14 @@
 
 LLAVA_DEFAULT_IMAGE_TOKEN = "<image>"
 
+
 def get_image_token(model=None, model_name=None):
     return LLAVA_DEFAULT_IMAGE_TOKEN + "\\n "
 
 
 class BasePrompter:
     """用于构建模型的提示词 (Prompt) 模板和管理对话流程"""
+
     def __init__(
         self,
         system_inst,
@@ -35,7 +37,7 @@ def __init__(
         self.qa_spliter = qa_spliter  # How to split Q&A rounds
         self.decorator = decorator
         self.colon = colon
-        
+
         if self.decorator == None:
             self.starter = ""
             self.stopper = ""
@@ -241,22 +243,26 @@ def __init__(self):
 class Qwen2Prompter(BasePrompter):
     def __init__(self):
         # 在 Qwen2 的提示格式下，system_inst 将包含系统信息（如角色设定）
-        system_inst = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
-        
+        system_inst = (
+            "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+        )
+
         # role1 用作 user 信息块的起始标记，这里不需要额外标记，只需在模板中插入即可
         # role2 用作 assistant 起始标记
         # 我们在构造时，会通过 template 来定义最终的格式。
-        
-        role1 = "<|im_start|>user\n"    # 用户块开始 
+
+        role1 = "<|im_start|>user\n"  # 用户块开始
         role2 = "<|im_start|>assistant\n"  # 助手块开始
         sen_spliter = "\n"
         qa_spliter = "\n"
         colon = ""  # 这里不再需要冒号
-        
+
         # 调用父类构造函数
-        super().__init__(system_inst, role1, role2, sen_spliter, qa_spliter, colon=colon)
+        super().__init__(
+            system_inst, role1, role2, sen_spliter, qa_spliter, colon=colon
+        )
 
-        # 重写模板: 
+        # 重写模板:
         # 若存在 system_inst，则模板为：
         # <|im_start|>system
         # {system_inst}
@@ -267,14 +273,9 @@ def __init__(self):
         # <|im_start|>assistant
         #
         # 若不存在 system_inst，则跳过 system 块，但这里我们默认有 system_inst。
-        
+
         if self.system_inst is None:
-            self.template = (
-                self.role1 
-                + "{prompt}\n"
-                + "<|im_end|>\n"
-                + self.role2
-            )
+            self.template = self.role1 + "{prompt}\n" + "<|im_end|>\n" + self.role2
         else:
             self.template = (
                 "<|im_start|>system\n"
@@ -291,12 +292,7 @@ def update_template(self, outputs, chunk_prefilling=0):
         # 若有特殊需求，可在此根据逻辑微调。
         # 这里保持简单，不做改动：
         if chunk_prefilling:
-            self.template = (
-                self.role1
-                + "{prompt}\n"
-                + "<|im_end|>\n"
-                + self.role2
-            )
+            self.template = self.role1 + "{prompt}\n" + "<|im_end|>\n" + self.role2
         else:
             # 若需要将对话上下文追加到模板中，可在此实现
             # 简单起见，不做复杂处理
@@ -310,6 +306,7 @@ def update_template(self, outputs, chunk_prefilling=0):
                 + self.role2
             )
 
+
 class FalconSimplePrompter(BasePrompter):
     def __init__(self):
         system_inst = None
@@ -371,7 +368,9 @@ def get_prompter(model_type, model_path="", short_prompt=False, empty_prompt=Fal
         if "vicuna" in model_path.lower():
             return VicunaPrompter()
         elif (
-            "llama-3" in model_path.lower() or "llama3" in model_path.lower() or "llama-3.2" in model_path.lower()
+            "llama-3" in model_path.lower()
+            or "llama3" in model_path.lower()
+            or "llama-3.2" in model_path.lower()
         ) and "30b" not in model_path.lower():
             if "vila" in model_path.lower():
                 # with system prompt by default
@@ -414,6 +413,7 @@ def get_stop_token_ids(model_type, model_path=""):
     else:
         raise ValueError(f"model type {model_type} is not supported")
 
+
 if __name__ == "__main__":
     # 使用方法示例
     prompter = get_prompter("qwen2")
diff --git a/requirement.txt b/requirement.txt
index fc036a1..56c5283 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -1,6 +1,7 @@
 tokenizers==0.20.3
-transformers==4.46.3
 huggingface-hub==0.24.6
+transformers==4.41
+torch=2.1.2
 triton>=2.1.0
 tqdm==4.65.0
 pytest==8.3.3
diff --git a/tests/fused_mlp_silu.py b/tests/fused_mlp_silu.py
index 9ed60b6..c0c3cf3 100644
--- a/tests/fused_mlp_silu.py
+++ b/tests/fused_mlp_silu.py
@@ -3,23 +3,35 @@
 import triton
 import triton.language as tl
 
+
 @triton.jit
 def matmul_silu_kernel(
-        # Pointers to matrices
-        a_ptr, w1_ptr, w2_ptr, c_ptr,
-        # Matrix dimensions
-        M, N, K,
-        # The stride variables represent how much to increase the ptr by when moving by 1
-        stride_am, stride_ak,    # input
-        stride_w1k, stride_w1n,  # weight 1
-        stride_w2k, stride_w2n,  # weight 2
-        stride_cm, stride_cn,    # output
-        # Meta-parameters
-        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,  #
-        GROUP_SIZE_M: tl.constexpr,  #
+    # Pointers to matrices
+    a_ptr,
+    w1_ptr,
+    w2_ptr,
+    c_ptr,
+    # Matrix dimensions
+    M,
+    N,
+    K,
+    # The stride variables represent how much to increase the ptr by when moving by 1
+    stride_am,
+    stride_ak,  # input
+    stride_w1k,
+    stride_w1n,  # weight 1
+    stride_w2k,
+    stride_w2n,  # weight 2
+    stride_cm,
+    stride_cn,  # output
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,  #
+    GROUP_SIZE_M: tl.constexpr,  #
 ):
     """
-       Fused kernel for computing F.silu(w1(x)) * w2(x)
+    Fused kernel for computing F.silu(w1(x)) * w2(x)
     """
     # -----------------------------------------------------------
     # Map program ids `pid` to pid_m and pid_n
@@ -62,9 +74,9 @@ def matmul_silu_kernel(
     c = (acc1 * tl.sigmoid(acc1)) * acc2
 
     # option 2: silu in fp32
-    #acc1 = (acc1 * tl.sigmoid(acc1)).to(tl.float16)
-    #acc2 = acc2.to(tl.float16)
-    #c = acc1 * acc2
+    # acc1 = (acc1 * tl.sigmoid(acc1)).to(tl.float16)
+    # acc2 = acc2.to(tl.float16)
+    # c = acc1 * acc2
 
     # -----------------------------------------------------------
     # Write back the block of the output matrix
@@ -78,17 +90,26 @@ def matmul_silu_kernel(
 @triton.jit
 def matmul_kernel(
     # Pointers to matrices
-    a_ptr, b_ptr, c_ptr,
+    a_ptr,
+    b_ptr,
+    c_ptr,
     # Matrix dimensions
-    M, N, K,
+    M,
+    N,
+    K,
     # The stride variables represent how much to increase the ptr by when moving by 1
     # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr`
     # by to get the element one row down (A has M rows).
-    stride_am, stride_ak,
-    stride_bk, stride_bn,
-    stride_cm, stride_cn,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
     # Meta-parameters
-    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
     GROUP_SIZE_M: tl.constexpr,
 ):
     """Kernel for computing the matmul C = A x B.
@@ -168,19 +189,31 @@ def mlp_silu(x, w1, w2, w3):
     # 这里的 grid 针对 (M,K) 输出维度进行网格划分
     BLOCK_SIZE_M = 64
     BLOCK_SIZE_N = 64  # 用于中间N和最终K的分块大小
-    BLOCK_SIZE_K = 128 # 用于中间K维的分块大小
+    BLOCK_SIZE_K = 128  # 用于中间K维的分块大小
     # 1D launch kernel where each block gets its own program.
-    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N), )
+    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),)
     matmul_silu_kernel[grid](
-        x, w1, w2, out,
-        M, N, K,
-        x.stride(0), x.stride(1),
-        w1.stride(0), w1.stride(1),
-        w2.stride(0), w2.stride(1),
-        out.stride(0), out.stride(1),
-        BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K,
+        x,
+        w1,
+        w2,
+        out,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w1.stride(0),
+        w1.stride(1),
+        w2.stride(0),
+        w2.stride(1),
+        out.stride(0),
+        out.stride(1),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
         GROUP_SIZE_M=8,
-        num_stages=2, num_warps=4
+        num_stages=2,
+        num_warps=4,
     )
 
     M, K = out.shape
@@ -189,21 +222,32 @@ def mlp_silu(x, w1, w2, w3):
     # Allocates output.
     mlp_silu_out = torch.empty((M, N), device=x.device, dtype=x.dtype)
     # 1D launch kernel where each block gets its own program.
-    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N), )
+    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),)
     matmul_kernel[grid](
-        out, w3, mlp_silu_out,
-        M, N, K,
-        out.stride(0), out.stride(1),
-        w3.stride(0), w3.stride(1),
-        mlp_silu_out.stride(0), mlp_silu_out.stride(1),
-        BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K,
+        out,
+        w3,
+        mlp_silu_out,
+        M,
+        N,
+        K,
+        out.stride(0),
+        out.stride(1),
+        w3.stride(0),
+        w3.stride(1),
+        mlp_silu_out.stride(0),
+        mlp_silu_out.stride(1),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
         GROUP_SIZE_M=8,
-        num_stages=2, num_warps=4
+        num_stages=2,
+        num_warps=4,
     )
 
     mlp_silu_out = mlp_silu_out.view(batch, seq_len, -1)
     return mlp_silu_out
 
+
 def triton_torch_mlp_silu(x, w1, w2, w3):
     # Check constraints.
     assert x.shape[-1] == w1.shape[0], "Incompatible dimensions"
@@ -224,30 +268,45 @@ def triton_torch_mlp_silu(x, w1, w2, w3):
     # 这里的 grid 针对 (M,K) 输出维度进行网格划分
     BLOCK_SIZE_M = 64
     BLOCK_SIZE_N = 64  # 用于中间N和最终K的分块大小
-    BLOCK_SIZE_K = 128 # 用于中间K维的分块大小
+    BLOCK_SIZE_K = 128  # 用于中间K维的分块大小
     # 1D launch kernel where each block gets its own program.
-    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N), )
+    grid = (triton.cdiv(M, BLOCK_SIZE_M) * triton.cdiv(N, BLOCK_SIZE_N),)
     matmul_silu_kernel[grid](
-        x, w1, w2, out,
-        M, N, K,
-        x.stride(0), x.stride(1),
-        w1.stride(0), w1.stride(1),
-        w2.stride(0), w2.stride(1),
-        out.stride(0), out.stride(1),
-        BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K,
+        x,
+        w1,
+        w2,
+        out,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w1.stride(0),
+        w1.stride(1),
+        w2.stride(0),
+        w2.stride(1),
+        out.stride(0),
+        out.stride(1),
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
         GROUP_SIZE_M=8,
-        num_stages=2, num_warps=4
+        num_stages=2,
+        num_warps=4,
     )
 
     mlp_silu_out = torch.mm(out, w3)  # MxK
     mlp_silu_out = mlp_silu_out.view(batch, seq_len, -1)
     return mlp_silu_out
 
+
 import torch.nn as nn
 import sys, os
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.kernels.swiglu import swiglu_forward
 
+
 def torch_mlp_silu(x, w1, w2, w3):
     batch, seq_len, dim = x.shape
     M, K = batch * seq_len, dim
@@ -259,30 +318,53 @@ def torch_mlp_silu(x, w1, w2, w3):
     mlp_silu_out = mlp_silu_out.view(batch, seq_len, -1)
     return mlp_silu_out
 
+
 class FusedMLP(nn.Module):
     def __init__(self, hidden_size, intermediate_size):
         super().__init__()
-        
+
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
 
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16)
+        self.gate_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.up_proj = nn.Linear(
+            self.hidden_size, self.intermediate_size, bias=False, dtype=torch.float16
+        )
+        self.down_proj = nn.Linear(
+            self.intermediate_size, self.hidden_size, bias=False, dtype=torch.float16
+        )
 
     def forward(self, x):
         return self.down_proj(swiglu_forward(self.gate_proj(x), self.up_proj(x)))
-    
+
+
 if __name__ == "__main__":
     torch.manual_seed(0)
     B = 4
     seq_len = 256
     hidden_size = 3584
     intermediate_size = 18944
-    x = torch.randn(B, seq_len, hidden_size, device='cuda', dtype=torch.float16)
-    w1 = torch.randn((intermediate_size, hidden_size), device='cuda', dtype=torch.float16) * 0.01
-    w2 = torch.randn((intermediate_size, hidden_size), device='cuda', dtype=torch.float16) * 0.01
-    w3 = torch.randn((hidden_size, intermediate_size), device='cuda', dtype=torch.float16) * 0.01
+    x = torch.randn(B, seq_len, hidden_size, device="cuda", dtype=torch.float16)
+    w1 = (
+        torch.randn(
+            (intermediate_size, hidden_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
+    w2 = (
+        torch.randn(
+            (intermediate_size, hidden_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
+    w3 = (
+        torch.randn(
+            (hidden_size, intermediate_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
 
     w1_t = w1.t().contiguous()
     w2_t = w2.t().contiguous()
@@ -296,11 +378,22 @@ def forward(self, x):
 
     # assert torch.allclose(torch_output, triton_output, atol=1e-2)
     # assert(torch.amax(torch_output - triton_output).item() <= 0.05)
-    print(f"Max diff: {torch.max(torch.abs(torch_output - triton_output))}") # assert(torch.amax(Y - Y2).item() <= 0.05)
-    print(f"Max diff: {torch.max(torch.abs(torch_output - triton_torch_output))}") # assert(torch.amax(Y - Y2).item() <= 0.05)
-    print(f"Max diff: {torch.max(torch.abs(torch_output - torch_fused_mlp_out))}") # assert(torch.amax(Y - Y2).item() <= 0.05)
-
-    print('torch:', triton.testing.do_bench(lambda: torch_mlp_silu(x, w1_t, w2_t, w3_t)))
-    print('triton:', triton.testing.do_bench(lambda: mlp_silu(x, w1_t, w2_t, w3_t)))
-    print('triton_torch:', triton.testing.do_bench(lambda: triton_torch_mlp_silu(x, w1_t, w2_t, w3_t)))
-    print('torch_fused_mlp:', triton.testing.do_bench(lambda: torch_fused_mlp(x)))
+    print(
+        f"Max diff: {torch.max(torch.abs(torch_output - triton_output))}"
+    )  # assert(torch.amax(Y - Y2).item() <= 0.05)
+    print(
+        f"Max diff: {torch.max(torch.abs(torch_output - triton_torch_output))}"
+    )  # assert(torch.amax(Y - Y2).item() <= 0.05)
+    print(
+        f"Max diff: {torch.max(torch.abs(torch_output - torch_fused_mlp_out))}"
+    )  # assert(torch.amax(Y - Y2).item() <= 0.05)
+
+    print(
+        "torch:", triton.testing.do_bench(lambda: torch_mlp_silu(x, w1_t, w2_t, w3_t))
+    )
+    print("triton:", triton.testing.do_bench(lambda: mlp_silu(x, w1_t, w2_t, w3_t)))
+    print(
+        "triton_torch:",
+        triton.testing.do_bench(lambda: triton_torch_mlp_silu(x, w1_t, w2_t, w3_t)),
+    )
+    print("torch_fused_mlp:", triton.testing.do_bench(lambda: torch_fused_mlp(x)))
diff --git a/tests/kernels_benchmark.py b/tests/kernels_benchmark.py
index 031724a..799ab58 100644
--- a/tests/kernels_benchmark.py
+++ b/tests/kernels_benchmark.py
@@ -16,15 +16,18 @@
     # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it
     # should not be added to extras_require in setup.py.
     import apex
+
     HAS_APEX = True
 except ModuleNotFoundError:
     HAS_APEX = False
-    
+
+
 def is_cuda():
     return torch.cuda.is_available()
 
+
 result_path = "/gemini/code/lite_llama/images/benchamrk_result"
-ref_lib = 'cuBLAS' if is_cuda() else 'rocBLAS'
+ref_lib = "cuBLAS" if is_cuda() else "rocBLAS"
 TORCH_HAS_FP8 = hasattr(torch, "float8_e5m2")
 ################################benchamrk matmul################################
 configs = []
@@ -34,18 +37,25 @@ def is_cuda():
     configs.append(
         triton.testing.Benchmark(
             x_names=["M", "N", "K"],  # Argument names to use as an x-axis for the plot
-            x_vals=[128 * i for i in range(2, 33)],  # Different possible values for `x_name`
+            x_vals=[
+                128 * i for i in range(2, 33)
+            ],  # Different possible values for `x_name`
             line_arg="provider",  # Argument name whose value corresponds to a different line in the plot
             # Possible values for `line_arg`
             # Don't compare to cublas for fp8 cases as torch.matmul doesn't support fp8 at the moment.
-            line_vals=["triton"] if fp8_inputs else [ref_lib.lower(), "triton"],  # Label name for the lines
+            line_vals=["triton"]
+            if fp8_inputs
+            else [ref_lib.lower(), "triton"],  # Label name for the lines
             line_names=["Triton"] if fp8_inputs else [ref_lib, "Triton"],  # Line styles
             styles=[("green", "-"), ("blue", "-")],
             ylabel="TFLOPS",  # Label name for the y-axis
-            plot_name="matmul-performance-" +
-            ("fp16" if not fp8_inputs else "fp8"),  # Name for the plot, used also as a file name for saving the plot.
+            plot_name="matmul-performance-"
+            + (
+                "fp16" if not fp8_inputs else "fp8"
+            ),  # Name for the plot, used also as a file name for saving the plot.
             args={"fp8_inputs": fp8_inputs},
-        ))
+        )
+    )
 
 
 # @triton.testing.perf_report(configs)
@@ -56,7 +66,7 @@ def is_cuda():
 #         a = a.to(torch.float8_e5m2)
 #         b = b.T.contiguous()   # 确保 b 在转置后是连续的
 #         b = b.to(torch.float8_e5m2)
-    
+
 #     # print("Weight is contiguous:", b.is_contiguous())  # 添加这一行
 #     quantiles = [0.5, 0.2, 0.8]
 #     if provider == ref_lib.lower():
@@ -77,13 +87,13 @@ def is_cuda():
 #         """
 #         super(RMSNorm, self).__init__()
 #         self.weight = nn.Parameter(torch.ones(dim))  # 可学习的缩放参数
-    
+
 #     def forward(self, x):
-#         # x 的形状为 [batch_size, seq_len, dim]        
+#         # x 的形状为 [batch_size, seq_len, dim]
 #         var = torch.mean(x ** 2, dim=-1, keepdim=True)
 #         rms = torch.sqrt( var)
 #         return x / rms * self.weight # 归一化，并应用缩放参数
-    
+
 # @triton.testing.perf_report(
 #     triton.testing.Benchmark(
 #         x_names=['N'],
@@ -124,7 +134,7 @@ def is_cuda():
 #     if mode == 'forward':
 #         gbps = lambda ms: 2 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
 #         ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, quantiles=quantiles, rep=500)
-        
+
 #     return gbps(ms), gbps(max_ms), gbps(min_ms)
 
 # bench_rmsnorm.run(print_data=True, save_path=result_path)
@@ -186,7 +196,7 @@ def is_cuda():
 #             "Torch_softmax",
 #             "Triton_softmax",
 #             'Triton_online_v2_softmax',
-            
+
 #         ],  # label name for the lines
 #         styles=[('blue', '-'), ('green', '-'), ('yellow', '-')],  # line styles
 #         ylabel="GB/s",  # label name for the y-axis
@@ -208,7 +218,7 @@ def is_cuda():
 #     quantiles = [0.5, 0.2, 0.8]
 #     stream = torch.cuda.Stream()
 #     torch.cuda.set_stream(stream)
-    
+
 #     if provider == 'torch_softmax':
 #         ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1), quantiles=quantiles)
 #     elif provider == 'triton_softmax':
@@ -223,28 +233,42 @@ def is_cuda():
 
 # bench_softmax.run(print_data=True, save_path=result_path)
 
+
 ################################## mlp_silu softmax ####################################
 # 对 mlp_silu 操作的不同实现（Triton、PyTorch、PyTorch JIT）进行性能基准测试（Benchmark）
-@triton.testing.perf_report( # 一个装饰器，用于测试和记录函数性能
-    triton.testing.Benchmark( # 定义了性能测试的不同维度，包括 x 轴参数、线条配置等
-        x_names=['N'],  # argument names to use as an x-axis for the plot
-        x_vals=[32 * i for i in range(1, 60, 8)],  # different possible values for `x_name`
-        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
-        line_vals=['torch_mlp_silu', 'torch_fused_mlp', 'triton_mlp_silu', 'triton_torch_mlp_silu'],  # possible values for `line_arg``
+@triton.testing.perf_report(  # 一个装饰器，用于测试和记录函数性能
+    triton.testing.Benchmark(  # 定义了性能测试的不同维度，包括 x 轴参数、线条配置等
+        x_names=["N"],  # argument names to use as an x-axis for the plot
+        x_vals=[
+            32 * i for i in range(1, 60, 8)
+        ],  # different possible values for `x_name`
+        line_arg="provider",  # argument name whose value corresponds to a different line in the plot
+        line_vals=[
+            "torch_mlp_silu",
+            "torch_fused_mlp",
+            "triton_mlp_silu",
+            "triton_torch_mlp_silu",
+        ],  # possible values for `line_arg``
         line_names=[
             "Torch_mlp_silu",
             "Torch_fused_mlp",
             "Triton_mlp_silu",
             "Triton_torch_mlp_silu",
-            
         ],  # label name for the lines
-        styles=[('blue', '-'),('yellow', '-'), ('green', '-'), ('red', '-')],  # line styles
+        styles=[
+            ("blue", "-"),
+            ("yellow", "-"),
+            ("green", "-"),
+            ("red", "-"),
+        ],  # line styles
         ylabel="GB/s",  # label name for the y-axis
         plot_name="mlp-silu-performance",  # name for the plot. Used also as a file name for saving the plot.
-        args={'M': 3584},  # 设置除 x_names 和 line_arg 外的固定参数值，这里 M 表示批量大小。
-    ))
-
-def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
+        args={
+            "M": 3584
+        },  # 设置除 x_names 和 line_arg 外的固定参数值，这里 M 表示批量大小。
+    )
+)
+def bench_mlp_silu(M, N, provider, mode="forward", eps=1e-5, device="cuda"):
     """定义性能测试函数 bench_softmax。
     参数：
         M: 批量大小（固定为 4096)。
@@ -257,10 +281,25 @@ def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
     B = 4
     hidden_size = 3584
     intermediate_size = 18944
-    x = torch.randn(B, N, hidden_size, device='cuda', dtype=torch.float16)
-    w1 = torch.randn((intermediate_size, hidden_size), device='cuda', dtype=torch.float16) * 0.01
-    w2 = torch.randn((intermediate_size, hidden_size), device='cuda', dtype=torch.float16) * 0.01
-    w3 = torch.randn((hidden_size, intermediate_size), device='cuda', dtype=torch.float16) * 0.01
+    x = torch.randn(B, N, hidden_size, device="cuda", dtype=torch.float16)
+    w1 = (
+        torch.randn(
+            (intermediate_size, hidden_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
+    w2 = (
+        torch.randn(
+            (intermediate_size, hidden_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
+    w3 = (
+        torch.randn(
+            (hidden_size, intermediate_size), device="cuda", dtype=torch.float16
+        )
+        * 0.01
+    )
 
     w1_t = w1.t().contiguous()
     w2_t = w2.t().contiguous()
@@ -270,21 +309,30 @@ def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
     quantiles = [0.5, 0.2, 0.8]
     stream = torch.cuda.Stream()
     torch.cuda.set_stream(stream)
-    
-    if provider == 'torch_mlp_silu':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch_mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles)
-    elif provider == 'torch_fused_mlp':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch_fused_mlp(x), quantiles=quantiles)
-    elif provider == 'triton_mlp_silu':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles)
-    elif provider == 'triton_torch_mlp_silu':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: triton_torch_mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles)
+
+    if provider == "torch_mlp_silu":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: torch_mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles
+        )
+    elif provider == "torch_fused_mlp":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: torch_fused_mlp(x), quantiles=quantiles
+        )
+    elif provider == "triton_mlp_silu":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles
+        )
+    elif provider == "triton_torch_mlp_silu":
+        ms, min_ms, max_ms = triton.testing.do_bench(
+            lambda: triton_torch_mlp_silu(x, w1_t, w2_t, w3_t), quantiles=quantiles
+        )
     else:
         raise ValueError(f"Unknown provider: {provider}")
     # * 3e-9 是将 bytes 转换为 gb 单位，* 1e-3 是将 s 转换成 ms 单位
     gbps = lambda ms: 2 * x.numel() * x.element_size() * 1e-9 / (ms * 1e-3)
     return gbps(ms), gbps(max_ms), gbps(min_ms)
 
+
 bench_mlp_silu.run(print_data=True, save_path=result_path)
 
 """
@@ -300,6 +348,7 @@ def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
 try:
     from ..lite_llama.kernels.flashattention import flash_attention_v1
     from ..lite_llama.kernels.flashattentionv2 import flash_attention_v2
+
     HAS_FLASH = True
 except BaseException:
     HAS_FLASH = False
@@ -318,7 +367,8 @@ def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
                 x_vals=[2**i for i in range(4, 12)],
                 line_arg="provider",
                 line_vals=["triton-official"] + (["flash_me"] if FLASH_NEW else []),
-                line_names=["triton-official-fp16"] + (["flash-me-fp16"] if FLASH_NEW else []),
+                line_names=["triton-official-fp16"]
+                + (["flash-me-fp16"] if FLASH_NEW else []),
                 styles=[("red", "-"), ("blue", "-"), ("green", "-")],
                 ylabel="TFLOPS",
                 plot_name=f"fused-attention-batch{BATCH}-head{N_HEADS}-d{HEAD_DIM}-{mode}-causal={causal}",
@@ -329,18 +379,21 @@ def bench_mlp_silu(M, N, provider, mode='forward', eps=1e-5, device='cuda'):
                     "mode": mode,
                     "causal": causal,
                 },
-            ))
+            )
+        )
 
 
 @triton.testing.perf_report(configs)
-def bench_flash_attention(BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, device="cuda"):
+def bench_flash_attention(
+    BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, device="cuda"
+):
     assert mode in ["fwd"]
     dtype = torch.float16
     if "flashattentionv2" in provider:
         q = torch.randn((BATCH, H, N_CTX, HEAD_DIM), dtype=dtype, device=device)
         k = torch.randn((BATCH, H, N_CTX, HEAD_DIM), dtype=dtype, device=device)
         v = torch.randn((BATCH, H, N_CTX, HEAD_DIM), dtype=dtype, device=device)
-    
+
         sm_scale = 1.3
         fn = lambda: flash_attention_v2(q, k, v, causal, sm_scale)
         if mode == "bwd":
@@ -356,7 +409,7 @@ def bench_flash_attention(BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, dev
         batch, heads, m_size, head_dim = q.shape
         sm_scale = 1 / math.sqrt(head_dim)
         output = torch.empty_like(q)
-        
+
         fn = lambda: flash_attention_v1(q, k, v, sm_scale)
         ms = triton.testing.do_bench(fn)
 
@@ -368,6 +421,7 @@ def bench_flash_attention(BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, dev
         total_flops *= 2.5  # 2.0(bwd) + 0.5(recompute)
     return total_flops * 1e-12 / (ms * 1e-3)
 
+
 if __name__ == "__main__":
     # only works on post-Ampere GPUs right now
     bench_flash_attention.run(save_path=result_path, print_data=True)
diff --git a/tests/kernels_test.py b/tests/kernels_test.py
index 0f830b2..2865090 100644
--- a/tests/kernels_test.py
+++ b/tests/kernels_test.py
@@ -1,7 +1,8 @@
-import torch,math
+import torch, math
 import torch.nn as nn
 from transformers.activations import ACT2FN
 import pytest, sys, os
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 
 from lite_llama.lite_llama.kernels.others.fused_linear import fused_linear
@@ -11,10 +12,15 @@
 from lite_llama.kernels.flashattention import flash_attention_v1
 from lite_llama.lite_llama.kernels.others.rope_orig import rope
 from typing import Callable, Dict, Tuple, Union
-from lite_llama.tests.test_torch_rope import RotaryPositionEmbedding, apply_rotary_pos_emb
+from lite_llama.tests.test_torch_rope import (
+    RotaryPositionEmbedding,
+    apply_rotary_pos_emb,
+)
+
 
 class RMSNorm(nn.Module):
     """nlp 领域"""
+
     def __init__(self, dim):
         """
         :param dim: 输入的维度
@@ -22,12 +28,13 @@ def __init__(self, dim):
         """
         super(RMSNorm, self).__init__()
         self.weight = nn.Parameter(torch.ones(dim))  # 可学习的缩放参数
-    
+
     def forward(self, x):
-        # x 的形状为 [batch_size, seq_len, dim]        
-        var = torch.mean(x ** 2, dim=-1, keepdim=True)
-        rms = torch.sqrt( var)
-        return x / rms * self.weight # 归一化，并应用缩放参数
+        # x 的形状为 [batch_size, seq_len, dim]
+        var = torch.mean(x**2, dim=-1, keepdim=True)
+        rms = torch.sqrt(var)
+        return x / rms * self.weight  # 归一化，并应用缩放参数
+
 
 def _get_attn_inputs(B, N, L, H, device):
     torch.manual_seed(1337)
@@ -36,6 +43,7 @@ def _get_attn_inputs(B, N, L, H, device):
     v = torch.rand_like(q)
     return q, k, v
 
+
 def _get_inputs(M, K, N, device="cuda"):
     """return 2D Tensor of input weight bias and redisual input"""
 
@@ -46,9 +54,10 @@ def _get_inputs(M, K, N, device="cuda"):
     r = torch.rand_like(x, dtype=torch.float32)
     if K != N:
         r = r_torch = None
-    
+
     return x, w, b, r
- 
+
+
 def torch_ffn(x, w, b=None, r=None):
     z = x @ w
     if b is not None:
@@ -58,6 +67,7 @@ def torch_ffn(x, w, b=None, r=None):
         z += r
     return z
 
+
 @pytest.mark.parametrize("M,N,K", [(128, 128, 64)])
 def test_fused_ffn(M, N, K):
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -67,15 +77,15 @@ def test_fused_ffn(M, N, K):
     z_torch = torch_ffn(x_torch, w_torch, b=None, r=None)
     z = fused_linear(x, w)
     assert torch.allclose(z, z_torch, atol=1e-2), (z - z_torch).abs().max()
-    
-    
+
+
 @pytest.mark.parametrize("M", [128, 32])
 @pytest.mark.parametrize("K", [32, 128, 64])
 def test_rmsnorm(M, K):
     N = 32
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print("device is ", device)
-    
+
     x, *_ = _get_inputs(M, K, N, device)
     x_torch, *_ = _get_inputs(M, K, N, device)
 
@@ -85,14 +95,15 @@ def test_rmsnorm(M, K):
 
     x = rmsnorm(x, rmsnorm_pytorch.weight.data).to(device)
     assert torch.allclose(x, x_torch, atol=1e-4)
-    
+
+
 @pytest.mark.parametrize("M", [128, 32, 64])
 @pytest.mark.parametrize("K", [32, 128, 64])
 def test_layernorm(M, K):
     N = 32
     device = "cuda" if torch.cuda.is_available() else "cpu"
     print("device is ", device)
-    
+
     x, *_ = _get_inputs(M, K, N, device)
     x_torch, *_ = _get_inputs(M, K, N, device)
 
@@ -100,7 +111,9 @@ def test_layernorm(M, K):
     layernorm_pytorch = nn.LayerNorm(K).to(device)
     x_torch = layernorm_pytorch(x_torch)
 
-    x = layernorm(x, layernorm_pytorch.weight.data, layernorm_pytorch.bias.data).to(device)
+    x = layernorm(x, layernorm_pytorch.weight.data, layernorm_pytorch.bias.data).to(
+        device
+    )
     assert torch.allclose(x, x_torch, atol=1e-5)
 
 
@@ -109,15 +122,16 @@ def test_layernorm(M, K):
 def test_softmax(M, K):
     N = 32
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    
+
     x, *_ = _get_inputs(M, K, N, device)
     x_torch, *_ = _get_inputs(M, K, N, device)
-    
+
     # 模块及其所有参数（如 self.weight）都位于指定设备上（CPU 或 GPU）
     output_torch = torch.softmax(x, axis=-1).to(device)
     output = softmax_fwd(x).to(device)
     assert torch.allclose(output, output_torch, atol=1e-5)
 
+
 def torch_attention(q, k, v, attention_mask=None, is_causal=False):
     assert q.shape == k.shape == v.shape
     B, N, L, H = q.shape
@@ -136,20 +150,28 @@ def torch_attention(q, k, v, attention_mask=None, is_causal=False):
 
     return ref_out
 
+
 @pytest.mark.parametrize("B,N", [(4, 8), (8, 16), (24, 32), (64, 20)])
-@pytest.mark.parametrize("L", [128,256,])
+@pytest.mark.parametrize(
+    "L",
+    [
+        128,
+        256,
+    ],
+)
 @pytest.mark.parametrize("H", [32, 64])
 def test_flash_attention_v1(B, N, L, H):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     q, k, v = _get_attn_inputs(B, N, L, H, device)
     batch, heads, m_size, dhead = q.shape
-    atten_out = torch.empty_like(q) 
+    atten_out = torch.empty_like(q)
     sm_scale = 1 / math.sqrt(dhead)
     z_torch = torch_attention(q, k, v)
     z = flash_attention_v1(q, k, v, sm_scale)
     print(f"z_torch: {z_torch[0][0][0][0]}, z: {z[0][0][0][0]}")
     assert torch.allclose(z[0], z_torch[0], atol=1e-3), (z - z_torch).abs().max()
 
+
 def get_tol(dtype: torch.dtype) -> Dict:
     if dtype == torch.bfloat16:
         return dict(atol=1e-2, rtol=1e-2)
@@ -162,11 +184,13 @@ def get_tol(dtype: torch.dtype) -> Dict:
 def _overlapping_grad(output: torch.Tensor) -> torch.Tensor:
     return output.sum() * 2
 
+
 # Gradient is a full tensor
 def _non_overlapping_grad(output: torch.Tensor) -> torch.Tensor:
     t = torch.ones_like(output)
     return torch.sum(output * t)
 
+
 ####################################### triton 版rope 算法单元测试 #################################
 @pytest.mark.parametrize("dtype", [torch.float32, torch.float16])
 @pytest.mark.parametrize("seq_length", [1024, 2048])
@@ -205,9 +229,7 @@ def test_triton_rope(
     emb = rotary_pos_emb(seq_length)
 
     # triton
-    output_triton = rope(
-        t, emb, tensor_format=tensor_format
-    )
+    output_triton = rope(t, emb, tensor_format=tensor_format)
 
     loss_triton = loss_func(output_triton)
     loss_triton.backward()
@@ -216,7 +238,10 @@ def test_triton_rope(
 
     # te
     output_te = apply_rotary_pos_emb(
-        t, emb, tensor_format=tensor_format, fused=True,
+        t,
+        emb,
+        tensor_format=tensor_format,
+        fused=True,
     )
 
     loss_te = loss_func(output_te)
@@ -226,4 +251,4 @@ def test_triton_rope(
 
     torch.testing.assert_close(output_te, output_triton, **get_tol(dtype))
     torch.testing.assert_close(grad_te, grad_triton, **get_tol(dtype))
-    assert output_te.is_contiguous()
\ No newline at end of file
+    assert output_te.is_contiguous()
diff --git a/tests/softmax_native.py b/tests/softmax_native.py
index 853d82b..10de02b 100644
--- a/tests/softmax_native.py
+++ b/tests/softmax_native.py
@@ -1,5 +1,6 @@
-import triton,torch
-import triton.language as tl 
+import triton, torch
+import triton.language as tl
+
 
 def naive_softmax(x: torch.Tensor) -> torch.Tensor:
     """Compute row-wise softmax of X using native pytorch
@@ -8,20 +9,21 @@ def naive_softmax(x: torch.Tensor) -> torch.Tensor:
     this shift.
     # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements
     """
-    x_max = x.max(dim=1)[0] # read  MN elements ; write M  elements
-    safe_x = x - x_max[:, None] # read MN + M elements ; write MN elements
-    numerator = torch.exp(safe_x) # read  MN elements ; write MN elements
-    denominator = numerator.sum(dim=1) # read  MN elements ; write M  elements
+    x_max = x.max(dim=1)[0]  # read  MN elements ; write M  elements
+    safe_x = x - x_max[:, None]  # read MN + M elements ; write MN elements
+    numerator = torch.exp(safe_x)  # read  MN elements ; write MN elements
+    denominator = numerator.sum(dim=1)  # read  MN elements ; write M  elements
     ret = numerator / denominator[:, None]  # read MN + M elements ; write MN elements
-    
+
     return ret
 
+
 def online_softmax(x: torch.Tensor) -> torch.tensor:
-    """Iterative calculation and 2.5x faster than native softmax """
+    """Iterative calculation and 2.5x faster than native softmax"""
     row_cont, col_count = x.shape
     assert x.ndim == 2, f"only accepts 2D tensor now"
     output = torch.zeros_like(x)
-    
+
     for r in range(row_cont):
         row_max = x[r][0]
         normalizer = 0
@@ -31,11 +33,14 @@ def online_softmax(x: torch.Tensor) -> torch.tensor:
             row_max = max(pre_max, cur)
             # if cur > pre_max:
             #     print(f"Update row max now is {row_max}, row = {r}")
-            normalizer = normalizer * torch.exp(pre_max - row_max) + torch.exp(cur - row_max)
+            normalizer = normalizer * torch.exp(pre_max - row_max) + torch.exp(
+                cur - row_max
+            )
         output[r, :] = torch.exp(x[r, :] - row_max) / normalizer
-    
+
     return output
-    
+
+
 @triton.jit
 def _softmax_kernel_fwd(
     input_ptr,
@@ -43,29 +48,30 @@ def _softmax_kernel_fwd(
     output_ptr,
     stride_output_row,
     num_cols,
-    BLOCK_SIZE: tl.constexpr
+    BLOCK_SIZE: tl.constexpr,
 ):
     # 1, setup input ptrs
     row_id = tl.program_id(axis=0)
     row_start_ptr = input_ptr + row_id * stride_input_row
     col_offsets = tl.arange(0, BLOCK_SIZE)
     input_pointers = row_start_ptr + col_offsets
-    
+
     row_data_mask = col_offsets < num_cols
-    
+
     # 2, move to SRAM
     x = tl.load(input_pointers, mask=row_data_mask, other=0.0)
-    
+
     # 3, softmax cal itself
     safe_row = x - tl.max(x, axis=0)
     numerator = tl.exp(safe_row)
     denominator = tl.sum(numerator, axis=0)
     softmax_out = numerator / denominator
-    
+
     # 4, write back to HBM
     output_row_ptr = output_ptr + row_id * stride_input_row
     output_pointers = output_row_ptr + col_offsets
-    tl.store(output_pointers, softmax_out, mask = row_data_mask)
+    tl.store(output_pointers, softmax_out, mask=row_data_mask)
+
 
 @torch.no_grad()
 def softmax_native_fwd(x: torch.Tensor) -> torch.Tensor:
@@ -82,19 +88,18 @@ def softmax_native_fwd(x: torch.Tensor) -> torch.Tensor:
         num_warps = 8
 
     grid = (rows, 1)
-    
+
     # allocate output buffer
     softmax_out = torch.empty_like(x)
-    
+
     _softmax_kernel_fwd[grid](
         x,
-        x.stride(0), # input row stride
+        x.stride(0),  # input row stride
         softmax_out,
         softmax_out.stride(0),
         cols,
-        BLOCK_SIZE = BLOCK_SIZE,
-        num_warps = num_warps,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
     )
-    
+
     return softmax_out
-    
\ No newline at end of file
diff --git a/tests/softmax_split.py b/tests/softmax_split.py
index 95e0370..df77887 100644
--- a/tests/softmax_split.py
+++ b/tests/softmax_split.py
@@ -2,6 +2,7 @@
 from triton import language as tl
 import torch
 
+
 @triton.jit
 def logsumexp_kernel(
     out_ptr,
@@ -17,7 +18,9 @@ def logsumexp_kernel(
     n_offsets = pid_n * TILE_N + tl.arange(0, TILE_N)
     mask = n_offsets < N
     offset = pid_m * N + n_offsets
-    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(out_ptr.dtype.element_ty)
+    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(
+        out_ptr.dtype.element_ty
+    )
     m = tl.max(inp, 0)
     e = tl.exp(inp - m)
     z = tl.sum(e, 0)
@@ -26,18 +29,22 @@ def logsumexp_kernel(
     output_ptrs = out_ptr + pid_m * num_programs_n + pid_n
     tl.store(output_ptrs, logz)
 
+
 @triton.jit
 def combine_logsumexp_kernel(out_ptr, inp_ptr, M, N, TILE_N: tl.constexpr):
     pid_m = tl.program_id(0)
     n_offsets = tl.arange(0, TILE_N)
     mask = n_offsets < N
-    logzs = tl.load(inp_ptr + pid_m * N + n_offsets, other=-float("inf"), mask=mask).to(out_ptr.dtype.element_ty)
+    logzs = tl.load(inp_ptr + pid_m * N + n_offsets, other=-float("inf"), mask=mask).to(
+        out_ptr.dtype.element_ty
+    )
     m = tl.max(logzs, 0)
     e = tl.exp(logzs - m)
     z = tl.sum(e, 0)
     logz = m + tl.log(z)
     tl.store(out_ptr + pid_m, logz)
 
+
 @triton.jit
 def softmax_kernel(out_ptr, in_ptr, logz_ptr, M, N, TILE_N: tl.constexpr):
     pid_n = tl.program_id(0)
@@ -45,13 +52,14 @@ def softmax_kernel(out_ptr, in_ptr, logz_ptr, M, N, TILE_N: tl.constexpr):
     n_offsets = pid_n * TILE_N + tl.arange(0, TILE_N)
     offset = pid_m * N + n_offsets
     mask = n_offsets < N
-    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(out_ptr.dtype.element_ty)
+    inp = tl.load(in_ptr + offset, mask=mask, other=-float("inf")).to(
+        out_ptr.dtype.element_ty
+    )
     logz = tl.load(logz_ptr + pid_m).to(out_ptr.dtype.element_ty)
     out = tl.exp(inp - logz)
     tl.store(out_ptr + offset, out, mask=mask)
 
 
-
 def softmax_split(x):
     M, N = x.shape
 
@@ -63,7 +71,7 @@ def softmax_split(x):
     grid = (num_tiles_n, M, 1)
     logsumexp_kernel[grid](logz, x, M, N, TILE_N)
 
-    combined_logz = torch.empty((M, ), dtype=x.dtype, device=x.device)
+    combined_logz = torch.empty((M,), dtype=x.dtype, device=x.device)
     TILE_N = triton.next_power_of_2(num_tiles_n)
     grid = (M, 1, 1)
     combine_logsumexp_kernel[grid](combined_logz, logz, M, num_tiles_n, TILE_N)
diff --git a/tests/test_LlamaConfig.py b/tests/test_LlamaConfig.py
index 3a859d9..7192a38 100644
--- a/tests/test_LlamaConfig.py
+++ b/tests/test_LlamaConfig.py
@@ -1,20 +1,23 @@
 import json, os, sys
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.models.model_config import LlamaConfig
 
+
 def load_config_from_json(json_file_path: str) -> LlamaConfig:
-    with open(json_file_path, 'r', encoding='utf-8') as f:
+    with open(json_file_path, "r", encoding="utf-8") as f:
         config_dict = json.load(f)
-    config = LlamaConfig(config_dict, max_seq_len = 2048)
+    config = LlamaConfig(config_dict, max_seq_len=2048)
     return config
 
+
 if __name__ == "__main__":
     # 创建 LlamaConfig 实例，设置 max_batch_size=16
     config = LlamaConfig(max_batch_size=16)
     print("max_batch_size:", config.max_batch_size)
 
     # JSON 文件的路径
-    json_file_path = '/gemini/code/Llama-3.2-1B-Instruct/config.json'
+    json_file_path = "/gemini/code/Llama-3.2-1B-Instruct/config.json"
 
     # 加载配置
     config = load_config_from_json(json_file_path)
diff --git a/tests/test_LlamaForCausalLM.py b/tests/test_LlamaForCausalLM.py
index cd32c88..085ab92 100644
--- a/tests/test_LlamaForCausalLM.py
+++ b/tests/test_LlamaForCausalLM.py
@@ -2,7 +2,8 @@
 from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaTokenizerFast
 from transformers import pipeline
 
-def load_llama_model(model_path: str, device: str = 'cuda'):
+
+def load_llama_model(model_path: str, device: str = "cuda"):
     """
     Load the LLaMA model and tokenizer.
 
@@ -19,12 +20,15 @@ def load_llama_model(model_path: str, device: str = 'cuda'):
     model = LlamaForCausalLM.from_pretrained(
         model_path,
         torch_dtype=torch.float16,  # Use float16 for faster inference if supported
-        low_cpu_mem_usage=True
+        low_cpu_mem_usage=True,
     )
     model.to(device)
     return model, tokenizer
 
-def generate_text(model, tokenizer, prompt: str, max_length: int = 50, device: str = 'cuda'):
+
+def generate_text(
+    model, tokenizer, prompt: str, max_length: int = 50, device: str = "cuda"
+):
     """
     Generate text using the LLaMA model.
 
@@ -43,15 +47,16 @@ def generate_text(model, tokenizer, prompt: str, max_length: int = 50, device: s
         outputs = model.generate(
             **inputs,
             max_length=max_length,
-            do_sample=True,       # Enable sampling to introduce randomness
-            temperature=0.7,      # Adjust temperature for creativity
-            top_p=0.9,            # Use top-p (nucleus) sampling
-            repetition_penalty=1.2  # Penalize repetitions
+            do_sample=True,  # Enable sampling to introduce randomness
+            temperature=0.7,  # Adjust temperature for creativity
+            top_p=0.9,  # Use top-p (nucleus) sampling
+            repetition_penalty=1.2,  # Penalize repetitions
         )
     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return generated_text
 
-def pipline_text(model_id):    
+
+def pipline_text(model_id):
     pipe = pipeline(
         "text-generation",
         model=model_id,
@@ -59,7 +64,10 @@ def pipline_text(model_id):
         device_map="auto",
     )
     messages = [
-        {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
+        {
+            "role": "system",
+            "content": "You are a pirate chatbot who always responds in pirate speak!",
+        },
         {"role": "user", "content": "Who are you?"},
     ]
     outputs = pipe(
@@ -68,17 +76,20 @@ def pipline_text(model_id):
     )
     print(outputs[0]["generated_text"][-1])
 
+
 if __name__ == "__main__":
     # Specify the paths to your model and tokenizer directories
     model_path = "/gemini/code/Llama-3.2-1B-Instruct/"
 
     # Load the model and tokenizer
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     model, tokenizer = load_llama_model(model_path, device)
 
     # Test the model with a sample prompt
     prompt = "I believe the meaning of life is,"
-    generated_text = generate_text(model, tokenizer, prompt, max_length=100, device=device)
+    generated_text = generate_text(
+        model, tokenizer, prompt, max_length=100, device=device
+    )
 
     print("Prompt:")
     print(prompt)
diff --git a/tests/test_LlamaModel.py b/tests/test_LlamaModel.py
index 01236f5..3eaafdd 100644
--- a/tests/test_LlamaModel.py
+++ b/tests/test_LlamaModel.py
@@ -5,9 +5,10 @@
 from pathlib import Path
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.models.llama import LlamaModel, LlamaConfig
 
+
 def sample_top_p(probs, p):
     """
     Perform top-p (nucleus) sampling on a probability distribution.
@@ -33,12 +34,14 @@ def sample_top_p(probs, p):
     next_token = torch.gather(probs_idx, -1, next_token)
     return next_token
 
-def load_config_from_json(json_file_path: str, device: str="cuda") -> LlamaConfig:
+
+def load_config_from_json(json_file_path: str, device: str = "cuda") -> LlamaConfig:
     with open(json_file_path, "r") as f:
         config_dict = json.load(f)
-    config = LlamaConfig(config_dict, max_seq_len = 2048, device=device)
+    config = LlamaConfig(config_dict, max_seq_len=2048, device=device)
     return config
 
+
 def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     # config = LlamaConfig.from_pretrained(model_name_or_path)
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
@@ -50,7 +53,10 @@ def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     model.to(device)
     return model, tokenizer
 
-def load_custom_llam(model_name_or_path: str, model_args: LlamaConfig, device: str = "cuda"):
+
+def load_custom_llam(
+    model_name_or_path: str, model_args: LlamaConfig, device: str = "cuda"
+):
     checkpoints = sorted(Path(model_name_or_path).glob("*.pth"))
     assert len(checkpoints) > 0, f"no checkpoint files found in {model_name_or_path}"
     ckpt_path = checkpoints[0]
@@ -63,35 +69,40 @@ def load_custom_llam(model_name_or_path: str, model_args: LlamaConfig, device: s
     model.load_state_dict(state_dict, strict=True)
 
     return model
-    
-def load_and_convert_to_custom_llama(model_config: LlamaConfig, pretrained_model: AutoModelForCausalLM, device: str = "cuda"):
+
+
+def load_and_convert_to_custom_llama(
+    model_config: LlamaConfig,
+    pretrained_model: AutoModelForCausalLM,
+    device: str = "cuda",
+):
     # 将预训练模型的权重映射到自定义模型
     hf_sd = pretrained_model.state_dict()
     # 映射嵌入层  # 映射归一化层
     mapping = {
-        "model.norm.weight": "norm_weight", 
+        "model.norm.weight": "norm_weight",
         "model.embed_tokens.weight": "embed_tokens.weight",
         # "model.embed_tokens.weight": "lm_head.weight",
     }
 
     # 映射层
     layers = {
-        'model.layers.{i}.self_attn.q_proj.weight': 'layers.{i}.attention.wq.weight',
-        'model.layers.{i}.self_attn.k_proj.weight': 'layers.{i}.attention.wk.weight',
-        'model.layers.{i}.self_attn.v_proj.weight': 'layers.{i}.attention.wv.weight',
-        'model.layers.{i}.self_attn.o_proj.weight': 'layers.{i}.attention.wo.weight',
-        'model.layers.{i}.mlp.gate_proj.weight': 'layers.{i}.feed_forward.gate_proj.weight',
-        'model.layers.{i}.mlp.up_proj.weight': 'layers.{i}.feed_forward.up_proj.weight',
-        'model.layers.{i}.mlp.down_proj.weight': 'layers.{i}.feed_forward.down_proj.weight',
-        'model.layers.{i}.post_attention_layernorm.weight': 'layers.{i}.ffn_norm_weight',
-        'model.layers.{i}.input_layernorm.weight': 'layers.{i}.attention_norm_weight'
+        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.attention.wq.weight",
+        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.attention.wk.weight",
+        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.attention.wv.weight",
+        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.attention.wo.weight",
+        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.feed_forward.gate_proj.weight",
+        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.feed_forward.up_proj.weight",
+        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.feed_forward.down_proj.weight",
+        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.ffn_norm_weight",
+        "model.layers.{i}.input_layernorm.weight": "layers.{i}.attention_norm_weight",
     }
 
     #  根据 Transformer 层数量生成映射
     for i in range(model_config.num_layers):
         for hf_key, custom_key in layers.items():
-            mapped_key = hf_key.format(i=i) # hf 权重参数字典 key
-            custom_mapped_key = custom_key.format(i=i) # 自定义模型权重参数字典 key
+            mapped_key = hf_key.format(i=i)  # hf 权重参数字典 key
+            custom_mapped_key = custom_key.format(i=i)  # 自定义模型权重参数字典 key
             mapping[mapped_key] = custom_mapped_key
 
     # 创建新的状态字典
@@ -99,12 +110,12 @@ def load_and_convert_to_custom_llama(model_config: LlamaConfig, pretrained_model
     for hf_key, tensor in tqdm(hf_sd.items(), desc="Mapping weights"):
         custom_key = mapping.get(hf_key, None)
         if custom_key is not None:
-            new_sd[custom_key] = tensor # 浅拷贝
+            new_sd[custom_key] = tensor  # 浅拷贝
         else:
             print(f"custom_key: {custom_key}")
             # 如果某些权重不需要映射，可以选择忽略或处理
             pass  # 忽略未映射的权重
-    
+
     new_sd["lm_head.weight"] = hf_sd["model.embed_tokens.weight"]
 
     # 打印预训练模型的参数名称
@@ -117,15 +128,20 @@ def load_and_convert_to_custom_llama(model_config: LlamaConfig, pretrained_model
     for name in new_sd.keys():
         print(name)
 
-    torch.save(new_sd, "/gemini/code/Llama-3.2-1B-Instruct/my_weight/my_llama3.2-1B.pth")
+    torch.save(
+        new_sd, "/gemini/code/Llama-3.2-1B-Instruct/my_weight/my_llama3.2-1B.pth"
+    )
     # torch.set_default_tensor_type(torch.cuda.HalfTensor)
     torch.set_default_dtype(torch.half)
     my_model = LlamaModel(model_args).to(device)
     my_model.load_state_dict(new_sd, strict=True)
-    
+
     return my_model
 
-def decode_stage_compare(original_model, custom_model, tokenizer, input_text: str, device: str = "cuda"):
+
+def decode_stage_compare(
+    original_model, custom_model, tokenizer, input_text: str, device: str = "cuda"
+):
     """
     在解码阶段逐步比较原始模型和自定义模型的输出。
 
@@ -138,8 +154,8 @@ def decode_stage_compare(original_model, custom_model, tokenizer, input_text: st
     """
     # 准备输入
     inputs = tokenizer(input_text, return_tensors="pt").to(device)
-    input_ids = inputs['input_ids']
-    attention_mask = inputs.get('attention_mask', None)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs.get("attention_mask", None)
 
     # 设置生成参数
     max_new_tokens = 10
@@ -153,19 +169,28 @@ def decode_stage_compare(original_model, custom_model, tokenizer, input_text: st
     for step in tqdm(range(max_new_tokens), desc="Decoding steps"):
         # 原始模型生成下一个 token
         with torch.no_grad():
-            original_outputs = original_model(original_generated, 
-                                              attention_mask=attention_mask, 
-                                              output_hidden_states=True,
-                                              return_dict = True,
-                                              use_cache = True)
-            original_logits = original_outputs.logits[:, -1, :]  # 获取最后一个时间步的 logits
+            original_outputs = original_model(
+                original_generated,
+                attention_mask=attention_mask,
+                output_hidden_states=True,
+                return_dict=True,
+                use_cache=True,
+            )
+            original_logits = original_outputs.logits[
+                :, -1, :
+            ]  # 获取最后一个时间步的 logits
             original_next_token = torch.argmax(original_logits, dim=-1, keepdim=True)
 
         # 自定义模型生成下一个 token
         with torch.no_grad():
-            custom_outputs_logits = custom_model(custom_generated, start_pos=original_generated.shape[1]-1,)
-            probs = torch.softmax(original_logits[:, -1] / 0.6, dim=-1) # temperature = 0.6
-            custom_next_token = sample_top_p(probs, p = 0.9)
+            custom_outputs_logits = custom_model(
+                custom_generated,
+                start_pos=original_generated.shape[1] - 1,
+            )
+            probs = torch.softmax(
+                original_logits[:, -1] / 0.6, dim=-1
+            )  # temperature = 0.6
+            custom_next_token = sample_top_p(probs, p=0.9)
 
         # 比较所有 layer 的隐藏层状态输出
         # print("original_outputs.hidden_states length is", len(original_outputs.hidden_states)) # 17
@@ -173,17 +198,21 @@ def decode_stage_compare(original_model, custom_model, tokenizer, input_text: st
 
         layer_idxs = range(len(custom_model.hidden_states))
 
-        print(f"============== Step {step+1}: Layer Compares: ====================")
+        print(f"============== Step {step + 1}: Layer Compares: ====================")
         for index in tqdm(layer_idxs):
             custom_layer_output = custom_model.hidden_states[index]
             original_layer_output = original_outputs.hidden_states[index]
 
-            difference = torch.abs(custom_layer_output - original_layer_output).mean().item()
+            difference = (
+                torch.abs(custom_layer_output - original_layer_output).mean().item()
+            )
             print(f"Difference at layer {index}: {difference}")
 
         # # 比较 logits
         logits_diff = torch.abs(original_logits - custom_outputs_logits).mean().item()
-        print(f"=========== Step {step+1}: Logits difference is: {logits_diff} ================")
+        print(
+            f"=========== Step {step + 1}: Logits difference is: {logits_diff} ================"
+        )
 
         # if logits_diff >= 1e-2:
         #     print(f"Step {step+1} failed: Logits difference {logits_diff} exceeds threshold.")
@@ -200,18 +229,33 @@ def decode_stage_compare(original_model, custom_model, tokenizer, input_text: st
         # else:
         #     print(f"Step {step+1} passed.")
 
-        # # 生成下一个 token, 模型内部已经集成了过去的 kv cache 
+        # # 生成下一个 token, 模型内部已经集成了过去的 kv cache
         # original_generated = torch.cat([original_generated, original_next_token], dim=-1)
         # custom_generated = torch.cat([custom_generated, custom_next_token], dim=-1)
 
         # 更新 attention mask if necessary
         if attention_mask is not None:
-            attention_mask = torch.cat([attention_mask, torch.ones((attention_mask.shape[0], 1), device=device, dtype=attention_mask.dtype)], dim=-1)
+            attention_mask = torch.cat(
+                [
+                    attention_mask,
+                    torch.ones(
+                        (attention_mask.shape[0], 1),
+                        device=device,
+                        dtype=attention_mask.dtype,
+                    ),
+                ],
+                dim=-1,
+            )
 
     print("Decode stage comparison completed.")
 
-def compare_models(original_model, custom_model, tokenizer, input_text: str, device: str = "cuda"):
-    print("\n############################ [Starting Prefill stage comparison] #################################")
+
+def compare_models(
+    original_model, custom_model, tokenizer, input_text: str, device: str = "cuda"
+):
+    print(
+        "\n############################ [Starting Prefill stage comparison] #################################"
+    )
     # 准备输入
     inputs = tokenizer(input_text, return_tensors="pt").to(device)
     # 原始模型输出
@@ -220,7 +264,7 @@ def compare_models(original_model, custom_model, tokenizer, input_text: str, dev
     original_logits = original_outputs.logits
 
     # 自定义模型输出
-    tokens = inputs['input_ids']
+    tokens = inputs["input_ids"]
     with torch.no_grad():
         custom_outputs = custom_model(tokens, start_pos=0)
     custom_logits = custom_outputs
@@ -236,30 +280,38 @@ def compare_models(original_model, custom_model, tokenizer, input_text: str, dev
     else:
         print("Models are not consistent.")
 
-    print(f"custom_model.hidden_states number: {len(custom_model.hidden_states)}, original_outputs.hidden_states number: {len(original_outputs.hidden_states)} ")
-    
+    print(
+        f"custom_model.hidden_states number: {len(custom_model.hidden_states)}, original_outputs.hidden_states number: {len(original_outputs.hidden_states)} "
+    )
+
     # 比较所有 layer 的隐藏层状态输出
     layer_idxs = range(len(custom_model.hidden_states))
     for index in tqdm(layer_idxs):
         custom_layer_output = custom_model.hidden_states[index]
         original_layer_output = original_outputs.hidden_states[index]
 
-        difference = torch.abs(custom_layer_output - original_layer_output).mean().item()
+        difference = (
+            torch.abs(custom_layer_output - original_layer_output).mean().item()
+        )
         print(f"Difference at layer {index}: {difference}")
 
     # 解码阶段比较
-    print("\n############################ [Starting Decode stage comparison] #################################")
+    print(
+        "\n############################ [Starting Decode stage comparison] #################################"
+    )
     decode_stage_compare(original_model, custom_model, tokenizer, input_text, device)
 
+
 if __name__ == "__main__":
-    
     original_model_path = "/gemini/code/Llama-3.2-1B-Instruct"
     my_model_path = "/gemini/code/Llama-3.2-1B-Instruct/my_weight"
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    
+
     # 定义模型配置参数
-    json_file_path = '/gemini/code/Llama-3.2-1B-Instruct/my_weight/config.json' # JSON 文件的路径
-    model_args = load_config_from_json(json_file_path, device) # 加载配置
+    json_file_path = (
+        "/gemini/code/Llama-3.2-1B-Instruct/my_weight/config.json"  # JSON 文件的路径
+    )
+    model_args = load_config_from_json(json_file_path, device)  # 加载配置
 
     # 加载原始模型
     original_model, tokenizer = load_original_llama(original_model_path, device)
diff --git a/tests/test_LlavaConfig.py b/tests/test_LlavaConfig.py
index 11f414b..ede95b1 100644
--- a/tests/test_LlavaConfig.py
+++ b/tests/test_LlavaConfig.py
@@ -1,8 +1,10 @@
 import json, os, sys
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.models.model_config import LlavaConfig
 from lite_llama.models.llava import LlavaLlama
 
+
 def test_llava_config():
     # 示例配置 JSON 字符串
     config_json = """
@@ -57,20 +59,21 @@ def test_llava_config():
 
 
 def test_LlavaLlama_structure():
-	model_path = "/gemini/code/llm_weights/llava-hf/llava-1.5-7b-hf"
-	from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-	from transformers import LlavaConfig
+    model_path = "/gemini/code/llm_weights/llava-hf/llava-1.5-7b-hf"
+    from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+    from transformers import LlavaConfig
+
+    # 使用 init_empty_weights 初始化空模型
+    with init_empty_weights():
+        llava_config = LlavaConfig.from_pretrained(model_path)
+        # print(llava_config) # 打印配置以验证
 
-	# 使用 init_empty_weights 初始化空模型
-	with init_empty_weights():
-		llava_config = LlavaConfig.from_pretrained(model_path)
-		# print(llava_config) # 打印配置以验证
+        model = LlavaLlama(llava_config)
+        print(model)  # 打印模型结构
+        for name, param in list(model.named_parameters())[:]:  # 打印模型参数
+            print(name, param.shape)
 
-		model = LlavaLlama(llava_config)        
-		print(model) # 打印模型结构
-		for name, param in list(model.named_parameters())[:]:  # 打印模型参数
-			print(name, param.shape)
 
 if __name__ == "__main__":
     test_llava_config()
-	test_LlavaLlama_structure()
\ No newline at end of file
+    test_LlavaLlama_structure()
diff --git a/tests/test_LlavaForConditionalGeneration.py b/tests/test_LlavaForConditionalGeneration.py
index b64470b..e6fed1d 100644
--- a/tests/test_LlavaForConditionalGeneration.py
+++ b/tests/test_LlavaForConditionalGeneration.py
@@ -1,7 +1,12 @@
 from PIL import Image
-import requests,torch
-from transformers import AutoProcessor, LlavaForConditionalGeneration, LlavaConfig, \
-                        LlavaNextConfig, LlavaNextForConditionalGeneration
+import requests, torch
+from transformers import (
+    AutoProcessor,
+    LlavaForConditionalGeneration,
+    LlavaConfig,
+    LlavaNextConfig,
+    LlavaNextForConditionalGeneration,
+)
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
 model_path = "/gemini/code/llm_weights/llava-hf/llava-1.5-7b-hf"
@@ -15,8 +20,8 @@
 model = load_checkpoint_and_dispatch(model, model_path, device_map="auto")
 
 # model = LlavaForConditionalGeneration.from_pretrained(
-#     model_path, 
-#     torch_dtype=torch.float16, 
+#     model_path,
+#     torch_dtype=torch.float16,
 #     low_cpu_mem_usage=True,
 # ).to("cuda")
 
@@ -32,7 +37,11 @@
 
 # Generate
 generate_ids = model.generate(**inputs, max_new_tokens=30)
-print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+print(
+    processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+)
 
 """
 USER: 
@@ -45,7 +54,7 @@
 print(f"模型总参数量: {sum(p.numel() for p in model.parameters()) / 1e6:.2f} M")
 
 # 打印模型参数信息
-for name, param in list(model.named_parameters()): 
+for name, param in list(model.named_parameters()):
     print(name, param.shape)
 
 """
@@ -111,4 +120,4 @@
     (lm_head): Linear(in_features=4096, out_features=128320, bias=False)
   )
 )
-"""
\ No newline at end of file
+"""
diff --git a/tests/test_LlavaLlama.py b/tests/test_LlavaLlama.py
index 91228a9..134772f 100644
--- a/tests/test_LlavaLlama.py
+++ b/tests/test_LlavaLlama.py
@@ -3,18 +3,18 @@
 import sys, os
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.models.llava import LlavaLlama
 
 hf_model_path = "/gemini/code/liuhaotian/llava-v1.5-7b"
 
+
 def test_LlavaLlama_structure(hf_model_path):
-    
     # 使用 init_empty_weights 初始化空模型
     with init_empty_weights():
         config = LlavaConfig.from_pretrained(hf_model_path)
         model = LlavaLlama(config)
-        
+
         # 打印没有加载权重的 LlavaLlama 模型结构
         print(model)
         # 打印模型的简单摘要
@@ -24,5 +24,6 @@ def test_LlavaLlama_structure(hf_model_path):
         for name, param in list(model.named_parameters())[:]:  # 打印模型参数
             print(name, param.shape)
 
+
 if __name__ == "__main__":
     test_LlavaLlama_structure(hf_model_path)
diff --git a/tests/test_Qwen2ForCausalLM.py b/tests/test_Qwen2ForCausalLM.py
index a99316f..f6e67fb 100644
--- a/tests/test_Qwen2ForCausalLM.py
+++ b/tests/test_Qwen2ForCausalLM.py
@@ -3,9 +3,7 @@
 model_name = "/gemini/code/llm_weights/Qwen/Qwen2.5-3B-Instruct"
 
 model = Qwen2ForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype="auto",
-    device_map="auto"
+    model_name, torch_dtype="auto", device_map="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(model)
@@ -17,25 +15,24 @@
 prompt = "给出 c++ 多线程语法和编程示例代码."
 
 messages = [
-    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
-    {"role": "user", "content": prompt}
+    {
+        "role": "system",
+        "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+    },
+    {"role": "user", "content": prompt},
 ]
 
 text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
+    messages, tokenize=False, add_generation_prompt=True
 )
 print("After call apply_chat_template, text is ", text)
 
 model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
-generated_ids = model.generate(
-    **model_inputs,
-    max_new_tokens=512
-)
+generated_ids = model.generate(**model_inputs, max_new_tokens=512)
 generated_ids = [
-    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    output_ids[len(input_ids) :]
+    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
 ]
 
 response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
@@ -69,4 +66,4 @@
   )
   (lm_head): Linear(in_features=1536, out_features=151936, bias=False)
 )
-"""
\ No newline at end of file
+"""
diff --git a/tests/test_attention.py b/tests/test_attention.py
index 1944850..a5a118c 100644
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@@ -1,8 +1,10 @@
-import torch, os,sys
+import torch, os, sys
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.models.llama import *
 from lite_llama.tests.test_torch_rope import apply_rotary_emb
 
+
 def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """同一组的 kv cache 复制多份"""
     batch_size, seq_len, num_kv_heads, head_dim = x.shape
@@ -17,6 +19,7 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
         .reshape(batch_size, seq_len, num_kv_heads * n_rep, head_dim)
     )
 
+
 class ModelArgs:
     def __init__(self):
         self.dim = 64  # 模型维度
@@ -26,6 +29,7 @@ def __init__(self):
         self.max_seq_len = 16
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
 class FusedAttention(nn.Module):
     def __init__(self, args):
         super().__init__()
@@ -43,17 +47,43 @@ def __init__(self, args):
         self.hidden_size = args.n_heads * self.head_dim
 
         # 定义线性层，并移动到设备
-        self.wq = nn.Linear(args.dim, self.n_heads_q * self.head_dim, bias=False, dtype=torch.float16).to(device)
-        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False, dtype=torch.float16).to(device)
-        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False, dtype=torch.float16).to(device)
-        self.wo = nn.Linear(self.n_heads_q * self.head_dim, args.dim, bias=False, dtype=torch.float16).to(device)
+        self.wq = nn.Linear(
+            args.dim, self.n_heads_q * self.head_dim, bias=False, dtype=torch.float16
+        ).to(device)
+        self.wk = nn.Linear(
+            args.dim, self.n_kv_heads * self.head_dim, bias=False, dtype=torch.float16
+        ).to(device)
+        self.wv = nn.Linear(
+            args.dim, self.n_kv_heads * self.head_dim, bias=False, dtype=torch.float16
+        ).to(device)
+        self.wo = nn.Linear(
+            self.n_heads_q * self.head_dim, args.dim, bias=False, dtype=torch.float16
+        ).to(device)
 
         # 提前按最大可分配空间分配好 kv cache 张量，并注册为 buffer
-        self.register_buffer('cache_k', torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim), dtype=torch.float16, device=device), persistent=False)
-        self.register_buffer('cache_v', torch.zeros((args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim), dtype=torch.float16, device=device), persistent=False)
+        self.register_buffer(
+            "cache_k",
+            torch.zeros(
+                (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim),
+                dtype=torch.float16,
+                device=device,
+            ),
+            persistent=False,
+        )
+        self.register_buffer(
+            "cache_v",
+            torch.zeros(
+                (args.max_batch_size, args.max_seq_len, self.n_kv_heads, self.head_dim),
+                dtype=torch.float16,
+                device=device,
+            ),
+            persistent=False,
+        )
 
     def forward(self, x: torch.Tensor, start_pos: int):
-        batch_size, seq_len, _ = x.shape  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
+        batch_size, seq_len, _ = (
+            x.shape
+        )  # prefill: (B, Seq_Len, Dim); decode: (B, 1, Dim)
 
         x = x.to(torch.float16)  # 确保输入为 float16
 
@@ -63,7 +93,9 @@ def forward(self, x: torch.Tensor, start_pos: int):
         xv = self.wv(x).view(batch_size, seq_len, self.n_kv_heads, self.head_dim)
 
         # 2. 计算 RoPE 位置编码
-        freqs_cis = precompute_freqs_cis(dim=self.head_dim, seq_len=seq_len, device=x.device)
+        freqs_cis = precompute_freqs_cis(
+            dim=self.head_dim, seq_len=seq_len, device=x.device
+        )
         xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
 
         # 3. 更新缓存
@@ -71,8 +103,12 @@ def forward(self, x: torch.Tensor, start_pos: int):
         self.cache_v[:batch_size, start_pos : start_pos + seq_len, :, :] = xv
 
         # 4. 获取累积的 K V
-        keys = self.cache_k[:batch_size, : start_pos + seq_len, :, :]  # (B, Seq_Len_KV, H_KV, D)
-        values = self.cache_v[:batch_size, : start_pos + seq_len, :, :]  # (B, Seq_Len_KV, H_KV, D)
+        keys = self.cache_k[
+            :batch_size, : start_pos + seq_len, :, :
+        ]  # (B, Seq_Len_KV, H_KV, D)
+        values = self.cache_v[
+            :batch_size, : start_pos + seq_len, :, :
+        ]  # (B, Seq_Len_KV, H_KV, D)
 
         # 5. GQA
         keys = repeat_kv(keys, self.n_rep)  # (B, Seq_Len_KV, H_Q, D)
@@ -84,7 +120,9 @@ def forward(self, x: torch.Tensor, start_pos: int):
         values = values.transpose(1, 2)  # (B, H_Q, Seq_Len_KV, D)
 
         # 7. 计算注意力得分
-        scores = torch.matmul(xq, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)  # (B, H_Q, Seq_Len_Q, Seq_Len_KV)
+        scores = torch.matmul(xq, keys.transpose(-2, -1)) / math.sqrt(
+            self.head_dim
+        )  # (B, H_Q, Seq_Len_Q, Seq_Len_KV)
 
         # 8. 应用因果掩码
         seq_len_q = xq.shape[2]
@@ -98,11 +136,14 @@ def forward(self, x: torch.Tensor, start_pos: int):
         attn_output = torch.matmul(attn_weights, values)  # (B, H_Q, Seq_Len_Q, D)
 
         # 10. 合并 heads 并输出
-        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, -1)  # (B, Seq_Len_Q, H_Q * D)
+        attn_output = (
+            attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len_q, -1)
+        )  # (B, Seq_Len_Q, H_Q * D)
         output = self.wo(attn_output)
 
         return output
 
+
 def test_fused_attention():
     # 模型参数
     args = ModelArgs()
@@ -119,16 +160,23 @@ def test_fused_attention():
     fused_attention = FusedAttention(args).to(args.device)
 
     # 初始化 PyTorch 的 MultiheadAttention，并移动到设备
-    mha = nn.MultiheadAttention(embed_dim=dim, num_heads=args.n_heads, batch_first=True, dtype=torch.float16).to(args.device)
+    mha = nn.MultiheadAttention(
+        embed_dim=dim, num_heads=args.n_heads, batch_first=True, dtype=torch.float16
+    ).to(args.device)
 
     # 同步权重
     with torch.no_grad():
         # 将 FusedAttention 的权重复制到 MultiheadAttention
-        mha.in_proj_weight.copy_(torch.cat([
-            fused_attention.wq.weight,
-            fused_attention.wk.weight,
-            fused_attention.wv.weight
-        ], dim=0))
+        mha.in_proj_weight.copy_(
+            torch.cat(
+                [
+                    fused_attention.wq.weight,
+                    fused_attention.wk.weight,
+                    fused_attention.wv.weight,
+                ],
+                dim=0,
+            )
+        )
 
         # 设置输出投影权重
         mha.out_proj.weight.copy_(fused_attention.wo.weight)
@@ -140,12 +188,15 @@ def test_fused_attention():
 
     # 比较输出
     difference = torch.abs(fused_output - mha_output).mean().item()
-    print(f"Average difference between FusedAttention and MultiheadAttention: {difference}")
+    print(
+        f"Average difference between FusedAttention and MultiheadAttention: {difference}"
+    )
 
     # 断言差异在可接受范围内
     assert difference < 1e-1, "FusedAttention output does not match MultiheadAttention"
 
     print("FusedAttention test passed!")
 
+
 if __name__ == "__main__":
-    test_fused_attention()
\ No newline at end of file
+    test_fused_attention()
diff --git a/tests/test_available_blocks.py b/tests/test_available_blocks.py
index a25735b..f963608 100644
--- a/tests/test_available_blocks.py
+++ b/tests/test_available_blocks.py
@@ -1,24 +1,23 @@
 import torch, gc
 from typing import List, Tuple
 from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
-import logging, json,os,sys
+import logging, json, os, sys
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.models.model_config import LlamaConfig
 
 logger = logging.getLogger(__name__)
 
-def load_config_from_json(json_file_path: str, device: str="cuda") -> LlamaConfig:
+
+def load_config_from_json(json_file_path: str, device: str = "cuda") -> LlamaConfig:
     with open(json_file_path, "r") as f:
         config_dict = json.load(f)
-    config = LlamaConfig(config_dict, max_seq_len = 2048, device=device)
+    config = LlamaConfig(config_dict, max_seq_len=2048, device=device)
     return config
 
-def _get_cache_block_size(
-    model_config,
-    block_size: int = 1
-) -> int:
+
+def _get_cache_block_size(model_config, block_size: int = 1) -> int:
     head_size = model_config.head_dim
     num_heads = model_config.num_kv_heads
     num_attention_layers = model_config.num_layers
@@ -26,12 +25,15 @@ def _get_cache_block_size(
     key_cache_block = block_size * num_heads * head_size
     value_cache_block = key_cache_block
     total = num_attention_layers * (key_cache_block + value_cache_block)
-    dtype_size = 2 # torch.float16
+    dtype_size = 2  # torch.float16
 
     return dtype_size * total
 
+
 @torch.inference_mode()
-def determine_num_available_blocks(model_config, gpu_memory_utilization = 0.9) -> Tuple[int, int]:
+def determine_num_available_blocks(
+    model_config, gpu_memory_utilization=0.9
+) -> Tuple[int, int]:
     """
     评估模型的峰值内存使用情况，以确定在不发生内存溢出的情况下可以分配的 KV（键值）缓存块的数量。
 
@@ -53,43 +55,42 @@ def determine_num_available_blocks(model_config, gpu_memory_utilization = 0.9) -
     # 计算模型加载后的峰值内存使用量
     # Get the peak memory allocation recorded by torch
     peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
-    
+
     # 清理未使用的缓存，计算非Torch分配的内存
     torch.cuda.empty_cache()
     torch_allocated_bytes = torch.cuda.memory_stats()["allocated_bytes.all.current"]
 
     total_allocated_bytes = torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
     non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-    
+
     if non_torch_allocations > 0:
         peak_memory += non_torch_allocations
 
-    available_kv_cache_memory = (
-        total_gpu_memory * gpu_memory_utilization -
-        peak_memory)
-    
+    available_kv_cache_memory = total_gpu_memory * gpu_memory_utilization - peak_memory
+
     # 计算每个缓存块的大小
     cache_block_size = _get_cache_block_size(model_config)
     # 计算在剩余可用内存下，最多可以分配的 GPU 缓存块数量
     num_gpu_blocks = int(
-        (total_gpu_memory * gpu_memory_utilization -
-         peak_memory) // cache_block_size
+        (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size
     )
     # 确保缓存块数量不为负数
     num_gpu_blocks = max(num_gpu_blocks, 0)
 
     logger.info(
-            "Memory profiling results: total_gpu_memory=%.2fGiB \n"
-            " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB \n"
-            " memory_usage_post_profile=%.2fGib \n"
-            " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB \n"
-            " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3),
-            (total_gpu_memory - free_memory_pre_profile) / (1024**3),
-            (peak_memory - non_torch_allocations) / (1024**3),
-            total_allocated_bytes / (1024**3),
-            non_torch_allocations / (1024**3),
-            available_kv_cache_memory / (1024**3),
-            gpu_memory_utilization)
+        "Memory profiling results: total_gpu_memory=%.2fGiB \n"
+        " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB \n"
+        " memory_usage_post_profile=%.2fGib \n"
+        " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB \n"
+        " gpu_memory_utilization=%.2f",
+        total_gpu_memory / (1024**3),
+        (total_gpu_memory - free_memory_pre_profile) / (1024**3),
+        (peak_memory - non_torch_allocations) / (1024**3),
+        total_allocated_bytes / (1024**3),
+        non_torch_allocations / (1024**3),
+        available_kv_cache_memory / (1024**3),
+        gpu_memory_utilization,
+    )
 
     # 进行垃圾回收，释放未使用的内存
     gc.collect()
@@ -99,6 +100,7 @@ def determine_num_available_blocks(model_config, gpu_memory_utilization = 0.9) -
 
     return num_gpu_blocks, 0
 
+
 def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     # config = LlamaConfig.from_pretrained(model_name_or_path)
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
@@ -110,6 +112,7 @@ def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     model.to(device)
     return model, tokenizer
 
+
 if __name__ == "__main__":
     # 定义模型权重路径及配置参数
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -117,6 +120,8 @@ def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     # 加载原始模型
     original_model, tokenizer = load_original_llama(original_model_path, device)
     # 定义模型配置参数
-    json_file_path = '/gemini/code/Llama-3.2-1B-Instruct/my_weight/config.json' # JSON 文件的路径
-    model_config = load_config_from_json(json_file_path, device) # 加载配置
-    determine_num_available_blocks(model_config)
\ No newline at end of file
+    json_file_path = (
+        "/gemini/code/Llama-3.2-1B-Instruct/my_weight/config.json"  # JSON 文件的路径
+    )
+    model_config = load_config_from_json(json_file_path, device)  # 加载配置
+    determine_num_available_blocks(model_config)
diff --git a/tests/test_cuda_graph.py b/tests/test_cuda_graph.py
index 91b2983..fb5154b 100644
--- a/tests/test_cuda_graph.py
+++ b/tests/test_cuda_graph.py
@@ -1,18 +1,20 @@
 import torch, time
-import torch.nn as nn 
+import torch.nn as nn
 from dataclasses import dataclass
 from typing import List
 from transformers import GPT2Tokenizer
 
+
 @dataclass
 class ModelConfig:
     # config reference: https://huggingface.co/openai-community/gpt2/blob/main/config.json
     num_layers: int = 12  # n_layer
-    embedding_dim: int  = 768 # hidden_size, n_embd
-    num_heads: int = 12   # n_head
-    vocab_size: int = 50257 # vocab_size
-    
-class CUDAGraphRunner():
+    embedding_dim: int = 768  # hidden_size, n_embd
+    num_heads: int = 12  # n_head
+    vocab_size: int = 50257  # vocab_size
+
+
+class CUDAGraphRunner:
     def __init__(self, model):
         self.model = model
         self._cuda_graph = None
@@ -24,7 +26,7 @@ def capture(self, x):
 
         torch.cuda.synchronize()
         self._cuda_graph = torch.cuda.CUDAGraph()
-        
+
         with torch.cuda.graph(self._cuda_graph):
             output = self.model(x)
         torch.cuda.synchronize()
@@ -37,13 +39,14 @@ def forward(self, x):
         self.graph_input.copy_(x)
         # Run the graph.
         self._cuda_graph.replay()
-        
+
         return self.graph_output
-    
+
     def __call__(self, *args, **kwargs):
         return self.forward(*args, **kwargs)
-    
-class ModelRunner():
+
+
+class ModelRunner:
     def __init__(self, model, seq_len=128):
         self.model = model
         self.seq_len = seq_len
@@ -51,8 +54,8 @@ def __init__(self, model, seq_len=128):
 
     @torch.inference_mode()
     def capture_model(self):
-        for batch in [1, 2, 4, 12]: # 提前设置一批 batch
-            input = torch.randn(batch, self.seq_len).cuda() #
+        for batch in [1, 2, 4, 12]:  # 提前设置一批 batch
+            input = torch.randn(batch, self.seq_len).cuda()  #
             graph_runner = CUDAGraphRunner(self.model)
             graph_runner.capture(input)
             self.graph_runners[batch] = graph_runner
@@ -61,13 +64,16 @@ def capture_model(self):
     def execute_model(self, x):
         batch = x.shape[0]
         if batch in self.graph_runners:
-            model_executable = self.graph_runners[batch] # 根据输入找到对应的 graph_runner
+            model_executable = self.graph_runners[
+                batch
+            ]  # 根据输入找到对应的 graph_runner
         else:
             print(f"warning, no cudagraph_runner, back to origin model")
-            model_executable = self.model # 回退到原始的 model
-        
+            model_executable = self.model  # 回退到原始的 model
+
         return model_executable(x)
 
+
 class SimpleGPT2(nn.Module):
     def __init__(self, model_config: ModelConfig):
         super(SimpleGPT2, self).__init__()
@@ -78,36 +84,39 @@ def __init__(self, model_config: ModelConfig):
 
         self.embed_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
         self.transformer_blocks = nn.ModuleList(
-            nn.TransformerEncoderLayer(d_model=self.embedding_dim, nhead=self.num_heads, batch_first=True)
+            nn.TransformerEncoderLayer(
+                d_model=self.embedding_dim, nhead=self.num_heads, batch_first=True
+            )
             for _ in range(self.num_layers)
-        ) 
+        )
         self.lm_head = nn.Linear(self.embedding_dim, self.vocab_size)
 
     def forward(self, x):
-        h = self.embed_layer(x) # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
+        h = self.embed_layer(
+            x
+        )  # [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
         # h = h.transpose(0, 1)  # 调整维度 [seq_len, batch_size, embedding_dim]
 
         for transformer_block in self.transformer_blocks:
             h = transformer_block(h)
-        
+
         # h = h.transpose(0, 1)  # 转回 [batch_size, seq_len, embedding_dim]
         logits = self.lm_head(h)
 
         return logits
 
+
 # 在 Python 的 typing 模块中，Union、Optional 和 List 用于类型注解，
 # 帮助开发者明确变量、函数参数和返回值的类型，提高代码的可读性和可靠性。
 
+
 def generate_text(
-    model: SimpleGPT2,
-    tokenizer: GPT2Tokenizer,
-    texts: List[str], 
-    max_gen_len: int = 50
+    model: SimpleGPT2, tokenizer: GPT2Tokenizer, texts: List[str], max_gen_len: int = 50
 ):
     model.eval()
     # 一个包含编码后文本的张量，形状为 (batch_size, sequence_length)
     input_ids = tokenizer.encode(texts, return_tensors="pt")
-    generated_ids = input_ids # shape: (1, 4)
+    generated_ids = input_ids  # shape: (1, 4)
 
     with torch.no_grad():
         for step in range(max_gen_len):
@@ -116,10 +125,14 @@ def generate_text(
             next_token_logits = outputs[:, -1, :]  # [batch_size, vocab_size]
             print(f"Next token logits shape: {next_token_logits.shape}")
             # 选取概率最高的标记
-            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # [batch_size, 1]
+            next_token_id = torch.argmax(
+                next_token_logits, dim=-1, keepdim=True
+            )  # [batch_size, 1]
             print(f"Next token id shape: {next_token_id.shape}")
             # 将新生成的标记添加到生成的序列中
-            generated_ids = torch.cat((generated_ids, next_token_id), dim=1)  # [batch_size, seq_len + 1]
+            generated_ids = torch.cat(
+                (generated_ids, next_token_id), dim=1
+            )  # [batch_size, seq_len + 1]
             print(f"Generated ids shape: {generated_ids.shape}")
             # 检查是否生成了结束标记
             if torch.all(next_token_id.squeeze(-1) == tokenizer.eos_token_id):
@@ -128,27 +141,29 @@ def generate_text(
 
     # 解码生成的标记序列
     generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
-    
+
     return generated_texts
-    
+
     return generated_text
 
+
 def test_model_gen(input_text: List[str]):
     model_config = ModelConfig()
     model = SimpleGPT2(model_config)
-    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     output_text = generate_text(model, tokenizer, input_text, max_gen_len=8)
     print(output_text)
-    
+
+
 if __name__ == "__main__":
-    # test_model_gen("Once upon a time") 
+    # test_model_gen("Once upon a time")
     # 创建模型和输入数据
     model = nn.Linear(128, 256).cuda()
     model.eval()
     input = torch.randn(4, 128).cuda()
 
     # 使用原始模型的推理时间
-    torch.cuda.synchronize() # 同步 CPU 和 GPU 计算 
+    torch.cuda.synchronize()  # 同步 CPU 和 GPU 计算
     start_time = time.time()
     output_ref = model(input)
     torch.cuda.synchronize()
@@ -192,4 +207,4 @@ def test_model_gen(input_text: List[str]):
   )
   (lm_head): Linear(in_features=768, out_features=50257, bias=True)
 )
-"""
\ No newline at end of file
+"""
diff --git a/tests/test_flashattentionv2.py b/tests/test_flashattentionv2.py
index 9af1950..51d8376 100644
--- a/tests/test_flashattentionv2.py
+++ b/tests/test_flashattentionv2.py
@@ -9,31 +9,34 @@
 def standard_attention(Q, K, V, sm_scale, mask=None):
     """
     标准的 PyTorch 实现的自注意力机制。
-    
+
     Args:
         Q (torch.Tensor): 查询张量，形状 (batch_size, num_heads, seq_length, head_dim)
         K (torch.Tensor): 键张量，形状 (batch_size, num_heads, seq_length, head_dim)
         V (torch.Tensor): 值张量，形状 (batch_size, num_heads, seq_length, head_dim)
         sm_scale (float): Softmax 缩放因子
         mask (torch.Tensor, optional): 遮罩张量，形状 (batch_size, num_heads, seq_length, seq_length)
-    
+
     Returns:
         torch.Tensor: 注意力输出，形状与 Q 相同
     """
     # 计算 QK^T
-    attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * sm_scale  # (batch_size, num_heads, seq_length, seq_length)
-    
+    attn_scores = (
+        torch.matmul(Q, K.transpose(-2, -1)) * sm_scale
+    )  # (batch_size, num_heads, seq_length, seq_length)
+
     if mask is not None:
-        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
-    
+        attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))
+
     # print("attn_scores", attn_scores)
     attn_weights = F.softmax(attn_scores, dim=-1)
-    
+
     # 计算注意力输出
     out = torch.matmul(attn_weights, V)  # (batch_size, num_heads, seq_length, head_dim)
-    
+
     return out
 
+
 def test_prefill_stage():
     # 设置测试参数
     batch_size = 2
@@ -43,9 +46,15 @@ def test_prefill_stage():
 
     # 生成固定的输入张量（使用固定随机种子以确保可重复性）
     torch.manual_seed(0)
-    q = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
-    k = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
-    v = torch.randn(batch_size, num_heads, seq_length, head_dim, device='cuda', dtype=torch.float32)
+    q = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
+    k = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
+    v = torch.randn(
+        batch_size, num_heads, seq_length, head_dim, device="cuda", dtype=torch.float32
+    )
 
     # 计算 Softmax 缩放因子
     sm_scale = 1.0 / math.sqrt(head_dim)  # 1 / sqrt(d_k)
@@ -54,16 +63,24 @@ def test_prefill_stage():
     out = flash_attention_v2(q, k, v)
 
     # 使用标准 PyTorch 实现计算注意力输出 # 创建下三角矩阵
-    mask = torch.tril(torch.ones((seq_length, seq_length))).unsqueeze(0).unsqueeze(0).type_as(q)  # (1, 1, seq, seq)
+    mask = (
+        torch.tril(torch.ones((seq_length, seq_length)))
+        .unsqueeze(0)
+        .unsqueeze(0)
+        .type_as(q)
+    )  # (1, 1, seq, seq)
     standard_o = standard_attention(q, k, v, sm_scale, mask)
 
     # 比较 Triton 内核输出与标准实现的输出
     if torch.allclose(out, standard_o, atol=1e-2):
-        print("Prefill Stage Test Passed: Triton output matches PyTorch standard implementation.")
+        print(
+            "Prefill Stage Test Passed: Triton output matches PyTorch standard implementation."
+        )
     else:
         max_diff = (out - standard_o).abs().max()
         print(f"Prefill Stage Test Failed: Maximum difference {max_diff}")
 
+
 def test_decode_stage():
     # 设置测试参数
     batch_size = 1
@@ -74,11 +91,34 @@ def test_decode_stage():
 
     # 生成固定的初始输入张量
     torch.manual_seed(0)
-    q_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    k_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    v_initial = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype=torch.float32)
-    o_initial = torch.zeros_like(q_initial, device='cuda', dtype=torch.float32)
-    new_token_q = torch.randn(batch_size, num_heads, 1, head_dim, device='cuda', dtype=torch.float32)
+    q_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    k_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    v_initial = torch.randn(
+        batch_size,
+        num_heads,
+        initial_seq_length,
+        head_dim,
+        device="cuda",
+        dtype=torch.float32,
+    )
+    o_initial = torch.zeros_like(q_initial, device="cuda", dtype=torch.float32)
+    new_token_q = torch.randn(
+        batch_size, num_heads, 1, head_dim, device="cuda", dtype=torch.float32
+    )
 
     triton_k_extended = k_initial
     triton_v_extended = v_initial
@@ -86,13 +126,13 @@ def test_decode_stage():
     torch_v_extended = v_initial
     torch_new_token_q = new_token_q
     triton_new_token_q = new_token_q
-    
+
     # 模拟生成过程中逐步增加序列长度
     for step in range(1, generated_seq_length + 1):
         # 生成新的 token
         triton_k_extended = torch.cat([triton_k_extended, triton_new_token_q], dim=2)
         triton_v_extended = torch.cat([triton_v_extended, triton_new_token_q], dim=2)
-        
+
         torch_k_extended = torch.cat([torch_k_extended, torch_new_token_q], dim=2)
         torch_v_extended = torch.cat([torch_v_extended, torch_new_token_q], dim=2)
 
@@ -103,21 +143,30 @@ def test_decode_stage():
         sm_scale_extended = 1.0 / math.sqrt(head_dim)
 
         # 计算 Triton 内核输出
-        triton_new_token_q = flash_attention_v2(new_token_q, triton_k_extended, triton_v_extended)
+        triton_new_token_q = flash_attention_v2(
+            new_token_q, triton_k_extended, triton_v_extended
+        )
 
         # 使用标准 PyTorch 实现计算扩展后的注意力输出
-        torch_new_token_q = standard_attention(new_token_q, torch_k_extended, torch_v_extended, sm_scale_extended)
+        torch_new_token_q = standard_attention(
+            new_token_q, torch_k_extended, torch_v_extended, sm_scale_extended
+        )
 
         # 比较 Triton 内核输出与标准实现的输出
         if torch.allclose(triton_new_token_q, torch_new_token_q, atol=1e-1):
             max_difference = (triton_new_token_q - torch_new_token_q).abs().max()
-            print(f"Decode Stage Step {step} Difference {max_difference}. Test Passed: Triton output matches PyTorch standard implementation.")
+            print(
+                f"Decode Stage Step {step} Difference {max_difference}. Test Passed: Triton output matches PyTorch standard implementation."
+            )
         else:
             max_diff = (triton_new_token_q - torch_new_token_q).abs().max()
-            print(f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}")
+            print(
+                f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}"
+            )
             # 可选择打印更多信息进行调试
             break  # 根据需要是否停止测试
 
+
 if __name__ == "__main__":
     print("Running Prefill Stage Test...")
     test_prefill_stage()
@@ -145,4 +194,4 @@ def test_decode_stage():
 Decode Stage Step 14 Test Passed: Triton output matches PyTorch standard implementation.
 Decode Stage Step 15 Test Passed: Triton output matches PyTorch standard implementation.
 Decode Stage Step 16 Test Passed: Triton output matches PyTorch standard implementation.
-"""
\ No newline at end of file
+"""
diff --git a/tests/test_flashdecoding.py b/tests/test_flashdecoding.py
index 89b5cd0..5dff91a 100644
--- a/tests/test_flashdecoding.py
+++ b/tests/test_flashdecoding.py
@@ -7,32 +7,39 @@
 
 
 def standard_attention(Q, K, V, sm_scale, mask=None):
-	"""
-	标准的 PyTorch 实现的自注意力机制。
+    """
+    标准的 PyTorch 实现的自注意力机制。
 
-	Args:
-		Q (torch.Tensor): 查询张量，形状 (batch_size, num_heads, seq_length, head_dim)
-		K (torch.Tensor): 键张量，形状 (batch_size, num_heads, seq_length, head_dim)
-		V (torch.Tensor): 值张量，形状 (batch_size, num_heads, seq_length, head_dim)
-		sm_scale (float): Softmax 缩放因子
-		mask (torch.Tensor, optional): 遮罩张量，形状 (batch_size, num_heads, seq_length, seq_length)
+    Args:
+            Q (torch.Tensor): 查询张量，形状 (batch_size, num_heads, seq_length, head_dim)
+            K (torch.Tensor): 键张量，形状 (batch_size, num_heads, seq_length, head_dim)
+            V (torch.Tensor): 值张量，形状 (batch_size, num_heads, seq_length, head_dim)
+            sm_scale (float): Softmax 缩放因子
+            mask (torch.Tensor, optional): 遮罩张量，形状 (batch_size, num_heads, seq_length, seq_length)
 
-	Returns:
-		torch.Tensor: 注意力输出，形状与 Q 相同
-	"""
-	print(f"K V cache tensor have 0 numbers is ", torch.nonzero(K==0).numel(), torch.nonzero(V==0).numel())
-	# 计算 QK^T
-	attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * sm_scale  # (batch_size, num_heads, seq_length, seq_length)
+    Returns:
+            torch.Tensor: 注意力输出，形状与 Q 相同
+    """
+    print(
+        f"K V cache tensor have 0 numbers is ",
+        torch.nonzero(K == 0).numel(),
+        torch.nonzero(V == 0).numel(),
+    )
+    # 计算 QK^T
+    attn_scores = (
+        torch.matmul(Q, K.transpose(-2, -1)) * sm_scale
+    )  # (batch_size, num_heads, seq_length, seq_length)
 
-	if mask is not None:
-		attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
+    if mask is not None:
+        attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))
 
-	attn_weights = F.softmax(attn_scores, dim=-1)
+    attn_weights = F.softmax(attn_scores, dim=-1)
 
-	# 计算注意力输出
-	out = torch.matmul(attn_weights, V)  # (batch_size, num_heads, seq_length, head_dim)
+    # 计算注意力输出
+    out = torch.matmul(attn_weights, V)  # (batch_size, num_heads, seq_length, head_dim)
+
+    return out
 
-	return out
 
 def test_decode_stage(debug_out_text):
     # 设置测试参数
@@ -48,22 +55,36 @@ def test_decode_stage(debug_out_text):
     torch.manual_seed(0)
 
     # torch_q = torch.randn(batch_size, num_heads, initial_seq_length, head_dim, device='cuda', dtype = dtype)
-    torch_k_cache = torch.randn(batch_size, num_heads, kv_cache_seq_length, head_dim, device='cuda', dtype = dtype)
-    torch_v_cache = torch.randn(batch_size, num_heads, kv_cache_seq_length, head_dim, device='cuda', dtype = dtype)
+    torch_k_cache = torch.randn(
+        batch_size, num_heads, kv_cache_seq_length, head_dim, device="cuda", dtype=dtype
+    )
+    torch_v_cache = torch.randn(
+        batch_size, num_heads, kv_cache_seq_length, head_dim, device="cuda", dtype=dtype
+    )
 
     # triton_q = torch_q.transpose(1, 2).view(-1, num_heads, head_dim)
     triton_k_cache = torch_k_cache.transpose(1, 2).reshape(-1, num_heads, head_dim)
     triton_v_cache = torch_v_cache.transpose(1, 2).reshape(-1, num_heads, head_dim)
     print(f"triton_k_cache shape is ", triton_k_cache.shape)
 
-    torch_new_token_q = torch.randn(batch_size, num_heads, 1, head_dim, device='cuda', dtype = dtype)
-    triton_new_token_q = torch_new_token_q.transpose(1, 2).reshape(-1, num_heads, head_dim)
+    torch_new_token_q = torch.randn(
+        batch_size, num_heads, 1, head_dim, device="cuda", dtype=dtype
+    )
+    triton_new_token_q = torch_new_token_q.transpose(1, 2).reshape(
+        -1, num_heads, head_dim
+    )
     print(f"triton_new_token_q shape is ", triton_new_token_q.shape)
 
     # 初始化线性层，用于生成 Q、K、V. 为了测试，这里使用随机的线性层参数
-    q_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to('cuda', dtype=dtype)
-    k_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to('cuda', dtype=dtype)
-    v_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to('cuda', dtype=dtype)
+    q_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to(
+        "cuda", dtype=dtype
+    )
+    k_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to(
+        "cuda", dtype=dtype
+    )
+    v_linear = torch.nn.Linear(head_dim, num_heads * head_dim, bias=False).to(
+        "cuda", dtype=dtype
+    )
 
     # 模拟生成过程中逐步增加序列长度
     for step in range(1, generated_seq_length + 1):
@@ -74,22 +95,31 @@ def test_decode_stage(debug_out_text):
         sm_scale_extended = 1.0 / math.sqrt(head_dim)
 
         # 计算 Triton 内核输出
-        
-        triton_new_token_q = flash_decoding(triton_new_token_q, triton_k_cache, triton_v_cache, actual_seq_len=kv_cache_seq_length)
+
+        triton_new_token_q = flash_decoding(
+            triton_new_token_q,
+            triton_k_cache,
+            triton_v_cache,
+            actual_seq_len=kv_cache_seq_length,
+        )
 
         # 使用标准 PyTorch 实现计算扩展后的注意力输出
-        torch_new_token_q = standard_attention(torch_new_token_q, torch_k_cache, torch_v_cache, sm_scale_extended)
+        torch_new_token_q = standard_attention(
+            torch_new_token_q, torch_k_cache, torch_v_cache, sm_scale_extended
+        )
 
         # 生成新的 token
         triton_k_cache = torch.cat([triton_k_cache, triton_new_token_q], dim=0)
         triton_v_cache = torch.cat([triton_v_cache, triton_new_token_q], dim=0)
-        
+
         torch_k_cache = torch.cat([torch_k_cache, torch_new_token_q], dim=2)
         torch_v_cache = torch.cat([torch_v_cache, torch_new_token_q], dim=2)
         kv_cache_seq_length += 1
 
-        torch_new_token_q_format = torch_new_token_q.transpose(1, 2).contiguous().view(-1, num_heads, head_dim)
-        
+        torch_new_token_q_format = (
+            torch_new_token_q.transpose(1, 2).contiguous().view(-1, num_heads, head_dim)
+        )
+
         debug_out_text1 = debug_out_text.format(step=step, kernel_type="torch")
         debug_out_text2 = debug_out_text.format(step=step, kernel_type="triton")
         with open(debug_out_text1, "w") as f:
@@ -101,14 +131,21 @@ def test_decode_stage(debug_out_text):
         # 比较 Triton 内核输出与标准实现的输出
         if torch.allclose(triton_new_token_q, torch_new_token_q_format, atol=1e-1):
             max_difference = (triton_new_token_q - torch_new_token_q_format).abs().max()
-            print(f"Decode Stage Step {step} Difference {max_difference} Test Passed: Triton output matches PyTorch standard implementation.")
+            print(
+                f"Decode Stage Step {step} Difference {max_difference} Test Passed: Triton output matches PyTorch standard implementation."
+            )
         else:
             max_diff = (triton_new_token_q - torch_new_token_q_format).abs().max()
-            print(f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}")
+            print(
+                f"Decode Stage Step {step} Test Failed: Maximum difference {max_diff}"
+            )
             # 可选择打印更多信息进行调试
             break  # 根据需要是否停止测试
 
+
 if __name__ == "__main__":
-    debug_out_text = "/gemini/code/lite_llama/test/debug/{step}_{kernel_type}_decode_out_tensor.txt"
+    debug_out_text = (
+        "/gemini/code/lite_llama/test/debug/{step}_{kernel_type}_decode_out_tensor.txt"
+    )
     print("\nRunning Decode Stage Test...")
-    test_decode_stage(debug_out_text)
\ No newline at end of file
+    test_decode_stage(debug_out_text)
diff --git a/tests/test_flashdecoding_stage1.py b/tests/test_flashdecoding_stage1.py
index c60a926..44cfb88 100644
--- a/tests/test_flashdecoding_stage1.py
+++ b/tests/test_flashdecoding_stage1.py
@@ -6,20 +6,32 @@
 import triton
 import triton.language as tl
 
+
 @triton.jit
 def _flash_decoding_stage1_kernel(
-    Q, K, V, sm_scale,
-
+    Q,
+    K,
+    V,
+    sm_scale,
     actual_seq_len,  # 实际序列长度
-    Mid_O, Mid_O_LogExpSum,
-
-    q_bs_stride, q_heads_stride, q_dim_stride,  # Q 的 strides
-    k_bs_stride, k_heads_stride, k_dim_stride,  # K 的 strides
-    v_bs_stride, v_heads_stride, v_dim_stride,  # V 的 strides
-
-    mido_batch_stride, mido_heads_stride, mido_partitions_stride, mido_dim_stride,
-    mido_les_batch_stride, mido_les_heads_stride, mido_les_partitions_stride,
-
+    Mid_O,
+    Mid_O_LogExpSum,
+    q_bs_stride,
+    q_heads_stride,
+    q_dim_stride,  # Q 的 strides
+    k_bs_stride,
+    k_heads_stride,
+    k_dim_stride,  # K 的 strides
+    v_bs_stride,
+    v_heads_stride,
+    v_dim_stride,  # V 的 strides
+    mido_batch_stride,
+    mido_heads_stride,
+    mido_partitions_stride,
+    mido_dim_stride,
+    mido_les_batch_stride,
+    mido_les_heads_stride,
+    mido_les_partitions_stride,
     BLOCK_SEQ: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
@@ -35,21 +47,21 @@ def _flash_decoding_stage1_kernel(
 
     # 计算当前分区的起始和结束索引
     cur_batch_partition_start_index = seq_block_idx * BLOCK_SEQ
-    cur_batch_partition_end_index = tl.minimum(actual_seq_len, cur_batch_partition_start_index + BLOCK_SEQ)
+    cur_batch_partition_end_index = tl.minimum(
+        actual_seq_len, cur_batch_partition_start_index + BLOCK_SEQ
+    )
 
     # 计算需要处理的块数
-    num_blocks = (cur_batch_partition_end_index - cur_batch_partition_start_index + BLOCK_N - 1) // BLOCK_N
+    num_blocks = (
+        cur_batch_partition_end_index - cur_batch_partition_start_index + BLOCK_N - 1
+    ) // BLOCK_N
 
     # 初始化偏移向量
     offs_n = cur_batch_partition_start_index + tl.arange(0, BLOCK_N)  # [BLOCK_N]
     offs_d = tl.arange(0, BLOCK_DMODEL)  # [BLOCK_DMODEL]
 
     # 计算 Q 的偏移量
-    q_offs = (
-        batch_idx * q_bs_stride
-        + head_idx * q_heads_stride
-        + offs_d * q_dim_stride
-    )
+    q_offs = batch_idx * q_bs_stride + head_idx * q_heads_stride + offs_d * q_dim_stride
 
     # 计算 K 和 V 的偏移量
     k_offs = (
@@ -145,36 +157,45 @@ def _flash_decoding_stage1_kernel(
 
 @torch.no_grad()
 def flash_decode_stage1(
-    q, k, v,         # Q: [batchs, num_heads, head_dim], K, V: [batchs * seq_len, num_heads, head_dim]
+    q,
+    k,
+    v,  # Q: [batchs, num_heads, head_dim], K, V: [batchs * seq_len, num_heads, head_dim]
     actual_seq_len,  # 实际的序列长度
-    mid_o, mid_o_logexpsum, # Mid_O: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE), head_dim], Mid_O_LogExpSum: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE)]
+    mid_o,
+    mid_o_logexpsum,  # Mid_O: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE), head_dim], Mid_O_LogExpSum: [batchs, num_heads, cdiv(seq_len, PARTITION_SIZE)]
     PARTITION_SIZE,
 ):
     BLOCK_N_SIZE = 32
     BLOCK_DMODEL = q.shape[-1]
-    assert PARTITION_SIZE % BLOCK_N_SIZE == 0, "PARTITION_SIZE 必须是 BLOCK_N_SIZE 的倍数"
+    assert PARTITION_SIZE % BLOCK_N_SIZE == 0, (
+        "PARTITION_SIZE 必须是 BLOCK_N_SIZE 的倍数"
+    )
 
     batchs, num_heads, head_dim = q.shape
-    sm_scale = 1.0 / (head_dim ** 0.5)
+    sm_scale = 1.0 / (head_dim**0.5)
     grid = (batchs, num_heads, triton.cdiv(actual_seq_len, PARTITION_SIZE))
 
     _flash_decoding_stage1_kernel[grid](
-        q, k, v, sm_scale,
+        q,
+        k,
+        v,
+        sm_scale,
         actual_seq_len,  # 使用实际序列长度
-        mid_o, mid_o_logexpsum,
+        mid_o,
+        mid_o_logexpsum,
         *q.stride(),
         *k.stride(),
         *v.stride(),
         *mid_o.stride(),
         *mid_o_logexpsum.stride(),
-
-        BLOCK_SEQ = PARTITION_SIZE,
-        BLOCK_N = BLOCK_N_SIZE,
-        BLOCK_DMODEL = head_dim,
-        num_warps = 1,
-        num_stages = 2,
+        BLOCK_SEQ=PARTITION_SIZE,
+        BLOCK_N=BLOCK_N_SIZE,
+        BLOCK_DMODEL=head_dim,
+        num_warps=1,
+        num_stages=2,
     )
 
+
 import torch
 
 # 设置随机种子以确保可重复性
@@ -185,24 +206,42 @@ def flash_decode_stage1(
 partition_size = 32
 
 # 随机初始化 Q, K, V
-q = torch.randn(batchs, num_heads, head_dim, device='cuda', dtype=torch.float32)
-k = torch.randn(batchs * seq_len, num_heads, head_dim, device='cuda', dtype=torch.float32)
-v = torch.randn(batchs * seq_len, num_heads, head_dim, device='cuda', dtype=torch.float32)
+q = torch.randn(batchs, num_heads, head_dim, device="cuda", dtype=torch.float32)
+k = torch.randn(
+    batchs * seq_len, num_heads, head_dim, device="cuda", dtype=torch.float32
+)
+v = torch.randn(
+    batchs * seq_len, num_heads, head_dim, device="cuda", dtype=torch.float32
+)
 
 # 初始化 mid_o 和 mid_o_logexpsum
-mid_o = torch.zeros(batchs, num_heads, (seq_len + partition_size -1) // partition_size, head_dim, device='cuda', dtype=torch.float32)
-mid_o_logexpsum = torch.zeros(batchs, num_heads, (seq_len + partition_size -1) // partition_size, device='cuda', dtype=torch.float32)
+mid_o = torch.zeros(
+    batchs,
+    num_heads,
+    (seq_len + partition_size - 1) // partition_size,
+    head_dim,
+    device="cuda",
+    dtype=torch.float32,
+)
+mid_o_logexpsum = torch.zeros(
+    batchs,
+    num_heads,
+    (seq_len + partition_size - 1) // partition_size,
+    device="cuda",
+    dtype=torch.float32,
+)
 
 # 调用修复后的函数
 flash_decode_stage1(
-    q, k, v,
+    q,
+    k,
+    v,
     actual_seq_len=seq_len,
-    mid_o=mid_o, 
-    mid_o_logexpsum=mid_o_logexpsum, 
+    mid_o=mid_o,
+    mid_o_logexpsum=mid_o_logexpsum,
     PARTITION_SIZE=partition_size,
 )
 
 # 打印输出结果
 print("Mid_O:", mid_o)
 print("Mid_O_LogExpSum:", mid_o_logexpsum)
-
diff --git a/tests/test_flashdecoding_stage2.py b/tests/test_flashdecoding_stage2.py
index 667ef13..04ed43d 100644
--- a/tests/test_flashdecoding_stage2.py
+++ b/tests/test_flashdecoding_stage2.py
@@ -1,158 +1,193 @@
-import torch,math
+import torch, math
 import triton
 import triton.language as tl
 from torch.cuda.amp import custom_fwd
 from typing import List, Optional, Union
 import torch.nn.functional as F
 
+
 @triton.jit
 def _flash_decoding_stage2_kernel(
-	Mid_O,  		# [batch, head, seq_block_num, head_dim]
-	Mid_O_LogExpSum,  # [batch, head, seq_block_num]
-	Ouput,          # attention 输出首地址
-	mido_batch_stride, mido_heads_stride, mido_partitions_stride, mido_dim_stride,
-	mido_les_batch_stride, mido_les_heads_stride, mido_les_partitions_stride,
-	o_bs_stride, o_heads_stride, o_dim_stride,
-	actual_seq_len,   # TODO 支持 PagedAttention 和连续批处理
-	BLOCK_DMODEL: tl.constexpr,
-	BLOCK_SEQ: tl.constexpr, # type: ignore
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    Ouput,  # attention 输出首地址
+    mido_batch_stride,
+    mido_heads_stride,
+    mido_partitions_stride,
+    mido_dim_stride,
+    mido_les_batch_stride,
+    mido_les_heads_stride,
+    mido_les_partitions_stride,
+    o_bs_stride,
+    o_heads_stride,
+    o_dim_stride,
+    actual_seq_len,  # TODO 支持 PagedAttention 和连续批处理
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_SEQ: tl.constexpr,  # type: ignore
 ):
-	"""Reduction (online softmax)
-	"""
-	batch_idx = tl.program_id(0)
-	head_idx = tl.program_id(1)
-	
-	# 初始化偏移 
-	offs_d = tl.arange(0, BLOCK_DMODEL)
-
-	offs_part_v = batch_idx * mido_batch_stride \
-				+ head_idx * mido_heads_stride \
-				+ offs_d * mido_dim_stride
-
-	offs_part_max = batch_idx * mido_les_batch_stride \
-				+ head_idx * mido_les_heads_stride
-	
-	part_v_ptrs = Mid_O + offs_part_v
-	part_max_ptrs = Mid_O_LogExpSum + offs_part_max
-
-	# Reduce kv 分块相关变量值. num_partitions 是 kv 分块数量
-	d_i = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
-	m_i = -float("inf")
-	acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
-	num_partitions = (actual_seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ
-
-	for _ in range(0, num_partitions, 1):
-		part_v = tl.load(part_v_ptrs)
-		part_max = tl.load(part_max_ptrs)
-
-		# -- 更新局部最大值 和 exp 分子项 p-- #
-		m_ij = tl.maximum(part_max, m_i)
-		p = tl.exp(part_v - m_ij)
-
-		# -- 计算 alpha = exp(m{j-1} - m{j}) 值 -- #
-		alpha = tl.exp(m_i - m_ij)
-
-		# -- 更新归一化项和 attention 输出累加器 -- #
-		d_i = d_i * alpha + p
-        
-		acc *= alpha
-		acc += p * part_v		
-	
-		# 更新 max 值和指针偏移
-		m_i = m_ij
-		part_v_ptrs += mido_partitions_stride
-		part_max_ptrs += mido_les_partitions_stride
-
-	# -- 更新 attention 输出累加器 -- #
-	offs_out = batch_idx * o_bs_stride + head_idx * o_heads_stride + offs_d * o_dim_stride
-	tl.store(Ouput + offs_out, acc / d_i)
+    """Reduction (online softmax)"""
+    batch_idx = tl.program_id(0)
+    head_idx = tl.program_id(1)
+
+    # 初始化偏移
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+
+    offs_part_v = (
+        batch_idx * mido_batch_stride
+        + head_idx * mido_heads_stride
+        + offs_d * mido_dim_stride
+    )
+
+    offs_part_max = batch_idx * mido_les_batch_stride + head_idx * mido_les_heads_stride
+
+    part_v_ptrs = Mid_O + offs_part_v
+    part_max_ptrs = Mid_O_LogExpSum + offs_part_max
+
+    # Reduce kv 分块相关变量值. num_partitions 是 kv 分块数量
+    d_i = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+    m_i = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+    num_partitions = (actual_seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+    for _ in range(0, num_partitions, 1):
+        part_v = tl.load(part_v_ptrs)
+        part_max = tl.load(part_max_ptrs)
+
+        # -- 更新局部最大值 和 exp 分子项 p-- #
+        m_ij = tl.maximum(part_max, m_i)
+        p = tl.exp(part_v - m_ij)
+
+        # -- 计算 alpha = exp(m{j-1} - m{j}) 值 -- #
+        alpha = tl.exp(m_i - m_ij)
+
+        # -- 更新归一化项和 attention 输出累加器 -- #
+        d_i = d_i * alpha + p
+
+        acc *= alpha
+        acc += p * part_v
+
+        # 更新 max 值和指针偏移
+        m_i = m_ij
+        part_v_ptrs += mido_partitions_stride
+        part_max_ptrs += mido_les_partitions_stride
+
+    # -- 更新 attention 输出累加器 -- #
+    offs_out = (
+        batch_idx * o_bs_stride + head_idx * o_heads_stride + offs_d * o_dim_stride
+    )
+    tl.store(Ouput + offs_out, acc / d_i)
+
 
 @torch.no_grad()
 def flash_decode_stage2(
-    mid_o, mid_o_logexpsum, # 存储每个批次、每个头、每个分区的中间分数输出及 log(sum(exp(scores)))
-	atten_output,           # attention 输出首地址
-	actual_seq_len,  	    # kv cache 在 seq_len 维度的最大长度
-    PARTITION_SIZE
-):	
+    mid_o,
+    mid_o_logexpsum,  # 存储每个批次、每个头、每个分区的中间分数输出及 log(sum(exp(scores)))
+    atten_output,  # attention 输出首地址
+    actual_seq_len,  # kv cache 在 seq_len 维度的最大长度
+    PARTITION_SIZE,
+):
     HEAD_DIM = mid_o.shape[-1]
-    
+
     batchs, num_heads = mid_o.shape[0], mid_o.shape[1]
     grid = (batchs, num_heads)
 
     _flash_decoding_stage2_kernel[grid](
-        mid_o,  	     # [batch, head, seq_block_num, head_dim]
-        mid_o_logexpsum, # [batch, head, seq_block_num]
-        atten_output,           # attention 输出首地址
+        mid_o,  # [batch, head, seq_block_num, head_dim]
+        mid_o_logexpsum,  # [batch, head, seq_block_num]
+        atten_output,  # attention 输出首地址
         *mid_o.stride(),
         *mid_o_logexpsum.stride(),
         *atten_output.stride(),
-        actual_seq_len,   # TODO 支持 PagedAttention 和连续批处理
-        BLOCK_DMODEL = HEAD_DIM,
-        BLOCK_SEQ = PARTITION_SIZE, # type: ignore	
-        num_warps = 4,
-        num_stages = 2,
+        actual_seq_len,  # TODO 支持 PagedAttention 和连续批处理
+        BLOCK_DMODEL=HEAD_DIM,
+        BLOCK_SEQ=PARTITION_SIZE,  # type: ignore
+        num_warps=4,
+        num_stages=2,
     )
 
+
 import torch
 
+
 # 定义 PyTorch 对照实现
 def pytorch_flash_decode_stage2(mid_o, mid_o_logexpsum, actual_seq_len, partition_size):
     batchs, num_heads, seq_block_num, head_dim = mid_o.shape
-    atten_output_pt = torch.zeros(batchs, num_heads, head_dim, device='cuda', dtype=torch.float32)
-    
+    atten_output_pt = torch.zeros(
+        batchs, num_heads, head_dim, device="cuda", dtype=torch.float32
+    )
+
     for batch in range(batchs):
         for head in range(num_heads):
-            d_i = torch.zeros(head_dim, device='cuda', dtype=torch.float32)
-            m_i = torch.full((head_dim,), -float("inf"), device='cuda', dtype=torch.float32)  # 初始化为 [head_dim]
-            acc = torch.zeros(head_dim, device='cuda', dtype=torch.float32)
+            d_i = torch.zeros(head_dim, device="cuda", dtype=torch.float32)
+            m_i = torch.full(
+                (head_dim,), -float("inf"), device="cuda", dtype=torch.float32
+            )  # 初始化为 [head_dim]
+            acc = torch.zeros(head_dim, device="cuda", dtype=torch.float32)
             for partition in range(seq_block_num):
                 part_v = mid_o[batch, head, partition]  # [head_dim]
                 part_max = mid_o_logexpsum[batch, head, partition].item()  # scalar
 
                 # Broadcast part_max to [head_dim] for comparison
-                part_max_tensor = torch.full((head_dim,), part_max, device='cuda', dtype=torch.float32)
+                part_max_tensor = torch.full(
+                    (head_dim,), part_max, device="cuda", dtype=torch.float32
+                )
                 m_ij = torch.maximum(part_max_tensor, m_i)  # [head_dim]
                 p = torch.exp(part_v - m_ij)  # [head_dim]
 
                 alpha = torch.exp(m_i - m_ij)  # [head_dim]
 
-                d_i = d_i * alpha + p        # [head_dim]
+                d_i = d_i * alpha + p  # [head_dim]
                 acc = acc * alpha + p * part_v  # [head_dim]
 
-                m_i = m_ij                    # [head_dim]
-            
+                m_i = m_ij  # [head_dim]
+
             # Avoid division by zero by setting zero where d_i is zero
             mask = d_i > 0
             atten_output_pt[batch, head][mask] = acc[mask] / d_i[mask]
             atten_output_pt[batch, head][~mask] = 0.0  # Handle division by zero
-    
+
     return atten_output_pt
 
+
 # 设置随机种子以确保可重复性
 torch.manual_seed(42)
 
 # 假设头维度为 64，批次为 2，头数为 4，分区数量为 4，实际序列长度为 128，分区大小为 32
-batchs, num_heads, seq_block_num, head_dim = 2, 4, 4, 64  # head_dim 必须等于 BLOCK_DMODEL_CONST
+batchs, num_heads, seq_block_num, head_dim = (
+    2,
+    4,
+    4,
+    64,
+)  # head_dim 必须等于 BLOCK_DMODEL_CONST
 actual_seq_len = 128
 partition_size = 32
 
 # 随机初始化 Mid_O 和 Mid_O_LogExpSum
-mid_o = torch.randn(batchs, num_heads, seq_block_num, head_dim, device='cuda', dtype=torch.float32)
-mid_o_logexpsum = torch.randn(batchs, num_heads, seq_block_num, device='cuda', dtype=torch.float32)
+mid_o = torch.randn(
+    batchs, num_heads, seq_block_num, head_dim, device="cuda", dtype=torch.float32
+)
+mid_o_logexpsum = torch.randn(
+    batchs, num_heads, seq_block_num, device="cuda", dtype=torch.float32
+)
 
 # 初始化 atten_output
-atten_output = torch.zeros(batchs, num_heads, head_dim, device='cuda', dtype=torch.float32)
+atten_output = torch.zeros(
+    batchs, num_heads, head_dim, device="cuda", dtype=torch.float32
+)
 
 # 调用修复后的 Triton 函数
 flash_decode_stage2(
-    mid_o, mid_o_logexpsum, 
-    atten_output, 
-    actual_seq_len=actual_seq_len, 
-    PARTITION_SIZE=partition_size
+    mid_o,
+    mid_o_logexpsum,
+    atten_output,
+    actual_seq_len=actual_seq_len,
+    PARTITION_SIZE=partition_size,
 )
 
 # 调用 PyTorch 实现
-pt_atten_output = pytorch_flash_decode_stage2(mid_o, mid_o_logexpsum, actual_seq_len, partition_size)
+pt_atten_output = pytorch_flash_decode_stage2(
+    mid_o, mid_o_logexpsum, actual_seq_len, partition_size
+)
 
 # 比较 Triton 和 PyTorch 的输出
 diff_atten_output = torch.abs(atten_output - pt_atten_output).max()
@@ -161,4 +196,3 @@ def pytorch_flash_decode_stage2(mid_o, mid_o_logexpsum, actual_seq_len, partitio
 # 断言差异在合理范围内
 assert diff_atten_output < 1e-3, "Atten_Output 的差异超出容忍范围"
 print("Triton 内核与 PyTorch 实现的数值对比通过。")
-
diff --git a/tests/test_get_model_name.py b/tests/test_get_model_name.py
index 64d95e6..932b929 100644
--- a/tests/test_get_model_name.py
+++ b/tests/test_get_model_name.py
@@ -1,15 +1,17 @@
 from transformers import LlavaConfig, AutoTokenizer
 
+
 def get_model_name_from_path(model_path):
     model_path = model_path.strip("/")
     model_paths = model_path.split("/")
-    if model_paths[-1].startswith('checkpoint-'):
+    if model_paths[-1].startswith("checkpoint-"):
         return model_paths[-2] + "_" + model_paths[-1]
     else:
         return model_paths[-1]
 
+
 if __name__ == "__main__":
     model_path = "/gemini/code/lite_llama/my_weight/llava-1.5-7b-hf"
     print(get_model_name_from_path(model_path))
     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
-    print(tokenizer)
\ No newline at end of file
+    print(tokenizer)
diff --git a/tests/test_gpt2.py b/tests/test_gpt2.py
index 407296f..baceb4d 100644
--- a/tests/test_gpt2.py
+++ b/tests/test_gpt2.py
@@ -1,7 +1,10 @@
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_p=0.9, device="cuda"):
+
+def generate_text(
+    model, tokenizer, prompt, max_length=50, temperature=1.0, top_p=0.9, device="cuda"
+):
     """
     使用 model.forward 实现逐步生成文本，并正确设置 attention_mask。
 
@@ -28,39 +31,43 @@ def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_
 
     for _ in range(max_length):
         # 调用模型的 forward 方法
-        outputs = model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
-        
+        outputs = model(
+            input_ids=input_ids, past_key_values=past_key_values, use_cache=True
+        )
+
         # 获取 logits，并仅关注最后一个 token 的 logits
         logits = outputs.logits  # [1, 1, V]
         next_token_logits = logits[:, -1, :] / temperature  # [1, V]
-        
+
         # 应用 top-p 过滤
-        sorted_logits, sorted_indices = torch.sort(next_token_logits, dim=-1, descending=True)
+        sorted_logits, sorted_indices = torch.sort(
+            next_token_logits, dim=-1, descending=True
+        )
         cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
-        
+
         # 创建 mask
         sorted_indices_to_remove = cumulative_probs > top_p
         # Shift the mask to include the first token exceeding p
         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
         sorted_indices_to_remove[..., 0] = False
-        
+
         # 应用 mask
-        sorted_logits[sorted_indices_to_remove] = -float('Inf')
+        sorted_logits[sorted_indices_to_remove] = -float("Inf")
         # 应用 softmax
         probs = torch.softmax(sorted_logits, dim=-1)
-        
+
         # 采样下一个 token
         next_token = torch.multinomial(probs, num_samples=1)  # [1, 1]
-        
+
         # 反向排序索引以获取原始 token ID
         next_token = sorted_indices.gather(-1, next_token)
 
         # 将生成的 token 添加到生成的 Token 列表中
         generated_ids = torch.cat([generated_ids, next_token], dim=-1)
-        
+
         # 更新 input_ids 为新生成的 token
         input_ids = next_token
-        
+
         # 更新 past_key_values
         past_key_values = outputs.past_key_values
 
@@ -68,20 +75,29 @@ def generate_text(model, tokenizer, prompt, max_length=50, temperature=1.0, top_
     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
     return generated_text
 
+
 if __name__ == "__main__":
     # 使用标准的 GPT-2 模型名称，确保模型和 tokenizer 匹配
     model_name = "/gemini/code/llm_weights/gpt2"  # 修改为您的模型路径或名称
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForCausalLM.from_pretrained(model_name)
-    
+
     # 将模型移动到 GPU（如果可用）并设置为评估模式
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     model.to(device)
     model.eval()
-    
+
     # 定义 prompt
     prompt = "Once upon a time in a distant land,"
-    
+
     # 生成文本
-    generated = generate_text(model, tokenizer, prompt, max_length=500, temperature=1.0, top_p=0.9, device=device)
-    print(generated)
\ No newline at end of file
+    generated = generate_text(
+        model,
+        tokenizer,
+        prompt,
+        max_length=500,
+        temperature=1.0,
+        top_p=0.9,
+        device=device,
+    )
+    print(generated)
diff --git a/tests/test_image_process.py b/tests/test_image_process.py
index e603048..7e01fb1 100644
--- a/tests/test_image_process.py
+++ b/tests/test_image_process.py
@@ -1,18 +1,20 @@
-import os,sys
+import os, sys
 from PIL import Image
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.lite_llama.utils.image_process import vis_images
 
+
 def test_vis_images(image_files):
     print("=" * 50)
     print("Input Image:")
     vis_images(image_files)
 
+
 if __name__ == "__main__":
     image_files = [
         "/gemini/code/lite_llama/images/pexels-christian-heitz-285904-842711.jpg",
         "/gemini/code/lite_llama/images/pexels-francesco-ungaro-1525041.jpg",
     ]
-    test_vis_images(image_files)
\ No newline at end of file
+    test_vis_images(image_files)
diff --git a/tests/test_image_token.py b/tests/test_image_token.py
index 8b3da9b..f15f028 100644
--- a/tests/test_image_token.py
+++ b/tests/test_image_token.py
@@ -5,56 +5,60 @@
 # 假设 IMAGE_TOKEN_INDEX 为 1000
 IMAGE_TOKEN_INDEX = 32000
 
+
 class MockTokenizer:
     def __init__(self, bos_token_id=101, eos_token_id=102):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-    
+
     def __call__(self, text):
         # 简单模拟分词器，将每个字符转换为其 ASCII 值
         # 并在句首添加 BOS token，如果需要
         input_ids = []
-        if text.startswith('<BOS>'):
+        if text.startswith("<BOS>"):
             input_ids.append(self.bos_token_id)
             text = text[5:]
         for char in text:
             input_ids.append(ord(char))
         return MockEncoding(input_ids)
-    
+
+
 class MockEncoding:
     def __init__(self, input_ids):
         self.input_ids = input_ids
 
+
 def tokenizer_image_token(
-    prompt, 
-    tokenizer, 
-    image_token_index=IMAGE_TOKEN_INDEX, 
-    return_tensors=None
+    prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
 ):
     """
     处理包含特殊标记 <image> 的文本提示, 将其转换为相应的 token 序列，并在 <image> 位置插入指定的图像 token 索引。
-    
+
     参数:
         prompt (str): 包含 <image> 标记的文本。
         tokenizer: 分词器对象，需支持调用 tokenizer(chunk).input_ids。
         image_token_index (int): 用于替换 <image> 标记的图像 token 索引。
         return_tensors (str, optional): 指定返回的张量类型，例如 'pt' 表示 PyTorch 张量。
-    
+
     返回:
         list 或 torch.Tensor: 生成的 token 序列。
     """
     # 使用正则表达式分割，移除 '<image>' 前的空格，但保留后的空格
-    prompt_chunks = re.split(r'\s?<image>', prompt)
+    prompt_chunks = re.split(r"\s?<image>", prompt)
     # 不过滤空片段，以处理多个连续的 '<image>' 标记
     token_chunks = [tokenizer(chunk).input_ids for chunk in prompt_chunks]
-    
+
     input_ids = []
     offset = 0
     # 检查第一个片段是否以 BOS token 开始
-    if len(token_chunks) > 0 and len(token_chunks[0]) > 0 and token_chunks[0][0] == tokenizer.bos_token_id:
+    if (
+        len(token_chunks) > 0
+        and len(token_chunks[0]) > 0
+        and token_chunks[0][0] == tokenizer.bos_token_id
+    ):
         offset = 1
         input_ids.append(token_chunks[0][0])
-    
+
     # 插入图像 token
     for i, chunk in enumerate(token_chunks):
         # 添加当前片段的 token，跳过 BOS token（如果已添加）
@@ -63,30 +67,41 @@ def tokenizer_image_token(
         # 如果不是最后一个片段，插入图像 token
         if i < len(token_chunks) - 1:
             input_ids.append(image_token_index)
-    
+
     if return_tensors is not None:
-        if return_tensors == 'pt':
+        if return_tensors == "pt":
             return torch.tensor(input_ids, dtype=torch.long)
-        raise ValueError(f'Unsupported tensor type: {return_tensors}')
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
     return input_ids
 
+
 class TestTokenizerImageToken(unittest.TestCase):
     def setUp(self):
         self.tokenizer = MockTokenizer()
-    
+
     def test_single_image(self):
         prompt = "Hello <image> world."
         # "Hello" -> [72, 101, 108, 108, 111]
         # " world." -> [32, 119, 111, 114, 108, 100, 46]
         # After insertion: [72,101,108,108,111,1000,32,119,111,114,108,100,46]
         expected_input_ids = [
-            ord('H'), ord('e'), ord('l'), ord('l'), ord('o'), 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('w'), ord('o'), ord('r'), ord('l'), ord('d'), ord('.')
+            ord("H"),
+            ord("e"),
+            ord("l"),
+            ord("l"),
+            ord("o"),
+            IMAGE_TOKEN_INDEX,
+            ord(" "),
+            ord("w"),
+            ord("o"),
+            ord("r"),
+            ord("l"),
+            ord("d"),
+            ord("."),
         ]
         result = tokenizer_image_token(prompt, self.tokenizer)
         self.assertEqual(result, expected_input_ids)
-    
+
     def test_multiple_images(self):
         prompt = "A cat <image> is sitting <image> on the mat."
         # "A cat" -> [65, 32, 99, 97, 116]
@@ -94,28 +109,78 @@ def test_multiple_images(self):
         # " on the mat." -> [32, 111, 110, 32, 116, 104, 101, 32, 109, 97, 116, 46]
         # After insertion: [65,32,99,97,116,1000,32,105,115,32,115,105,116,116,105,110,103,1000,32,111,110,32,116,104,101,32,109,97,116,46]
         expected_input_ids = [
-            ord('A'), ord(' '), ord('c'), ord('a'), ord('t'), 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('i'), ord('s'), ord(' '), ord('s'), ord('i'), ord('t'), ord('t'), ord('i'), ord('n'), ord('g'), 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('o'), ord('n'), ord(' '), ord('t'), ord('h'), ord('e'), ord(' '), ord('m'), ord('a'), ord('t'), ord('.')
+            ord("A"),
+            ord(" "),
+            ord("c"),
+            ord("a"),
+            ord("t"),
+            IMAGE_TOKEN_INDEX,
+            ord(" "),
+            ord("i"),
+            ord("s"),
+            ord(" "),
+            ord("s"),
+            ord("i"),
+            ord("t"),
+            ord("t"),
+            ord("i"),
+            ord("n"),
+            ord("g"),
+            IMAGE_TOKEN_INDEX,
+            ord(" "),
+            ord("o"),
+            ord("n"),
+            ord(" "),
+            ord("t"),
+            ord("h"),
+            ord("e"),
+            ord(" "),
+            ord("m"),
+            ord("a"),
+            ord("t"),
+            ord("."),
         ]
         result = tokenizer_image_token(prompt, self.tokenizer)
         self.assertEqual(result, expected_input_ids)
-    
+
     def test_no_image(self):
         prompt = "This is a text without images."
         # "This is a text without images." -> [84, 104, 105, 115, 32, 105, 115, 32, 97, 32, 116, 101, 120, 116, 32, 119, 105, 116, 104, 111, 117, 116, 32, 105, 109, 97, 103, 101, 115, 46]
         expected_input_ids = [
-            ord('T'), ord('h'), ord('i'), ord('s'), ord(' '),
-            ord('i'), ord('s'), ord(' '), ord('a'), ord(' '),
-            ord('t'), ord('e'), ord('x'), ord('t'), ord(' '),
-            ord('w'), ord('i'), ord('t'), ord('h'), ord('o'), ord('u'), ord('t'), ord(' '),
-            ord('i'), ord('m'), ord('a'), ord('g'), ord('e'), ord('s'), ord('.')
+            ord("T"),
+            ord("h"),
+            ord("i"),
+            ord("s"),
+            ord(" "),
+            ord("i"),
+            ord("s"),
+            ord(" "),
+            ord("a"),
+            ord(" "),
+            ord("t"),
+            ord("e"),
+            ord("x"),
+            ord("t"),
+            ord(" "),
+            ord("w"),
+            ord("i"),
+            ord("t"),
+            ord("h"),
+            ord("o"),
+            ord("u"),
+            ord("t"),
+            ord(" "),
+            ord("i"),
+            ord("m"),
+            ord("a"),
+            ord("g"),
+            ord("e"),
+            ord("s"),
+            ord("."),
         ]
         result = tokenizer_image_token(prompt, self.tokenizer)
         self.assertEqual(result, expected_input_ids)
-    
+
     def test_leading_bos_token(self):
         prompt = "<BOS>Start <image> end."
         # "<BOS>Start" -> [101, 83, 116, 97, 114, 116]
@@ -123,14 +188,22 @@ def test_leading_bos_token(self):
         # After insertion: [101,83,116,97,114,116,1000,32,101,110,100,46]
         expected_input_ids = [
             101,  # BOS token
-            ord('S'), ord('t'), ord('a'), ord('r'), ord('t'), 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('e'), ord('n'), ord('d'), ord('.')
+            ord("S"),
+            ord("t"),
+            ord("a"),
+            ord("r"),
+            ord("t"),
+            IMAGE_TOKEN_INDEX,
+            ord(" "),
+            ord("e"),
+            ord("n"),
+            ord("d"),
+            ord("."),
         ]
         print("expected_input_ids ", expected_input_ids)
         result = tokenizer_image_token(prompt, self.tokenizer)
         self.assertEqual(result, expected_input_ids)
-    
+
     def test_consecutive_images(self):
         prompt = "Image1 <image><image> Image2."
         # "Image1" -> [73, 109, 97, 103, 101, 49]
@@ -138,29 +211,55 @@ def test_consecutive_images(self):
         # " Image2." -> [32, 73, 109, 97, 103, 101, 50, 46]
         # After insertion: [73,109,97,103,101,49,1000,1000,32,73,109,97,103,101,50,46]
         expected_input_ids = [
-            ord('I'), ord('m'), ord('a'), ord('g'), ord('e'), ord('1'), 
-            IMAGE_TOKEN_INDEX, 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('I'), ord('m'), ord('a'), ord('g'), ord('e'), ord('2'), ord('.')
+            ord("I"),
+            ord("m"),
+            ord("a"),
+            ord("g"),
+            ord("e"),
+            ord("1"),
+            IMAGE_TOKEN_INDEX,
+            IMAGE_TOKEN_INDEX,
+            ord(" "),
+            ord("I"),
+            ord("m"),
+            ord("a"),
+            ord("g"),
+            ord("e"),
+            ord("2"),
+            ord("."),
         ]
         result = tokenizer_image_token(prompt, self.tokenizer)
         self.assertEqual(result, expected_input_ids)
-    
+
     def test_return_tensors_pt(self):
         prompt = "Hello <image> world."
         # [72,101,108,108,111,1000,32,119,111,114,108,100,46]
-        expected_tensor = torch.tensor([
-            ord('H'), ord('e'), ord('l'), ord('l'), ord('o'), 
-            IMAGE_TOKEN_INDEX, 
-            ord(' '), ord('w'), ord('o'), ord('r'), ord('l'), ord('d'), ord('.')
-        ], dtype=torch.long)
-        result = tokenizer_image_token(prompt, self.tokenizer, return_tensors='pt')
+        expected_tensor = torch.tensor(
+            [
+                ord("H"),
+                ord("e"),
+                ord("l"),
+                ord("l"),
+                ord("o"),
+                IMAGE_TOKEN_INDEX,
+                ord(" "),
+                ord("w"),
+                ord("o"),
+                ord("r"),
+                ord("l"),
+                ord("d"),
+                ord("."),
+            ],
+            dtype=torch.long,
+        )
+        result = tokenizer_image_token(prompt, self.tokenizer, return_tensors="pt")
         self.assertTrue(torch.equal(result, expected_tensor))
-    
+
     def test_return_tensors_unsupported(self):
         prompt = "Hello <image> world."
         with self.assertRaises(ValueError):
-            tokenizer_image_token(prompt, self.tokenizer, return_tensors='np')
+            tokenizer_image_token(prompt, self.tokenizer, return_tensors="np")
+
 
-if __name__ == '__main__':
-    unittest.main(argv=[''], exit=False)
\ No newline at end of file
+if __name__ == "__main__":
+    unittest.main(argv=[""], exit=False)
diff --git a/tests/test_llama_layer.py b/tests/test_llama_layer.py
index 9072b12..f6b5568 100644
--- a/tests/test_llama_layer.py
+++ b/tests/test_llama_layer.py
@@ -3,9 +3,17 @@
 import torch
 import pytest, os, sys
 from dataclasses import dataclass
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.kernels import fused_linear, rmsnorm
-from lite_llama.models.llama import ModelArgs, FusedAttention, FusedMLP, LlamaDecoderLayer, Llama
+from lite_llama.models.llama import (
+    ModelArgs,
+    FusedAttention,
+    FusedMLP,
+    LlamaDecoderLayer,
+    Llama,
+)
+
 
 @dataclass
 class TestArgs:
@@ -21,61 +29,84 @@ class TestArgs:
     use_scaled_rope: bool = True
     max_batch_size: int = 4
     max_seq_len: int = 64
-    device: str = 'cuda'
+    device: str = "cuda"
+
 
 @pytest.fixture(scope="module")
 def device():
     if torch.cuda.is_available():
-        return 'cuda'
+        return "cuda"
     else:
-        pytest.skip("CUDA is not available. Skipping tests that require CUDA.", allow_module_level=True)
+        pytest.skip(
+            "CUDA is not available. Skipping tests that require CUDA.",
+            allow_module_level=True,
+        )
+
 
 @pytest.fixture
 def model_args(device):
     return TestArgs(device=device)
 
+
 def test_rmsnorm(device):
     batch_size, seq_len, dim = 2, 64, 128
     x = torch.randn(batch_size, seq_len, dim, device=device)
     norm = torch.ones(batch_size * seq_len, device=device)
     eps = 1e-5
     output = rmsnorm(x.view(-1, dim), norm, eps)
-    expected = x.view(-1, dim) / (torch.sqrt(torch.mean(x.view(-1, dim) ** 2, dim=1, keepdim=True)) + eps)
-    assert torch.allclose(output, expected, atol=1e-4), "RMSNorm does not match expected output."
+    expected = x.view(-1, dim) / (
+        torch.sqrt(torch.mean(x.view(-1, dim) ** 2, dim=1, keepdim=True)) + eps
+    )
+    assert torch.allclose(output, expected, atol=1e-4), (
+        "RMSNorm does not match expected output."
+    )
+
 
 def test_fused_attention(model_args):
     batch_size, seq_len, dim = 2, 16, model_args.dim
     x = torch.randn(batch_size, seq_len, dim, device=model_args.device)
-    start_pos = 0    
+    start_pos = 0
     attention = FusedAttention(model_args).to(model_args.device)
     output = attention(x, start_pos)
     # Since FusedAttention uses Triton kernels, compare shapes
-    assert output.shape == (batch_size, seq_len, dim), "FusedAttention output shape mismatch."
+    assert output.shape == (batch_size, seq_len, dim), (
+        "FusedAttention output shape mismatch."
+    )
+
 
 def test_fused_mlp(model_args):
     batch_size, seq_len, dim = 2, 16, model_args.dim
     x = torch.randn(batch_size, seq_len, dim, device=model_args.device)
-    
+
     mlp = FusedMLP(model_args).to(model_args.device)
     output = mlp(x)
     # The output dimension should match the input dimension
     assert output.shape == (batch_size, seq_len, dim), "FusedMLP output shape mismatch."
 
+
 def test_llama_decoder_layer(model_args):
     batch_size, seq_len, dim = 2, 16, model_args.dim
     x = torch.randn(batch_size, seq_len, dim, device=model_args.device)
-    start_pos = 0    
+    start_pos = 0
     layer = LlamaDecoderLayer(model_args).to(model_args.device)
     output = layer(x, start_pos)
-    assert output.shape == (batch_size, seq_len, dim), "LlamaDecoderLayer output shape mismatch."
+    assert output.shape == (batch_size, seq_len, dim), (
+        "LlamaDecoderLayer output shape mismatch."
+    )
+
 
 def test_llama_model(model_args):
     batch_size, seq_len = 2, 16
-    tokens = torch.randint(0, model_args.vocab_size, (batch_size, seq_len), device=model_args.device)
-    
+    tokens = torch.randint(
+        0, model_args.vocab_size, (batch_size, seq_len), device=model_args.device
+    )
+
     model = Llama(model_args).to(model_args.device)
     output = model(tokens, start_pos=0)
-    assert output.shape == (batch_size, seq_len, model_args.vocab_size), "Llama model output shape mismatch."
+    assert output.shape == (batch_size, seq_len, model_args.vocab_size), (
+        "Llama model output shape mismatch."
+    )
+
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/test_load_weight.py b/tests/test_load_weight.py
index 5370cb3..af9f844 100644
--- a/tests/test_load_weight.py
+++ b/tests/test_load_weight.py
@@ -5,16 +5,18 @@
 from pathlib import Path
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.models.qwen2 import Qwen2Model, Qwen2Config
 
-def load_config_from_json(json_file_path: str, device: str="cuda") -> Qwen2Config:
+
+def load_config_from_json(json_file_path: str, device: str = "cuda") -> Qwen2Config:
     with open(json_file_path, "r") as f:
         config_dict = json.load(f)
-    
-    config = Qwen2Config(config_dict, max_seq_len = 2048, device=device)
+
+    config = Qwen2Config(config_dict, max_seq_len=2048, device=device)
     return config
 
+
 def load_original_llama(model_name_or_path: str, device: str = "cuda"):
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     model = AutoModelForCausalLM.from_pretrained(
@@ -27,7 +29,10 @@ def load_original_llama(model_name_or_path: str, device: str = "cuda"):
 
     return model, tokenizer, hf_sd
 
-def load_custom_llam(model_name_or_path: str, model_config: Qwen2Config, device: str = "cuda"):
+
+def load_custom_llam(
+    model_name_or_path: str, model_config: Qwen2Config, device: str = "cuda"
+):
     checkpoints = sorted(Path(model_name_or_path).glob("*.pth"))
     assert len(checkpoints) > 0, f"no checkpoint files found in {model_name_or_path}"
     ckpt_path = checkpoints[0]
@@ -42,65 +47,61 @@ def load_custom_llam(model_name_or_path: str, model_config: Qwen2Config, device:
 
     return model, new_sd
 
+
 def compare_model_weights(hf_sd, new_sd, model_config, rtol=1e-5, atol=1e-8):
     """
     比较两个模型权重字典的各个参数是否相等。
-    
+
     Args:
         hf_sd (dict): Hugging Face 模型的 state_dict。
         new_sd (dict): 自定义模型的 state_dict。
         rtol (float): 允许的相对误差。
         atol (float): 允许的绝对误差。
-    
+
     Returns:
         bool: 如果权重完全匹配，则返回 True, 否则返回 False。
     """
 
     all_match = True
-    
+
     # 检查键是否一致
     hf_keys = set(hf_sd.keys())
     new_keys = set(new_sd.keys())
-    
+
     if hf_keys != new_keys:
         print("键不一致！")
         print("Hugging Face 多出的键:", hf_keys - new_keys)
         print("自定义模型多出的键:", new_keys - hf_keys)
         # all_match = False
-    
+
     # 映射嵌入层  # 映射归一化层
     mapping = {
-        "model.norm.weight": "norm_weight", 
+        "model.norm.weight": "norm_weight",
         "model.embed_tokens.weight": "embed_tokens.weight",
         "lm_head.weight": "lm_head_weight",
     }
 
     # 映射层
     layers = {
-        'model.layers.{i}.self_attn.q_proj.weight': 'layers.{i}.self_attn.q_proj_weight',
-        'model.layers.{i}.self_attn.q_proj.bias': 'layers.{i}.self_attn.q_proj_bias',
-
-        'model.layers.{i}.self_attn.k_proj.weight': 'layers.{i}.self_attn.k_proj_weight',
-        'model.layers.{i}.self_attn.k_proj.bias': 'layers.{i}.self_attn.k_proj_bias',
-
-        'model.layers.{i}.self_attn.v_proj.weight': 'layers.{i}.self_attn.v_proj_weight',
-        'model.layers.{i}.self_attn.v_proj.bias': 'layers.{i}.self_attn.v_proj_bias',
-
-        'model.layers.{i}.self_attn.o_proj.weight': 'layers.{i}.self_attn.o_proj_weight',
-
-        'model.layers.{i}.mlp.gate_proj.weight': 'layers.{i}.mlp.gate_proj.weight',
-        'model.layers.{i}.mlp.up_proj.weight': 'layers.{i}.mlp.up_proj.weight',
-        'model.layers.{i}.mlp.down_proj.weight': 'layers.{i}.mlp.down_proj.weight',
-
-        'model.layers.{i}.input_layernorm.weight': 'layers.{i}.input_layernorm_weight',
-        'model.layers.{i}.post_attention_layernorm.weight': 'layers.{i}.post_attention_layernorm_weight',
+        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj_weight",
+        "model.layers.{i}.self_attn.q_proj.bias": "layers.{i}.self_attn.q_proj_bias",
+        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj_weight",
+        "model.layers.{i}.self_attn.k_proj.bias": "layers.{i}.self_attn.k_proj_bias",
+        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj_weight",
+        "model.layers.{i}.self_attn.v_proj.bias": "layers.{i}.self_attn.v_proj_bias",
+        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj_weight",
+        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
+        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
+        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
+        "model.layers.{i}.input_layernorm.weight": "layers.{i}.input_layernorm_weight",
+        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.post_attention_layernorm_weight",
     }
 
     # 根据 Transformer 层数量生成映射
     for i in range(model_config.num_layers):
         for hf_key, custom_key in layers.items():
-            mapped_key = hf_key.format(i=i) # hf 权重参数字典 key
-            custom_mapped_key = custom_key.format(i=i) # 自定义模型权重参数字典 key
+            mapped_key = hf_key.format(i=i)  # hf 权重参数字典 key
+            custom_mapped_key = custom_key.format(i=i)  # 自定义模型权重参数字典 key
             mapping[mapped_key] = custom_mapped_key
 
     # 创建新的状态字典
@@ -114,19 +115,20 @@ def compare_model_weights(hf_sd, new_sd, model_config, rtol=1e-5, atol=1e-8):
             print(f"Hugging Face 权重: {hf_param}")
             print(f"自定义模型权重: {new_param}")
             all_match = False
-    
+
     if all_match:
         print("所有权重完全匹配！")
     else:
         print("权重存在不匹配！")
 
+
 if __name__ == "__main__":
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # 定义 Qwen2.5-3B 模型权重路径和配置参数
     original_model_path = "/gemini/pretrain/Qwen2.5-3B"
     my_model_path = "/gemini/code/Qwen2.5-3B-Instruct/"
-    json_file_path = os.path.join(original_model_path, 'config.json') # JSON 文件的路径
-    model_config = load_config_from_json(json_file_path, device) # 加载配置
+    json_file_path = os.path.join(original_model_path, "config.json")  # JSON 文件的路径
+    model_config = load_config_from_json(json_file_path, device)  # 加载配置
 
     # 加载原始 hf 模型权重
     original_model, tokenizer, hf_sd = load_original_llama(original_model_path, device)
@@ -137,4 +139,4 @@ def compare_model_weights(hf_sd, new_sd, model_config, rtol=1e-5, atol=1e-8):
     compare_model_weights(hf_sd, new_sd, model_config)
 
     for name, param in custom_model.named_parameters():
-        print(name, param.shape)
\ No newline at end of file
+        print(name, param.shape)
diff --git a/tests/test_mask.py b/tests/test_mask.py
index 3c5d4d2..4a73990 100644
--- a/tests/test_mask.py
+++ b/tests/test_mask.py
@@ -2,13 +2,14 @@
 
 import torch, time
 
+
 def create_and_print_mask():
     """用于测试 mask 内容和形状"""
     seq_len = 4
     start_pos = 0
     mask = torch.full((seq_len, seq_len), float("-inf"))
     print(mask)
-    mask1 = torch.triu(mask, diagonal=1) # 创建上三角矩阵
+    mask1 = torch.triu(mask, diagonal=1)  # 创建上三角矩阵
     print(mask1)
     mask2 = torch.hstack([torch.zeros((seq_len, start_pos)), mask1])
     print(mask2)
@@ -18,9 +19,10 @@ def create_and_print_mask():
     offs_k = torch.tensor([0, 1, 2, 3])
     mask3 = offs_m[:, None] >= offs_k[None, :]
     print(mask3)
-    mask4 = scores.masked_fill(mask3 == 0, float('-inf'))
+    mask4 = scores.masked_fill(mask3 == 0, float("-inf"))
     print(mask4)
 
+
 """
 tensor([[-inf, -inf, -inf, -inf],
         [-inf, -inf, -inf, -inf],
@@ -45,9 +47,9 @@ def create_and_print_mask():
         [-0.5477,  0.1412,  0.7192,  0.8276]])
 """
 
+
 def apply_prefill_mask1(scores, seq_len):
-    """llama3 实现的创建并应用 mask 矩阵方法
-    """
+    """llama3 实现的创建并应用 mask 矩阵方法"""
     mask = torch.full((seq_len, seq_len), float("-inf"))
     mask = torch.triu(mask, diagonal=1)
 
@@ -55,21 +57,28 @@ def apply_prefill_mask1(scores, seq_len):
 
     return masked_scores
 
+
 def apply_prefill_mask2(scores, seq_len):
     """使用下三角矩阵方法创建并应用 mask"""
     mask = torch.tril(torch.ones([seq_len, seq_len]))
-    masked_scores = scores.masked_fill(mask == 0, float('-inf'))
+    masked_scores = scores.masked_fill(mask == 0, float("-inf"))
     return masked_scores
 
+
 def apply_prefill_mask3(scores, seq_len):
     """flashattention 内核中创建并应用的 mask"""
-    offs_q = torch.arange(seq_len, )
-    offs_k = torch.arange(seq_len, )
+    offs_q = torch.arange(
+        seq_len,
+    )
+    offs_k = torch.arange(
+        seq_len,
+    )
     mask = offs_q[:, None] >= offs_k[None, :]
-    masked_scores = scores.masked_fill(mask == 0, float('-inf'))
+    masked_scores = scores.masked_fill(mask == 0, float("-inf"))
     # masked_scores = torch.where(mask, scores, torch.full_like(scores, -1.0e8))
     return masked_scores
 
+
 if __name__ == "__main__":
     # torch.manual_seed(42)
     seq_len = 512
@@ -92,7 +101,7 @@ def apply_prefill_mask3(scores, seq_len):
     masked_scores3 = apply_prefill_mask3(scores, seq_len)
     time3 = time.time() - start_time
     print(f"apply_prefill_mask3 运行时间: {time3:.6f} 秒")
-    
+
     # 确保两个函数的结果一致
     assert torch.allclose(masked_scores1, masked_scores2, atol=1e-4)
-    assert torch.allclose(masked_scores1, masked_scores3, atol=1e-4)
\ No newline at end of file
+    assert torch.allclose(masked_scores1, masked_scores3, atol=1e-4)
diff --git a/tests/test_mem_manager.py b/tests/test_mem_manager.py
index ba8f5c0..b6d88c7 100644
--- a/tests/test_mem_manager.py
+++ b/tests/test_mem_manager.py
@@ -1,10 +1,12 @@
 # 代码可直接运行，用于测试 KVCacheMemoryManager 的结果
 
 import unittest
-import torch, os,sys
+import torch, os, sys
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
 from lite_llama.executor.mem_manager import KVCacheMemoryManager
 
+
 class TestKVCacheMemoryManager(unittest.TestCase):
     def setUp(self):
         # 使用较小的参数值以便于测试
@@ -20,7 +22,7 @@ def setUp(self):
             num_layers=self.num_layers,
             gpu_num_blocks=self.gpu_num_blocks,
             dtype=self.dtype,
-            device=self.device
+            device=self.device,
         )
 
     def test_initialization(self):
@@ -37,14 +39,17 @@ def test_alloc_kvcache_success(self):
         self.assertEqual(select_index.numel(), need_size)
         # 检查分配的索引是否被标记为使用
         used_state = self.manager.kv_mem_use_state[select_index]
-        print("After alloc_kvcache(3) alloc_kvcache kv_mem_use_state ", self.manager.kv_mem_use_state)
+        print(
+            "After alloc_kvcache(3) alloc_kvcache kv_mem_use_state ",
+            self.manager.kv_mem_use_state,
+        )
         self.assertTrue(torch.all(used_state == 1))
         # 检查可用内存大小是否更新
         self.assertEqual(self.manager.can_use_mem_size, self.gpu_num_blocks - need_size)
-        
+
         self.manager.release_ref(select_index)
         print("after release_ref kv_mem_use_state ", self.manager.kv_mem_use_state)
-    
+
     def test_alloc_kvcache_failure(self):
         """尝试分配超过可用块数量的内存"""
         need_size = self.gpu_num_blocks + 1
@@ -61,7 +66,7 @@ def test_alloc_contiguous_kvcache_success(self):
         select_index, start, end = result
         self.assertEqual(select_index.numel(), need_size)
         self.assertEqual(end - start, need_size)
-        
+
         # 检查分配的索引是否被标记为使用
         used_state = self.manager.kv_mem_use_state[select_index]
         self.assertTrue(torch.all(used_state == 1))
@@ -74,15 +79,15 @@ def test_alloc_contiguous_kvcache_failure(self):
         need_size = self.gpu_num_blocks
         print("self.can_use_mem_size ", self.manager.can_use_mem_size)
         result = self.manager.alloc_contiguous_kvcache(need_size)
-        
+
         print("result and need_size ", result, need_size)
         select_index, _, _ = result
-        
+
         self.assertIsNotNone(result)
 
         # 可用内存大小应为0
         self.assertEqual(self.manager.can_use_mem_size, 0)
-        
+
         self.manager.release_ref(select_index)
 
     def test_add_ref(self):
@@ -92,8 +97,11 @@ def test_add_ref(self):
         self.assertIsNotNone(select_index)
 
         # 检查引用计数是否为2
-        used_state = self.manager.kv_mem_use_state[select_index] # tensor([1, 1])
-        self.assertTrue(torch.sum(used_state != 0) == 2, "The number of non-zero elements is not equal to 2")
+        used_state = self.manager.kv_mem_use_state[select_index]  # tensor([1, 1])
+        self.assertTrue(
+            torch.sum(used_state != 0) == 2,
+            "The number of non-zero elements is not equal to 2",
+        )
         # 检查可用内存大小是否正确
         self.assertEqual(self.manager.can_use_mem_size, self.gpu_num_blocks - need_size)
         self.manager.release_ref(select_index)
@@ -153,7 +161,7 @@ def test_alloc_contiguous_kvcache_after_release_ref(self):
         self.assertEqual(new_select_index.numel(), 2)
         # 可用内存大小应为 gpu_num_blocks - 2
         self.assertEqual(self.manager.can_use_mem_size, self.gpu_num_blocks - 4)
-        
+
         self.manager.release_ref(new_select_index)  # 释放
 
     def test_in_alloc_contiguous_kvcache(self):
@@ -163,7 +171,7 @@ def test_in_alloc_contiguous_kvcache(self):
         self.assertIsNotNone(select_index)
         # 手动设置块8为已使用以打破连续性
         self.manager.kv_mem_use_state[7] = 1
-        self.manager.can_use_mem_size -=1
+        self.manager.can_use_mem_size -= 1
         # 现在尝试分配 3 个连续块，应失败
         contiguous_result = self.manager.alloc_contiguous_kvcache(3)
         self.assertIsNone(contiguous_result)
@@ -181,7 +189,8 @@ def test_free_buffers(self):
         # 检查 gpu_kv_buffer 是否为 None
         self.assertIsNone(self.manager.gpu_kv_buffer)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     suite = unittest.TestSuite()
     tests = [
         "test_initialization",
@@ -197,5 +206,7 @@ def test_free_buffers(self):
         "test_in_alloc_contiguous_kvcache",
         "test_free_buffers",
     ]
-    suite.addTests(unittest.TestLoader().loadTestsFromNames(tests, TestKVCacheMemoryManager))
-    unittest.TextTestRunner().run(suite)
\ No newline at end of file
+    suite.addTests(
+        unittest.TestLoader().loadTestsFromNames(tests, TestKVCacheMemoryManager)
+    )
+    unittest.TextTestRunner().run(suite)
diff --git a/tests/test_merge.py b/tests/test_merge.py
index b97787c..75dd273 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -1,5 +1,3 @@
-
-
 import unittest
 import torch
 
@@ -10,23 +8,24 @@
 
 logger = logging.getLogger(__name__)
 
+
 def merge_input_ids_with_image_features(
-    image_features: torch.Tensor, 
-    inputs_embeds: torch.Tensor, 
-    input_ids: torch.Tensor, 
+    image_features: torch.Tensor,
+    inputs_embeds: torch.Tensor,
+    input_ids: torch.Tensor,
     pad_token_id: int,
-    image_token_index: int
+    image_token_index: int,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     将 input_ids 与 image_features 合并，生成最终的嵌入和位置 ID。
-    
+
     Args:
         image_features (torch.Tensor): 视觉编码后的图像特征，形状为 (batch_size, num_image_patches, embed_dim)
         inputs_embeds (torch.Tensor): 文本嵌入，形状为 (batch_size, sequence_length, embed_dim)
         input_ids (torch.Tensor): 输入的 token IDs, 形状为 (batch_size, sequence_length)
         pad_token_id (int): 填充 token 的 ID
         image_token_index (int): 图像 token 的 ID
-    
+
     Returns:
         final_embedding (torch.Tensor): 合并后的嵌入，形状为 (batch_size, max_embed_dim, embed_dim)
         position_ids (torch.Tensor): 位置 ID, 形状为 (batch_size, max_embed_dim)
@@ -41,23 +40,31 @@ def merge_input_ids_with_image_features(
     special_image_token_mask = input_ids == image_token_index
 
     # 每个样本中图像 token 的数量
-    num_special_image_tokens = special_image_token_mask.sum(dim=1)  # shape: (batch_size,)
+    num_special_image_tokens = special_image_token_mask.sum(
+        dim=1
+    )  # shape: (batch_size,)
 
     # 计算每个样本的新序列长度
-    new_sequence_length_per_sample = sequence_length + num_special_image_tokens * (num_image_patches - 1)
+    new_sequence_length_per_sample = sequence_length + num_special_image_tokens * (
+        num_image_patches - 1
+    )
 
     # 获取批次中最大的序列长度
     max_embed_dim = new_sequence_length_per_sample.max().item()
 
     # 初始化最终的嵌入
     final_embedding = torch.zeros(
-        (batch_size, max_embed_dim, embed_dim), 
-        dtype=inputs_embeds.dtype, 
-        device=inputs_embeds.device
+        (batch_size, max_embed_dim, embed_dim),
+        dtype=inputs_embeds.dtype,
+        device=inputs_embeds.device,
     )
 
     # 初始化 position_ids
-    position_ids = torch.arange(max_embed_dim, dtype=torch.long, device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1)
+    position_ids = (
+        torch.arange(max_embed_dim, dtype=torch.long, device=inputs_embeds.device)
+        .unsqueeze(0)
+        .expand(batch_size, -1)
+    )
 
     for i in range(batch_size):
         curr_pos = 0
@@ -66,13 +73,15 @@ def merge_input_ids_with_image_features(
                 # 插入图像特征
                 if curr_pos + num_image_patches > max_embed_dim:
                     raise ValueError(f"Sample {i} exceeds max_embed_dim.")
-                final_embedding[i, curr_pos: curr_pos + num_image_patches, :] = image_features[i]
+                final_embedding[i, curr_pos : curr_pos + num_image_patches, :] = (
+                    image_features[i]
+                )
                 curr_pos += num_image_patches
             else:
                 if curr_pos >= max_embed_dim:
                     raise ValueError(f"Sample {i} exceeds max_embed_dim.")
                 final_embedding[i, curr_pos, :] = inputs_embeds[i, j, :]
-                curr_pos +=1
+                curr_pos += 1
         # 剩余位置已被初始化为0（填充）
 
     # 处理 pad_token_id，将对应位置的嵌入设为0
@@ -81,14 +90,22 @@ def merge_input_ids_with_image_features(
         sample = batch_indices_pad[idx]
         position = pad_indices[idx]
         # 计算新位置
-        new_position = torch.sum(special_image_token_mask[sample, :position] * (num_image_patches -1)).item() + position
+        new_position = (
+            torch.sum(
+                special_image_token_mask[sample, :position] * (num_image_patches - 1)
+            ).item()
+            + position
+        )
         if new_position < max_embed_dim:
             final_embedding[sample, new_position, :] = 0.0
         else:
-            logger.warning(f"Pad token position {position} exceeds max_embed_dim for sample {sample}")
+            logger.warning(
+                f"Pad token position {position} exceeds max_embed_dim for sample {sample}"
+            )
 
     return final_embedding, position_ids
 
+
 class TestMergeInputIdsWithImageFeatures(unittest.TestCase):
     def test_merge_basic(self):
         # 定义参数
@@ -102,44 +119,47 @@ def test_merge_basic(self):
         # 创建 mock input_ids
         # 样本1: [2, 1, 3, 1, 4] - 两个图像 token
         # 样本2: [1, 2, 3, 0, 0] - 一个图像 token，两个 pad token
-        input_ids = torch.tensor([
-            [2, 1, 3, 1, 4],
-            [1, 2, 3, 0, 0]
-        ], dtype=torch.long)
+        input_ids = torch.tensor([[2, 1, 3, 1, 4], [1, 2, 3, 0, 0]], dtype=torch.long)
 
         # 创建 mock inputs_embeds
-        inputs_embeds = torch.tensor([
+        inputs_embeds = torch.tensor(
             [
-                [0.1, 0.2, 0.3, 0.4],
-                [0.5, 0.6, 0.7, 0.8],
-                [0.9, 1.0, 1.1, 1.2],
-                [1.3, 1.4, 1.5, 1.6],
-                [1.7, 1.8, 1.9, 2.0]
+                [
+                    [0.1, 0.2, 0.3, 0.4],
+                    [0.5, 0.6, 0.7, 0.8],
+                    [0.9, 1.0, 1.1, 1.2],
+                    [1.3, 1.4, 1.5, 1.6],
+                    [1.7, 1.8, 1.9, 2.0],
+                ],
+                [
+                    [2.1, 2.2, 2.3, 2.4],
+                    [2.5, 2.6, 2.7, 2.8],
+                    [2.9, 3.0, 3.1, 3.2],
+                    [0.0, 0.0, 0.0, 0.0],
+                    [0.0, 0.0, 0.0, 0.0],
+                ],
             ],
-            [
-                [2.1, 2.2, 2.3, 2.4],
-                [2.5, 2.6, 2.7, 2.8],
-                [2.9, 3.0, 3.1, 3.2],
-                [0.0, 0.0, 0.0, 0.0],
-                [0.0, 0.0, 0.0, 0.0]
-            ]
-        ], dtype=torch.float)
+            dtype=torch.float,
+        )
 
         # 创建 mock image_features
         # 样本1: 两个图像 token，每个替换为3个图像 patches
         # 样本2: 一个图像 token，替换为3个图像 patches
-        image_features = torch.tensor([
+        image_features = torch.tensor(
             [
-                [10.1, 10.2, 10.3, 10.4],
-                [10.5, 10.6, 10.7, 10.8],
-                [10.9, 11.0, 11.1, 11.2]
+                [
+                    [10.1, 10.2, 10.3, 10.4],
+                    [10.5, 10.6, 10.7, 10.8],
+                    [10.9, 11.0, 11.1, 11.2],
+                ],
+                [
+                    [20.1, 20.2, 20.3, 20.4],
+                    [20.5, 20.6, 20.7, 20.8],
+                    [20.9, 21.0, 21.1, 21.2],
+                ],
             ],
-            [
-                [20.1, 20.2, 20.3, 20.4],
-                [20.5, 20.6, 20.7, 20.8],
-                [20.9, 21.0, 21.1, 21.2]
-            ]
-        ], dtype=torch.float)
+            dtype=torch.float,
+        )
 
         # 调用函数，确保参数顺序正确
         final_embedding, position_ids = merge_input_ids_with_image_features(
@@ -147,7 +167,7 @@ def test_merge_basic(self):
             inputs_embeds=inputs_embeds,
             input_ids=input_ids,
             pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+            image_token_index=image_token_index,
         )
 
         # 定义预期的形状
@@ -172,17 +192,20 @@ def test_merge_basic(self):
         # [4] = inputs_embeds[0,2]
         # [5,6,7] = image_features[0]
         # [8] = inputs_embeds[0,4]
-        expected_sample1 = torch.tensor([
-            [0.1, 0.2, 0.3, 0.4],
-            [10.1, 10.2, 10.3, 10.4],
-            [10.5, 10.6, 10.7, 10.8],
-            [10.9, 11.0, 11.1, 11.2],
-            [0.9, 1.0, 1.1, 1.2],
-            [10.1, 10.2, 10.3, 10.4],
-            [10.5, 10.6, 10.7, 10.8],
-            [10.9, 11.0, 11.1, 11.2],
-            [1.7, 1.8, 1.9, 2.0]
-        ], dtype=torch.float)
+        expected_sample1 = torch.tensor(
+            [
+                [0.1, 0.2, 0.3, 0.4],
+                [10.1, 10.2, 10.3, 10.4],
+                [10.5, 10.6, 10.7, 10.8],
+                [10.9, 11.0, 11.1, 11.2],
+                [0.9, 1.0, 1.1, 1.2],
+                [10.1, 10.2, 10.3, 10.4],
+                [10.5, 10.6, 10.7, 10.8],
+                [10.9, 11.0, 11.1, 11.2],
+                [1.7, 1.8, 1.9, 2.0],
+            ],
+            dtype=torch.float,
+        )
 
         self.assertTrue(torch.allclose(final_embedding[0], expected_sample1))
 
@@ -194,17 +217,20 @@ def test_merge_basic(self):
         # [3] = inputs_embeds[1,1]
         # [4] = inputs_embeds[1,2]
         # [5,6,7,8] = pad (already zero)
-        expected_sample2 = torch.tensor([
-            [20.1, 20.2, 20.3, 20.4],
-            [20.5, 20.6, 20.7, 20.8],
-            [20.9, 21.0, 21.1, 21.2],
-            [2.5, 2.6, 2.7, 2.8],
-            [2.9, 3.0, 3.1, 3.2],
-            [0.0, 0.0, 0.0, 0.0],
-            [0.0, 0.0, 0.0, 0.0],
-            [0.0, 0.0, 0.0, 0.0],
-            [0.0, 0.0, 0.0, 0.0]
-        ], dtype=torch.float)
+        expected_sample2 = torch.tensor(
+            [
+                [20.1, 20.2, 20.3, 20.4],
+                [20.5, 20.6, 20.7, 20.8],
+                [20.9, 21.0, 21.1, 21.2],
+                [2.5, 2.6, 2.7, 2.8],
+                [2.9, 3.0, 3.1, 3.2],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0],
+            ],
+            dtype=torch.float,
+        )
 
         self.assertTrue(torch.allclose(final_embedding[1], expected_sample2))
 
@@ -218,30 +244,27 @@ def test_merge_no_image_tokens(self):
         image_token_index = 1
 
         input_ids = torch.tensor([[2, 3, 4]], dtype=torch.long)
-        inputs_embeds = torch.tensor([[[0.1, 0.2, 0.3],
-                                       [0.4, 0.5, 0.6],
-                                       [0.7, 0.8, 0.9]]], dtype=torch.float)
-        image_features = torch.tensor([[[10.1, 10.2, 10.3],
-                                        [10.4, 10.5, 10.6]]], dtype=torch.float)
+        inputs_embeds = torch.tensor(
+            [[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]], dtype=torch.float
+        )
+        image_features = torch.tensor(
+            [[[10.1, 10.2, 10.3], [10.4, 10.5, 10.6]]], dtype=torch.float
+        )
 
         final_embedding, position_ids = merge_input_ids_with_image_features(
             image_features=image_features,
             inputs_embeds=inputs_embeds,
             input_ids=input_ids,
             pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+            image_token_index=image_token_index,
         )
 
         # 期望输出与 inputs_embeds 相同
-        expected_final_embedding = torch.tensor([
-            [
-                [0.1, 0.2, 0.3],
-                [0.4, 0.5, 0.6],
-                [0.7, 0.8, 0.9]
-            ]
-        ], dtype=torch.float)
+        expected_final_embedding = torch.tensor(
+            [[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]], dtype=torch.float
+        )
 
-        expected_position_ids = torch.tensor([[0,1,2]], dtype=torch.long)
+        expected_position_ids = torch.tensor([[0, 1, 2]], dtype=torch.long)
 
         self.assertEqual(final_embedding.shape, (batch_size, 3, embed_dim))
         self.assertEqual(position_ids.shape, (batch_size, 3))
@@ -258,34 +281,38 @@ def test_merge_all_image_tokens(self):
         image_token_index = 1
 
         input_ids = torch.tensor([[1, 1, 1]], dtype=torch.long)
-        inputs_embeds = torch.tensor([[[0.0, 0.0, 0.0],
-                                       [0.0, 0.0, 0.0],
-                                       [0.0, 0.0, 0.0]]], dtype=torch.float)
-        image_features = torch.tensor([[[10.1, 10.2, 10.3],
-                                        [10.4, 10.5, 10.6]]], dtype=torch.float)
+        inputs_embeds = torch.tensor(
+            [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]], dtype=torch.float
+        )
+        image_features = torch.tensor(
+            [[[10.1, 10.2, 10.3], [10.4, 10.5, 10.6]]], dtype=torch.float
+        )
 
         final_embedding, position_ids = merge_input_ids_with_image_features(
             image_features=image_features,
             inputs_embeds=inputs_embeds,
             input_ids=input_ids,
             pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+            image_token_index=image_token_index,
         )
 
         # 每个图像 token 替换为2个图像 patches
         # 新序列长度 = 3 + 3*(2-1) =6
-        expected_final_embedding = torch.tensor([
+        expected_final_embedding = torch.tensor(
             [
-                [10.1, 10.2, 10.3],
-                [10.4, 10.5, 10.6],
-                [10.1, 10.2, 10.3],
-                [10.4, 10.5, 10.6],
-                [10.1, 10.2, 10.3],
-                [10.4, 10.5, 10.6]
-            ]
-        ], dtype=torch.float)
+                [
+                    [10.1, 10.2, 10.3],
+                    [10.4, 10.5, 10.6],
+                    [10.1, 10.2, 10.3],
+                    [10.4, 10.5, 10.6],
+                    [10.1, 10.2, 10.3],
+                    [10.4, 10.5, 10.6],
+                ]
+            ],
+            dtype=torch.float,
+        )
 
-        expected_position_ids = torch.tensor([[0,1,2,3,4,5]], dtype=torch.long)
+        expected_position_ids = torch.tensor([[0, 1, 2, 3, 4, 5]], dtype=torch.long)
 
         self.assertEqual(final_embedding.shape, (batch_size, 6, embed_dim))
         self.assertEqual(position_ids.shape, (batch_size, 6))
@@ -302,44 +329,56 @@ def test_merge_with_pad_tokens(self):
         image_token_index = 1
 
         input_ids = torch.tensor([[2, 1, 3, 1, 0]], dtype=torch.long)
-        inputs_embeds = torch.tensor([[[0.1, 0.2, 0.3],
-                                       [0.4, 0.5, 0.6],
-                                       [0.7, 0.8, 0.9],
-                                       [1.0, 1.1, 1.2],
-                                       [0.0, 0.0, 0.0]]], dtype=torch.float)
-        image_features = torch.tensor([[[10.1, 10.2, 10.3],
-                                        [10.4, 10.5, 10.6]]], dtype=torch.float)
+        inputs_embeds = torch.tensor(
+            [
+                [
+                    [0.1, 0.2, 0.3],
+                    [0.4, 0.5, 0.6],
+                    [0.7, 0.8, 0.9],
+                    [1.0, 1.1, 1.2],
+                    [0.0, 0.0, 0.0],
+                ]
+            ],
+            dtype=torch.float,
+        )
+        image_features = torch.tensor(
+            [[[10.1, 10.2, 10.3], [10.4, 10.5, 10.6]]], dtype=torch.float
+        )
 
         final_embedding, position_ids = merge_input_ids_with_image_features(
             image_features=image_features,
             inputs_embeds=inputs_embeds,
             input_ids=input_ids,
             pad_token_id=pad_token_id,
-            image_token_index=image_token_index
+            image_token_index=image_token_index,
         )
 
         # 期望:
         # Original: [2, 1, 3, 1, 0]
         # Replaced: [2, image_features, 3, image_features, 0]
         # new_sequence_length=5 +2*(2-1)=7
-        expected_final_embedding = torch.tensor([
+        expected_final_embedding = torch.tensor(
             [
-                [0.1, 0.2, 0.3],
-                [10.1, 10.2, 10.3],
-                [10.4, 10.5, 10.6],
-                [0.7, 0.8, 0.9],
-                [10.1, 10.2, 10.3],
-                [10.4, 10.5, 10.6],
-                [0.0, 0.0, 0.0]
-            ]
-        ], dtype=torch.float)
-
-        expected_position_ids = torch.tensor([[0,1,2,3,4,5,6]], dtype=torch.long)
+                [
+                    [0.1, 0.2, 0.3],
+                    [10.1, 10.2, 10.3],
+                    [10.4, 10.5, 10.6],
+                    [0.7, 0.8, 0.9],
+                    [10.1, 10.2, 10.3],
+                    [10.4, 10.5, 10.6],
+                    [0.0, 0.0, 0.0],
+                ]
+            ],
+            dtype=torch.float,
+        )
+
+        expected_position_ids = torch.tensor([[0, 1, 2, 3, 4, 5, 6]], dtype=torch.long)
 
         self.assertEqual(final_embedding.shape, (batch_size, 7, embed_dim))
         self.assertEqual(position_ids.shape, (batch_size, 7))
         self.assertTrue(torch.allclose(final_embedding, expected_final_embedding))
         self.assertTrue(torch.all(position_ids == expected_position_ids))
 
-if __name__ == '__main__':
-    unittest.main(argv=[''], exit=False)
+
+if __name__ == "__main__":
+    unittest.main(argv=[""], exit=False)
diff --git a/tests/test_merge_input_ids_with_image_features.py b/tests/test_merge_input_ids_with_image_features.py
index 3e40a62..606c068 100644
--- a/tests/test_merge_input_ids_with_image_features.py
+++ b/tests/test_merge_input_ids_with_image_features.py
@@ -2,51 +2,61 @@
 import torch
 import torch.nn as nn
 
+
 class Config:
     def __init__(self, image_token_index=32000, pad_token_id=0, ignore_index=-100):
         self.image_token_index = image_token_index
         self.pad_token_id = pad_token_id
         self.ignore_index = ignore_index
 
+
 class MockModel:
     def __init__(self, config):
         self.config = config
 
+
 class MultiModalModel:
     def __init__(self, config):
         self.config = config
         self.model = MockModel(config)
 
     def _merge_input_ids_with_image_features(
-        self, 
-        image_features, 
-        inputs_embeds, 
-        input_ids, 
-        attention_mask
+        self, image_features, inputs_embeds, input_ids, attention_mask
     ):
         num_images, num_image_patches, embed_dim = image_features.shape
         batch_size, sequence_length = input_ids.shape
         # NOTE: 检查每个样本的最后一个 token 是否为填充 token
         # NOTE: 如果最后一个 token 不是填充 token，则为 True，表示存在左侧填充；否则为 False。
-        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.config.pad_token_id))
-        
+        left_padding = not torch.sum(
+            input_ids[:, -1] == torch.tensor(self.config.pad_token_id)
+        )
+
         # 1. 创建图像 token 的掩码来获取特殊图像 token 的位置, 并计算新序列最大长度
         # NOTE: 一个布尔张量，标识 input_ids 中哪些位置是图像 token（即等于 image_token_index 的位置）
         special_image_token_mask = input_ids == self.config.image_token_index
         # NOTE: 每个样本中图像 token 的数量, 形状为 [batch_size, ]
         num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
-        
+
         # 计算合并图像特征后的新序列最大长度。
         # NOTE: 每个图像 token 位置会被替换为 (num_image_patches - 1) 个图像 paches embedding token。
-        max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+        max_embed_dim = (
+            num_special_image_tokens.max() * (num_image_patches - 1)
+        ) + sequence_length
         # NOTE: 通过 torch.where 获取所有非图像 token 的位置索引。
         # NOTE: 当仅提供 condition 参数时，torch.where 等同于 torch.nonzero(condition, as_tuple=True)，返回满足条件的元素的索引。
-        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index) # 满足条件的样本索引和序列 token 索引
+        batch_indices, non_image_indices = torch.where(
+            input_ids != self.config.image_token_index
+        )  # 满足条件的样本索引和序列 token 索引
 
         # 2. 计算文本应写入的位置
         # NOTE: 每个图像 token 会增加 (num_image_patches - 1) 个位置。
-        new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
-        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1] # 计算需要的图像填充数量，以达到 max_embed_dim。
+        new_token_positions = (
+            torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1)
+            - 1
+        )
+        nb_image_pad = (
+            max_embed_dim - 1 - new_token_positions[:, -1]
+        )  # 计算需要的图像填充数量，以达到 max_embed_dim。
         # 如果存在左侧填充 (left_padding 为 True)，则将 new_token_positions 进行偏移调整。
         if left_padding:
             new_token_positions += nb_image_pad[:, None]  # offset for left padding
@@ -55,12 +65,19 @@ def _merge_input_ids_with_image_features(
 
         # 3. 初始化最终的嵌入与注意力掩码
         final_embedding = torch.zeros(
-            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+            batch_size,
+            max_embed_dim,
+            embed_dim,
+            dtype=inputs_embeds.dtype,
+            device=inputs_embeds.device,
         )
         final_attention_mask = torch.zeros(
-            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+            batch_size,
+            max_embed_dim,
+            dtype=attention_mask.dtype,
+            device=inputs_embeds.device,
         )
-      
+
         # NOTE: 如果视觉模型或语言模型已卸载到 CPU，我们需要手动将相应的张量设置到正确的目标设备中。
         target_device = inputs_embeds.device
         batch_indices, non_image_indices, text_to_overwrite = (
@@ -70,27 +87,41 @@ def _merge_input_ids_with_image_features(
         )
         attention_mask = attention_mask.to(target_device)
 
-        # 4. 填充文本嵌入与注意力掩码. 
+        # 4. 填充文本嵌入与注意力掩码.
         # If we have ["hey" "<image>", "how", "are"]. we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
         # NOTE: 使用 batch_indices 和 text_to_overwrite 将 inputs_embeds 中的非图像 token 嵌入复制到 final_embedding 的相应位置。
-        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
-        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
+            batch_indices, non_image_indices
+        ]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
+            batch_indices, non_image_indices
+        ]
 
         # 5. 填充图像特征与更新注意力掩码和位置 ID.
-        image_to_overwrite = torch.all(final_embedding == 0, dim=-1) # 找出 final_embedding 中所有维度为0的位置（即尚未填充的地方）。
+        image_to_overwrite = torch.all(
+            final_embedding == 0, dim=-1
+        )  # 找出 final_embedding 中所有维度为0的位置（即尚未填充的地方）。
         # NOTE: 使用 cumsum 计算累积和，确保这些位置在填充数量 (nb_image_pad) 之后。
-        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
-
-        if image_to_overwrite.sum() != image_features.shape[:-1].numel():  # 如果需要填充的位置数量不等于 image_features 的数量，抛出错误。
-            raise ValueError(      
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[
+            :, None
+        ].to(target_device)
+
+        if (
+            image_to_overwrite.sum() != image_features.shape[:-1].numel()
+        ):  # 如果需要填充的位置数量不等于 image_features 的数量，抛出错误。
+            raise ValueError(
                 f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
                 f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
             )
 
         # NOTE: 将 image_features 重新排列为 (batch_size * num_image_patches, embed_dim)，并填充到 final_embedding 的相应位置。
-        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_embedding[image_to_overwrite] = (
+            image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        )
         final_attention_mask |= image_to_overwrite
-        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_(
+            (final_attention_mask == 0), 1
+        )
 
         # 6. 处理填充位置的嵌入, 将填充位置的嵌入设为0：
         batch_indices, pad_indices = torch.where(input_ids == self.config.pad_token_id)
@@ -100,28 +131,28 @@ def _merge_input_ids_with_image_features(
 
         return final_embedding, final_attention_mask, position_ids
 
+
 class TestMergeInputIDsWithImageFeaturesDebug(unittest.TestCase):
     def setUp(self):
         # 初始化配置对象
         self.config = Config(image_token_index=32000, pad_token_id=0, ignore_index=-100)
         # 初始化模型
         self.model = MultiModalModel(self.config)
-        self.device = torch.device('cpu')  # 使用CPU进行测试
+        self.device = torch.device("cpu")  # 使用CPU进行测试
 
     def test_merge_without_padding_debug(self):
         """
         测试在没有填充且每个样本有一个图像token的情况下的合并功能，并打印中间变量。
         """
         # Batch size 2, sequence length 7
-        input_ids = torch.tensor([
-            [1, 2, 32000, 4, 5, 6, 7],
-            [8, 32000, 10, 11, 12, 13, 14]
-        ], dtype=torch.long)
+        input_ids = torch.tensor(
+            [[1, 2, 32000, 4, 5, 6, 7], [8, 32000, 10, 11, 12, 13, 14]],
+            dtype=torch.long,
+        )
 
-        attention_mask = torch.tensor([
-            [1, 1, 1, 1, 1, 1, 1],
-            [1, 1, 1, 1, 1, 1, 1]
-        ], dtype=torch.long)
+        attention_mask = torch.tensor(
+            [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]], dtype=torch.long
+        )
 
         # Image features: 2 images, 3 patches each, embed_dim=256
         image_features = torch.randn(2, 3, 256)
@@ -130,11 +161,13 @@ def test_merge_without_padding_debug(self):
         inputs_embeds = torch.randn(2, 7, 256)
 
         # 调用方法
-        final_embedding, final_attention_mask, position_ids = self.model._merge_input_ids_with_image_features(
-            image_features=image_features,
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids,
-            attention_mask=attention_mask
+        final_embedding, final_attention_mask, position_ids = (
+            self.model._merge_input_ids_with_image_features(
+                image_features=image_features,
+                inputs_embeds=inputs_embeds,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )
         )
 
         # 打印关键变量
@@ -152,13 +185,9 @@ def test_merge_with_padding_debug(self):
         测试在有填充且样本中有一个图像token的情况下的合并功能，并打印中间变量。
         """
         # Batch size 1, sequence length 5 with padding
-        input_ids = torch.tensor([
-            [1, 32000, 3, 0, 0]
-        ], dtype=torch.long)
+        input_ids = torch.tensor([[1, 32000, 3, 0, 0]], dtype=torch.long)
 
-        attention_mask = torch.tensor([
-            [1, 1, 1, 0, 0]
-        ], dtype=torch.long)
+        attention_mask = torch.tensor([[1, 1, 1, 0, 0]], dtype=torch.long)
 
         # Image features: 1 image, 3 patches each, embed_dim=256
         image_features = torch.randn(1, 3, 256)
@@ -167,11 +196,13 @@ def test_merge_with_padding_debug(self):
         inputs_embeds = torch.randn(1, 5, 256)
 
         # 调用方法
-        final_embedding, final_attention_mask, position_ids = self.model._merge_input_ids_with_image_features(
-            image_features=image_features,
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids,
-            attention_mask=attention_mask
+        final_embedding, final_attention_mask, position_ids = (
+            self.model._merge_input_ids_with_image_features(
+                image_features=image_features,
+                inputs_embeds=inputs_embeds,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )
         )
 
         # 打印关键变量
@@ -189,13 +220,9 @@ def test_merge_no_image_tokens_debug(self):
         测试在没有图像token的情况下的合并功能，并打印中间变量。
         """
         # Batch size 1, sequence length 4, no image tokens
-        input_ids = torch.tensor([
-            [1, 2, 3, 4]
-        ], dtype=torch.long)
+        input_ids = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)
 
-        attention_mask = torch.tensor([
-            [1, 1, 1, 1]
-        ], dtype=torch.long)
+        attention_mask = torch.tensor([[1, 1, 1, 1]], dtype=torch.long)
 
         # Image features: 0 images, 0 patches each, embed_dim=256
         image_features = torch.empty(0, 3, 256)
@@ -204,11 +231,13 @@ def test_merge_no_image_tokens_debug(self):
         inputs_embeds = torch.randn(1, 4, 256)
 
         # 调用方法
-        final_embedding, final_attention_mask, position_ids = self.model._merge_input_ids_with_image_features(
-            image_features=image_features,
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids,
-            attention_mask=attention_mask
+        final_embedding, final_attention_mask, position_ids = (
+            self.model._merge_input_ids_with_image_features(
+                image_features=image_features,
+                inputs_embeds=inputs_embeds,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )
         )
 
         # 打印关键变量
@@ -226,13 +255,9 @@ def test_merge_all_image_tokens_debug(self):
         测试在所有 token 都是图像 token 的情况下的合并功能，并打印中间变量。
         """
         # Batch size 1, sequence length 3, all image tokens
-        input_ids = torch.tensor([
-            [32000, 32000, 32000]
-        ], dtype=torch.long)
+        input_ids = torch.tensor([[32000, 32000, 32000]], dtype=torch.long)
 
-        attention_mask = torch.tensor([
-            [1, 1, 1]
-        ], dtype=torch.long)
+        attention_mask = torch.tensor([[1, 1, 1]], dtype=torch.long)
 
         # Image features: 3 images, 2 patches each, embed_dim=256
         image_features = torch.randn(3, 2, 256)
@@ -241,11 +266,13 @@ def test_merge_all_image_tokens_debug(self):
         inputs_embeds = torch.randn(1, 3, 256)
 
         # 调用方法
-        final_embedding, final_attention_mask, position_ids = self.model._merge_input_ids_with_image_features(
-            image_features=image_features,
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids,
-            attention_mask=attention_mask
+        final_embedding, final_attention_mask, position_ids = (
+            self.model._merge_input_ids_with_image_features(
+                image_features=image_features,
+                inputs_embeds=inputs_embeds,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )
         )
 
         # 打印关键变量
@@ -263,13 +290,9 @@ def test_merge_invalid_image_tokens_debug(self):
         测试当图像token数量与提供的图像特征数量不匹配时，是否正确抛出错误，并打印相关信息。
         """
         # Batch size 1, sequence length 4, two image tokens but only one image feature
-        input_ids = torch.tensor([
-            [1, 32000, 32000, 4]
-        ], dtype=torch.long)
+        input_ids = torch.tensor([[1, 32000, 32000, 4]], dtype=torch.long)
 
-        attention_mask = torch.tensor([
-            [1, 1, 1, 1]
-        ], dtype=torch.long)
+        attention_mask = torch.tensor([[1, 1, 1, 1]], dtype=torch.long)
 
         # Image features: 1 image, 3 patches each, embed_dim=256
         image_features = torch.randn(1, 3, 256)
@@ -284,10 +307,11 @@ def test_merge_invalid_image_tokens_debug(self):
                 image_features=image_features,
                 inputs_embeds=inputs_embeds,
                 input_ids=input_ids,
-                attention_mask=attention_mask
+                attention_mask=attention_mask,
             )
         except ValueError as e:
             print(f"Raised ValueError as expected: {e}\n")
 
-if __name__ == '__main__':
-    unittest.main(argv=[''], exit=False)
\ No newline at end of file
+
+if __name__ == "__main__":
+    unittest.main(argv=[""], exit=False)
diff --git a/tests/test_qwen2.py b/tests/test_qwen2.py
index 19d9caf..d1f35b8 100644
--- a/tests/test_qwen2.py
+++ b/tests/test_qwen2.py
@@ -5,11 +5,12 @@
 from pathlib import Path
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 from lite_llama.models.qwen2 import Qwen2Model, Qwen2Config
 from lite_llama.executor.model_executor import ModelExecutor
 from lite_llama.executor.weight_convert import convert_qwen2_hf_to_litellama
 
+
 def sample_top_p(probs, p):
     """
     Perform top-p (nucleus) sampling on a probability distribution.
@@ -35,23 +36,25 @@ def sample_top_p(probs, p):
     next_token = torch.gather(probs_idx, -1, next_token)
     return next_token
 
-def load_config_from_json(json_file_path: str, device: str="cuda") -> Qwen2Config:
+
+def load_config_from_json(json_file_path: str, device: str = "cuda") -> Qwen2Config:
     with open(json_file_path, "r") as f:
         config_dict = json.load(f)
-    
+
     # 假设 Qwen2Config 可以通过关键字参数初始化
     config = Qwen2Config(
-        hidden_size = config_dict.get("hidden_size", 128),
-        num_heads = config_dict.get("num_heads", 8),
-        num_kv_heads = config_dict.get("num_kv_heads", 8),
-        intermediate_size = config_dict.get("intermediate_size", 512),
-        num_layers = config_dict.get("num_layers", 2),
-        vocab_size = config_dict.get("vocab_size", 1000),
-        rms_norm_eps = config_dict.get("rms_norm_eps", 1e-6),
-        tie_word_embeddings = config_dict.get("tie_word_embeddings", True),
+        hidden_size=config_dict.get("hidden_size", 128),
+        num_heads=config_dict.get("num_heads", 8),
+        num_kv_heads=config_dict.get("num_kv_heads", 8),
+        intermediate_size=config_dict.get("intermediate_size", 512),
+        num_layers=config_dict.get("num_layers", 2),
+        vocab_size=config_dict.get("vocab_size", 1000),
+        rms_norm_eps=config_dict.get("rms_norm_eps", 1e-6),
+        tie_word_embeddings=config_dict.get("tie_word_embeddings", True),
     )
     return config
 
+
 def load_original_model(model_name_or_path: str, device: str = "cuda"):
     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     model = Qwen2ForCausalLM.from_pretrained(
@@ -64,6 +67,7 @@ def load_original_model(model_name_or_path: str, device: str = "cuda"):
 
     return model, tokenizer, hf_sd
 
+
 def load_custom_model(model_dir: str, model_config: Qwen2Config, device: str = "cuda"):
     # 找到 checkpoint 文件
     checkpoints = sorted(Path(model_dir).glob("*.pth"))
@@ -73,89 +77,111 @@ def load_custom_model(model_dir: str, model_config: Qwen2Config, device: str = "
 
     # 初始化自定义模型
     model = Qwen2Model(model_config).to(device)
-    
+
     # Convert model to float16 if on CUDA
     if device == "cuda":
         model = model.half()
-    
+
     # Load state_dict
     model.load_state_dict(state_dict, strict=True)
-    
+
     return model
 
-class Qwen2ModelInferTest():
+
+class Qwen2ModelInferTest:
     def __init__(
-        self, 
+        self,
         checkpoints_dir: str,
         tokenizer_path: str,
-        max_batch_size = 32,
-        max_seq_len = 2048,
+        max_batch_size=32,
+        max_seq_len=2048,
         load_model: bool = True,
         triton_weight: bool = True,
         device: str = "cuda",
     ):
         self.model_executor = ModelExecutor.build(
-            checkpoints_dir = checkpoints_dir,
-            tokenizer_path = tokenizer_path,  # Fixed
-            load_model = load_model,
-            max_batch_size = max_batch_size,
-            max_seq_len = max_seq_len,
-            triton_weight = triton_weight,
-            device = device,
+            checkpoints_dir=checkpoints_dir,
+            tokenizer_path=tokenizer_path,  # Fixed
+            load_model=load_model,
+            max_batch_size=max_batch_size,
+            max_seq_len=max_seq_len,
+            triton_weight=triton_weight,
+            device=device,
         )
 
     def prefill_stage_compare(
         self,
-        original_model, model_executor, tokenizer, 
-        input_text: str, device: str = "cuda"
+        original_model,
+        model_executor,
+        tokenizer,
+        input_text: str,
+        device: str = "cuda",
     ):
         """Prefill stage comparison, including hidden states."""
-        print("\n############################ [Starting Prefill stage comparison] #################################")
+        print(
+            "\n############################ [Starting Prefill stage comparison] #################################"
+        )
         # Prepare input
         inputs = tokenizer(input_text, return_tensors="pt").to(device)
         # Original model output
         with torch.no_grad():
-            original_outputs = original_model(**inputs, output_hidden_states=True, return_dict=True)
+            original_outputs = original_model(
+                **inputs, output_hidden_states=True, return_dict=True
+            )
         original_logits = original_outputs.logits  # [B, S, V]
 
         # Custom model output
-        tokens = inputs['input_ids']  # [B, S]
+        tokens = inputs["input_ids"]  # [B, S]
         with torch.no_grad():
-            custom_outputs, _ = model_executor.forward(tokens, prev_pos = 0)
+            custom_outputs, _ = model_executor.forward(tokens, prev_pos=0)
         custom_logits = custom_outputs  # [B, S, V]
 
         # Compare logits
         if original_logits.shape != custom_logits.shape:
-            print(f"Logits shape mismatch: original {original_logits.shape}, custom {custom_logits.shape}")
+            print(
+                f"Logits shape mismatch: original {original_logits.shape}, custom {custom_logits.shape}"
+            )
             return None
 
         # Compare hidden states
         original_hidden_states = original_outputs.hidden_states  # Tuple of [B, S, D]
-        custom_hidden_states = model_executor.model.hidden_states  # Assuming list of [B, S, D]
+        custom_hidden_states = (
+            model_executor.model.hidden_states
+        )  # Assuming list of [B, S, D]
 
         if len(custom_hidden_states) != len(original_hidden_states):
-            print(f"Number of hidden states layers mismatch: custom {len(custom_hidden_states)}, original {len(original_hidden_states)}")
+            print(
+                f"Number of hidden states layers mismatch: custom {len(custom_hidden_states)}, original {len(original_hidden_states)}"
+            )
             return None
 
-        print(f"model_executor.model.hidden_states number: {len(custom_hidden_states)}, original_outputs.hidden_states number: {len(original_hidden_states)} ")
-        
+        print(
+            f"model_executor.model.hidden_states number: {len(custom_hidden_states)}, original_outputs.hidden_states number: {len(original_hidden_states)} "
+        )
+
         for index in tqdm(range(len(custom_hidden_states)), desc="Comparing layers"):
             custom_layer_output = custom_hidden_states[index]
             original_layer_output = original_hidden_states[index]
-            
+
             if custom_layer_output.shape != original_layer_output.shape:
-                print(f"Layer {index} shape mismatch: custom {custom_layer_output.shape}, original {original_layer_output.shape}")
+                print(
+                    f"Layer {index} shape mismatch: custom {custom_layer_output.shape}, original {original_layer_output.shape}"
+                )
                 continue
 
-            difference = torch.abs(custom_layer_output - original_layer_output).mean().item()
+            difference = (
+                torch.abs(custom_layer_output - original_layer_output).mean().item()
+            )
             print(f"Difference at layer {index}: {difference}")
 
-        # Compare logits        
+        # Compare logits
         logits_diff = torch.abs(original_logits - custom_logits).mean().item()
         print(f"Prefill stage model Logits difference: {logits_diff}")
 
         # Sampling next token
-        original_next_token_logits = original_logits[:, -1, :]  # [B, V] Get logits for last token
+        original_next_token_logits = original_logits[
+            :, -1, :
+        ]  # [B, V] Get logits for last token
         probs = torch.softmax(original_next_token_logits, dim=-1)  # [B, V]
         # Sample next token
         next_token_id = torch.argmax(probs, dim=-1)  # [B]
@@ -167,21 +193,21 @@ def prefill_stage_compare(
         return transformers_generated_text
 
     def decode_stage_compare(
-        self, 
-        original_model, 
-        model_executor, 
-        tokenizer, 
-        input_text: str, 
-        device: str = "cuda"
+        self,
+        original_model,
+        model_executor,
+        tokenizer,
+        input_text: str,
+        device: str = "cuda",
     ):
         """
         Decode stage comparison: step-by-step comparison of outputs.
         """
         # Prepare input
         inputs = tokenizer(input_text, return_tensors="pt").to(device)
-        input_ids = inputs['input_ids']  # [B, S]
-        attention_mask = inputs.get('attention_mask', None)
-    
+        input_ids = inputs["input_ids"]  # [B, S]
+        attention_mask = inputs.get("attention_mask", None)
+
         # Set generation parameters
         max_new_tokens = 10
         original_model.eval()
@@ -191,89 +217,130 @@ def decode_stage_compare(
         original_generated = input_ids
         custom_next_token = input_ids
         original_next_token = input_ids
-        
+
         # 初始化 past_key_values 为 None
         past_key_values = None
         for step in tqdm(range(max_new_tokens), desc="Decoding steps"):
             # 1. Original model generates next token
             with torch.no_grad():
-                original_outputs = original_model(original_next_token, 
-                                                  past_key_values=past_key_values,
-                                                  output_hidden_states=True, 
-                                                  return_dict=True, 
-                                                  use_cache=True
-                                                )
+                original_outputs = original_model(
+                    original_next_token,
+                    past_key_values=past_key_values,
+                    output_hidden_states=True,
+                    return_dict=True,
+                    use_cache=True,
+                )
                 original_logits = original_outputs.logits[:, -1, :]  # [B, V]
 
-                temperature = 0.6 # Apply temperature # custom_outputs_logits: [B, V]
+                temperature = 0.6  # Apply temperature # custom_outputs_logits: [B, V]
                 probs = torch.softmax(original_logits / temperature, dim=-1)  # [B, V]
-                original_next_token = sample_top_p(probs, p=0.9)  # Sample next token [B, 1]
+                original_next_token = sample_top_p(
+                    probs, p=0.9
+                )  # Sample next token [B, 1]
                 # original_next_token = torch.argmax(original_logits, dim=-1, keepdim=True)  # [B, 1]
 
             # 2. Custom model generates next token
             with torch.no_grad():
-                custom_outputs_logits, _ = model_executor.forward(custom_next_token, 
-                                                                  prev_pos = original_generated.shape[1] - 1
-                                                                )
+                custom_outputs_logits, _ = model_executor.forward(
+                    custom_next_token, prev_pos=original_generated.shape[1] - 1
+                )
                 # 确保 custom_outputs_logits 是 [B, V]
                 if custom_outputs_logits.dim() == 3:
                     custom_logits = custom_outputs_logits[:, -1, :]  # [B, V]
                 elif custom_outputs_logits.dim() == 2:
                     custom_logits = custom_outputs_logits  # [B, V]
                 else:
-                    raise ValueError(f"Unexpected custom_outputs_logits dimensions: {custom_outputs_logits.dim()}")
-                
-                temperature = 0.6 # Apply temperature # custom_outputs_logits: [B, V]
+                    raise ValueError(
+                        f"Unexpected custom_outputs_logits dimensions: {custom_outputs_logits.dim()}"
+                    )
+
+                temperature = 0.6  # Apply temperature # custom_outputs_logits: [B, V]
                 probs = torch.softmax(custom_logits / temperature, dim=-1)  # [B, V]
-                custom_next_token = sample_top_p(probs, p=0.9)  # Sample next token [B, 1]
+                custom_next_token = sample_top_p(
+                    probs, p=0.9
+                )  # Sample next token [B, 1]
 
             # Compare hidden states
             original_hidden_states = original_outputs.hidden_states
             custom_hidden_states = model_executor.model.hidden_states
 
             if len(custom_hidden_states) != len(original_hidden_states):
-                print(f"Layer count mismatch: custom {len(custom_hidden_states)}, original {len(original_hidden_states)}")
+                print(
+                    f"Layer count mismatch: custom {len(custom_hidden_states)}, original {len(original_hidden_states)}"
+                )
                 break
 
-            print(f"============== Step {step+1}: Layer Compares: ====================")
-            for index in tqdm(range(len(custom_hidden_states)), desc=f"Step {step+1} Layer Comparison"):
+            print(
+                f"============== Step {step + 1}: Layer Compares: ===================="
+            )
+            for index in tqdm(
+                range(len(custom_hidden_states)),
+                desc=f"Step {step + 1} Layer Comparison",
+            ):
                 custom_layer_output = custom_hidden_states[index]
                 original_layer_output = original_hidden_states[index]
-                
+
                 if custom_layer_output.shape != original_layer_output.shape:
-                    print(f"Step {step+1} Layer {index} shape mismatch: custom {custom_layer_output.shape}, original {original_layer_output.shape}")
+                    print(
+                        f"Step {step + 1} Layer {index} shape mismatch: custom {custom_layer_output.shape}, original {original_layer_output.shape}"
+                    )
                     continue
 
-                difference = torch.abs(custom_layer_output - original_layer_output).mean().item()
-                print(f"Step {step+1} Difference at layer {index}: {difference}")
+                difference = (
+                    torch.abs(custom_layer_output - original_layer_output).mean().item()
+                )
+                print(f"Step {step + 1} Difference at layer {index}: {difference}")
 
             # Compare logits
-            logits_diff = torch.abs(original_logits - custom_outputs_logits).mean().item()
-            print(f"=========== Step {step+1}: Logits difference is: {logits_diff} ================")
+            logits_diff = (
+                torch.abs(original_logits - custom_outputs_logits).mean().item()
+            )
+            print(
+                f"=========== Step {step + 1}: Logits difference is: {logits_diff} ================"
+            )
+
+            # 生成下一个 token, 模型内部已经集成了过去的 kv cache
+            original_generated = torch.cat(
+                [original_generated, original_next_token], dim=-1
+            )
+            past_key_values = original_outputs.past_key_values  # 更新 past_key_values
 
-            # 生成下一个 token, 模型内部已经集成了过去的 kv cache 
-            original_generated = torch.cat([original_generated, original_next_token], dim=-1)
-            past_key_values = original_outputs.past_key_values # 更新 past_key_values
-            
             # Update attention_mask if necessary
             if attention_mask is not None:
-                attention_mask = torch.cat([attention_mask, torch.ones((attention_mask.shape[0], 1), device=device, dtype=attention_mask.dtype)], dim=-1)
+                attention_mask = torch.cat(
+                    [
+                        attention_mask,
+                        torch.ones(
+                            (attention_mask.shape[0], 1),
+                            device=device,
+                            dtype=attention_mask.dtype,
+                        ),
+                    ],
+                    dim=-1,
+                )
 
         print("Decode stage comparison completed.")
 
-    def compare_models(self, original_model, tokenizer, input_text: str, device: str = "cuda"):
-        prefill_output_token = self.prefill_stage_compare(original_model, self.model_executor, tokenizer, input_text, device)
-        self.decode_stage_compare(original_model, self.model_executor, tokenizer, input_text, device)
+    def compare_models(
+        self, original_model, tokenizer, input_text: str, device: str = "cuda"
+    ):
+        prefill_output_token = self.prefill_stage_compare(
+            original_model, self.model_executor, tokenizer, input_text, device
+        )
+        self.decode_stage_compare(
+            original_model, self.model_executor, tokenizer, input_text, device
+        )
+
 
 if __name__ == "__main__":
     device = "cuda" if torch.cuda.is_available() else "cpu"
     # Define model config path
     original_model_path = "/gemini/pretrain/Qwen2.5-3B"
     my_model_path = "/gemini/code/llm_weights/Qwen2.5-3B/"
-    json_file_path = os.path.join(original_model_path, 'config.json') # JSON 文件的路径
+    json_file_path = os.path.join(original_model_path, "config.json")  # JSON 文件的路径
 
     # Load config
-    model_config = load_config_from_json(json_file_path, device) # Load config
+    model_config = load_config_from_json(json_file_path, device)  # Load config
 
     # Load original model and tokenizer
     original_model, tokenizer, hf_sd = load_original_model(original_model_path, device)
@@ -282,17 +349,17 @@ def compare_models(self, original_model, tokenizer, input_text: str, device: str
     # hf_sd = original_model.state_dict()
     # custom_model, _ = convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, model_config, device=device)
     qwen2_test = Qwen2ModelInferTest(
-        checkpoints_dir = my_model_path,
-        tokenizer_path = original_model_path,  # Assuming tokenizer is at original_model_path
-        max_batch_size = 64,
-        max_seq_len = 2048,
-        load_model = True,
-        triton_weight = True,
-        device = device,
+        checkpoints_dir=my_model_path,
+        tokenizer_path=original_model_path,  # Assuming tokenizer is at original_model_path
+        max_batch_size=64,
+        max_seq_len=2048,
+        load_model=True,
+        triton_weight=True,
+        device=device,
     )
 
     # Test text
     test_text = "Once upon a time in a distant land,"
 
     # Compare models
-    qwen2_test.compare_models(original_model, tokenizer, test_text, device)
\ No newline at end of file
+    qwen2_test.compare_models(original_model, tokenizer, test_text, device)
diff --git a/tests/test_rope_forward.py b/tests/test_rope_forward.py
index 71a61d9..3f9c980 100644
--- a/tests/test_rope_forward.py
+++ b/tests/test_rope_forward.py
@@ -102,6 +102,7 @@ def _triton_rope(
     new_k_tile_2 = k_tile_2 * cos_row + k_tile_1 * sin_row
     tl.store(k_ptr + second_half_k_offsets, new_k_tile_2, mask=second_k_mask)
 
+
 def rope_forward(q, k, cos, sin):
     """
     输入 q、k 参数是 4 维张量
@@ -110,7 +111,7 @@ def rope_forward(q, k, cos, sin):
     # note: q and k are incontiguous before the transformation and will become contiguous after transpose
     # q = q.transpose(1, 2)
     # k = k.transpose(1, 2)
-    
+
     batch_size, seq_len, n_q_head, head_dim = q.shape
     n_kv_head = k.shape[2]
     pad_hd = triton.next_power_of_2(head_dim)
@@ -147,12 +148,13 @@ def rope_forward(q, k, cos, sin):
     )
     return q, k
 
+
 def torch_rotary_emb(x, cos, sin):
     seq_len, h, d = x.shape
     # cos, sin 的形状为 (seq_len, d//2)
     half_dim = cos.shape[-1]
     x0 = x[:, :, :half_dim]
-    x1 = x[:, :, half_dim: 2*half_dim]
+    x1 = x[:, :, half_dim : 2 * half_dim]
 
     cos = cos.view(seq_len, 1, half_dim)
     sin = sin.view(seq_len, 1, half_dim)
@@ -161,12 +163,13 @@ def torch_rotary_emb(x, cos, sin):
     o1 = x0 * sin + x1 * cos
 
     if 2 * half_dim < d:
-        out = torch.cat([o0, o1, x[:, :, 2*half_dim:]], dim=-1)
+        out = torch.cat([o0, o1, x[:, :, 2 * half_dim :]], dim=-1)
     else:
         out = torch.cat([o0, o1], dim=-1)
 
     return out
 
+
 if __name__ == "__main__":
     # 单元测试有 bug 等待修复
     torch.manual_seed(0)
@@ -176,26 +179,33 @@ def torch_rotary_emb(x, cos, sin):
     batch_tokens = batch_size * seq_len
     x_shape = (batch_tokens, 32, 64)  # (seq_len, num_heads, head_dim)
     dtype = torch.float16
-    q = torch.randn(x_shape, dtype=dtype, device='cuda')
+    q = torch.randn(x_shape, dtype=dtype, device="cuda")
     k = torch.clone(q)
 
     triton_q = q.view(batch_size, seq_len, 32, 64)
     triton_k = k.view(batch_size, seq_len, 32, 64)
 
     # 生成 cos 和 sin，与 head_dim 对应，这里 head_dim=64，因此 cos, sin=(seq_len, head_dim//2)=(128,32)
-    cos_shape = (batch_tokens, 32)  
-    y = torch.randn(cos_shape, dtype=dtype, device='cuda')
+    cos_shape = (batch_tokens, 32)
+    y = torch.randn(cos_shape, dtype=dtype, device="cuda")
     cos = y.cos()
     sin = y.sin()
-    
+
     triton_cos = cos.view(seq_len, 1, head_dim)
     triton_sin = sin.view(seq_len, 1, head_dim)
 
     output_torch = torch_rotary_emb(q, cos, sin)
     q_out, k_out, _, _ = rope_forward(triton_q, triton_k, triton_cos, triton_cos)
     triton_q_out = q_out.view(-1, 32, 64)
-    print(f"output_torch shape {output_torch.shape}, triton_q_out shape {triton_q_out.shape}")
-    
-    print(f'The maximum difference between torch and triton is {torch.max(torch.abs(output_torch - triton_q_out))}')
-    print('torch:', triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
-    print('triton:', triton.testing.do_bench(lambda: rope_forward(triton_q, triton_k, cos, sin)))
\ No newline at end of file
+    print(
+        f"output_torch shape {output_torch.shape}, triton_q_out shape {triton_q_out.shape}"
+    )
+
+    print(
+        f"The maximum difference between torch and triton is {torch.max(torch.abs(output_torch - triton_q_out))}"
+    )
+    print("torch:", triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
+    print(
+        "triton:",
+        triton.testing.do_bench(lambda: rope_forward(triton_q, triton_k, cos, sin)),
+    )
diff --git a/tests/test_standard_mha.py b/tests/test_standard_mha.py
index e1d5333..cfe24c3 100644
--- a/tests/test_standard_mha.py
+++ b/tests/test_standard_mha.py
@@ -4,78 +4,108 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+
 class MultiHeadAttention(nn.Module):
     def __init__(self, embed_dim, num_heads):
         super(MultiHeadAttention, self).__init__()
         assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
-        
+
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.head_dim = embed_dim // num_heads
-        
+
         # 定义线性变换
         self.query = nn.Linear(embed_dim, embed_dim)
-        self.key   = nn.Linear(embed_dim, embed_dim)
+        self.key = nn.Linear(embed_dim, embed_dim)
         self.value = nn.Linear(embed_dim, embed_dim)
-        
+
         self.out = nn.Linear(embed_dim, embed_dim)
-        
+
     def forward(self, x, mask=None):
         batch_size, seq_length, embed_dim = x.size()
-        
+
         # 线性变换并分成多头
-        Q = self.query(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)  # (batch, heads, seq, head_dim)
-        K = self.key(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)
-        V = self.value(x).view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1,2)
-        
+        Q = (
+            self.query(x)
+            .view(batch_size, seq_length, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )  # (batch, heads, seq, head_dim)
+        K = (
+            self.key(x)
+            .view(batch_size, seq_length, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+        V = (
+            self.value(x)
+            .view(batch_size, seq_length, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+
         # 计算原始注意力分数, # (batch, heads, seq, seq)
-        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)  
+        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim**0.5)
 
-        # 对 scores 应用 masked 
+        # 对 scores 应用 masked
         if mask is not None:
-            masked_scores = scores.masked_fill(mask == 0, float('-inf'))
-        
+            masked_scores = scores.masked_fill(mask == 0, float("-inf"))
+
         # 归一化，将注意力权重分数转为概率分布 dim 维度值相加等于，对于2D张量即每行元素值相加等于 1
         attn_scores = F.softmax(masked_scores, dim=-1)  # (batch, heads, seq, seq)
         # 加权求和 (batch, heads, seq, head_dim)
         context = torch.matmul(attn_scores, V)
-        
-        context = context.transpose(1,2).contiguous().view(batch_size, seq_length, embed_dim) 
+
+        context = (
+            context.transpose(1, 2).contiguous().view(batch_size, seq_length, embed_dim)
+        )
         out = self.out(context)  # 最后的线性变换(batch, seq_length, embed_dim)
-        
-        print(f"mask 矩阵:\n {mask.squeeze()} \n") # 使用 torch.squeeze() 函数来移除张量中所有大小为 1 的维度
+
+        print(
+            f"mask 矩阵:\n {mask.squeeze()} \n"
+        )  # 使用 torch.squeeze() 函数来移除张量中所有大小为 1 的维度
         print(f"原始的注意力分数矩阵:\n {scores.squeeze()} \n")
         print(f"应用 mask 后的注意力分数矩阵:\n {masked_scores.squeeze()} \n")
-        print(f"使用 softmax 归一化后的掩码注意力分数矩阵:\n {attn_scores.squeeze()} \n")
+        print(
+            f"使用 softmax 归一化后的掩码注意力分数矩阵:\n {attn_scores.squeeze()} \n"
+        )
         return out
 
+
 def generate_causal_mask(seq_length):
     """生成一个因果遮罩, 上三角为0, 下三角为1"""
-    mask = torch.tril(torch.ones((seq_length, seq_length))).unsqueeze(0).unsqueeze(0)  # (1, 1, seq, seq)
+    mask = (
+        torch.tril(torch.ones((seq_length, seq_length))).unsqueeze(0).unsqueeze(0)
+    )  # (1, 1, seq, seq)
     return mask  # 1表示可见，0表示遮蔽
 
+
 # 单元测试代码
-def test_multihead_attention(vocab_size = 1000, batch_size = 1, seq_length = 4, embed_dim = 6, num_heads = 2):
-    embedding_layer = nn.Embedding(vocab_size, embed_dim) # 将 input_ids 转为 embedding 向量
-    mha_layer = MultiHeadAttention(embed_dim, num_heads) # 构建 MHA 模块
-
-    torch.manual_seed(0)    
-    input_ids = torch.randint(vocab_size, [batch_size, seq_length]) # 构建输入数据
-    mask = generate_causal_mask(seq_length) # 创建注意力 mask, 默认下三角矩阵(张量)
-    
+def test_multihead_attention(
+    vocab_size=1000, batch_size=1, seq_length=4, embed_dim=6, num_heads=2
+):
+    embedding_layer = nn.Embedding(
+        vocab_size, embed_dim
+    )  # 将 input_ids 转为 embedding 向量
+    mha_layer = MultiHeadAttention(embed_dim, num_heads)  # 构建 MHA 模块
+
+    torch.manual_seed(0)
+    input_ids = torch.randint(vocab_size, [batch_size, seq_length])  # 构建输入数据
+    mask = generate_causal_mask(seq_length)  # 创建注意力 mask, 默认下三角矩阵(张量)
+
     h = embedding_layer(input_ids)
-    output = mha_layer(h, mask) # MHA 前向传播
+    output = mha_layer(h, mask)  # MHA 前向传播
     assert output.shape == (batch_size, seq_length, embed_dim), "输出形状不正确"
-    
+
     # 检查因果遮罩是否有效, 通过设置输入为单位矩阵，观察输出是否遵循因果遮罩
-    x_identity = torch.eye(seq_length, embed_dim).unsqueeze(0).repeat(batch_size,1,1)  # (batch, seq, embed)
+    x_identity = (
+        torch.eye(seq_length, embed_dim).unsqueeze(0).repeat(batch_size, 1, 1)
+    )  # (batch, seq, embed)
     output_identity = mha_layer(x_identity, mask)
-    
+
     # 由于输入是单位矩阵，输出应该保持某种结构，可以进行简单的检查
     assert not torch.isnan(output_identity).any(), "输出包含NaN值"
-    
+
     print("多头注意力输出示例：")
     print(output)
 
+
 if __name__ == "__main__":
     test_multihead_attention()
diff --git a/tests/test_torch_matmul.py b/tests/test_torch_matmul.py
index d266025..f1cb4a9 100644
--- a/tests/test_torch_matmul.py
+++ b/tests/test_torch_matmul.py
@@ -6,30 +6,31 @@
 
 # 是否使用GPU进行测试（如果没有GPU则设为False）
 use_cuda = torch.cuda.is_available()
-device = 'cuda' if use_cuda else 'cpu'
+device = "cuda" if use_cuda else "cpu"
 
 # 测试参数配置
 B_values = [1, 4, 8, 16]  # B: 第1维度大小
 N_values = [32, 64, 128]  # N: 第2维度大小
 D_in_values = [64, 128, 256]  # D_in
-D_out_values = [64, 128, 256] # D_out
+D_out_values = [64, 128, 256]  # D_out
 
 results_matmul = {}
 results_linear = {}
 
+
 def benchmark_op(op, args):
-    t = Timer(
-        stmt='op(*args)',
-        globals={'op': op, 'args': args}
-    )
+    t = Timer(stmt="op(*args)", globals={"op": op, "args": args})
     return t.blocked_autorange(min_run_time=0.1)
 
+
 # 开始测试 3D 输入情况
 # X: [B, N, D_in], W: [D_out, D_in], b: [D_out]
 # matmul: (X @ W.T) + b => [B, N, D_out]
 # linear: F.linear(X, W, b) => [B, N, D_out]
 
-for B, N, D_in, D_out in itertools.product(B_values, N_values, D_in_values, D_out_values):
+for B, N, D_in, D_out in itertools.product(
+    B_values, N_values, D_in_values, D_out_values
+):
     X = torch.randn(B, N, D_in, device=device)
     W = torch.randn(D_out, D_in, device=device)
     b = torch.randn(D_out, device=device)
@@ -48,17 +49,25 @@ def benchmark_op(op, args):
 fixed_D_in = 128
 fixed_D_out = 128
 
-filtered_N = [n for n in N_values if (fixed_B, n, fixed_D_in, fixed_D_out) in results_matmul]
+filtered_N = [
+    n for n in N_values if (fixed_B, n, fixed_D_in, fixed_D_out) in results_matmul
+]
 
-matmul_times = [results_matmul[(fixed_B, n, fixed_D_in, fixed_D_out)] for n in filtered_N]
-linear_times = [results_linear[(fixed_B, n, fixed_D_in, fixed_D_out)] for n in filtered_N]
+matmul_times = [
+    results_matmul[(fixed_B, n, fixed_D_in, fixed_D_out)] for n in filtered_N
+]
+linear_times = [
+    results_linear[(fixed_B, n, fixed_D_in, fixed_D_out)] for n in filtered_N
+]
 
 plt.figure(figsize=(8, 6))
-plt.plot(filtered_N, matmul_times, marker='o', label='matmul (3D X)')
-plt.plot(filtered_N, linear_times, marker='s', label='F.linear (3D X)')
-plt.xlabel('N dimension size')
-plt.ylabel('Median time (s)')
-plt.title(f'Performance comparison at B={fixed_B}, D_in={fixed_D_in}, D_out={fixed_D_out}')
+plt.plot(filtered_N, matmul_times, marker="o", label="matmul (3D X)")
+plt.plot(filtered_N, linear_times, marker="s", label="F.linear (3D X)")
+plt.xlabel("N dimension size")
+plt.ylabel("Median time (s)")
+plt.title(
+    f"Performance comparison at B={fixed_B}, D_in={fixed_D_in}, D_out={fixed_D_out}"
+)
 plt.legend()
 plt.grid(True)
-plt.savefig("./result.png")
\ No newline at end of file
+plt.savefig("./result.png")
diff --git a/tests/test_torch_rope.py b/tests/test_torch_rope.py
index 8f343c4..d476664 100644
--- a/tests/test_torch_rope.py
+++ b/tests/test_torch_rope.py
@@ -3,7 +3,9 @@
 import torch.nn as nn
 
 
-def compute_theta(dim: int, base: float = 500000.0, device: torch.device = torch.device('cuda')) -> torch.Tensor:
+def compute_theta(
+    dim: int, base: float = 500000.0, device: torch.device = torch.device("cuda")
+) -> torch.Tensor:
     """
     计算旋转位置编码中的 Theta 角度值。
 
@@ -17,31 +19,45 @@ def compute_theta(dim: int, base: float = 500000.0, device: torch.device = torch
     """
     if dim % 2 != 0:
         print("嵌入维度 dim 必须为偶数")
-    i = torch.arange(1, (dim//2) + 1, dtype=torch.float32, device=device)
-    theta_i = base ** (-2*(i - 1) / dim)
+    i = torch.arange(1, (dim // 2) + 1, dtype=torch.float32, device=device)
+    theta_i = base ** (-2 * (i - 1) / dim)
 
     return theta_i
 
-def precompute_freqs_cis(dim: int, seq_len: int, base: float = 500000.0, device: torch.device = torch.device('cuda')):
-    theta = compute_theta(dim, base, device) # theta 角度值序列，向量, 大小为 dim // 2
-    m = torch.arange(seq_len, device=device) # # token 位置值序列，向量，大小为 seq_len
-    m_theta = torch.outer(m, theta) # 所有 token 位置的所有 Theta 值范围, 矩阵，尺寸为 [seq_len, dim // 2]
-    freqs_cis = torch.polar(torch.ones_like(m_theta), m_theta) # e^{i*m*\theta}，本质上是旋转矩阵
+
+def precompute_freqs_cis(
+    dim: int,
+    seq_len: int,
+    base: float = 500000.0,
+    device: torch.device = torch.device("cuda"),
+):
+    theta = compute_theta(dim, base, device)  # theta 角度值序列，向量, 大小为 dim // 2
+    m = torch.arange(seq_len, device=device)  # # token 位置值序列，向量，大小为 seq_len
+    m_theta = torch.outer(
+        m, theta
+    )  # 所有 token 位置的所有 Theta 值范围, 矩阵，尺寸为 [seq_len, dim // 2]
+    freqs_cis = torch.polar(
+        torch.ones_like(m_theta), m_theta
+    )  # e^{i*m*\theta}，本质上是旋转矩阵
 
     return freqs_cis
 
+
 def reshape_for_broadcast(freqs_cis, x):
     ndim = x.ndim
     assert ndim >= 2
-    assert freqs_cis.shape == (x.shape[1],x.shape[-1]), "the last two dimension of freqs_cis, x must match"
-    shape = [d if i==1 or i==ndim-1 else 1 for i,d in enumerate(x.shape)]
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (
+        "the last two dimension of freqs_cis, x must match"
+    )
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
     return freqs_cis.view(*shape)
 
+
 def apply_rotary_emb(
-    xq: torch.Tensor, 
-    xk: torch.Tensor, 
+    xq: torch.Tensor,
+    xk: torch.Tensor,
     freqs_cis: torch.Tensor,
-    device: torch.device = torch.device('cpu')
+    device: torch.device = torch.device("cpu"),
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     参数:
@@ -52,24 +68,32 @@ def apply_rotary_emb(
         - Tuple[torch.Tensor, torch.Tensor]: 旋转编码后的查询和键张量
     """
     # 实数域张量转为复数域张量
-    xq_reshape = xq.reshape(*xq.shape[:-1], -1, 2) # [batch_size, seq_len, dim] -> [batch_size, seq_len, dim//2, 2] 
-    xk_reshape = xk.reshape(*xk.shape[:-1], -1, 2) # [batch_size, seq_len, dim] -> [batch_size, seq_len, dim//2, 2] 
-    xq_complex = torch.view_as_complex(xq_reshape) # 复数形式张量
-    xk_complex = torch.view_as_complex(xk_reshape) # 复数形式张量
+    xq_reshape = xq.reshape(
+        *xq.shape[:-1], -1, 2
+    )  # [batch_size, seq_len, dim] -> [batch_size, seq_len, dim//2, 2]
+    xk_reshape = xk.reshape(
+        *xk.shape[:-1], -1, 2
+    )  # [batch_size, seq_len, dim] -> [batch_size, seq_len, dim//2, 2]
+    xq_complex = torch.view_as_complex(xq_reshape)  # 复数形式张量
+    xk_complex = torch.view_as_complex(xk_reshape)  # 复数形式张量
 
     # 旋转矩阵（freqs_cis）的维度在序列长度（seq_len，维度 1）和头部维度（head_dim，维度 3）上需要与嵌入的维度一致。
     # 此外，freqs_cis 的形状必须与 xq 和 xk 相匹配，因此我们需要将 freqs_cis 的形状从 [max_seq_len, head_dim] 调整为 [1, max_seq_len, 1, head_dim]。
-    freqs_cis = reshape_for_broadcast(freqs_cis, xq_complex) # [max_seq_len, 1, 1, dim // 2]
+    freqs_cis = reshape_for_broadcast(
+        freqs_cis, xq_complex
+    )  # [max_seq_len, 1, 1, dim // 2]
 
     # 应用旋转操作，并将结果转回实数域
-    xq_out = torch.view_as_real(xq_complex * freqs_cis).flatten(3) # flatten(2) 将后面两个维度压成一个维度
+    xq_out = torch.view_as_real(xq_complex * freqs_cis).flatten(
+        3
+    )  # flatten(2) 将后面两个维度压成一个维度
     xk_out = torch.view_as_real(xk_complex * freqs_cis).flatten(3)
 
     return xq_out.type_as(xq), xk_out.type_as(xk)
 
 
 def _compute_default_rope_parameters(
-    config, 
+    config,
     device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
     **rope_kwargs,
@@ -99,16 +123,23 @@ def _compute_default_rope_parameters(
         dim = rope_kwargs["dim"]
     elif config is not None:
         base = config.rope_theta
-        partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+        partial_rotary_factor = (
+            config.partial_rotary_factor
+            if hasattr(config, "partial_rotary_factor")
+            else 1.0
+        )
         head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
         dim = int(head_dim * partial_rotary_factor)
 
     attention_factor = 1.0  # Unused in this type of RoPE
 
     # Compute the inverse frequencies
-    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)
+    )
     return inv_freq, attention_factor
 
+
 def _compute_llama3_parameters(
     config, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
 ) -> Tuple["torch.Tensor", float]:
@@ -129,32 +160,48 @@ def _compute_llama3_parameters(
         post-processing scaling factor applied to the computed cos/sin.
     """
     # Gets the default RoPE parameters
-    inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
+    inv_freq, attention_factor = _compute_default_rope_parameters(
+        config, device, seq_len, **rope_kwargs
+    )
 
     factor = config.rope_scaling["factor"]  # `8` in the original implementation
-    low_freq_factor = config.rope_scaling["low_freq_factor"]  # `1` in the original implementation
-    high_freq_factor = config.rope_scaling["high_freq_factor"]  # `4` in the original implementation
-    old_context_len = config.rope_scaling["original_max_position_embeddings"]  # `8192` in the original implementation
+    low_freq_factor = config.rope_scaling[
+        "low_freq_factor"
+    ]  # `1` in the original implementation
+    high_freq_factor = config.rope_scaling[
+        "high_freq_factor"
+    ]  # `4` in the original implementation
+    old_context_len = config.rope_scaling[
+        "original_max_position_embeddings"
+    ]  # `8192` in the original implementation
 
     low_freq_wavelen = old_context_len / low_freq_factor
     high_freq_wavelen = old_context_len / high_freq_factor
 
     wavelen = 2 * math.pi / inv_freq
     # wavelen < high_freq_wavelen: do nothing; wavelen > low_freq_wavelen: divide by factor
-    inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
+    inv_freq_llama = torch.where(
+        wavelen > low_freq_wavelen, inv_freq / factor, inv_freq
+    )
     # otherwise: interpolate between the two, using a smooth factor
-    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
-    smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
+    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (
+        high_freq_factor - low_freq_factor
+    )
+    smoothed_inv_freq = (
+        1 - smooth_factor
+    ) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
     is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
     inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
 
     return inv_freq_llama, attention_factor
 
+
 ROPE_INIT_FUNCTIONS = {
     "default": _compute_default_rope_parameters,
     "llama3": _compute_llama3_parameters,
 }
 
+
 class LlamaRotaryEmbedding(nn.Module):
     def __init__(
         self,
@@ -164,7 +211,7 @@ def __init__(
         device=None,
         scaling_factor=1.0,
         rope_type="default",
-        config = None,
+        config=None,
     ):
         super().__init__()
         # TODO (joao): remove the `if` below, only used for BC
@@ -183,7 +230,9 @@ def __init__(
         else:
             # BC: "rope_type" was originally "type"
             if config.rope_scaling is not None:
-                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+                self.rope_type = config.rope_scaling.get(
+                    "rope_type", config.rope_scaling.get("type")
+                )
             else:
                 self.rope_type = "default"
 
@@ -193,7 +242,9 @@ def __init__(
         self.config = config
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        inv_freq, self.attention_scaling = self.rope_init_fn(
+            self.config, device, **self.rope_kwargs
+        )
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
 
@@ -208,10 +259,15 @@ def _dynamic_frequency_update(self, position_ids, device):
             inv_freq, self.attention_scaling = self.rope_init_fn(
                 self.config, device, seq_len=seq_len, **self.rope_kwargs
             )
-            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
             self.max_seq_len_cached = seq_len
 
-        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
             self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
             self.max_seq_len_cached = self.original_max_seq_len
 
@@ -221,13 +277,21 @@ def forward(self, x, position_ids):
             self._dynamic_frequency_update(position_ids, device=x.device)
 
         # Core RoPE block
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        )
         position_ids_expanded = position_ids[:, None, :].float()
         # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
         device_type = x.device.type
-        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        device_type = (
+            device_type
+            if isinstance(device_type, str) and device_type != "mps"
+            else "cpu"
+        )
         with torch.autocast(device_type=device_type, enabled=False):
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            freqs = (
+                inv_freq_expanded.float() @ position_ids_expanded.float()
+            ).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos()
             sin = emb.sin()
@@ -245,6 +309,7 @@ def rotate_half(x):
     x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
+
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
 
@@ -269,4 +334,4 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     sin = sin.unsqueeze(unsqueeze_dim)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
\ No newline at end of file
+    return q_embed, k_embed
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
index 3b3ae5d..f94820b 100644
--- a/tests/test_transformers.py
+++ b/tests/test_transformers.py
@@ -10,18 +10,14 @@
     pipeline,
 )
 
-checkpoints_dir='/gemini/code/Llama-3.2-1B-Instruct/original/'
+checkpoints_dir = "/gemini/code/Llama-3.2-1B-Instruct/original/"
 with open(Path(checkpoints_dir) / "params.json", "r") as f:
     params = json.loads(f.read())
 
 print("model params ", params)
 
 # 打印自定义 llama 模型结构
-ModelArgs = ModelArgs(
-            max_seq_len=2048,
-            max_batch_size=2,
-            device="cuda",
-            **params)
+ModelArgs = ModelArgs(max_seq_len=2048, max_batch_size=2, device="cuda", **params)
 
 model = Llama(ModelArgs)
 model.eval()
@@ -48,7 +44,10 @@
 
 # torch.load 加载模型权重参数并打印 keys; print("llama-3.2-1b torch weights name ", state_dict.keys())
 print("\n AutoModelForCausalLM archetectue and shape")
-state_dict = torch.load('/gemini/code/Llama-3.2-1B-Instruct/original/consolidated.00.pth', map_location='cuda') # 加载模型权重文件
+state_dict = torch.load(
+    "/gemini/code/Llama-3.2-1B-Instruct/original/consolidated.00.pth",
+    map_location="cuda",
+)  # 加载模型权重文件
 for name, param in state_dict.items():
     print(name, param.shape)
 
@@ -57,7 +56,13 @@
 #     print(name, module)
 
 # 打印 transformers 库 AutoModelForCausalLM 模型结构
-from transformers import AutoModelForCausalLM, AutoTokenizer,AutoModel,LlamaForCausalLM,AutoConfig
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoModel,
+    LlamaForCausalLM,
+    AutoConfig,
+)
 
 model_checkpoint = "/gemini/code/Llama-3.2-1B-Instruct"
 model = LlamaForCausalLM.from_pretrained(
@@ -103,4 +108,4 @@
   )
   (lm_head): Linear(in_features=2048, out_features=128256, bias=False)
 )
-"""
\ No newline at end of file
+"""
diff --git a/utils/__init__.py b/utils/__init__.py
deleted file mode 100644
index e69de29..0000000

From 5580283f7cb8b51d8b2c83cad9932f622006425a Mon Sep 17 00:00:00 2001
From: "zhanghonggao.zhg" <zhanghonggao.zhg@alibaba-inc.com>
Date: Mon, 19 May 2025 21:37:30 +0800
Subject: [PATCH 08/33] fix: code structure

---
 apply_weight_convert.py             | 18 +-----------------
 examples/example_chat.py            |  2 +-
 lite_llama/inference.py             | 14 +++++++-------
 lite_llama/llava_generate_stream.py | 22 +++++++++++-----------
 requirement.txt                     |  2 +-
 5 files changed, 21 insertions(+), 37 deletions(-)

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index c8901b0..73abf81 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -56,20 +56,4 @@
     print("num_layers: ", num_layers)
     convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
 else:
-    print("Error! Unsupported model type!")
-
-# from transformers import LlavaNextConfig, LlavaNextForConditionalGeneration
-# from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-# from lite_llama.models.llava import LlavaLlama
-# from lite_llama.models.model_config import LlamaConfig
-
-# with init_empty_weights():
-#     llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-#     text_config = llava_config.text_config # TODO: 将 text_config 转换成 LlamaConfig 类型
-#     llama_config = LlamaConfig.from_dict(text_config.to_dict())
-
-# 使用 init_empty_weights 初始化空模型
-# with init_empty_weights():
-#     llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-#     model = LlavaLlama(llava_config)
-#     llama_config = model.llama_config
+    print("Error! Unsupported model type!")
\ No newline at end of file
diff --git a/examples/example_chat.py b/examples/example_chat.py
index 0e9b542..9989748 100644
--- a/examples/example_chat.py
+++ b/examples/example_chat.py
@@ -11,7 +11,7 @@
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
 checkpoints_dir = (
-    "/homg/honggao/lite_llama/my_weight/Qwen2.5-3B"  # 改成自己的存放模型路径
+    "/path/lite_llama/my_weight/Qwen2.5-3B"  # 改成自己的存放模型路径
 )
 
 
diff --git a/lite_llama/inference.py b/lite_llama/inference.py
index fe16b06..0224445 100644
--- a/lite_llama/inference.py
+++ b/lite_llama/inference.py
@@ -1,12 +1,10 @@
 from typing import Optional
 import torch
 
-import sys, os, time
+import time
 
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
-
-from lite_llama.utils.prompt_templates import get_prompter
-from lite_llama.generate import GenerateText
+from .utils.prompt_templates import get_prompter
+from .generate import GenerateText
 
 
 class Inference(object):
@@ -52,10 +50,12 @@ def count_tokens(self, texts: list[str], tokenizer) -> int:
 
     def inference(self, generator: GenerateText, prompts: list[str]):
         """
-        Inference is performed using lite-llama's GenerateText instance and returns the result with the time taken and the number of tokens output
+        Inference is performed using lite-llama's GenerateText instance and returns 
+        the result with the time taken and the number of tokens output
         """
 
-        # Warm-up step: use a short dummy input to allow the model to perform a simple inference to load caches/compile optimizations, etc.
+        # Warm-up step: use a short dummy input to allow the model to 
+        # perform a simple inference to load caches/compile optimizations, etc.
         warm_up_prompt = ["Hello World"] * 4
         _ = generator.text_completion(
             warm_up_prompt,
diff --git a/lite_llama/llava_generate_stream.py b/lite_llama/llava_generate_stream.py
index 45982bf..94ae11b 100644
--- a/lite_llama/llava_generate_stream.py
+++ b/lite_llama/llava_generate_stream.py
@@ -2,7 +2,7 @@
 import torch, logging, re
 from PIL import Image
 
-from typing import List, Optional, Tuple, TypedDict, Generator, Union
+from typing import Optional, TypedDict, Generator, Union
 from .executor.model_executor import ModelExecutor
 from .utils.constants import *
 from .utils.file_interface import get_model_name_from_path
@@ -16,8 +16,8 @@
 
 class CompletionPrediction(TypedDict, total=False):
     generation: str
-    tokens: List[str]  # not required
-    logprobs: List[float]  # not required
+    tokens: list[str]  # not required
+    logprobs: list[float]  # not required
 
 
 def tokenizer_image_token(
@@ -118,7 +118,7 @@ def load_tokenizer(self, pretrained_model_name_or_path):
 
         return tokenizer
 
-    def encode_images(self, image_items: List[Union[str, Image.Image]]):
+    def encode_images(self, image_items: list[Union[str, Image.Image]]):
         processor = AutoProcessor.from_pretrained(self.checkpoints_dir)
         self.image_processor = processor.image_processor
         images = []
@@ -148,18 +148,18 @@ def encode_images(self, image_items: List[Union[str, Image.Image]]):
     @torch.inference_mode()
     def generate_stream(
         self,
-        prompt_tokens: List[List[int]],
+        prompt_tokens: list[list[int]],
         image_tensors: Optional[torch.FloatTensor] = None,
         max_gen_len: int = 2048,
         temperature: float = 0.6,
         top_p: float = 0.9,
         echo: bool = False,
-    ) -> Generator[Tuple[List[str], Optional[List[float]]], None, None]:
+    ) -> Generator[tuple[list[str], Optional[list[float]]], None, None]:
         """
         基于提供的 prompt_tokens, 使用语言生成模型逐个生成 token, 并在生成时立即输出。
 
         参数：
-            prompt_tokens (List[List[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
+            prompt_tokens (list[list[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
             max_gen_len (int): 生成的最大长度。
             temperature (float, optional): 控制采样随机性的温度值。默认为 0.6。
             top_p (float, optional): 用于 nucleus sampling 的概率阈值。默认为 0.9。
@@ -167,7 +167,7 @@ def generate_stream(
             echo (bool, optional): 是否在输出中包含 prompt_tokens。默认为 False。
 
         generator 输出：
-            Tuple[List[str], Optional[List[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
+            tuple[list[str], Optional[list[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
         说明：
             该方法在生成循环中，每生成一个新 token, 就立即输出对应的文本和概率(如果需要）。
         """
@@ -272,13 +272,13 @@ def generate_stream(
 
     def text_completion_stream(
         self,
-        prompts: List[str],
-        image_items: List[Union[str, Image.Image]],
+        prompts: list[str],
+        image_items: list[Union[str, Image.Image]],
         temperature: float = 0.6,
         top_p: float = 0.9,
         max_gen_len: Optional[int] = None,
         echo: bool = False,
-    ) -> Generator[List[CompletionPrediction], None, None]:
+    ) -> Generator[list[CompletionPrediction], None, None]:
         """每次迭代时，生成器返回一个包含多个 CompletionPrediction 字典的列表。"""
 
         if max_gen_len is None:
diff --git a/requirement.txt b/requirement.txt
index 56c5283..518ad6b 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -1,6 +1,6 @@
 tokenizers==0.20.3
 huggingface-hub==0.24.6
-transformers==4.41
+transformers==4.46.3
 torch=2.1.2
 triton>=2.1.0
 tqdm==4.65.0

From a5665fd290bc54b9681b739b6b4c9b62d9ada1b2 Mon Sep 17 00:00:00 2001
From: "zhanghonggao.zhg" <zhanghonggao.zhg@alibaba-inc.com>
Date: Mon, 19 May 2025 21:43:55 +0800
Subject: [PATCH 09/33] chore: update logs directory

---
 lite_llama/utils/logger.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lite_llama/utils/logger.py b/lite_llama/utils/logger.py
index 1d210db..f46a563 100644
--- a/lite_llama/utils/logger.py
+++ b/lite_llama/utils/logger.py
@@ -104,9 +104,9 @@ def logfileHandle(log_name="logs/common.log"):
 
 
 log = loggerHandle()
-logE = logfileHandle("logs/error.log")
-logP = logfileHandle("logs/post.log")
-logU = logfileHandle("logs/upload_data.log")
+logE = logfileHandle("../logs/error.log")
+logP = logfileHandle("../logs/post.log")
+logU = logfileHandle("../logs/upload_data.log")
 
 if __name__ == "__main__":
     

From 5997bc94fda6c7fe2cf7b15f84cb71f7cdba8fb6 Mon Sep 17 00:00:00 2001
From: "zhanghonggao.zhg" <zhanghonggao.zhg@alibaba-inc.com>
Date: Tue, 20 May 2025 10:48:59 +0800
Subject: [PATCH 10/33] fix: remove repeat file

---
 examples/evaluator/eval_acc.py | 57 ----------------------------------
 examples/example_eval_acc.py   |  2 +-
 2 files changed, 1 insertion(+), 58 deletions(-)
 delete mode 100644 examples/evaluator/eval_acc.py

diff --git a/examples/evaluator/eval_acc.py b/examples/evaluator/eval_acc.py
deleted file mode 100644
index 4ac65e2..0000000
--- a/examples/evaluator/eval_acc.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import warnings
-
-warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-import torch
-
-from eval import *
-import sys, os
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
-from lite_llama.inference import Inference
-
-class EvaluatorAccuracy(object):
-    def __init__(self, test_data_path, custom_checkpoints_dir, data_batch=10):
-        self.custom_checkpoints_dir = custom_checkpoints_dir
-        self.test_data_path = test_data_path
-        self.data_batch = data_batch
-
-        # init inference
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        self.model_inference = Inference(
-            temperature=0.7,
-            top_p=0.8,
-            max_seq_len=2048,
-            max_gen_len=1900,
-            lite_llama_ckpt_dir=self.custom_checkpoints_dir,
-            device=self.device,
-        )
-
-    def process(
-        self,
-    ):
-        if "hotpot" in self.test_data_path.lower():
-            data_obj = HotpotQA(self.test_data_path, self.data_batch)
-
-        elif "hellaswag" in self.test_data_path.lower():
-            data_obj = HellaSwag(self.test_data_path, self.data_batch)
-
-        try:
-            assert data_obj is not None, "data_obj has not been created"
-        except NameError:
-            raise AssertionError("Dataset may not be supported")
-
-        ground_truth, prompts, options = data_obj.parse_data()
-
-        predictions = self.model_inference.process(prompts)
-
-        if data_obj.data_type == "mcq":
-            data_obj.evaluate(predictions, ground_truth, options)
-        else:
-            data_obj.evaluate(predictions, ground_truth)
-
-
-if __name__ == "__main__":
-    ea = EvaluatorAccuracy(
-        "/path_to/hotpot_dev_distractor_v1.json", "/path_to/Llama-3.2-3B-Instruct"
-    )
-    ea.process()
diff --git a/examples/example_eval_acc.py b/examples/example_eval_acc.py
index 7ea940e..326d533 100644
--- a/examples/example_eval_acc.py
+++ b/examples/example_eval_acc.py
@@ -3,7 +3,7 @@
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 import torch
 
-from evaluator.eval import *
+from .evaluator.eval import *
 
 import sys, os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))

From 4b83355267f0b097c2f96a75de658bdf809ba899 Mon Sep 17 00:00:00 2001
From: "zhanghonggao.zhg" <zhanghonggao.zhg@alibaba-inc.com>
Date: Tue, 20 May 2025 12:02:47 +0800
Subject: [PATCH 11/33] fix: logger directory can't create

---
 docs/Qwen2ForCausalLM.md   |   4 +-
 lite_llama/utils/logger.py |   6 +-
 tests/test_transformers.py | 257 +++++++++++++++++++++----------------
 3 files changed, 153 insertions(+), 114 deletions(-)

diff --git a/docs/Qwen2ForCausalLM.md b/docs/Qwen2ForCausalLM.md
index b16fcdc..6fbf2bd 100644
--- a/docs/Qwen2ForCausalLM.md
+++ b/docs/Qwen2ForCausalLM.md
@@ -1,6 +1,6 @@
 ## hf qwen2 模型结构信息
 
-### 模型结构
+### Qwen2.5-1.5B 模型结构
 
 ```bash
 Qwen2ForCausalLM(
@@ -377,7 +377,7 @@ model.norm.weight torch.Size([1536])
 
 ## 自定义 qwen2 模型结构信息
 
-### 模型结构
+### Qwen2.5-1.5B 模型结构
 
 ```bash
 Qwen2Model(
diff --git a/lite_llama/utils/logger.py b/lite_llama/utils/logger.py
index f46a563..e93588c 100644
--- a/lite_llama/utils/logger.py
+++ b/lite_llama/utils/logger.py
@@ -84,11 +84,11 @@ def loggerHandle():
     return logger
 
 
-def logfileHandle(log_name="logs/common.log"):
+def logfileHandle(log_name="../logs/common.log"):
     project_path = getProjectPath()
     log_file = os.path.join(project_path, log_name)
-    if not os.path.exists(os.path.join(project_path, "logs")):
-        os.makedirs(os.path.join(project_path, "logs"))
+    if not os.path.exists(os.path.join(project_path, "../logs")):
+        os.makedirs(os.path.join(project_path, "../logs"))
     if not os.path.exists(log_file):
         os.mknod(log_file)
     logfile = logging.getLogger()
diff --git a/tests/test_transformers.py b/tests/test_transformers.py
index f94820b..569bb4f 100644
--- a/tests/test_transformers.py
+++ b/tests/test_transformers.py
@@ -1,111 +1,150 @@
-from lite_llama.lite_llama.models.llama import Llama, ModelArgs
-from pathlib import Path
-import json
 import torch
+from typing import Union, TextIO, Optional
+import torch
+from transformers import AutoModel, AutoConfig, PreTrainedModel
+from accelerate import init_empty_weights
+
+MODEL_ID = "/home/honggao/llm_weights/Qwen3-30B-A3B"
+
+
+def print_init_model(model_id):
+	"""
+	Accelerate 提供 init_empty_weights 上下文管理器，令所有 Parameter 和 Buffer 
+	都放在 meta device，尺寸为 0，因此既 不下载权重 也 不占内存。
+	"""
+	cfg = AutoConfig.from_pretrained(model_id)   # 只拉配置
+
+	with init_empty_weights(): 
+		model = AutoModel.from_config(cfg)
+		print(model)
+		
+def print_transformers_model_summary(
+    model: PreTrainedModel,
+    *,
+    use_torchinfo: bool = False,
+    input_size: Optional[tuple] = None,
+    file: Union[str, TextIO, None] = None,
+) -> None:
+    """
+    打印 Hugging Face Transformers 模型结构 + 权重 shape。
+    
+    Args:
+        model (PreTrainedModel): 已加载好的模型实例。
+        use_torchinfo (bool): 是否调用 torchinfo.summary() 生成额外摘要。
+        input_size (tuple): 当 use_torchinfo=True 时需提供 (seq_len, ) or (bs, seq_len, ...)。
+        file: None  -> 输出到 stdout；
+              str   -> 输出到指定路径文件；
+              TextIO -> 已打开的文件句柄。
+    """
+    import math
+
+    def _human_readable(num: float, *, base: int = 1000, units=("", "K", "M", "G", "T", "P"), suffix=""):
+        """Convert a large number to human‑readable form (e.g. 12.3G)."""
+        if num == 0:
+            return f"0{suffix}"
+        exp = min(int(math.log(num, base)), len(units) - 1)
+        value = num / (base ** exp)
+        return f"{value:.2f}{units[exp]}{suffix}"
+
+    def _dump(msg: str = ""):
+        if fh:
+            fh.write(msg + "\n")
+        else:
+            print(msg)
+
+    # 0) 处理输出目标
+    fh = open(file, "w") if isinstance(file, str) else file
+
+    # 1) 模型 __repr__
+    _dump("=" * 60)
+    _dump("Model architecture (__repr__):")
+    _dump("=" * 60)
+    _dump(str(model))
+    
+    # 2) 权重 shape
+    _dump("\n" + "=" * 60)
+    _dump("Parameter shapes (name -> shape, #elements):")
+    _dump("=" * 60)
+
+    # Token count estimation for FLOPs (default = 1 token if unknown)
+    tokens = 1
+    if input_size is not None:
+        # Accept (seq_len,), (bs, seq_len) or any shape where last dim is seq_len
+        if len(input_size) == 1:
+            tokens = input_size[0]
+        else:
+            tokens = input_size[0] * input_size[-1]
+
+    total_params = 0
+    total_flops = 0
+    total_mem_bytes = 0
+    for name, param in model.named_parameters():
+        numel = param.numel()
+        total_params += numel
+
+        # ---- Estimate per‑parameter FLOPs ----
+        if param.dim() == 2:  # typical (out, in) weight matrix
+            flops = 2 * param.shape[0] * param.shape[1] * tokens
+        elif param.dim() == 1:  # bias / norm weight
+            flops = param.shape[0] * tokens
+        else:
+            flops = numel  # fallback crude estimate
+        total_flops += flops
+
+        # ---- Memory access cost (parameter bytes only) ----
+        mem_bytes = numel * param.element_size()
+        total_mem_bytes += mem_bytes
+
+        # ---- Pretty print ----
+        flops_str = _human_readable(flops, suffix="F")
+        mem_str = _human_readable(mem_bytes, base=1024, units=("B","KB","MB","GB","TB","PB"))
+        _dump(f"{name:<60} {str(tuple(param.shape)):<20} {numel:,}  |  {flops_str:<8}  |  {mem_str}")
+
+    _dump(f"\nTotal parameters: {total_params:,}")
+    _dump(f"Estimated forward FLOPs: {_human_readable(total_flops, suffix='F')}")
+    _dump(f"Parameter memory: {_human_readable(total_mem_bytes, base=1024, units=('B','KB','MB','GB','TB','PB'))}")
+
+    # 3) 可选 torchinfo 摘要
+    if use_torchinfo:
+        try:
+            from torchinfo import summary  # pip install torchinfo
+            assert input_size is not None, "`input_size` must be provided when use_torchinfo=True"
+            info = summary(
+                model,
+                input_size=input_size,
+                depth=3,
+                col_names=("kernel_size", "output_size", "num_params", "mult_adds"),
+                dtypes=[torch.long],  # 对 NLP 模型输入通常是 int64 token id
+            )
+            _dump("\n" + "=" * 60)
+            _dump("torchinfo summary():")
+            _dump("=" * 60)
+            _dump(str(info))
+        except ImportError:
+            _dump("torchinfo 未安装，跳过摘要。pip install torchinfo 获取更丰富视图。")
+    
+    if isinstance(file, str):  # 自动关闭文件
+        fh.close()
+
+from torchviz import make_dot  # pip install torchviz graphviz
+def save_model_graph(model, input_example: torch.Tensor, file_name: str = "model_graph.svg") -> None:
+    """
+    利用 torchviz 生成前向图；input_example 必须能直接送入 model。
+    """
+    model.eval()
+    y = model(input_example)
+    dot = make_dot(y, params=dict(model.named_parameters()))
+    dot.format = file_name.split(".")[-1]  # 自动根据后缀决定 svg/png
+    dot.render(file_name, cleanup=True)
+    print(f"✅ Graph saved to {file_name}")
 
-from transformers import (
-    AutoModelForCausalLM,
-    LlamaForCausalLM,
-    AutoTokenizer,
-    pipeline,
-)
-
-checkpoints_dir = "/gemini/code/Llama-3.2-1B-Instruct/original/"
-with open(Path(checkpoints_dir) / "params.json", "r") as f:
-    params = json.loads(f.read())
-
-print("model params ", params)
-
-# 打印自定义 llama 模型结构
-ModelArgs = ModelArgs(max_seq_len=2048, max_batch_size=2, device="cuda", **params)
-
-model = Llama(ModelArgs)
-model.eval()
-print(model)
-
-print("my Llama all parameters:", model.state_dict().keys())
-
-# named_parameters() 方法可以返回模型中所有参数的名称和参数（即权重和偏置）。
-print("my llama archetectue and shape")
-for name, param in model.named_parameters():
-    print(name, param.shape)
-
-"""
-Llama(
-  (tok_embeddings): Embedding(128256, 2048)
-  (layers): ModuleList(
-    (0-15): 16 x LlamaDecoderLayer(
-      (attention): FusedAttention()
-      (feed_forward): FusedMLP()
-    )
-  )
-)
-"""
-
-# torch.load 加载模型权重参数并打印 keys; print("llama-3.2-1b torch weights name ", state_dict.keys())
-print("\n AutoModelForCausalLM archetectue and shape")
-state_dict = torch.load(
-    "/gemini/code/Llama-3.2-1B-Instruct/original/consolidated.00.pth",
-    map_location="cuda",
-)  # 加载模型权重文件
-for name, param in state_dict.items():
-    print(name, param.shape)
-
-# 打印所有模块的名称和模块
-# for name, module in model.named_modules():
-#     print(name, module)
-
-# 打印 transformers 库 AutoModelForCausalLM 模型结构
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    AutoModel,
-    LlamaForCausalLM,
-    AutoConfig,
-)
-
-model_checkpoint = "/gemini/code/Llama-3.2-1B-Instruct"
-model = LlamaForCausalLM.from_pretrained(
-    model_checkpoint,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
-
-print(model)
-print("LlamaForCausalLM all parameters:", model.state_dict().keys())
-# named_parameters() 方法可以返回模型中所有参数的名称和参数（即权重和偏置）。
-for name, param in model.named_parameters():
-    print(name, param.shape)
-
-tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
-"""
-LlamaForCausalLM(
-  (model): LlamaModel(
-    (embed_tokens): Embedding(128256, 2048)
-    (layers): ModuleList(
-      (0-15): 16 x LlamaDecoderLayer(
-        (self_attn): LlamaSdpaAttention(
-          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
-          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
-          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
-          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
-          (rotary_emb): LlamaRotaryEmbedding()
-        )
-        (mlp): LlamaMLP(
-          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
-          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
-          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
-          (act_fn): SiLU()
-        )
-        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
-        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
-      )
-    )
-    (norm): LlamaRMSNorm((2048,), eps=1e-05)
-    (rotary_emb): LlamaRotaryEmbedding()
-  )
-  (lm_head): Linear(in_features=2048, out_features=128256, bias=False)
-)
-"""
+if __name__ == "__main__":
+	model = AutoModel.from_pretrained(MODEL_ID)
+	input_example = torch.randint(0, 1000, (2, 2048))  # 随机输入
+	print_transformers_model_summary(
+		model=model,
+		use_torchinfo=True,
+		input_size=(2, 2048),
+		file="transformers_model_structure.txt"
+	)
+	# save_model_graph(model, input_example, "transformers_model_graph.svg")
\ No newline at end of file

From 15acd74d49aa121cc31ff8863fd79648c24a006d Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Wed, 21 May 2025 15:24:16 +0930
Subject: [PATCH 12/33] fix conflict

---
 lite_llama/utils/common.py | 2 +-
 lite_llama/utils/logger.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 92c5be0..fc240f2 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -67,7 +67,7 @@ def get_gpu_memory(gpu_type, device_id="0"):
         elif gpu_type == "cpu":
             return None
     except Exception as e:
-        from utils.logger import log
+        from lite_llama.utils.logger import log
 
         log.warning(f"Unable to fetch GPU memory: {e}")
         return None
diff --git a/lite_llama/utils/logger.py b/lite_llama/utils/logger.py
index e93588c..71d3c6f 100644
--- a/lite_llama/utils/logger.py
+++ b/lite_llama/utils/logger.py
@@ -6,7 +6,7 @@
 
 import logging
 sys.path.append("..")
-from utils.common import getProjectPath
+from lite_llama.utils.common import getProjectPath
 
 __all__ = ["log", "logE", "logP", "logU"]
 

From 228d1e3878df18ea1590ac034bc63a9f59f6a44a Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Fri, 23 May 2025 23:00:11 +0930
Subject: [PATCH 13/33] fix import

---
 apply_weight_convert.py | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index ac94526..d8b20ec 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -5,8 +5,13 @@
     AutoModelForCausalLM,
     LlavaConfig,
 )
+from lite_llama.executor.weight_convert import (
+    convert_llavallama_hf_to_litellama,
+    convert_llama_hf_to_litellama,
+    convert_qwen2_hf_to_litellama,
+)
 
-import argparse
+import argparse, os
 from argparse import RawTextHelpFormatter
 
 def main(checkpoints_dir: str):
@@ -57,20 +62,4 @@ def main(checkpoints_dir: str):
     args = PARSER.parse_args()
 
     model_path = os.path.abspath(args.model_path)
-
-    main(str(model_path))
-# from transformers import LlavaNextConfig, LlavaNextForConditionalGeneration
-# from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-# from lite_llama.models.llava import LlavaLlama
-# from lite_llama.models.model_config import LlamaConfig
-
-# with init_empty_weights():
-#     llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-#     text_config = llava_config.text_config # TODO: 将 text_config 转换成 LlamaConfig 类型
-#     llama_config = LlamaConfig.from_dict(text_config.to_dict())
-
-# 使用 init_empty_weights 初始化空模型
-# with init_empty_weights():
-#     llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-#     model = LlavaLlama(llava_config)  
-#     llama_config = model.llama_config
+    main(str(model_path))
\ No newline at end of file

From 2ef661931b63ef6b377e69cc108d51b5b72508d0 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 24 May 2025 06:00:57 +0930
Subject: [PATCH 14/33] gptq for llama

---
 apply_weight_convert.py                       | 276 +++++++--
 lite_llama/quantization/debug_quantization.py | 247 ++++++++
 lite_llama/quantization/gptq.py               | 538 ++++++++++++++++++
 quantize_model.py                             | 280 +++++++++
 4 files changed, 1294 insertions(+), 47 deletions(-)
 create mode 100644 lite_llama/quantization/debug_quantization.py
 create mode 100644 lite_llama/quantization/gptq.py
 create mode 100644 quantize_model.py

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index d8b20ec..b6af59d 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -5,61 +5,243 @@
     AutoModelForCausalLM,
     LlavaConfig,
 )
+import argparse
+import os
+import sys
+
+# Add the gptq_quantize module to the path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
 from lite_llama.executor.weight_convert import (
     convert_llavallama_hf_to_litellama,
     convert_llama_hf_to_litellama,
     convert_qwen2_hf_to_litellama,
 )
 
-import argparse, os
-from argparse import RawTextHelpFormatter
+# Import the GPTQ quantization function
+from lite_llama.quantization.gptq import quantize_after_conversion
 
-def main(checkpoints_dir: str):
-    if "llava" in checkpoints_dir.lower():
-        model = LlavaForConditionalGeneration.from_pretrained( # LlavaForConditionalGeneration
-            checkpoints_dir,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-        ).to("cuda")
-    else:
-        model = AutoModelForCausalLM.from_pretrained( # LlavaForConditionalGeneration
-            checkpoints_dir,
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True,
-        ).to("cuda")
-
-    hf_sd = model.state_dict()
-
-    # for name, parameters in hf_sd.items():
-    #     print(name, parameters.shape)
-
-    if "qwen2" in checkpoints_dir.lower():
-        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-        num_layers = llm_config.num_hidden_layers
-        print("num_layers: ", num_layers)
-        convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-    elif "llama" in checkpoints_dir.lower():
-        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-        num_layers = llm_config.num_hidden_layers
-        print("num_layers: ", num_layers)
-        convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-    elif "llava" in checkpoints_dir.lower():
-        llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-        num_layers = llava_config.text_config.num_hidden_layers
-        print("num_layers: ", num_layers)
-        convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+import warnings
+
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
+
+
+def parse_arguments():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(
+        description="Convert HuggingFace models to Lite-LLaMA format with optional GPTQ quantization"
+    )
+
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        required=True,
+        help="Path to the HuggingFace model checkpoint directory"
+    )
+
+    parser.add_argument(
+        "--quantize",
+        action="store_true",
+        help="Enable GPTQ quantization after conversion"
+    )
+
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=4,
+        choices=[2, 3, 4, 8],
+        help="Number of bits for quantization (default: 4)"
+    )
+
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=128,
+        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
+    )
+
+    parser.add_argument(
+        "--calibration_data",
+        type=str,
+        default=None,
+        help="Path to calibration dataset file for GPTQ (optional)"
+    )
+
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help="Device to use for conversion (default: cuda)"
+    )
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        choices=["float16", "float32", "bfloat16"],
+        help="Data type for model weights (default: float16)"
+    )
+
+    return parser.parse_args()
+
+
+def get_torch_dtype(dtype_str):
+    """Convert string dtype to torch dtype"""
+    dtype_map = {
+        "float16": torch.float16,
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16
+    }
+    return dtype_map.get(dtype_str, torch.float16)
+
+
+def main():
+    # Parse arguments
+    args = parse_arguments()
+
+    checkpoints_dir = args.checkpoint_dir
+    device = args.device
+    torch_dtype = get_torch_dtype(args.dtype)
+
+    print(f"Converting model from: {checkpoints_dir}")
+    print(f"Device: {device}")
+    print(f"Data type: {args.dtype}")
+    print(f"Quantization: {'Enabled' if args.quantize else 'Disabled'}")
+
+    if args.quantize:
+        print(f"  - Bits: {args.wbits}")
+        print(f"  - Group size: {args.groupsize}")
+        print(f"  - Calibration data: {args.calibration_data or 'Default'}")
+
+    print("\n" + "=" * 50 + "\n")
+
+    # Step 1: Load the model
+    print("Loading model...")
+
+    try:
+        if "llava" in checkpoints_dir.lower():
+            model = LlavaForConditionalGeneration.from_pretrained(
+                checkpoints_dir,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+            )
+            model_type = "llava"
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                checkpoints_dir,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+            )
+            # Determine model type
+            if "qwen2" in checkpoints_dir.lower():
+                model_type = "qwen2"
+            elif "llama" in checkpoints_dir.lower():
+                model_type = "llama"
+            else:
+                print("Warning: Could not determine model type from path.")
+                print("Assuming Llama architecture...")
+                model_type = "llama"
+
+        if device == "cuda" and torch.cuda.is_available():
+            model = model.to(device)
+
+        hf_sd = model.state_dict()
+
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return 1
+
+    # Step 2: Convert to lite_llama format
+    print(f"\nConverting {model_type} model to lite_llama format...")
+
+    try:
+        if model_type == "qwen2":
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif model_type == "llama":
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif model_type == "llava":
+            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
+            num_layers = llava_config.text_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        print("Conversion completed successfully!")
+
+    except Exception as e:
+        print(f"Error during conversion: {e}")
+        return 1
+
+    # Free memory
+    del model, hf_sd
+    if device == "cuda":
+        torch.cuda.empty_cache()
+
+    # Step 3: Optional quantization
+    if args.quantize:
+        print("\n" + "=" * 50)
+        print(f"Starting GPTQ quantization ({args.wbits}-bit)...")
+
+        # Auto-detect groupsize if needed
+        if args.groupsize == 0:
+            print("Auto-detecting optimal groupsize...")
+            # Quick check of vocabulary size
+            vocab_sizes = []
+            for name, param in hf_sd.items():
+                if ("embed" in name or "lm_head" in name) and len(param.shape) >= 2:
+                    vocab_sizes.extend(param.shape)
+
+            if vocab_sizes:
+                vocab_size = max(vocab_sizes)
+                # Find best groupsize
+                for gs in [128, 256, 512, 1024]:
+                    if vocab_size % gs == 0:
+                        args.groupsize = gs
+                        print(f"Selected groupsize: {gs} (perfect fit for vocab size {vocab_size})")
+                        break
+                else:
+                    args.groupsize = 256 if vocab_size > 100000 else 128
+                    print(f"Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
+
+        print(f"Groupsize: {args.groupsize}")
+        print("=" * 50 + "\n")
+
+        try:
+            quantized_path = quantize_after_conversion(
+                checkpoints_dir=checkpoints_dir,
+                model_type=model_type,
+                calibration_data_path=args.calibration_data,
+                wbits=args.wbits,
+                groupsize=args.groupsize
+            )
+            print(f"\nQuantization completed successfully!")
+            print(f"Quantized model saved to: {quantized_path}")
+
+        except Exception as e:
+            print(f"Error during quantization: {e}")
+            print("The converted model was saved successfully, but quantization failed.")
+            return 1
     else:
-        print("Error! Unsupported model type!")
+        model_id = os.path.basename(os.path.normpath(checkpoints_dir))
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        converted_path = os.path.join(current_dir, f"my_weight/{model_id}")
+        print(f"\nConverted model saved to: {converted_path}")
+        print("To quantize this model later, use the quantize_model.py script")
+
+    print("\n" + "=" * 50)
+    print("Process completed successfully!")
+    print("=" * 50)
 
+    return 0
 
-if __name__ == '__main__':
-    PARSER = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
-    PARSER.add_argument('-m', "--model_path", type=str,
-                        default='checkpoints/lit-llama/7B/',
-                        help='Path of the Model')
-    args = PARSER.parse_args()
 
-    model_path = os.path.abspath(args.model_path)
-    main(str(model_path))
\ No newline at end of file
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/lite_llama/quantization/debug_quantization.py b/lite_llama/quantization/debug_quantization.py
new file mode 100644
index 0000000..b6af59d
--- /dev/null
+++ b/lite_llama/quantization/debug_quantization.py
@@ -0,0 +1,247 @@
+import torch
+from transformers import (
+    LlavaForConditionalGeneration,
+    AutoConfig,
+    AutoModelForCausalLM,
+    LlavaConfig,
+)
+import argparse
+import os
+import sys
+
+# Add the gptq_quantize module to the path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from lite_llama.executor.weight_convert import (
+    convert_llavallama_hf_to_litellama,
+    convert_llama_hf_to_litellama,
+    convert_qwen2_hf_to_litellama,
+)
+
+# Import the GPTQ quantization function
+from lite_llama.quantization.gptq import quantize_after_conversion
+
+import warnings
+
+warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
+
+
+def parse_arguments():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(
+        description="Convert HuggingFace models to Lite-LLaMA format with optional GPTQ quantization"
+    )
+
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        required=True,
+        help="Path to the HuggingFace model checkpoint directory"
+    )
+
+    parser.add_argument(
+        "--quantize",
+        action="store_true",
+        help="Enable GPTQ quantization after conversion"
+    )
+
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=4,
+        choices=[2, 3, 4, 8],
+        help="Number of bits for quantization (default: 4)"
+    )
+
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=128,
+        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
+    )
+
+    parser.add_argument(
+        "--calibration_data",
+        type=str,
+        default=None,
+        help="Path to calibration dataset file for GPTQ (optional)"
+    )
+
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help="Device to use for conversion (default: cuda)"
+    )
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="float16",
+        choices=["float16", "float32", "bfloat16"],
+        help="Data type for model weights (default: float16)"
+    )
+
+    return parser.parse_args()
+
+
+def get_torch_dtype(dtype_str):
+    """Convert string dtype to torch dtype"""
+    dtype_map = {
+        "float16": torch.float16,
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16
+    }
+    return dtype_map.get(dtype_str, torch.float16)
+
+
+def main():
+    # Parse arguments
+    args = parse_arguments()
+
+    checkpoints_dir = args.checkpoint_dir
+    device = args.device
+    torch_dtype = get_torch_dtype(args.dtype)
+
+    print(f"Converting model from: {checkpoints_dir}")
+    print(f"Device: {device}")
+    print(f"Data type: {args.dtype}")
+    print(f"Quantization: {'Enabled' if args.quantize else 'Disabled'}")
+
+    if args.quantize:
+        print(f"  - Bits: {args.wbits}")
+        print(f"  - Group size: {args.groupsize}")
+        print(f"  - Calibration data: {args.calibration_data or 'Default'}")
+
+    print("\n" + "=" * 50 + "\n")
+
+    # Step 1: Load the model
+    print("Loading model...")
+
+    try:
+        if "llava" in checkpoints_dir.lower():
+            model = LlavaForConditionalGeneration.from_pretrained(
+                checkpoints_dir,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+            )
+            model_type = "llava"
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                checkpoints_dir,
+                torch_dtype=torch_dtype,
+                low_cpu_mem_usage=True,
+            )
+            # Determine model type
+            if "qwen2" in checkpoints_dir.lower():
+                model_type = "qwen2"
+            elif "llama" in checkpoints_dir.lower():
+                model_type = "llama"
+            else:
+                print("Warning: Could not determine model type from path.")
+                print("Assuming Llama architecture...")
+                model_type = "llama"
+
+        if device == "cuda" and torch.cuda.is_available():
+            model = model.to(device)
+
+        hf_sd = model.state_dict()
+
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return 1
+
+    # Step 2: Convert to lite_llama format
+    print(f"\nConverting {model_type} model to lite_llama format...")
+
+    try:
+        if model_type == "qwen2":
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif model_type == "llama":
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif model_type == "llava":
+            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
+            num_layers = llava_config.text_config.num_hidden_layers
+            print(f"Number of layers: {num_layers}")
+            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        print("Conversion completed successfully!")
+
+    except Exception as e:
+        print(f"Error during conversion: {e}")
+        return 1
+
+    # Free memory
+    del model, hf_sd
+    if device == "cuda":
+        torch.cuda.empty_cache()
+
+    # Step 3: Optional quantization
+    if args.quantize:
+        print("\n" + "=" * 50)
+        print(f"Starting GPTQ quantization ({args.wbits}-bit)...")
+
+        # Auto-detect groupsize if needed
+        if args.groupsize == 0:
+            print("Auto-detecting optimal groupsize...")
+            # Quick check of vocabulary size
+            vocab_sizes = []
+            for name, param in hf_sd.items():
+                if ("embed" in name or "lm_head" in name) and len(param.shape) >= 2:
+                    vocab_sizes.extend(param.shape)
+
+            if vocab_sizes:
+                vocab_size = max(vocab_sizes)
+                # Find best groupsize
+                for gs in [128, 256, 512, 1024]:
+                    if vocab_size % gs == 0:
+                        args.groupsize = gs
+                        print(f"Selected groupsize: {gs} (perfect fit for vocab size {vocab_size})")
+                        break
+                else:
+                    args.groupsize = 256 if vocab_size > 100000 else 128
+                    print(f"Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
+
+        print(f"Groupsize: {args.groupsize}")
+        print("=" * 50 + "\n")
+
+        try:
+            quantized_path = quantize_after_conversion(
+                checkpoints_dir=checkpoints_dir,
+                model_type=model_type,
+                calibration_data_path=args.calibration_data,
+                wbits=args.wbits,
+                groupsize=args.groupsize
+            )
+            print(f"\nQuantization completed successfully!")
+            print(f"Quantized model saved to: {quantized_path}")
+
+        except Exception as e:
+            print(f"Error during quantization: {e}")
+            print("The converted model was saved successfully, but quantization failed.")
+            return 1
+    else:
+        model_id = os.path.basename(os.path.normpath(checkpoints_dir))
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        converted_path = os.path.join(current_dir, f"my_weight/{model_id}")
+        print(f"\nConverted model saved to: {converted_path}")
+        print("To quantize this model later, use the quantize_model.py script")
+
+    print("\n" + "=" * 50)
+    print("Process completed successfully!")
+    print("=" * 50)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
new file mode 100644
index 0000000..7a226a5
--- /dev/null
+++ b/lite_llama/quantization/gptq.py
@@ -0,0 +1,538 @@
+import torch
+import torch.nn as nn
+from tqdm import tqdm
+import numpy as np
+import os
+import json
+from typing import Dict, List, Optional, Tuple
+from transformers import AutoTokenizer
+import gc
+
+
+class GPTQ:
+    """
+    GPTQ Quantizer for custom lite_llama models
+    """
+
+    def __init__(
+            self,
+            layer,
+            wbits: int = 4,
+            groupsize: int = 128,
+            actorder: bool = False,
+            percdamp: float = 0.01,
+            device: str = "cuda"
+    ):
+        self.layer = layer
+        self.device = device
+        self.wbits = wbits
+        self.actorder = actorder
+        self.percdamp = percdamp
+
+        # Handle groupsize
+        W = layer.weight.data
+        if groupsize == -1:
+            self.groupsize = W.shape[0]
+        else:
+            self.groupsize = groupsize
+
+        # Check if groupsize is compatible
+        if W.shape[0] % self.groupsize != 0:
+            print(f"Warning: Weight dimension {W.shape[0]} not divisible by groupsize {self.groupsize}")
+            print(f"Last group will have {W.shape[0] % self.groupsize} elements")
+
+        # Calculate quantization parameters
+        self.maxq = 2 ** self.wbits - 1
+        self.nsamples = 0
+
+        # Initialize Hessian and other matrices
+        self.rows = W.shape[0]
+        self.columns = W.shape[1]
+        self.H = None  # Will be initialized when first batch is added
+        self.quantized = False
+
+    def add_batch(self, inp):
+        """Add calibration batch to compute Hessian"""
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+        tmp = inp.shape[0]
+        if len(inp.shape) == 3:
+            inp = inp.reshape((-1, inp.shape[-1]))
+        inp = inp.t()
+
+        # Update sample count
+        if self.nsamples == 0:
+            self.H = torch.zeros((self.columns, self.columns), device=self.device)
+
+        self.H *= self.nsamples / (self.nsamples + tmp)
+        self.nsamples += tmp
+
+        # Ensure numerical stability
+        inp = inp.float()
+
+        # Add small noise for numerical stability
+        inp = inp + torch.randn_like(inp) * 1e-4
+
+        # Update Hessian
+        self.H += 2 / self.nsamples * inp.matmul(inp.t())
+
+    def quantize(self):
+        """Perform GPTQ quantization"""
+        W = self.layer.weight.data.clone()
+        W = W.float()
+
+        # Check if we have calibration data
+        if self.H is None or self.nsamples == 0:
+            print("Warning: No calibration data added, initializing with identity matrix")
+            self.H = torch.eye(self.columns, device=self.device) * 0.01
+            self.nsamples = 1
+
+        # Compute inverse Hessian
+        H = self.H
+        del self.H
+
+        # Add damping for numerical stability
+        damp = self.percdamp * torch.mean(torch.diag(H))
+        diag = torch.arange(self.columns, device=self.device)
+        H[diag, diag] += damp
+
+        # Try Cholesky decomposition with fallback
+        try:
+            H = torch.linalg.cholesky(H)
+            H = torch.cholesky_inverse(H)
+            H = torch.linalg.cholesky(H, upper=True)
+            Hinv = H
+        except torch._C._LinAlgError:
+            print("Warning: Cholesky decomposition failed, using eigendecomposition instead")
+            # Fallback to eigendecomposition
+            try:
+                # Add more damping
+                H[diag, diag] += damp * 10
+                eigenvalues, eigenvectors = torch.linalg.eigh(H)
+
+                # Ensure all eigenvalues are positive
+                eigenvalues = eigenvalues.clamp(min=1e-5)
+
+                # Reconstruct inverse
+                Hinv = eigenvectors @ torch.diag(1.0 / eigenvalues) @ eigenvectors.T
+            except:
+                print("Warning: Eigendecomposition also failed, using diagonal approximation")
+                # Last resort: diagonal approximation
+                diagonal = torch.diag(H).clamp(min=1e-5)
+                Hinv = torch.diag(1.0 / diagonal)
+
+        # Initialize quantization parameters
+        n_groups = (self.rows + self.groupsize - 1) // self.groupsize
+        scale = torch.zeros((n_groups, 1), device=self.device)
+        zero = torch.zeros((n_groups, 1), device=self.device)
+
+        # Quantize layer weights
+        for i1 in range(0, self.columns, 128):
+            i2 = min(i1 + 128, self.columns)
+            count = i2 - i1
+
+            # Extract block
+            W1 = W[:, i1:i2].clone()
+            Q1 = torch.zeros_like(W1)
+            Err1 = torch.zeros_like(W1)
+            Losses1 = torch.zeros_like(W1)
+            Hinv1 = Hinv[i1:i2, i1:i2]
+
+            for i in range(count):
+                w = W1[:, i]
+                d = Hinv1[i, i]
+
+                # Quantize groups
+                for j in range(0, self.rows, self.groupsize):
+                    j2 = min(j + self.groupsize, self.rows)
+                    group_idx = j // self.groupsize
+
+                    # Find optimal scale and zero point
+                    w_group = w[j:j2]
+
+                    # Handle empty groups
+                    if w_group.numel() == 0:
+                        continue
+
+                    w_min = w_group.min()
+                    w_max = w_group.max()
+
+                    # Avoid division by zero
+                    if w_max == w_min:
+                        scale_val = 1.0
+                        zero_val = 0.0
+                    else:
+                        scale_val = (w_max - w_min) / self.maxq
+                        zero_val = torch.round(-w_min / scale_val)
+
+                    if group_idx < scale.shape[0]:
+                        scale[group_idx] = scale_val
+                        zero[group_idx] = zero_val
+
+                    # Quantize
+                    q = torch.clamp(torch.round(w_group / scale_val + zero_val), 0, self.maxq)
+                    Q1[j:j2, i] = q
+
+                    # Dequantize for error computation
+                    dequant = (q - zero_val) * scale_val
+                    Err1[j:j2, i] = (w_group - dequant) / d if d != 0 else 0
+
+                # Update remaining weights
+                if i + 1 < count:
+                    # Ensure proper matrix multiplication dimensions
+                    err_col = Err1[:, i:i + 1]  # Shape: (rows, 1)
+                    hinv_row = Hinv1[i, i + 1:].unsqueeze(0)  # Shape: (1, remaining_cols)
+                    update = err_col.matmul(hinv_row)  # Shape: (rows, remaining_cols)
+                    W1[:, i + 1:] -= update
+
+            W[:, i1:i2] = Q1
+
+        # Store quantized weights and parameters
+        self.layer.weight.data = W.to(self.layer.weight.dtype)
+        self.scale = scale
+        self.zero = zero
+        self.quantized = True
+
+        return scale, zero
+
+
+def prepare_calibration_data(
+        tokenizer,
+        dataset_path: str = None,
+        num_samples: int = 128,
+        seq_length: int = 2048
+) -> List[torch.Tensor]:
+    """
+    Prepare calibration dataset for GPTQ
+
+    Args:
+        tokenizer: Model tokenizer
+        dataset_path: Path to calibration dataset (text file)
+        num_samples: Number of calibration samples
+        seq_length: Sequence length for each sample
+
+    Returns:
+        List of tokenized samples
+    """
+    # Fix padding token issue (common with LLaMA models)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        if tokenizer.pad_token is None:
+            # If still None, use a common token
+            tokenizer.pad_token = tokenizer.unk_token
+            if tokenizer.pad_token is None:
+                # Last resort - add a padding token
+                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+    if dataset_path is None:
+        # Use a default calibration text if no dataset provided
+        default_text = """
+        The quick brown fox jumps over the lazy dog. 
+        Machine learning is transforming the world of technology.
+        Large language models have revolutionized natural language processing.
+        Artificial intelligence is rapidly advancing across various domains.
+        Deep learning has enabled breakthroughs in computer vision and NLP.
+        Transformer architectures have become the foundation of modern AI.
+        """ * 50
+
+        texts = [default_text[i:i + 1000] for i in range(0, len(default_text) - 1000, 1000)][:num_samples]
+    else:
+        with open(dataset_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+            # Split into chunks
+            chunk_size = max(1000, len(text) // (num_samples + 1))
+            texts = [text[i:i + chunk_size] for i in range(0, len(text) - chunk_size, chunk_size // 2)][:num_samples]
+
+    # Tokenize
+    calibration_data = []
+    for text in texts[:num_samples]:
+        # Skip empty texts
+        if not text.strip():
+            continue
+
+        tokens = tokenizer(
+            text,
+            return_tensors='pt',
+            max_length=seq_length,
+            truncation=True,
+            padding='max_length'
+        )
+        calibration_data.append(tokens.input_ids)
+
+    # Ensure we have enough samples
+    if len(calibration_data) < num_samples:
+        print(f"Warning: Only {len(calibration_data)} calibration samples available (requested {num_samples})")
+
+    return calibration_data
+
+
+def quantize_litellama_model(
+        model_path: str,
+        output_path: str,
+        calibration_data_path: Optional[str] = None,
+        wbits: int = 4,
+        groupsize: int = 128,
+        device: str = "cuda",
+        num_samples: int = 128,
+        seq_length: int = 2048
+) -> None:
+    """
+    Main function to quantize a lite_llama model using GPTQ
+
+    Args:
+        model_path: Path to converted lite_llama model directory
+        output_path: Path to save quantized model
+        calibration_data_path: Path to calibration dataset
+        wbits: Quantization bits (4, 8, etc.)
+        groupsize: Group size for quantization
+        device: Device to use for quantization
+    """
+    print(f"Loading model from {model_path}")
+
+    # Load model weights
+    model_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
+    if not model_files:
+        raise ValueError(f"No .pth file found in {model_path}")
+
+    model_file = os.path.join(model_path, model_files[0])
+    state_dict = torch.load(model_file, map_location=device)
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+    # Prepare calibration data
+    print("Preparing calibration data...")
+    calibration_data = prepare_calibration_data(
+        tokenizer,
+        calibration_data_path,
+        num_samples=num_samples,
+        seq_length=seq_length
+    )
+
+    # Create output directory
+    os.makedirs(output_path, exist_ok=True)
+
+    # Quantize each layer
+    quantized_state_dict = {}
+    quantization_config = {
+        "wbits": wbits,
+        "groupsize": groupsize,
+        "layers": {}
+    }
+
+    # Get all weight keys that need quantization
+    weight_keys_to_quantize = []
+    for key in state_dict.keys():
+        if any(pattern in key for pattern in [
+            "q_proj", "kv_proj", "o_proj",
+            "gate_proj", "up_proj", "down_proj",
+            "lm_head", "embed_tokens"
+        ]) and "weight" in key:
+            weight_keys_to_quantize.append(key)
+
+    print(f"Found {len(weight_keys_to_quantize)} weights to quantize")
+
+    # Process each weight
+    for key in tqdm(weight_keys_to_quantize, desc="Quantizing layers"):
+        weight = state_dict[key]
+
+        # Skip if weight is too small
+        if weight.numel() < 1024:
+            print(f"\nSkipping {key} (too small: {weight.numel()} parameters)")
+            quantized_state_dict[key] = weight
+            continue
+
+        print(f"\nQuantizing {key} (shape: {weight.shape})...")
+
+        # Create a dummy layer for GPTQ
+        layer = nn.Linear(weight.shape[1], weight.shape[0], bias=False)
+        layer.weight.data = weight.to(device)
+
+        # Adjust percdamp for different layer types
+        percdamp = 0.01
+        if "embed" in key or "lm_head" in key:
+            percdamp = 0.1  # Higher damping for embeddings
+            print(f"  Using higher damping (0.1) for {key}")
+
+        # Initialize GPTQ
+        gptq = GPTQ(
+            layer=layer,
+            wbits=wbits,
+            groupsize=groupsize,
+            device=device,
+            percdamp=percdamp
+        )
+
+        # Add calibration data (simplified - in practice, you'd run forward passes)
+        # For better results, we should use embeddings from the actual text
+        # Get embedding weight if available
+        embed_key = None
+        for k in state_dict.keys():
+            if "embed_tokens.weight" in k:
+                embed_key = k
+                break
+
+        if embed_key and len(calibration_data) > 0:
+            embed_weight = state_dict[embed_key].to(device)
+            # Use actual token embeddings as input
+            for i in range(min(len(calibration_data), 32)):
+                tokens = calibration_data[i][0].to(device)
+                # Get embeddings for these tokens
+                embeddings = torch.embedding(embed_weight, tokens)
+                # Average pool to get input dimension
+                if embeddings.shape[1] > weight.shape[1]:
+                    # Use adaptive pooling to match dimensions
+                    embeddings = torch.nn.functional.adaptive_avg_pool1d(
+                        embeddings.transpose(1, 2),
+                        weight.shape[1]
+                    ).transpose(1, 2)
+                elif embeddings.shape[1] < weight.shape[1]:
+                    # Skip if embedding dimension doesn't match
+                    continue
+
+                # Take mean across sequence length for this layer's input
+                fake_inp = embeddings.mean(dim=0, keepdim=True)
+                if fake_inp.shape[1] == weight.shape[1]:
+                    gptq.add_batch(fake_inp)
+        else:
+            # Fallback to random data if no embeddings available
+            for _ in range(min(len(calibration_data), 32)):
+                fake_inp = torch.randn(1, weight.shape[1], device=device) * 0.1
+                gptq.add_batch(fake_inp)
+
+        # Quantize
+        scale, zero = gptq.quantize()
+
+        # Store quantized weight and parameters
+        quantized_state_dict[key] = layer.weight.data.cpu()
+        quantization_config["layers"][key] = {
+            "scale": scale.cpu().tolist(),
+            "zero": zero.cpu().tolist(),
+            "groupsize": groupsize,
+            "wbits": wbits
+        }
+
+        # Clean up
+        del layer, gptq
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    # Copy non-quantized weights
+    for key in state_dict.keys():
+        if key not in quantized_state_dict:
+            quantized_state_dict[key] = state_dict[key]
+
+    # Save quantized model
+    model_id = os.path.basename(model_path)
+    torch.save(
+        quantized_state_dict,
+        os.path.join(output_path, f"{model_id}-{wbits}bit-gptq.pth")
+    )
+
+    # Save quantization config
+    with open(os.path.join(output_path, "quantization_config.json"), "w") as f:
+        json.dump(quantization_config, f, indent=2)
+
+    # Copy other files
+    for file in os.listdir(model_path):
+        if file.endswith('.json') and file != "quantization_config.json":
+            src = os.path.join(model_path, file)
+            dst = os.path.join(output_path, file)
+            with open(src, 'r') as f_in, open(dst, 'w') as f_out:
+                f_out.write(f_in.read())
+
+    if os.path.exists(os.path.join(model_path, "tokenizer.model")):
+        import shutil
+        shutil.copy(
+            os.path.join(model_path, "tokenizer.model"),
+            os.path.join(output_path, "tokenizer.model")
+        )
+
+    print(f"Quantization complete! Model saved to {output_path}")
+
+    # Print compression statistics
+    original_size = sum(p.numel() * p.element_size() for p in state_dict.values())
+    quantized_size = sum(p.numel() * p.element_size() for p in quantized_state_dict.values())
+    compression_ratio = original_size / quantized_size
+
+    print(f"Original model size: {original_size / 1e9:.2f} GB")
+    print(f"Quantized model size: {quantized_size / 1e9:.2f} GB")
+    print(f"Compression ratio: {compression_ratio:.2f}x")
+
+
+def dequantize_weight(
+        quantized_weight: torch.Tensor,
+        scale: torch.Tensor,
+        zero: torch.Tensor,
+        wbits: int = 4,
+        groupsize: int = 128
+) -> torch.Tensor:
+    """
+    Dequantize a weight tensor
+
+    Args:
+        quantized_weight: Quantized weight tensor
+        scale: Scale parameters
+        zero: Zero point parameters
+        wbits: Quantization bits
+        groupsize: Group size used in quantization
+
+    Returns:
+        Dequantized weight tensor
+    """
+    weight = torch.zeros_like(quantized_weight, dtype=torch.float32)
+
+    for i in range(0, quantized_weight.shape[0], groupsize):
+        j = min(i + groupsize, quantized_weight.shape[0])
+        group_idx = i // groupsize
+
+        if group_idx < scale.shape[0]:
+            weight[i:j] = (quantized_weight[i:j] - zero[group_idx]) * scale[group_idx]
+
+    return weight
+
+
+# Integration with your existing code
+def quantize_after_conversion(
+        checkpoints_dir: str,
+        model_type: str,  # "llama", "qwen2", or "llava"
+        calibration_data_path: Optional[str] = None,
+        wbits: int = 4,
+        groupsize: int = 128,
+        num_samples: int = 128,
+        seq_length: int = 2048
+):
+    """
+    Quantize model after it has been converted to lite_llama format
+
+    Args:
+        checkpoints_dir: Original HF model directory
+        model_type: Type of model ("llama", "qwen2", or "llava")
+        calibration_data_path: Path to calibration dataset
+        wbits: Quantization bits
+        groupsize: Group size for quantization (0 for auto-detect)
+        num_samples: Number of calibration samples
+        seq_length: Sequence length for calibration
+    """
+    # Construct paths
+    model_id = os.path.basename(os.path.normpath(checkpoints_dir))
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # Path to converted model
+    converted_model_path = os.path.join(current_dir, f"../../my_weight/{model_id}")
+
+    # Path for quantized model
+    quantized_model_path = os.path.join(current_dir, f"../../my_weight/{model_id}-{wbits}bit-gptq")
+
+    # Perform quantization
+    quantize_litellama_model(
+        model_path=converted_model_path,
+        output_path=quantized_model_path,
+        calibration_data_path=calibration_data_path,
+        wbits=wbits,
+        groupsize=groupsize,
+        num_samples=num_samples,
+        seq_length=seq_length
+    )
+
+    return quantized_model_path
\ No newline at end of file
diff --git a/quantize_model.py b/quantize_model.py
new file mode 100644
index 0000000..8060612
--- /dev/null
+++ b/quantize_model.py
@@ -0,0 +1,280 @@
+#!/usr/bin/env python3
+"""
+Quantize an already converted lite_llama model using GPTQ
+"""
+
+import argparse
+import os
+import sys
+import json
+import torch
+
+# Add the current directory to Python path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from lite_llama.quantization.gptq import quantize_litellama_model
+
+
+def parse_arguments():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(
+        description="Quantize an already converted lite_llama model using GPTQ"
+    )
+
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="Path to the converted lite_llama model directory"
+    )
+
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default=None,
+        help="Path to save the quantized model (default: auto-generated based on model_path)"
+    )
+
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=4,
+        choices=[2, 3, 4, 8],
+        help="Number of bits for quantization (default: 4)"
+    )
+
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=128,
+        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
+    )
+
+    parser.add_argument(
+        "--calibration_data",
+        type=str,
+        default=None,
+        help="Path to calibration dataset file for GPTQ (optional)"
+    )
+
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help="Device to use for quantization (default: cuda)"
+    )
+
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        default=128,
+        help="Number of calibration samples to use (default: 128)"
+    )
+
+    parser.add_argument(
+        "--seq_length",
+        type=int,
+        default=2048,
+        help="Sequence length for calibration samples (default: 2048)"
+    )
+
+    return parser.parse_args()
+
+
+def check_model_compatibility(model_path):
+    """Check if the model is a valid converted lite_llama model"""
+    # Check for required files
+    required_files = []
+    optional_files = ["config.json", "tokenizer.model", "tokenizer.json"]
+
+    # Find .pth file
+    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
+    if not pth_files:
+        return False, "No .pth file found in the model directory"
+
+    # Check if already quantized
+    if any('gptq' in f for f in pth_files):
+        return False, "Model appears to be already quantized"
+
+    # Check for config files
+    found_configs = []
+    for config_file in optional_files:
+        if os.path.exists(os.path.join(model_path, config_file)):
+            found_configs.append(config_file)
+
+    if not found_configs:
+        return False, "No configuration files found (config.json, tokenizer.json, etc.)"
+
+    return True, "Model is compatible"
+
+
+def get_model_info(model_path):
+    """Extract model information from the directory"""
+    info = {
+        "model_name": os.path.basename(model_path),
+        "model_type": "unknown",
+        "size": 0
+    }
+
+    # Try to determine model type from name
+    model_name_lower = info["model_name"].lower()
+    if "llava" in model_name_lower:
+        info["model_type"] = "llava"
+    elif "qwen2" in model_name_lower:
+        info["model_type"] = "qwen2"
+    elif "llama" in model_name_lower:
+        info["model_type"] = "llama"
+
+    # Calculate model size
+    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
+    if pth_files:
+        model_file = os.path.join(model_path, pth_files[0])
+        info["size"] = os.path.getsize(model_file) / (1024 ** 3)  # Size in GB
+
+    return info
+
+
+def main():
+    # Parse arguments
+    args = parse_arguments()
+
+    print("=" * 60)
+    print("GPTQ Quantization for Lite-LLaMA Models")
+    print("=" * 60)
+
+    # Check if model path exists
+    if not os.path.exists(args.model_path):
+        print(f"Error: Model path does not exist: {args.model_path}")
+        return 1
+
+    # Check model compatibility
+    is_compatible, message = check_model_compatibility(args.model_path)
+    if not is_compatible:
+        print(f"Error: {message}")
+        return 1
+
+    # Get model information
+    model_info = get_model_info(args.model_path)
+
+    print(f"\nModel Information:")
+    print(f"  Name: {model_info['model_name']}")
+    print(f"  Type: {model_info['model_type']}")
+    print(f"  Size: {model_info['size']:.2f} GB")
+
+    # Auto-detect groupsize if requested
+    if args.groupsize == 0:
+        print("\nAuto-detecting optimal groupsize...")
+        # Load a sample weight to check dimensions
+        pth_files = [f for f in os.listdir(args.model_path) if f.endswith('.pth')]
+        if pth_files:
+            sample_weights = torch.load(
+                os.path.join(args.model_path, pth_files[0]),
+                map_location='cpu'
+            )
+
+            # Find vocabulary size from embeddings or lm_head
+            vocab_sizes = []
+            for name, weight in sample_weights.items():
+                if ("embed" in name or "lm_head" in name) and len(weight.shape) >= 2:
+                    vocab_sizes.extend(weight.shape)
+
+            if vocab_sizes:
+                vocab_size = max(vocab_sizes)
+                # Find suitable groupsize
+                for gs in [128, 256, 512, 1024]:
+                    if vocab_size % gs == 0:
+                        args.groupsize = gs
+                        print(f"✓ Selected groupsize: {gs} (evenly divides vocab size {vocab_size})")
+                        break
+                else:
+                    # No perfect divisor found
+                    if vocab_size % 256 < vocab_size % 128:
+                        args.groupsize = 256
+                    else:
+                        args.groupsize = -1
+                    print(f"✓ Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
+
+            del sample_weights
+
+    print(f"\nQuantization Settings:")
+    print(f"  Bits: {args.wbits}")
+    print(f"  Group size: {args.groupsize}")
+    print(f"  Device: {args.device}")
+    print(f"  Calibration data: {args.calibration_data or 'Default'}")
+
+    # Check CUDA availability
+    if args.device == "cuda" and not torch.cuda.is_available():
+        print("\nWarning: CUDA is not available. Falling back to CPU.")
+        print("Note: Quantization on CPU will be significantly slower.")
+        args.device = "cpu"
+
+    # Set output path if not provided
+    if args.output_path is None:
+        parent_dir = os.path.dirname(args.model_path)
+        model_name = os.path.basename(args.model_path)
+        args.output_path = parent_dir + f"{model_name}-{args.wbits}bit-gptq"
+
+
+    print(f"\nOutput path: {args.output_path}")
+
+    # Confirm before proceeding
+    print("\n" + "-" * 60)
+    response = input("Proceed with quantization? (y/N): ")
+    if response.lower() != 'y':
+        print("Quantization cancelled.")
+        return 0
+
+    print("\n" + "=" * 60)
+    print("Starting quantization...")
+    print("=" * 60 + "\n")
+
+    try:
+        # Run quantization
+        quantize_litellama_model(
+            model_path=args.model_path,
+            output_path=args.output_path,
+            calibration_data_path=args.calibration_data,
+            wbits=args.wbits,
+            groupsize=args.groupsize,
+            device=args.device,
+            num_samples=args.num_samples,
+            seq_length=args.seq_length
+        )
+
+        print("\n" + "=" * 60)
+        print("Quantization completed successfully!")
+        print("=" * 60)
+
+        # Print summary
+        print(f"\nQuantized model saved to: {args.output_path}")
+
+        # Calculate and show compression ratio
+        original_size = model_info['size']
+        quantized_size = sum(
+            os.path.getsize(os.path.join(args.output_path, f)) / (1024 ** 3)
+            for f in os.listdir(args.output_path)
+            if f.endswith('.pth')
+        )
+        compression_ratio = original_size / quantized_size if quantized_size > 0 else 0
+
+        print(f"\nCompression Statistics:")
+        print(f"  Original size: {original_size:.2f} GB")
+        print(f"  Quantized size: {quantized_size:.2f} GB")
+        print(f"  Compression ratio: {compression_ratio:.2f}x")
+        print(f"  Space saved: {(1 - 1 / compression_ratio) * 100:.1f}%")
+
+    except KeyboardInterrupt:
+        print("\n\nQuantization interrupted by user.")
+        return 1
+    except Exception as e:
+        print(f"\nError during quantization: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file

From af1f42903157e9cd271d025a8f5adcfb2426820e Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 24 May 2025 17:51:56 +0930
Subject: [PATCH 15/33] gptq for llava

---
 apply_weight_convert.py             |   6 +-
 lite_llama/quantization/__init__.py |   0
 lite_llama/quantization/gptq.py     |  62 +++++++++---
 lite_llama/utils/common.py          |  64 ++++++++++---
 quantize_model.py                   | 143 +++++++++++++++++++---------
 5 files changed, 207 insertions(+), 68 deletions(-)
 create mode 100644 lite_llama/quantization/__init__.py

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index b6af59d..33f467d 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -214,13 +214,17 @@ def main():
         print(f"Groupsize: {args.groupsize}")
         print("=" * 50 + "\n")
 
+        # Check if it's a LLaVA model and set skip_vision accordingly
+        skip_vision = model_type == "llava"
+
         try:
             quantized_path = quantize_after_conversion(
                 checkpoints_dir=checkpoints_dir,
                 model_type=model_type,
                 calibration_data_path=args.calibration_data,
                 wbits=args.wbits,
-                groupsize=args.groupsize
+                groupsize=args.groupsize,
+                skip_vision=skip_vision
             )
             print(f"\nQuantization completed successfully!")
             print(f"Quantized model saved to: {quantized_path}")
diff --git a/lite_llama/quantization/__init__.py b/lite_llama/quantization/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
index 7a226a5..c410275 100644
--- a/lite_llama/quantization/gptq.py
+++ b/lite_llama/quantization/gptq.py
@@ -274,7 +274,8 @@ def quantize_litellama_model(
         groupsize: int = 128,
         device: str = "cuda",
         num_samples: int = 128,
-        seq_length: int = 2048
+        seq_length: int = 2048,
+        skip_vision: bool = False,
 ) -> None:
     """
     Main function to quantize a lite_llama model using GPTQ
@@ -320,21 +321,39 @@ def quantize_litellama_model(
         "layers": {}
     }
 
+    # Detect if this is a LLaVA model by checking for language_model prefix
+    is_llava = any("language_model" in key for key in state_dict.keys())
+
     # Get all weight keys that need quantization
     weight_keys_to_quantize = []
+
+    # Updated patterns for both regular and LLaVA models
+    patterns = [
+        # Regular model patterns
+        "q_proj", "kv_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+        "lm_head", "embed_tokens",
+        # LLaVA specific patterns (without _weight suffix in search)
+        "q_proj_weight", "kv_proj_weight", "o_proj_weight"
+    ]
+
     for key in state_dict.keys():
-        if any(pattern in key for pattern in [
-            "q_proj", "kv_proj", "o_proj",
-            "gate_proj", "up_proj", "down_proj",
-            "lm_head", "embed_tokens"
-        ]) and "weight" in key:
+        # For LLaVA models, also check for language_model prefix
+        if is_llava and "language_model" in key:
+            if any(pattern in key for pattern in patterns):
+                weight_keys_to_quantize.append(key)
+        elif any(pattern in key for pattern in patterns) and ("weight" in key or key.endswith(("_weight", ".weight"))):
             weight_keys_to_quantize.append(key)
 
     print(f"Found {len(weight_keys_to_quantize)} weights to quantize")
+    if is_llava:
+        print("Detected LLaVA model structure")
 
     # Process each weight
+    skipped_vision_count = 0
     for key in tqdm(weight_keys_to_quantize, desc="Quantizing layers"):
         weight = state_dict[key]
+        is_vision = "vision" in key.lower() or "multi_modal_projector" in key.lower()
 
         # Skip if weight is too small
         if weight.numel() < 1024:
@@ -342,6 +361,12 @@ def quantize_litellama_model(
             quantized_state_dict[key] = weight
             continue
 
+        # Skip vision weights if requested
+        if skip_vision and is_vision:
+            quantized_state_dict[key] = weight
+            skipped_vision_count += 1
+            continue
+
         print(f"\nQuantizing {key} (shape: {weight.shape})...")
 
         # Create a dummy layer for GPTQ
@@ -364,12 +389,21 @@ def quantize_litellama_model(
         )
 
         # Add calibration data (simplified - in practice, you'd run forward passes)
-        # For better results, we should use embeddings from the actual text
         # Get embedding weight if available
         embed_key = None
+
+        # Search for embedding key - handle both regular and LLaVA models
+        embed_patterns = [
+            "embed_tokens.weight",
+            "language_model.embed_tokens.weight"
+        ]
+
         for k in state_dict.keys():
-            if "embed_tokens.weight" in k:
-                embed_key = k
+            for pattern in embed_patterns:
+                if pattern in k:
+                    embed_key = k
+                    break
+            if embed_key:
                 break
 
         if embed_key and len(calibration_data) > 0:
@@ -450,6 +484,9 @@ def quantize_litellama_model(
 
     print(f"Quantization complete! Model saved to {output_path}")
 
+    if skipped_vision_count > 0:
+        print(f"Skipped {skipped_vision_count} vision model weights")
+
     # Print compression statistics
     original_size = sum(p.numel() * p.element_size() for p in state_dict.values())
     quantized_size = sum(p.numel() * p.element_size() for p in quantized_state_dict.values())
@@ -500,7 +537,8 @@ def quantize_after_conversion(
         wbits: int = 4,
         groupsize: int = 128,
         num_samples: int = 128,
-        seq_length: int = 2048
+        seq_length: int = 2048,
+        skip_vision: bool = False
 ):
     """
     Quantize model after it has been converted to lite_llama format
@@ -513,6 +551,7 @@ def quantize_after_conversion(
         groupsize: Group size for quantization (0 for auto-detect)
         num_samples: Number of calibration samples
         seq_length: Sequence length for calibration
+        skip_vision: Skip quantization of vision weights (for LLaVA)
     """
     # Construct paths
     model_id = os.path.basename(os.path.normpath(checkpoints_dir))
@@ -532,7 +571,8 @@ def quantize_after_conversion(
         wbits=wbits,
         groupsize=groupsize,
         num_samples=num_samples,
-        seq_length=seq_length
+        seq_length=seq_length,
+        skip_vision=skip_vision
     )
 
     return quantized_model_path
\ No newline at end of file
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index fc240f2..9a4f3b6 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -81,16 +81,54 @@ def count_tokens(texts: List[str], tokenizer) -> int:
     return total_tokens
 
 
-def get_model_type(checkpoint_path: str) -> str | None:
-    from utils.logger import log
-
-    model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
-
-    config_content = read_json(os.path.join(checkpoint_path, "config.json"))
-    for m in model_type:
-        if m in config_content["model_type"].lower():
-            if m == "llava":
-                return "llama"
-            return m
-    log.error(f"No model type found: {checkpoint_path}")
-    return None
+def check_model_compatibility(model_path):
+    """Check if the model is a valid converted lite_llama model"""
+    # Check for required files
+    required_files = []
+    optional_files = ["config.json", "tokenizer.model", "tokenizer.json"]
+
+    # Find .pth file
+    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
+    if not pth_files:
+        return False, "No .pth file found in the model directory"
+
+    # Check if already quantized
+    if any('gptq' in f for f in pth_files):
+        return False, "Model appears to be already quantized"
+
+    # Check for config files
+    found_configs = []
+    for config_file in optional_files:
+        if os.path.exists(os.path.join(model_path, config_file)):
+            found_configs.append(config_file)
+
+    if not found_configs:
+        return False, "No configuration files found (config.json, tokenizer.json, etc.)"
+
+    return True, "Model is compatible"
+
+
+def get_model_info(model_path):
+    """Extract model information from the directory"""
+    info = {
+        "model_name": os.path.basename(model_path),
+        "model_type": "unknown",
+        "size": 0
+    }
+
+    # Try to determine model type from name
+    model_name_lower = info["model_name"].lower()
+    if "llava" in model_name_lower:
+        info["model_type"] = "llava"
+    elif "qwen2" in model_name_lower:
+        info["model_type"] = "qwen2"
+    elif "llama" in model_name_lower:
+        info["model_type"] = "llama"
+
+    # Calculate model size
+    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
+    if pth_files:
+        model_file = os.path.join(model_path, pth_files[0])
+        info["size"] = os.path.getsize(model_file) / (1024 ** 3)  # Size in GB
+
+    return info
\ No newline at end of file
diff --git a/quantize_model.py b/quantize_model.py
index 8060612..eb3bde7 100644
--- a/quantize_model.py
+++ b/quantize_model.py
@@ -79,60 +79,84 @@ def parse_arguments():
         help="Sequence length for calibration samples (default: 2048)"
     )
 
+    parser.add_argument(
+        "--skip_vision",
+        action="store_true",
+        help="Skip quantization of vision model weights (for LLaVA models)"
+    )
+
+    parser.add_argument(
+        "--quantize_vision",
+        action="store_true",
+        help="Force quantization of vision model weights (not recommended for LLaVA)"
+    )
+
     return parser.parse_args()
 
 
 def check_model_compatibility(model_path):
-    """Check if the model is a valid converted lite_llama model"""
-    # Check for required files
-    required_files = []
-    optional_files = ["config.json", "tokenizer.model", "tokenizer.json"]
+    """Check if the model is compatible for quantization"""
+    # Check if model path exists and contains .pth files
+    if not os.path.exists(model_path):
+        return False, f"Model path does not exist: {model_path}"
 
-    # Find .pth file
     pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
     if not pth_files:
-        return False, "No .pth file found in the model directory"
-
-    # Check if already quantized
-    if any('gptq' in f for f in pth_files):
-        return False, "Model appears to be already quantized"
+        return False, f"No .pth files found in {model_path}"
 
-    # Check for config files
-    found_configs = []
-    for config_file in optional_files:
-        if os.path.exists(os.path.join(model_path, config_file)):
-            found_configs.append(config_file)
-
-    if not found_configs:
-        return False, "No configuration files found (config.json, tokenizer.json, etc.)"
+    # Check if required config files exist
+    config_files = ["config.json", "tokenizer_config.json"]
+    missing_configs = [f for f in config_files if not os.path.exists(os.path.join(model_path, f))]
+    if missing_configs:
+        print(f"Warning: Missing config files: {missing_configs}")
 
     return True, "Model is compatible"
 
 
 def get_model_info(model_path):
-    """Extract model information from the directory"""
-    info = {
+    """Get basic information about the model"""
+    model_info = {
         "model_name": os.path.basename(model_path),
         "model_type": "unknown",
-        "size": 0
+        "size": 0.0
     }
 
-    # Try to determine model type from name
-    model_name_lower = info["model_name"].lower()
+    # Detect model type from path or config
+    model_name_lower = model_info["model_name"].lower()
     if "llava" in model_name_lower:
-        info["model_type"] = "llava"
+        model_info["model_type"] = "llava"
     elif "qwen2" in model_name_lower:
-        info["model_type"] = "qwen2"
+        model_info["model_type"] = "qwen2"
     elif "llama" in model_name_lower:
-        info["model_type"] = "llama"
-
-    # Calculate model size
-    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
-    if pth_files:
-        model_file = os.path.join(model_path, pth_files[0])
-        info["size"] = os.path.getsize(model_file) / (1024 ** 3)  # Size in GB
-
-    return info
+        model_info["model_type"] = "llama"
+
+    # Try to read from config.json
+    config_path = os.path.join(model_path, "config.json")
+    if os.path.exists(config_path):
+        try:
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+                if "architectures" in config:
+                    arch = config["architectures"][0].lower()
+                    if "llava" in arch:
+                        model_info["model_type"] = "llava"
+                    elif "qwen2" in arch:
+                        model_info["model_type"] = "qwen2"
+                    elif "llama" in arch:
+                        model_info["model_type"] = "llama"
+        except:
+            pass
+
+    # Calculate total size
+    total_size = 0
+    for f in os.listdir(model_path):
+        if f.endswith('.pth'):
+            file_path = os.path.join(model_path, f)
+            total_size += os.path.getsize(file_path)
+
+    model_info["size"] = total_size / (1024 ** 3)  # Convert to GB
+
+    return model_info
 
 
 def main():
@@ -157,10 +181,20 @@ def main():
     # Get model information
     model_info = get_model_info(args.model_path)
 
+    # Detect if this is a LLaVA model
+    is_llava = model_info["model_type"] == "llava"
+    if is_llava:
+        print("\n⚠️  Detected LLaVA model - will handle vision weights specially")
+        if not args.quantize_vision and not args.skip_vision:
+            args.skip_vision = True  # Default to skipping vision weights
+            print("   Skipping vision weights by default (use --quantize_vision to override)")
+
     print(f"\nModel Information:")
     print(f"  Name: {model_info['model_name']}")
     print(f"  Type: {model_info['model_type']}")
     print(f"  Size: {model_info['size']:.2f} GB")
+    if is_llava:
+        print(f"  Vision weights: {'Will be quantized' if args.quantize_vision else 'Will be skipped'}")
 
     # Auto-detect groupsize if requested
     if args.groupsize == 0:
@@ -213,8 +247,7 @@ def main():
     if args.output_path is None:
         parent_dir = os.path.dirname(args.model_path)
         model_name = os.path.basename(args.model_path)
-        args.output_path = parent_dir + f"{model_name}-{args.wbits}bit-gptq"
-
+        args.output_path = os.path.join(parent_dir, f"{model_name}-{args.wbits}bit-gptq")
 
     print(f"\nOutput path: {args.output_path}")
 
@@ -239,7 +272,8 @@ def main():
             groupsize=args.groupsize,
             device=args.device,
             num_samples=args.num_samples,
-            seq_length=args.seq_length
+            seq_length=args.seq_length,
+            skip_vision=args.skip_vision
         )
 
         print("\n" + "=" * 60)
@@ -251,12 +285,20 @@ def main():
 
         # Calculate and show compression ratio
         original_size = model_info['size']
-        quantized_size = sum(
-            os.path.getsize(os.path.join(args.output_path, f)) / (1024 ** 3)
-            for f in os.listdir(args.output_path)
-            if f.endswith('.pth')
-        )
-        compression_ratio = original_size / quantized_size if quantized_size > 0 else 0
+
+        # Calculate quantized size
+        quantized_size = 0
+        if os.path.exists(args.output_path):
+            for f in os.listdir(args.output_path):
+                if f.endswith('.pth'):
+                    file_path = os.path.join(args.output_path, f)
+                    quantized_size += os.path.getsize(file_path) / (1024 ** 3)
+
+        if quantized_size > 0:
+            compression_ratio = original_size / quantized_size
+        else:
+            print("\nWarning: Could not calculate compression ratio (output files not found)")
+            compression_ratio = 0
 
         print(f"\nCompression Statistics:")
         print(f"  Original size: {original_size:.2f} GB")
@@ -264,6 +306,21 @@ def main():
         print(f"  Compression ratio: {compression_ratio:.2f}x")
         print(f"  Space saved: {(1 - 1 / compression_ratio) * 100:.1f}%")
 
+        # Expected compression analysis
+        expected_ratio = 32 / (args.wbits + 0.5)  # 0.5 for metadata overhead
+        if compression_ratio < 1.5:
+            print(f"\n⚠️  Warning: Low compression ratio detected!")
+            print(f"  Expected: ~{expected_ratio:.1f}x for {args.wbits}-bit quantization")
+            print(f"  Actual: {compression_ratio:.2f}x")
+            print("\nPossible reasons:")
+            print("  - Model has many non-quantizable layers (embeddings, norms)")
+            print("  - Vision components were skipped (for LLaVA)")
+            print("  - Small model size (quantization overhead is more significant)")
+            print("\nFor better compression, consider:")
+            print("  - Using fewer bits (e.g., 3-bit or 2-bit)")
+            print("  - Larger groupsize (reduces metadata overhead)")
+            print("  - Quantizing embeddings (if safe for your use case)")
+
     except KeyboardInterrupt:
         print("\n\nQuantization interrupted by user.")
         return 1

From f4c1010e4453797dabaa0f2f549de6b0c62d8de4 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 24 May 2025 22:36:24 +0930
Subject: [PATCH 16/33] fix print problem

---
 lite_llama/quantization/gptq.py | 196 ++++++++++++++++++++++++++++----
 lite_llama/utils/common.py      |  76 +++++++------
 quantize_model.py               | 100 +++-------------
 3 files changed, 237 insertions(+), 135 deletions(-)

diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
index c410275..edecd1e 100644
--- a/lite_llama/quantization/gptq.py
+++ b/lite_llama/quantization/gptq.py
@@ -1,3 +1,14 @@
+"""
+GPTQ Quantization for Lite-LLaMA Models
+
+Key improvements in this version:
+1. Proper weight packing for 4-bit quantization (2 weights per byte)
+2. Stores quantized weights as uint8/int16 instead of float32
+3. Separate storage of scale/zero parameters
+4. Accurate compression ratio calculation
+5. Support for LLaVA models with vision weight skipping
+"""
+
 import torch
 import torch.nn as nn
 from tqdm import tqdm
@@ -9,6 +20,37 @@
 import gc
 
 
+def pack_4bit_weights(qweight, n_rows, n_cols):
+    """Pack 4-bit weights into uint8 format (2 weights per byte)"""
+    # Ensure even number of columns for packing
+    if n_cols % 2 != 0:
+        # Pad with zeros if odd number of columns
+        qweight = torch.nn.functional.pad(qweight, (0, 1), value=0)
+        n_cols += 1
+
+    # Pack two 4-bit values into one 8-bit value
+    packed = torch.zeros((n_rows, n_cols // 2), dtype=torch.uint8, device=qweight.device)
+    for i in range(0, n_cols, 2):
+        # First 4-bit value in lower nibble, second in upper nibble
+        packed[:, i // 2] = (qweight[:, i] & 0xF) | ((qweight[:, i + 1] & 0xF) << 4)
+
+    return packed
+
+
+def unpack_4bit_weights(packed, n_rows, original_n_cols):
+    """Unpack 4-bit weights from uint8 format"""
+    n_packed_cols = packed.shape[1]
+    unpacked = torch.zeros((n_rows, n_packed_cols * 2), dtype=torch.uint8, device=packed.device)
+
+    for i in range(n_packed_cols):
+        # Extract lower and upper nibbles
+        unpacked[:, i * 2] = packed[:, i] & 0xF
+        unpacked[:, i * 2 + 1] = (packed[:, i] >> 4) & 0xF
+
+    # Remove padding if it was added
+    return unpacked[:, :original_n_cols]
+
+
 class GPTQ:
     """
     GPTQ Quantizer for custom lite_llama models
@@ -126,6 +168,13 @@ def quantize(self):
         scale = torch.zeros((n_groups, 1), device=self.device)
         zero = torch.zeros((n_groups, 1), device=self.device)
 
+        # Create quantized weight tensor with appropriate dtype
+        if self.wbits <= 8:
+            Q = torch.zeros((self.rows, self.columns), dtype=torch.uint8, device=self.device)
+        else:
+            # For now, use int16 for >8 bits, though this won't save space
+            Q = torch.zeros((self.rows, self.columns), dtype=torch.int16, device=self.device)
+
         # Quantize layer weights
         for i1 in range(0, self.columns, 128):
             i2 = min(i1 + 128, self.columns)
@@ -185,15 +234,16 @@ def quantize(self):
                     update = err_col.matmul(hinv_row)  # Shape: (rows, remaining_cols)
                     W1[:, i + 1:] -= update
 
-            W[:, i1:i2] = Q1
+            # Store in compact format
+            Q[:, i1:i2] = Q1.to(Q.dtype)
 
-        # Store quantized weights and parameters
-        self.layer.weight.data = W.to(self.layer.weight.dtype)
+        # Store quantized weights in packed format
+        self.qweight = Q
         self.scale = scale
         self.zero = zero
         self.quantized = True
 
-        return scale, zero
+        return Q, scale, zero
 
 
 def prepare_calibration_data(
@@ -435,16 +485,33 @@ def quantize_litellama_model(
                 gptq.add_batch(fake_inp)
 
         # Quantize
-        scale, zero = gptq.quantize()
-
-        # Store quantized weight and parameters
-        quantized_state_dict[key] = layer.weight.data.cpu()
-        quantization_config["layers"][key] = {
-            "scale": scale.cpu().tolist(),
-            "zero": zero.cpu().tolist(),
-            "groupsize": groupsize,
-            "wbits": wbits
-        }
+        qweight, scale, zero = gptq.quantize()
+
+        # Pack weights if 4-bit quantization
+        if wbits == 4:
+            # Store original shape for unpacking
+            original_shape = qweight.shape
+            packed_weight = pack_4bit_weights(qweight, qweight.shape[0], qweight.shape[1])
+            quantized_state_dict[key] = packed_weight.cpu()
+            # Store scale and zero as tensors with specific keys
+            quantized_state_dict[f"{key}.scale"] = scale.cpu()
+            quantized_state_dict[f"{key}.zero"] = zero.cpu()
+            quantization_config["layers"][key] = {
+                "groupsize": groupsize,
+                "wbits": wbits,
+                "original_shape": list(original_shape),
+                "packed": True
+            }
+        else:
+            # For 8-bit or other, store directly
+            quantized_state_dict[key] = qweight.cpu()
+            quantized_state_dict[f"{key}.scale"] = scale.cpu()
+            quantized_state_dict[f"{key}.zero"] = zero.cpu()
+            quantization_config["layers"][key] = {
+                "groupsize": groupsize,
+                "wbits": wbits,
+                "packed": False
+            }
 
         # Clean up
         del layer, gptq
@@ -456,7 +523,7 @@ def quantize_litellama_model(
         if key not in quantized_state_dict:
             quantized_state_dict[key] = state_dict[key]
 
-    # Save quantized model
+    # Save quantized model - now everything is in the state dict
     model_id = os.path.basename(model_path)
     torch.save(
         quantized_state_dict,
@@ -489,12 +556,25 @@ def quantize_litellama_model(
 
     # Print compression statistics
     original_size = sum(p.numel() * p.element_size() for p in state_dict.values())
-    quantized_size = sum(p.numel() * p.element_size() for p in quantized_state_dict.values())
-    compression_ratio = original_size / quantized_size
 
-    print(f"Original model size: {original_size / 1e9:.2f} GB")
+    # Calculate quantized size more accurately
+    quantized_size = 0
+    for key, tensor in quantized_state_dict.items():
+        # Add size of each tensor in the quantized state dict
+        quantized_size += tensor.numel() * tensor.element_size()
+
+    compression_ratio = original_size / quantized_size if quantized_size > 0 else 0
+
+    print(f"\nOriginal model size: {original_size / 1e9:.2f} GB")
     print(f"Quantized model size: {quantized_size / 1e9:.2f} GB")
     print(f"Compression ratio: {compression_ratio:.2f}x")
+    print(f"Space saved: {(1 - quantized_size / original_size) * 100:.1f}%")
+
+    # Expected compression ratios
+    expected_ratio = 32 / (wbits + 0.5)  # 0.5 for metadata overhead
+    if compression_ratio < expected_ratio * 0.8:
+        print(f"\nNote: Compression ratio is lower than expected ({expected_ratio:.1f}x)")
+        print("This may be due to non-quantized layers (embeddings, layer norms, etc.)")
 
 
 def dequantize_weight(
@@ -502,21 +582,35 @@ def dequantize_weight(
         scale: torch.Tensor,
         zero: torch.Tensor,
         wbits: int = 4,
-        groupsize: int = 128
+        groupsize: int = 128,
+        original_shape: Optional[Tuple[int, int]] = None,
+        packed: bool = False
 ) -> torch.Tensor:
     """
     Dequantize a weight tensor
 
     Args:
-        quantized_weight: Quantized weight tensor
+        quantized_weight: Quantized weight tensor (possibly packed)
         scale: Scale parameters
         zero: Zero point parameters
         wbits: Quantization bits
         groupsize: Group size used in quantization
+        original_shape: Original shape before packing (for 4-bit)
+        packed: Whether the weights are packed
 
     Returns:
         Dequantized weight tensor
     """
+    # Unpack if necessary
+    if packed and wbits == 4:
+        if original_shape is None:
+            raise ValueError("original_shape required for unpacking 4-bit weights")
+        quantized_weight = unpack_4bit_weights(
+            quantized_weight,
+            original_shape[0],
+            original_shape[1]
+        )
+
     weight = torch.zeros_like(quantized_weight, dtype=torch.float32)
 
     for i in range(0, quantized_weight.shape[0], groupsize):
@@ -524,11 +618,71 @@ def dequantize_weight(
         group_idx = i // groupsize
 
         if group_idx < scale.shape[0]:
-            weight[i:j] = (quantized_weight[i:j] - zero[group_idx]) * scale[group_idx]
+            weight[i:j] = (quantized_weight[i:j].float() - zero[group_idx]) * scale[group_idx]
 
     return weight
 
 
+def load_quantized_model(model_path: str, device: str = "cpu"):
+    """
+    Load a quantized model
+
+    Args:
+        model_path: Path to the quantized .pth file
+        device: Device to load the model to
+
+    Returns:
+        state_dict with quantized weights and metadata
+    """
+    return torch.load(model_path, map_location=device)
+
+
+def dequantize_model(model_path: str, quantization_config: dict, output_path: str = None):
+    """
+    Fully dequantize a quantized model back to fp16/fp32
+
+    Args:
+        model_path: Path to quantized model
+        quantization_config: Quantization configuration dict
+        output_path: Where to save dequantized model
+    """
+    # Load quantized model
+    state_dict = load_quantized_model(model_path)
+
+    # Dequantize each layer
+    dequantized_dict = {}
+    for key, tensor in state_dict.items():
+        # Skip scale and zero tensors
+        if key.endswith('.scale') or key.endswith('.zero'):
+            continue
+
+        # Check if this is a quantized layer
+        if key in quantization_config["layers"]:
+            layer_config = quantization_config["layers"][key]
+            scale = state_dict[f"{key}.scale"]
+            zero = state_dict[f"{key}.zero"]
+
+            dequantized = dequantize_weight(
+                tensor,
+                scale,
+                zero,
+                wbits=layer_config["wbits"],
+                groupsize=layer_config["groupsize"],
+                original_shape=layer_config.get("original_shape"),
+                packed=layer_config.get("packed", False)
+            )
+            dequantized_dict[key] = dequantized
+        else:
+            dequantized_dict[key] = tensor
+
+    # Save if output path provided
+    if output_path:
+        torch.save(dequantized_dict, output_path)
+        print(f"Dequantized model saved to {output_path}")
+
+    return dequantized_dict
+
+
 # Integration with your existing code
 def quantize_after_conversion(
         checkpoints_dir: str,
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 9a4f3b6..75cc253 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -82,53 +82,65 @@ def count_tokens(texts: List[str], tokenizer) -> int:
 
 
 def check_model_compatibility(model_path):
-    """Check if the model is a valid converted lite_llama model"""
-    # Check for required files
-    required_files = []
-    optional_files = ["config.json", "tokenizer.model", "tokenizer.json"]
+    """Check if the model is compatible for quantization"""
+    # Check if model path exists and contains .pth files
+    if not os.path.exists(model_path):
+        return False, f"Model path does not exist: {model_path}"
 
-    # Find .pth file
     pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
     if not pth_files:
-        return False, "No .pth file found in the model directory"
+        return False, f"No .pth files found in {model_path}"
 
-    # Check if already quantized
-    if any('gptq' in f for f in pth_files):
-        return False, "Model appears to be already quantized"
-
-    # Check for config files
-    found_configs = []
-    for config_file in optional_files:
-        if os.path.exists(os.path.join(model_path, config_file)):
-            found_configs.append(config_file)
-
-    if not found_configs:
-        return False, "No configuration files found (config.json, tokenizer.json, etc.)"
+    # Check if required config files exist
+    config_files = ["config.json", "tokenizer_config.json"]
+    missing_configs = [f for f in config_files if not os.path.exists(os.path.join(model_path, f))]
+    if missing_configs:
+        print(f"Warning: Missing config files: {missing_configs}")
 
     return True, "Model is compatible"
 
 
 def get_model_info(model_path):
-    """Extract model information from the directory"""
-    info = {
+    """Get basic information about the model"""
+    model_info = {
         "model_name": os.path.basename(model_path),
         "model_type": "unknown",
-        "size": 0
+        "size": 0.0
     }
 
-    # Try to determine model type from name
-    model_name_lower = info["model_name"].lower()
+    # Detect model type from path or config
+    model_name_lower = model_info["model_name"].lower()
     if "llava" in model_name_lower:
-        info["model_type"] = "llava"
+        model_info["model_type"] = "llava"
     elif "qwen2" in model_name_lower:
-        info["model_type"] = "qwen2"
+        model_info["model_type"] = "qwen2"
     elif "llama" in model_name_lower:
-        info["model_type"] = "llama"
+        model_info["model_type"] = "llama"
 
-    # Calculate model size
-    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
-    if pth_files:
-        model_file = os.path.join(model_path, pth_files[0])
-        info["size"] = os.path.getsize(model_file) / (1024 ** 3)  # Size in GB
+    # Try to read from config.json
+    config_path = os.path.join(model_path, "config.json")
+    if os.path.exists(config_path):
+        try:
+            with open(config_path, 'r') as f:
+                config = json.load(f)
+                if "architectures" in config:
+                    arch = config["architectures"][0].lower()
+                    if "llava" in arch:
+                        model_info["model_type"] = "llava"
+                    elif "qwen2" in arch:
+                        model_info["model_type"] = "qwen2"
+                    elif "llama" in arch:
+                        model_info["model_type"] = "llama"
+        except:
+            pass
+
+    # Calculate total size
+    total_size = 0
+    for f in os.listdir(model_path):
+        if f.endswith('.pth'):
+            file_path = os.path.join(model_path, f)
+            total_size += os.path.getsize(file_path)
+
+    model_info["size"] = total_size / (1024 ** 3)  # Convert to GB
 
-    return info
\ No newline at end of file
+    return model_info
\ No newline at end of file
diff --git a/quantize_model.py b/quantize_model.py
index eb3bde7..d9af7e5 100644
--- a/quantize_model.py
+++ b/quantize_model.py
@@ -13,7 +13,8 @@
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
 from lite_llama.quantization.gptq import quantize_litellama_model
-
+from lite_llama.utils.common import get_model_info, check_model_compatibility
+from lite_llama.utils.logger import log
 
 def parse_arguments():
     """Parse command line arguments"""
@@ -94,71 +95,6 @@ def parse_arguments():
     return parser.parse_args()
 
 
-def check_model_compatibility(model_path):
-    """Check if the model is compatible for quantization"""
-    # Check if model path exists and contains .pth files
-    if not os.path.exists(model_path):
-        return False, f"Model path does not exist: {model_path}"
-
-    pth_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
-    if not pth_files:
-        return False, f"No .pth files found in {model_path}"
-
-    # Check if required config files exist
-    config_files = ["config.json", "tokenizer_config.json"]
-    missing_configs = [f for f in config_files if not os.path.exists(os.path.join(model_path, f))]
-    if missing_configs:
-        print(f"Warning: Missing config files: {missing_configs}")
-
-    return True, "Model is compatible"
-
-
-def get_model_info(model_path):
-    """Get basic information about the model"""
-    model_info = {
-        "model_name": os.path.basename(model_path),
-        "model_type": "unknown",
-        "size": 0.0
-    }
-
-    # Detect model type from path or config
-    model_name_lower = model_info["model_name"].lower()
-    if "llava" in model_name_lower:
-        model_info["model_type"] = "llava"
-    elif "qwen2" in model_name_lower:
-        model_info["model_type"] = "qwen2"
-    elif "llama" in model_name_lower:
-        model_info["model_type"] = "llama"
-
-    # Try to read from config.json
-    config_path = os.path.join(model_path, "config.json")
-    if os.path.exists(config_path):
-        try:
-            with open(config_path, 'r') as f:
-                config = json.load(f)
-                if "architectures" in config:
-                    arch = config["architectures"][0].lower()
-                    if "llava" in arch:
-                        model_info["model_type"] = "llava"
-                    elif "qwen2" in arch:
-                        model_info["model_type"] = "qwen2"
-                    elif "llama" in arch:
-                        model_info["model_type"] = "llama"
-        except:
-            pass
-
-    # Calculate total size
-    total_size = 0
-    for f in os.listdir(model_path):
-        if f.endswith('.pth'):
-            file_path = os.path.join(model_path, f)
-            total_size += os.path.getsize(file_path)
-
-    model_info["size"] = total_size / (1024 ** 3)  # Convert to GB
-
-    return model_info
-
-
 def main():
     # Parse arguments
     args = parse_arguments()
@@ -190,9 +126,9 @@ def main():
             print("   Skipping vision weights by default (use --quantize_vision to override)")
 
     print(f"\nModel Information:")
-    print(f"  Name: {model_info['model_name']}")
-    print(f"  Type: {model_info['model_type']}")
-    print(f"  Size: {model_info['size']:.2f} GB")
+    print(f"Name: {model_info['model_name']}")
+    print(f"Type: {model_info['model_type']}")
+    print(f"Size: {model_info['size']:.2f} GB")
     if is_llava:
         print(f"  Vision weights: {'Will be quantized' if args.quantize_vision else 'Will be skipped'}")
 
@@ -232,10 +168,10 @@ def main():
             del sample_weights
 
     print(f"\nQuantization Settings:")
-    print(f"  Bits: {args.wbits}")
-    print(f"  Group size: {args.groupsize}")
-    print(f"  Device: {args.device}")
-    print(f"  Calibration data: {args.calibration_data or 'Default'}")
+    print(f"Bits: {args.wbits}")
+    print(f"Group size: {args.groupsize}")
+    print(f"Device: {args.device}")
+    print(f"Calibration data: {args.calibration_data or 'Default'}")
 
     # Check CUDA availability
     if args.device == "cuda" and not torch.cuda.is_available():
@@ -297,19 +233,19 @@ def main():
         if quantized_size > 0:
             compression_ratio = original_size / quantized_size
         else:
-            print("\nWarning: Could not calculate compression ratio (output files not found)")
+            log.warning("\nWarning: Could not calculate compression ratio (output files not found)")
             compression_ratio = 0
 
-        print(f"\nCompression Statistics:")
-        print(f"  Original size: {original_size:.2f} GB")
-        print(f"  Quantized size: {quantized_size:.2f} GB")
-        print(f"  Compression ratio: {compression_ratio:.2f}x")
-        print(f"  Space saved: {(1 - 1 / compression_ratio) * 100:.1f}%")
+        log.info(f"\nCompression Statistics:")
+        print(f"Original size: {original_size:.2f} GB")
+        print(f"Quantized size: {quantized_size:.2f} GB")
+        print(f"Compression ratio: {compression_ratio:.2f}x")
+        print(f"Space saved: {(1 - 1 / compression_ratio) * 100:.1f}%")
 
         # Expected compression analysis
         expected_ratio = 32 / (args.wbits + 0.5)  # 0.5 for metadata overhead
         if compression_ratio < 1.5:
-            print(f"\n⚠️  Warning: Low compression ratio detected!")
+            log.warning(f"\n⚠️Low compression ratio detected!")
             print(f"  Expected: ~{expected_ratio:.1f}x for {args.wbits}-bit quantization")
             print(f"  Actual: {compression_ratio:.2f}x")
             print("\nPossible reasons:")
@@ -322,10 +258,10 @@ def main():
             print("  - Quantizing embeddings (if safe for your use case)")
 
     except KeyboardInterrupt:
-        print("\n\nQuantization interrupted by user.")
+        log.error("\n\nQuantization interrupted by user.")
         return 1
     except Exception as e:
-        print(f"\nError during quantization: {e}")
+        log.error(f"\nError during quantization: {e}")
         import traceback
         traceback.print_exc()
         return 1

From 57fba86a1ab385deb8c406296dc2fae2fd99a0a8 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 24 May 2025 22:42:14 +0930
Subject: [PATCH 17/33] fix naming issue

---
 generate.py                |  2 +-
 lite_llama/utils/common.py | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/generate.py b/generate.py
index 84f91d1..cd096d1 100644
--- a/generate.py
+++ b/generate.py
@@ -201,7 +201,7 @@ def generate_llava(
 
 
     PARSER = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
-    PARSER.add_argument('-m', "--model_path", type=str,
+    PARSER.add_argument('-m', "--checkpoint_path", type=str,
                         default='checkpoints/lit-llama/7B/',
                         help='Path of the Model')
     PARSER.add_argument('-q', "--quant_method", type=str,
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 75cc253..9b5d249 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -81,6 +81,21 @@ def count_tokens(texts: List[str], tokenizer) -> int:
     return total_tokens
 
 
+def get_model_type(checkpoint_path: str) -> str | None:
+    from utils.logger import log
+
+    model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
+
+    config_content = read_json(os.path.join(checkpoint_path, "config.json"))
+    for m in model_type:
+        if m in config_content["model_type"].lower():
+            if m == "llava":
+                return "llama"
+            return m
+    log.error(f"No model type found: {checkpoint_path}")
+    return None
+
+
 def check_model_compatibility(model_path):
     """Check if the model is compatible for quantization"""
     # Check if model path exists and contains .pth files

From cabeb3e530f989fc5707aeec81060d36eff2618b Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Mon, 26 May 2025 11:35:07 +0930
Subject: [PATCH 18/33] fix gptq compress

---
 apply_weight_convert.py                       | 305 +++-----
 generate.py                                   | 284 ++++---
 lite_llama/executor/weight_convert_gptq.py    | 553 +++++++++++++
 lite_llama/quantization/debug_quantization.py | 247 ------
 lite_llama/quantization/gptq.py               | 732 ------------------
 lite_llama/quantization/gptq/__init__.py      |   0
 lite_llama/quantization/gptq/gptq_executor.py | 218 ++++++
 lite_llama/quantization/gptq/gptq_loader.py   | 550 +++++++++++++
 quantize_model.py                             | 273 -------
 9 files changed, 1549 insertions(+), 1613 deletions(-)
 create mode 100644 lite_llama/executor/weight_convert_gptq.py
 delete mode 100644 lite_llama/quantization/debug_quantization.py
 delete mode 100644 lite_llama/quantization/gptq.py
 create mode 100644 lite_llama/quantization/gptq/__init__.py
 create mode 100644 lite_llama/quantization/gptq/gptq_executor.py
 create mode 100644 lite_llama/quantization/gptq/gptq_loader.py
 delete mode 100644 quantize_model.py

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index 33f467d..d366688 100644
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -6,246 +6,125 @@
     LlavaConfig,
 )
 import argparse
-import os
-import sys
-
-# Add the gptq_quantize module to the path
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 
+# 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
 from lite_llama.executor.weight_convert import (
     convert_llavallama_hf_to_litellama,
     convert_llama_hf_to_litellama,
     convert_qwen2_hf_to_litellama,
 )
-
-# Import the GPTQ quantization function
-from lite_llama.quantization.gptq import quantize_after_conversion
+from lite_llama.executor.weight_convert_gptq import (
+    convert_llavallama_hf_to_litellama_gptq,
+    convert_llama_hf_to_litellama_gptq,
+    convert_qwen2_hf_to_litellama_gptq,
+)
 
 import warnings
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
 
 
-def parse_arguments():
-    """Parse command line arguments"""
-    parser = argparse.ArgumentParser(
-        description="Convert HuggingFace models to Lite-LLaMA format with optional GPTQ quantization"
-    )
-
-    parser.add_argument(
-        "--checkpoint_dir",
-        type=str,
-        required=True,
-        help="Path to the HuggingFace model checkpoint directory"
-    )
-
-    parser.add_argument(
-        "--quantize",
-        action="store_true",
-        help="Enable GPTQ quantization after conversion"
-    )
-
-    parser.add_argument(
-        "--wbits",
-        type=int,
-        default=4,
-        choices=[2, 3, 4, 8],
-        help="Number of bits for quantization (default: 4)"
-    )
-
-    parser.add_argument(
-        "--groupsize",
-        type=int,
-        default=128,
-        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
-    )
-
-    parser.add_argument(
-        "--calibration_data",
-        type=str,
-        default=None,
-        help="Path to calibration dataset file for GPTQ (optional)"
-    )
-
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help="Device to use for conversion (default: cuda)"
-    )
-
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="float16",
-        choices=["float16", "float32", "bfloat16"],
-        help="Data type for model weights (default: float16)"
-    )
-
-    return parser.parse_args()
-
-
-def get_torch_dtype(dtype_str):
-    """Convert string dtype to torch dtype"""
-    dtype_map = {
-        "float16": torch.float16,
-        "float32": torch.float32,
-        "bfloat16": torch.bfloat16
-    }
-    return dtype_map.get(dtype_str, torch.float16)
-
-
 def main():
-    # Parse arguments
-    args = parse_arguments()
+    parser = argparse.ArgumentParser(description='Convert HF models to LiteLLaMA format with optional GPTQ compression')
+    parser.add_argument('--checkpoint_dir', type=str, required=True,
+                        help='Path to the model checkpoint directory')
+    parser.add_argument('--use_gptq', action='store_true',
+                        help='Enable GPTQ quantization (4-bit by default)')
+    parser.add_argument('--bits', type=int, default=4, choices=[2, 3, 4, 8],
+                        help='Number of bits for GPTQ quantization')
+    parser.add_argument('--group_size', type=int, default=128,
+                        help='Group size for GPTQ quantization')
+    parser.add_argument('--act_order', action='store_true',
+                        help='Use activation order for GPTQ quantization')
+    parser.add_argument('--calibration_dataset', type=str, default='c4',
+                        help='Dataset to use for GPTQ calibration')
+    parser.add_argument('--nsamples', type=int, default=128,
+                        help='Number of calibration samples for GPTQ')
+
+    args = parser.parse_args()
 
     checkpoints_dir = args.checkpoint_dir
-    device = args.device
-    torch_dtype = get_torch_dtype(args.dtype)
-
-    print(f"Converting model from: {checkpoints_dir}")
-    print(f"Device: {device}")
-    print(f"Data type: {args.dtype}")
-    print(f"Quantization: {'Enabled' if args.quantize else 'Disabled'}")
-
-    if args.quantize:
-        print(f"  - Bits: {args.wbits}")
-        print(f"  - Group size: {args.groupsize}")
-        print(f"  - Calibration data: {args.calibration_data or 'Default'}")
 
-    print("\n" + "=" * 50 + "\n")
-
-    # Step 1: Load the model
-    print("Loading model...")
-
-    try:
-        if "llava" in checkpoints_dir.lower():
-            model = LlavaForConditionalGeneration.from_pretrained(
+    if "llava" in checkpoints_dir.lower():
+        model = (
+            LlavaForConditionalGeneration.from_pretrained(
                 checkpoints_dir,
-                torch_dtype=torch_dtype,
+                torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
-            )
-            model_type = "llava"
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
+            ).to("cuda")
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            checkpoints_dir,
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        ).to("cuda")
+
+    hf_sd = model.state_dict()
+
+    # Determine the conversion function based on model type and GPTQ flag
+    if "qwen2" in checkpoints_dir.lower():
+        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+        num_layers = llm_config.num_hidden_layers
+        print("num_layers: ", num_layers)
+
+        if args.use_gptq:
+            print(f"Converting Qwen2 with GPTQ quantization ({args.bits}-bit)...")
+            convert_qwen2_hf_to_litellama_gptq(
                 checkpoints_dir,
-                torch_dtype=torch_dtype,
-                low_cpu_mem_usage=True,
+                model,  # Pass model instead of state dict for GPTQ
+                num_layers,
+                bits=args.bits,
+                group_size=args.group_size,
+                act_order=args.act_order,
+                calibration_dataset=args.calibration_dataset,
+                nsamples=args.nsamples
             )
-            # Determine model type
-            if "qwen2" in checkpoints_dir.lower():
-                model_type = "qwen2"
-            elif "llama" in checkpoints_dir.lower():
-                model_type = "llama"
-            else:
-                print("Warning: Could not determine model type from path.")
-                print("Assuming Llama architecture...")
-                model_type = "llama"
-
-        if device == "cuda" and torch.cuda.is_available():
-            model = model.to(device)
-
-        hf_sd = model.state_dict()
-
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return 1
-
-    # Step 2: Convert to lite_llama format
-    print(f"\nConverting {model_type} model to lite_llama format...")
-
-    try:
-        if model_type == "qwen2":
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
+        else:
             convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
 
-        elif model_type == "llama":
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
+    elif "llama" in checkpoints_dir.lower():
+        llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+        num_layers = llm_config.num_hidden_layers
+        print("num_layers: ", num_layers)
+
+        if args.use_gptq:
+            print(f"Converting Llama with GPTQ quantization ({args.bits}-bit)...")
+            convert_llama_hf_to_litellama_gptq(
+                checkpoints_dir,
+                model,  # Pass model instead of state dict for GPTQ
+                num_layers,
+                bits=args.bits,
+                group_size=args.group_size,
+                act_order=args.act_order,
+                calibration_dataset=args.calibration_dataset,
+                nsamples=args.nsamples
+            )
+        else:
             convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
 
-        elif model_type == "llava":
-            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-            num_layers = llava_config.text_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
-            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+    elif "llava" in checkpoints_dir.lower():
+        llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
+        num_layers = llava_config.text_config.num_hidden_layers
+        print("num_layers: ", num_layers)
 
-        print("Conversion completed successfully!")
-
-    except Exception as e:
-        print(f"Error during conversion: {e}")
-        return 1
-
-    # Free memory
-    del model, hf_sd
-    if device == "cuda":
-        torch.cuda.empty_cache()
-
-    # Step 3: Optional quantization
-    if args.quantize:
-        print("\n" + "=" * 50)
-        print(f"Starting GPTQ quantization ({args.wbits}-bit)...")
-
-        # Auto-detect groupsize if needed
-        if args.groupsize == 0:
-            print("Auto-detecting optimal groupsize...")
-            # Quick check of vocabulary size
-            vocab_sizes = []
-            for name, param in hf_sd.items():
-                if ("embed" in name or "lm_head" in name) and len(param.shape) >= 2:
-                    vocab_sizes.extend(param.shape)
-
-            if vocab_sizes:
-                vocab_size = max(vocab_sizes)
-                # Find best groupsize
-                for gs in [128, 256, 512, 1024]:
-                    if vocab_size % gs == 0:
-                        args.groupsize = gs
-                        print(f"Selected groupsize: {gs} (perfect fit for vocab size {vocab_size})")
-                        break
-                else:
-                    args.groupsize = 256 if vocab_size > 100000 else 128
-                    print(f"Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
-
-        print(f"Groupsize: {args.groupsize}")
-        print("=" * 50 + "\n")
-
-        # Check if it's a LLaVA model and set skip_vision accordingly
-        skip_vision = model_type == "llava"
-
-        try:
-            quantized_path = quantize_after_conversion(
-                checkpoints_dir=checkpoints_dir,
-                model_type=model_type,
-                calibration_data_path=args.calibration_data,
-                wbits=args.wbits,
-                groupsize=args.groupsize,
-                skip_vision=skip_vision
+        if args.use_gptq:
+            print(f"Converting LLaVA with GPTQ quantization ({args.bits}-bit)...")
+            convert_llavallama_hf_to_litellama_gptq(
+                checkpoints_dir,
+                model,  # Pass model instead of state dict for GPTQ
+                num_layers,
+                bits=args.bits,
+                group_size=args.group_size,
+                act_order=args.act_order,
+                calibration_dataset=args.calibration_dataset,
+                nsamples=args.nsamples
             )
-            print(f"\nQuantization completed successfully!")
-            print(f"Quantized model saved to: {quantized_path}")
-
-        except Exception as e:
-            print(f"Error during quantization: {e}")
-            print("The converted model was saved successfully, but quantization failed.")
-            return 1
+        else:
+            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
     else:
-        model_id = os.path.basename(os.path.normpath(checkpoints_dir))
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        converted_path = os.path.join(current_dir, f"my_weight/{model_id}")
-        print(f"\nConverted model saved to: {converted_path}")
-        print("To quantize this model later, use the quantize_model.py script")
-
-    print("\n" + "=" * 50)
-    print("Process completed successfully!")
-    print("=" * 50)
-
-    return 0
+        print("Error! Unsupported model type!")
 
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    main()
\ No newline at end of file
diff --git a/generate.py b/generate.py
index cd096d1..59aefdb 100644
--- a/generate.py
+++ b/generate.py
@@ -1,14 +1,11 @@
 import torch
 from typing import Optional
-from lite_llama.utils.prompt_templates import get_prompter, get_image_token
-from lite_llama.generate_stream import GenerateStreamText  # import GenerateText
-from lite_llama.utils.image_process import vis_images
-
 import warnings
-
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
-from lite_llama.llava_generate_stream import LlavaGeneratorStream
+from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
+from lite_llama.utils.prompt_templates import get_prompter
+from lite_llama.generate_stream import GenerateStreamText  # Original import
+from lite_llama.quantization.gptq.gptq_executor import GPTQGenerateStreamText  # GPTQ import
 
 import sys, os, time
 from pathlib import Path
@@ -17,16 +14,20 @@
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 import psutil
-from lite_llama.utils.logger import log
-import argparse
-from argparse import RawTextHelpFormatter
 
 process = psutil.Process(os.getpid())
 
-def report_resource_usage(ram_before, vram_before) -> None:
+
+def is_gptq_model(checkpoint_path: str) -> bool:
+    """Check if the model is GPTQ quantized"""
+    quantize_config_path = Path(checkpoint_path) / "quantization_config.json"
+    return quantize_config_path.exists()
+
+
+def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
     end_time = time.time()
     ram_after = process.memory_info().rss
-    vram_after = get_gpu_memory(detect_device())
+    vram_after = get_gpu_memory(gpu_type)
 
     ram_used = (ram_after - ram_before) / (1024**3)  # Bytes to GB
 
@@ -36,58 +37,84 @@ def report_resource_usage(ram_before, vram_before) -> None:
     else:
         vram_text = "Unavailable"
 
-    log.info(f"CPU RAM Used: {ram_used:.2f} GB")
-    log.info(f"GPU VRAM Used: {vram_text}")
-
-
-def generate_llama(
-        prompt: str = "Hello, my name is",
-        *,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_seq_len: int = 2048,
-        max_gpu_num_blocks=40960,
-        max_gen_len: Optional[int] = 1024,
-        load_model: bool = True,
-        compiled_model: bool = False,
-        triton_weight: bool = True,
-        gpu_type: str = "nvidia",
-        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-        quantize: Optional[str] = None,
+    print(f"CPU RAM Used: {ram_used:.2f} GB")
+    print(f"GPU VRAM Used: {vram_text}")
+
+
+def main(
+    prompt: str = "Hello, my name is",
+    *,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    max_seq_len: int = 2048,
+    max_gpu_num_blocks=40960,
+    max_gen_len: Optional[int] = 1024,
+    load_model: bool = True,
+    compiled_model: bool = False,
+    triton_weight: bool = True,
+    gpu_type: str = "nvidia",
+    checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+    quantize: Optional[str] = None,
+    use_gptq: Optional[bool] = None,  # New parameter for explicit GPTQ control
 ):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
+
     if max_seq_len <= 1024:
         short_prompt = True
     else:
         short_prompt = False
+
+    # Get model type and prompter
     model_prompter = get_prompter(
         get_model_type(checkpoint_path), checkpoint_path, short_prompt
     )
+
     # Start resource tracking
     ram_before = process.memory_info().rss
-
+    gpu_type = detect_device()
     vram_before = get_gpu_memory(gpu_type)
-    start = time.perf_counter()
 
     # Init LLM generator
-    generator = GenerateStreamText(
-        checkpoints_dir=checkpoint_path,
-        tokenizer_path=checkpoint_path,
-        max_gpu_num_blocks=max_gpu_num_blocks,
-        max_seq_len=max_seq_len,
-        load_model=load_model,
-        compiled_model=compiled_model,
-        triton_weight=triton_weight,
-        device=device,
-    )
 
+    # Auto-detect GPTQ if not explicitly specified
+    if use_gptq is None:
+        use_gptq = is_gptq_model(checkpoint_path)
+        if use_gptq:
+            print(f"GPTQ quantized model detected in {checkpoint_path}")
+
+    # Choose appropriate generator class
+    if use_gptq:
+        print("Using GPTQ-enabled generator")
+        generator = GPTQGenerateStreamText(
+            checkpoints_dir=checkpoint_path,
+            tokenizer_path=checkpoint_path,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            load_model=load_model,
+            compiled_model=compiled_model,
+            triton_weight=triton_weight,
+            device=device,
+            use_gptq=True,  # Explicitly set GPTQ mode
+        )
+    else:
+        print("Using standard FP16 generator")
+        generator = GenerateStreamText(
+            checkpoints_dir=checkpoint_path,
+            tokenizer_path=checkpoint_path,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            load_model=load_model,
+            compiled_model=compiled_model,
+            triton_weight=triton_weight,
+            device=device,
+        )
 
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
+
     # Call the generation function and start the stream generation
-    start = time.perf_counter()
     stream = generator.text_completion_stream(
         prompts,
         temperature=temperature,
@@ -98,128 +125,89 @@ def generate_llama(
     completion = ""  # Initialize to generate the result
     # NOTE: After creating a generator, it can be iterated through a for loop
     text_msg = ""
+    start = time.perf_counter()
+
     for batch_completions in stream:
-        new_text = batch_completions[0]['generation'][len(completion):]
-        completion = batch_completions[0]['generation']
-        print(new_text, end='', flush=True)
-        text_msg +=new_text
+        new_text = batch_completions[0]["generation"][len(completion) :]
+        completion = batch_completions[0]["generation"]
+        print(new_text, end="", flush=True)
+        text_msg += new_text
     end = time.perf_counter()
 
     print("\n\n==================================\n")
-    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    print(
+        f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer) / (end - start):.2f} tokens/sec"
+    )
 
     # Report resource usage
-    report_resource_usage(ram_before, vram_before)
+    report_resource_usage(ram_before, vram_before, gpu_type)
 
 
-def generate_llava(
+if __name__ == "__main__":
+    from jsonargparse import CLI
+
+    torch.set_float32_matmul_precision("high")
+
+    # Create a wrapper function that adds the use_gptq parameter
+    def main_with_gptq_option(
         prompt: str = "Hello, my name is",
-        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-        figure_path: Path = Path("figures/lit-llama/"),
-        gpu_type: str = "nvidia",
+        *,
         temperature: float = 0.6,
         top_p: float = 0.9,
         max_seq_len: int = 2048,
-        max_gpu_num_blocks=None,
-        max_gen_len: Optional[int] = 512,
+        max_gpu_num_blocks=40960,
+        max_gen_len: Optional[int] = 1024,
         load_model: bool = True,
         compiled_model: bool = False,
-        triton_weight: bool = True
-):
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    if max_seq_len <= 1024:
-        short_prompt = True
-    else:
-        short_prompt = False
-
-    if not os.path.isfile(figure_path):
-        log.error(f"'{figure_path}' Not a valid file path！")
-    else:
-        image_input = str(figure_path).strip()
-    image_items = [image_input]  # Prepare the image_items list
-    image_num = len(image_items)  # Calculate the number of input images
-    vis_images(image_items)  # Displaying images in the terminal
-    assert checkpoint_path.is_dir(), checkpoint_path
-    checkpoint_path = str(checkpoint_path)
-    model_prompter = get_prompter("llama", checkpoint_path, short_prompt)
-
-    # Start resource tracking
-    ram_before = process.memory_info().rss
-
-    vram_before = get_gpu_memory(gpu_type)
-
-    # Initializing the Multimodal Model Text Generator
-    try:
-        generator = LlavaGeneratorStream(
-            checkpoints_dir=checkpoint_path,
-            tokenizer_path=checkpoint_path,
-            max_gpu_num_blocks=max_gpu_num_blocks,
+        triton_weight: bool = True,
+        gpu_type: str = "nvidia",
+        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+        quantize: Optional[str] = None,
+        force_gptq: bool = False,
+        force_fp16: bool = False,
+    ):
+        """
+        Generate text using lite_llama with automatic GPTQ detection
+
+        Args:
+            prompt: Input prompt text
+            temperature: Sampling temperature
+            top_p: Nucleus sampling probability
+            max_seq_len: Maximum sequence length
+            max_gpu_num_blocks: Maximum GPU memory blocks
+            max_gen_len: Maximum generation length
+            load_model: Whether to load model weights
+            compiled_model: Whether to use compiled model
+            triton_weight: Whether to use Triton kernels
+            gpu_type: GPU type (nvidia/amd/cpu)
+            checkpoint_path: Path to model checkpoint directory
+            quantize: Quantization method (deprecated, kept for compatibility)
+            force_gptq: Force GPTQ mode even if no quantization_config.json
+            force_fp16: Force FP16 mode even if quantization_config.json exists
+        """
+        # Determine use_gptq based on force flags
+        use_gptq = None
+        if force_gptq and force_fp16:
+            raise ValueError("Cannot force both GPTQ and FP16 modes simultaneously")
+        elif force_gptq:
+            use_gptq = True
+        elif force_fp16:
+            use_gptq = False
+
+        return main(
+            prompt=prompt,
+            temperature=temperature,
+            top_p=top_p,
             max_seq_len=max_seq_len,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_gen_len=max_gen_len,
             load_model=load_model,
             compiled_model=compiled_model,
             triton_weight=triton_weight,
-            device=device,
-        )
-    except Exception as e:
-        log.error(f"Model loading failure: {e}")
-        sys.exit(1)
-
-    image_token = get_image_token()
-    model_prompter.insert_prompt(image_token * image_num + prompt)
-    prompts = [model_prompter.model_input]
-    start = time.perf_counter()
-    try:
-        stream = generator.text_completion_stream(
-            prompts,
-            image_items,
-            temperature=temperature,
-            top_p=top_p,
-            max_gen_len=max_gen_len,
+            gpu_type=gpu_type,
+            checkpoint_path=checkpoint_path,
+            quantize=quantize,
+            use_gptq=use_gptq,
         )
-    except Exception as e:
-        log.error(f"Text Generation Failure: {e}")
-
-    completion = ''  # Initialization generates results
-    text_msg = ""
-
-    for batch_completions in stream:
-        next_text = batch_completions[0]['generation'][len(completion):]
-        completion = batch_completions[0]['generation']
-        print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
-        text_msg += next_text
-    end = time.perf_counter()
 
-    print("\n\n==================================\n")
-    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
-    # Report resource usage
-    report_resource_usage(ram_before, vram_before)
-
-
-if __name__ == "__main__":
-
-    torch.set_float32_matmul_precision("high")
-
-
-    PARSER = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
-    PARSER.add_argument('-m', "--checkpoint_path", type=str,
-                        default='checkpoints/lit-llama/7B/',
-                        help='Path of the Model')
-    PARSER.add_argument('-q', "--quant_method", type=str,
-                        default='',
-                        help="Quantization method")
-
-    PARSER.add_argument('-p', "--prompt", type=str,
-                        default='Hello, my name is',
-                        help="String of prompt")
-    PARSER.add_argument('-f', "--figure_path", type=str,
-                        default=None,
-                        help="Path of the Figure")
-
-
-    gpu_type = detect_device()
-    args = PARSER.parse_args()
-    model_path = os.path.abspath(args.model_path)
-    if args.figure_path:
-        generate_llava(prompt=args.prompt, checkpoint_path=Path(model_path), figure_path=Path(args.figure_path), gpu_type=gpu_type)
-    else:
-        generate_llama(prompt=args.prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type)
+    CLI(main_with_gptq_option)
\ No newline at end of file
diff --git a/lite_llama/executor/weight_convert_gptq.py b/lite_llama/executor/weight_convert_gptq.py
new file mode 100644
index 0000000..3dacf96
--- /dev/null
+++ b/lite_llama/executor/weight_convert_gptq.py
@@ -0,0 +1,553 @@
+from tqdm.auto import tqdm
+import torch
+import os
+import shutil
+import glob
+import os.path as osp
+from typing import Dict, Optional
+import gc
+from datasets import load_dataset
+from transformers import AutoTokenizer
+
+try:
+    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+    from auto_gptq.modeling import BaseGPTQForCausalLM
+except ImportError:
+    raise ImportError(
+        "Please install auto-gptq: pip install auto-gptq"
+    )
+
+
+def get_calibration_data(model_id: str, dataset_name: str, tokenizer, nsamples: int = 128, seqlen: int = 2048):
+    """
+    Prepare calibration dataset for GPTQ quantization.
+    """
+    if dataset_name == "c4":
+        dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
+        text_column = "text"
+    elif dataset_name == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
+        text_column = "text"
+    else:
+        raise ValueError(f"Unsupported dataset: {dataset_name}")
+
+    calibration_data = []
+
+    for data in tqdm(dataset, desc="Loading calibration data"):
+        text = data[text_column]
+        if len(text.strip()) > 10:  # Skip very short texts
+            inputs = tokenizer(
+                text,
+                truncation=True,
+                max_length=seqlen,
+                return_tensors="pt"
+            )
+            if inputs["input_ids"].shape[1] >= seqlen // 2:  # Ensure reasonable length
+                calibration_data.append({
+                    "input_ids": inputs["input_ids"][0],
+                    "attention_mask": inputs["attention_mask"][0]
+                })
+
+                if len(calibration_data) >= nsamples:
+                    break
+
+    return calibration_data
+
+
+def build_new_weight_dir_gptq(checkpoints_dir: str, new_sd: Dict[str, torch.Tensor], bits: int):
+    """
+    Save GPTQ quantized model weights and build new weight directory.
+    """
+    model_id = osp.basename(osp.normpath(checkpoints_dir))
+    current_dir = osp.dirname(osp.abspath(__file__))
+    my_weight_dir = osp.join(
+        current_dir, f"../../my_weight/{model_id}-{bits}bit-GPTQ"
+    )
+    os.makedirs(my_weight_dir, exist_ok=True)
+
+    # Save quantized model state dict
+    torch.save(
+        new_sd,
+        osp.join(my_weight_dir, f"{model_id}-{bits}bit-GPTQ.pth"),
+        _use_new_zipfile_serialization=True,
+    )
+
+    # Copy JSON files
+    json_files = glob.glob(osp.join(checkpoints_dir, "*.json"))
+    for file_path in json_files:
+        shutil.copy(file_path, my_weight_dir)
+        print(f"已复制: {file_path} -> {my_weight_dir}")
+
+    # Copy tokenizer files
+    if osp.exists(osp.join(checkpoints_dir, "tokenizer.model")):
+        shutil.copy(osp.join(checkpoints_dir, "tokenizer.model"), my_weight_dir)
+
+    # Save quantization config
+    quant_config = {
+        "bits": bits,
+        "quantization_method": "gptq",
+        "model_id": model_id
+    }
+
+    import json
+    with open(osp.join(my_weight_dir, "quantization_config.json"), "w") as f:
+        json.dump(quant_config, f, indent=2)
+
+
+def quantize_and_convert_weights(
+        model,
+        checkpoints_dir: str,
+        bits: int = 4,
+        group_size: int = 128,
+        act_order: bool = False,
+        calibration_dataset: str = "c4",
+        nsamples: int = 128,
+) -> Dict[str, torch.Tensor]:
+    """
+    Quantize model with GPTQ and return quantized state dict.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(checkpoints_dir)
+
+    # Prepare quantization config
+    quantize_config = BaseQuantizeConfig(
+        bits=bits,
+        group_size=group_size,
+        damp_percent=0.01,
+        desc_act=act_order,
+        static_groups=False,
+        sym=True,
+        true_sequential=True,
+        model_name_or_path=checkpoints_dir,
+        model_file_base_name="model"
+    )
+
+    # Get calibration data
+    calibration_data = get_calibration_data(
+        checkpoints_dir,
+        calibration_dataset,
+        tokenizer,
+        nsamples=nsamples
+    )
+
+    # Clear GPU cache before quantization
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    # Quantize the model
+    print(f"Starting GPTQ quantization with {bits} bits...")
+    model.quantize(calibration_data, quantize_config)
+
+    # Get quantized state dict
+    quantized_sd = model.state_dict()
+
+    # Clear memory
+    del model
+    torch.cuda.empty_cache()
+    gc.collect()
+
+    return quantized_sd
+
+
+def convert_qwen2_hf_to_litellama_gptq(
+        checkpoints_dir: str,
+        model,
+        num_layers: int,
+        bits: int = 4,
+        group_size: int = 128,
+        act_order: bool = False,
+        calibration_dataset: str = "c4",
+        nsamples: int = 128,
+) -> Dict[str, torch.Tensor]:
+    """
+    Convert Qwen2 HF model to LiteLLaMA format with GPTQ quantization.
+    """
+    # First quantize the model
+    quantized_sd = quantize_and_convert_weights(
+        model,
+        checkpoints_dir,
+        bits=bits,
+        group_size=group_size,
+        act_order=act_order,
+        calibration_dataset=calibration_dataset,
+        nsamples=nsamples,
+    )
+
+    # Mapping for base layers
+    mapping = {
+        "model.norm.weight": "norm_weight",
+        "model.embed_tokens.weight": "embed_tokens.weight",
+        "lm_head.weight": "lm_head_weight",
+    }
+
+    # Mapping for transformer layers
+    layers = {
+        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj_weight",
+        "model.layers.{i}.self_attn.q_proj.bias": "layers.{i}.self_attn.q_proj_bias",
+        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj_weight",
+        "model.layers.{i}.self_attn.k_proj.bias": "layers.{i}.self_attn.k_proj_bias",
+        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj_weight",
+        "model.layers.{i}.self_attn.v_proj.bias": "layers.{i}.self_attn.v_proj_bias",
+        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj_weight",
+        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
+        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
+        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
+        "model.layers.{i}.input_layernorm.weight": "layers.{i}.input_layernorm_weight",
+        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.post_attention_layernorm_weight",
+    }
+
+    # Add GPTQ-specific mappings
+    gptq_layers = {
+        "model.layers.{i}.self_attn.q_proj.qweight": "layers.{i}.self_attn.q_proj_qweight",
+        "model.layers.{i}.self_attn.q_proj.qzeros": "layers.{i}.self_attn.q_proj_qzeros",
+        "model.layers.{i}.self_attn.q_proj.scales": "layers.{i}.self_attn.q_proj_scales",
+        "model.layers.{i}.self_attn.k_proj.qweight": "layers.{i}.self_attn.k_proj_qweight",
+        "model.layers.{i}.self_attn.k_proj.qzeros": "layers.{i}.self_attn.k_proj_qzeros",
+        "model.layers.{i}.self_attn.k_proj.scales": "layers.{i}.self_attn.k_proj_scales",
+        "model.layers.{i}.self_attn.v_proj.qweight": "layers.{i}.self_attn.v_proj_qweight",
+        "model.layers.{i}.self_attn.v_proj.qzeros": "layers.{i}.self_attn.v_proj_qzeros",
+        "model.layers.{i}.self_attn.v_proj.scales": "layers.{i}.self_attn.v_proj_scales",
+        "model.layers.{i}.self_attn.o_proj.qweight": "layers.{i}.self_attn.o_proj_qweight",
+        "model.layers.{i}.self_attn.o_proj.qzeros": "layers.{i}.self_attn.o_proj_qzeros",
+        "model.layers.{i}.self_attn.o_proj.scales": "layers.{i}.self_attn.o_proj_scales",
+        "model.layers.{i}.mlp.gate_proj.qweight": "layers.{i}.mlp.gate_proj_qweight",
+        "model.layers.{i}.mlp.gate_proj.qzeros": "layers.{i}.mlp.gate_proj_qzeros",
+        "model.layers.{i}.mlp.gate_proj.scales": "layers.{i}.mlp.gate_proj_scales",
+        "model.layers.{i}.mlp.up_proj.qweight": "layers.{i}.mlp.up_proj_qweight",
+        "model.layers.{i}.mlp.up_proj.qzeros": "layers.{i}.mlp.up_proj_qzeros",
+        "model.layers.{i}.mlp.up_proj.scales": "layers.{i}.mlp.up_proj_scales",
+        "model.layers.{i}.mlp.down_proj.qweight": "layers.{i}.mlp.down_proj_qweight",
+        "model.layers.{i}.mlp.down_proj.qzeros": "layers.{i}.mlp.down_proj_qzeros",
+        "model.layers.{i}.mlp.down_proj.scales": "layers.{i}.mlp.down_proj_scales",
+    }
+
+    # Generate mappings for all layers
+    for i in range(num_layers):
+        for hf_key, custom_key in layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+        for hf_key, custom_key in gptq_layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+
+    # Create new state dict with converted keys
+    new_sd = {}
+    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
+        custom_key = mapping.get(hf_key, None)
+        if custom_key is not None:
+            new_sd[custom_key] = tensor
+        else:
+            print(f"Warning: Unmapped key {hf_key}")
+
+    # Merge k_proj and v_proj for GPTQ
+    for i in range(num_layers):
+        # For regular weights (if they exist)
+        k_key = f"layers.{i}.self_attn.k_proj_weight"
+        v_key = f"layers.{i}.self_attn.v_proj_weight"
+        k_bias_key = f"layers.{i}.self_attn.k_proj_bias"
+        v_bias_key = f"layers.{i}.self_attn.v_proj_bias"
+
+        if k_key in new_sd and v_key in new_sd:
+            # Merge weights
+            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
+            new_sd[f"layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
+            del new_sd[k_key]
+            del new_sd[v_key]
+
+            # Merge biases if they exist
+            if k_bias_key in new_sd and v_bias_key in new_sd:
+                kv_bias_tensor = torch.cat([new_sd[k_bias_key], new_sd[v_bias_key]], dim=0)
+                new_sd[f"layers.{i}.self_attn.kv_proj_bias"] = kv_bias_tensor
+                del new_sd[k_bias_key]
+                del new_sd[v_bias_key]
+
+        # For GPTQ quantized weights
+        k_qweight = f"layers.{i}.self_attn.k_proj_qweight"
+        v_qweight = f"layers.{i}.self_attn.v_proj_qweight"
+        k_qzeros = f"layers.{i}.self_attn.k_proj_qzeros"
+        v_qzeros = f"layers.{i}.self_attn.v_proj_qzeros"
+        k_scales = f"layers.{i}.self_attn.k_proj_scales"
+        v_scales = f"layers.{i}.self_attn.v_proj_scales"
+
+        if k_qweight in new_sd and v_qweight in new_sd:
+            # Merge quantized weights
+            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
+            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
+            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
+
+            new_sd[f"layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
+            new_sd[f"layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
+            new_sd[f"layers.{i}.self_attn.kv_proj_scales"] = kv_scales
+
+            # Remove original k and v projections
+            del new_sd[k_qweight]
+            del new_sd[v_qweight]
+            del new_sd[k_qzeros]
+            del new_sd[v_qzeros]
+            del new_sd[k_scales]
+            del new_sd[v_scales]
+
+    # Save the quantized weights
+    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
+
+    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
+    return new_sd
+
+
+def convert_llama_hf_to_litellama_gptq(
+        checkpoints_dir: str,
+        model,
+        num_layers: int,
+        bits: int = 4,
+        group_size: int = 128,
+        act_order: bool = False,
+        calibration_dataset: str = "c4",
+        nsamples: int = 128,
+) -> Dict[str, torch.Tensor]:
+    """
+    Convert Llama HF model to LiteLLaMA format with GPTQ quantization.
+    """
+    # First quantize the model
+    quantized_sd = quantize_and_convert_weights(
+        model,
+        checkpoints_dir,
+        bits=bits,
+        group_size=group_size,
+        act_order=act_order,
+        calibration_dataset=calibration_dataset,
+        nsamples=nsamples,
+    )
+
+    # Mapping for base layers
+    mapping = {
+        "model.embed_tokens.weight": "embed_tokens.weight",
+        "model.norm.weight": "norm_weight",
+        "lm_head.weight": "lm_head.weight",
+    }
+
+    # Mapping for transformer layers
+    layers = {
+        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj.weight",
+        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj.weight",
+        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj.weight",
+        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj.weight",
+        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
+        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
+        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
+        "model.layers.{i}.input_layernorm.weight": "layers.{i}.attention_norm_weight",
+        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.ffn_norm_weight",
+    }
+
+    # Add GPTQ-specific mappings
+    gptq_layers = {
+        "model.layers.{i}.self_attn.q_proj.qweight": "layers.{i}.self_attn.q_proj_qweight",
+        "model.layers.{i}.self_attn.q_proj.qzeros": "layers.{i}.self_attn.q_proj_qzeros",
+        "model.layers.{i}.self_attn.q_proj.scales": "layers.{i}.self_attn.q_proj_scales",
+        "model.layers.{i}.self_attn.k_proj.qweight": "layers.{i}.self_attn.k_proj_qweight",
+        "model.layers.{i}.self_attn.k_proj.qzeros": "layers.{i}.self_attn.k_proj_qzeros",
+        "model.layers.{i}.self_attn.k_proj.scales": "layers.{i}.self_attn.k_proj_scales",
+        "model.layers.{i}.self_attn.v_proj.qweight": "layers.{i}.self_attn.v_proj_qweight",
+        "model.layers.{i}.self_attn.v_proj.qzeros": "layers.{i}.self_attn.v_proj_qzeros",
+        "model.layers.{i}.self_attn.v_proj.scales": "layers.{i}.self_attn.v_proj_scales",
+        "model.layers.{i}.self_attn.o_proj.qweight": "layers.{i}.self_attn.o_proj_qweight",
+        "model.layers.{i}.self_attn.o_proj.qzeros": "layers.{i}.self_attn.o_proj_qzeros",
+        "model.layers.{i}.self_attn.o_proj.scales": "layers.{i}.self_attn.o_proj_scales",
+        "model.layers.{i}.mlp.gate_proj.qweight": "layers.{i}.mlp.gate_proj_qweight",
+        "model.layers.{i}.mlp.gate_proj.qzeros": "layers.{i}.mlp.gate_proj_qzeros",
+        "model.layers.{i}.mlp.gate_proj.scales": "layers.{i}.mlp.gate_proj_scales",
+        "model.layers.{i}.mlp.up_proj.qweight": "layers.{i}.mlp.up_proj_qweight",
+        "model.layers.{i}.mlp.up_proj.qzeros": "layers.{i}.mlp.up_proj_qzeros",
+        "model.layers.{i}.mlp.up_proj.scales": "layers.{i}.mlp.up_proj_scales",
+        "model.layers.{i}.mlp.down_proj.qweight": "layers.{i}.mlp.down_proj_qweight",
+        "model.layers.{i}.mlp.down_proj.qzeros": "layers.{i}.mlp.down_proj_qzeros",
+        "model.layers.{i}.mlp.down_proj.scales": "layers.{i}.mlp.down_proj_scales",
+    }
+
+    # Generate mappings for all layers
+    for i in range(num_layers):
+        for hf_key, custom_key in layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+        for hf_key, custom_key in gptq_layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+
+    # Create new state dict with converted keys
+    new_sd = {}
+    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
+        custom_key = mapping.get(hf_key, None)
+        if custom_key is not None:
+            new_sd[custom_key] = tensor
+        else:
+            print(f"Warning: Unmapped key {hf_key}")
+
+    # Merge k_proj and v_proj
+    for i in range(num_layers):
+        # Handle regular weights if they exist
+        k_key = f"layers.{i}.self_attn.k_proj.weight"
+        v_key = f"layers.{i}.self_attn.v_proj.weight"
+        if k_key in new_sd and v_key in new_sd:
+            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
+            new_sd[f"layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
+            del new_sd[k_key]
+            del new_sd[v_key]
+
+        # Handle GPTQ quantized weights
+        k_qweight = f"layers.{i}.self_attn.k_proj_qweight"
+        v_qweight = f"layers.{i}.self_attn.v_proj_qweight"
+        k_qzeros = f"layers.{i}.self_attn.k_proj_qzeros"
+        v_qzeros = f"layers.{i}.self_attn.v_proj_qzeros"
+        k_scales = f"layers.{i}.self_attn.k_proj_scales"
+        v_scales = f"layers.{i}.self_attn.v_proj_scales"
+
+        if k_qweight in new_sd and v_qweight in new_sd:
+            # Merge quantized weights
+            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
+            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
+            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
+
+            new_sd[f"layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
+            new_sd[f"layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
+            new_sd[f"layers.{i}.self_attn.kv_proj_scales"] = kv_scales
+
+            # Remove original k and v projections
+            del new_sd[k_qweight]
+            del new_sd[v_qweight]
+            del new_sd[k_qzeros]
+            del new_sd[v_qzeros]
+            del new_sd[k_scales]
+            del new_sd[v_scales]
+
+    # Save the quantized weights
+    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
+
+    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
+    return new_sd
+
+
+def convert_llavallama_hf_to_litellama_gptq(
+        checkpoints_dir: str,
+        model,
+        num_layers: int,
+        bits: int = 4,
+        group_size: int = 128,
+        act_order: bool = False,
+        calibration_dataset: str = "c4",
+        nsamples: int = 128,
+) -> Dict[str, torch.Tensor]:
+    """
+    Convert LLaVA-Llama HF model to LiteLLaMA format with GPTQ quantization.
+    """
+    # First quantize the model
+    quantized_sd = quantize_and_convert_weights(
+        model,
+        checkpoints_dir,
+        bits=bits,
+        group_size=group_size,
+        act_order=act_order,
+        calibration_dataset=calibration_dataset,
+        nsamples=nsamples,
+    )
+
+    # Mapping for base layers
+    mapping = {
+        "language_model.model.embed_tokens.weight": "language_model.embed_tokens.weight",
+        "language_model.model.norm.weight": "language_model.norm_weight",
+        "language_model.lm_head.weight": "language_model.lm_head.weight",
+    }
+
+    # Mapping for transformer layers
+    layers = {
+        "language_model.model.layers.{i}.self_attn.q_proj.weight": "language_model.layers.{i}.self_attn.q_proj.weight",
+        "language_model.model.layers.{i}.self_attn.k_proj.weight": "language_model.layers.{i}.self_attn.k_proj.weight",
+        "language_model.model.layers.{i}.self_attn.v_proj.weight": "language_model.layers.{i}.self_attn.v_proj.weight",
+        "language_model.model.layers.{i}.self_attn.o_proj.weight": "language_model.layers.{i}.self_attn.o_proj.weight",
+        "language_model.model.layers.{i}.mlp.gate_proj.weight": "language_model.layers.{i}.mlp.gate_proj.weight",
+        "language_model.model.layers.{i}.mlp.up_proj.weight": "language_model.layers.{i}.mlp.up_proj.weight",
+        "language_model.model.layers.{i}.mlp.down_proj.weight": "language_model.layers.{i}.mlp.down_proj.weight",
+        "language_model.model.layers.{i}.input_layernorm.weight": "language_model.layers.{i}.attention_norm_weight",
+        "language_model.model.layers.{i}.post_attention_layernorm.weight": "language_model.layers.{i}.ffn_norm_weight",
+    }
+
+    # Add GPTQ-specific mappings
+    gptq_layers = {
+        "language_model.model.layers.{i}.self_attn.q_proj.qweight": "language_model.layers.{i}.self_attn.q_proj_qweight",
+        "language_model.model.layers.{i}.self_attn.q_proj.qzeros": "language_model.layers.{i}.self_attn.q_proj_qzeros",
+        "language_model.model.layers.{i}.self_attn.q_proj.scales": "language_model.layers.{i}.self_attn.q_proj_scales",
+        "language_model.model.layers.{i}.self_attn.k_proj.qweight": "language_model.layers.{i}.self_attn.k_proj_qweight",
+        "language_model.model.layers.{i}.self_attn.k_proj.qzeros": "language_model.layers.{i}.self_attn.k_proj_qzeros",
+        "language_model.model.layers.{i}.self_attn.k_proj.scales": "language_model.layers.{i}.self_attn.k_proj_scales",
+        "language_model.model.layers.{i}.self_attn.v_proj.qweight": "language_model.layers.{i}.self_attn.v_proj_qweight",
+        "language_model.model.layers.{i}.self_attn.v_proj.qzeros": "language_model.layers.{i}.self_attn.v_proj_qzeros",
+        "language_model.model.layers.{i}.self_attn.v_proj.scales": "language_model.layers.{i}.self_attn.v_proj_scales",
+        "language_model.model.layers.{i}.self_attn.o_proj.qweight": "language_model.layers.{i}.self_attn.o_proj_qweight",
+        "language_model.model.layers.{i}.self_attn.o_proj.qzeros": "language_model.layers.{i}.self_attn.o_proj_qzeros",
+        "language_model.model.layers.{i}.self_attn.o_proj.scales": "language_model.layers.{i}.self_attn.o_proj_scales",
+        "language_model.model.layers.{i}.mlp.gate_proj.qweight": "language_model.layers.{i}.mlp.gate_proj_qweight",
+        "language_model.model.layers.{i}.mlp.gate_proj.qzeros": "language_model.layers.{i}.mlp.gate_proj_qzeros",
+        "language_model.model.layers.{i}.mlp.gate_proj.scales": "language_model.layers.{i}.mlp.gate_proj_scales",
+        "language_model.model.layers.{i}.mlp.up_proj.qweight": "language_model.layers.{i}.mlp.up_proj_qweight",
+        "language_model.model.layers.{i}.mlp.up_proj.qzeros": "language_model.layers.{i}.mlp.up_proj_qzeros",
+        "language_model.model.layers.{i}.mlp.up_proj.scales": "language_model.layers.{i}.mlp.up_proj_scales",
+        "language_model.model.layers.{i}.mlp.down_proj.qweight": "language_model.layers.{i}.mlp.down_proj_qweight",
+        "language_model.model.layers.{i}.mlp.down_proj.qzeros": "language_model.layers.{i}.mlp.down_proj_qzeros",
+        "language_model.model.layers.{i}.mlp.down_proj.scales": "language_model.layers.{i}.mlp.down_proj_scales",
+    }
+
+    # Generate mappings for all layers
+    for i in range(num_layers):
+        for hf_key, custom_key in layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+        for hf_key, custom_key in gptq_layers.items():
+            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
+
+    # Create new state dict with converted keys
+    new_sd = {}
+    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
+        custom_key = mapping.get(hf_key, None)
+        if custom_key is not None:
+            new_sd[custom_key] = tensor
+        else:
+            # Keep vision model and other components as-is
+            new_sd[hf_key] = tensor
+            print(f"Warning: Unmapped key {hf_key}")
+
+    # Merge k_proj and v_proj for language model
+    for i in tqdm(range(num_layers), desc="Mapping kv fused weights"):
+        # Handle regular weights if they exist
+        k_key = f"language_model.layers.{i}.self_attn.k_proj.weight"
+        v_key = f"language_model.layers.{i}.self_attn.v_proj.weight"
+        if k_key in new_sd and v_key in new_sd:
+            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
+            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
+            del new_sd[k_key]
+            del new_sd[v_key]
+
+        # Handle GPTQ quantized weights
+        k_qweight = f"language_model.layers.{i}.self_attn.k_proj_qweight"
+        v_qweight = f"language_model.layers.{i}.self_attn.v_proj_qweight"
+        k_qzeros = f"language_model.layers.{i}.self_attn.k_proj_qzeros"
+        v_qzeros = f"language_model.layers.{i}.self_attn.v_proj_qzeros"
+        k_scales = f"language_model.layers.{i}.self_attn.k_proj_scales"
+        v_scales = f"language_model.layers.{i}.self_attn.v_proj_scales"
+
+        if k_qweight in new_sd and v_qweight in new_sd:
+            # Merge quantized weights
+            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
+            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
+            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
+
+            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
+            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
+            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_scales"] = kv_scales
+
+            print(f"Merged GPTQ k/v projections for layer {i}")
+
+            # Remove original k and v projections
+            del new_sd[k_qweight]
+            del new_sd[v_qweight]
+            del new_sd[k_qzeros]
+            del new_sd[v_qzeros]
+            del new_sd[k_scales]
+            del new_sd[v_scales]
+
+    # Save the quantized weights
+    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
+
+    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
+    return new_sd
\ No newline at end of file
diff --git a/lite_llama/quantization/debug_quantization.py b/lite_llama/quantization/debug_quantization.py
deleted file mode 100644
index b6af59d..0000000
--- a/lite_llama/quantization/debug_quantization.py
+++ /dev/null
@@ -1,247 +0,0 @@
-import torch
-from transformers import (
-    LlavaForConditionalGeneration,
-    AutoConfig,
-    AutoModelForCausalLM,
-    LlavaConfig,
-)
-import argparse
-import os
-import sys
-
-# Add the gptq_quantize module to the path
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from lite_llama.executor.weight_convert import (
-    convert_llavallama_hf_to_litellama,
-    convert_llama_hf_to_litellama,
-    convert_qwen2_hf_to_litellama,
-)
-
-# Import the GPTQ quantization function
-from lite_llama.quantization.gptq import quantize_after_conversion
-
-import warnings
-
-warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-
-
-def parse_arguments():
-    """Parse command line arguments"""
-    parser = argparse.ArgumentParser(
-        description="Convert HuggingFace models to Lite-LLaMA format with optional GPTQ quantization"
-    )
-
-    parser.add_argument(
-        "--checkpoint_dir",
-        type=str,
-        required=True,
-        help="Path to the HuggingFace model checkpoint directory"
-    )
-
-    parser.add_argument(
-        "--quantize",
-        action="store_true",
-        help="Enable GPTQ quantization after conversion"
-    )
-
-    parser.add_argument(
-        "--wbits",
-        type=int,
-        default=4,
-        choices=[2, 3, 4, 8],
-        help="Number of bits for quantization (default: 4)"
-    )
-
-    parser.add_argument(
-        "--groupsize",
-        type=int,
-        default=128,
-        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
-    )
-
-    parser.add_argument(
-        "--calibration_data",
-        type=str,
-        default=None,
-        help="Path to calibration dataset file for GPTQ (optional)"
-    )
-
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help="Device to use for conversion (default: cuda)"
-    )
-
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="float16",
-        choices=["float16", "float32", "bfloat16"],
-        help="Data type for model weights (default: float16)"
-    )
-
-    return parser.parse_args()
-
-
-def get_torch_dtype(dtype_str):
-    """Convert string dtype to torch dtype"""
-    dtype_map = {
-        "float16": torch.float16,
-        "float32": torch.float32,
-        "bfloat16": torch.bfloat16
-    }
-    return dtype_map.get(dtype_str, torch.float16)
-
-
-def main():
-    # Parse arguments
-    args = parse_arguments()
-
-    checkpoints_dir = args.checkpoint_dir
-    device = args.device
-    torch_dtype = get_torch_dtype(args.dtype)
-
-    print(f"Converting model from: {checkpoints_dir}")
-    print(f"Device: {device}")
-    print(f"Data type: {args.dtype}")
-    print(f"Quantization: {'Enabled' if args.quantize else 'Disabled'}")
-
-    if args.quantize:
-        print(f"  - Bits: {args.wbits}")
-        print(f"  - Group size: {args.groupsize}")
-        print(f"  - Calibration data: {args.calibration_data or 'Default'}")
-
-    print("\n" + "=" * 50 + "\n")
-
-    # Step 1: Load the model
-    print("Loading model...")
-
-    try:
-        if "llava" in checkpoints_dir.lower():
-            model = LlavaForConditionalGeneration.from_pretrained(
-                checkpoints_dir,
-                torch_dtype=torch_dtype,
-                low_cpu_mem_usage=True,
-            )
-            model_type = "llava"
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                checkpoints_dir,
-                torch_dtype=torch_dtype,
-                low_cpu_mem_usage=True,
-            )
-            # Determine model type
-            if "qwen2" in checkpoints_dir.lower():
-                model_type = "qwen2"
-            elif "llama" in checkpoints_dir.lower():
-                model_type = "llama"
-            else:
-                print("Warning: Could not determine model type from path.")
-                print("Assuming Llama architecture...")
-                model_type = "llama"
-
-        if device == "cuda" and torch.cuda.is_available():
-            model = model.to(device)
-
-        hf_sd = model.state_dict()
-
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return 1
-
-    # Step 2: Convert to lite_llama format
-    print(f"\nConverting {model_type} model to lite_llama format...")
-
-    try:
-        if model_type == "qwen2":
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
-            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-        elif model_type == "llama":
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
-            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-        elif model_type == "llava":
-            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-            num_layers = llava_config.text_config.num_hidden_layers
-            print(f"Number of layers: {num_layers}")
-            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-        print("Conversion completed successfully!")
-
-    except Exception as e:
-        print(f"Error during conversion: {e}")
-        return 1
-
-    # Free memory
-    del model, hf_sd
-    if device == "cuda":
-        torch.cuda.empty_cache()
-
-    # Step 3: Optional quantization
-    if args.quantize:
-        print("\n" + "=" * 50)
-        print(f"Starting GPTQ quantization ({args.wbits}-bit)...")
-
-        # Auto-detect groupsize if needed
-        if args.groupsize == 0:
-            print("Auto-detecting optimal groupsize...")
-            # Quick check of vocabulary size
-            vocab_sizes = []
-            for name, param in hf_sd.items():
-                if ("embed" in name or "lm_head" in name) and len(param.shape) >= 2:
-                    vocab_sizes.extend(param.shape)
-
-            if vocab_sizes:
-                vocab_size = max(vocab_sizes)
-                # Find best groupsize
-                for gs in [128, 256, 512, 1024]:
-                    if vocab_size % gs == 0:
-                        args.groupsize = gs
-                        print(f"Selected groupsize: {gs} (perfect fit for vocab size {vocab_size})")
-                        break
-                else:
-                    args.groupsize = 256 if vocab_size > 100000 else 128
-                    print(f"Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
-
-        print(f"Groupsize: {args.groupsize}")
-        print("=" * 50 + "\n")
-
-        try:
-            quantized_path = quantize_after_conversion(
-                checkpoints_dir=checkpoints_dir,
-                model_type=model_type,
-                calibration_data_path=args.calibration_data,
-                wbits=args.wbits,
-                groupsize=args.groupsize
-            )
-            print(f"\nQuantization completed successfully!")
-            print(f"Quantized model saved to: {quantized_path}")
-
-        except Exception as e:
-            print(f"Error during quantization: {e}")
-            print("The converted model was saved successfully, but quantization failed.")
-            return 1
-    else:
-        model_id = os.path.basename(os.path.normpath(checkpoints_dir))
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        converted_path = os.path.join(current_dir, f"my_weight/{model_id}")
-        print(f"\nConverted model saved to: {converted_path}")
-        print("To quantize this model later, use the quantize_model.py script")
-
-    print("\n" + "=" * 50)
-    print("Process completed successfully!")
-    print("=" * 50)
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
deleted file mode 100644
index edecd1e..0000000
--- a/lite_llama/quantization/gptq.py
+++ /dev/null
@@ -1,732 +0,0 @@
-"""
-GPTQ Quantization for Lite-LLaMA Models
-
-Key improvements in this version:
-1. Proper weight packing for 4-bit quantization (2 weights per byte)
-2. Stores quantized weights as uint8/int16 instead of float32
-3. Separate storage of scale/zero parameters
-4. Accurate compression ratio calculation
-5. Support for LLaVA models with vision weight skipping
-"""
-
-import torch
-import torch.nn as nn
-from tqdm import tqdm
-import numpy as np
-import os
-import json
-from typing import Dict, List, Optional, Tuple
-from transformers import AutoTokenizer
-import gc
-
-
-def pack_4bit_weights(qweight, n_rows, n_cols):
-    """Pack 4-bit weights into uint8 format (2 weights per byte)"""
-    # Ensure even number of columns for packing
-    if n_cols % 2 != 0:
-        # Pad with zeros if odd number of columns
-        qweight = torch.nn.functional.pad(qweight, (0, 1), value=0)
-        n_cols += 1
-
-    # Pack two 4-bit values into one 8-bit value
-    packed = torch.zeros((n_rows, n_cols // 2), dtype=torch.uint8, device=qweight.device)
-    for i in range(0, n_cols, 2):
-        # First 4-bit value in lower nibble, second in upper nibble
-        packed[:, i // 2] = (qweight[:, i] & 0xF) | ((qweight[:, i + 1] & 0xF) << 4)
-
-    return packed
-
-
-def unpack_4bit_weights(packed, n_rows, original_n_cols):
-    """Unpack 4-bit weights from uint8 format"""
-    n_packed_cols = packed.shape[1]
-    unpacked = torch.zeros((n_rows, n_packed_cols * 2), dtype=torch.uint8, device=packed.device)
-
-    for i in range(n_packed_cols):
-        # Extract lower and upper nibbles
-        unpacked[:, i * 2] = packed[:, i] & 0xF
-        unpacked[:, i * 2 + 1] = (packed[:, i] >> 4) & 0xF
-
-    # Remove padding if it was added
-    return unpacked[:, :original_n_cols]
-
-
-class GPTQ:
-    """
-    GPTQ Quantizer for custom lite_llama models
-    """
-
-    def __init__(
-            self,
-            layer,
-            wbits: int = 4,
-            groupsize: int = 128,
-            actorder: bool = False,
-            percdamp: float = 0.01,
-            device: str = "cuda"
-    ):
-        self.layer = layer
-        self.device = device
-        self.wbits = wbits
-        self.actorder = actorder
-        self.percdamp = percdamp
-
-        # Handle groupsize
-        W = layer.weight.data
-        if groupsize == -1:
-            self.groupsize = W.shape[0]
-        else:
-            self.groupsize = groupsize
-
-        # Check if groupsize is compatible
-        if W.shape[0] % self.groupsize != 0:
-            print(f"Warning: Weight dimension {W.shape[0]} not divisible by groupsize {self.groupsize}")
-            print(f"Last group will have {W.shape[0] % self.groupsize} elements")
-
-        # Calculate quantization parameters
-        self.maxq = 2 ** self.wbits - 1
-        self.nsamples = 0
-
-        # Initialize Hessian and other matrices
-        self.rows = W.shape[0]
-        self.columns = W.shape[1]
-        self.H = None  # Will be initialized when first batch is added
-        self.quantized = False
-
-    def add_batch(self, inp):
-        """Add calibration batch to compute Hessian"""
-        if len(inp.shape) == 2:
-            inp = inp.unsqueeze(0)
-        tmp = inp.shape[0]
-        if len(inp.shape) == 3:
-            inp = inp.reshape((-1, inp.shape[-1]))
-        inp = inp.t()
-
-        # Update sample count
-        if self.nsamples == 0:
-            self.H = torch.zeros((self.columns, self.columns), device=self.device)
-
-        self.H *= self.nsamples / (self.nsamples + tmp)
-        self.nsamples += tmp
-
-        # Ensure numerical stability
-        inp = inp.float()
-
-        # Add small noise for numerical stability
-        inp = inp + torch.randn_like(inp) * 1e-4
-
-        # Update Hessian
-        self.H += 2 / self.nsamples * inp.matmul(inp.t())
-
-    def quantize(self):
-        """Perform GPTQ quantization"""
-        W = self.layer.weight.data.clone()
-        W = W.float()
-
-        # Check if we have calibration data
-        if self.H is None or self.nsamples == 0:
-            print("Warning: No calibration data added, initializing with identity matrix")
-            self.H = torch.eye(self.columns, device=self.device) * 0.01
-            self.nsamples = 1
-
-        # Compute inverse Hessian
-        H = self.H
-        del self.H
-
-        # Add damping for numerical stability
-        damp = self.percdamp * torch.mean(torch.diag(H))
-        diag = torch.arange(self.columns, device=self.device)
-        H[diag, diag] += damp
-
-        # Try Cholesky decomposition with fallback
-        try:
-            H = torch.linalg.cholesky(H)
-            H = torch.cholesky_inverse(H)
-            H = torch.linalg.cholesky(H, upper=True)
-            Hinv = H
-        except torch._C._LinAlgError:
-            print("Warning: Cholesky decomposition failed, using eigendecomposition instead")
-            # Fallback to eigendecomposition
-            try:
-                # Add more damping
-                H[diag, diag] += damp * 10
-                eigenvalues, eigenvectors = torch.linalg.eigh(H)
-
-                # Ensure all eigenvalues are positive
-                eigenvalues = eigenvalues.clamp(min=1e-5)
-
-                # Reconstruct inverse
-                Hinv = eigenvectors @ torch.diag(1.0 / eigenvalues) @ eigenvectors.T
-            except:
-                print("Warning: Eigendecomposition also failed, using diagonal approximation")
-                # Last resort: diagonal approximation
-                diagonal = torch.diag(H).clamp(min=1e-5)
-                Hinv = torch.diag(1.0 / diagonal)
-
-        # Initialize quantization parameters
-        n_groups = (self.rows + self.groupsize - 1) // self.groupsize
-        scale = torch.zeros((n_groups, 1), device=self.device)
-        zero = torch.zeros((n_groups, 1), device=self.device)
-
-        # Create quantized weight tensor with appropriate dtype
-        if self.wbits <= 8:
-            Q = torch.zeros((self.rows, self.columns), dtype=torch.uint8, device=self.device)
-        else:
-            # For now, use int16 for >8 bits, though this won't save space
-            Q = torch.zeros((self.rows, self.columns), dtype=torch.int16, device=self.device)
-
-        # Quantize layer weights
-        for i1 in range(0, self.columns, 128):
-            i2 = min(i1 + 128, self.columns)
-            count = i2 - i1
-
-            # Extract block
-            W1 = W[:, i1:i2].clone()
-            Q1 = torch.zeros_like(W1)
-            Err1 = torch.zeros_like(W1)
-            Losses1 = torch.zeros_like(W1)
-            Hinv1 = Hinv[i1:i2, i1:i2]
-
-            for i in range(count):
-                w = W1[:, i]
-                d = Hinv1[i, i]
-
-                # Quantize groups
-                for j in range(0, self.rows, self.groupsize):
-                    j2 = min(j + self.groupsize, self.rows)
-                    group_idx = j // self.groupsize
-
-                    # Find optimal scale and zero point
-                    w_group = w[j:j2]
-
-                    # Handle empty groups
-                    if w_group.numel() == 0:
-                        continue
-
-                    w_min = w_group.min()
-                    w_max = w_group.max()
-
-                    # Avoid division by zero
-                    if w_max == w_min:
-                        scale_val = 1.0
-                        zero_val = 0.0
-                    else:
-                        scale_val = (w_max - w_min) / self.maxq
-                        zero_val = torch.round(-w_min / scale_val)
-
-                    if group_idx < scale.shape[0]:
-                        scale[group_idx] = scale_val
-                        zero[group_idx] = zero_val
-
-                    # Quantize
-                    q = torch.clamp(torch.round(w_group / scale_val + zero_val), 0, self.maxq)
-                    Q1[j:j2, i] = q
-
-                    # Dequantize for error computation
-                    dequant = (q - zero_val) * scale_val
-                    Err1[j:j2, i] = (w_group - dequant) / d if d != 0 else 0
-
-                # Update remaining weights
-                if i + 1 < count:
-                    # Ensure proper matrix multiplication dimensions
-                    err_col = Err1[:, i:i + 1]  # Shape: (rows, 1)
-                    hinv_row = Hinv1[i, i + 1:].unsqueeze(0)  # Shape: (1, remaining_cols)
-                    update = err_col.matmul(hinv_row)  # Shape: (rows, remaining_cols)
-                    W1[:, i + 1:] -= update
-
-            # Store in compact format
-            Q[:, i1:i2] = Q1.to(Q.dtype)
-
-        # Store quantized weights in packed format
-        self.qweight = Q
-        self.scale = scale
-        self.zero = zero
-        self.quantized = True
-
-        return Q, scale, zero
-
-
-def prepare_calibration_data(
-        tokenizer,
-        dataset_path: str = None,
-        num_samples: int = 128,
-        seq_length: int = 2048
-) -> List[torch.Tensor]:
-    """
-    Prepare calibration dataset for GPTQ
-
-    Args:
-        tokenizer: Model tokenizer
-        dataset_path: Path to calibration dataset (text file)
-        num_samples: Number of calibration samples
-        seq_length: Sequence length for each sample
-
-    Returns:
-        List of tokenized samples
-    """
-    # Fix padding token issue (common with LLaMA models)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        if tokenizer.pad_token is None:
-            # If still None, use a common token
-            tokenizer.pad_token = tokenizer.unk_token
-            if tokenizer.pad_token is None:
-                # Last resort - add a padding token
-                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-
-    if dataset_path is None:
-        # Use a default calibration text if no dataset provided
-        default_text = """
-        The quick brown fox jumps over the lazy dog. 
-        Machine learning is transforming the world of technology.
-        Large language models have revolutionized natural language processing.
-        Artificial intelligence is rapidly advancing across various domains.
-        Deep learning has enabled breakthroughs in computer vision and NLP.
-        Transformer architectures have become the foundation of modern AI.
-        """ * 50
-
-        texts = [default_text[i:i + 1000] for i in range(0, len(default_text) - 1000, 1000)][:num_samples]
-    else:
-        with open(dataset_path, 'r', encoding='utf-8') as f:
-            text = f.read()
-            # Split into chunks
-            chunk_size = max(1000, len(text) // (num_samples + 1))
-            texts = [text[i:i + chunk_size] for i in range(0, len(text) - chunk_size, chunk_size // 2)][:num_samples]
-
-    # Tokenize
-    calibration_data = []
-    for text in texts[:num_samples]:
-        # Skip empty texts
-        if not text.strip():
-            continue
-
-        tokens = tokenizer(
-            text,
-            return_tensors='pt',
-            max_length=seq_length,
-            truncation=True,
-            padding='max_length'
-        )
-        calibration_data.append(tokens.input_ids)
-
-    # Ensure we have enough samples
-    if len(calibration_data) < num_samples:
-        print(f"Warning: Only {len(calibration_data)} calibration samples available (requested {num_samples})")
-
-    return calibration_data
-
-
-def quantize_litellama_model(
-        model_path: str,
-        output_path: str,
-        calibration_data_path: Optional[str] = None,
-        wbits: int = 4,
-        groupsize: int = 128,
-        device: str = "cuda",
-        num_samples: int = 128,
-        seq_length: int = 2048,
-        skip_vision: bool = False,
-) -> None:
-    """
-    Main function to quantize a lite_llama model using GPTQ
-
-    Args:
-        model_path: Path to converted lite_llama model directory
-        output_path: Path to save quantized model
-        calibration_data_path: Path to calibration dataset
-        wbits: Quantization bits (4, 8, etc.)
-        groupsize: Group size for quantization
-        device: Device to use for quantization
-    """
-    print(f"Loading model from {model_path}")
-
-    # Load model weights
-    model_files = [f for f in os.listdir(model_path) if f.endswith('.pth')]
-    if not model_files:
-        raise ValueError(f"No .pth file found in {model_path}")
-
-    model_file = os.path.join(model_path, model_files[0])
-    state_dict = torch.load(model_file, map_location=device)
-
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-    # Prepare calibration data
-    print("Preparing calibration data...")
-    calibration_data = prepare_calibration_data(
-        tokenizer,
-        calibration_data_path,
-        num_samples=num_samples,
-        seq_length=seq_length
-    )
-
-    # Create output directory
-    os.makedirs(output_path, exist_ok=True)
-
-    # Quantize each layer
-    quantized_state_dict = {}
-    quantization_config = {
-        "wbits": wbits,
-        "groupsize": groupsize,
-        "layers": {}
-    }
-
-    # Detect if this is a LLaVA model by checking for language_model prefix
-    is_llava = any("language_model" in key for key in state_dict.keys())
-
-    # Get all weight keys that need quantization
-    weight_keys_to_quantize = []
-
-    # Updated patterns for both regular and LLaVA models
-    patterns = [
-        # Regular model patterns
-        "q_proj", "kv_proj", "o_proj",
-        "gate_proj", "up_proj", "down_proj",
-        "lm_head", "embed_tokens",
-        # LLaVA specific patterns (without _weight suffix in search)
-        "q_proj_weight", "kv_proj_weight", "o_proj_weight"
-    ]
-
-    for key in state_dict.keys():
-        # For LLaVA models, also check for language_model prefix
-        if is_llava and "language_model" in key:
-            if any(pattern in key for pattern in patterns):
-                weight_keys_to_quantize.append(key)
-        elif any(pattern in key for pattern in patterns) and ("weight" in key or key.endswith(("_weight", ".weight"))):
-            weight_keys_to_quantize.append(key)
-
-    print(f"Found {len(weight_keys_to_quantize)} weights to quantize")
-    if is_llava:
-        print("Detected LLaVA model structure")
-
-    # Process each weight
-    skipped_vision_count = 0
-    for key in tqdm(weight_keys_to_quantize, desc="Quantizing layers"):
-        weight = state_dict[key]
-        is_vision = "vision" in key.lower() or "multi_modal_projector" in key.lower()
-
-        # Skip if weight is too small
-        if weight.numel() < 1024:
-            print(f"\nSkipping {key} (too small: {weight.numel()} parameters)")
-            quantized_state_dict[key] = weight
-            continue
-
-        # Skip vision weights if requested
-        if skip_vision and is_vision:
-            quantized_state_dict[key] = weight
-            skipped_vision_count += 1
-            continue
-
-        print(f"\nQuantizing {key} (shape: {weight.shape})...")
-
-        # Create a dummy layer for GPTQ
-        layer = nn.Linear(weight.shape[1], weight.shape[0], bias=False)
-        layer.weight.data = weight.to(device)
-
-        # Adjust percdamp for different layer types
-        percdamp = 0.01
-        if "embed" in key or "lm_head" in key:
-            percdamp = 0.1  # Higher damping for embeddings
-            print(f"  Using higher damping (0.1) for {key}")
-
-        # Initialize GPTQ
-        gptq = GPTQ(
-            layer=layer,
-            wbits=wbits,
-            groupsize=groupsize,
-            device=device,
-            percdamp=percdamp
-        )
-
-        # Add calibration data (simplified - in practice, you'd run forward passes)
-        # Get embedding weight if available
-        embed_key = None
-
-        # Search for embedding key - handle both regular and LLaVA models
-        embed_patterns = [
-            "embed_tokens.weight",
-            "language_model.embed_tokens.weight"
-        ]
-
-        for k in state_dict.keys():
-            for pattern in embed_patterns:
-                if pattern in k:
-                    embed_key = k
-                    break
-            if embed_key:
-                break
-
-        if embed_key and len(calibration_data) > 0:
-            embed_weight = state_dict[embed_key].to(device)
-            # Use actual token embeddings as input
-            for i in range(min(len(calibration_data), 32)):
-                tokens = calibration_data[i][0].to(device)
-                # Get embeddings for these tokens
-                embeddings = torch.embedding(embed_weight, tokens)
-                # Average pool to get input dimension
-                if embeddings.shape[1] > weight.shape[1]:
-                    # Use adaptive pooling to match dimensions
-                    embeddings = torch.nn.functional.adaptive_avg_pool1d(
-                        embeddings.transpose(1, 2),
-                        weight.shape[1]
-                    ).transpose(1, 2)
-                elif embeddings.shape[1] < weight.shape[1]:
-                    # Skip if embedding dimension doesn't match
-                    continue
-
-                # Take mean across sequence length for this layer's input
-                fake_inp = embeddings.mean(dim=0, keepdim=True)
-                if fake_inp.shape[1] == weight.shape[1]:
-                    gptq.add_batch(fake_inp)
-        else:
-            # Fallback to random data if no embeddings available
-            for _ in range(min(len(calibration_data), 32)):
-                fake_inp = torch.randn(1, weight.shape[1], device=device) * 0.1
-                gptq.add_batch(fake_inp)
-
-        # Quantize
-        qweight, scale, zero = gptq.quantize()
-
-        # Pack weights if 4-bit quantization
-        if wbits == 4:
-            # Store original shape for unpacking
-            original_shape = qweight.shape
-            packed_weight = pack_4bit_weights(qweight, qweight.shape[0], qweight.shape[1])
-            quantized_state_dict[key] = packed_weight.cpu()
-            # Store scale and zero as tensors with specific keys
-            quantized_state_dict[f"{key}.scale"] = scale.cpu()
-            quantized_state_dict[f"{key}.zero"] = zero.cpu()
-            quantization_config["layers"][key] = {
-                "groupsize": groupsize,
-                "wbits": wbits,
-                "original_shape": list(original_shape),
-                "packed": True
-            }
-        else:
-            # For 8-bit or other, store directly
-            quantized_state_dict[key] = qweight.cpu()
-            quantized_state_dict[f"{key}.scale"] = scale.cpu()
-            quantized_state_dict[f"{key}.zero"] = zero.cpu()
-            quantization_config["layers"][key] = {
-                "groupsize": groupsize,
-                "wbits": wbits,
-                "packed": False
-            }
-
-        # Clean up
-        del layer, gptq
-        torch.cuda.empty_cache()
-        gc.collect()
-
-    # Copy non-quantized weights
-    for key in state_dict.keys():
-        if key not in quantized_state_dict:
-            quantized_state_dict[key] = state_dict[key]
-
-    # Save quantized model - now everything is in the state dict
-    model_id = os.path.basename(model_path)
-    torch.save(
-        quantized_state_dict,
-        os.path.join(output_path, f"{model_id}-{wbits}bit-gptq.pth")
-    )
-
-    # Save quantization config
-    with open(os.path.join(output_path, "quantization_config.json"), "w") as f:
-        json.dump(quantization_config, f, indent=2)
-
-    # Copy other files
-    for file in os.listdir(model_path):
-        if file.endswith('.json') and file != "quantization_config.json":
-            src = os.path.join(model_path, file)
-            dst = os.path.join(output_path, file)
-            with open(src, 'r') as f_in, open(dst, 'w') as f_out:
-                f_out.write(f_in.read())
-
-    if os.path.exists(os.path.join(model_path, "tokenizer.model")):
-        import shutil
-        shutil.copy(
-            os.path.join(model_path, "tokenizer.model"),
-            os.path.join(output_path, "tokenizer.model")
-        )
-
-    print(f"Quantization complete! Model saved to {output_path}")
-
-    if skipped_vision_count > 0:
-        print(f"Skipped {skipped_vision_count} vision model weights")
-
-    # Print compression statistics
-    original_size = sum(p.numel() * p.element_size() for p in state_dict.values())
-
-    # Calculate quantized size more accurately
-    quantized_size = 0
-    for key, tensor in quantized_state_dict.items():
-        # Add size of each tensor in the quantized state dict
-        quantized_size += tensor.numel() * tensor.element_size()
-
-    compression_ratio = original_size / quantized_size if quantized_size > 0 else 0
-
-    print(f"\nOriginal model size: {original_size / 1e9:.2f} GB")
-    print(f"Quantized model size: {quantized_size / 1e9:.2f} GB")
-    print(f"Compression ratio: {compression_ratio:.2f}x")
-    print(f"Space saved: {(1 - quantized_size / original_size) * 100:.1f}%")
-
-    # Expected compression ratios
-    expected_ratio = 32 / (wbits + 0.5)  # 0.5 for metadata overhead
-    if compression_ratio < expected_ratio * 0.8:
-        print(f"\nNote: Compression ratio is lower than expected ({expected_ratio:.1f}x)")
-        print("This may be due to non-quantized layers (embeddings, layer norms, etc.)")
-
-
-def dequantize_weight(
-        quantized_weight: torch.Tensor,
-        scale: torch.Tensor,
-        zero: torch.Tensor,
-        wbits: int = 4,
-        groupsize: int = 128,
-        original_shape: Optional[Tuple[int, int]] = None,
-        packed: bool = False
-) -> torch.Tensor:
-    """
-    Dequantize a weight tensor
-
-    Args:
-        quantized_weight: Quantized weight tensor (possibly packed)
-        scale: Scale parameters
-        zero: Zero point parameters
-        wbits: Quantization bits
-        groupsize: Group size used in quantization
-        original_shape: Original shape before packing (for 4-bit)
-        packed: Whether the weights are packed
-
-    Returns:
-        Dequantized weight tensor
-    """
-    # Unpack if necessary
-    if packed and wbits == 4:
-        if original_shape is None:
-            raise ValueError("original_shape required for unpacking 4-bit weights")
-        quantized_weight = unpack_4bit_weights(
-            quantized_weight,
-            original_shape[0],
-            original_shape[1]
-        )
-
-    weight = torch.zeros_like(quantized_weight, dtype=torch.float32)
-
-    for i in range(0, quantized_weight.shape[0], groupsize):
-        j = min(i + groupsize, quantized_weight.shape[0])
-        group_idx = i // groupsize
-
-        if group_idx < scale.shape[0]:
-            weight[i:j] = (quantized_weight[i:j].float() - zero[group_idx]) * scale[group_idx]
-
-    return weight
-
-
-def load_quantized_model(model_path: str, device: str = "cpu"):
-    """
-    Load a quantized model
-
-    Args:
-        model_path: Path to the quantized .pth file
-        device: Device to load the model to
-
-    Returns:
-        state_dict with quantized weights and metadata
-    """
-    return torch.load(model_path, map_location=device)
-
-
-def dequantize_model(model_path: str, quantization_config: dict, output_path: str = None):
-    """
-    Fully dequantize a quantized model back to fp16/fp32
-
-    Args:
-        model_path: Path to quantized model
-        quantization_config: Quantization configuration dict
-        output_path: Where to save dequantized model
-    """
-    # Load quantized model
-    state_dict = load_quantized_model(model_path)
-
-    # Dequantize each layer
-    dequantized_dict = {}
-    for key, tensor in state_dict.items():
-        # Skip scale and zero tensors
-        if key.endswith('.scale') or key.endswith('.zero'):
-            continue
-
-        # Check if this is a quantized layer
-        if key in quantization_config["layers"]:
-            layer_config = quantization_config["layers"][key]
-            scale = state_dict[f"{key}.scale"]
-            zero = state_dict[f"{key}.zero"]
-
-            dequantized = dequantize_weight(
-                tensor,
-                scale,
-                zero,
-                wbits=layer_config["wbits"],
-                groupsize=layer_config["groupsize"],
-                original_shape=layer_config.get("original_shape"),
-                packed=layer_config.get("packed", False)
-            )
-            dequantized_dict[key] = dequantized
-        else:
-            dequantized_dict[key] = tensor
-
-    # Save if output path provided
-    if output_path:
-        torch.save(dequantized_dict, output_path)
-        print(f"Dequantized model saved to {output_path}")
-
-    return dequantized_dict
-
-
-# Integration with your existing code
-def quantize_after_conversion(
-        checkpoints_dir: str,
-        model_type: str,  # "llama", "qwen2", or "llava"
-        calibration_data_path: Optional[str] = None,
-        wbits: int = 4,
-        groupsize: int = 128,
-        num_samples: int = 128,
-        seq_length: int = 2048,
-        skip_vision: bool = False
-):
-    """
-    Quantize model after it has been converted to lite_llama format
-
-    Args:
-        checkpoints_dir: Original HF model directory
-        model_type: Type of model ("llama", "qwen2", or "llava")
-        calibration_data_path: Path to calibration dataset
-        wbits: Quantization bits
-        groupsize: Group size for quantization (0 for auto-detect)
-        num_samples: Number of calibration samples
-        seq_length: Sequence length for calibration
-        skip_vision: Skip quantization of vision weights (for LLaVA)
-    """
-    # Construct paths
-    model_id = os.path.basename(os.path.normpath(checkpoints_dir))
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-
-    # Path to converted model
-    converted_model_path = os.path.join(current_dir, f"../../my_weight/{model_id}")
-
-    # Path for quantized model
-    quantized_model_path = os.path.join(current_dir, f"../../my_weight/{model_id}-{wbits}bit-gptq")
-
-    # Perform quantization
-    quantize_litellama_model(
-        model_path=converted_model_path,
-        output_path=quantized_model_path,
-        calibration_data_path=calibration_data_path,
-        wbits=wbits,
-        groupsize=groupsize,
-        num_samples=num_samples,
-        seq_length=seq_length,
-        skip_vision=skip_vision
-    )
-
-    return quantized_model_path
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/__init__.py b/lite_llama/quantization/gptq/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lite_llama/quantization/gptq/gptq_executor.py b/lite_llama/quantization/gptq/gptq_executor.py
new file mode 100644
index 0000000..a5ba0cc
--- /dev/null
+++ b/lite_llama/quantization/gptq/gptq_executor.py
@@ -0,0 +1,218 @@
+"""
+Extended ModelExecutor with GPTQ support
+"""
+
+import torch
+import json
+import time
+from pathlib import Path
+from typing import Optional, Dict, Any
+
+from lite_llama.executor.model_executor import ModelExecutor
+from lite_llama.executor.weight_convert import (
+    convert_llama_hf_to_litellama,
+    convert_qwen2_hf_to_litellama,
+    convert_llama_torch_to_litellama,
+)
+from lite_llama.models.model_config import LlamaConfig, Qwen2Config
+from lite_llama.quantization.gptq.gptq_loader import GPTQModelLoader, load_gptq_quantize_config
+from lite_llama.utils.logger import log
+
+
+class GPTQModelExecutor(ModelExecutor):
+    """Extended ModelExecutor with GPTQ quantization support"""
+
+    @staticmethod
+    def _is_gptq_model(checkpoints_dir: str) -> bool:
+        """Check if the model directory contains GPTQ quantized model"""
+        quantize_config_path = Path(checkpoints_dir) / "quantization_config.json"
+        return quantize_config_path.exists()
+
+    @staticmethod
+    def _load_model_weight(
+            model_config,
+            checkpoints_dir,
+            load_model=True,
+            triton_weight=True,
+            device="cuda",
+            use_gptq=None,  # New parameter: None=auto-detect, True=force GPTQ, False=force original
+    ):
+        """Extended weight loading with GPTQ support"""
+        start_time = time.time()
+
+        # Auto-detect GPTQ if not specified
+        if use_gptq is None:
+            use_gptq = GPTQModelExecutor._is_gptq_model(checkpoints_dir)
+            if use_gptq:
+                log.info(f"GPTQ quantized model detected in {checkpoints_dir}")
+
+        # Initialize model
+        with torch.no_grad():
+            model = ModelExecutor._initialize_model(model_config, device=device)
+            state_dict = None
+
+        if not load_model:
+            # Use conversion function (original path)
+            if model_config.model_type.lower() == "llama":
+                # Try to determine if it's HF or torch format
+                config_path = Path(checkpoints_dir) / "config.json"
+                if config_path.exists():
+                    state_dict = convert_llama_hf_to_litellama(checkpoints_dir, None, model_config)
+                else:
+                    state_dict = convert_llama_torch_to_litellama(checkpoints_dir, None, model_config)
+            elif model_config.model_type.lower() == "qwen2":
+                state_dict = convert_qwen2_hf_to_litellama(checkpoints_dir, None, model_config)
+            else:
+                log.error(f"Unsupported model type: {model_config.model_type}")
+                raise ValueError(f"Unsupported model type: {model_config.model_type}")
+        elif use_gptq:
+            # Load GPTQ model
+            state_dict = GPTQModelLoader.load(checkpoints_dir, model_config, device)
+        else:
+            # Original loading path
+            checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
+            if not checkpoints:
+                log.error(f"No checkpoint files found in {checkpoints_dir}")
+                raise FileNotFoundError(f"No checkpoint files found in {checkpoints_dir}")
+
+            ckpt_path = str(checkpoints[0])
+            log.info(f'Loading checkpoint "{ckpt_path}"')
+            state_dict = torch.load(
+                ckpt_path, mmap=True, weights_only=True, map_location=device
+            )
+
+        # Load state dict into model
+        model.load_state_dict(state_dict, strict=True, assign=True)
+        model.eval()
+        log.info(f"Loaded state dict in {time.time() - start_time:.2f}s")
+
+        # Convert to half precision
+        model.half().to(device)
+        for param in model.parameters():
+            assert param.dtype == torch.float16, "Model parameters are not in FP16"
+        log.info("Converted model to half precision (FP16)")
+
+        return model
+
+    @staticmethod
+    def build(
+            checkpoints_dir: str,
+            max_seq_len: int,
+            max_gpu_num_blocks: Optional[int] = None,
+            load_model: bool = True,
+            triton_weight: bool = True,
+            compiled_model: bool = False,
+            device: str = "cuda",
+            use_gptq: Optional[bool] = None,  # New parameter for GPTQ
+    ):
+        """
+        Build ModelExecutor with GPTQ support
+
+        Args:
+            checkpoints_dir: Model checkpoint directory
+            max_seq_len: Maximum sequence length
+            max_gpu_num_blocks: Maximum GPU memory blocks
+            load_model: Whether to load model weights
+            triton_weight: Whether to use Triton kernels
+            compiled_model: Whether to compile model
+            device: Device to use
+            use_gptq: Whether to use GPTQ (None=auto-detect)
+        """
+        model_config = ModelExecutor._load_model_config(
+            checkpoints_dir, max_seq_len, device=device
+        )
+
+        model = GPTQModelExecutor._load_model_weight(
+            model_config, checkpoints_dir, load_model, triton_weight, device, use_gptq
+        )
+
+        return ModelExecutor(
+            model_config, model, max_gpu_num_blocks, compiled_model, device
+        )
+
+
+def create_gptq_generate_text_class():
+    """Create a GenerateText class with GPTQ support"""
+
+    from lite_llama.generate import GenerateText
+
+    class GPTQGenerateText(GenerateText):
+        """GenerateText with GPTQ model support"""
+
+        def __init__(
+                self,
+                checkpoints_dir: str,
+                tokenizer_path: str,
+                max_seq_len=1024,
+                max_gpu_num_blocks=None,
+                load_model=True,
+                triton_weight=True,
+                compiled_model=False,
+                device="cuda",
+                use_gptq=None,  # New parameter
+        ):
+            self.checkpoints_dir = checkpoints_dir
+            self.compiled_model = compiled_model
+            self.device = device
+
+            # Use GPTQModelExecutor instead of ModelExecutor
+            self.model_executor = GPTQModelExecutor.build(
+                checkpoints_dir=checkpoints_dir,
+                max_seq_len=max_seq_len,
+                max_gpu_num_blocks=max_gpu_num_blocks,
+                load_model=load_model,
+                triton_weight=triton_weight,
+                compiled_model=compiled_model,
+                device=device,
+                use_gptq=use_gptq,
+            )
+            self.model_config = self.model_executor.model_config
+            assert self.model_config.vocab_size != -1, "Vocab size must be set"
+            self.tokenizer = self.load_tokenizer(tokenizer_path)
+
+    return GPTQGenerateText
+
+
+def create_gptq_generate_stream_text_class():
+    """Create a GenerateStreamText class with GPTQ support"""
+
+    from lite_llama.generate_stream import GenerateStreamText
+
+    class GPTQGenerateStreamText(GenerateStreamText):
+        """GenerateStreamText with GPTQ model support"""
+
+        def __init__(
+                self,
+                checkpoints_dir: str,
+                tokenizer_path: str,
+                max_gpu_num_blocks=None,
+                max_seq_len=1024,
+                load_model=True,
+                triton_weight=True,
+                compiled_model=False,
+                device="cuda",
+                use_gptq=None,  # New parameter
+        ):
+            self.checkpoints_dir = checkpoints_dir
+
+            # Use GPTQModelExecutor instead of ModelExecutor
+            self.model_executor = GPTQModelExecutor.build(
+                checkpoints_dir=checkpoints_dir,
+                load_model=load_model,
+                max_gpu_num_blocks=max_gpu_num_blocks,
+                max_seq_len=max_seq_len,
+                triton_weight=triton_weight,
+                compiled_model=compiled_model,
+                device=device,
+                use_gptq=use_gptq,
+            )
+            self.tokenizer = self.load_tokenizer(tokenizer_path)
+            self.model_config = self.model_executor.model_config
+            self.device = device
+
+    return GPTQGenerateStreamText
+
+
+# Export the GPTQ-enabled classes
+GPTQGenerateText = create_gptq_generate_text_class()
+GPTQGenerateStreamText = create_gptq_generate_stream_text_class()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
new file mode 100644
index 0000000..90e2e8d
--- /dev/null
+++ b/lite_llama/quantization/gptq/gptq_loader.py
@@ -0,0 +1,550 @@
+"""
+GPTQ weight loading and dequantization utilities for lite_llama
+"""
+
+import torch
+import torch.nn as nn
+from pathlib import Path
+import json
+import time
+from typing import Dict, Optional, Tuple, Any
+import numpy as np
+
+try:
+    import safetensors.torch
+    HAS_SAFETENSORS = True
+except ImportError:
+    HAS_SAFETENSORS = False
+    print("Warning: safetensors not installed. Install with: pip install safetensors")
+
+
+class GPTQConfig:
+    """Configuration for GPTQ quantization parameters"""
+    def __init__(self, bits: int = 4, group_size: int = 128,
+                 desc_act: bool = False, sym: bool = True,
+                 true_sequential: bool = True):
+        self.bits = bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.sym = sym
+        self.true_sequential = true_sequential
+        self.pack_num = 32 // self.bits  # number of weights packed in int32
+
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any]) -> "GPTQConfig":
+        """Create GPTQConfig from dictionary"""
+        return cls(
+            bits=config_dict.get("bits", 4),
+            group_size=config_dict.get("group_size", 128),
+            desc_act=config_dict.get("desc_act", False),
+            sym=config_dict.get("sym", True),
+            true_sequential=config_dict.get("true_sequential", True)
+        )
+
+
+def load_gptq_quantize_config(model_path: str) -> Optional[GPTQConfig]:
+    """Load GPTQ quantization config from model directory"""
+    quantize_config_path = Path(model_path) / "quantization_config.json"
+    if not quantize_config_path.exists():
+        return None
+
+    with open(quantize_config_path, 'r') as f:
+        config_dict = json.load(f)
+
+    return GPTQConfig.from_dict(config_dict)
+
+
+def unpack_gptq_weights(qweight: torch.Tensor, bits: int = 4) -> torch.Tensor:
+    """
+    Unpack GPTQ quantized weights from int32 format
+
+    Args:
+        qweight: Packed quantized weights [out_features, in_features // pack_num]
+        bits: Number of bits per weight (4 or 8)
+
+    Returns:
+        Unpacked weights [out_features, in_features]
+    """
+    pack_num = 32 // bits
+    out_features = qweight.shape[0]
+    in_features = qweight.shape[1] * pack_num
+
+    unpacked_weights = torch.zeros((out_features, in_features),
+                                   dtype=torch.int32, device=qweight.device)
+
+    for i in range(pack_num):
+        shift = i * bits
+        if bits == 4:
+            mask = 0xF
+        elif bits == 8:
+            mask = 0xFF
+        else:
+            raise ValueError(f"Unsupported bits: {bits}")
+
+        unpacked_weights[:, i::pack_num] = (qweight >> shift) & mask
+
+    return unpacked_weights
+
+
+def dequantize_gptq(qweight: torch.Tensor, qzeros: torch.Tensor,
+                    scales: torch.Tensor, g_idx: Optional[torch.Tensor] = None,
+                    bits: int = 4, group_size: int = 128) -> torch.Tensor:
+    """
+    Dequantize GPTQ weights
+
+    Args:
+        qweight: Packed quantized weights
+        qzeros: Packed zero points
+        scales: Scale factors
+        g_idx: Group indices (optional, for act-order)
+        bits: Quantization bits
+        group_size: Quantization group size
+
+    Returns:
+        Dequantized weights in fp16
+    """
+    # Unpack weights and zeros
+    weight = unpack_gptq_weights(qweight, bits).to(torch.float16)
+    zeros = unpack_gptq_weights(qzeros, bits).to(torch.float16)
+
+    # Handle act-order if needed
+    if g_idx is not None:
+        weight = weight[:, g_idx]
+        zeros = zeros[:, g_idx]
+
+    # Reshape for group-wise dequantization
+    out_features, in_features = weight.shape
+    num_groups = in_features // group_size
+
+    weight = weight.reshape(out_features, num_groups, group_size)
+    zeros = zeros.reshape(-1, num_groups, 1)
+    scales = scales.reshape(-1, num_groups, 1)
+
+    # Dequantize: w = (w_q - z) * s
+    weight = (weight - zeros) * scales
+
+    # Reshape back
+    weight = weight.reshape(out_features, in_features)
+
+    return weight
+
+
+def load_gptq_linear_weights(checkpoint_path: str, layer_name: str,
+                            gptq_config: GPTQConfig) -> Dict[str, torch.Tensor]:
+    """
+    Load GPTQ quantized linear layer weights
+
+    Args:
+        checkpoint_path: Path to checkpoint file
+        layer_name: Name prefix of the layer (e.g., "layers.0.self_attn.q_proj")
+        gptq_config: GPTQ configuration
+
+    Returns:
+        Dictionary containing dequantized weight and bias (if exists)
+    """
+    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+
+    # Load quantized components
+    qweight = checkpoint.get(f"{layer_name}.qweight")
+    qzeros = checkpoint.get(f"{layer_name}.qzeros")
+    scales = checkpoint.get(f"{layer_name}.scales")
+    g_idx = checkpoint.get(f"{layer_name}.g_idx", None)
+    bias = checkpoint.get(f"{layer_name}.bias", None)
+
+    if qweight is None or qzeros is None or scales is None:
+        # Fallback to non-quantized weight
+        weight = checkpoint.get(f"{layer_name}.weight")
+        if weight is None:
+            raise ValueError(f"No weight found for {layer_name}")
+        return {"weight": weight, "bias": bias}
+
+    # Dequantize
+    weight = dequantize_gptq(qweight, qzeros, scales, g_idx,
+                            gptq_config.bits, gptq_config.group_size)
+
+    return {"weight": weight, "bias": bias}
+
+
+def convert_gptq_to_lite_llama(checkpoints_dir: str, model_config) -> Dict[str, torch.Tensor]:
+    """
+    Convert GPTQ quantized model to lite_llama format
+
+    Args:
+        checkpoints_dir: Directory containing GPTQ model files
+        model_config: Model configuration
+
+    Returns:
+        State dictionary in lite_llama format
+    """
+    import safetensors.torch
+
+    # Load GPTQ config
+    gptq_config = load_gptq_quantize_config(checkpoints_dir)
+    if gptq_config is None:
+        raise ValueError(f"No quantization_config.json found in {checkpoints_dir}")
+
+    # Find checkpoint files
+    checkpoint_files = sorted(Path(checkpoints_dir).glob("*.safetensors"))
+    use_safetensors = len(checkpoint_files) > 0
+
+    if not checkpoint_files:
+        checkpoint_files = sorted(Path(checkpoints_dir).glob("*.bin"))
+
+    if not checkpoint_files:
+        checkpoint_files = sorted(Path(checkpoints_dir).glob("*.pth"))
+
+    if not checkpoint_files:
+        raise ValueError(f"No checkpoint files found in {checkpoints_dir}")
+
+    # Load all checkpoints (handle sharded models)
+    full_state_dict = {}
+    for checkpoint_file in checkpoint_files:
+        if use_safetensors:
+            if not HAS_SAFETENSORS:
+                raise ImportError("safetensors is required for loading .safetensors files. Install with: pip install safetensors")
+            state_dict = safetensors.torch.load_file(str(checkpoint_file))
+        else:
+            state_dict = torch.load(str(checkpoint_file), map_location="cpu")
+        full_state_dict.update(state_dict)
+
+    # Check if already in lite_llama format
+    is_lite_llama_format = any("kv_proj_weight" in key for key in full_state_dict.keys())
+
+    if is_lite_llama_format:
+        print("Model is already in lite_llama format")
+        # Just dequantize if needed
+        new_state_dict = {}
+        for key, value in full_state_dict.items():
+            # Check if this is a quantized weight
+            base_key = key.replace(".qweight", "").replace(".qzeros", "").replace(".scales", "").replace(".g_idx", "")
+
+            if key.endswith(".qweight"):
+                # This is a quantized weight, dequantize it
+                qweight = value
+                qzeros = full_state_dict.get(base_key + ".qzeros")
+                scales = full_state_dict.get(base_key + ".scales")
+                g_idx = full_state_dict.get(base_key + ".g_idx", None)
+
+                if qzeros is not None and scales is not None:
+                    weight = dequantize_gptq(
+                        qweight, qzeros, scales, g_idx,
+                        gptq_config.bits, gptq_config.group_size
+                    )
+                    new_state_dict[base_key] = weight
+            elif not any(key.endswith(suffix) for suffix in [".qzeros", ".scales", ".g_idx"]):
+                # Regular weight, just copy
+                new_state_dict[key] = value
+
+        return new_state_dict
+
+    # Otherwise, convert based on model type
+    if model_config.model_type.lower() == "llama":
+        new_state_dict = convert_gptq_llama_to_lite_llama(
+            full_state_dict, gptq_config, model_config
+        )
+    elif model_config.model_type.lower() == "qwen2":
+        new_state_dict = convert_gptq_qwen2_to_lite_llama(
+            full_state_dict, gptq_config, model_config
+        )
+    else:
+        raise ValueError(f"Unsupported model type for GPTQ: {model_config.model_type}")
+
+    return new_state_dict
+
+
+def convert_gptq_llama_to_lite_llama(
+    checkpoint: Dict[str, torch.Tensor],
+    gptq_config: GPTQConfig,
+    model_config
+) -> Dict[str, torch.Tensor]:
+    """Convert GPTQ Llama model to lite_llama format"""
+    new_state_dict = {}
+
+    # Check if this is already in lite_llama format
+    is_lite_llama_format = any("kv_proj_weight" in key for key in checkpoint.keys())
+
+    if is_lite_llama_format:
+        # Already in lite_llama format, just process the weights
+        for key, value in checkpoint.items():
+            new_state_dict[key] = value
+        return new_state_dict
+
+    # Load embeddings and norms (these are not quantized)
+    new_state_dict["embed_tokens.weight"] = checkpoint.get("model.embed_tokens.weight")
+    new_state_dict["norm_weight"] = checkpoint.get("model.norm.weight")
+    new_state_dict["lm_head.weight"] = checkpoint.get("lm_head.weight")
+
+    # Process each layer
+    for i in range(model_config.num_layers):
+        # Check if we have separate k_proj and v_proj or merged kv_proj
+        has_separate_kv = f"model.layers.{i}.self_attn.k_proj.weight" in checkpoint or \
+                         f"model.layers.{i}.self_attn.k_proj.qweight" in checkpoint
+
+        if has_separate_kv:
+            # Process separate K and V projections
+            for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]:
+                prefix = f"model.layers.{i}.self_attn.{proj}"
+
+                # Check if quantized weights exist
+                if f"{prefix}.qweight" in checkpoint:
+                    # Load and dequantize
+                    qweight = checkpoint[f"{prefix}.qweight"]
+                    qzeros = checkpoint[f"{prefix}.qzeros"]
+                    scales = checkpoint[f"{prefix}.scales"]
+                    g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                    weight = dequantize_gptq(
+                        qweight, qzeros, scales, g_idx,
+                        gptq_config.bits, gptq_config.group_size
+                    )
+                else:
+                    # Use original weight if not quantized
+                    weight = checkpoint.get(f"{prefix}.weight")
+                    if weight is None and proj in ["k_proj", "v_proj"]:
+                        # Skip if k_proj/v_proj don't exist (might be merged already)
+                        continue
+                    elif weight is None:
+                        raise ValueError(f"No weight found for {prefix}")
+
+                if proj in ["k_proj", "v_proj"]:
+                    # Store temporarily for merging
+                    new_state_dict[f"_temp_{i}_{proj}_weight"] = weight
+                else:
+                    new_state_dict[f"layers.{i}.self_attn.{proj}.weight"] = weight
+
+            # Merge k and v projections if they were separate
+            if f"_temp_{i}_k_proj_weight" in new_state_dict:
+                k_weight = new_state_dict.pop(f"_temp_{i}_k_proj_weight")
+                v_weight = new_state_dict.pop(f"_temp_{i}_v_proj_weight")
+                new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = torch.cat([k_weight, v_weight], dim=0)
+        else:
+            # Already has merged kv_proj
+            # Q projection
+            prefix = f"model.layers.{i}.self_attn.q_proj"
+            if f"{prefix}.qweight" in checkpoint:
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight")
+
+            new_state_dict[f"layers.{i}.self_attn.q_proj.weight"] = weight
+
+            # O projection
+            prefix = f"model.layers.{i}.self_attn.o_proj"
+            if f"{prefix}.qweight" in checkpoint:
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight")
+
+            new_state_dict[f"layers.{i}.self_attn.o_proj.weight"] = weight
+
+            # KV projection (already merged)
+            prefix = f"model.layers.{i}.self_attn.kv_proj"
+            if f"{prefix}.qweight" in checkpoint:
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight",
+                                       checkpoint.get(f"layers.{i}.self_attn.kv_proj_weight"))
+
+            if weight is not None:
+                new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = weight
+
+        # MLP projections
+        for proj in ["gate_proj", "up_proj", "down_proj"]:
+            prefix = f"model.layers.{i}.mlp.{proj}"
+
+            if f"{prefix}.qweight" in checkpoint:
+                # Load and dequantize
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight")
+                if weight is None:
+                    raise ValueError(f"No weight found for {prefix}")
+
+            new_state_dict[f"layers.{i}.mlp.{proj}.weight"] = weight
+
+        # Layer norms (not quantized) - handle different naming conventions
+        attention_norm = checkpoint.get(f"model.layers.{i}.input_layernorm.weight") or \
+                        checkpoint.get(f"layers.{i}.attention_norm_weight") or \
+                        checkpoint.get(f"layers.{i}.input_layernorm_weight")
+
+        ffn_norm = checkpoint.get(f"model.layers.{i}.post_attention_layernorm.weight") or \
+                   checkpoint.get(f"layers.{i}.ffn_norm_weight") or \
+                   checkpoint.get(f"layers.{i}.post_attention_layernorm_weight")
+
+        if attention_norm is not None:
+            new_state_dict[f"layers.{i}.attention_norm_weight"] = attention_norm
+        if ffn_norm is not None:
+            new_state_dict[f"layers.{i}.ffn_norm_weight"] = ffn_norm
+
+    return new_state_dict
+
+
+def convert_gptq_qwen2_to_lite_llama(
+    checkpoint: Dict[str, torch.Tensor],
+    gptq_config: GPTQConfig,
+    model_config
+) -> Dict[str, torch.Tensor]:
+    """Convert GPTQ Qwen2 model to lite_llama format"""
+    new_state_dict = {}
+
+    # Load embeddings and norms
+    new_state_dict["embed_tokens.weight"] = checkpoint.get("model.embed_tokens.weight")
+    new_state_dict["norm_weight"] = checkpoint.get("model.norm.weight")
+    new_state_dict["lm_head_weight"] = checkpoint.get("lm_head.weight")
+
+    # Process each layer
+    for i in range(model_config.num_layers):
+        # Self attention - handle q_proj separately due to bias
+        prefix = f"model.layers.{i}.self_attn.q_proj"
+        if f"{prefix}.qweight" in checkpoint:
+            qweight = checkpoint[f"{prefix}.qweight"]
+            qzeros = checkpoint[f"{prefix}.qzeros"]
+            scales = checkpoint[f"{prefix}.scales"]
+            g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+            weight = dequantize_gptq(
+                qweight, qzeros, scales, g_idx,
+                gptq_config.bits, gptq_config.group_size
+            )
+        else:
+            weight = checkpoint.get(f"{prefix}.weight")
+
+        new_state_dict[f"layers.{i}.self_attn.q_proj_weight"] = weight
+        new_state_dict[f"layers.{i}.self_attn.q_proj_bias"] = checkpoint.get(f"{prefix}.bias")
+
+        # Handle k_proj and v_proj for merging
+        for proj in ["k_proj", "v_proj"]:
+            prefix = f"model.layers.{i}.self_attn.{proj}"
+
+            if f"{prefix}.qweight" in checkpoint:
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight")
+
+            new_state_dict[f"_temp_{i}_{proj}_weight"] = weight
+            new_state_dict[f"_temp_{i}_{proj}_bias"] = checkpoint.get(f"{prefix}.bias")
+
+        # Merge k and v
+        k_weight = new_state_dict.pop(f"_temp_{i}_k_proj_weight")
+        v_weight = new_state_dict.pop(f"_temp_{i}_v_proj_weight")
+        k_bias = new_state_dict.pop(f"_temp_{i}_k_proj_bias")
+        v_bias = new_state_dict.pop(f"_temp_{i}_v_proj_bias")
+
+        new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = torch.cat([k_weight, v_weight], dim=0)
+        new_state_dict[f"layers.{i}.self_attn.kv_proj_bias"] = torch.cat([k_bias, v_bias], dim=0)
+
+        # O projection
+        prefix = f"model.layers.{i}.self_attn.o_proj"
+        if f"{prefix}.qweight" in checkpoint:
+            qweight = checkpoint[f"{prefix}.qweight"]
+            qzeros = checkpoint[f"{prefix}.qzeros"]
+            scales = checkpoint[f"{prefix}.scales"]
+            g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+            weight = dequantize_gptq(
+                qweight, qzeros, scales, g_idx,
+                gptq_config.bits, gptq_config.group_size
+            )
+        else:
+            weight = checkpoint.get(f"{prefix}.weight")
+
+        new_state_dict[f"layers.{i}.self_attn.o_proj_weight"] = weight
+
+        # MLP layers
+        for proj in ["gate_proj", "up_proj", "down_proj"]:
+            prefix = f"model.layers.{i}.mlp.{proj}"
+
+            if f"{prefix}.qweight" in checkpoint:
+                qweight = checkpoint[f"{prefix}.qweight"]
+                qzeros = checkpoint[f"{prefix}.qzeros"]
+                scales = checkpoint[f"{prefix}.scales"]
+                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
+
+                weight = dequantize_gptq(
+                    qweight, qzeros, scales, g_idx,
+                    gptq_config.bits, gptq_config.group_size
+                )
+            else:
+                weight = checkpoint.get(f"{prefix}.weight")
+
+            new_state_dict[f"layers.{i}.mlp.{proj}.weight"] = weight
+
+        # Layer norms
+        new_state_dict[f"layers.{i}.input_layernorm_weight"] = checkpoint.get(
+            f"model.layers.{i}.input_layernorm.weight"
+        )
+        new_state_dict[f"layers.{i}.post_attention_layernorm_weight"] = checkpoint.get(
+            f"model.layers.{i}.post_attention_layernorm.weight"
+        )
+
+    return new_state_dict
+
+
+class GPTQModelLoader:
+    """Helper class to load GPTQ models"""
+
+    @staticmethod
+    def load(checkpoints_dir: str, model_config, device: str = "cuda") -> Dict[str, torch.Tensor]:
+        """
+        Load GPTQ model and convert to lite_llama format
+
+        Args:
+            checkpoints_dir: Directory containing GPTQ model
+            model_config: Model configuration
+            device: Target device
+
+        Returns:
+            State dictionary ready for lite_llama
+        """
+        print(f"Loading GPTQ model from {checkpoints_dir}")
+        start_time = time.time()
+
+        state_dict = convert_gptq_to_lite_llama(checkpoints_dir, model_config)
+
+        # Move to device and convert to fp16
+        for key, value in state_dict.items():
+            if value is not None:
+                state_dict[key] = value.to(device).half()
+
+        print(f"GPTQ model loaded and converted in {time.time() - start_time:.2f}s")
+        return state_dict
\ No newline at end of file
diff --git a/quantize_model.py b/quantize_model.py
deleted file mode 100644
index d9af7e5..0000000
--- a/quantize_model.py
+++ /dev/null
@@ -1,273 +0,0 @@
-#!/usr/bin/env python3
-"""
-Quantize an already converted lite_llama model using GPTQ
-"""
-
-import argparse
-import os
-import sys
-import json
-import torch
-
-# Add the current directory to Python path
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from lite_llama.quantization.gptq import quantize_litellama_model
-from lite_llama.utils.common import get_model_info, check_model_compatibility
-from lite_llama.utils.logger import log
-
-def parse_arguments():
-    """Parse command line arguments"""
-    parser = argparse.ArgumentParser(
-        description="Quantize an already converted lite_llama model using GPTQ"
-    )
-
-    parser.add_argument(
-        "--model_path",
-        type=str,
-        required=True,
-        help="Path to the converted lite_llama model directory"
-    )
-
-    parser.add_argument(
-        "--output_path",
-        type=str,
-        default=None,
-        help="Path to save the quantized model (default: auto-generated based on model_path)"
-    )
-
-    parser.add_argument(
-        "--wbits",
-        type=int,
-        default=4,
-        choices=[2, 3, 4, 8],
-        help="Number of bits for quantization (default: 4)"
-    )
-
-    parser.add_argument(
-        "--groupsize",
-        type=int,
-        default=128,
-        help="Group size for quantization (default: 128, -1 for no grouping, 0 for auto-detect)"
-    )
-
-    parser.add_argument(
-        "--calibration_data",
-        type=str,
-        default=None,
-        help="Path to calibration dataset file for GPTQ (optional)"
-    )
-
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help="Device to use for quantization (default: cuda)"
-    )
-
-    parser.add_argument(
-        "--num_samples",
-        type=int,
-        default=128,
-        help="Number of calibration samples to use (default: 128)"
-    )
-
-    parser.add_argument(
-        "--seq_length",
-        type=int,
-        default=2048,
-        help="Sequence length for calibration samples (default: 2048)"
-    )
-
-    parser.add_argument(
-        "--skip_vision",
-        action="store_true",
-        help="Skip quantization of vision model weights (for LLaVA models)"
-    )
-
-    parser.add_argument(
-        "--quantize_vision",
-        action="store_true",
-        help="Force quantization of vision model weights (not recommended for LLaVA)"
-    )
-
-    return parser.parse_args()
-
-
-def main():
-    # Parse arguments
-    args = parse_arguments()
-
-    print("=" * 60)
-    print("GPTQ Quantization for Lite-LLaMA Models")
-    print("=" * 60)
-
-    # Check if model path exists
-    if not os.path.exists(args.model_path):
-        print(f"Error: Model path does not exist: {args.model_path}")
-        return 1
-
-    # Check model compatibility
-    is_compatible, message = check_model_compatibility(args.model_path)
-    if not is_compatible:
-        print(f"Error: {message}")
-        return 1
-
-    # Get model information
-    model_info = get_model_info(args.model_path)
-
-    # Detect if this is a LLaVA model
-    is_llava = model_info["model_type"] == "llava"
-    if is_llava:
-        print("\n⚠️  Detected LLaVA model - will handle vision weights specially")
-        if not args.quantize_vision and not args.skip_vision:
-            args.skip_vision = True  # Default to skipping vision weights
-            print("   Skipping vision weights by default (use --quantize_vision to override)")
-
-    print(f"\nModel Information:")
-    print(f"Name: {model_info['model_name']}")
-    print(f"Type: {model_info['model_type']}")
-    print(f"Size: {model_info['size']:.2f} GB")
-    if is_llava:
-        print(f"  Vision weights: {'Will be quantized' if args.quantize_vision else 'Will be skipped'}")
-
-    # Auto-detect groupsize if requested
-    if args.groupsize == 0:
-        print("\nAuto-detecting optimal groupsize...")
-        # Load a sample weight to check dimensions
-        pth_files = [f for f in os.listdir(args.model_path) if f.endswith('.pth')]
-        if pth_files:
-            sample_weights = torch.load(
-                os.path.join(args.model_path, pth_files[0]),
-                map_location='cpu'
-            )
-
-            # Find vocabulary size from embeddings or lm_head
-            vocab_sizes = []
-            for name, weight in sample_weights.items():
-                if ("embed" in name or "lm_head" in name) and len(weight.shape) >= 2:
-                    vocab_sizes.extend(weight.shape)
-
-            if vocab_sizes:
-                vocab_size = max(vocab_sizes)
-                # Find suitable groupsize
-                for gs in [128, 256, 512, 1024]:
-                    if vocab_size % gs == 0:
-                        args.groupsize = gs
-                        print(f"✓ Selected groupsize: {gs} (evenly divides vocab size {vocab_size})")
-                        break
-                else:
-                    # No perfect divisor found
-                    if vocab_size % 256 < vocab_size % 128:
-                        args.groupsize = 256
-                    else:
-                        args.groupsize = -1
-                    print(f"✓ Selected groupsize: {args.groupsize} (best fit for vocab size {vocab_size})")
-
-            del sample_weights
-
-    print(f"\nQuantization Settings:")
-    print(f"Bits: {args.wbits}")
-    print(f"Group size: {args.groupsize}")
-    print(f"Device: {args.device}")
-    print(f"Calibration data: {args.calibration_data or 'Default'}")
-
-    # Check CUDA availability
-    if args.device == "cuda" and not torch.cuda.is_available():
-        print("\nWarning: CUDA is not available. Falling back to CPU.")
-        print("Note: Quantization on CPU will be significantly slower.")
-        args.device = "cpu"
-
-    # Set output path if not provided
-    if args.output_path is None:
-        parent_dir = os.path.dirname(args.model_path)
-        model_name = os.path.basename(args.model_path)
-        args.output_path = os.path.join(parent_dir, f"{model_name}-{args.wbits}bit-gptq")
-
-    print(f"\nOutput path: {args.output_path}")
-
-    # Confirm before proceeding
-    print("\n" + "-" * 60)
-    response = input("Proceed with quantization? (y/N): ")
-    if response.lower() != 'y':
-        print("Quantization cancelled.")
-        return 0
-
-    print("\n" + "=" * 60)
-    print("Starting quantization...")
-    print("=" * 60 + "\n")
-
-    try:
-        # Run quantization
-        quantize_litellama_model(
-            model_path=args.model_path,
-            output_path=args.output_path,
-            calibration_data_path=args.calibration_data,
-            wbits=args.wbits,
-            groupsize=args.groupsize,
-            device=args.device,
-            num_samples=args.num_samples,
-            seq_length=args.seq_length,
-            skip_vision=args.skip_vision
-        )
-
-        print("\n" + "=" * 60)
-        print("Quantization completed successfully!")
-        print("=" * 60)
-
-        # Print summary
-        print(f"\nQuantized model saved to: {args.output_path}")
-
-        # Calculate and show compression ratio
-        original_size = model_info['size']
-
-        # Calculate quantized size
-        quantized_size = 0
-        if os.path.exists(args.output_path):
-            for f in os.listdir(args.output_path):
-                if f.endswith('.pth'):
-                    file_path = os.path.join(args.output_path, f)
-                    quantized_size += os.path.getsize(file_path) / (1024 ** 3)
-
-        if quantized_size > 0:
-            compression_ratio = original_size / quantized_size
-        else:
-            log.warning("\nWarning: Could not calculate compression ratio (output files not found)")
-            compression_ratio = 0
-
-        log.info(f"\nCompression Statistics:")
-        print(f"Original size: {original_size:.2f} GB")
-        print(f"Quantized size: {quantized_size:.2f} GB")
-        print(f"Compression ratio: {compression_ratio:.2f}x")
-        print(f"Space saved: {(1 - 1 / compression_ratio) * 100:.1f}%")
-
-        # Expected compression analysis
-        expected_ratio = 32 / (args.wbits + 0.5)  # 0.5 for metadata overhead
-        if compression_ratio < 1.5:
-            log.warning(f"\n⚠️Low compression ratio detected!")
-            print(f"  Expected: ~{expected_ratio:.1f}x for {args.wbits}-bit quantization")
-            print(f"  Actual: {compression_ratio:.2f}x")
-            print("\nPossible reasons:")
-            print("  - Model has many non-quantizable layers (embeddings, norms)")
-            print("  - Vision components were skipped (for LLaVA)")
-            print("  - Small model size (quantization overhead is more significant)")
-            print("\nFor better compression, consider:")
-            print("  - Using fewer bits (e.g., 3-bit or 2-bit)")
-            print("  - Larger groupsize (reduces metadata overhead)")
-            print("  - Quantizing embeddings (if safe for your use case)")
-
-    except KeyboardInterrupt:
-        log.error("\n\nQuantization interrupted by user.")
-        return 1
-    except Exception as e:
-        log.error(f"\nError during quantization: {e}")
-        import traceback
-        traceback.print_exc()
-        return 1
-
-    return 0
-
-
-if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file

From 629bd9805cb7c09b51fd9f46128c30992237cf02 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Thu, 29 May 2025 05:31:02 +0930
Subject: [PATCH 19/33] 1

---
 apply_weight_convert.py                       | 217 ++++--
 generate.py                                   |  38 +-
 lite_llama/executor/weight_convert.py         | 209 +++++-
 lite_llama/executor/weight_convert_gptq.py    | 553 ---------------
 lite_llama/quantization/__init__.py           |   0
 lite_llama/quantization/gptq/__init__.py      |   0
 lite_llama/quantization/gptq/gptq.py          | 329 +++++++++
 lite_llama/quantization/gptq/gptq_executor.py | 218 ------
 lite_llama/quantization/gptq/gptq_loader.py   | 668 +++++-------------
 lite_llama/utils/common.py                    |  44 +-
 10 files changed, 868 insertions(+), 1408 deletions(-)
 mode change 100644 => 100755 apply_weight_convert.py
 mode change 100644 => 100755 lite_llama/executor/weight_convert.py
 delete mode 100644 lite_llama/executor/weight_convert_gptq.py
 mode change 100644 => 100755 lite_llama/quantization/__init__.py
 mode change 100644 => 100755 lite_llama/quantization/gptq/__init__.py
 create mode 100755 lite_llama/quantization/gptq/gptq.py
 delete mode 100644 lite_llama/quantization/gptq/gptq_executor.py

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
old mode 100644
new mode 100755
index d366688..916272d
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -1,11 +1,11 @@
 import torch
+import argparse
 from transformers import (
     LlavaForConditionalGeneration,
     AutoConfig,
     AutoModelForCausalLM,
     LlavaConfig,
 )
-import argparse
 
 # 获取 lite_llama 目录的绝对路径并添加到 sys.path 中
 from lite_llama.executor.weight_convert import (
@@ -13,11 +13,6 @@
     convert_llama_hf_to_litellama,
     convert_qwen2_hf_to_litellama,
 )
-from lite_llama.executor.weight_convert_gptq import (
-    convert_llavallama_hf_to_litellama_gptq,
-    convert_llama_hf_to_litellama_gptq,
-    convert_qwen2_hf_to_litellama_gptq,
-)
 
 import warnings
 
@@ -25,106 +20,184 @@
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Convert HF models to LiteLLaMA format with optional GPTQ compression')
-    parser.add_argument('--checkpoint_dir', type=str, required=True,
-                        help='Path to the model checkpoint directory')
-    parser.add_argument('--use_gptq', action='store_true',
-                        help='Enable GPTQ quantization (4-bit by default)')
-    parser.add_argument('--bits', type=int, default=4, choices=[2, 3, 4, 8],
-                        help='Number of bits for GPTQ quantization')
-    parser.add_argument('--group_size', type=int, default=128,
-                        help='Group size for GPTQ quantization')
-    parser.add_argument('--act_order', action='store_true',
-                        help='Use activation order for GPTQ quantization')
-    parser.add_argument('--calibration_dataset', type=str, default='c4',
-                        help='Dataset to use for GPTQ calibration')
-    parser.add_argument('--nsamples', type=int, default=128,
-                        help='Number of calibration samples for GPTQ')
+    parser = argparse.ArgumentParser(
+        description="Convert HuggingFace models to lite_llama format with optional GPTQ compression")
+    parser.add_argument(
+        "--checkpoints_dir",
+        type=str,
+        required=True,
+        help="Path to the model checkpoint directory"
+    )
+    parser.add_argument(
+        "--use_gptq",
+        action="store_true",
+        help="Enable GPTQ quantization"
+    )
+    parser.add_argument(
+        "--wbits",
+        type=int,
+        default=4,
+        help="Number of bits for quantization (default: 4)"
+    )
+    parser.add_argument(
+        "--groupsize",
+        type=int,
+        default=128,
+        help="Group size for quantization (default: 128)"
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device to use for conversion (default: cuda)"
+    )
+    parser.add_argument(
+        "--no_print_params",
+        action="store_true",
+        help="Disable printing parameter information"
+    )
 
     args = parser.parse_args()
 
-    checkpoints_dir = args.checkpoint_dir
+    checkpoints_dir = args.checkpoints_dir
+    use_gptq = args.use_gptq
+    wbits = args.wbits
+    groupsize = args.groupsize
+    device = args.device
+    print_params = not args.no_print_params
+
+    # Print configuration
+    print(f"Converting model from: {checkpoints_dir}")
+    if use_gptq:
+        print(f"GPTQ Quantization enabled: {wbits} bits, groupsize {groupsize}")
+    else:
+        print("GPTQ Quantization: Disabled")
+    print(f"Device: {device}")
+    print("-" * 50)
 
+    # Load model
     if "llava" in checkpoints_dir.lower():
         model = (
-            LlavaForConditionalGeneration.from_pretrained(
+            LlavaForConditionalGeneration.from_pretrained(  # LlavaForConditionalGeneration
                 checkpoints_dir,
                 torch_dtype=torch.float16,
                 low_cpu_mem_usage=True,
-            ).to("cuda")
+            ).to(device)
         )
     else:
         model = AutoModelForCausalLM.from_pretrained(
             checkpoints_dir,
             torch_dtype=torch.float16,
             low_cpu_mem_usage=True,
-        ).to("cuda")
+        ).to(device)
 
     hf_sd = model.state_dict()
 
-    # Determine the conversion function based on model type and GPTQ flag
+    # Convert based on model type
     if "qwen2" in checkpoints_dir.lower():
         llm_config = AutoConfig.from_pretrained(checkpoints_dir)
         num_layers = llm_config.num_hidden_layers
+        print("Model type: Qwen2")
         print("num_layers: ", num_layers)
-
-        if args.use_gptq:
-            print(f"Converting Qwen2 with GPTQ quantization ({args.bits}-bit)...")
-            convert_qwen2_hf_to_litellama_gptq(
-                checkpoints_dir,
-                model,  # Pass model instead of state dict for GPTQ
-                num_layers,
-                bits=args.bits,
-                group_size=args.group_size,
-                act_order=args.act_order,
-                calibration_dataset=args.calibration_dataset,
-                nsamples=args.nsamples
-            )
-        else:
-            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+        convert_qwen2_hf_to_litellama(
+            checkpoints_dir,
+            hf_sd,
+            num_layers,
+            print_params=print_params,
+            device=device,
+            use_gptq=use_gptq,
+            wbits=wbits,
+            groupsize=groupsize
+        )
 
     elif "llama" in checkpoints_dir.lower():
         llm_config = AutoConfig.from_pretrained(checkpoints_dir)
         num_layers = llm_config.num_hidden_layers
+        print("Model type: Llama")
         print("num_layers: ", num_layers)
-
-        if args.use_gptq:
-            print(f"Converting Llama with GPTQ quantization ({args.bits}-bit)...")
-            convert_llama_hf_to_litellama_gptq(
-                checkpoints_dir,
-                model,  # Pass model instead of state dict for GPTQ
-                num_layers,
-                bits=args.bits,
-                group_size=args.group_size,
-                act_order=args.act_order,
-                calibration_dataset=args.calibration_dataset,
-                nsamples=args.nsamples
-            )
-        else:
-            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+        convert_llama_hf_to_litellama(
+            checkpoints_dir,
+            hf_sd,
+            num_layers,
+            use_gptq=use_gptq,
+            wbits=wbits,
+            groupsize=groupsize,
+            device=device
+        )
 
     elif "llava" in checkpoints_dir.lower():
         llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
         num_layers = llava_config.text_config.num_hidden_layers
+        print("Model type: LLaVA")
         print("num_layers: ", num_layers)
-
-        if args.use_gptq:
-            print(f"Converting LLaVA with GPTQ quantization ({args.bits}-bit)...")
-            convert_llavallama_hf_to_litellama_gptq(
-                checkpoints_dir,
-                model,  # Pass model instead of state dict for GPTQ
-                num_layers,
-                bits=args.bits,
-                group_size=args.group_size,
-                act_order=args.act_order,
-                calibration_dataset=args.calibration_dataset,
-                nsamples=args.nsamples
-            )
-        else:
-            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+        convert_llavallama_hf_to_litellama(
+            checkpoints_dir,
+            hf_sd,
+            num_layers,
+            use_gptq=use_gptq,
+            wbits=wbits,
+            groupsize=groupsize,
+            device=device
+        )
     else:
         print("Error! Unsupported model type!")
+        return
+
+    print("\nConversion completed successfully!")
+
+    # Clean up
+    del model
+    torch.cuda.empty_cache()
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    # If script is run directly without arguments, use default values
+    import sys
+
+    if len(sys.argv) == 1:
+        # Legacy behavior - use hardcoded path
+        checkpoints_dir = "/path/llm_weights/llava-v1.5-7b"
+
+        print(f"Running with default path: {checkpoints_dir}")
+        print("To use command line arguments, run with --help")
+        print("-" * 50)
+
+        if "llava" in checkpoints_dir.lower():
+            model = (
+                LlavaForConditionalGeneration.from_pretrained(
+                    checkpoints_dir,
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                ).to("cuda")
+            )
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                checkpoints_dir,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+            ).to("cuda")
+
+        hf_sd = model.state_dict()
+
+        if "qwen2" in checkpoints_dir.lower():
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print("num_layers: ", num_layers)
+            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif "llama" in checkpoints_dir.lower():
+            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
+            num_layers = llm_config.num_hidden_layers
+            print("num_layers: ", num_layers)
+            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+
+        elif "llava" in checkpoints_dir.lower():
+            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
+            num_layers = llava_config.text_config.num_hidden_layers
+            print("num_layers: ", num_layers)
+            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
+        else:
+            print("Error! Unsupported model type!")
+    else:
+        # Use argparse for command line interface
+        main()
\ No newline at end of file
diff --git a/generate.py b/generate.py
index 59aefdb..15526a2 100644
--- a/generate.py
+++ b/generate.py
@@ -5,7 +5,6 @@
 from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
 from lite_llama.utils.prompt_templates import get_prompter
 from lite_llama.generate_stream import GenerateStreamText  # Original import
-from lite_llama.quantization.gptq.gptq_executor import GPTQGenerateStreamText  # GPTQ import
 
 import sys, os, time
 from pathlib import Path
@@ -84,32 +83,17 @@ def main(
         if use_gptq:
             print(f"GPTQ quantized model detected in {checkpoint_path}")
 
-    # Choose appropriate generator class
-    if use_gptq:
-        print("Using GPTQ-enabled generator")
-        generator = GPTQGenerateStreamText(
-            checkpoints_dir=checkpoint_path,
-            tokenizer_path=checkpoint_path,
-            max_gpu_num_blocks=max_gpu_num_blocks,
-            max_seq_len=max_seq_len,
-            load_model=load_model,
-            compiled_model=compiled_model,
-            triton_weight=triton_weight,
-            device=device,
-            use_gptq=True,  # Explicitly set GPTQ mode
-        )
-    else:
-        print("Using standard FP16 generator")
-        generator = GenerateStreamText(
-            checkpoints_dir=checkpoint_path,
-            tokenizer_path=checkpoint_path,
-            max_gpu_num_blocks=max_gpu_num_blocks,
-            max_seq_len=max_seq_len,
-            load_model=load_model,
-            compiled_model=compiled_model,
-            triton_weight=triton_weight,
-            device=device,
-        )
+    print("Using standard FP16 generator")
+    generator = GenerateStreamText(
+        checkpoints_dir=checkpoint_path,
+        tokenizer_path=checkpoint_path,
+        max_gpu_num_blocks=max_gpu_num_blocks,
+        max_seq_len=max_seq_len,
+        load_model=load_model,
+        compiled_model=compiled_model,
+        triton_weight=triton_weight,
+        device=device,
+    )
 
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
old mode 100644
new mode 100755
index 330db61..0114b2b
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -1,43 +1,63 @@
 from tqdm.auto import tqdm
 import torch, os, shutil, glob
 import os.path as osp
-from typing import Dict
+from typing import Dict, Optional
+from ..quantization.gptq.gptq import quantize_gptq  # Import our GPTQ implementation
 
 
-def build_new_weight_dir(checkpoints_dir: str, new_sd):
+def build_new_weight_dir(checkpoints_dir: str, new_sd, quantized: bool = False):
     # 保存 lite_llama 模型权重并构建新的权重目录
     model_id = osp.basename(osp.normpath(checkpoints_dir))
     current_dir = osp.dirname(osp.abspath(__file__))  # 获取当前文件所在的目录
-    my_weight_dir = osp.join(
-        current_dir, "../../my_weight/" + model_id
-    )  # 项目所在根目录
+
+    # Add quantized suffix if using GPTQ
+    weight_dir_name = f"../../my_weight/{model_id}"
+    if quantized:
+        weight_dir_name += "_gptq"
+
+    my_weight_dir = osp.join(current_dir, weight_dir_name)  # 项目所在根目录
     os.makedirs(my_weight_dir, exist_ok=True)  # 创建文件夹（如果不存在）
 
     # 保存模型的状态字典。
+    save_filename = f"{model_id}_gptq.pth" if quantized else f"{model_id}.pth"
     torch.save(
         new_sd,
-        osp.join(my_weight_dir, model_id + ".pth"),
+        osp.join(my_weight_dir, save_filename),
         _use_new_zipfile_serialization=True,
     )
 
     # 获取所有 JSON 文件
     json_files = glob.glob(osp.join(checkpoints_dir, "*.json"))
     for file_path in json_files:
-        shutil.copy(file_path, my_weight_dir) # 复制 hf 权重目录的所有 json 文件到新的目录
-        print(f"Copy: {file_path} -> {my_weight_dir}")
+        shutil.copy(file_path, my_weight_dir)  # 复制 hf 权重目录的所有 json 文件到新的目录
+        print(f"已复制: {file_path} -> {my_weight_dir}")
 
     if osp.exists(osp.join(checkpoints_dir, "tokenizer.model")):
         shutil.copy(osp.join(checkpoints_dir, "tokenizer.model"), my_weight_dir)
 
+
 def convert_qwen2_hf_to_litellama(
-    checkpoints_dir: str,
-    hf_sd,
-    num_layers,
-    print_params: bool = True,
-    device: str = "cuda",
+        checkpoints_dir: str,
+        hf_sd,
+        num_layers,
+        print_params: bool = True,
+        device: str = "cuda",
+        use_gptq: bool = False,
+        wbits: int = 4,
+        groupsize: int = 128,
 ) -> Dict[str, torch.Tensor]:
     """
     将 Hugging Face 格式的预训练模型的权重字典转换为自定义模型的权重字典。
+
+    Args:
+        checkpoints_dir: 模型权重目录
+        hf_sd: HuggingFace 模型状态字典
+        num_layers: 模型层数
+        print_params: 是否打印参数信息
+        device: 设备
+        use_gptq: 是否使用 GPTQ 量化
+        wbits: 量化位数
+        groupsize: 量化组大小
     """
     # 映射嵌入层、映射归一化层、映射模型最后的输出线性层
     mapping = {
@@ -88,10 +108,10 @@ def convert_qwen2_hf_to_litellama(
         v_bias_key = f"layers.{i}.self_attn.v_proj_bias"
 
         if (
-            k_key in new_sd
-            and v_key in new_sd
-            and k_bias_key in new_sd
-            and v_bias_key in new_sd
+                k_key in new_sd
+                and v_key in new_sd
+                and k_bias_key in new_sd
+                and v_bias_key in new_sd
         ):
             # 1. kv weight 权重合并
             k_tensor = new_sd[k_key]
@@ -119,8 +139,28 @@ def convert_qwen2_hf_to_litellama(
             del new_sd[k_bias_key]
             del new_sd[v_bias_key]
 
+    # Apply GPTQ quantization if requested
+    if use_gptq:
+        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+        # Define layers to quantize (excluding embeddings and layer norms)
+        target_layers = []
+        for name in new_sd.keys():
+            if any(pattern in name for pattern in [
+                "q_proj_weight", "kv_proj_weight", "o_proj_weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+            ]) and "bias" not in name:
+                target_layers.append(name)
+
+        new_sd = quantize_gptq(
+            model_state_dict=new_sd,
+            wbits=wbits,
+            groupsize=groupsize,
+            target_layers=target_layers,
+            device=device
+        )
+
     # 保存转换好的自定义权重
-    build_new_weight_dir(checkpoints_dir, new_sd)
+    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
 
     if print_params:
         # 打印预训练模型的参数名称
@@ -129,20 +169,33 @@ def convert_qwen2_hf_to_litellama(
             print(name, parameters.shape)
 
         # 打印自定义模型的参数名称
-        print("Custom model parameters:")
+        print("\nCustom model parameters:")
         for name, parameters in new_sd.items():
-            print(name, parameters.shape)
-
-    # return new_sd
-
-
-def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
+            if hasattr(parameters, 'shape'):
+                print(name, parameters.shape)
+            else:
+                print(name, parameters)
+
+
+def convert_llama_torch_to_litellama(
+        checkpoints_dir,
+        hf_sd,
+        num_layers,
+        use_gptq: bool = False,
+        wbits: int = 4,
+        groupsize: int = 128,
+        device: str = "cuda"
+):
     """
     将 pytorch bin 格式的模型的权重字典转换为自定义模型的权重字典。
 
     参数:
         checkpoints_dir: pytorch 模型的目录
         hf_sd (dict): pytorch 模型的状态字典。
+        use_gptq: 是否使用 GPTQ 量化
+        wbits: 量化位数
+        groupsize: 量化组大小
+        device: 设备
 
     返回:
         dict: 转换后的状态字典。
@@ -183,17 +236,48 @@ def convert_llama_torch_to_litellama(checkpoints_dir, hf_sd, num_layers):
 
     del hf_sd
 
-    build_new_weight_dir(checkpoints_dir, new_sd)
+    # Apply GPTQ quantization if requested
+    if use_gptq:
+        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+        target_layers = []
+        for name in new_sd.keys():
+            if any(pattern in name for pattern in [
+                "q_proj.weight", "k_proj.weight", "v_proj.weight", "o_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+            ]):
+                target_layers.append(name)
+
+        new_sd = quantize_gptq(
+            model_state_dict=new_sd,
+            wbits=wbits,
+            groupsize=groupsize,
+            target_layers=target_layers,
+            device=device
+        )
+
+    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
     return new_sd
 
 
-def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
+def convert_llama_hf_to_litellama(
+        checkpoints_dir,
+        hf_sd,
+        num_layers,
+        use_gptq: bool = False,
+        wbits: int = 4,
+        groupsize: int = 128,
+        device: str = "cuda"
+):
     """
     将 hf 格式的模型的权重字典转换为自定义模型的权重字典。
 
     参数:
         checkpoints_dir: Hugging Face 模型的目录
         hf_sd (dict): Hugging Face 模型的状态字典。
+        use_gptq: 是否使用 GPTQ 量化
+        wbits: 量化位数
+        groupsize: 量化组大小
+        device: 设备
 
     返回:
         dict: 转换后的状态字典。
@@ -251,14 +335,44 @@ def convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
             del new_sd[k_key]
             del new_sd[v_key]
 
+    # Apply GPTQ quantization if requested
+    if use_gptq:
+        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+        target_layers = []
+        for name in new_sd.keys():
+            if any(pattern in name for pattern in [
+                "q_proj.weight", "kv_proj_weight", "o_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+            ]):
+                target_layers.append(name)
+
+        new_sd = quantize_gptq(
+            model_state_dict=new_sd,
+            wbits=wbits,
+            groupsize=groupsize,
+            target_layers=target_layers,
+            device=device
+        )
+
     for name, parameters in new_sd.items():
-        print(name, parameters.shape)
+        if hasattr(parameters, 'shape'):
+            print(name, parameters.shape)
+        else:
+            print(name, parameters)
 
     # 将处理后的权重保存到指定目录
-    build_new_weight_dir(checkpoints_dir, new_sd)
-
-
-def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
+    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
+
+
+def convert_llavallama_hf_to_litellama(
+        checkpoints_dir,
+        hf_sd,
+        num_layers,
+        use_gptq: bool = False,
+        wbits: int = 4,
+        groupsize: int = 128,
+        device: str = "cuda"
+):
     """
     将 Hugging Face 模型的权重字典转换为自定义模型的权重字典。
 
@@ -266,6 +380,10 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
         checkpoints_dir: Hugging Face 模型的目录
         hf_sd (dict): Hugging Face 模型的状态字典。
         model_config (LlamaConfig): 自定义模型的配置参数。
+        use_gptq: 是否使用 GPTQ 量化
+        wbits: 量化位数
+        groupsize: 量化组大小
+        device: 设备
 
     返回:
         dict: 转换后的状态字典。
@@ -326,7 +444,30 @@ def convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers):
             del new_sd[k_key]
             del new_sd[v_key]
 
+    # Apply GPTQ quantization if requested
+    if use_gptq:
+        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+        target_layers = []
+        for name in new_sd.keys():
+            if any(pattern in name for pattern in [
+                "q_proj.weight", "kv_proj_weight", "o_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+            ]) and "language_model" in name:
+                target_layers.append(name)
+
+        new_sd = quantize_gptq(
+            model_state_dict=new_sd,
+            wbits=wbits,
+            groupsize=groupsize,
+            target_layers=target_layers,
+            device=device
+        )
+
     for name, parameters in new_sd.items():
-        print(name, parameters.shape)
+        if hasattr(parameters, 'shape'):
+            print(name, parameters.shape)
+        else:
+            print(name, parameters)
+
+    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
 
-    build_new_weight_dir(checkpoints_dir, new_sd)
diff --git a/lite_llama/executor/weight_convert_gptq.py b/lite_llama/executor/weight_convert_gptq.py
deleted file mode 100644
index 3dacf96..0000000
--- a/lite_llama/executor/weight_convert_gptq.py
+++ /dev/null
@@ -1,553 +0,0 @@
-from tqdm.auto import tqdm
-import torch
-import os
-import shutil
-import glob
-import os.path as osp
-from typing import Dict, Optional
-import gc
-from datasets import load_dataset
-from transformers import AutoTokenizer
-
-try:
-    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-    from auto_gptq.modeling import BaseGPTQForCausalLM
-except ImportError:
-    raise ImportError(
-        "Please install auto-gptq: pip install auto-gptq"
-    )
-
-
-def get_calibration_data(model_id: str, dataset_name: str, tokenizer, nsamples: int = 128, seqlen: int = 2048):
-    """
-    Prepare calibration dataset for GPTQ quantization.
-    """
-    if dataset_name == "c4":
-        dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
-        text_column = "text"
-    elif dataset_name == "wikitext":
-        dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")
-        text_column = "text"
-    else:
-        raise ValueError(f"Unsupported dataset: {dataset_name}")
-
-    calibration_data = []
-
-    for data in tqdm(dataset, desc="Loading calibration data"):
-        text = data[text_column]
-        if len(text.strip()) > 10:  # Skip very short texts
-            inputs = tokenizer(
-                text,
-                truncation=True,
-                max_length=seqlen,
-                return_tensors="pt"
-            )
-            if inputs["input_ids"].shape[1] >= seqlen // 2:  # Ensure reasonable length
-                calibration_data.append({
-                    "input_ids": inputs["input_ids"][0],
-                    "attention_mask": inputs["attention_mask"][0]
-                })
-
-                if len(calibration_data) >= nsamples:
-                    break
-
-    return calibration_data
-
-
-def build_new_weight_dir_gptq(checkpoints_dir: str, new_sd: Dict[str, torch.Tensor], bits: int):
-    """
-    Save GPTQ quantized model weights and build new weight directory.
-    """
-    model_id = osp.basename(osp.normpath(checkpoints_dir))
-    current_dir = osp.dirname(osp.abspath(__file__))
-    my_weight_dir = osp.join(
-        current_dir, f"../../my_weight/{model_id}-{bits}bit-GPTQ"
-    )
-    os.makedirs(my_weight_dir, exist_ok=True)
-
-    # Save quantized model state dict
-    torch.save(
-        new_sd,
-        osp.join(my_weight_dir, f"{model_id}-{bits}bit-GPTQ.pth"),
-        _use_new_zipfile_serialization=True,
-    )
-
-    # Copy JSON files
-    json_files = glob.glob(osp.join(checkpoints_dir, "*.json"))
-    for file_path in json_files:
-        shutil.copy(file_path, my_weight_dir)
-        print(f"已复制: {file_path} -> {my_weight_dir}")
-
-    # Copy tokenizer files
-    if osp.exists(osp.join(checkpoints_dir, "tokenizer.model")):
-        shutil.copy(osp.join(checkpoints_dir, "tokenizer.model"), my_weight_dir)
-
-    # Save quantization config
-    quant_config = {
-        "bits": bits,
-        "quantization_method": "gptq",
-        "model_id": model_id
-    }
-
-    import json
-    with open(osp.join(my_weight_dir, "quantization_config.json"), "w") as f:
-        json.dump(quant_config, f, indent=2)
-
-
-def quantize_and_convert_weights(
-        model,
-        checkpoints_dir: str,
-        bits: int = 4,
-        group_size: int = 128,
-        act_order: bool = False,
-        calibration_dataset: str = "c4",
-        nsamples: int = 128,
-) -> Dict[str, torch.Tensor]:
-    """
-    Quantize model with GPTQ and return quantized state dict.
-    """
-    tokenizer = AutoTokenizer.from_pretrained(checkpoints_dir)
-
-    # Prepare quantization config
-    quantize_config = BaseQuantizeConfig(
-        bits=bits,
-        group_size=group_size,
-        damp_percent=0.01,
-        desc_act=act_order,
-        static_groups=False,
-        sym=True,
-        true_sequential=True,
-        model_name_or_path=checkpoints_dir,
-        model_file_base_name="model"
-    )
-
-    # Get calibration data
-    calibration_data = get_calibration_data(
-        checkpoints_dir,
-        calibration_dataset,
-        tokenizer,
-        nsamples=nsamples
-    )
-
-    # Clear GPU cache before quantization
-    torch.cuda.empty_cache()
-    gc.collect()
-
-    # Quantize the model
-    print(f"Starting GPTQ quantization with {bits} bits...")
-    model.quantize(calibration_data, quantize_config)
-
-    # Get quantized state dict
-    quantized_sd = model.state_dict()
-
-    # Clear memory
-    del model
-    torch.cuda.empty_cache()
-    gc.collect()
-
-    return quantized_sd
-
-
-def convert_qwen2_hf_to_litellama_gptq(
-        checkpoints_dir: str,
-        model,
-        num_layers: int,
-        bits: int = 4,
-        group_size: int = 128,
-        act_order: bool = False,
-        calibration_dataset: str = "c4",
-        nsamples: int = 128,
-) -> Dict[str, torch.Tensor]:
-    """
-    Convert Qwen2 HF model to LiteLLaMA format with GPTQ quantization.
-    """
-    # First quantize the model
-    quantized_sd = quantize_and_convert_weights(
-        model,
-        checkpoints_dir,
-        bits=bits,
-        group_size=group_size,
-        act_order=act_order,
-        calibration_dataset=calibration_dataset,
-        nsamples=nsamples,
-    )
-
-    # Mapping for base layers
-    mapping = {
-        "model.norm.weight": "norm_weight",
-        "model.embed_tokens.weight": "embed_tokens.weight",
-        "lm_head.weight": "lm_head_weight",
-    }
-
-    # Mapping for transformer layers
-    layers = {
-        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj_weight",
-        "model.layers.{i}.self_attn.q_proj.bias": "layers.{i}.self_attn.q_proj_bias",
-        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj_weight",
-        "model.layers.{i}.self_attn.k_proj.bias": "layers.{i}.self_attn.k_proj_bias",
-        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj_weight",
-        "model.layers.{i}.self_attn.v_proj.bias": "layers.{i}.self_attn.v_proj_bias",
-        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj_weight",
-        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
-        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
-        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
-        "model.layers.{i}.input_layernorm.weight": "layers.{i}.input_layernorm_weight",
-        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.post_attention_layernorm_weight",
-    }
-
-    # Add GPTQ-specific mappings
-    gptq_layers = {
-        "model.layers.{i}.self_attn.q_proj.qweight": "layers.{i}.self_attn.q_proj_qweight",
-        "model.layers.{i}.self_attn.q_proj.qzeros": "layers.{i}.self_attn.q_proj_qzeros",
-        "model.layers.{i}.self_attn.q_proj.scales": "layers.{i}.self_attn.q_proj_scales",
-        "model.layers.{i}.self_attn.k_proj.qweight": "layers.{i}.self_attn.k_proj_qweight",
-        "model.layers.{i}.self_attn.k_proj.qzeros": "layers.{i}.self_attn.k_proj_qzeros",
-        "model.layers.{i}.self_attn.k_proj.scales": "layers.{i}.self_attn.k_proj_scales",
-        "model.layers.{i}.self_attn.v_proj.qweight": "layers.{i}.self_attn.v_proj_qweight",
-        "model.layers.{i}.self_attn.v_proj.qzeros": "layers.{i}.self_attn.v_proj_qzeros",
-        "model.layers.{i}.self_attn.v_proj.scales": "layers.{i}.self_attn.v_proj_scales",
-        "model.layers.{i}.self_attn.o_proj.qweight": "layers.{i}.self_attn.o_proj_qweight",
-        "model.layers.{i}.self_attn.o_proj.qzeros": "layers.{i}.self_attn.o_proj_qzeros",
-        "model.layers.{i}.self_attn.o_proj.scales": "layers.{i}.self_attn.o_proj_scales",
-        "model.layers.{i}.mlp.gate_proj.qweight": "layers.{i}.mlp.gate_proj_qweight",
-        "model.layers.{i}.mlp.gate_proj.qzeros": "layers.{i}.mlp.gate_proj_qzeros",
-        "model.layers.{i}.mlp.gate_proj.scales": "layers.{i}.mlp.gate_proj_scales",
-        "model.layers.{i}.mlp.up_proj.qweight": "layers.{i}.mlp.up_proj_qweight",
-        "model.layers.{i}.mlp.up_proj.qzeros": "layers.{i}.mlp.up_proj_qzeros",
-        "model.layers.{i}.mlp.up_proj.scales": "layers.{i}.mlp.up_proj_scales",
-        "model.layers.{i}.mlp.down_proj.qweight": "layers.{i}.mlp.down_proj_qweight",
-        "model.layers.{i}.mlp.down_proj.qzeros": "layers.{i}.mlp.down_proj_qzeros",
-        "model.layers.{i}.mlp.down_proj.scales": "layers.{i}.mlp.down_proj_scales",
-    }
-
-    # Generate mappings for all layers
-    for i in range(num_layers):
-        for hf_key, custom_key in layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-        for hf_key, custom_key in gptq_layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-
-    # Create new state dict with converted keys
-    new_sd = {}
-    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
-        custom_key = mapping.get(hf_key, None)
-        if custom_key is not None:
-            new_sd[custom_key] = tensor
-        else:
-            print(f"Warning: Unmapped key {hf_key}")
-
-    # Merge k_proj and v_proj for GPTQ
-    for i in range(num_layers):
-        # For regular weights (if they exist)
-        k_key = f"layers.{i}.self_attn.k_proj_weight"
-        v_key = f"layers.{i}.self_attn.v_proj_weight"
-        k_bias_key = f"layers.{i}.self_attn.k_proj_bias"
-        v_bias_key = f"layers.{i}.self_attn.v_proj_bias"
-
-        if k_key in new_sd and v_key in new_sd:
-            # Merge weights
-            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
-            new_sd[f"layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
-            del new_sd[k_key]
-            del new_sd[v_key]
-
-            # Merge biases if they exist
-            if k_bias_key in new_sd and v_bias_key in new_sd:
-                kv_bias_tensor = torch.cat([new_sd[k_bias_key], new_sd[v_bias_key]], dim=0)
-                new_sd[f"layers.{i}.self_attn.kv_proj_bias"] = kv_bias_tensor
-                del new_sd[k_bias_key]
-                del new_sd[v_bias_key]
-
-        # For GPTQ quantized weights
-        k_qweight = f"layers.{i}.self_attn.k_proj_qweight"
-        v_qweight = f"layers.{i}.self_attn.v_proj_qweight"
-        k_qzeros = f"layers.{i}.self_attn.k_proj_qzeros"
-        v_qzeros = f"layers.{i}.self_attn.v_proj_qzeros"
-        k_scales = f"layers.{i}.self_attn.k_proj_scales"
-        v_scales = f"layers.{i}.self_attn.v_proj_scales"
-
-        if k_qweight in new_sd and v_qweight in new_sd:
-            # Merge quantized weights
-            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
-            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
-            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
-
-            new_sd[f"layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
-            new_sd[f"layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
-            new_sd[f"layers.{i}.self_attn.kv_proj_scales"] = kv_scales
-
-            # Remove original k and v projections
-            del new_sd[k_qweight]
-            del new_sd[v_qweight]
-            del new_sd[k_qzeros]
-            del new_sd[v_qzeros]
-            del new_sd[k_scales]
-            del new_sd[v_scales]
-
-    # Save the quantized weights
-    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
-
-    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
-    return new_sd
-
-
-def convert_llama_hf_to_litellama_gptq(
-        checkpoints_dir: str,
-        model,
-        num_layers: int,
-        bits: int = 4,
-        group_size: int = 128,
-        act_order: bool = False,
-        calibration_dataset: str = "c4",
-        nsamples: int = 128,
-) -> Dict[str, torch.Tensor]:
-    """
-    Convert Llama HF model to LiteLLaMA format with GPTQ quantization.
-    """
-    # First quantize the model
-    quantized_sd = quantize_and_convert_weights(
-        model,
-        checkpoints_dir,
-        bits=bits,
-        group_size=group_size,
-        act_order=act_order,
-        calibration_dataset=calibration_dataset,
-        nsamples=nsamples,
-    )
-
-    # Mapping for base layers
-    mapping = {
-        "model.embed_tokens.weight": "embed_tokens.weight",
-        "model.norm.weight": "norm_weight",
-        "lm_head.weight": "lm_head.weight",
-    }
-
-    # Mapping for transformer layers
-    layers = {
-        "model.layers.{i}.self_attn.q_proj.weight": "layers.{i}.self_attn.q_proj.weight",
-        "model.layers.{i}.self_attn.k_proj.weight": "layers.{i}.self_attn.k_proj.weight",
-        "model.layers.{i}.self_attn.v_proj.weight": "layers.{i}.self_attn.v_proj.weight",
-        "model.layers.{i}.self_attn.o_proj.weight": "layers.{i}.self_attn.o_proj.weight",
-        "model.layers.{i}.mlp.gate_proj.weight": "layers.{i}.mlp.gate_proj.weight",
-        "model.layers.{i}.mlp.up_proj.weight": "layers.{i}.mlp.up_proj.weight",
-        "model.layers.{i}.mlp.down_proj.weight": "layers.{i}.mlp.down_proj.weight",
-        "model.layers.{i}.input_layernorm.weight": "layers.{i}.attention_norm_weight",
-        "model.layers.{i}.post_attention_layernorm.weight": "layers.{i}.ffn_norm_weight",
-    }
-
-    # Add GPTQ-specific mappings
-    gptq_layers = {
-        "model.layers.{i}.self_attn.q_proj.qweight": "layers.{i}.self_attn.q_proj_qweight",
-        "model.layers.{i}.self_attn.q_proj.qzeros": "layers.{i}.self_attn.q_proj_qzeros",
-        "model.layers.{i}.self_attn.q_proj.scales": "layers.{i}.self_attn.q_proj_scales",
-        "model.layers.{i}.self_attn.k_proj.qweight": "layers.{i}.self_attn.k_proj_qweight",
-        "model.layers.{i}.self_attn.k_proj.qzeros": "layers.{i}.self_attn.k_proj_qzeros",
-        "model.layers.{i}.self_attn.k_proj.scales": "layers.{i}.self_attn.k_proj_scales",
-        "model.layers.{i}.self_attn.v_proj.qweight": "layers.{i}.self_attn.v_proj_qweight",
-        "model.layers.{i}.self_attn.v_proj.qzeros": "layers.{i}.self_attn.v_proj_qzeros",
-        "model.layers.{i}.self_attn.v_proj.scales": "layers.{i}.self_attn.v_proj_scales",
-        "model.layers.{i}.self_attn.o_proj.qweight": "layers.{i}.self_attn.o_proj_qweight",
-        "model.layers.{i}.self_attn.o_proj.qzeros": "layers.{i}.self_attn.o_proj_qzeros",
-        "model.layers.{i}.self_attn.o_proj.scales": "layers.{i}.self_attn.o_proj_scales",
-        "model.layers.{i}.mlp.gate_proj.qweight": "layers.{i}.mlp.gate_proj_qweight",
-        "model.layers.{i}.mlp.gate_proj.qzeros": "layers.{i}.mlp.gate_proj_qzeros",
-        "model.layers.{i}.mlp.gate_proj.scales": "layers.{i}.mlp.gate_proj_scales",
-        "model.layers.{i}.mlp.up_proj.qweight": "layers.{i}.mlp.up_proj_qweight",
-        "model.layers.{i}.mlp.up_proj.qzeros": "layers.{i}.mlp.up_proj_qzeros",
-        "model.layers.{i}.mlp.up_proj.scales": "layers.{i}.mlp.up_proj_scales",
-        "model.layers.{i}.mlp.down_proj.qweight": "layers.{i}.mlp.down_proj_qweight",
-        "model.layers.{i}.mlp.down_proj.qzeros": "layers.{i}.mlp.down_proj_qzeros",
-        "model.layers.{i}.mlp.down_proj.scales": "layers.{i}.mlp.down_proj_scales",
-    }
-
-    # Generate mappings for all layers
-    for i in range(num_layers):
-        for hf_key, custom_key in layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-        for hf_key, custom_key in gptq_layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-
-    # Create new state dict with converted keys
-    new_sd = {}
-    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
-        custom_key = mapping.get(hf_key, None)
-        if custom_key is not None:
-            new_sd[custom_key] = tensor
-        else:
-            print(f"Warning: Unmapped key {hf_key}")
-
-    # Merge k_proj and v_proj
-    for i in range(num_layers):
-        # Handle regular weights if they exist
-        k_key = f"layers.{i}.self_attn.k_proj.weight"
-        v_key = f"layers.{i}.self_attn.v_proj.weight"
-        if k_key in new_sd and v_key in new_sd:
-            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
-            new_sd[f"layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
-            del new_sd[k_key]
-            del new_sd[v_key]
-
-        # Handle GPTQ quantized weights
-        k_qweight = f"layers.{i}.self_attn.k_proj_qweight"
-        v_qweight = f"layers.{i}.self_attn.v_proj_qweight"
-        k_qzeros = f"layers.{i}.self_attn.k_proj_qzeros"
-        v_qzeros = f"layers.{i}.self_attn.v_proj_qzeros"
-        k_scales = f"layers.{i}.self_attn.k_proj_scales"
-        v_scales = f"layers.{i}.self_attn.v_proj_scales"
-
-        if k_qweight in new_sd and v_qweight in new_sd:
-            # Merge quantized weights
-            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
-            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
-            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
-
-            new_sd[f"layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
-            new_sd[f"layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
-            new_sd[f"layers.{i}.self_attn.kv_proj_scales"] = kv_scales
-
-            # Remove original k and v projections
-            del new_sd[k_qweight]
-            del new_sd[v_qweight]
-            del new_sd[k_qzeros]
-            del new_sd[v_qzeros]
-            del new_sd[k_scales]
-            del new_sd[v_scales]
-
-    # Save the quantized weights
-    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
-
-    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
-    return new_sd
-
-
-def convert_llavallama_hf_to_litellama_gptq(
-        checkpoints_dir: str,
-        model,
-        num_layers: int,
-        bits: int = 4,
-        group_size: int = 128,
-        act_order: bool = False,
-        calibration_dataset: str = "c4",
-        nsamples: int = 128,
-) -> Dict[str, torch.Tensor]:
-    """
-    Convert LLaVA-Llama HF model to LiteLLaMA format with GPTQ quantization.
-    """
-    # First quantize the model
-    quantized_sd = quantize_and_convert_weights(
-        model,
-        checkpoints_dir,
-        bits=bits,
-        group_size=group_size,
-        act_order=act_order,
-        calibration_dataset=calibration_dataset,
-        nsamples=nsamples,
-    )
-
-    # Mapping for base layers
-    mapping = {
-        "language_model.model.embed_tokens.weight": "language_model.embed_tokens.weight",
-        "language_model.model.norm.weight": "language_model.norm_weight",
-        "language_model.lm_head.weight": "language_model.lm_head.weight",
-    }
-
-    # Mapping for transformer layers
-    layers = {
-        "language_model.model.layers.{i}.self_attn.q_proj.weight": "language_model.layers.{i}.self_attn.q_proj.weight",
-        "language_model.model.layers.{i}.self_attn.k_proj.weight": "language_model.layers.{i}.self_attn.k_proj.weight",
-        "language_model.model.layers.{i}.self_attn.v_proj.weight": "language_model.layers.{i}.self_attn.v_proj.weight",
-        "language_model.model.layers.{i}.self_attn.o_proj.weight": "language_model.layers.{i}.self_attn.o_proj.weight",
-        "language_model.model.layers.{i}.mlp.gate_proj.weight": "language_model.layers.{i}.mlp.gate_proj.weight",
-        "language_model.model.layers.{i}.mlp.up_proj.weight": "language_model.layers.{i}.mlp.up_proj.weight",
-        "language_model.model.layers.{i}.mlp.down_proj.weight": "language_model.layers.{i}.mlp.down_proj.weight",
-        "language_model.model.layers.{i}.input_layernorm.weight": "language_model.layers.{i}.attention_norm_weight",
-        "language_model.model.layers.{i}.post_attention_layernorm.weight": "language_model.layers.{i}.ffn_norm_weight",
-    }
-
-    # Add GPTQ-specific mappings
-    gptq_layers = {
-        "language_model.model.layers.{i}.self_attn.q_proj.qweight": "language_model.layers.{i}.self_attn.q_proj_qweight",
-        "language_model.model.layers.{i}.self_attn.q_proj.qzeros": "language_model.layers.{i}.self_attn.q_proj_qzeros",
-        "language_model.model.layers.{i}.self_attn.q_proj.scales": "language_model.layers.{i}.self_attn.q_proj_scales",
-        "language_model.model.layers.{i}.self_attn.k_proj.qweight": "language_model.layers.{i}.self_attn.k_proj_qweight",
-        "language_model.model.layers.{i}.self_attn.k_proj.qzeros": "language_model.layers.{i}.self_attn.k_proj_qzeros",
-        "language_model.model.layers.{i}.self_attn.k_proj.scales": "language_model.layers.{i}.self_attn.k_proj_scales",
-        "language_model.model.layers.{i}.self_attn.v_proj.qweight": "language_model.layers.{i}.self_attn.v_proj_qweight",
-        "language_model.model.layers.{i}.self_attn.v_proj.qzeros": "language_model.layers.{i}.self_attn.v_proj_qzeros",
-        "language_model.model.layers.{i}.self_attn.v_proj.scales": "language_model.layers.{i}.self_attn.v_proj_scales",
-        "language_model.model.layers.{i}.self_attn.o_proj.qweight": "language_model.layers.{i}.self_attn.o_proj_qweight",
-        "language_model.model.layers.{i}.self_attn.o_proj.qzeros": "language_model.layers.{i}.self_attn.o_proj_qzeros",
-        "language_model.model.layers.{i}.self_attn.o_proj.scales": "language_model.layers.{i}.self_attn.o_proj_scales",
-        "language_model.model.layers.{i}.mlp.gate_proj.qweight": "language_model.layers.{i}.mlp.gate_proj_qweight",
-        "language_model.model.layers.{i}.mlp.gate_proj.qzeros": "language_model.layers.{i}.mlp.gate_proj_qzeros",
-        "language_model.model.layers.{i}.mlp.gate_proj.scales": "language_model.layers.{i}.mlp.gate_proj_scales",
-        "language_model.model.layers.{i}.mlp.up_proj.qweight": "language_model.layers.{i}.mlp.up_proj_qweight",
-        "language_model.model.layers.{i}.mlp.up_proj.qzeros": "language_model.layers.{i}.mlp.up_proj_qzeros",
-        "language_model.model.layers.{i}.mlp.up_proj.scales": "language_model.layers.{i}.mlp.up_proj_scales",
-        "language_model.model.layers.{i}.mlp.down_proj.qweight": "language_model.layers.{i}.mlp.down_proj_qweight",
-        "language_model.model.layers.{i}.mlp.down_proj.qzeros": "language_model.layers.{i}.mlp.down_proj_qzeros",
-        "language_model.model.layers.{i}.mlp.down_proj.scales": "language_model.layers.{i}.mlp.down_proj_scales",
-    }
-
-    # Generate mappings for all layers
-    for i in range(num_layers):
-        for hf_key, custom_key in layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-        for hf_key, custom_key in gptq_layers.items():
-            mapping[hf_key.format(i=i)] = custom_key.format(i=i)
-
-    # Create new state dict with converted keys
-    new_sd = {}
-    for hf_key, tensor in tqdm(quantized_sd.items(), desc="Mapping GPTQ weights"):
-        custom_key = mapping.get(hf_key, None)
-        if custom_key is not None:
-            new_sd[custom_key] = tensor
-        else:
-            # Keep vision model and other components as-is
-            new_sd[hf_key] = tensor
-            print(f"Warning: Unmapped key {hf_key}")
-
-    # Merge k_proj and v_proj for language model
-    for i in tqdm(range(num_layers), desc="Mapping kv fused weights"):
-        # Handle regular weights if they exist
-        k_key = f"language_model.layers.{i}.self_attn.k_proj.weight"
-        v_key = f"language_model.layers.{i}.self_attn.v_proj.weight"
-        if k_key in new_sd and v_key in new_sd:
-            kv_tensor = torch.cat([new_sd[k_key], new_sd[v_key]], dim=0)
-            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_weight"] = kv_tensor
-            del new_sd[k_key]
-            del new_sd[v_key]
-
-        # Handle GPTQ quantized weights
-        k_qweight = f"language_model.layers.{i}.self_attn.k_proj_qweight"
-        v_qweight = f"language_model.layers.{i}.self_attn.v_proj_qweight"
-        k_qzeros = f"language_model.layers.{i}.self_attn.k_proj_qzeros"
-        v_qzeros = f"language_model.layers.{i}.self_attn.v_proj_qzeros"
-        k_scales = f"language_model.layers.{i}.self_attn.k_proj_scales"
-        v_scales = f"language_model.layers.{i}.self_attn.v_proj_scales"
-
-        if k_qweight in new_sd and v_qweight in new_sd:
-            # Merge quantized weights
-            kv_qweight = torch.cat([new_sd[k_qweight], new_sd[v_qweight]], dim=0)
-            kv_qzeros = torch.cat([new_sd[k_qzeros], new_sd[v_qzeros]], dim=0)
-            kv_scales = torch.cat([new_sd[k_scales], new_sd[v_scales]], dim=0)
-
-            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_qweight"] = kv_qweight
-            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_qzeros"] = kv_qzeros
-            new_sd[f"language_model.layers.{i}.self_attn.kv_proj_scales"] = kv_scales
-
-            print(f"Merged GPTQ k/v projections for layer {i}")
-
-            # Remove original k and v projections
-            del new_sd[k_qweight]
-            del new_sd[v_qweight]
-            del new_sd[k_qzeros]
-            del new_sd[v_qzeros]
-            del new_sd[k_scales]
-            del new_sd[v_scales]
-
-    # Save the quantized weights
-    build_new_weight_dir_gptq(checkpoints_dir, new_sd, bits)
-
-    print(f"GPTQ quantization complete. Model saved with {bits}-bit precision.")
-    return new_sd
\ No newline at end of file
diff --git a/lite_llama/quantization/__init__.py b/lite_llama/quantization/__init__.py
old mode 100644
new mode 100755
diff --git a/lite_llama/quantization/gptq/__init__.py b/lite_llama/quantization/gptq/__init__.py
old mode 100644
new mode 100755
diff --git a/lite_llama/quantization/gptq/gptq.py b/lite_llama/quantization/gptq/gptq.py
new file mode 100755
index 0000000..b614fde
--- /dev/null
+++ b/lite_llama/quantization/gptq/gptq.py
@@ -0,0 +1,329 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from typing import Dict, Tuple, Optional
+from tqdm.auto import tqdm
+import math
+
+
+def pack_int4_weights(qweight: torch.Tensor, wbits: int = 4) -> torch.Tensor:
+    """
+    Pack quantized weights into int32 for efficient storage.
+    For 4-bit quantization, pack 8 weights into one int32.
+
+    Args:
+        qweight: Quantized weight tensor of shape (rows, cols) with values in [0, 15]
+        wbits: Number of bits per weight (4 for int4)
+
+    Returns:
+        Packed weight tensor of shape (rows, cols // 8)
+    """
+    assert wbits == 4, "This function currently only supports 4-bit packing"
+
+    rows, cols = qweight.shape
+    pack_factor = 32 // wbits  # 8 for 4-bit
+
+    # Ensure we can pack evenly
+    if cols % pack_factor != 0:
+        # Pad columns to make it divisible by pack_factor
+        pad_cols = pack_factor - (cols % pack_factor)
+        qweight = torch.nn.functional.pad(qweight, (0, pad_cols), value=0)
+        cols = qweight.shape[1]
+
+    packed_cols = cols // pack_factor
+    packed = torch.zeros((rows, packed_cols), dtype=torch.int32, device=qweight.device)
+
+    # Pack weights
+    for i in range(pack_factor):
+        packed |= (qweight[:, i::pack_factor].to(torch.int32) & 0xF) << (i * 4)
+
+    return packed
+
+
+def unpack_int4_weights(packed: torch.Tensor, original_cols: int, wbits: int = 4) -> torch.Tensor:
+    """
+    Unpack int4 weights from int32 storage.
+
+    Args:
+        packed: Packed weight tensor
+        original_cols: Original number of columns before packing
+        wbits: Number of bits per weight
+
+    Returns:
+        Unpacked weight tensor
+    """
+    assert wbits == 4, "This function currently only supports 4-bit unpacking"
+
+    rows, packed_cols = packed.shape
+    pack_factor = 32 // wbits  # 8 for 4-bit
+
+    # Calculate unpacked dimensions
+    unpacked_cols = packed_cols * pack_factor
+    unpacked = torch.zeros((rows, unpacked_cols), dtype=torch.int32, device=packed.device)
+
+    # Unpack weights
+    for i in range(pack_factor):
+        unpacked[:, i::pack_factor] = (packed >> (i * 4)) & 0xF
+
+    # Remove padding if necessary
+    return unpacked[:, :original_cols]
+
+
+class GPTQ:
+    """
+    Implementation of GPTQ (Generalized Post-Training Quantization) algorithm
+    for quantizing model weights to lower bit precision.
+    """
+
+    def __init__(
+            self,
+            layer: nn.Module,
+            wbits: int = 4,
+            groupsize: int = 128,
+            actorder: bool = False,
+            percdamp: float = 0.01,
+            blocksize: int = 128,
+            device: str = "cuda"
+    ):
+        self.layer = layer
+        self.wbits = wbits
+        self.groupsize = groupsize if groupsize != -1 else float('inf')
+        self.actorder = actorder
+        self.percdamp = percdamp
+        self.blocksize = blocksize
+        self.device = device
+        self.maxq = 2 ** wbits - 1
+
+        # Initialize quantization parameters
+        self.H = None
+        self.dead = None
+        self.rows = None
+        self.columns = None
+
+    def add_batch(self, inp: torch.Tensor, out: torch.Tensor):
+        """Add a batch of data to compute Hessian matrix"""
+        if len(inp.shape) == 2:
+            inp = inp.unsqueeze(0)
+
+        tmp = inp.shape[0]
+        if len(inp.shape) == 3:
+            inp = inp.reshape((-1, inp.shape[-1]))
+        inp = inp.t()
+
+        if self.H is None:
+            self.H = torch.zeros((inp.shape[0], inp.shape[0]), device=self.device)
+
+        self.H += 2 / tmp * inp.matmul(inp.t())
+
+    def quantize(self, weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+        """
+        Quantize the weight matrix using GPTQ algorithm
+
+        Returns:
+            - qweight: quantized weights (packed if 4-bit)
+            - qzeros: zero points for each group
+            - scales: scales for each group
+            - original_cols: original number of columns (for unpacking)
+        """
+        W = weight.clone()
+        if not self.actorder:
+            # Standard quantization order
+            W = W.float()
+
+        rows, columns = W.shape[0], W.shape[1]
+        original_cols = columns
+
+        # Initialize Hessian
+        if self.H is None:
+            self.H = torch.eye(columns, device=self.device)
+
+        H = self.H
+        dead = torch.diag(H) == 0
+        H[dead, dead] = 1
+
+        # Add dampening
+        damp = self.percdamp * torch.mean(torch.diag(H))
+        diag = torch.arange(columns, device=self.device)
+        H[diag, diag] += damp
+
+        # Prepare quantization
+        scales = torch.zeros((rows, (columns + self.groupsize - 1) // self.groupsize), device=self.device)
+        qzeros = torch.zeros_like(scales, dtype=torch.int32)
+        qweight = torch.zeros_like(W, dtype=torch.int32)
+
+        # Cholesky decomposition
+        try:
+            H = torch.linalg.cholesky(H)
+        except:
+            print("Cholesky decomposition failed, using eigenvalue decomposition")
+            eigenvalues, eigenvectors = torch.linalg.eigh(H)
+            eigenvalues = eigenvalues.clamp(min=1e-10)
+            H = eigenvectors @ torch.diag(torch.sqrt(eigenvalues)) @ eigenvectors.T
+
+        H = torch.cholesky_inverse(H)
+        H = torch.linalg.cholesky(H, upper=True)
+        Hinv = H
+
+        # Quantize blocks
+        for i1 in range(0, columns, self.blocksize):
+            i2 = min(i1 + self.blocksize, columns)
+            count = i2 - i1
+
+            W1 = W[:, i1:i2].clone()
+            Q1 = torch.zeros_like(W1)
+            Err1 = torch.zeros_like(W1)
+            Losses1 = torch.zeros_like(W1)
+            Hinv1 = Hinv[i1:i2, i1:i2]
+
+            for i in range(count):
+                w = W1[:, i]
+                d = Hinv1[i, i]
+
+                # Find optimal quantization
+                if self.groupsize != float('inf'):
+                    g_idx = (i1 + i) // self.groupsize
+                    scale = scales[:, g_idx]
+                    zero = qzeros[:, g_idx]
+
+                    if scale.sum() == 0:  # Initialize scale and zero
+                        scale = W1[:, i].abs().max() / (self.maxq / 2)
+                        scales[:, g_idx] = scale
+                        zero = torch.round(-W1[:, i].min() / scale).clamp(0, self.maxq)
+                        qzeros[:, g_idx] = zero.to(torch.int32)
+                else:
+                    scale = W1[:, i].abs().max() / (self.maxq / 2)
+                    zero = torch.round(-W1[:, i].min() / scale).clamp(0, self.maxq)
+
+                # Quantize
+                q = torch.clamp(torch.round(w / scale) + zero, 0, self.maxq)
+                Q1[:, i] = q
+
+                # Dequantize and compute error
+                dq = (q - zero) * scale
+                err = (w - dq) / d
+                Err1[:, i] = err
+
+                # Update remaining weights
+                W1[:, i:] -= err.unsqueeze(1) * Hinv1[i, i:].unsqueeze(0)
+
+            qweight[:, i1:i2] = Q1.to(torch.int32)
+
+        # Pack weights if 4-bit
+        if self.wbits == 4:
+            qweight = pack_int4_weights(qweight, self.wbits)
+
+        return qweight, qzeros, scales, original_cols
+
+
+def quantize_gptq(
+        model_state_dict: Dict[str, torch.Tensor],
+        calibration_data: Optional[torch.Tensor] = None,
+        wbits: int = 4,
+        groupsize: int = 128,
+        target_layers: Optional[list] = None,
+        device: str = "cuda"
+) -> Dict[str, torch.Tensor]:
+    """
+    Quantize model weights using GPTQ algorithm
+
+    Args:
+        model_state_dict: Original model state dictionary
+        calibration_data: Optional calibration data for computing Hessian
+        wbits: Number of bits for quantization (default: 4)
+        groupsize: Group size for quantization (default: 128)
+        target_layers: List of layer names to quantize (if None, quantize all linear layers)
+        device: Device to perform quantization on
+
+    Returns:
+        Dictionary containing quantized weights and quantization parameters
+    """
+    quantized_state_dict = {}
+
+    # Default target layers if not specified
+    if target_layers is None:
+        target_layers = []
+        for name in model_state_dict.keys():
+            if any(pattern in name for pattern in [
+                "q_proj", "k_proj", "v_proj", "o_proj",
+                "gate_proj", "up_proj", "down_proj",
+                "kv_proj"
+            ]):
+                target_layers.append(name)
+
+    print(f"Quantizing {len(target_layers)} layers to {wbits} bits...")
+
+    for name, param in tqdm(model_state_dict.items(), desc="Processing layers"):
+        if name in target_layers and param.dim() == 2:
+            # Create GPTQ quantizer for this layer
+            gptq = GPTQ(
+                layer=None,  # We're working directly with tensors
+                wbits=wbits,
+                groupsize=groupsize,
+                device=device
+            )
+
+            # Move weight to device
+            weight = param.to(device).float()
+
+            # If no calibration data, use identity Hessian
+            if calibration_data is None:
+                gptq.H = torch.eye(weight.shape[1], device=device)
+
+            # Quantize the weight
+            qweight, qzeros, scales, original_cols = gptq.quantize(weight)
+
+            # Store quantized parameters
+            base_name = name.replace(".weight", "").replace("_weight", "")
+            quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
+            quantized_state_dict[f"{base_name}.qzeros"] = qzeros.cpu()
+            quantized_state_dict[f"{base_name}.scales"] = scales.cpu()
+            quantized_state_dict[f"{base_name}.wbits"] = torch.tensor(wbits)
+            quantized_state_dict[f"{base_name}.groupsize"] = torch.tensor(groupsize)
+            quantized_state_dict[f"{base_name}.original_cols"] = torch.tensor(original_cols)
+
+        else:
+            # Keep non-quantized parameters as is
+            quantized_state_dict[name] = param.cpu()
+
+    return quantized_state_dict
+
+
+def dequantize_weight(qweight, qzeros, scales, wbits=4, original_cols=None):
+    """
+    Dequantize weight for inference
+
+    Args:
+        qweight: Quantized weights (packed if 4-bit)
+        qzeros: Zero points
+        scales: Scales
+        wbits: Number of bits used for quantization
+        original_cols: Original number of columns (for unpacking)
+
+    Returns:
+        Dequantized weight tensor
+    """
+    # Unpack if 4-bit
+    if wbits == 4 and original_cols is not None:
+        qweight = unpack_int4_weights(qweight, original_cols, wbits)
+
+    # Get dimensions
+    rows, columns = qweight.shape
+    groupsize = columns // scales.shape[1]
+
+    # Prepare output tensor
+    weight = torch.zeros((rows, columns), dtype=torch.float32, device=qweight.device)
+
+    # Dequantize each group
+    for g in range(scales.shape[1]):
+        start_idx = g * groupsize
+        end_idx = min((g + 1) * groupsize, columns)
+
+        # Extract group quantized values
+        group_qweight = qweight[:, start_idx:end_idx].float()
+        group_scales = scales[:, g].unsqueeze(1)
+        group_zeros = qzeros[:, g].unsqueeze(1).float()
+
+        # Dequantize
+        weight[:, start_idx:end_idx] = (group_qweight - group_zeros) * group_scales
+
+    return weight
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq_executor.py b/lite_llama/quantization/gptq/gptq_executor.py
deleted file mode 100644
index a5ba0cc..0000000
--- a/lite_llama/quantization/gptq/gptq_executor.py
+++ /dev/null
@@ -1,218 +0,0 @@
-"""
-Extended ModelExecutor with GPTQ support
-"""
-
-import torch
-import json
-import time
-from pathlib import Path
-from typing import Optional, Dict, Any
-
-from lite_llama.executor.model_executor import ModelExecutor
-from lite_llama.executor.weight_convert import (
-    convert_llama_hf_to_litellama,
-    convert_qwen2_hf_to_litellama,
-    convert_llama_torch_to_litellama,
-)
-from lite_llama.models.model_config import LlamaConfig, Qwen2Config
-from lite_llama.quantization.gptq.gptq_loader import GPTQModelLoader, load_gptq_quantize_config
-from lite_llama.utils.logger import log
-
-
-class GPTQModelExecutor(ModelExecutor):
-    """Extended ModelExecutor with GPTQ quantization support"""
-
-    @staticmethod
-    def _is_gptq_model(checkpoints_dir: str) -> bool:
-        """Check if the model directory contains GPTQ quantized model"""
-        quantize_config_path = Path(checkpoints_dir) / "quantization_config.json"
-        return quantize_config_path.exists()
-
-    @staticmethod
-    def _load_model_weight(
-            model_config,
-            checkpoints_dir,
-            load_model=True,
-            triton_weight=True,
-            device="cuda",
-            use_gptq=None,  # New parameter: None=auto-detect, True=force GPTQ, False=force original
-    ):
-        """Extended weight loading with GPTQ support"""
-        start_time = time.time()
-
-        # Auto-detect GPTQ if not specified
-        if use_gptq is None:
-            use_gptq = GPTQModelExecutor._is_gptq_model(checkpoints_dir)
-            if use_gptq:
-                log.info(f"GPTQ quantized model detected in {checkpoints_dir}")
-
-        # Initialize model
-        with torch.no_grad():
-            model = ModelExecutor._initialize_model(model_config, device=device)
-            state_dict = None
-
-        if not load_model:
-            # Use conversion function (original path)
-            if model_config.model_type.lower() == "llama":
-                # Try to determine if it's HF or torch format
-                config_path = Path(checkpoints_dir) / "config.json"
-                if config_path.exists():
-                    state_dict = convert_llama_hf_to_litellama(checkpoints_dir, None, model_config)
-                else:
-                    state_dict = convert_llama_torch_to_litellama(checkpoints_dir, None, model_config)
-            elif model_config.model_type.lower() == "qwen2":
-                state_dict = convert_qwen2_hf_to_litellama(checkpoints_dir, None, model_config)
-            else:
-                log.error(f"Unsupported model type: {model_config.model_type}")
-                raise ValueError(f"Unsupported model type: {model_config.model_type}")
-        elif use_gptq:
-            # Load GPTQ model
-            state_dict = GPTQModelLoader.load(checkpoints_dir, model_config, device)
-        else:
-            # Original loading path
-            checkpoints = sorted(Path(checkpoints_dir).glob("*.pth"))
-            if not checkpoints:
-                log.error(f"No checkpoint files found in {checkpoints_dir}")
-                raise FileNotFoundError(f"No checkpoint files found in {checkpoints_dir}")
-
-            ckpt_path = str(checkpoints[0])
-            log.info(f'Loading checkpoint "{ckpt_path}"')
-            state_dict = torch.load(
-                ckpt_path, mmap=True, weights_only=True, map_location=device
-            )
-
-        # Load state dict into model
-        model.load_state_dict(state_dict, strict=True, assign=True)
-        model.eval()
-        log.info(f"Loaded state dict in {time.time() - start_time:.2f}s")
-
-        # Convert to half precision
-        model.half().to(device)
-        for param in model.parameters():
-            assert param.dtype == torch.float16, "Model parameters are not in FP16"
-        log.info("Converted model to half precision (FP16)")
-
-        return model
-
-    @staticmethod
-    def build(
-            checkpoints_dir: str,
-            max_seq_len: int,
-            max_gpu_num_blocks: Optional[int] = None,
-            load_model: bool = True,
-            triton_weight: bool = True,
-            compiled_model: bool = False,
-            device: str = "cuda",
-            use_gptq: Optional[bool] = None,  # New parameter for GPTQ
-    ):
-        """
-        Build ModelExecutor with GPTQ support
-
-        Args:
-            checkpoints_dir: Model checkpoint directory
-            max_seq_len: Maximum sequence length
-            max_gpu_num_blocks: Maximum GPU memory blocks
-            load_model: Whether to load model weights
-            triton_weight: Whether to use Triton kernels
-            compiled_model: Whether to compile model
-            device: Device to use
-            use_gptq: Whether to use GPTQ (None=auto-detect)
-        """
-        model_config = ModelExecutor._load_model_config(
-            checkpoints_dir, max_seq_len, device=device
-        )
-
-        model = GPTQModelExecutor._load_model_weight(
-            model_config, checkpoints_dir, load_model, triton_weight, device, use_gptq
-        )
-
-        return ModelExecutor(
-            model_config, model, max_gpu_num_blocks, compiled_model, device
-        )
-
-
-def create_gptq_generate_text_class():
-    """Create a GenerateText class with GPTQ support"""
-
-    from lite_llama.generate import GenerateText
-
-    class GPTQGenerateText(GenerateText):
-        """GenerateText with GPTQ model support"""
-
-        def __init__(
-                self,
-                checkpoints_dir: str,
-                tokenizer_path: str,
-                max_seq_len=1024,
-                max_gpu_num_blocks=None,
-                load_model=True,
-                triton_weight=True,
-                compiled_model=False,
-                device="cuda",
-                use_gptq=None,  # New parameter
-        ):
-            self.checkpoints_dir = checkpoints_dir
-            self.compiled_model = compiled_model
-            self.device = device
-
-            # Use GPTQModelExecutor instead of ModelExecutor
-            self.model_executor = GPTQModelExecutor.build(
-                checkpoints_dir=checkpoints_dir,
-                max_seq_len=max_seq_len,
-                max_gpu_num_blocks=max_gpu_num_blocks,
-                load_model=load_model,
-                triton_weight=triton_weight,
-                compiled_model=compiled_model,
-                device=device,
-                use_gptq=use_gptq,
-            )
-            self.model_config = self.model_executor.model_config
-            assert self.model_config.vocab_size != -1, "Vocab size must be set"
-            self.tokenizer = self.load_tokenizer(tokenizer_path)
-
-    return GPTQGenerateText
-
-
-def create_gptq_generate_stream_text_class():
-    """Create a GenerateStreamText class with GPTQ support"""
-
-    from lite_llama.generate_stream import GenerateStreamText
-
-    class GPTQGenerateStreamText(GenerateStreamText):
-        """GenerateStreamText with GPTQ model support"""
-
-        def __init__(
-                self,
-                checkpoints_dir: str,
-                tokenizer_path: str,
-                max_gpu_num_blocks=None,
-                max_seq_len=1024,
-                load_model=True,
-                triton_weight=True,
-                compiled_model=False,
-                device="cuda",
-                use_gptq=None,  # New parameter
-        ):
-            self.checkpoints_dir = checkpoints_dir
-
-            # Use GPTQModelExecutor instead of ModelExecutor
-            self.model_executor = GPTQModelExecutor.build(
-                checkpoints_dir=checkpoints_dir,
-                load_model=load_model,
-                max_gpu_num_blocks=max_gpu_num_blocks,
-                max_seq_len=max_seq_len,
-                triton_weight=triton_weight,
-                compiled_model=compiled_model,
-                device=device,
-                use_gptq=use_gptq,
-            )
-            self.tokenizer = self.load_tokenizer(tokenizer_path)
-            self.model_config = self.model_executor.model_config
-            self.device = device
-
-    return GPTQGenerateStreamText
-
-
-# Export the GPTQ-enabled classes
-GPTQGenerateText = create_gptq_generate_text_class()
-GPTQGenerateStreamText = create_gptq_generate_stream_text_class()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
index 90e2e8d..7c30c08 100644
--- a/lite_llama/quantization/gptq/gptq_loader.py
+++ b/lite_llama/quantization/gptq/gptq_loader.py
@@ -1,550 +1,216 @@
-"""
-GPTQ weight loading and dequantization utilities for lite_llama
-"""
-
 import torch
 import torch.nn as nn
-from pathlib import Path
-import json
-import time
-from typing import Dict, Optional, Tuple, Any
-import numpy as np
-
-try:
-    import safetensors.torch
-    HAS_SAFETENSORS = True
-except ImportError:
-    HAS_SAFETENSORS = False
-    print("Warning: safetensors not installed. Install with: pip install safetensors")
-
-
-class GPTQConfig:
-    """Configuration for GPTQ quantization parameters"""
-    def __init__(self, bits: int = 4, group_size: int = 128,
-                 desc_act: bool = False, sym: bool = True,
-                 true_sequential: bool = True):
-        self.bits = bits
-        self.group_size = group_size
-        self.desc_act = desc_act
-        self.sym = sym
-        self.true_sequential = true_sequential
-        self.pack_num = 32 // self.bits  # number of weights packed in int32
-
-    @classmethod
-    def from_dict(cls, config_dict: Dict[str, Any]) -> "GPTQConfig":
-        """Create GPTQConfig from dictionary"""
-        return cls(
-            bits=config_dict.get("bits", 4),
-            group_size=config_dict.get("group_size", 128),
-            desc_act=config_dict.get("desc_act", False),
-            sym=config_dict.get("sym", True),
-            true_sequential=config_dict.get("true_sequential", True)
-        )
-
-
-def load_gptq_quantize_config(model_path: str) -> Optional[GPTQConfig]:
-    """Load GPTQ quantization config from model directory"""
-    quantize_config_path = Path(model_path) / "quantization_config.json"
-    if not quantize_config_path.exists():
-        return None
+from typing import Dict, Optional
+import os.path as osp
+from .gptq import dequantize_weight
 
-    with open(quantize_config_path, 'r') as f:
-        config_dict = json.load(f)
 
-    return GPTQConfig.from_dict(config_dict)
-
-
-def unpack_gptq_weights(qweight: torch.Tensor, bits: int = 4) -> torch.Tensor:
+class GPTQLinear(nn.Module):
     """
-    Unpack GPTQ quantized weights from int32 format
-
-    Args:
-        qweight: Packed quantized weights [out_features, in_features // pack_num]
-        bits: Number of bits per weight (4 or 8)
-
-    Returns:
-        Unpacked weights [out_features, in_features]
+    A linear layer that uses GPTQ quantized weights.
+    Automatically dequantizes during forward pass.
     """
-    pack_num = 32 // bits
-    out_features = qweight.shape[0]
-    in_features = qweight.shape[1] * pack_num
-
-    unpacked_weights = torch.zeros((out_features, in_features),
-                                   dtype=torch.int32, device=qweight.device)
-
-    for i in range(pack_num):
-        shift = i * bits
-        if bits == 4:
-            mask = 0xF
-        elif bits == 8:
-            mask = 0xFF
+
+    def __init__(self, qweight, qzeros, scales, wbits=4, bias=None):
+        super().__init__()
+        self.register_buffer('qweight', qweight)
+        self.register_buffer('qzeros', qzeros)
+        self.register_buffer('scales', scales)
+        self.wbits = wbits
+        if bias is not None:
+            self.register_buffer('bias', bias)
         else:
-            raise ValueError(f"Unsupported bits: {bits}")
+            self.bias = None
+
+    def forward(self, x):
+        # Dequantize weight on-the-fly
+        weight = dequantize_weight(
+            self.qweight,
+            self.qzeros,
+            self.scales,
+            self.wbits
+        )
+
+        # Perform linear transformation
+        output = torch.matmul(x, weight.t())
 
-        unpacked_weights[:, i::pack_num] = (qweight >> shift) & mask
+        if self.bias is not None:
+            output += self.bias
 
-    return unpacked_weights
+        return output
 
 
-def dequantize_gptq(qweight: torch.Tensor, qzeros: torch.Tensor,
-                    scales: torch.Tensor, g_idx: Optional[torch.Tensor] = None,
-                    bits: int = 4, group_size: int = 128) -> torch.Tensor:
+def load_quantized_state_dict(checkpoint_path: str, device: str = "cuda") -> Dict[str, torch.Tensor]:
     """
-    Dequantize GPTQ weights
+    Load a quantized state dictionary from checkpoint.
 
     Args:
-        qweight: Packed quantized weights
-        qzeros: Packed zero points
-        scales: Scale factors
-        g_idx: Group indices (optional, for act-order)
-        bits: Quantization bits
-        group_size: Quantization group size
+        checkpoint_path: Path to the .pth file
+        device: Device to load tensors to
 
     Returns:
-        Dequantized weights in fp16
+        State dictionary with quantized weights
     """
-    # Unpack weights and zeros
-    weight = unpack_gptq_weights(qweight, bits).to(torch.float16)
-    zeros = unpack_gptq_weights(qzeros, bits).to(torch.float16)
+    print(f"Loading quantized model from {checkpoint_path}")
+    state_dict = torch.load(checkpoint_path, map_location=device)
 
-    # Handle act-order if needed
-    if g_idx is not None:
-        weight = weight[:, g_idx]
-        zeros = zeros[:, g_idx]
-
-    # Reshape for group-wise dequantization
-    out_features, in_features = weight.shape
-    num_groups = in_features // group_size
-
-    weight = weight.reshape(out_features, num_groups, group_size)
-    zeros = zeros.reshape(-1, num_groups, 1)
-    scales = scales.reshape(-1, num_groups, 1)
-
-    # Dequantize: w = (w_q - z) * s
-    weight = (weight - zeros) * scales
-
-    # Reshape back
-    weight = weight.reshape(out_features, in_features)
+    # Check if this is a quantized model
+    quantized_keys = [k for k in state_dict.keys() if '.qweight' in k]
+    if quantized_keys:
+        print(f"Found {len(quantized_keys)} quantized layers")
+    else:
+        print("No quantized layers found - this appears to be a regular model")
 
-    return weight
+    return state_dict
 
 
-def load_gptq_linear_weights(checkpoint_path: str, layer_name: str,
-                            gptq_config: GPTQConfig) -> Dict[str, torch.Tensor]:
+def replace_linear_with_gptq(module: nn.Module, state_dict: Dict[str, torch.Tensor], prefix: str = ""):
     """
-    Load GPTQ quantized linear layer weights
+    Recursively replace Linear layers with GPTQLinear layers based on quantized state dict.
 
     Args:
-        checkpoint_path: Path to checkpoint file
-        layer_name: Name prefix of the layer (e.g., "layers.0.self_attn.q_proj")
-        gptq_config: GPTQ configuration
-
-    Returns:
-        Dictionary containing dequantized weight and bias (if exists)
+        module: The module to modify
+        state_dict: State dictionary containing quantized weights
+        prefix: Current prefix for parameter names
     """
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
-
-    # Load quantized components
-    qweight = checkpoint.get(f"{layer_name}.qweight")
-    qzeros = checkpoint.get(f"{layer_name}.qzeros")
-    scales = checkpoint.get(f"{layer_name}.scales")
-    g_idx = checkpoint.get(f"{layer_name}.g_idx", None)
-    bias = checkpoint.get(f"{layer_name}.bias", None)
-
-    if qweight is None or qzeros is None or scales is None:
-        # Fallback to non-quantized weight
-        weight = checkpoint.get(f"{layer_name}.weight")
-        if weight is None:
-            raise ValueError(f"No weight found for {layer_name}")
-        return {"weight": weight, "bias": bias}
-
-    # Dequantize
-    weight = dequantize_gptq(qweight, qzeros, scales, g_idx,
-                            gptq_config.bits, gptq_config.group_size)
-
-    return {"weight": weight, "bias": bias}
+    for name, child in module.named_children():
+        full_name = f"{prefix}.{name}" if prefix else name
+
+        if isinstance(child, nn.Linear):
+            # Check if this layer has quantized weights
+            qweight_key = f"{full_name}.qweight"
+            if qweight_key in state_dict:
+                # Extract quantization parameters
+                qweight = state_dict[qweight_key]
+                qzeros = state_dict[f"{full_name}.qzeros"]
+                scales = state_dict[f"{full_name}.scales"]
+                wbits = state_dict.get(f"{full_name}.wbits", torch.tensor(4)).item()
+
+                # Check for bias
+                bias_key = f"{full_name}.bias"
+                bias = state_dict.get(bias_key, None)
+
+                # Replace with GPTQLinear
+                gptq_linear = GPTQLinear(qweight, qzeros, scales, wbits, bias)
+                setattr(module, name, gptq_linear)
+
+                print(f"Replaced {full_name} with GPTQLinear")
+        else:
+            # Recursively process child modules
+            replace_linear_with_gptq(child, state_dict, full_name)
 
 
-def convert_gptq_to_lite_llama(checkpoints_dir: str, model_config) -> Dict[str, torch.Tensor]:
+def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
     """
-    Convert GPTQ quantized model to lite_llama format
+    Create a dequantized state dictionary from a quantized one.
+    This is useful for models that don't support on-the-fly dequantization.
 
     Args:
-        checkpoints_dir: Directory containing GPTQ model files
-        model_config: Model configuration
+        quantized_state_dict: State dictionary with quantized weights
 
     Returns:
-        State dictionary in lite_llama format
+        State dictionary with dequantized weights
     """
-    import safetensors.torch
-
-    # Load GPTQ config
-    gptq_config = load_gptq_quantize_config(checkpoints_dir)
-    if gptq_config is None:
-        raise ValueError(f"No quantization_config.json found in {checkpoints_dir}")
-
-    # Find checkpoint files
-    checkpoint_files = sorted(Path(checkpoints_dir).glob("*.safetensors"))
-    use_safetensors = len(checkpoint_files) > 0
-
-    if not checkpoint_files:
-        checkpoint_files = sorted(Path(checkpoints_dir).glob("*.bin"))
-
-    if not checkpoint_files:
-        checkpoint_files = sorted(Path(checkpoints_dir).glob("*.pth"))
-
-    if not checkpoint_files:
-        raise ValueError(f"No checkpoint files found in {checkpoints_dir}")
-
-    # Load all checkpoints (handle sharded models)
-    full_state_dict = {}
-    for checkpoint_file in checkpoint_files:
-        if use_safetensors:
-            if not HAS_SAFETENSORS:
-                raise ImportError("safetensors is required for loading .safetensors files. Install with: pip install safetensors")
-            state_dict = safetensors.torch.load_file(str(checkpoint_file))
-        else:
-            state_dict = torch.load(str(checkpoint_file), map_location="cpu")
-        full_state_dict.update(state_dict)
-
-    # Check if already in lite_llama format
-    is_lite_llama_format = any("kv_proj_weight" in key for key in full_state_dict.keys())
-
-    if is_lite_llama_format:
-        print("Model is already in lite_llama format")
-        # Just dequantize if needed
-        new_state_dict = {}
-        for key, value in full_state_dict.items():
-            # Check if this is a quantized weight
-            base_key = key.replace(".qweight", "").replace(".qzeros", "").replace(".scales", "").replace(".g_idx", "")
-
-            if key.endswith(".qweight"):
-                # This is a quantized weight, dequantize it
-                qweight = value
-                qzeros = full_state_dict.get(base_key + ".qzeros")
-                scales = full_state_dict.get(base_key + ".scales")
-                g_idx = full_state_dict.get(base_key + ".g_idx", None)
-
-                if qzeros is not None and scales is not None:
-                    weight = dequantize_gptq(
-                        qweight, qzeros, scales, g_idx,
-                        gptq_config.bits, gptq_config.group_size
-                    )
-                    new_state_dict[base_key] = weight
-            elif not any(key.endswith(suffix) for suffix in [".qzeros", ".scales", ".g_idx"]):
-                # Regular weight, just copy
-                new_state_dict[key] = value
-
-        return new_state_dict
-
-    # Otherwise, convert based on model type
-    if model_config.model_type.lower() == "llama":
-        new_state_dict = convert_gptq_llama_to_lite_llama(
-            full_state_dict, gptq_config, model_config
-        )
-    elif model_config.model_type.lower() == "qwen2":
-        new_state_dict = convert_gptq_qwen2_to_lite_llama(
-            full_state_dict, gptq_config, model_config
-        )
-    else:
-        raise ValueError(f"Unsupported model type for GPTQ: {model_config.model_type}")
-
-    return new_state_dict
-
-
-def convert_gptq_llama_to_lite_llama(
-    checkpoint: Dict[str, torch.Tensor],
-    gptq_config: GPTQConfig,
-    model_config
-) -> Dict[str, torch.Tensor]:
-    """Convert GPTQ Llama model to lite_llama format"""
-    new_state_dict = {}
-
-    # Check if this is already in lite_llama format
-    is_lite_llama_format = any("kv_proj_weight" in key for key in checkpoint.keys())
-
-    if is_lite_llama_format:
-        # Already in lite_llama format, just process the weights
-        for key, value in checkpoint.items():
-            new_state_dict[key] = value
-        return new_state_dict
-
-    # Load embeddings and norms (these are not quantized)
-    new_state_dict["embed_tokens.weight"] = checkpoint.get("model.embed_tokens.weight")
-    new_state_dict["norm_weight"] = checkpoint.get("model.norm.weight")
-    new_state_dict["lm_head.weight"] = checkpoint.get("lm_head.weight")
-
-    # Process each layer
-    for i in range(model_config.num_layers):
-        # Check if we have separate k_proj and v_proj or merged kv_proj
-        has_separate_kv = f"model.layers.{i}.self_attn.k_proj.weight" in checkpoint or \
-                         f"model.layers.{i}.self_attn.k_proj.qweight" in checkpoint
-
-        if has_separate_kv:
-            # Process separate K and V projections
-            for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]:
-                prefix = f"model.layers.{i}.self_attn.{proj}"
-
-                # Check if quantized weights exist
-                if f"{prefix}.qweight" in checkpoint:
-                    # Load and dequantize
-                    qweight = checkpoint[f"{prefix}.qweight"]
-                    qzeros = checkpoint[f"{prefix}.qzeros"]
-                    scales = checkpoint[f"{prefix}.scales"]
-                    g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                    weight = dequantize_gptq(
-                        qweight, qzeros, scales, g_idx,
-                        gptq_config.bits, gptq_config.group_size
-                    )
-                else:
-                    # Use original weight if not quantized
-                    weight = checkpoint.get(f"{prefix}.weight")
-                    if weight is None and proj in ["k_proj", "v_proj"]:
-                        # Skip if k_proj/v_proj don't exist (might be merged already)
-                        continue
-                    elif weight is None:
-                        raise ValueError(f"No weight found for {prefix}")
-
-                if proj in ["k_proj", "v_proj"]:
-                    # Store temporarily for merging
-                    new_state_dict[f"_temp_{i}_{proj}_weight"] = weight
+    dequantized_dict = {}
+    processed_layers = set()
+
+    for key, value in quantized_state_dict.items():
+        if '.qweight' in key:
+            # Extract base name
+            base_name = key.replace('.qweight', '')
+
+            if base_name not in processed_layers:
+                processed_layers.add(base_name)
+
+                # Get quantization parameters
+                qweight = quantized_state_dict[f"{base_name}.qweight"]
+                qzeros = quantized_state_dict[f"{base_name}.qzeros"]
+                scales = quantized_state_dict[f"{base_name}.scales"]
+                wbits = quantized_state_dict.get(f"{base_name}.wbits", torch.tensor(4)).item()
+
+                # Dequantize
+                weight = dequantize_weight(qweight, qzeros, scales, wbits)
+
+                # Store dequantized weight
+                # Handle different naming conventions
+                if "_weight" in base_name:
+                    dequantized_dict[f"{base_name}"] = weight
                 else:
-                    new_state_dict[f"layers.{i}.self_attn.{proj}.weight"] = weight
+                    dequantized_dict[f"{base_name}.weight"] = weight
 
-            # Merge k and v projections if they were separate
-            if f"_temp_{i}_k_proj_weight" in new_state_dict:
-                k_weight = new_state_dict.pop(f"_temp_{i}_k_proj_weight")
-                v_weight = new_state_dict.pop(f"_temp_{i}_v_proj_weight")
-                new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = torch.cat([k_weight, v_weight], dim=0)
-        else:
-            # Already has merged kv_proj
-            # Q projection
-            prefix = f"model.layers.{i}.self_attn.q_proj"
-            if f"{prefix}.qweight" in checkpoint:
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight")
-
-            new_state_dict[f"layers.{i}.self_attn.q_proj.weight"] = weight
-
-            # O projection
-            prefix = f"model.layers.{i}.self_attn.o_proj"
-            if f"{prefix}.qweight" in checkpoint:
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight")
-
-            new_state_dict[f"layers.{i}.self_attn.o_proj.weight"] = weight
-
-            # KV projection (already merged)
-            prefix = f"model.layers.{i}.self_attn.kv_proj"
-            if f"{prefix}.qweight" in checkpoint:
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight",
-                                       checkpoint.get(f"layers.{i}.self_attn.kv_proj_weight"))
-
-            if weight is not None:
-                new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = weight
-
-        # MLP projections
-        for proj in ["gate_proj", "up_proj", "down_proj"]:
-            prefix = f"model.layers.{i}.mlp.{proj}"
-
-            if f"{prefix}.qweight" in checkpoint:
-                # Load and dequantize
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight")
-                if weight is None:
-                    raise ValueError(f"No weight found for {prefix}")
-
-            new_state_dict[f"layers.{i}.mlp.{proj}.weight"] = weight
-
-        # Layer norms (not quantized) - handle different naming conventions
-        attention_norm = checkpoint.get(f"model.layers.{i}.input_layernorm.weight") or \
-                        checkpoint.get(f"layers.{i}.attention_norm_weight") or \
-                        checkpoint.get(f"layers.{i}.input_layernorm_weight")
-
-        ffn_norm = checkpoint.get(f"model.layers.{i}.post_attention_layernorm.weight") or \
-                   checkpoint.get(f"layers.{i}.ffn_norm_weight") or \
-                   checkpoint.get(f"layers.{i}.post_attention_layernorm_weight")
-
-        if attention_norm is not None:
-            new_state_dict[f"layers.{i}.attention_norm_weight"] = attention_norm
-        if ffn_norm is not None:
-            new_state_dict[f"layers.{i}.ffn_norm_weight"] = ffn_norm
-
-    return new_state_dict
-
-
-def convert_gptq_qwen2_to_lite_llama(
-    checkpoint: Dict[str, torch.Tensor],
-    gptq_config: GPTQConfig,
-    model_config
-) -> Dict[str, torch.Tensor]:
-    """Convert GPTQ Qwen2 model to lite_llama format"""
-    new_state_dict = {}
-
-    # Load embeddings and norms
-    new_state_dict["embed_tokens.weight"] = checkpoint.get("model.embed_tokens.weight")
-    new_state_dict["norm_weight"] = checkpoint.get("model.norm.weight")
-    new_state_dict["lm_head_weight"] = checkpoint.get("lm_head.weight")
-
-    # Process each layer
-    for i in range(model_config.num_layers):
-        # Self attention - handle q_proj separately due to bias
-        prefix = f"model.layers.{i}.self_attn.q_proj"
-        if f"{prefix}.qweight" in checkpoint:
-            qweight = checkpoint[f"{prefix}.qweight"]
-            qzeros = checkpoint[f"{prefix}.qzeros"]
-            scales = checkpoint[f"{prefix}.scales"]
-            g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-            weight = dequantize_gptq(
-                qweight, qzeros, scales, g_idx,
-                gptq_config.bits, gptq_config.group_size
-            )
-        else:
-            weight = checkpoint.get(f"{prefix}.weight")
-
-        new_state_dict[f"layers.{i}.self_attn.q_proj_weight"] = weight
-        new_state_dict[f"layers.{i}.self_attn.q_proj_bias"] = checkpoint.get(f"{prefix}.bias")
-
-        # Handle k_proj and v_proj for merging
-        for proj in ["k_proj", "v_proj"]:
-            prefix = f"model.layers.{i}.self_attn.{proj}"
-
-            if f"{prefix}.qweight" in checkpoint:
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight")
-
-            new_state_dict[f"_temp_{i}_{proj}_weight"] = weight
-            new_state_dict[f"_temp_{i}_{proj}_bias"] = checkpoint.get(f"{prefix}.bias")
-
-        # Merge k and v
-        k_weight = new_state_dict.pop(f"_temp_{i}_k_proj_weight")
-        v_weight = new_state_dict.pop(f"_temp_{i}_v_proj_weight")
-        k_bias = new_state_dict.pop(f"_temp_{i}_k_proj_bias")
-        v_bias = new_state_dict.pop(f"_temp_{i}_v_proj_bias")
-
-        new_state_dict[f"layers.{i}.self_attn.kv_proj_weight"] = torch.cat([k_weight, v_weight], dim=0)
-        new_state_dict[f"layers.{i}.self_attn.kv_proj_bias"] = torch.cat([k_bias, v_bias], dim=0)
-
-        # O projection
-        prefix = f"model.layers.{i}.self_attn.o_proj"
-        if f"{prefix}.qweight" in checkpoint:
-            qweight = checkpoint[f"{prefix}.qweight"]
-            qzeros = checkpoint[f"{prefix}.qzeros"]
-            scales = checkpoint[f"{prefix}.scales"]
-            g_idx = checkpoint.get(f"{prefix}.g_idx", None)
-
-            weight = dequantize_gptq(
-                qweight, qzeros, scales, g_idx,
-                gptq_config.bits, gptq_config.group_size
-            )
-        else:
-            weight = checkpoint.get(f"{prefix}.weight")
+                # Copy bias if exists
+                bias_keys = [f"{base_name}.bias", f"{base_name}_bias"]
+                for bias_key in bias_keys:
+                    if bias_key in quantized_state_dict:
+                        dequantized_dict[bias_key] = quantized_state_dict[bias_key]
 
-        new_state_dict[f"layers.{i}.self_attn.o_proj_weight"] = weight
+        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize']):
+            # Copy non-quantization related parameters as-is
+            dequantized_dict[key] = value
 
-        # MLP layers
-        for proj in ["gate_proj", "up_proj", "down_proj"]:
-            prefix = f"model.layers.{i}.mlp.{proj}"
+    print(f"Dequantized {len(processed_layers)} layers")
+    return dequantized_dict
 
-            if f"{prefix}.qweight" in checkpoint:
-                qweight = checkpoint[f"{prefix}.qweight"]
-                qzeros = checkpoint[f"{prefix}.qzeros"]
-                scales = checkpoint[f"{prefix}.scales"]
-                g_idx = checkpoint.get(f"{prefix}.g_idx", None)
 
-                weight = dequantize_gptq(
-                    qweight, qzeros, scales, g_idx,
-                    gptq_config.bits, gptq_config.group_size
-                )
-            else:
-                weight = checkpoint.get(f"{prefix}.weight")
+# Example usage functions
 
-            new_state_dict[f"layers.{i}.mlp.{proj}.weight"] = weight
-
-        # Layer norms
-        new_state_dict[f"layers.{i}.input_layernorm_weight"] = checkpoint.get(
-            f"model.layers.{i}.input_layernorm.weight"
-        )
-        new_state_dict[f"layers.{i}.post_attention_layernorm_weight"] = checkpoint.get(
-            f"model.layers.{i}.post_attention_layernorm.weight"
-        )
+def load_gptq_model_for_inference(model: nn.Module, checkpoint_path: str, device: str = "cuda"):
+    """
+    Load a GPTQ quantized model for inference.
 
-    return new_state_dict
+    Args:
+        model: The model architecture (should match the quantized model)
+        checkpoint_path: Path to the quantized .pth file
+        device: Device to load model to
+
+    Example:
+        >>> model = YourModelClass(config)
+        >>> load_gptq_model_for_inference(model, "my_weight/model_gptq.pth")
+        >>> # Model is now ready for inference with automatic dequantization
+    """
+    # Load quantized state dict
+    quantized_state_dict = load_quantized_state_dict(checkpoint_path, device)
+
+    # Check if model uses quantized weights
+    if any('.qweight' in k for k in quantized_state_dict.keys()):
+        print("Dequantizing weights for standard model inference...")
+        # Create dequantized state dict
+        dequantized_state_dict = create_dequantized_state_dict(quantized_state_dict)
+        # Load into model
+        model.load_state_dict(dequantized_state_dict, strict=False)
+    else:
+        # Regular model, load normally
+        model.load_state_dict(quantized_state_dict)
 
+    model.to(device)
+    model.eval()
 
-class GPTQModelLoader:
-    """Helper class to load GPTQ models"""
+    return model
 
-    @staticmethod
-    def load(checkpoints_dir: str, model_config, device: str = "cuda") -> Dict[str, torch.Tensor]:
-        """
-        Load GPTQ model and convert to lite_llama format
 
-        Args:
-            checkpoints_dir: Directory containing GPTQ model
-            model_config: Model configuration
-            device: Target device
+def compare_model_sizes(original_path: str, quantized_path: str):
+    """
+    Compare file sizes between original and quantized models.
 
-        Returns:
-            State dictionary ready for lite_llama
-        """
-        print(f"Loading GPTQ model from {checkpoints_dir}")
-        start_time = time.time()
+    Args:
+        original_path: Path to original .pth file
+        quantized_path: Path to quantized .pth file
+    """
+    import os
 
-        state_dict = convert_gptq_to_lite_llama(checkpoints_dir, model_config)
+    if os.path.exists(original_path):
+        original_size = os.path.getsize(original_path) / (1024 ** 3)  # GB
+        print(f"Original model size: {original_size:.2f} GB")
+    else:
+        print(f"Original model not found at {original_path}")
+        return
 
-        # Move to device and convert to fp16
-        for key, value in state_dict.items():
-            if value is not None:
-                state_dict[key] = value.to(device).half()
+    if os.path.exists(quantized_path):
+        quantized_size = os.path.getsize(quantized_path) / (1024 ** 3)  # GB
+        print(f"Quantized model size: {quantized_size:.2f} GB")
 
-        print(f"GPTQ model loaded and converted in {time.time() - start_time:.2f}s")
-        return state_dict
\ No newline at end of file
+        compression_ratio = original_size / quantized_size
+        print(f"Compression ratio: {compression_ratio:.2f}x")
+        print(f"Size reduction: {(1 - quantized_size / original_size) * 100:.1f}%")
+    else:
+        print(f"Quantized model not found at {quantized_path}")
\ No newline at end of file
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 9b5d249..b3a4b9f 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -2,7 +2,7 @@
 import time, os
 import subprocess
 from typing import List, Optional
-
+import torch
 
 def read_json(json_path):
     with open(json_path, "r") as json_file:
@@ -82,7 +82,7 @@ def count_tokens(texts: List[str], tokenizer) -> int:
 
 
 def get_model_type(checkpoint_path: str) -> str | None:
-    from utils.logger import log
+    from logger import log
 
     model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
 
@@ -158,4 +158,42 @@ def get_model_info(model_path):
 
     model_info["size"] = total_size / (1024 ** 3)  # Convert to GB
 
-    return model_info
\ No newline at end of file
+    return model_info
+
+
+def get_model_dtype(checkpoints_dir: str) -> torch.dtype:
+    """
+    Get the model dtype from config.json
+
+    Args:
+        checkpoints_dir: Path to model checkpoint directory
+
+    Returns:
+        torch.dtype: The dtype specified in config.json
+    """
+    config_path = os.path.join(checkpoints_dir, "config.json")
+
+    try:
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+
+        torch_dtype_str = config.get("torch_dtype", "float16")
+
+        # Map string to torch dtype
+        dtype_mapping = {
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "float32": torch.float32,
+            "float": torch.float32,
+        }
+
+        dtype = dtype_mapping.get(torch_dtype_str, torch.float16)
+        print(f"Detected model dtype from config: {torch_dtype_str} -> {dtype}")
+
+        return dtype
+
+    except Exception as e:
+        print(f"Warning: Could not read dtype from config.json: {e}")
+        print("Defaulting to torch.float16")
+        return torch.float16
+

From 8becbae947f862c00d2da1191d228c31871f09ba Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 31 May 2025 22:24:55 +0930
Subject: [PATCH 20/33] fix missing weight keys

---
 generate.py                           | 252 ++++++++++++++------------
 lite_llama/executor/weight_convert.py |  20 +-
 lite_llama/utils/common.py            |   2 +-
 3 files changed, 152 insertions(+), 122 deletions(-)

diff --git a/generate.py b/generate.py
index 15526a2..ff1a9c8 100644
--- a/generate.py
+++ b/generate.py
@@ -1,34 +1,33 @@
 import torch
 from typing import Optional
+from lite_llama.utils.prompt_templates import get_prompter, get_image_token
+from lite_llama.generate_stream import GenerateStreamText  # import GenerateText
+from lite_llama.utils.image_process import vis_images
+
 import warnings
+
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
-from lite_llama.utils.prompt_templates import get_prompter
-from lite_llama.generate_stream import GenerateStreamText  # Original import
+from utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
+from lite_llama.llava_generate_stream import LlavaGeneratorStream
 
 import sys, os, time
 from pathlib import Path
-
 # support running without installing as a package
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 import psutil
+from utils.logger import log
+import argparse
+from argparse import RawTextHelpFormatter
 
 process = psutil.Process(os.getpid())
 
-
-def is_gptq_model(checkpoint_path: str) -> bool:
-    """Check if the model is GPTQ quantized"""
-    quantize_config_path = Path(checkpoint_path) / "quantization_config.json"
-    return quantize_config_path.exists()
-
-
-def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
+def report_resource_usage(ram_before, vram_before) -> None:
     end_time = time.time()
     ram_after = process.memory_info().rss
-    vram_after = get_gpu_memory(gpu_type)
+    vram_after = get_gpu_memory(detect_device())
 
-    ram_used = (ram_after - ram_before) / (1024**3)  # Bytes to GB
+    ram_used = (ram_after - ram_before) / (1024 ** 3)  # Bytes to GB
 
     if vram_before is not None and vram_after is not None:
         vram_used = vram_after - vram_before
@@ -36,54 +35,39 @@ def report_resource_usage(ram_before, vram_before, gpu_type) -> None:
     else:
         vram_text = "Unavailable"
 
-    print(f"CPU RAM Used: {ram_used:.2f} GB")
-    print(f"GPU VRAM Used: {vram_text}")
-
-
-def main(
-    prompt: str = "Hello, my name is",
-    *,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    max_seq_len: int = 2048,
-    max_gpu_num_blocks=40960,
-    max_gen_len: Optional[int] = 1024,
-    load_model: bool = True,
-    compiled_model: bool = False,
-    triton_weight: bool = True,
-    gpu_type: str = "nvidia",
-    checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-    quantize: Optional[str] = None,
-    use_gptq: Optional[bool] = None,  # New parameter for explicit GPTQ control
+    log.info(f"CPU RAM Used: {ram_used:.2f} GB")
+    log.info(f"GPU VRAM Used: {vram_text}")
+
+
+def generate_llama(
+        prompt: str = "Hello, my name is",
+        *,
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        max_seq_len: int = 2048,
+        max_gpu_num_blocks=40960,
+        max_gen_len: Optional[int] = 1024,
+        load_model: bool = True,
+        compiled_model: bool = False,
+        triton_weight: bool = True,
+        gpu_type: str = "nvidia",
+        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+        quantize: Optional[str] = None,
 ):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
-
     if max_seq_len <= 1024:
         short_prompt = True
     else:
         short_prompt = False
-
-    # Get model type and prompter
-    model_prompter = get_prompter(
-        get_model_type(checkpoint_path), checkpoint_path, short_prompt
-    )
-
+    model_prompter = get_prompter(get_model_type(checkpoint_path), checkpoint_path, short_prompt)
     # Start resource tracking
     ram_before = process.memory_info().rss
-    gpu_type = detect_device()
+
     vram_before = get_gpu_memory(gpu_type)
 
     # Init LLM generator
-
-    # Auto-detect GPTQ if not explicitly specified
-    if use_gptq is None:
-        use_gptq = is_gptq_model(checkpoint_path)
-        if use_gptq:
-            print(f"GPTQ quantized model detected in {checkpoint_path}")
-
-    print("Using standard FP16 generator")
     generator = GenerateStreamText(
         checkpoints_dir=checkpoint_path,
         tokenizer_path=checkpoint_path,
@@ -95,9 +79,9 @@ def main(
         device=device,
     )
 
+
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
-
     # Call the generation function and start the stream generation
     stream = generator.text_completion_stream(
         prompts,
@@ -106,25 +90,106 @@ def main(
         max_gen_len=max_gen_len,
     )
 
-    completion = ""  # Initialize to generate the result
+    completion = ''  # Initialize to generate the result
     # NOTE: After creating a generator, it can be iterated through a for loop
     text_msg = ""
     start = time.perf_counter()
-
     for batch_completions in stream:
-        new_text = batch_completions[0]["generation"][len(completion) :]
-        completion = batch_completions[0]["generation"]
-        print(new_text, end="", flush=True)
-        text_msg += new_text
+        new_text = batch_completions[0]['generation'][len(completion):]
+        completion = batch_completions[0]['generation']
+        print(new_text, end='', flush=True)
+        text_msg +=new_text
     end = time.perf_counter()
 
     print("\n\n==================================\n")
-    print(
-        f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer) / (end - start):.2f} tokens/sec"
-    )
+    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
 
     # Report resource usage
-    report_resource_usage(ram_before, vram_before, gpu_type)
+    report_resource_usage(ram_before, vram_before)
+
+
+def generate_llava(
+        prompt: str = "Hello, my name is",
+        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+        figure_path: Path = Path("figures/lit-llama/"),
+        gpu_type: str = "nvidia",
+        temperature: float = 0.6,
+        top_p: float = 0.9,
+        max_seq_len: int = 2048,
+        max_gpu_num_blocks=None,
+        max_gen_len: Optional[int] = 512,
+        load_model: bool = True,
+        compiled_model: bool = False,
+        triton_weight: bool = True
+):
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    if max_seq_len <= 1024:
+        short_prompt = True
+    else:
+        short_prompt = False
+
+    if not os.path.isfile(figure_path):
+        log.error(f"'{figure_path}' Not a valid file path！")
+    else:
+        image_input = str(figure_path).strip()
+    image_items = [image_input]  # Prepare the image_items list
+    image_num = len(image_items)  # Calculate the number of input images
+    vis_images(image_items)  # Displaying images in the terminal
+    assert checkpoint_path.is_dir(), checkpoint_path
+    checkpoint_path = str(checkpoint_path)
+    model_prompter = get_prompter("llama", checkpoint_path, short_prompt)
+
+    # Start resource tracking
+    ram_before = process.memory_info().rss
+
+    vram_before = get_gpu_memory(gpu_type)
+
+    # Initializing the Multimodal Model Text Generator
+    try:
+        generator = LlavaGeneratorStream(
+            checkpoints_dir=checkpoint_path,
+            tokenizer_path=checkpoint_path,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            load_model=load_model,
+            compiled_model=compiled_model,
+            triton_weight=triton_weight,
+            device=device,
+        )
+    except Exception as e:
+        log.error(f"Model loading failure: {e}")
+        sys.exit(1)
+
+    image_token = get_image_token()
+    model_prompter.insert_prompt(image_token * image_num + prompt)
+    prompts = [model_prompter.model_input]
+
+    try:
+        stream = generator.text_completion_stream(
+            prompts,
+            image_items,
+            temperature=temperature,
+            top_p=top_p,
+            max_gen_len=max_gen_len,
+        )
+    except Exception as e:
+        log.error(f"Text Generation Failure: {e}")
+
+    completion = ''  # Initialization generates results
+    text_msg = ""
+    start = time.perf_counter()
+
+    for batch_completions in stream:
+        next_text = batch_completions[0]['generation'][len(completion):]
+        completion = batch_completions[0]['generation']
+        print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
+        text_msg += next_text
+    end = time.perf_counter()
+
+    print("\n\n==================================\n")
+    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    # Report resource usage
+    report_resource_usage(ram_before, vram_before)
 
 
 if __name__ == "__main__":
@@ -133,65 +198,26 @@ def main(
     torch.set_float32_matmul_precision("high")
 
     # Create a wrapper function that adds the use_gptq parameter
-    def main_with_gptq_option(
+    def main(
         prompt: str = "Hello, my name is",
-        *,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_seq_len: int = 2048,
-        max_gpu_num_blocks=40960,
-        max_gen_len: Optional[int] = 1024,
-        load_model: bool = True,
-        compiled_model: bool = False,
-        triton_weight: bool = True,
-        gpu_type: str = "nvidia",
-        checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-        quantize: Optional[str] = None,
-        force_gptq: bool = False,
-        force_fp16: bool = False,
+        checkpoint_path: Path = Path("checkpoints/lite-llama/7B/"),
+        figure_path: Optional[Path] = None,
     ):
         """
         Generate text using lite_llama with automatic GPTQ detection
 
         Args:
             prompt: Input prompt text
-            temperature: Sampling temperature
-            top_p: Nucleus sampling probability
-            max_seq_len: Maximum sequence length
-            max_gpu_num_blocks: Maximum GPU memory blocks
-            max_gen_len: Maximum generation length
-            load_model: Whether to load model weights
-            compiled_model: Whether to use compiled model
-            triton_weight: Whether to use Triton kernels
-            gpu_type: GPU type (nvidia/amd/cpu)
             checkpoint_path: Path to model checkpoint directory
-            quantize: Quantization method (deprecated, kept for compatibility)
-            force_gptq: Force GPTQ mode even if no quantization_config.json
-            force_fp16: Force FP16 mode even if quantization_config.json exists
+            figure_path: Path to Image file for LLaVA generation, optional
         """
         # Determine use_gptq based on force flags
-        use_gptq = None
-        if force_gptq and force_fp16:
-            raise ValueError("Cannot force both GPTQ and FP16 modes simultaneously")
-        elif force_gptq:
-            use_gptq = True
-        elif force_fp16:
-            use_gptq = False
-
-        return main(
-            prompt=prompt,
-            temperature=temperature,
-            top_p=top_p,
-            max_seq_len=max_seq_len,
-            max_gpu_num_blocks=max_gpu_num_blocks,
-            max_gen_len=max_gen_len,
-            load_model=load_model,
-            compiled_model=compiled_model,
-            triton_weight=triton_weight,
-            gpu_type=gpu_type,
-            checkpoint_path=checkpoint_path,
-            quantize=quantize,
-            use_gptq=use_gptq,
-        )
-
-    CLI(main_with_gptq_option)
\ No newline at end of file
+        gpu_type = detect_device()
+        model_path = os.path.abspath(checkpoint_path)
+        if figure_path:
+            generate_llava(prompt=prompt, checkpoint_path=Path(model_path), figure_path=Path(figure_path),
+                           gpu_type=gpu_type)
+        else:
+            generate_llama(prompt=prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type)
+
+    CLI(main)
\ No newline at end of file
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
index 0114b2b..0ee1ecc 100755
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -63,7 +63,7 @@ def convert_qwen2_hf_to_litellama(
     mapping = {
         "model.norm.weight": "norm_weight",
         "model.embed_tokens.weight": "embed_tokens.weight",
-        "lm_head.weight": "lm_head_weight",  # 只支持 hf 格式模型权重
+        "lm_head.weight": "lm_head.weight",  # 只支持 hf 格式模型权重
     }
 
     # 映射层
@@ -146,8 +146,9 @@ def convert_qwen2_hf_to_litellama(
         target_layers = []
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
-                "q_proj_weight", "kv_proj_weight", "o_proj_weight",
-                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+                "q_proj.weight", "kv_proj_weight", "o_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight",
+                "lm_head.weight"  # Add lm_head to quantization targets
             ]) and "bias" not in name:
                 target_layers.append(name)
 
@@ -242,8 +243,9 @@ def convert_llama_torch_to_litellama(
         target_layers = []
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
-                "q_proj.weight", "k_proj.weight", "v_proj.weight", "o_proj.weight",
-                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+                "q_proj.weight", "kv_proj_weight", "o_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight",
+                "lm_head.weight"  # Add lm_head to quantization targets
             ]):
                 target_layers.append(name)
 
@@ -342,7 +344,8 @@ def convert_llama_hf_to_litellama(
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
                 "q_proj.weight", "kv_proj_weight", "o_proj.weight",
-                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight",
+                "lm_head.weight"  # Add lm_head to quantization targets
             ]):
                 target_layers.append(name)
 
@@ -450,8 +453,9 @@ def convert_llavallama_hf_to_litellama(
         target_layers = []
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
-                "q_proj.weight", "kv_proj_weight", "o_proj.weight",
-                "gate_proj.weight", "up_proj.weight", "down_proj.weight"
+                "q_proj.weight", "k_proj.weight", "o_proj.weight", "v_proj.weight",
+                "gate_proj.weight", "up_proj.weight", "down_proj.weight",
+                "lm_head.weight"  # Add lm_head to quantization targets
             ]) and "language_model" in name:
                 target_layers.append(name)
 
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index b3a4b9f..6baf074 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -82,7 +82,7 @@ def count_tokens(texts: List[str], tokenizer) -> int:
 
 
 def get_model_type(checkpoint_path: str) -> str | None:
-    from logger import log
+    from .logger import log
 
     model_type = ["llama", "falcon", "mpt", "qwen2", "llava"]
 

From 5dbe60a337a9a736617609b334c0fc8702609e5e Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 31 May 2025 22:36:47 +0930
Subject: [PATCH 21/33] fix requirement.txt

---
 requirement.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirement.txt b/requirement.txt
index c3ef2c7..e21926c 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -1,7 +1,7 @@
 tokenizers==0.20.3
 huggingface-hub==0.24.6
 transformers==4.46.3
-torch=2.1.2
+torch==2.1.2
 triton>=2.1.0
 tqdm==4.65.0
 pytest==8.3.3

From 5daf07d4ecbd45ebfc7d1ca864f53bb9372107ef Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sat, 31 May 2025 22:44:27 +0930
Subject: [PATCH 22/33] test for 2.2.0

---
 requirement.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirement.txt b/requirement.txt
index e21926c..d99aca5 100644
--- a/requirement.txt
+++ b/requirement.txt
@@ -1,7 +1,7 @@
 tokenizers==0.20.3
 huggingface-hub==0.24.6
 transformers==4.46.3
-torch==2.1.2
+torch==2.2.0
 triton>=2.1.0
 tqdm==4.65.0
 pytest==8.3.3

From 11ec09434bc724f1a450f66d9d9457221dbd756c Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Sun, 1 Jun 2025 02:00:00 +0930
Subject: [PATCH 23/33] fix inference with missing keys

---
 lite_llama/executor/model_executor.py       |  4 +++
 lite_llama/quantization/gptq/gptq_loader.py | 29 ++++++++++++---------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 8f7931d..8d28a93 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -140,6 +140,10 @@ def _load_model_weight(
             state_dict = torch.load(
                 ckpt_path, mmap=True, weights_only=True, map_location=device
             )
+            if any(key.endswith(".qweight") for key in state_dict.keys()):
+                from ..quantization.gptq.gptq_loader import create_dequantized_state_dict
+                log.info("Detected GPTQ quantized weights. Dequantizing...")
+                state_dict = create_dequantized_state_dict(state_dict)
         else:
             conversion_func = get_conversion_func(model_config.model_type)
             if conversion_func is None:
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
index 7c30c08..84b56f3 100644
--- a/lite_llama/quantization/gptq/gptq_loader.py
+++ b/lite_llama/quantization/gptq/gptq_loader.py
@@ -116,36 +116,39 @@ def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor])
 
     for key, value in quantized_state_dict.items():
         if '.qweight' in key:
-            # Extract base name
+            # Extract base name without the '.qweight' suffix
             base_name = key.replace('.qweight', '')
 
             if base_name not in processed_layers:
                 processed_layers.add(base_name)
 
-                # Get quantization parameters
+                # Retrieve quantization parameters
                 qweight = quantized_state_dict[f"{base_name}.qweight"]
                 qzeros = quantized_state_dict[f"{base_name}.qzeros"]
                 scales = quantized_state_dict[f"{base_name}.scales"]
                 wbits = quantized_state_dict.get(f"{base_name}.wbits", torch.tensor(4)).item()
+                original_cols = quantized_state_dict.get(f"{base_name}.original_cols")
+                if original_cols is not None:
+                    original_cols = int(original_cols)
+                # Dequantize to regular fp16 weights
+                weight = dequantize_weight(qweight, qzeros, scales, wbits, original_cols)
 
-                # Dequantize
-                weight = dequantize_weight(qweight, qzeros, scales, wbits)
-
-                # Store dequantized weight
-                # Handle different naming conventions
+                # Store dequantized weight; handle naming with or without '.weight'
                 if "_weight" in base_name:
-                    dequantized_dict[f"{base_name}"] = weight
+                    dequantized_dict[base_name] = weight
                 else:
                     dequantized_dict[f"{base_name}.weight"] = weight
 
-                # Copy bias if exists
-                bias_keys = [f"{base_name}.bias", f"{base_name}_bias"]
-                for bias_key in bias_keys:
+                # Copy bias if present
+                for bias_key in (f"{base_name}.bias", f"{base_name}_bias"):
                     if bias_key in quantized_state_dict:
                         dequantized_dict[bias_key] = quantized_state_dict[bias_key]
 
-        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize']):
-            # Copy non-quantization related parameters as-is
+
+
+        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize', '.original_cols']):
+
+            # Preserve all other parameters
             dequantized_dict[key] = value
 
     print(f"Dequantized {len(processed_layers)} layers")

From 6cf54e66edffb2b8fbb9ef2c01480e64516ec657 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Mon, 2 Jun 2025 03:59:37 +0930
Subject: [PATCH 24/33] fixed gptq missing int4 inference

---
 apply_weight_convert.py                     |  52 +--
 generate.py                                 |   4 +-
 lite_llama/executor/model_executor.py       |  22 +-
 lite_llama/executor/weight_convert.py       |  10 +-
 lite_llama/models/RotaryEmbedding.py        |   2 +-
 lite_llama/quantization/gptq/gptq.py        | 493 +++++++++++---------
 lite_llama/quantization/gptq/gptq_loader.py |  15 +-
 lite_llama/utils/common.py                  |  15 +-
 8 files changed, 321 insertions(+), 292 deletions(-)

diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index 916272d..3803019 100755
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -42,8 +42,8 @@ def main():
     parser.add_argument(
         "--groupsize",
         type=int,
-        default=128,
-        help="Group size for quantization (default: 128)"
+        default=8,
+        help="Group size for quantization (default: 8)"
     )
     parser.add_argument(
         "--device",
@@ -154,50 +154,4 @@ def main():
     # If script is run directly without arguments, use default values
     import sys
 
-    if len(sys.argv) == 1:
-        # Legacy behavior - use hardcoded path
-        checkpoints_dir = "/path/llm_weights/llava-v1.5-7b"
-
-        print(f"Running with default path: {checkpoints_dir}")
-        print("To use command line arguments, run with --help")
-        print("-" * 50)
-
-        if "llava" in checkpoints_dir.lower():
-            model = (
-                LlavaForConditionalGeneration.from_pretrained(
-                    checkpoints_dir,
-                    torch_dtype=torch.float16,
-                    low_cpu_mem_usage=True,
-                ).to("cuda")
-            )
-        else:
-            model = AutoModelForCausalLM.from_pretrained(
-                checkpoints_dir,
-                torch_dtype=torch.float16,
-                low_cpu_mem_usage=True,
-            ).to("cuda")
-
-        hf_sd = model.state_dict()
-
-        if "qwen2" in checkpoints_dir.lower():
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print("num_layers: ", num_layers)
-            convert_qwen2_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-        elif "llama" in checkpoints_dir.lower():
-            llm_config = AutoConfig.from_pretrained(checkpoints_dir)
-            num_layers = llm_config.num_hidden_layers
-            print("num_layers: ", num_layers)
-            convert_llama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-
-        elif "llava" in checkpoints_dir.lower():
-            llava_config = LlavaConfig.from_pretrained(checkpoints_dir)
-            num_layers = llava_config.text_config.num_hidden_layers
-            print("num_layers: ", num_layers)
-            convert_llavallama_hf_to_litellama(checkpoints_dir, hf_sd, num_layers)
-        else:
-            print("Error! Unsupported model type!")
-    else:
-        # Use argparse for command line interface
-        main()
\ No newline at end of file
+    main()
\ No newline at end of file
diff --git a/generate.py b/generate.py
index ff1a9c8..7f1028f 100644
--- a/generate.py
+++ b/generate.py
@@ -7,7 +7,7 @@
 import warnings
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
+from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
 from lite_llama.llava_generate_stream import LlavaGeneratorStream
 
 import sys, os, time
@@ -16,7 +16,7 @@
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))
 import psutil
-from utils.logger import log
+from lite_llama.utils.logger import log
 import argparse
 from argparse import RawTextHelpFormatter
 
diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 8d28a93..833a8d0 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -19,13 +19,8 @@
     convert_qwen2_hf_to_litellama,
 )
 from ..kernels import update_kv_index
-
-
-import sys, os
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
-from utils.logger import log
-
+from ..utils.logger import log
+from ..utils.common import get_model_dtype
 
 def get_conversion_func(model_type: str):
     """
@@ -154,6 +149,19 @@ def _load_model_weight(
                 f"Weight conversion completed. Time elapsed: {time.time() - start_time:.2f} sec"
             )
 
+        # Some checkpoints may use the HuggingFace naming style with a dot
+        # before "weight" or "bias" for fused kv projections.  Rename these
+        # keys to match our Module definition so that `load_state_dict` can
+        # succeed without requiring an explicit conversion step.
+        renamed_state_dict = {}
+        for k, v in state_dict.items():
+            new_key = k
+            if "kv_proj.weight" in k:
+                new_key = k.replace("kv_proj.weight", "kv_proj_weight")
+            elif "kv_proj.bias" in k:
+                new_key = k.replace("kv_proj.bias", "kv_proj_bias")
+            renamed_state_dict[new_key] = v
+        state_dict = renamed_state_dict
         model.load_state_dict(
             state_dict, strict=True, assign=True
         )  # 将加载的 state_dict 应用到模型实例中。
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
index 0ee1ecc..bceecbb 100755
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -30,7 +30,7 @@ def build_new_weight_dir(checkpoints_dir: str, new_sd, quantized: bool = False):
     json_files = glob.glob(osp.join(checkpoints_dir, "*.json"))
     for file_path in json_files:
         shutil.copy(file_path, my_weight_dir)  # 复制 hf 权重目录的所有 json 文件到新的目录
-        print(f"已复制: {file_path} -> {my_weight_dir}")
+        print(f"Copied: {file_path} -> {my_weight_dir}")
 
     if osp.exists(osp.join(checkpoints_dir, "tokenizer.model")):
         shutil.copy(osp.join(checkpoints_dir, "tokenizer.model"), my_weight_dir)
@@ -44,7 +44,7 @@ def convert_qwen2_hf_to_litellama(
         device: str = "cuda",
         use_gptq: bool = False,
         wbits: int = 4,
-        groupsize: int = 128,
+        groupsize: int = 8,
 ) -> Dict[str, torch.Tensor]:
     """
     将 Hugging Face 格式的预训练模型的权重字典转换为自定义模型的权重字典。
@@ -184,7 +184,7 @@ def convert_llama_torch_to_litellama(
         num_layers,
         use_gptq: bool = False,
         wbits: int = 4,
-        groupsize: int = 128,
+        groupsize: int = 8,
         device: str = "cuda"
 ):
     """
@@ -267,7 +267,7 @@ def convert_llama_hf_to_litellama(
         num_layers,
         use_gptq: bool = False,
         wbits: int = 4,
-        groupsize: int = 128,
+        groupsize: int = 8,
         device: str = "cuda"
 ):
     """
@@ -373,7 +373,7 @@ def convert_llavallama_hf_to_litellama(
         num_layers,
         use_gptq: bool = False,
         wbits: int = 4,
-        groupsize: int = 128,
+        groupsize: int = 8,
         device: str = "cuda"
 ):
     """
diff --git a/lite_llama/models/RotaryEmbedding.py b/lite_llama/models/RotaryEmbedding.py
index 07be59d..e3f0c0a 100644
--- a/lite_llama/models/RotaryEmbedding.py
+++ b/lite_llama/models/RotaryEmbedding.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 from typing import Optional, Tuple
 from .model_config import LlamaConfig, Qwen2Config
-from utils.logger import log
+from ..utils.logger import log
 
 
 
diff --git a/lite_llama/quantization/gptq/gptq.py b/lite_llama/quantization/gptq/gptq.py
index b614fde..ab10fd9 100755
--- a/lite_llama/quantization/gptq/gptq.py
+++ b/lite_llama/quantization/gptq/gptq.py
@@ -1,85 +1,40 @@
 import torch
 import torch.nn as nn
 import numpy as np
-from typing import Dict, Tuple, Optional
+from typing import Dict, Tuple, Optional, Any
 from tqdm.auto import tqdm
 import math
 
 
-def pack_int4_weights(qweight: torch.Tensor, wbits: int = 4) -> torch.Tensor:
+def pack_int4(qweight: torch.Tensor) -> torch.Tensor:
     """
-    Pack quantized weights into int32 for efficient storage.
-    For 4-bit quantization, pack 8 weights into one int32.
-
-    Args:
-        qweight: Quantized weight tensor of shape (rows, cols) with values in [0, 15]
-        wbits: Number of bits per weight (4 for int4)
-
-    Returns:
-        Packed weight tensor of shape (rows, cols // 8)
+    [rows, cols] uint8 in [0, 15] -> [rows, ceil(cols/2)] uint8
     """
-    assert wbits == 4, "This function currently only supports 4-bit packing"
-
     rows, cols = qweight.shape
-    pack_factor = 32 // wbits  # 8 for 4-bit
-
-    # Ensure we can pack evenly
-    if cols % pack_factor != 0:
-        # Pad columns to make it divisible by pack_factor
-        pad_cols = pack_factor - (cols % pack_factor)
-        qweight = torch.nn.functional.pad(qweight, (0, pad_cols), value=0)
-        cols = qweight.shape[1]
-
-    packed_cols = cols // pack_factor
-    packed = torch.zeros((rows, packed_cols), dtype=torch.int32, device=qweight.device)
-
-    # Pack weights
-    for i in range(pack_factor):
-        packed |= (qweight[:, i::pack_factor].to(torch.int32) & 0xF) << (i * 4)
-
-    return packed
+    if cols % 2 != 0:
+        qweight = torch.nn.functional.pad(qweight, (0, 1), value=0)
+        cols += 1
+    packed = (qweight[:, 0::2] & 0xF) | ((qweight[:, 1::2] & 0xF) << 4)
+    return packed.contiguous()
 
 
-def unpack_int4_weights(packed: torch.Tensor, original_cols: int, wbits: int = 4) -> torch.Tensor:
+def unpack_int4(packed: torch.Tensor, orig_cols: int) -> torch.Tensor:
     """
-    Unpack int4 weights from int32 storage.
-
-    Args:
-        packed: Packed weight tensor
-        original_cols: Original number of columns before packing
-        wbits: Number of bits per weight
-
-    Returns:
-        Unpacked weight tensor
+    [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
     """
-    assert wbits == 4, "This function currently only supports 4-bit unpacking"
-
     rows, packed_cols = packed.shape
-    pack_factor = 32 // wbits  # 8 for 4-bit
-
-    # Calculate unpacked dimensions
-    unpacked_cols = packed_cols * pack_factor
-    unpacked = torch.zeros((rows, unpacked_cols), dtype=torch.int32, device=packed.device)
-
-    # Unpack weights
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (packed >> (i * 4)) & 0xF
-
-    # Remove padding if necessary
-    return unpacked[:, :original_cols]
+    qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
+    qweight[:, 0::2] = packed & 0xF
+    qweight[:, 1::2] = (packed >> 4) & 0xF
+    return qweight[:, :orig_cols].contiguous()
 
 
 class GPTQ:
-    """
-    Implementation of GPTQ (Generalized Post-Training Quantization) algorithm
-    for quantizing model weights to lower bit precision.
-    """
-
     def __init__(
             self,
-            layer: nn.Module,
+            layer: nn.Module = None,
             wbits: int = 4,
-            groupsize: int = 128,
+            groupsize: int = 8,
             actorder: bool = False,
             percdamp: float = 0.01,
             blocksize: int = 128,
@@ -94,132 +49,257 @@ def __init__(
         self.device = device
         self.maxq = 2 ** wbits - 1
 
-        # Initialize quantization parameters
-        self.H = None
-        self.dead = None
-        self.rows = None
-        self.columns = None
+    def relative_error_loss(self, w_original: torch.Tensor, w_reconstructed: torch.Tensor,
+                            eps: float = 1e-5) -> torch.Tensor:
+        """Compute relative error loss with better handling of small weights"""
+        abs_diff = (w_original - w_reconstructed).abs()
+
+        # Use adaptive epsilon based on weight magnitude distribution
+        w_abs = w_original.abs()
+        adaptive_eps = torch.maximum(
+            torch.tensor(eps, device=w_original.device),
+            0.01 * w_abs.median()  # Use median as robust estimate
+        )
+
+        rel_err = abs_diff / (w_abs + adaptive_eps)
+
+        # Use robust loss to handle outliers
+        return rel_err.mean() + 0.1 * rel_err.pow(2).mean()
+
+    def optimize_for_relative_error(self, w_group: torch.Tensor, max_iter: int = 200) -> Tuple[
+        torch.Tensor, torch.Tensor]:
+        """Optimize scale and zero specifically for minimal relative error"""
+        device = w_group.device
+
+        # Separate handling for near-zero and normal weights
+        w_abs = w_group.abs()
+        w_median = w_abs.median()
+        small_weight_threshold = 0.1 * w_median
+
+        # Initialize with better starting points
+        w_min = w_group.min(dim=-1, keepdim=True)[0]
+        w_max = w_group.max(dim=-1, keepdim=True)[0]
+
+        # For groups with many small weights, use tighter bounds
+        if (w_abs < small_weight_threshold).float().mean() > 0.3:
+            # Use percentile-based bounds for groups with many small weights
+            w_flat = w_group.view(w_group.shape[0], -1)
+            w_sorted = torch.sort(w_flat, dim=-1)[0]
+            n = w_sorted.shape[-1]
+            w_min = w_sorted[:, max(0, int(0.05 * n)):max(1, int(0.05 * n) + 1)]
+            w_max = w_sorted[:, min(n - 1, int(0.95 * n)):min(n, int(0.95 * n) + 1)]
+
+        range_val = w_max - w_min
+        range_val = torch.where(range_val < 1e-8, torch.tensor(1e-6, device=device), range_val)
+
+        # Initialize parameters
+        scale = nn.Parameter((range_val / self.maxq).clamp(min=1e-8))
+        zero = nn.Parameter(torch.round(-w_min / scale).clamp(0, self.maxq))
+
+        optimizer = torch.optim.AdamW([scale, zero], lr=0.005, weight_decay=1e-6)
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter)
+
+        best_loss = float('inf')
+        best_scale = scale.data.clone()
+        best_zero = zero.data.clone()
+        patience = 20
+        no_improve = 0
+
+        for i in range(max_iter):
+            optimizer.zero_grad()
+
+            # Ensure valid range
+            scale.data.clamp_(min=1e-8, max=1e3)
+            zero.data.clamp_(0, self.maxq)
+
+            # Quantize and dequantize
+            q = torch.clamp(torch.round(w_group / scale + zero), 0, self.maxq)
+            w_rec = (q - zero) * scale
+
+            # Use relative error loss
+            loss = self.relative_error_loss(w_group, w_rec)
+
+            if loss.item() < best_loss:
+                best_loss = loss.item()
+                best_scale = scale.data.clone()
+                best_zero = zero.data.clone()
+                no_improve = 0
+            else:
+                no_improve += 1
+                if no_improve >= patience:
+                    break
+
+            loss.backward()
+
+            # Gradient clipping for stability
+            torch.nn.utils.clip_grad_norm_([scale, zero], 1.0)
+
+            optimizer.step()
+            scheduler.step()
+
+        return best_scale.detach(), best_zero.detach()
+
+    def magnitude_aware_quantization(self, w_group: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Use different strategies based on weight magnitudes"""
+        device = w_group.device
+        w_abs = w_group.abs()
+        w_std = w_group.std(dim=-1, keepdim=True)
+        w_mean = w_group.mean(dim=-1, keepdim=True)
+
+        # Strategy 1: For groups with large dynamic range, use log-scale quantization
+        dynamic_range = w_abs.max(dim=-1, keepdim=True)[0] / (w_abs.min(dim=-1, keepdim=True)[0] + 1e-8)
+
+        if dynamic_range.mean() > 100:  # High dynamic range
+            # Use log-space quantization for better relative precision
+            sign = torch.sign(w_group)
+            w_abs_log = torch.log(w_abs + 1e-8)
+
+            log_min = w_abs_log.min(dim=-1, keepdim=True)[0]
+            log_max = w_abs_log.max(dim=-1, keepdim=True)[0]
+
+            scale_log = (log_max - log_min) / (self.maxq - 1)
+            zero_log = torch.round(-log_min / scale_log).clamp(0, self.maxq - 1)
 
-    def add_batch(self, inp: torch.Tensor, out: torch.Tensor):
-        """Add a batch of data to compute Hessian matrix"""
-        if len(inp.shape) == 2:
-            inp = inp.unsqueeze(0)
+            # Convert back to linear scale
+            q_log = torch.clamp(torch.round((w_abs_log - log_min) / scale_log), 0, self.maxq - 1)
+            w_abs_rec = torch.exp(log_min + q_log * scale_log)
+            w_rec = sign * w_abs_rec
+
+            # Compute equivalent linear scale and zero
+            scale = (w_group.max(dim=-1, keepdim=True)[0] - w_group.min(dim=-1, keepdim=True)[0]) / self.maxq
+            zero = torch.round(-w_group.min(dim=-1, keepdim=True)[0] / scale).clamp(0, self.maxq)
 
-        tmp = inp.shape[0]
-        if len(inp.shape) == 3:
-            inp = inp.reshape((-1, inp.shape[-1]))
-        inp = inp.t()
+        else:
+            # Strategy 2: For normal range, use adaptive clipping
+            # Use robust statistics to set bounds
+            median = w_group.median(dim=-1, keepdim=True)[0]
+            mad = (w_group - median).abs().median(dim=-1, keepdim=True)[0]  # Median Absolute Deviation
 
-        if self.H is None:
-            self.H = torch.zeros((inp.shape[0], inp.shape[0]), device=self.device)
+            # Set bounds using robust statistics
+            bound = 3.0 * mad
+            w_min = torch.maximum(w_group.min(dim=-1, keepdim=True)[0], median - bound)
+            w_max = torch.minimum(w_group.max(dim=-1, keepdim=True)[0], median + bound)
 
-        self.H += 2 / tmp * inp.matmul(inp.t())
+            range_val = w_max - w_min
+            range_val = torch.where(range_val < 1e-8, torch.tensor(1e-6, device=device), range_val)
 
-    def quantize(self, weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-        """
-        Quantize the weight matrix using GPTQ algorithm
+            scale = range_val / self.maxq
+            zero = torch.round(-w_min / scale).clamp(0, self.maxq)
 
-        Returns:
-            - qweight: quantized weights (packed if 4-bit)
-            - qzeros: zero points for each group
-            - scales: scales for each group
-            - original_cols: original number of columns (for unpacking)
-        """
-        W = weight.clone()
-        if not self.actorder:
-            # Standard quantization order
-            W = W.float()
-
-        rows, columns = W.shape[0], W.shape[1]
-        original_cols = columns
-
-        # Initialize Hessian
-        if self.H is None:
-            self.H = torch.eye(columns, device=self.device)
-
-        H = self.H
-        dead = torch.diag(H) == 0
-        H[dead, dead] = 1
-
-        # Add dampening
-        damp = self.percdamp * torch.mean(torch.diag(H))
-        diag = torch.arange(columns, device=self.device)
-        H[diag, diag] += damp
-
-        # Prepare quantization
-        scales = torch.zeros((rows, (columns + self.groupsize - 1) // self.groupsize), device=self.device)
-        qzeros = torch.zeros_like(scales, dtype=torch.int32)
-        qweight = torch.zeros_like(W, dtype=torch.int32)
-
-        # Cholesky decomposition
-        try:
-            H = torch.linalg.cholesky(H)
-        except:
-            print("Cholesky decomposition failed, using eigenvalue decomposition")
-            eigenvalues, eigenvectors = torch.linalg.eigh(H)
-            eigenvalues = eigenvalues.clamp(min=1e-10)
-            H = eigenvectors @ torch.diag(torch.sqrt(eigenvalues)) @ eigenvectors.T
-
-        H = torch.cholesky_inverse(H)
-        H = torch.linalg.cholesky(H, upper=True)
-        Hinv = H
-
-        # Quantize blocks
-        for i1 in range(0, columns, self.blocksize):
-            i2 = min(i1 + self.blocksize, columns)
-            count = i2 - i1
-
-            W1 = W[:, i1:i2].clone()
-            Q1 = torch.zeros_like(W1)
-            Err1 = torch.zeros_like(W1)
-            Losses1 = torch.zeros_like(W1)
-            Hinv1 = Hinv[i1:i2, i1:i2]
-
-            for i in range(count):
-                w = W1[:, i]
-                d = Hinv1[i, i]
-
-                # Find optimal quantization
-                if self.groupsize != float('inf'):
-                    g_idx = (i1 + i) // self.groupsize
-                    scale = scales[:, g_idx]
-                    zero = qzeros[:, g_idx]
-
-                    if scale.sum() == 0:  # Initialize scale and zero
-                        scale = W1[:, i].abs().max() / (self.maxq / 2)
-                        scales[:, g_idx] = scale
-                        zero = torch.round(-W1[:, i].min() / scale).clamp(0, self.maxq)
-                        qzeros[:, g_idx] = zero.to(torch.int32)
-                else:
-                    scale = W1[:, i].abs().max() / (self.maxq / 2)
-                    zero = torch.round(-W1[:, i].min() / scale).clamp(0, self.maxq)
-
-                # Quantize
-                q = torch.clamp(torch.round(w / scale) + zero, 0, self.maxq)
-                Q1[:, i] = q
-
-                # Dequantize and compute error
-                dq = (q - zero) * scale
-                err = (w - dq) / d
-                Err1[:, i] = err
-
-                # Update remaining weights
-                W1[:, i:] -= err.unsqueeze(1) * Hinv1[i, i:].unsqueeze(0)
-
-            qweight[:, i1:i2] = Q1.to(torch.int32)
-
-        # Pack weights if 4-bit
-        if self.wbits == 4:
-            qweight = pack_int4_weights(qweight, self.wbits)
-
-        return qweight, qzeros, scales, original_cols
+        return scale, zero
 
+    def quantize(self, W: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Quantization optimized specifically for minimal relative error
+        Returns: [O, I] int4, [O, num_groups] zero, [O, num_groups] scale
+        """
+        assert W.ndim == 2
+        rows, cols = W.shape
+        device = W.device
+
+        # Use very small groups for maximum precision
+        effective_groupsize = min(int(self.groupsize), 8) if self.groupsize != float('inf') else 8
+        effective_groupsize = max(effective_groupsize, 4)  # Minimum 4 for 4-bit
+        num_groups = (cols + effective_groupsize - 1) // effective_groupsize
+
+        qweight = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
+        scales = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
+        zeros = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
+
+        # Process each group with relative error optimization
+        for g in range(num_groups):
+            start_col = g * effective_groupsize
+            end_col = min((g + 1) * effective_groupsize, cols)
+
+            # Get current group
+            W_group = W[:, start_col:end_col].clone()
+
+            # Try different methods and pick best for relative error
+            methods = []
+
+            # Method 1: Relative error optimization
+            try:
+                scale_rel, zero_rel = self.optimize_for_relative_error(W_group, max_iter=100)
+                q_rel = torch.clamp(torch.round(W_group / scale_rel + zero_rel), 0, self.maxq)
+                w_rec_rel = (q_rel - zero_rel) * scale_rel
+                rel_error_rel = self.relative_error_loss(W_group, w_rec_rel).item()
+                methods.append(('rel_opt', scale_rel, zero_rel, q_rel, rel_error_rel))
+            except Exception as e:
+                print(f"Relative opt failed for group {g}: {e}")
+
+            # Method 2: Magnitude-aware quantization
+            try:
+                scale_mag, zero_mag = self.magnitude_aware_quantization(W_group)
+                q_mag = torch.clamp(torch.round(W_group / scale_mag + zero_mag), 0, self.maxq)
+                w_rec_mag = (q_mag - zero_mag) * scale_mag
+                rel_error_mag = self.relative_error_loss(W_group, w_rec_mag).item()
+                methods.append(('mag_aware', scale_mag, zero_mag, q_mag, rel_error_mag))
+            except Exception as e:
+                print(f"Magnitude aware failed for group {g}: {e}")
+
+            # Method 3: Ultra-conservative approach for small weights
+            w_abs = W_group.abs()
+            if w_abs.max() < 0.01:  # Very small weights
+                # Use much finer quantization resolution
+                w_min = W_group.min(dim=-1, keepdim=True)[0]
+                w_max = W_group.max(dim=-1, keepdim=True)[0]
+
+                # Tighten the range for small weights
+                range_val = w_max - w_min
+                range_val = torch.where(range_val < 1e-8, torch.tensor(1e-8, device=device), range_val)
+
+                scale_small = range_val / self.maxq * 0.8  # Use 80% of range for safety
+                zero_small = torch.round(-w_min / scale_small).clamp(0, self.maxq)
+
+                q_small = torch.clamp(torch.round(W_group / scale_small + zero_small), 0, self.maxq)
+                w_rec_small = (q_small - zero_small) * scale_small
+                rel_error_small = self.relative_error_loss(W_group, w_rec_small).item()
+                methods.append(('small_weights', scale_small, zero_small, q_small, rel_error_small))
+
+            # Pick the method with lowest relative error
+            if methods:
+                best_method = min(methods, key=lambda x: x[4])
+                method_name, scale_best, zero_best, q_best, _ = best_method
+
+                qweight[:, start_col:end_col] = q_best.to(torch.uint8)
+                scales[:, g] = scale_best.squeeze(-1)
+                zeros[:, g] = zero_best.squeeze(-1)
+            else:
+                # Ultimate fallback
+                print(f"All methods failed for group {g}, using zero quantization")
+                qweight[:, start_col:end_col] = 0
+                scales[:, g] = 1.0
+                zeros[:, g] = 0
+
+        return qweight, zeros.to(torch.float16), scales.to(torch.float16)
+
+
+    def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
+        """
+        [O, I] int4, [O, num_groups] zero, [O, num_groups] scale => [O, I] float32
+        """
+        rows, cols = qweight.shape
+        # Use same effective groupsize as quantization
+        effective_groupsize = min(self.groupsize, 8)
+        effective_groupsize = max(effective_groupsize, 4)
+        num_groups = (cols + effective_groupsize - 1) // effective_groupsize
+        W = torch.zeros_like(qweight, dtype=torch.float32)
+
+        for g in range(num_groups):
+            start = g * effective_groupsize
+            end = min((g + 1) * effective_groupsize, cols)
+            scale = scales[:, g].unsqueeze(1)  # [O, 1]
+            zero = zeros[:, g].unsqueeze(1)  # [O, 1]
+            q = qweight[:, start:end].float()
+            W[:, start:end] = (q - zero) * scale
+
+        return W
 
 def quantize_gptq(
         model_state_dict: Dict[str, torch.Tensor],
         calibration_data: Optional[torch.Tensor] = None,
         wbits: int = 4,
-        groupsize: int = 128,
+        groupsize: int = 8,
         target_layers: Optional[list] = None,
         device: str = "cuda"
 ) -> Dict[str, torch.Tensor]:
@@ -265,12 +345,8 @@ def quantize_gptq(
             # Move weight to device
             weight = param.to(device).float()
 
-            # If no calibration data, use identity Hessian
-            if calibration_data is None:
-                gptq.H = torch.eye(weight.shape[1], device=device)
-
             # Quantize the weight
-            qweight, qzeros, scales, original_cols = gptq.quantize(weight)
+            qweight, qzeros, scales = gptq.quantize(weight)
 
             # Store quantized parameters
             base_name = name.replace(".weight", "").replace("_weight", "")
@@ -279,7 +355,6 @@ def quantize_gptq(
             quantized_state_dict[f"{base_name}.scales"] = scales.cpu()
             quantized_state_dict[f"{base_name}.wbits"] = torch.tensor(wbits)
             quantized_state_dict[f"{base_name}.groupsize"] = torch.tensor(groupsize)
-            quantized_state_dict[f"{base_name}.original_cols"] = torch.tensor(original_cols)
 
         else:
             # Keep non-quantized parameters as is
@@ -287,43 +362,29 @@ def quantize_gptq(
 
     return quantized_state_dict
 
+if __name__ == '__main__':
+    def test_gptq_groupwise():
+        torch.manual_seed(0)
+        rows, cols = 512, 1024
+        W = torch.randn(rows, cols, device="cuda")
 
-def dequantize_weight(qweight, qzeros, scales, wbits=4, original_cols=None):
-    """
-    Dequantize weight for inference
-
-    Args:
-        qweight: Quantized weights (packed if 4-bit)
-        qzeros: Zero points
-        scales: Scales
-        wbits: Number of bits used for quantization
-        original_cols: Original number of columns (for unpacking)
-
-    Returns:
-        Dequantized weight tensor
-    """
-    # Unpack if 4-bit
-    if wbits == 4 and original_cols is not None:
-        qweight = unpack_int4_weights(qweight, original_cols, wbits)
-
-    # Get dimensions
-    rows, columns = qweight.shape
-    groupsize = columns // scales.shape[1]
-
-    # Prepare output tensor
-    weight = torch.zeros((rows, columns), dtype=torch.float32, device=qweight.device)
+        # Test with relative error optimization
+        gptq = GPTQ(wbits=4, groupsize=8, device=W.device)
+        qweight, zeros, scales = gptq.quantize(W)
+        packed = pack_int4(qweight)
 
-    # Dequantize each group
-    for g in range(scales.shape[1]):
-        start_idx = g * groupsize
-        end_idx = min((g + 1) * groupsize, columns)
+        qweight_unpacked = unpack_int4(packed, orig_cols=cols)
+        W_rec = gptq.dequantize(qweight_unpacked, zeros, scales)
 
-        # Extract group quantized values
-        group_qweight = qweight[:, start_idx:end_idx].float()
-        group_scales = scales[:, g].unsqueeze(1)
-        group_zeros = qzeros[:, g].unsqueeze(1).float()
+        abs_err = (W - W_rec).abs()
+        rel_err = abs_err / (W.abs() + 1e-5)  # Use better epsilon
+        print("== Relative Error Optimized GPTQ (groupsize=4) ==")
+        print(f"Mean abs error: {abs_err.mean().item():.6f}")
+        print(f"Mean rel error: {rel_err.mean().item():.6f}")
+        print(f"Max abs error: {abs_err.max().item():.6f}")
+        print(f"Max rel error: {rel_err.max().item():.6f}")
+        print(f"95th percentile rel error: {rel_err.quantile(0.95).item():.6f}")
+        print(f"99th percentile rel error: {rel_err.quantile(0.99).item():.6f}")
 
-        # Dequantize
-        weight[:, start_idx:end_idx] = (group_qweight - group_zeros) * group_scales
 
-    return weight
\ No newline at end of file
+    test_gptq_groupwise()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
index 84b56f3..3f5ec90 100644
--- a/lite_llama/quantization/gptq/gptq_loader.py
+++ b/lite_llama/quantization/gptq/gptq_loader.py
@@ -2,7 +2,7 @@
 import torch.nn as nn
 from typing import Dict, Optional
 import os.path as osp
-from .gptq import dequantize_weight
+from .gptq import *
 
 
 class GPTQLinear(nn.Module):
@@ -21,10 +21,10 @@ def __init__(self, qweight, qzeros, scales, wbits=4, bias=None):
             self.register_buffer('bias', bias)
         else:
             self.bias = None
-
+        self.gptq = GPTQ
     def forward(self, x):
         # Dequantize weight on-the-fly
-        weight = dequantize_weight(
+        weight = self.gptq.dequantize(
             self.qweight,
             self.qzeros,
             self.scales,
@@ -127,11 +127,10 @@ def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor])
                 qzeros = quantized_state_dict[f"{base_name}.qzeros"]
                 scales = quantized_state_dict[f"{base_name}.scales"]
                 wbits = quantized_state_dict.get(f"{base_name}.wbits", torch.tensor(4)).item()
-                original_cols = quantized_state_dict.get(f"{base_name}.original_cols")
-                if original_cols is not None:
-                    original_cols = int(original_cols)
+
                 # Dequantize to regular fp16 weights
-                weight = dequantize_weight(qweight, qzeros, scales, wbits, original_cols)
+                gptq = GPTQ(wbits=4, groupsize=8)
+                weight = gptq.dequantize(qweight, qzeros, scales)
 
                 # Store dequantized weight; handle naming with or without '.weight'
                 if "_weight" in base_name:
@@ -146,7 +145,7 @@ def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor])
 
 
 
-        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize', '.original_cols']):
+        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize']):
 
             # Preserve all other parameters
             dequantized_dict[key] = value
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 6baf074..e197b51 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -161,7 +161,7 @@ def get_model_info(model_path):
     return model_info
 
 
-def get_model_dtype(checkpoints_dir: str) -> torch.dtype:
+def get_model_dtype(checkpoints_dir: str):
     """
     Get the model dtype from config.json
 
@@ -169,7 +169,7 @@ def get_model_dtype(checkpoints_dir: str) -> torch.dtype:
         checkpoints_dir: Path to model checkpoint directory
 
     Returns:
-        torch.dtype: The dtype specified in config.json
+        torch.dtype or str: The dtype specified in config.json
     """
     config_path = os.path.join(checkpoints_dir, "config.json")
 
@@ -177,14 +177,16 @@ def get_model_dtype(checkpoints_dir: str) -> torch.dtype:
         with open(config_path, 'r') as f:
             config = json.load(f)
 
-        torch_dtype_str = config.get("torch_dtype", "float16")
+        torch_dtype_str = config.get("torch_dtype", "float16").lower()
 
-        # Map string to torch dtype
+        # Map string to torch dtype or string identifiers for quantized formats
         dtype_mapping = {
             "float16": torch.float16,
             "bfloat16": torch.bfloat16,
             "float32": torch.float32,
             "float": torch.float32,
+            "int8": torch.int8,
+            "int4": "int4",  # Placeholder, since PyTorch doesn't natively support int4
         }
 
         dtype = dtype_mapping.get(torch_dtype_str, torch.float16)
@@ -197,3 +199,8 @@ def get_model_dtype(checkpoints_dir: str) -> torch.dtype:
         print("Defaulting to torch.float16")
         return torch.float16
 
+    except Exception as e:
+        print(f"Warning: Could not read dtype from config.json: {e}")
+        print("Defaulting to torch.float16")
+        return torch.float16
+

From b5cd0d593c5119be55c8289d5fe246eb05e04f37 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Wed, 2 Jul 2025 16:47:41 +0930
Subject: [PATCH 25/33] update gptq int4 kernel

---
 generate.py                                 |  66 ++--
 lite_llama/kernels/others/rotary_emb_v1.py  |   2 +-
 lite_llama/quantization/gptq/gptq.py        | 398 +++++++++++++++++---
 lite_llama/quantization/gptq/gptq_loader.py |  71 ----
 lite_llama/utils/common.py                  |  23 +-
 5 files changed, 407 insertions(+), 153 deletions(-)

diff --git a/generate.py b/generate.py
index 7f1028f..ea90bec 100644
--- a/generate.py
+++ b/generate.py
@@ -7,7 +7,7 @@
 import warnings
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
+from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type, quantization
 from lite_llama.llava_generate_stream import LlavaGeneratorStream
 
 import sys, os, time
@@ -17,8 +17,6 @@
 sys.path.append(str(wd))
 import psutil
 from lite_llama.utils.logger import log
-import argparse
-from argparse import RawTextHelpFormatter
 
 process = psutil.Process(os.getpid())
 
@@ -41,6 +39,7 @@ def report_resource_usage(ram_before, vram_before) -> None:
 
 def generate_llama(
         prompt: str = "Hello, my name is",
+        quantize: Optional[str] = None,
         *,
         temperature: float = 0.6,
         top_p: float = 0.9,
@@ -52,7 +51,6 @@ def generate_llama(
         triton_weight: bool = True,
         gpu_type: str = "nvidia",
         checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
-        quantize: Optional[str] = None,
 ):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     assert checkpoint_path.is_dir(), checkpoint_path
@@ -62,22 +60,25 @@ def generate_llama(
     else:
         short_prompt = False
     model_prompter = get_prompter(get_model_type(checkpoint_path), checkpoint_path, short_prompt)
+
     # Start resource tracking
     ram_before = process.memory_info().rss
 
     vram_before = get_gpu_memory(gpu_type)
 
     # Init LLM generator
-    generator = GenerateStreamText(
-        checkpoints_dir=checkpoint_path,
-        tokenizer_path=checkpoint_path,
-        max_gpu_num_blocks=max_gpu_num_blocks,
-        max_seq_len=max_seq_len,
-        load_model=load_model,
-        compiled_model=compiled_model,
-        triton_weight=triton_weight,
-        device=device,
-    )
+    with quantization(quantize):
+
+        generator = GenerateStreamText(
+            checkpoints_dir=checkpoint_path,
+            tokenizer_path=checkpoint_path,
+            max_gpu_num_blocks=max_gpu_num_blocks,
+            max_seq_len=max_seq_len,
+            load_model=load_model,
+            compiled_model=compiled_model,
+            triton_weight=triton_weight,
+            device=device,
+        )
 
 
     model_prompter.insert_prompt(prompt)
@@ -113,6 +114,7 @@ def generate_llava(
         checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
         figure_path: Path = Path("figures/lit-llama/"),
         gpu_type: str = "nvidia",
+        quantize: Optional[str] = None,
         temperature: float = 0.6,
         top_p: float = 0.9,
         max_seq_len: int = 2048,
@@ -145,20 +147,22 @@ def generate_llava(
     vram_before = get_gpu_memory(gpu_type)
 
     # Initializing the Multimodal Model Text Generator
-    try:
-        generator = LlavaGeneratorStream(
-            checkpoints_dir=checkpoint_path,
-            tokenizer_path=checkpoint_path,
-            max_gpu_num_blocks=max_gpu_num_blocks,
-            max_seq_len=max_seq_len,
-            load_model=load_model,
-            compiled_model=compiled_model,
-            triton_weight=triton_weight,
-            device=device,
-        )
-    except Exception as e:
-        log.error(f"Model loading failure: {e}")
-        sys.exit(1)
+    with quantization(quantize):
+
+        try:
+            generator = LlavaGeneratorStream(
+                checkpoints_dir=checkpoint_path,
+                tokenizer_path=checkpoint_path,
+                max_gpu_num_blocks=max_gpu_num_blocks,
+                max_seq_len=max_seq_len,
+                load_model=load_model,
+                compiled_model=compiled_model,
+                triton_weight=triton_weight,
+                device=device,
+            )
+        except Exception as e:
+            log.error(f"Model loading failure: {e}")
+            sys.exit(1)
 
     image_token = get_image_token()
     model_prompter.insert_prompt(image_token * image_num + prompt)
@@ -202,6 +206,7 @@ def main(
         prompt: str = "Hello, my name is",
         checkpoint_path: Path = Path("checkpoints/lite-llama/7B/"),
         figure_path: Optional[Path] = None,
+        quant: str = "gpt.int4"
     ):
         """
         Generate text using lite_llama with automatic GPTQ detection
@@ -210,14 +215,15 @@ def main(
             prompt: Input prompt text
             checkpoint_path: Path to model checkpoint directory
             figure_path: Path to Image file for LLaVA generation, optional
+            quant: GPTQ quantization mode
         """
         # Determine use_gptq based on force flags
         gpu_type = detect_device()
         model_path = os.path.abspath(checkpoint_path)
         if figure_path:
             generate_llava(prompt=prompt, checkpoint_path=Path(model_path), figure_path=Path(figure_path),
-                           gpu_type=gpu_type)
+                           gpu_type=gpu_type, quantization=quant)
         else:
-            generate_llama(prompt=prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type)
+            generate_llama(prompt=prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type, quantization=quant)
 
     CLI(main)
\ No newline at end of file
diff --git a/lite_llama/kernels/others/rotary_emb_v1.py b/lite_llama/kernels/others/rotary_emb_v1.py
index 6ff6765..ccf0dbc 100644
--- a/lite_llama/kernels/others/rotary_emb_v1.py
+++ b/lite_llama/kernels/others/rotary_emb_v1.py
@@ -238,4 +238,4 @@ def torch_rotary_emb(x, cos, sin):
         f"The maximum difference between torch and triton is {torch.max(torch.abs(output_torch - q_out))}"
     )
     print("torch:", triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
-    print("triton:", triton.testing.do_bench(lambda: torch_rotary_emb(q, cos, sin)))
+    print("triton:", triton.testing.do_bench(lambda: rotary_emb_fwd(q, k, cos, sin)))
diff --git a/lite_llama/quantization/gptq/gptq.py b/lite_llama/quantization/gptq/gptq.py
index ab10fd9..324984f 100755
--- a/lite_llama/quantization/gptq/gptq.py
+++ b/lite_llama/quantization/gptq/gptq.py
@@ -3,30 +3,13 @@
 import numpy as np
 from typing import Dict, Tuple, Optional, Any
 from tqdm.auto import tqdm
-import math
+import triton
+import triton.language as tl
+import time, gc, psutil, os, sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 
+from lite_llama.utils.common import get_gpu_memory  # Replace with actual GPU mem check if needed
 
-def pack_int4(qweight: torch.Tensor) -> torch.Tensor:
-    """
-    [rows, cols] uint8 in [0, 15] -> [rows, ceil(cols/2)] uint8
-    """
-    rows, cols = qweight.shape
-    if cols % 2 != 0:
-        qweight = torch.nn.functional.pad(qweight, (0, 1), value=0)
-        cols += 1
-    packed = (qweight[:, 0::2] & 0xF) | ((qweight[:, 1::2] & 0xF) << 4)
-    return packed.contiguous()
-
-
-def unpack_int4(packed: torch.Tensor, orig_cols: int) -> torch.Tensor:
-    """
-    [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
-    """
-    rows, packed_cols = packed.shape
-    qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
-    qweight[:, 0::2] = packed & 0xF
-    qweight[:, 1::2] = (packed >> 4) & 0xF
-    return qweight[:, :orig_cols].contiguous()
 
 
 class GPTQ:
@@ -203,8 +186,8 @@ def quantize(self, W: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.T
         num_groups = (cols + effective_groupsize - 1) // effective_groupsize
 
         qweight = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
-        scales = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
-        zeros = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
+        scales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+        zeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
 
         # Process each group with relative error optimization
         for g in range(num_groups):
@@ -276,14 +259,14 @@ def quantize(self, W: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.T
 
     def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
         """
-        [O, I] int4, [O, num_groups] zero, [O, num_groups] scale => [O, I] float32
+        [O, I] int4, [O, num_groups] zero, [O, num_groups] scale => [O, I] float16
         """
         rows, cols = qweight.shape
         # Use same effective groupsize as quantization
         effective_groupsize = min(self.groupsize, 8)
         effective_groupsize = max(effective_groupsize, 4)
         num_groups = (cols + effective_groupsize - 1) // effective_groupsize
-        W = torch.zeros_like(qweight, dtype=torch.float32)
+        W = torch.zeros_like(qweight, dtype=torch.float16)
 
         for g in range(num_groups):
             start = g * effective_groupsize
@@ -362,29 +345,350 @@ def quantize_gptq(
 
     return quantized_state_dict
 
-if __name__ == '__main__':
-    def test_gptq_groupwise():
-        torch.manual_seed(0)
-        rows, cols = 512, 1024
-        W = torch.randn(rows, cols, device="cuda")
+@triton.autotune(
+        configs=[
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 256,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=3,
+                num_warps=8,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 256,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=3,
+                num_warps=8,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 256,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 256,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 32,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 32,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=5,
+                num_warps=2,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 32,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=5,
+                num_warps=2,
+            ),
+        ],
+        key=["M", "N", "K"],
+    )
+
+@triton.jit
+def int4_gemm_kernel(
+    a_ptr, b_ptr, c_ptr,
+    bscales_ptr, bzeros_ptr,
+    M, N, K,
+    stride_am, stride_ak,
+    stride_bk, stride_bn,
+    stride_cm, stride_cn,
+    GROUP_SIZE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    a_mask = offs_am[:, None] < M
+    b_mask = offs_bn[None, :] < N
+
+    a_ptrs = a_ptr + stride_am * offs_am[:, None] + stride_ak * offs_k[None, :]
+    b_ptrs = b_ptr + stride_bn * offs_bn[None, :] + stride_bk * (offs_k[:, None] // 2)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float16)
+
+    for k in range(0, K, BLOCK_SIZE_K):
+        b_q = tl.load(b_ptrs, mask=b_mask)
+
+        a = tl.load(a_ptrs, mask=a_mask).to(tl.float16)
+
+        # Compute per-group index
+        k_offset = k + offs_k  # shape: [BLOCK_SIZE_K]
+        group_idx = k_offset // GROUP_SIZE  # [BLOCK_SIZE_K]
+
+        # Load scale and zero for each [N, G]
+        scale = tl.load(bscales_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
+        zero = tl.load(bzeros_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)   # same shape
+
+        # Extract int4 values from uint8
+        shift = (k_offset[:, None] % 2) * 4
+        q = (b_q.to(tl.uint8) >> shift) & 0xF
+        b_deq = (q.to(tl.float16) - zero) * scale
+
+        accumulator += tl.dot(a, b_deq, out_dtype=tl.float16)
+
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def triton_int4_gemm(
+    inp: torch.Tensor,
+    weight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    group_size: int = 64
+) -> torch.Tensor:
+
+
+    weight = weight.t().contiguous()  # [K/2, N]
+    c_shape = inp.shape[:-1] + weight.shape[-1:]
+    inp = inp.view(-1, inp.shape[-1]).contiguous()
+
+    PAD_TO = 256
+    if inp.shape[0] % PAD_TO != 0:
+        c_crop = inp.shape[0]
+        new_inp = inp.new_zeros(((inp.shape[0] + PAD_TO - 1) // PAD_TO * PAD_TO, inp.shape[1]))
+        new_inp[:c_crop] = inp
+        inp = new_inp
+    else:
+        c_crop = None
+
+    M, K = inp.shape
+    N = weight.shape[1]
+
+
+
+    c = torch.empty((M, N), device=inp.device, dtype=torch.float32)
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+    )
+
+    int4_gemm_kernel[grid](
+        inp, weight, c,
+        scales, zeros,
+        M, N, K,
+        inp.stride(0), inp.stride(1),
+        weight.stride(0), weight.stride(1),
+        c.stride(0), c.stride(1),
+        GROUP_SIZE=group_size,
+    )
 
-        # Test with relative error optimization
-        gptq = GPTQ(wbits=4, groupsize=8, device=W.device)
-        qweight, zeros, scales = gptq.quantize(W)
-        packed = pack_int4(qweight)
+    return c[:c_crop] if c_crop is not None else c.view(c_shape)
 
-        qweight_unpacked = unpack_int4(packed, orig_cols=cols)
-        W_rec = gptq.dequantize(qweight_unpacked, zeros, scales)
+class GPTQLinear(nn.Module):
+    """
+    4-bit quantized linear layer using Triton kernels
+    """
 
-        abs_err = (W - W_rec).abs()
-        rel_err = abs_err / (W.abs() + 1e-5)  # Use better epsilon
-        print("== Relative Error Optimized GPTQ (groupsize=4) ==")
-        print(f"Mean abs error: {abs_err.mean().item():.6f}")
-        print(f"Mean rel error: {rel_err.mean().item():.6f}")
-        print(f"Max abs error: {abs_err.max().item():.6f}")
-        print(f"Max rel error: {rel_err.max().item():.6f}")
-        print(f"95th percentile rel error: {rel_err.quantile(0.95).item():.6f}")
-        print(f"99th percentile rel error: {rel_err.quantile(0.99).item():.6f}")
+    def __init__(self, in_features, out_features, bias=True, groupsize=64, device="cuda"):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.groupsize = groupsize
+        self.device = device
 
+        self.tile_cols = groupsize
+        self.original_out_features = out_features
+
+        # Quantized params (assigned later)
+        self.register_buffer("packed_weight", None)
+        self.register_buffer("scales", None)
+        self.register_buffer("zeros", None)
+        self.register_buffer("bias", None if not bias else torch.empty(out_features))
+
+    @staticmethod
+    def pack_weight(weight):
+        rows, cols = weight.shape
+        if cols % 2 != 0:
+            weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+            cols += 1
+        packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
+        return packed.contiguous()
+
+    def get_weight(self, packed: torch.Tensor) -> torch.Tensor:
+        """
+        [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
+        """
+        rows, packed_cols = packed.shape
+        qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
+        qweight[:, 0::2] = packed & 0xF
+        qweight[:, 1::2] = (packed >> 4) & 0xF
+        return qweight[:, :self.in_features].contiguous()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_flat = x.view(-1, self.in_features)
+        # Compute quantized matmul
+        output = triton_int4_gemm(
+            x_flat.float(),
+            self.packed_weight,
+            self.scales,
+            self.zeros,
+            group_size=self.groupsize,
+        )
 
-    test_gptq_groupwise()
\ No newline at end of file
+        if self.bias is not None:
+            output += self.bias
+
+        return output.view(*x.shape[:-1], self.out_features)
+
+
+def get_gpu_memory():
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        return torch.cuda.memory_allocated() // (1024 ** 2)
+    return 0
+
+
+def test_gptqlinear_vs_nnlinear(
+    in_features=2048,
+    out_features=4096,
+    groupsize=64,
+    wbits=4,
+    device="cuda"
+):
+    torch.manual_seed(42)
+    np.random.seed(42)
+
+    # ---- Create single input vector ----
+    x = torch.randn(in_features, device=device, dtype=torch.float16)
+    linear = nn.Linear(in_features, out_features, bias=True, device=device, dtype=torch.float16).eval()
+
+    weight = linear.weight.detach().to(device).float()
+    bias = linear.bias.detach().to(device).float() if linear.bias is not None else None
+
+    # --- Quantize using GPTQ ---
+    gptq = GPTQ(wbits=wbits, groupsize=groupsize, device=device)
+    qweight, qzeros, qscales = gptq.quantize(weight)
+    packed_weight = GPTQLinear.pack_weight(qweight)
+
+    gptqlinear = GPTQLinear(in_features, out_features, bias=True, groupsize=groupsize, device=device).to(device)
+    gptqlinear.packed_weight = packed_weight
+    gptqlinear.scales = qscales
+    gptqlinear.zeros = qzeros
+    gptqlinear.bias = bias if bias is not None else None
+    gptqlinear.eval()
+
+    # ---- Memory ----
+    gc.collect()
+    torch.cuda.empty_cache()
+    mem0 = get_gpu_memory()
+    _ = linear(x)
+    mem_fp = get_gpu_memory()
+    del _
+    gc.collect()
+    torch.cuda.empty_cache()
+    mem1 = get_gpu_memory()
+    _ = gptqlinear(x)
+    mem_q = get_gpu_memory()
+    del _
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    # ---- Print ----
+
+
+    print("\n== Memory Usage (VRAM, MB) ==")
+    print(f"nn.Linear (fp16): {mem_fp} MB MB)")
+    print(f"GPTQLinear:       {mem_q} MB MB)")
+
+    print("\n== Latency ==")
+    time_fp = triton.testing.do_bench(lambda: linear(x))
+    time_q = triton.testing.do_bench(lambda: gptqlinear(x))
+    print(f"nn.Linear (fp16): {time_fp:.3f} ms")
+    print(f"GPTQLinear:       {time_q:.3f} ms")
+
+
+    print("\n== VRAM saving ratio ==")
+    print(f"GPTQLinear / nn.Linear: {(mem_q-mem1)/(mem_fp-mem0 + 1e-9):.3f}x")
+    print(f"Speedup: {time_fp/time_q:.2f}x\n")
+
+if __name__ == "__main__":
+    test_gptqlinear_vs_nnlinear()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
index 3f5ec90..6f70ef3 100644
--- a/lite_llama/quantization/gptq/gptq_loader.py
+++ b/lite_llama/quantization/gptq/gptq_loader.py
@@ -5,41 +5,6 @@
 from .gptq import *
 
 
-class GPTQLinear(nn.Module):
-    """
-    A linear layer that uses GPTQ quantized weights.
-    Automatically dequantizes during forward pass.
-    """
-
-    def __init__(self, qweight, qzeros, scales, wbits=4, bias=None):
-        super().__init__()
-        self.register_buffer('qweight', qweight)
-        self.register_buffer('qzeros', qzeros)
-        self.register_buffer('scales', scales)
-        self.wbits = wbits
-        if bias is not None:
-            self.register_buffer('bias', bias)
-        else:
-            self.bias = None
-        self.gptq = GPTQ
-    def forward(self, x):
-        # Dequantize weight on-the-fly
-        weight = self.gptq.dequantize(
-            self.qweight,
-            self.qzeros,
-            self.scales,
-            self.wbits
-        )
-
-        # Perform linear transformation
-        output = torch.matmul(x, weight.t())
-
-        if self.bias is not None:
-            output += self.bias
-
-        return output
-
-
 def load_quantized_state_dict(checkpoint_path: str, device: str = "cuda") -> Dict[str, torch.Tensor]:
     """
     Load a quantized state dictionary from checkpoint.
@@ -64,42 +29,6 @@ def load_quantized_state_dict(checkpoint_path: str, device: str = "cuda") -> Dic
     return state_dict
 
 
-def replace_linear_with_gptq(module: nn.Module, state_dict: Dict[str, torch.Tensor], prefix: str = ""):
-    """
-    Recursively replace Linear layers with GPTQLinear layers based on quantized state dict.
-
-    Args:
-        module: The module to modify
-        state_dict: State dictionary containing quantized weights
-        prefix: Current prefix for parameter names
-    """
-    for name, child in module.named_children():
-        full_name = f"{prefix}.{name}" if prefix else name
-
-        if isinstance(child, nn.Linear):
-            # Check if this layer has quantized weights
-            qweight_key = f"{full_name}.qweight"
-            if qweight_key in state_dict:
-                # Extract quantization parameters
-                qweight = state_dict[qweight_key]
-                qzeros = state_dict[f"{full_name}.qzeros"]
-                scales = state_dict[f"{full_name}.scales"]
-                wbits = state_dict.get(f"{full_name}.wbits", torch.tensor(4)).item()
-
-                # Check for bias
-                bias_key = f"{full_name}.bias"
-                bias = state_dict.get(bias_key, None)
-
-                # Replace with GPTQLinear
-                gptq_linear = GPTQLinear(qweight, qzeros, scales, wbits, bias)
-                setattr(module, name, gptq_linear)
-
-                print(f"Replaced {full_name} with GPTQLinear")
-        else:
-            # Recursively process child modules
-            replace_linear_with_gptq(child, state_dict, full_name)
-
-
 def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
     """
     Create a dequantized state dictionary from a quantized one.
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index e197b51..e552486 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -3,6 +3,8 @@
 import subprocess
 from typing import List, Optional
 import torch
+from contextlib import contextmanager
+import functools
 
 def read_json(json_path):
     with open(json_path, "r") as json_file:
@@ -199,8 +201,21 @@ def get_model_dtype(checkpoints_dir: str):
         print("Defaulting to torch.float16")
         return torch.float16
 
-    except Exception as e:
-        print(f"Warning: Could not read dtype from config.json: {e}")
-        print("Defaulting to torch.float16")
-        return torch.float16
+
+@contextmanager
+def quantization(mode: str = None):
+    quantized_linear_cls = None
+    if mode == 'gptq.int4':
+        from ..quantization.gptq.gptq import GPTQLinear
+        quantized_linear_cls = functools.partial(GPTQLinear, bits=4, tile_cols=-1)
+    elif mode is not None:
+        raise ValueError(f"Unknown quantization mode: {mode}")
+
+    enabled = mode is not None
+    torch_linear_cls = torch.nn.Linear
+    if enabled:
+        torch.nn.Linear = quantized_linear_cls
+    yield
+    if enabled:
+        torch.nn.Linear = torch_linear_cls
 

From ebaeb0c021a721a1b024dca83a17c12070372b02 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Thu, 3 Jul 2025 22:20:24 +0930
Subject: [PATCH 26/33] split int4 kernel and gptq

---
 lite_llama/kernels/int4_linear.py    | 336 +++++++++++++++++++++++++++
 lite_llama/quantization/gptq/gptq.py |  76 +-----
 2 files changed, 337 insertions(+), 75 deletions(-)
 create mode 100644 lite_llama/kernels/int4_linear.py

diff --git a/lite_llama/kernels/int4_linear.py b/lite_llama/kernels/int4_linear.py
new file mode 100644
index 0000000..805c44b
--- /dev/null
+++ b/lite_llama/kernels/int4_linear.py
@@ -0,0 +1,336 @@
+import triton
+import triton.language as tl
+import torch
+import torch.nn as nn
+import numpy as np
+from ..quantization.gptq.gptq import GPTQ
+
+@triton.autotune(
+        configs=[
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 256,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=3,
+                num_warps=8,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 256,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=3,
+                num_warps=8,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 256,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 256,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 128,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 128,
+                    "BLOCK_SIZE_N": 32,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=4,
+                num_warps=4,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 64,
+                    "BLOCK_SIZE_N": 32,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=5,
+                num_warps=2,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_SIZE_M": 32,
+                    "BLOCK_SIZE_N": 64,
+                    "BLOCK_SIZE_K": 32,
+                    "GROUP_SIZE_M": 8,
+                },
+                num_stages=5,
+                num_warps=2,
+            ),
+        ],
+        key=["M", "N", "K"],
+    )
+
+@triton.jit
+def int4_gemm_kernel(
+    a_ptr, b_ptr, c_ptr,
+    bscales_ptr, bzeros_ptr,
+    M, N, K,
+    stride_am, stride_ak,
+    stride_bk, stride_bn,
+    stride_cm, stride_cn,
+    GROUP_SIZE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    a_mask = offs_am[:, None] < M
+    b_mask = offs_bn[None, :] < N
+
+    a_ptrs = a_ptr + stride_am * offs_am[:, None] + stride_ak * offs_k[None, :]
+    b_ptrs = b_ptr + stride_bn * offs_bn[None, :] + stride_bk * (offs_k[:, None] // 2)
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float16)
+
+    for k in range(0, K, BLOCK_SIZE_K):
+        b_q = tl.load(b_ptrs, mask=b_mask)
+
+        a = tl.load(a_ptrs, mask=a_mask).to(tl.float16)
+
+        # Compute per-group index
+        k_offset = k + offs_k  # shape: [BLOCK_SIZE_K]
+        group_idx = k_offset // GROUP_SIZE  # [BLOCK_SIZE_K]
+
+        # Load scale and zero for each [N, G]
+        scale = tl.load(bscales_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
+        zero = tl.load(bzeros_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)   # same shape
+
+        # Extract int4 values from uint8
+        shift = (k_offset[:, None] % 2) * 4
+        q = (b_q.to(tl.uint8) >> shift) & 0xF
+        b_deq = (q.to(tl.float16) - zero) * scale
+
+        accumulator += tl.dot(a, b_deq, out_dtype=tl.float16)
+
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+
+    tl.store(c_ptrs, accumulator, mask=c_mask)
+
+
+def triton_int4_gemm(
+    inp: torch.Tensor,
+    weight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    group_size: int = 64
+) -> torch.Tensor:
+
+
+    weight = weight.t().contiguous()  # [K/2, N]
+    c_shape = inp.shape[:-1] + weight.shape[-1:]
+    inp = inp.view(-1, inp.shape[-1]).contiguous()
+
+    PAD_TO = 256
+    if inp.shape[0] % PAD_TO != 0:
+        c_crop = inp.shape[0]
+        new_inp = inp.new_zeros(((inp.shape[0] + PAD_TO - 1) // PAD_TO * PAD_TO, inp.shape[1]))
+        new_inp[:c_crop] = inp
+        inp = new_inp
+    else:
+        c_crop = None
+
+    M, K = inp.shape
+    N = weight.shape[1]
+
+
+
+    c = torch.empty((M, N), device=inp.device, dtype=torch.float16)
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+    )
+
+    int4_gemm_kernel[grid](
+        inp, weight, c,
+        scales, zeros,
+        M, N, K,
+        inp.stride(0), inp.stride(1),
+        weight.stride(0), weight.stride(1),
+        c.stride(0), c.stride(1),
+        GROUP_SIZE=group_size,
+    )
+
+    return c[:c_crop] if c_crop is not None else c.view(c_shape)
+
+class GPTQLinear(nn.Module):
+    """
+    4-bit quantized linear layer using Triton kernels
+    """
+
+    def __init__(self, in_features, out_features, bias=True, groupsize=64, device="cuda"):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.groupsize = groupsize
+        self.device = device
+
+        self.tile_cols = groupsize
+        self.original_out_features = out_features
+
+        # Quantized params (assigned later)
+        self.register_buffer("packed_weight", None)
+        self.register_buffer("scales", None)
+        self.register_buffer("zeros", None)
+        self.register_buffer("bias", None if not bias else torch.empty(out_features))
+
+    @staticmethod
+    def pack_weight(weight):
+        rows, cols = weight.shape
+        if cols % 2 != 0:
+            weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+            cols += 1
+        packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
+        return packed.contiguous()
+
+    def get_weight(self, packed: torch.Tensor) -> torch.Tensor:
+        """
+        [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
+        """
+        rows, packed_cols = packed.shape
+        qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
+        qweight[:, 0::2] = packed & 0xF
+        qweight[:, 1::2] = (packed >> 4) & 0xF
+        return qweight[:, :self.in_features].contiguous()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_flat = x.view(-1, self.in_features)
+        # Compute quantized matmul
+        output = triton_int4_gemm(
+            x_flat.float(),
+            self.packed_weight,
+            self.scales,
+            self.zeros,
+            group_size=self.groupsize,
+        )
+
+        if self.bias is not None:
+            output += self.bias
+
+        return output.view(*x.shape[:-1], self.out_features)
+
+
+def get_gpu_memory():
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        return torch.cuda.memory_allocated() // (1024 ** 2)
+    return 0
+
+
+def test_gptqlinear_vs_nnlinear(
+    in_features=2048,
+    out_features=4096,
+    groupsize=64,
+    wbits=4,
+    device="cuda"
+):
+    torch.manual_seed(42)
+    np.random.seed(42)
+
+    # ---- Create single input vector ----
+    x = torch.randn(in_features, device=device, dtype=torch.float16)
+    linear = nn.Linear(in_features, out_features, bias=True, device=device, dtype=torch.float16).eval()
+
+    weight = linear.weight.detach().to(device).float()
+    bias = linear.bias.detach().to(device).float() if linear.bias is not None else None
+
+    # --- Quantize using GPTQ ---
+    gptq = GPTQ(wbits=wbits, groupsize=groupsize, device=device)
+    qweight, qzeros, qscales = gptq.quantize(weight)
+    packed_weight = GPTQLinear.pack_weight(qweight)
+
+    gptqlinear = GPTQLinear(in_features, out_features, bias=True, groupsize=groupsize, device=device).to(device)
+    gptqlinear.packed_weight = packed_weight
+    gptqlinear.scales = qscales
+    gptqlinear.zeros = qzeros
+    gptqlinear.bias = bias if bias is not None else None
+    gptqlinear.eval()
+
+    print("\n== Latency ==")
+    time_fp = triton.testing.do_bench(lambda: linear(x))
+    time_q = triton.testing.do_bench(lambda: gptqlinear(x))
+    print(f"nn.Linear (fp16): {time_fp:.3f} ms")
+    print(f"GPTQLinear:       {time_q:.3f} ms")
+
+    # print(torch.allclose(linear(x), gptqlinear(x), atol=1e-3))  # True / False
+    a = linear(x)
+    b = gptqlinear(x)
+    abs_error = torch.abs(a - b)
+    rel_error = abs_error / (torch.abs(b) + 1e-8)
+    print("Mean abs error:", abs_error.mean().item())
+    print("Max abs error:", abs_error.max().item())
+    print("Mean rel error:", rel_error.mean().item())
+    print("Max rel error:", rel_error.max().item())
+
+if __name__ == "__main__":
+    test_gptqlinear_vs_nnlinear()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq.py b/lite_llama/quantization/gptq/gptq.py
index 324984f..3624aeb 100755
--- a/lite_llama/quantization/gptq/gptq.py
+++ b/lite_llama/quantization/gptq/gptq.py
@@ -543,9 +543,7 @@ def triton_int4_gemm(
     M, K = inp.shape
     N = weight.shape[1]
 
-
-
-    c = torch.empty((M, N), device=inp.device, dtype=torch.float32)
+    c = torch.empty((M, N), device=inp.device, dtype=torch.float16)
 
     grid = lambda META: (
         triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
@@ -620,75 +618,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return output.view(*x.shape[:-1], self.out_features)
 
 
-def get_gpu_memory():
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-        return torch.cuda.memory_allocated() // (1024 ** 2)
-    return 0
-
-
-def test_gptqlinear_vs_nnlinear(
-    in_features=2048,
-    out_features=4096,
-    groupsize=64,
-    wbits=4,
-    device="cuda"
-):
-    torch.manual_seed(42)
-    np.random.seed(42)
-
-    # ---- Create single input vector ----
-    x = torch.randn(in_features, device=device, dtype=torch.float16)
-    linear = nn.Linear(in_features, out_features, bias=True, device=device, dtype=torch.float16).eval()
-
-    weight = linear.weight.detach().to(device).float()
-    bias = linear.bias.detach().to(device).float() if linear.bias is not None else None
-
-    # --- Quantize using GPTQ ---
-    gptq = GPTQ(wbits=wbits, groupsize=groupsize, device=device)
-    qweight, qzeros, qscales = gptq.quantize(weight)
-    packed_weight = GPTQLinear.pack_weight(qweight)
-
-    gptqlinear = GPTQLinear(in_features, out_features, bias=True, groupsize=groupsize, device=device).to(device)
-    gptqlinear.packed_weight = packed_weight
-    gptqlinear.scales = qscales
-    gptqlinear.zeros = qzeros
-    gptqlinear.bias = bias if bias is not None else None
-    gptqlinear.eval()
-
-    # ---- Memory ----
-    gc.collect()
-    torch.cuda.empty_cache()
-    mem0 = get_gpu_memory()
-    _ = linear(x)
-    mem_fp = get_gpu_memory()
-    del _
-    gc.collect()
-    torch.cuda.empty_cache()
-    mem1 = get_gpu_memory()
-    _ = gptqlinear(x)
-    mem_q = get_gpu_memory()
-    del _
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    # ---- Print ----
-
-
-    print("\n== Memory Usage (VRAM, MB) ==")
-    print(f"nn.Linear (fp16): {mem_fp} MB MB)")
-    print(f"GPTQLinear:       {mem_q} MB MB)")
-
-    print("\n== Latency ==")
-    time_fp = triton.testing.do_bench(lambda: linear(x))
-    time_q = triton.testing.do_bench(lambda: gptqlinear(x))
-    print(f"nn.Linear (fp16): {time_fp:.3f} ms")
-    print(f"GPTQLinear:       {time_q:.3f} ms")
-
-
-    print("\n== VRAM saving ratio ==")
-    print(f"GPTQLinear / nn.Linear: {(mem_q-mem1)/(mem_fp-mem0 + 1e-9):.3f}x")
-    print(f"Speedup: {time_fp/time_q:.2f}x\n")
-
-if __name__ == "__main__":
-    test_gptqlinear_vs_nnlinear()
\ No newline at end of file

From f2cb58990139fe925f13b730f89a347485b2389e Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Thu, 10 Jul 2025 14:09:07 +0930
Subject: [PATCH 27/33] add awq

---
 README.md                                   |   7 +-
 apply_weight_convert.py                     |  36 +-
 generate.py                                 |  88 ++--
 lite_llama/executor/model_executor.py       |   4 -
 lite_llama/executor/weight_convert.py       |  50 +-
 lite_llama/kernels/int4_linear.py           |  46 +-
 lite_llama/quantization/awq.py              | 492 ++++++++++++++++++++
 lite_llama/quantization/{gptq => }/gptq.py  | 367 ++-------------
 lite_llama/quantization/gptq/__init__.py    |   0
 lite_llama/quantization/gptq/gptq_loader.py | 147 ------
 lite_llama/quantization/quant_config.py     |  24 +
 lite_llama/quantization/utils.py            |  19 +
 lite_llama/utils/common.py                  |   2 +-
 test.py                                     | 263 +++++++++++
 tests/test_gptq.py                          | 232 +++++++++
 15 files changed, 1196 insertions(+), 581 deletions(-)
 create mode 100644 lite_llama/quantization/awq.py
 rename lite_llama/quantization/{gptq => }/gptq.py (52%)
 delete mode 100755 lite_llama/quantization/gptq/__init__.py
 delete mode 100644 lite_llama/quantization/gptq/gptq_loader.py
 create mode 100644 lite_llama/quantization/quant_config.py
 create mode 100644 lite_llama/quantization/utils.py
 create mode 100644 test.py
 create mode 100644 tests/test_gptq.py

diff --git a/README.md b/README.md
index c35e43d..fb380b2 100644
--- a/README.md
+++ b/README.md
@@ -93,8 +93,11 @@ pip install -r requirement.txt
 python apply_weight_convert.py -m /path/to/model/Llama-3.2-1B-Instruct/# model weight transformation
 python generate.py -p "What is large language model" -m /path/to/model/Llama-3.2-1B-Instruct/ -f /path/to/figure# Run on the basis that the model has been downloaded and placed in the specified directory
 ```
-
-
+Quantitization
+```bash
+python apply_weight_convert.py --checkpoints_dir ../../Model/origin/Llama-3.2-3B-Instruct-cp/ --use_gptq
+python generate.py --prompt "What is llama?" --checkpoint_path  my_weight/Llama-3.2-3B-Instruct-cp_gptq/ --quant gptq.int4
+```
 ## Evaluation
 
 After `generate.py` runs successfully, the terminal displays the interface as shown below, and you can enter your question in the terminal.
diff --git a/apply_weight_convert.py b/apply_weight_convert.py
index 3803019..4973a61 100755
--- a/apply_weight_convert.py
+++ b/apply_weight_convert.py
@@ -29,22 +29,10 @@ def main():
         help="Path to the model checkpoint directory"
     )
     parser.add_argument(
-        "--use_gptq",
-        action="store_true",
+        "--quant_method",
+        type=str,
         help="Enable GPTQ quantization"
     )
-    parser.add_argument(
-        "--wbits",
-        type=int,
-        default=4,
-        help="Number of bits for quantization (default: 4)"
-    )
-    parser.add_argument(
-        "--groupsize",
-        type=int,
-        default=8,
-        help="Group size for quantization (default: 8)"
-    )
     parser.add_argument(
         "--device",
         type=str,
@@ -60,18 +48,12 @@ def main():
     args = parser.parse_args()
 
     checkpoints_dir = args.checkpoints_dir
-    use_gptq = args.use_gptq
-    wbits = args.wbits
-    groupsize = args.groupsize
+    quant_method = args.quant_method
     device = args.device
     print_params = not args.no_print_params
 
     # Print configuration
     print(f"Converting model from: {checkpoints_dir}")
-    if use_gptq:
-        print(f"GPTQ Quantization enabled: {wbits} bits, groupsize {groupsize}")
-    else:
-        print("GPTQ Quantization: Disabled")
     print(f"Device: {device}")
     print("-" * 50)
 
@@ -105,9 +87,7 @@ def main():
             num_layers,
             print_params=print_params,
             device=device,
-            use_gptq=use_gptq,
-            wbits=wbits,
-            groupsize=groupsize
+            quant_method=quant_method,
         )
 
     elif "llama" in checkpoints_dir.lower():
@@ -119,9 +99,7 @@ def main():
             checkpoints_dir,
             hf_sd,
             num_layers,
-            use_gptq=use_gptq,
-            wbits=wbits,
-            groupsize=groupsize,
+            quant_method=quant_method,
             device=device
         )
 
@@ -134,9 +112,7 @@ def main():
             checkpoints_dir,
             hf_sd,
             num_layers,
-            use_gptq=use_gptq,
-            wbits=wbits,
-            groupsize=groupsize,
+            quant_method=quant_method,
             device=device
         )
     else:
diff --git a/generate.py b/generate.py
index ea90bec..0f3be28 100644
--- a/generate.py
+++ b/generate.py
@@ -37,6 +37,8 @@ def report_resource_usage(ram_before, vram_before) -> None:
     log.info(f"GPU VRAM Used: {vram_text}")
 
 
+# Add these modifications to generate.py
+
 def generate_llama(
         prompt: str = "Hello, my name is",
         quantize: Optional[str] = None,
@@ -51,10 +53,13 @@ def generate_llama(
         triton_weight: bool = True,
         gpu_type: str = "nvidia",
         checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
+        use_gptq: bool = False,  # Add GPTQ parameter
+        gptq_groupsize: int = 128,  # Add groupsize parameter
 ):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
+
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -63,12 +68,15 @@ def generate_llama(
 
     # Start resource tracking
     ram_before = process.memory_info().rss
-
     vram_before = get_gpu_memory(gpu_type)
 
-    # Init LLM generator
-    with quantization(quantize):
+    # Determine quantization method
+    if use_gptq:
+        log.info("Using GPTQ quantization for inference")
+        quantize = None  # GPTQ doesn't use the legacy quantization context manager
 
+    # Init LLM generator with GPTQ support
+    with quantization(quantize):
         generator = GenerateStreamText(
             checkpoints_dir=checkpoint_path,
             tokenizer_path=checkpoint_path,
@@ -80,9 +88,9 @@ def generate_llama(
             device=device,
         )
 
-
     model_prompter.insert_prompt(prompt)
     prompts = [model_prompter.model_input]
+
     # Call the generation function and start the stream generation
     stream = generator.text_completion_stream(
         prompts,
@@ -91,19 +99,19 @@ def generate_llama(
         max_gen_len=max_gen_len,
     )
 
-    completion = ''  # Initialize to generate the result
-    # NOTE: After creating a generator, it can be iterated through a for loop
+    completion = ''
     text_msg = ""
     start = time.perf_counter()
     for batch_completions in stream:
         new_text = batch_completions[0]['generation'][len(completion):]
         completion = batch_completions[0]['generation']
         print(new_text, end='', flush=True)
-        text_msg +=new_text
+        text_msg += new_text
     end = time.perf_counter()
 
     print("\n\n==================================\n")
-    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    log.info(
+        f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer) / (end - start):.2f} tokens/sec")
 
     # Report resource usage
     report_resource_usage(ram_before, vram_before)
@@ -122,7 +130,9 @@ def generate_llava(
         max_gen_len: Optional[int] = 512,
         load_model: bool = True,
         compiled_model: bool = False,
-        triton_weight: bool = True
+        triton_weight: bool = True,
+        use_gptq: bool = False,  # Add GPTQ parameter
+        gptq_groupsize: int = 128,  # Add groupsize parameter
 ):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     if max_seq_len <= 1024:
@@ -134,21 +144,22 @@ def generate_llava(
         log.error(f"'{figure_path}' Not a valid file path！")
     else:
         image_input = str(figure_path).strip()
-    image_items = [image_input]  # Prepare the image_items list
-    image_num = len(image_items)  # Calculate the number of input images
-    vis_images(image_items)  # Displaying images in the terminal
+    image_items = [image_input]
+    image_num = len(image_items)
+    vis_images(image_items)
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
     model_prompter = get_prompter("llama", checkpoint_path, short_prompt)
 
     # Start resource tracking
     ram_before = process.memory_info().rss
-
     vram_before = get_gpu_memory(gpu_type)
 
-    # Initializing the Multimodal Model Text Generator
-    with quantization(quantize):
+    # Initialize the Multimodal Model Text Generator with GPTQ support
+    if use_gptq:
+        quantize = None  # GPTQ doesn't use legacy quantization
 
+    with quantization(quantize):
         try:
             generator = LlavaGeneratorStream(
                 checkpoints_dir=checkpoint_path,
@@ -179,19 +190,21 @@ def generate_llava(
     except Exception as e:
         log.error(f"Text Generation Failure: {e}")
 
-    completion = ''  # Initialization generates results
+    completion = ''
     text_msg = ""
     start = time.perf_counter()
 
     for batch_completions in stream:
         next_text = batch_completions[0]['generation'][len(completion):]
         completion = batch_completions[0]['generation']
-        print(f"\033[91m{next_text}\033[0m", end='', flush=True)  # 红色文本
+        print(f"\033[91m{next_text}\033[0m", end='', flush=True)
         text_msg += next_text
     end = time.perf_counter()
 
     print("\n\n==================================\n")
-    log.info(f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer)/(end - start):.2f} tokens/sec")
+    log.info(
+        f"Time for inference: {(end - start):.2f} sec, {count_tokens(text_msg, generator.tokenizer) / (end - start):.2f} tokens/sec")
+
     # Report resource usage
     report_resource_usage(ram_before, vram_before)
 
@@ -201,12 +214,14 @@ def generate_llava(
 
     torch.set_float32_matmul_precision("high")
 
-    # Create a wrapper function that adds the use_gptq parameter
+
     def main(
-        prompt: str = "Hello, my name is",
-        checkpoint_path: Path = Path("checkpoints/lite-llama/7B/"),
-        figure_path: Optional[Path] = None,
-        quant: str = "gpt.int4"
+            prompt: str = "Hello, my name is",
+            checkpoint_path: Path = Path("checkpoints/lite-llama/7B/"),
+            figure_path: Optional[Path] = None,
+            quant: Optional[str] = None,
+            use_gptq: Optional[bool] = False,  # Add GPTQ flag
+            gptq_groupsize: Optional[int] = 8,  # Add groupsize parameter
     ):
         """
         Generate text using lite_llama with automatic GPTQ detection
@@ -215,15 +230,32 @@ def main(
             prompt: Input prompt text
             checkpoint_path: Path to model checkpoint directory
             figure_path: Path to Image file for LLaVA generation, optional
-            quant: GPTQ quantization mode
+            quant: Legacy quantization mode (ignored if use_gptq=True)
+            use_gptq: Whether to use GPTQ quantization
+            gptq_groupsize: Group size for GPTQ quantization
         """
-        # Determine use_gptq based on force flags
         gpu_type = detect_device()
         model_path = os.path.abspath(checkpoint_path)
+
         if figure_path:
-            generate_llava(prompt=prompt, checkpoint_path=Path(model_path), figure_path=Path(figure_path),
-                           gpu_type=gpu_type, quantization=quant)
+            generate_llava(
+                prompt=prompt,
+                checkpoint_path=Path(model_path),
+                figure_path=Path(figure_path),
+                gpu_type=gpu_type,
+                quantize=quant if not use_gptq else None,
+                use_gptq=use_gptq,
+                gptq_groupsize=gptq_groupsize
+            )
         else:
-            generate_llama(prompt=prompt, checkpoint_path=Path(model_path), gpu_type=gpu_type, quantization=quant)
+            generate_llama(
+                prompt=prompt,
+                checkpoint_path=Path(model_path),
+                gpu_type=gpu_type,
+                quantize=quant if not use_gptq else None,
+                use_gptq=use_gptq,
+                gptq_groupsize=gptq_groupsize
+            )
+
 
     CLI(main)
\ No newline at end of file
diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 833a8d0..00e6a40 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -135,10 +135,6 @@ def _load_model_weight(
             state_dict = torch.load(
                 ckpt_path, mmap=True, weights_only=True, map_location=device
             )
-            if any(key.endswith(".qweight") for key in state_dict.keys()):
-                from ..quantization.gptq.gptq_loader import create_dequantized_state_dict
-                log.info("Detected GPTQ quantized weights. Dequantizing...")
-                state_dict = create_dequantized_state_dict(state_dict)
         else:
             conversion_func = get_conversion_func(model_config.model_type)
             if conversion_func is None:
diff --git a/lite_llama/executor/weight_convert.py b/lite_llama/executor/weight_convert.py
index bceecbb..8800efd 100755
--- a/lite_llama/executor/weight_convert.py
+++ b/lite_llama/executor/weight_convert.py
@@ -1,25 +1,26 @@
 from tqdm.auto import tqdm
 import torch, os, shutil, glob
 import os.path as osp
-from typing import Dict, Optional
-from ..quantization.gptq.gptq import quantize_gptq  # Import our GPTQ implementation
+from typing import Dict
+from lite_llama.quantization.gptq import quantize_gptq  # Import our GPTQ implementation
 
 
-def build_new_weight_dir(checkpoints_dir: str, new_sd, quantized: bool = False):
+def build_new_weight_dir(checkpoints_dir: str, new_sd, quant_method: str = "gptq"):
     # 保存 lite_llama 模型权重并构建新的权重目录
     model_id = osp.basename(osp.normpath(checkpoints_dir))
     current_dir = osp.dirname(osp.abspath(__file__))  # 获取当前文件所在的目录
+    save_filename = f"{model_id}.pth"
 
     # Add quantized suffix if using GPTQ
     weight_dir_name = f"../../my_weight/{model_id}"
-    if quantized:
+    if quant_method == "gptq":
         weight_dir_name += "_gptq"
+        save_filename = f"{model_id}_gptq.pth" if quant_method else f"{model_id}.pth"
 
     my_weight_dir = osp.join(current_dir, weight_dir_name)  # 项目所在根目录
     os.makedirs(my_weight_dir, exist_ok=True)  # 创建文件夹（如果不存在）
 
     # 保存模型的状态字典。
-    save_filename = f"{model_id}_gptq.pth" if quantized else f"{model_id}.pth"
     torch.save(
         new_sd,
         osp.join(my_weight_dir, save_filename),
@@ -42,9 +43,7 @@ def convert_qwen2_hf_to_litellama(
         num_layers,
         print_params: bool = True,
         device: str = "cuda",
-        use_gptq: bool = False,
-        wbits: int = 4,
-        groupsize: int = 8,
+        quant_method: str = "gptq",
 ) -> Dict[str, torch.Tensor]:
     """
     将 Hugging Face 格式的预训练模型的权重字典转换为自定义模型的权重字典。
@@ -55,7 +54,7 @@ def convert_qwen2_hf_to_litellama(
         num_layers: 模型层数
         print_params: 是否打印参数信息
         device: 设备
-        use_gptq: 是否使用 GPTQ 量化
+        quant_method: 量化方法
         wbits: 量化位数
         groupsize: 量化组大小
     """
@@ -140,8 +139,7 @@ def convert_qwen2_hf_to_litellama(
             del new_sd[v_bias_key]
 
     # Apply GPTQ quantization if requested
-    if use_gptq:
-        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+    if quant_method == "gptq":
         # Define layers to quantize (excluding embeddings and layer norms)
         target_layers = []
         for name in new_sd.keys():
@@ -154,14 +152,12 @@ def convert_qwen2_hf_to_litellama(
 
         new_sd = quantize_gptq(
             model_state_dict=new_sd,
-            wbits=wbits,
-            groupsize=groupsize,
             target_layers=target_layers,
             device=device
         )
 
     # 保存转换好的自定义权重
-    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
+    build_new_weight_dir(checkpoints_dir, new_sd, quant_method=quant_method)
 
     if print_params:
         # 打印预训练模型的参数名称
@@ -265,9 +261,7 @@ def convert_llama_hf_to_litellama(
         checkpoints_dir,
         hf_sd,
         num_layers,
-        use_gptq: bool = False,
-        wbits: int = 4,
-        groupsize: int = 8,
+        quant_method: str = "gptq",
         device: str = "cuda"
 ):
     """
@@ -276,7 +270,7 @@ def convert_llama_hf_to_litellama(
     参数:
         checkpoints_dir: Hugging Face 模型的目录
         hf_sd (dict): Hugging Face 模型的状态字典。
-        use_gptq: 是否使用 GPTQ 量化
+        quant_method: 量化方法
         wbits: 量化位数
         groupsize: 量化组大小
         device: 设备
@@ -338,8 +332,7 @@ def convert_llama_hf_to_litellama(
             del new_sd[v_key]
 
     # Apply GPTQ quantization if requested
-    if use_gptq:
-        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+    if quant_method == "gptq":
         target_layers = []
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
@@ -351,8 +344,6 @@ def convert_llama_hf_to_litellama(
 
         new_sd = quantize_gptq(
             model_state_dict=new_sd,
-            wbits=wbits,
-            groupsize=groupsize,
             target_layers=target_layers,
             device=device
         )
@@ -364,16 +355,14 @@ def convert_llama_hf_to_litellama(
             print(name, parameters)
 
     # 将处理后的权重保存到指定目录
-    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
+    build_new_weight_dir(checkpoints_dir, new_sd, quant_method=quant_method)
 
 
 def convert_llavallama_hf_to_litellama(
         checkpoints_dir,
         hf_sd,
         num_layers,
-        use_gptq: bool = False,
-        wbits: int = 4,
-        groupsize: int = 8,
+        quant_method: str = "gptq",
         device: str = "cuda"
 ):
     """
@@ -383,7 +372,7 @@ def convert_llavallama_hf_to_litellama(
         checkpoints_dir: Hugging Face 模型的目录
         hf_sd (dict): Hugging Face 模型的状态字典。
         model_config (LlamaConfig): 自定义模型的配置参数。
-        use_gptq: 是否使用 GPTQ 量化
+        quant_method: 量化方法
         wbits: 量化位数
         groupsize: 量化组大小
         device: 设备
@@ -448,8 +437,7 @@ def convert_llavallama_hf_to_litellama(
             del new_sd[v_key]
 
     # Apply GPTQ quantization if requested
-    if use_gptq:
-        print(f"\nApplying GPTQ quantization with {wbits} bits and groupsize {groupsize}...")
+    if quant_method == "gptq":
         target_layers = []
         for name in new_sd.keys():
             if any(pattern in name for pattern in [
@@ -461,8 +449,6 @@ def convert_llavallama_hf_to_litellama(
 
         new_sd = quantize_gptq(
             model_state_dict=new_sd,
-            wbits=wbits,
-            groupsize=groupsize,
             target_layers=target_layers,
             device=device
         )
@@ -473,5 +459,5 @@ def convert_llavallama_hf_to_litellama(
         else:
             print(name, parameters)
 
-    build_new_weight_dir(checkpoints_dir, new_sd, quantized=use_gptq)
+    build_new_weight_dir(checkpoints_dir, new_sd, quant_method=quant_method)
 
diff --git a/lite_llama/kernels/int4_linear.py b/lite_llama/kernels/int4_linear.py
index 805c44b..70663ab 100644
--- a/lite_llama/kernels/int4_linear.py
+++ b/lite_llama/kernels/int4_linear.py
@@ -3,7 +3,9 @@
 import torch
 import torch.nn as nn
 import numpy as np
-from ..quantization.gptq.gptq import GPTQ
+from lite_llama.quantization.gptq import GPTQ
+from lite_llama.quantization.quant_config import GPTQConfig
+
 
 @triton.autotune(
         configs=[
@@ -228,13 +230,14 @@ class GPTQLinear(nn.Module):
     4-bit quantized linear layer using Triton kernels
     """
 
-    def __init__(self, in_features, out_features, bias=True, groupsize=64, device="cuda"):
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.float16, bits=4, groupsize=64, device="cuda", tile_cols=None,):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.groupsize = groupsize
         self.device = device
-
+        self.dtype = dtype  # optional
+        self.bits = bits  # optional
         self.tile_cols = groupsize
         self.original_out_features = out_features
 
@@ -280,11 +283,30 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return output.view(*x.shape[:-1], self.out_features)
 
 
-def get_gpu_memory():
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-        return torch.cuda.memory_allocated() // (1024 ** 2)
-    return 0
+class AWQLinear(nn.Module):
+    """AWQ Quantized Linear Layer"""
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = False,
+                 w_bit: int = 4, group_size: int = 128):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.w_bit = w_bit
+        self.group_size = group_size
+
+        # Scales and zeros for each group
+        self.register_buffer("packed_weight", None)
+        self.register_buffer("scales", None)
+        self.register_buffer("zeros", None)
+        self.register_buffer("bias", None if not bias else torch.empty(out_features))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass with dequantization"""
+        # Dequantize weights on the fly
+        weight = self.dequantize_weights()
+        return torch.nn.functional.linear(x, weight.T, self.bias)
+
+
 
 
 def test_gptqlinear_vs_nnlinear(
@@ -305,8 +327,12 @@ def test_gptqlinear_vs_nnlinear(
     bias = linear.bias.detach().to(device).float() if linear.bias is not None else None
 
     # --- Quantize using GPTQ ---
-    gptq = GPTQ(wbits=wbits, groupsize=groupsize, device=device)
-    qweight, qzeros, qscales = gptq.quantize(weight)
+    config = GPTQConfig(
+        w_bit=wbits,
+        group_size=groupsize,
+    )
+    gptq = GPTQ(config)
+    qweight, qzeros, qscales, _ = gptq.quantize(weight)
     packed_weight = GPTQLinear.pack_weight(qweight)
 
     gptqlinear = GPTQLinear(in_features, out_features, bias=True, groupsize=groupsize, device=device).to(device)
diff --git a/lite_llama/quantization/awq.py b/lite_llama/quantization/awq.py
new file mode 100644
index 0000000..a7a5329
--- /dev/null
+++ b/lite_llama/quantization/awq.py
@@ -0,0 +1,492 @@
+from dataclasses import field
+
+import torch
+import torch.nn as nn
+import numpy as np
+from typing import Dict, Tuple, Optional, Any, List
+from tqdm.auto import tqdm
+import triton
+import triton.language as tl
+import time, gc, psutil, os, sys
+
+from lite_llama.quantization.quant_config import GPTQConfig  # Reusing config structure
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from lite_llama.utils.common import get_gpu_memory
+from utils import pack_weight, unpack_weight
+from lite_llama.quantization.quant_config import AWQConfig
+
+class AWQ:
+    def __init__(
+            self,
+            config: AWQConfig = field(default_factory=AWQConfig),
+    ):
+        self.wbits = config.w_bit
+        self.groupsize = config.group_size if config.group_size != -1 else float('inf')
+        self.device = config.device
+        self.maxq = 2 ** self.wbits - 1
+        self.zero_point = config.zero_point
+        self.alpha = config.alpha
+        self.search_scale = config.search_scale
+        self.auto_scale = config.auto_scale
+
+        # Store activation statistics
+        self.activation_stats = {}
+        self.collected_inputs = {}
+
+    def collect_activations(self, layer_name: str, input_tensor: torch.Tensor):
+        """Collect activation statistics for AWQ calibration"""
+        if layer_name not in self.activation_stats:
+            self.activation_stats[layer_name] = {
+                'mean': [],
+                'max': [],
+                'inputs': []
+            }
+
+        # Store input activations
+        if len(self.activation_stats[layer_name]['inputs']) < 128:  # Limit storage
+            self.activation_stats[layer_name]['inputs'].append(input_tensor.detach().cpu())
+
+        # Compute statistics across the sequence dimension
+        # Input shape is typically [batch, seq_len, hidden_dim]
+        if input_tensor.dim() == 3:
+            # Average across batch and sequence dimensions
+            channel_means = input_tensor.abs().mean(dim=(0, 1))
+            channel_maxs = input_tensor.abs().max(dim=1)[0].max(dim=0)[0]
+        elif input_tensor.dim() == 2:
+            # Average across batch dimension
+            channel_means = input_tensor.abs().mean(dim=0)
+            channel_maxs = input_tensor.abs().max(dim=0)[0]
+        else:
+            # Flatten and compute
+            channel_means = input_tensor.abs().view(-1, input_tensor.shape[-1]).mean(dim=0)
+            channel_maxs = input_tensor.abs().view(-1, input_tensor.shape[-1]).max(dim=0)[0]
+
+        self.activation_stats[layer_name]['mean'].append(channel_means.cpu())
+        self.activation_stats[layer_name]['max'].append(channel_maxs.cpu())
+
+    def get_salient_channels(self, layer_name: str, top_k: float = 0.01) -> torch.Tensor:
+        """Identify salient channels based on activation statistics"""
+        if layer_name not in self.activation_stats:
+            return None
+
+        stats = self.activation_stats[layer_name]
+
+        # Aggregate statistics across all collected samples
+        if stats['mean']:
+            mean_activations = torch.stack(stats['mean']).mean(dim=0)
+            max_activations = torch.stack(stats['max']).mean(dim=0)
+
+            # Combine mean and max for saliency score
+            saliency_score = mean_activations * 0.7 + max_activations * 0.3
+
+            # Select top-k% most salient channels
+            num_salient = max(1, int(len(saliency_score) * top_k))
+            _, salient_indices = torch.topk(saliency_score, num_salient)
+
+            return salient_indices
+
+        return None
+
+    def pseudo_quantize_tensor(self, w: torch.Tensor, n_bit: int = 4, zero_point: bool = True,
+                               q_group_size: int = -1, inplace: bool = False):
+        """Pseudo-quantize tensor to simulate quantization effects"""
+        org_w_shape = w.shape
+        if q_group_size > 0:
+            assert org_w_shape[-1] % q_group_size == 0
+            w = w.reshape(-1, q_group_size)
+
+        assert w.dim() == 2
+        if zero_point:
+            max_val = w.amax(dim=1, keepdim=True)
+            min_val = w.amin(dim=1, keepdim=True)
+            max_int = 2 ** n_bit - 1
+            min_int = 0
+            scales = (max_val - min_val).clamp(min=1e-5) / max_int
+            zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
+        else:
+            max_val = w.abs().amax(dim=1, keepdim=True)
+            max_val = max_val.clamp(min=1e-5)
+            max_int = 2 ** (n_bit - 1) - 1
+            min_int = -(2 ** (n_bit - 1))
+            scales = max_val / max_int
+            zeros = torch.zeros_like(scales)
+
+        assert torch.isnan(scales).sum() == 0
+        assert torch.isnan(w).sum() == 0
+
+        if inplace:
+            ((w.div_(scales).round_().add_(zeros)).clamp_(min_int, max_int).sub_(zeros)).mul_(scales)
+            return w
+        else:
+            w_sim = ((w / scales).round() + zeros).clamp(min_int, max_int)
+            w_sim = (w_sim - zeros) * scales
+            return w_sim.reshape(org_w_shape)
+
+    def search_best_scale(self, layer_name: str, weight: torch.Tensor, input_feat: torch.Tensor) -> torch.Tensor:
+        """Search for the best per-channel scaling factors"""
+        device = weight.device
+        org_out = torch.matmul(input_feat, weight.t())
+
+        if org_out.abs().max() < 0.2:
+            return torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
+
+        w_abs_max = weight.abs().max(dim=1)[0].clamp(min=1e-5)
+
+        # Get salient channels for this layer
+        salient_channels = self.get_salient_channels(layer_name)
+
+        # Grid search for best scaling factors
+        best_error = float('inf')
+        best_scales = torch.ones_like(w_abs_max)
+
+        # Different alpha values for grid search
+        alpha_candidates = [0.0, 0.1, 0.25, 0.5, 0.75, 1.0] if self.search_scale else [self.alpha]
+
+        for alpha in alpha_candidates:
+            # Compute scales based on activation statistics
+            if salient_channels is not None and len(salient_channels) > 0:
+                # Protect salient channels with different scaling
+                scales = torch.ones_like(w_abs_max)
+
+                # For salient channels, use more conservative scaling
+                if layer_name in self.activation_stats:
+                    stats = self.activation_stats[layer_name]
+                    if stats['mean']:
+                        mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
+
+                        # Scale based on activation magnitude
+                        activation_scales = mean_activations.pow(alpha)
+                        activation_scales = activation_scales / activation_scales.max()
+
+                        # Apply different scaling to salient vs non-salient channels
+                        scales = activation_scales.clamp(min=0.1, max=1.0)
+
+                        # Give salient channels more protection (higher scale values)
+                        scales[salient_channels] = scales[salient_channels].clamp(min=0.5)
+                else:
+                    # Fallback to weight-based scaling
+                    scales = w_abs_max.pow(alpha)
+                    scales = scales / scales.max()
+            else:
+                # Standard AWQ scaling without saliency
+                if layer_name in self.activation_stats and self.activation_stats[layer_name]['mean']:
+                    stats = self.activation_stats[layer_name]
+                    mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
+                    scales = mean_activations.pow(alpha)
+                    scales = scales / scales.max()
+                else:
+                    scales = w_abs_max.pow(alpha)
+                    scales = scales / scales.max()
+
+            scales = scales.clamp(min=0.1, max=1.0)
+
+            # Apply scaling and quantize
+            weight_scaled = weight * scales.view(-1, 1)
+            weight_sim = self.pseudo_quantize_tensor(
+                weight_scaled,
+                n_bit=self.wbits,
+                zero_point=self.zero_point,
+                q_group_size=self.groupsize if self.groupsize != float('inf') else -1
+            )
+
+            # Compute error
+            out_sim = torch.matmul(input_feat, weight_sim.t())
+            loss = (org_out - out_sim).float().pow(2).mean().item()
+
+            if loss < best_error:
+                best_error = loss
+                best_scales = scales.clone()
+
+        return best_scales
+
+    def quantize_with_scales(self, weight: torch.Tensor, scales: torch.Tensor) -> Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Quantize weight with given per-channel scales"""
+        device = weight.device
+        rows, cols = weight.shape
+
+        # Apply per-channel scaling
+        weight_scaled = weight * scales.view(-1, 1)
+
+        # Group-wise quantization
+        if self.groupsize == float('inf'):
+            groupsize = cols
+        else:
+            groupsize = min(int(self.groupsize), cols)
+
+        num_groups = (cols + groupsize - 1) // groupsize
+
+        qweight = torch.zeros_like(weight_scaled, dtype=torch.uint8)
+        qzeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+        qscales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+
+        for g in range(num_groups):
+            start_col = g * groupsize
+            end_col = min((g + 1) * groupsize, cols)
+
+            w_group = weight_scaled[:, start_col:end_col]
+
+            if self.zero_point:
+                w_min = w_group.min(dim=1, keepdim=True)[0]
+                w_max = w_group.max(dim=1, keepdim=True)[0]
+
+                range_val = (w_max - w_min).clamp(min=1e-5)
+                scale = range_val / self.maxq
+                zero = torch.round(-w_min / scale).clamp(0, self.maxq)
+
+            else:
+                w_max = w_group.abs().max(dim=1, keepdim=True)[0].clamp(min=1e-5)
+                scale = w_max / (2 ** (self.wbits - 1) - 1)
+                zero = torch.zeros_like(scale)
+
+            # Quantize
+            if self.zero_point:
+                q = torch.clamp(torch.round(w_group / scale + zero), 0, self.maxq)
+            else:
+                q = torch.clamp(torch.round(w_group / scale), -(2 ** (self.wbits - 1)), 2 ** (self.wbits - 1) - 1)
+
+            qweight[:, start_col:end_col] = q.to(torch.uint8)
+            qscales[:, g] = scale.squeeze(-1)
+            qzeros[:, g] = zero.squeeze(-1)
+
+        return qweight, qzeros, qscales
+
+    def quantize(self, weight: torch.Tensor, layer_name: str = "") -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Main AWQ quantization function
+        Args:
+            weight: Weight tensor to quantize [out_features, in_features]
+            layer_name: Name of the layer for activation lookup
+        Returns:
+            Tuple of (quantized_weight, zeros, scales)
+        """
+        assert weight.ndim == 2
+        device = weight.device
+
+        # Get representative input if available
+        input_feat = None
+        if layer_name in self.activation_stats and self.activation_stats[layer_name]['inputs']:
+            # Use first few inputs for calibration
+            inputs = self.activation_stats[layer_name]['inputs'][:5]
+            input_feat = torch.cat([inp.to(device) for inp in inputs], dim=0)
+
+            # Reshape if needed: [batch*seq, hidden] -> [batch*seq, hidden]
+            if input_feat.dim() == 3:
+                input_feat = input_feat.view(-1, input_feat.shape[-1])
+
+        # Search for best scales if we have input data
+        if input_feat is not None and self.search_scale:
+            scales = self.search_best_scale(layer_name, weight, input_feat)
+        else:
+            # Fallback to uniform scaling or activation-based scaling
+            if self.auto_scale and layer_name in self.activation_stats:
+                stats = self.activation_stats[layer_name]
+                if stats['mean']:
+                    mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
+                    scales = mean_activations.pow(self.alpha)
+                    scales = scales / scales.max()
+                    scales = scales.clamp(min=0.1, max=1.0)
+                else:
+                    scales = torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
+            else:
+                scales = torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
+
+        # Quantize with computed scales
+        qweight, qzeros, qscales = self.quantize_with_scales(weight, scales)
+
+        return qweight, qzeros, qscales
+
+    def dequantize(self, qweight: torch.Tensor, qzeros: torch.Tensor, qscales: torch.Tensor) -> torch.Tensor:
+        """Dequantize weights back to floating point"""
+        rows, cols = qweight.shape
+        groupsize = min(int(self.groupsize), cols) if self.groupsize != float('inf') else cols
+        num_groups = (cols + groupsize - 1) // groupsize
+
+        weight = torch.zeros_like(qweight, dtype=torch.float16)
+
+        for g in range(num_groups):
+            start_col = g * groupsize
+            end_col = min((g + 1) * groupsize, cols)
+
+            scale = qscales[:, g].unsqueeze(1)
+            zero = qzeros[:, g].unsqueeze(1)
+
+            q = qweight[:, start_col:end_col].float()
+
+            if self.zero_point:
+                weight[:, start_col:end_col] = (q - zero) * scale
+            else:
+                weight[:, start_col:end_col] = q * scale
+
+        return weight
+
+    def dequantize_packed(self, packed_qweight: torch.Tensor, qzeros: torch.Tensor,
+                          qscales: torch.Tensor, original_cols: int) -> torch.Tensor:
+        """Dequantize packed weights"""
+        # Unpack the weights first
+        qweight = unpack_weight(packed_qweight, original_cols)
+        # Then dequantize normally
+        return self.dequantize(qweight, qzeros, qscales)
+
+
+def quantize_awq(
+        model_state_dict: Dict[str, torch.Tensor],
+        calibration_loader: Optional[Any] = None,
+        model: Optional[torch.nn.Module] = None,
+        wbits: int = 4,
+        groupsize: int = 128,
+        target_layers: Optional[List[str]] = None,
+        device: str = "cuda"
+) -> Dict[str, torch.Tensor]:
+    """
+    Quantize model weights using AWQ algorithm
+
+    Args:
+        model_state_dict: Original model state dictionary
+        calibration_loader: DataLoader for calibration data
+        model: Original model for activation collection
+        wbits: Number of bits for quantization
+        groupsize: Group size for quantization
+        target_layers: List of layer names to quantize
+        device: Device to perform quantization on
+
+    Returns:
+        Dictionary containing quantized weights and quantization parameters
+    """
+    config = AWQConfig(
+        w_bit=wbits,
+        group_size=groupsize,
+        device=device
+    )
+
+    awq = AWQ(config)
+    quantized_state_dict = {}
+
+    # Default target layers if not specified
+    if target_layers is None:
+        target_layers = []
+        for name in model_state_dict.keys():
+            if any(pattern in name for pattern in [
+                "q_proj", "k_proj", "v_proj", "o_proj",
+                "gate_proj", "up_proj", "down_proj",
+                "kv_proj", "lm_head"
+            ]):
+                target_layers.append(name)
+
+    # Collect activation statistics if calibration data is provided
+    if calibration_loader is not None and model is not None:
+        print("Collecting activation statistics for AWQ...")
+
+        # Register hooks to collect activations
+        hooks = []
+
+        def make_hook(layer_name):
+            def hook_fn(module, input, output):
+                if isinstance(input, tuple) and len(input) > 0:
+                    awq.collect_activations(layer_name, input[0])
+                else:
+                    awq.collect_activations(layer_name, input)
+
+            return hook_fn
+
+        # Register hooks for target layers
+        for name, module in model.named_modules():
+            if name in target_layers and isinstance(module, torch.nn.Linear):
+                hook = module.register_forward_hook(make_hook(name))
+                hooks.append(hook)
+
+        # Run calibration
+        model.eval()
+        with torch.no_grad():
+            for i, batch in enumerate(tqdm(calibration_loader, desc="Calibration")):
+                if i >= 32:  # Limit calibration samples
+                    break
+
+                # Move batch to device
+                if isinstance(batch, dict):
+                    batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
+                    outputs = model(**batch)
+                elif isinstance(batch, (list, tuple)):
+                    batch = [b.to(device) if torch.is_tensor(b) else b for b in batch]
+                    outputs = model(*batch)
+                else:
+                    batch = batch.to(device)
+                    outputs = model(batch)
+
+        # Remove hooks
+        for hook in hooks:
+            hook.remove()
+
+        print(f"Collected statistics for {len(awq.activation_stats)} layers")
+
+    print(f"Quantizing {len(target_layers)} layers to {wbits} bits with AWQ...")
+
+    # Quantize each target layer
+    for name, param in tqdm(model_state_dict.items(), desc="Quantizing layers"):
+        if name in target_layers and param.dim() == 2:
+            # Move weight to device
+            weight = param.to(device).float()
+
+            # Get layer name without .weight suffix for activation lookup
+            layer_name = name.replace(".weight", "").replace("_weight", "")
+
+            # Quantize using AWQ
+            qweight, qzeros, qscales = awq.quantize(weight, layer_name)
+
+            # Store quantized parameters
+            base_name = layer_name
+            quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
+            quantized_state_dict[f"{base_name}.qzeros"] = qzeros.cpu()
+            quantized_state_dict[f"{base_name}.qscales"] = qscales.cpu()
+
+        else:
+            # Keep non-quantized parameters as is
+            quantized_state_dict[name] = param.cpu()
+
+    print("AWQ quantization completed!")
+    return quantized_state_dict
+
+
+# Example usage function
+def demo_awq():
+    """Demo function showing how to use AWQ"""
+    # Create a dummy model state dict
+    dummy_state_dict = {
+        "layer1.q_proj.weight": torch.randn(768, 768),
+        "layer1.k_proj.weight": torch.randn(768, 768),
+        "layer1.v_proj.weight": torch.randn(768, 768),
+        "layer1.o_proj.weight": torch.randn(768, 768),
+        "other_param": torch.randn(100)
+    }
+
+    # Quantize without calibration data (will use default scaling)
+    quantized_dict = quantize_awq(
+        model_state_dict=dummy_state_dict,
+        wbits=4,
+        groupsize=128,
+        device="cpu"
+    )
+
+    print("Quantized keys:", list(quantized_dict.keys()))
+
+    # Test dequantization
+    config = AWQConfig(w_bit=4, group_size=128, device="cpu")
+    awq = AWQ(config)
+
+    # Dequantize one layer
+    original_weight = dummy_state_dict["layer1.q_proj.weight"]
+    dequant_weight = awq.dequantize(
+        quantized_dict["layer1.q_proj.qweight"],
+        quantized_dict["layer1.q_proj.qzeros"],
+        quantized_dict["layer1.q_proj.qscales"]
+    )
+
+    print(f"Original shape: {original_weight.shape}")
+    print(f"Dequantized shape: {dequant_weight.shape}")
+    print(f"Quantization error: {(original_weight - dequant_weight).abs().mean():.6f}")
+
+
+if __name__ == "__main__":
+    demo_awq()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq/gptq.py b/lite_llama/quantization/gptq.py
similarity index 52%
rename from lite_llama/quantization/gptq/gptq.py
rename to lite_llama/quantization/gptq.py
index 3624aeb..af219dd 100755
--- a/lite_llama/quantization/gptq/gptq.py
+++ b/lite_llama/quantization/gptq.py
@@ -1,3 +1,5 @@
+from dataclasses import field
+
 import torch
 import torch.nn as nn
 import numpy as np
@@ -6,31 +8,23 @@
 import triton
 import triton.language as tl
 import time, gc, psutil, os, sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 
-from lite_llama.utils.common import get_gpu_memory  # Replace with actual GPU mem check if needed
 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from lite_llama.quantization.quant_config import GPTQConfig
 
+from lite_llama.utils.common import get_gpu_memory  # Replace with actual GPU mem check if needed
+from lite_llama.quantization.utils import pack_weight, unpack_weight
 
 class GPTQ:
     def __init__(
             self,
-            layer: nn.Module = None,
-            wbits: int = 4,
-            groupsize: int = 8,
-            actorder: bool = False,
-            percdamp: float = 0.01,
-            blocksize: int = 128,
-            device: str = "cuda"
+            config: GPTQConfig = field(default_factory=GPTQConfig),
     ):
-        self.layer = layer
-        self.wbits = wbits
-        self.groupsize = groupsize if groupsize != -1 else float('inf')
-        self.actorder = actorder
-        self.percdamp = percdamp
-        self.blocksize = blocksize
-        self.device = device
-        self.maxq = 2 ** wbits - 1
+        self.wbits = config.w_bit
+        self.groupsize = config.group_size if config.group_size != -1 else float('inf')
+        self.device = config.device
+        self.maxq = 2 ** self.wbits - 1
 
     def relative_error_loss(self, w_original: torch.Tensor, w_reconstructed: torch.Tensor,
                             eps: float = 1e-5) -> torch.Tensor:
@@ -126,34 +120,15 @@ def magnitude_aware_quantization(self, w_group: torch.Tensor) -> Tuple[torch.Ten
         """Use different strategies based on weight magnitudes"""
         device = w_group.device
         w_abs = w_group.abs()
-        w_std = w_group.std(dim=-1, keepdim=True)
-        w_mean = w_group.mean(dim=-1, keepdim=True)
 
-        # Strategy 1: For groups with large dynamic range, use log-scale quantization
         dynamic_range = w_abs.max(dim=-1, keepdim=True)[0] / (w_abs.min(dim=-1, keepdim=True)[0] + 1e-8)
 
         if dynamic_range.mean() > 100:  # High dynamic range
-            # Use log-space quantization for better relative precision
-            sign = torch.sign(w_group)
-            w_abs_log = torch.log(w_abs + 1e-8)
-
-            log_min = w_abs_log.min(dim=-1, keepdim=True)[0]
-            log_max = w_abs_log.max(dim=-1, keepdim=True)[0]
-
-            scale_log = (log_max - log_min) / (self.maxq - 1)
-            zero_log = torch.round(-log_min / scale_log).clamp(0, self.maxq - 1)
-
-            # Convert back to linear scale
-            q_log = torch.clamp(torch.round((w_abs_log - log_min) / scale_log), 0, self.maxq - 1)
-            w_abs_rec = torch.exp(log_min + q_log * scale_log)
-            w_rec = sign * w_abs_rec
-
             # Compute equivalent linear scale and zero
             scale = (w_group.max(dim=-1, keepdim=True)[0] - w_group.min(dim=-1, keepdim=True)[0]) / self.maxq
             zero = torch.round(-w_group.min(dim=-1, keepdim=True)[0] / scale).clamp(0, self.maxq)
 
         else:
-            # Strategy 2: For normal range, use adaptive clipping
             # Use robust statistics to set bounds
             median = w_group.median(dim=-1, keepdim=True)[0]
             mad = (w_group - median).abs().median(dim=-1, keepdim=True)[0]  # Median Absolute Deviation
@@ -171,14 +146,33 @@ def magnitude_aware_quantization(self, w_group: torch.Tensor) -> Tuple[torch.Ten
 
         return scale, zero
 
-    def quantize(self, W: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    def dequantize_packed(self, packed_qweight: torch.Tensor, zeros: torch.Tensor,
+                          scales: torch.Tensor, original_cols: int) -> torch.Tensor:
+        """
+        Dequantize packed weights
+        Args:
+            packed_qweight: Packed quantized weights [O, I//2]
+            zeros: Zero points [O, num_groups]
+            scales: Scales [O, num_groups]
+            original_cols: Original number of columns before packing
+        Returns:
+            Dequantized weights [O, I]
+        """
+        # Unpack the weights first
+        qweight = unpack_weight(packed_qweight, original_cols)
+
+        # Then dequantize normally
+        return self.dequantize(qweight, zeros, scales)
+
+    def quantize(self, W: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
         """
         Quantization optimized specifically for minimal relative error
-        Returns: [O, I] int4, [O, num_groups] zero, [O, num_groups] scale
+        Returns: [O, I//2] packed int4, [O, num_groups] zero, [O, num_groups] scale, original_cols
         """
         assert W.ndim == 2
         rows, cols = W.shape
         device = W.device
+        original_cols = cols
 
         # Use very small groups for maximum precision
         effective_groupsize = min(int(self.groupsize), 8) if self.groupsize != float('inf') else 8
@@ -254,7 +248,10 @@ def quantize(self, W: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.T
                 scales[:, g] = 1.0
                 zeros[:, g] = 0
 
-        return qweight, zeros.to(torch.float16), scales.to(torch.float16)
+        # Pack the weights before returning
+        packed_qweight = pack_weight(qweight)
+
+        return packed_qweight, zeros.to(torch.float16), scales.to(torch.float16), original_cols
 
 
     def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
@@ -280,9 +277,6 @@ def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor, scales: torch.T
 
 def quantize_gptq(
         model_state_dict: Dict[str, torch.Tensor],
-        calibration_data: Optional[torch.Tensor] = None,
-        wbits: int = 4,
-        groupsize: int = 8,
         target_layers: Optional[list] = None,
         device: str = "cuda"
 ) -> Dict[str, torch.Tensor]:
@@ -309,35 +303,27 @@ def quantize_gptq(
             if any(pattern in name for pattern in [
                 "q_proj", "k_proj", "v_proj", "o_proj",
                 "gate_proj", "up_proj", "down_proj",
-                "kv_proj"
+                "kv_proj", "lm_head"
             ]):
                 target_layers.append(name)
-
-    print(f"Quantizing {len(target_layers)} layers to {wbits} bits...")
+    config = GPTQConfig()
 
     for name, param in tqdm(model_state_dict.items(), desc="Processing layers"):
         if name in target_layers and param.dim() == 2:
             # Create GPTQ quantizer for this layer
-            gptq = GPTQ(
-                layer=None,  # We're working directly with tensors
-                wbits=wbits,
-                groupsize=groupsize,
-                device=device
-            )
+            gptq = GPTQ(config)
 
             # Move weight to device
             weight = param.to(device).float()
 
             # Quantize the weight
-            qweight, qzeros, scales = gptq.quantize(weight)
+            qweight, qzeros, scales, _ = gptq.quantize(weight)
 
             # Store quantized parameters
             base_name = name.replace(".weight", "").replace("_weight", "")
             quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
             quantized_state_dict[f"{base_name}.qzeros"] = qzeros.cpu()
             quantized_state_dict[f"{base_name}.scales"] = scales.cpu()
-            quantized_state_dict[f"{base_name}.wbits"] = torch.tensor(wbits)
-            quantized_state_dict[f"{base_name}.groupsize"] = torch.tensor(groupsize)
 
         else:
             # Keep non-quantized parameters as is
@@ -345,276 +331,3 @@ def quantize_gptq(
 
     return quantized_state_dict
 
-@triton.autotune(
-        configs=[
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 256,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 256,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=5,
-                num_warps=2,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 32,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=5,
-                num_warps=2,
-            ),
-        ],
-        key=["M", "N", "K"],
-    )
-
-@triton.jit
-def int4_gemm_kernel(
-    a_ptr, b_ptr, c_ptr,
-    bscales_ptr, bzeros_ptr,
-    M, N, K,
-    stride_am, stride_ak,
-    stride_bk, stride_bn,
-    stride_cm, stride_cn,
-    GROUP_SIZE: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
-    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_SIZE_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    offs_k = tl.arange(0, BLOCK_SIZE_K)
-
-    a_mask = offs_am[:, None] < M
-    b_mask = offs_bn[None, :] < N
-
-    a_ptrs = a_ptr + stride_am * offs_am[:, None] + stride_ak * offs_k[None, :]
-    b_ptrs = b_ptr + stride_bn * offs_bn[None, :] + stride_bk * (offs_k[:, None] // 2)
-
-    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float16)
-
-    for k in range(0, K, BLOCK_SIZE_K):
-        b_q = tl.load(b_ptrs, mask=b_mask)
-
-        a = tl.load(a_ptrs, mask=a_mask).to(tl.float16)
-
-        # Compute per-group index
-        k_offset = k + offs_k  # shape: [BLOCK_SIZE_K]
-        group_idx = k_offset // GROUP_SIZE  # [BLOCK_SIZE_K]
-
-        # Load scale and zero for each [N, G]
-        scale = tl.load(bscales_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
-        zero = tl.load(bzeros_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)   # same shape
-
-        # Extract int4 values from uint8
-        shift = (k_offset[:, None] % 2) * 4
-        q = (b_q.to(tl.uint8) >> shift) & 0xF
-        b_deq = (q.to(tl.float16) - zero) * scale
-
-        accumulator += tl.dot(a, b_deq, out_dtype=tl.float16)
-
-        a_ptrs += BLOCK_SIZE_K * stride_ak
-        b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk
-
-    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
-
-    tl.store(c_ptrs, accumulator, mask=c_mask)
-
-
-def triton_int4_gemm(
-    inp: torch.Tensor,
-    weight: torch.Tensor,
-    scales: torch.Tensor,
-    zeros: torch.Tensor,
-    group_size: int = 64
-) -> torch.Tensor:
-
-
-    weight = weight.t().contiguous()  # [K/2, N]
-    c_shape = inp.shape[:-1] + weight.shape[-1:]
-    inp = inp.view(-1, inp.shape[-1]).contiguous()
-
-    PAD_TO = 256
-    if inp.shape[0] % PAD_TO != 0:
-        c_crop = inp.shape[0]
-        new_inp = inp.new_zeros(((inp.shape[0] + PAD_TO - 1) // PAD_TO * PAD_TO, inp.shape[1]))
-        new_inp[:c_crop] = inp
-        inp = new_inp
-    else:
-        c_crop = None
-
-    M, K = inp.shape
-    N = weight.shape[1]
-
-    c = torch.empty((M, N), device=inp.device, dtype=torch.float16)
-
-    grid = lambda META: (
-        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
-    )
-
-    int4_gemm_kernel[grid](
-        inp, weight, c,
-        scales, zeros,
-        M, N, K,
-        inp.stride(0), inp.stride(1),
-        weight.stride(0), weight.stride(1),
-        c.stride(0), c.stride(1),
-        GROUP_SIZE=group_size,
-    )
-
-    return c[:c_crop] if c_crop is not None else c.view(c_shape)
-
-class GPTQLinear(nn.Module):
-    """
-    4-bit quantized linear layer using Triton kernels
-    """
-
-    def __init__(self, in_features, out_features, bias=True, groupsize=64, device="cuda"):
-        super().__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.groupsize = groupsize
-        self.device = device
-
-        self.tile_cols = groupsize
-        self.original_out_features = out_features
-
-        # Quantized params (assigned later)
-        self.register_buffer("packed_weight", None)
-        self.register_buffer("scales", None)
-        self.register_buffer("zeros", None)
-        self.register_buffer("bias", None if not bias else torch.empty(out_features))
-
-    @staticmethod
-    def pack_weight(weight):
-        rows, cols = weight.shape
-        if cols % 2 != 0:
-            weight = torch.nn.functional.pad(weight, (0, 1), value=0)
-            cols += 1
-        packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
-        return packed.contiguous()
-
-    def get_weight(self, packed: torch.Tensor) -> torch.Tensor:
-        """
-        [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
-        """
-        rows, packed_cols = packed.shape
-        qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
-        qweight[:, 0::2] = packed & 0xF
-        qweight[:, 1::2] = (packed >> 4) & 0xF
-        return qweight[:, :self.in_features].contiguous()
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x_flat = x.view(-1, self.in_features)
-        # Compute quantized matmul
-        output = triton_int4_gemm(
-            x_flat.float(),
-            self.packed_weight,
-            self.scales,
-            self.zeros,
-            group_size=self.groupsize,
-        )
-
-        if self.bias is not None:
-            output += self.bias
-
-        return output.view(*x.shape[:-1], self.out_features)
-
-
diff --git a/lite_llama/quantization/gptq/__init__.py b/lite_llama/quantization/gptq/__init__.py
deleted file mode 100755
index e69de29..0000000
diff --git a/lite_llama/quantization/gptq/gptq_loader.py b/lite_llama/quantization/gptq/gptq_loader.py
deleted file mode 100644
index 6f70ef3..0000000
--- a/lite_llama/quantization/gptq/gptq_loader.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import torch
-import torch.nn as nn
-from typing import Dict, Optional
-import os.path as osp
-from .gptq import *
-
-
-def load_quantized_state_dict(checkpoint_path: str, device: str = "cuda") -> Dict[str, torch.Tensor]:
-    """
-    Load a quantized state dictionary from checkpoint.
-
-    Args:
-        checkpoint_path: Path to the .pth file
-        device: Device to load tensors to
-
-    Returns:
-        State dictionary with quantized weights
-    """
-    print(f"Loading quantized model from {checkpoint_path}")
-    state_dict = torch.load(checkpoint_path, map_location=device)
-
-    # Check if this is a quantized model
-    quantized_keys = [k for k in state_dict.keys() if '.qweight' in k]
-    if quantized_keys:
-        print(f"Found {len(quantized_keys)} quantized layers")
-    else:
-        print("No quantized layers found - this appears to be a regular model")
-
-    return state_dict
-
-
-def create_dequantized_state_dict(quantized_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-    """
-    Create a dequantized state dictionary from a quantized one.
-    This is useful for models that don't support on-the-fly dequantization.
-
-    Args:
-        quantized_state_dict: State dictionary with quantized weights
-
-    Returns:
-        State dictionary with dequantized weights
-    """
-    dequantized_dict = {}
-    processed_layers = set()
-
-    for key, value in quantized_state_dict.items():
-        if '.qweight' in key:
-            # Extract base name without the '.qweight' suffix
-            base_name = key.replace('.qweight', '')
-
-            if base_name not in processed_layers:
-                processed_layers.add(base_name)
-
-                # Retrieve quantization parameters
-                qweight = quantized_state_dict[f"{base_name}.qweight"]
-                qzeros = quantized_state_dict[f"{base_name}.qzeros"]
-                scales = quantized_state_dict[f"{base_name}.scales"]
-                wbits = quantized_state_dict.get(f"{base_name}.wbits", torch.tensor(4)).item()
-
-                # Dequantize to regular fp16 weights
-                gptq = GPTQ(wbits=4, groupsize=8)
-                weight = gptq.dequantize(qweight, qzeros, scales)
-
-                # Store dequantized weight; handle naming with or without '.weight'
-                if "_weight" in base_name:
-                    dequantized_dict[base_name] = weight
-                else:
-                    dequantized_dict[f"{base_name}.weight"] = weight
-
-                # Copy bias if present
-                for bias_key in (f"{base_name}.bias", f"{base_name}_bias"):
-                    if bias_key in quantized_state_dict:
-                        dequantized_dict[bias_key] = quantized_state_dict[bias_key]
-
-
-
-        elif not any(suffix in key for suffix in ['.qzeros', '.scales', '.wbits', '.groupsize']):
-
-            # Preserve all other parameters
-            dequantized_dict[key] = value
-
-    print(f"Dequantized {len(processed_layers)} layers")
-    return dequantized_dict
-
-
-# Example usage functions
-
-def load_gptq_model_for_inference(model: nn.Module, checkpoint_path: str, device: str = "cuda"):
-    """
-    Load a GPTQ quantized model for inference.
-
-    Args:
-        model: The model architecture (should match the quantized model)
-        checkpoint_path: Path to the quantized .pth file
-        device: Device to load model to
-
-    Example:
-        >>> model = YourModelClass(config)
-        >>> load_gptq_model_for_inference(model, "my_weight/model_gptq.pth")
-        >>> # Model is now ready for inference with automatic dequantization
-    """
-    # Load quantized state dict
-    quantized_state_dict = load_quantized_state_dict(checkpoint_path, device)
-
-    # Check if model uses quantized weights
-    if any('.qweight' in k for k in quantized_state_dict.keys()):
-        print("Dequantizing weights for standard model inference...")
-        # Create dequantized state dict
-        dequantized_state_dict = create_dequantized_state_dict(quantized_state_dict)
-        # Load into model
-        model.load_state_dict(dequantized_state_dict, strict=False)
-    else:
-        # Regular model, load normally
-        model.load_state_dict(quantized_state_dict)
-
-    model.to(device)
-    model.eval()
-
-    return model
-
-
-def compare_model_sizes(original_path: str, quantized_path: str):
-    """
-    Compare file sizes between original and quantized models.
-
-    Args:
-        original_path: Path to original .pth file
-        quantized_path: Path to quantized .pth file
-    """
-    import os
-
-    if os.path.exists(original_path):
-        original_size = os.path.getsize(original_path) / (1024 ** 3)  # GB
-        print(f"Original model size: {original_size:.2f} GB")
-    else:
-        print(f"Original model not found at {original_path}")
-        return
-
-    if os.path.exists(quantized_path):
-        quantized_size = os.path.getsize(quantized_path) / (1024 ** 3)  # GB
-        print(f"Quantized model size: {quantized_size:.2f} GB")
-
-        compression_ratio = original_size / quantized_size
-        print(f"Compression ratio: {compression_ratio:.2f}x")
-        print(f"Size reduction: {(1 - quantized_size / original_size) * 100:.1f}%")
-    else:
-        print(f"Quantized model not found at {quantized_path}")
\ No newline at end of file
diff --git a/lite_llama/quantization/quant_config.py b/lite_llama/quantization/quant_config.py
new file mode 100644
index 0000000..c4a1d8e
--- /dev/null
+++ b/lite_llama/quantization/quant_config.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class AWQConfig:
+
+    """Configuration for AWQ quantization"""
+    w_bit: int = 4  # Weight quantization bits
+    group_size: int = 128  # Group size for quantization
+    zero_point: bool = True  # Whether to use zero point
+    version: str = "GEMM"  # GEMM or GEMV
+    calib_data_size: int = 128  # Calibration dataset size
+    search_scale: bool = False  # Whether to search for optimal scales
+    auto_scale: bool = True  # Automatic scaling
+    device: str = "cuda"
+    alpha = 0.5
+
+
+@dataclass
+class GPTQConfig:
+    """Configuration for AWQ quantization"""
+    w_bit: int = 4  # Weight quantization bits
+    group_size: int = 128  # Group size for quantization
+    device: str = "cuda"
diff --git a/lite_llama/quantization/utils.py b/lite_llama/quantization/utils.py
new file mode 100644
index 0000000..2482e6e
--- /dev/null
+++ b/lite_llama/quantization/utils.py
@@ -0,0 +1,19 @@
+import torch
+
+
+def pack_weight(weight):
+    """Pack two 4-bit values into one uint8 value"""
+    rows, cols = weight.shape
+    if cols % 2 != 0:
+        weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+        cols += 1
+    packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
+    return packed.contiguous()
+
+def unpack_weight(packed_weight, original_cols):
+    """Unpack uint8 values back to two 4-bit values"""
+    rows, packed_cols = packed_weight.shape
+    unpacked = torch.zeros((rows, packed_cols * 2), dtype=torch.uint8, device=packed_weight.device)
+    unpacked[:, 0::2] = packed_weight & 0xF
+    unpacked[:, 1::2] = (packed_weight >> 4) & 0xF
+    return unpacked[:, :original_cols].contiguous()
\ No newline at end of file
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index e552486..9316369 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -206,7 +206,7 @@ def get_model_dtype(checkpoints_dir: str):
 def quantization(mode: str = None):
     quantized_linear_cls = None
     if mode == 'gptq.int4':
-        from ..quantization.gptq.gptq import GPTQLinear
+        from ..kernels.int4_linear import GPTQLinear
         quantized_linear_cls = functools.partial(GPTQLinear, bits=4, tile_cols=-1)
     elif mode is not None:
         raise ValueError(f"Unknown quantization mode: {mode}")
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..397e719
--- /dev/null
+++ b/test.py
@@ -0,0 +1,263 @@
+import torch
+import triton
+import triton.language as tl
+import torch.nn as nn
+from typing import Optional
+
+
+@triton.jit
+def _int4_linear_kernel(
+    input_ptr,        # [M, K]
+    qweight_ptr,      # [N, K//2]
+    scales_ptr,       # [N, K//groupsize]
+    zeros_ptr,        # [N, K//groupsize]
+    output_ptr,       # [M, N]
+    bias_ptr,         # [N] or dummy
+    M: tl.constexpr,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    groupsize: tl.constexpr,
+    stride_im, stride_ik,
+    stride_wn, stride_wk,
+    stride_sn, stride_sg,
+    stride_zn, stride_zg,
+    stride_om, stride_on,
+    HAS_BIAS: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    mask_m = offs_m < M
+    mask_n = offs_n < N
+    mask_k = offs_k < K
+
+    # Input block: [BLOCK_SIZE_M, BLOCK_SIZE_K]
+    input_ptrs = input_ptr + offs_m[:, None] * stride_im + offs_k[None, :] * stride_ik
+    input_block = tl.load(input_ptrs, mask=mask_m[:, None] & mask_k[None, :], other=0.0)
+
+    # ---- Load and unpack packed int4 weights ----
+    packed_k = offs_k // 2  # [BLOCK_SIZE_K] → K//2 indices
+    is_high = offs_k % 2    # [BLOCK_SIZE_K]
+
+    weight_ptrs = qweight_ptr + offs_n[:, None] * stride_wn + packed_k[None, :] * stride_wk
+    packed_vals = tl.load(weight_ptrs, mask=mask_n[:, None] & (packed_k[None, :] < (K // 2)), other=0)
+
+    low = packed_vals & 0xF
+    high = (packed_vals >> 4) & 0xF
+    unpacked = tl.where(is_high[None, :] == 1, high, low).to(tl.float32)  # [N, K]
+
+    # ---- Dequantization ----
+    group_id = (offs_k // groupsize)[None, :]  # [1, K]
+    scale_ptrs = scales_ptr + offs_n[:, None] * stride_sn + group_id * stride_sg
+    zero_ptrs = zeros_ptr + offs_n[:, None] * stride_zn + group_id * stride_zg
+
+    scale_vals = tl.load(scale_ptrs, mask=mask_n[:, None], other=1.0)
+    zero_vals = tl.load(zero_ptrs, mask=mask_n[:, None], other=0.0)
+
+    dequant = (unpacked - zero_vals) * scale_vals  # [BLOCK_SIZE_N, BLOCK_SIZE_K]
+
+    # ---- GEMM ----
+    acc = tl.dot(input_block, tl.trans(dequant))  # [BLOCK_SIZE_M, BLOCK_SIZE_N]
+
+    # ---- Add bias if present ----
+    if HAS_BIAS:
+        bias_ptrs = bias_ptr + offs_n
+        bias_vals = tl.load(bias_ptrs, mask=mask_n, other=0.0)
+        acc += bias_vals[None, :]
+
+    # ---- Write output ----
+    output_ptrs = output_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
+    tl.store(output_ptrs, acc, mask=mask_m[:, None] & mask_n[None, :])
+
+
+
+class Int4Linear(nn.Module):
+    """
+    A linear layer that uses int4 quantized weights with Triton kernel.
+    """
+
+    def __init__(self, qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor,
+                 bias: Optional[torch.Tensor] = None, groupsize: int = 128):
+        super().__init__()
+
+        # Validate inputs
+        assert qweight.dtype == torch.uint8, "qweight must be uint8"
+        assert scales.dtype in [torch.float16, torch.float32], "scales must be float16 or float32"
+        assert zeros.dtype in [torch.float16, torch.float32], "zeros must be float16 or float32"
+
+        self.out_features, packed_in_features = qweight.shape
+        self.in_features = packed_in_features * 2  # Each uint8 contains 2 int4 values
+        self.groupsize = groupsize
+
+        # Register quantized parameters
+        self.register_buffer('qweight', qweight)
+        self.register_buffer('scales', scales.to(torch.float32))  # Always use fp32 for scales
+        self.register_buffer('zeros', zeros.to(torch.float32))  # Always use fp32 for zeros
+
+        if bias is not None:
+            self.register_buffer('bias', bias)
+        else:
+            self.bias = None
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass using int4 Triton kernel.
+
+        Args:
+            x: Input tensor of shape [..., in_features]
+
+        Returns:
+            Output tensor of shape [..., out_features]
+        """
+        # Reshape input to 2D
+        input_shape = x.shape
+        x_2d = x.view(-1, self.in_features)
+        M, K = x_2d.shape
+        N = self.out_features
+
+        # Allocate output
+        output = torch.empty((M, N), dtype=x.dtype, device=x.device)
+
+        # Calculate grid dimensions
+        BLOCK_SIZE_M = min(64, triton.next_power_of_2(M))
+        BLOCK_SIZE_N = min(64, triton.next_power_of_2(N))
+        BLOCK_SIZE_K = min(64, triton.next_power_of_2(K))
+
+        grid = lambda meta: (
+            triton.cdiv(M, meta['BLOCK_SIZE_M']),
+            triton.cdiv(N, meta['BLOCK_SIZE_N'])
+        )
+
+        # Launch kernel
+        _int4_linear_kernel[grid](
+            x_2d, self.qweight, self.scales, self.zeros, output,
+            self.bias if self.bias is not None else x_2d,  # Dummy pointer if no bias
+            M, N, K, self.groupsize,
+            x_2d.stride(0), x_2d.stride(1),
+            self.qweight.stride(0), self.qweight.stride(1),
+            self.scales.stride(0), self.scales.stride(1),
+            self.zeros.stride(0), self.zeros.stride(1),
+            output.stride(0), output.stride(1),
+            BLOCK_SIZE_M=BLOCK_SIZE_M,
+            BLOCK_SIZE_N=BLOCK_SIZE_N,
+            BLOCK_SIZE_K=BLOCK_SIZE_K,
+            HAS_BIAS=self.bias is not None,
+            num_warps=4,
+            num_stages=2,
+        )
+
+        # Reshape output back to original shape
+        return output.view(*input_shape[:-1], self.out_features)
+
+
+def pack_int4_weights(qweight: torch.Tensor) -> torch.Tensor:
+    """
+    Pack int4 weights from [N, K] uint8 to [N, K//2] uint8.
+    Each output uint8 contains two int4 values.
+    """
+    N, K = qweight.shape
+    assert K % 2 == 0, "K must be even for packing"
+
+    # Pack two int4 values into one uint8
+    packed = torch.zeros((N, K // 2), dtype=torch.uint8, device=qweight.device)
+    packed = (qweight[:, 0::2] & 0xF) | ((qweight[:, 1::2] & 0xF) << 4)
+
+    return packed
+
+
+def create_int4_linear_from_quantized(qweight: torch.Tensor, scales: torch.Tensor,
+                                      zeros: torch.Tensor, bias: Optional[torch.Tensor] = None,
+                                      groupsize: int = 128) -> Int4Linear:
+    """
+    Create an Int4Linear layer from quantized parameters.
+
+    Args:
+        qweight: Quantized weights [out_features, in_features] as uint8
+        scales: Dequantization scales [out_features, num_groups]
+        zeros: Dequantization zeros [out_features, num_groups]
+        bias: Optional bias term [out_features]
+        groupsize: Group size for quantization
+
+    Returns:
+        Int4Linear layer ready for inference
+    """
+    # Pack int4 weights if needed
+    if qweight.shape[1] % 2 == 0:
+        # Assume weights are not packed yet
+        packed_qweight = pack_int4_weights(qweight)
+    else:
+        # Assume weights are already packed
+        packed_qweight = qweight
+
+    return Int4Linear(packed_qweight, scales, zeros, bias, groupsize)
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    def test_int4_linear():
+        # Test parameters
+        batch_size = 32
+        in_features = 512
+        out_features = 256
+        groupsize = 128
+
+        # Create random quantized weights
+        qweight = torch.randint(0, 16, (out_features, in_features), dtype=torch.uint8, device='cuda')
+        scales = torch.randn(out_features, in_features // groupsize, dtype=torch.float16, device='cuda')
+        zeros = torch.randint(0, 16, (out_features, in_features // groupsize), dtype=torch.float16, device='cuda')
+        bias = torch.randn(out_features, dtype=torch.float16, device='cuda')
+
+        # Create Int4Linear layer
+        int4_layer = create_int4_linear_from_quantized(qweight, scales, zeros, bias, groupsize)
+
+        # Test forward pass
+        x = torch.randn(batch_size, in_features, dtype=torch.float16, device='cuda')
+
+        # Warm up
+        for _ in range(10):
+            _ = int4_layer(x)
+
+        torch.cuda.synchronize()
+
+        # Benchmark
+        import time
+        start_time = time.time()
+        for _ in range(100):
+            output = int4_layer(x)
+        torch.cuda.synchronize()
+        end_time = time.time()
+
+        print(f"Int4Linear forward time: {(end_time - start_time) * 10:.2f} ms per call")
+        print(f"Output shape: {output.shape}")
+        print(f"Output dtype: {output.dtype}")
+
+        # Compare with standard linear (using dequantized weights)
+        from lite_llama.quantization.gptq import GPTQ
+        gptq = GPTQ(wbits=4, groupsize=groupsize)
+        dequant_weight = gptq.dequantize(qweight, zeros, scales)
+
+        std_layer = nn.Linear(in_features, out_features, bias=True, device='cuda', dtype=torch.float16)
+        std_layer.weight.data = dequant_weight.T.to(torch.float16)
+        std_layer.bias.data = bias
+
+        start_time = time.time()
+        for _ in range(100):
+            std_output = std_layer(x)
+        torch.cuda.synchronize()
+        end_time = time.time()
+
+        print(f"Standard Linear forward time: {(end_time - start_time) * 10:.2f} ms per call")
+
+        # Check numerical accuracy (should be close due to quantization)
+        diff = torch.abs(output - std_output).max()
+        print(f"Max difference between Int4Linear and Standard Linear: {diff:.6f}")
+
+
+    test_int4_linear()
\ No newline at end of file
diff --git a/tests/test_gptq.py b/tests/test_gptq.py
new file mode 100644
index 0000000..f99867f
--- /dev/null
+++ b/tests/test_gptq.py
@@ -0,0 +1,232 @@
+import torch
+import numpy as np
+import pytest
+from typing import Dict, Tuple, Optional, Any
+import time, os, sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
+
+from gptq import GPTQ
+
+# Assuming the GPTQ class and helper functions are imported
+# from your_gptq_module import GPTQ, pack_weight, unpack_weight, GPTQConfig
+
+def pack_weight(weight):
+    """Pack two 4-bit values into one uint8 value"""
+    rows, cols = weight.shape
+    original_cols = cols
+    if cols % 2 != 0:
+        weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+        cols += 1
+    packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
+    return packed.contiguous(), original_cols
+
+
+def unpack_weight(packed_weight, original_cols):
+    """Unpack uint8 values back to two 4-bit values"""
+    rows, packed_cols = packed_weight.shape
+    unpacked = torch.zeros((rows, packed_cols * 2), dtype=torch.uint8, device=packed_weight.device)
+    unpacked[:, 0::2] = packed_weight & 0xF
+    unpacked[:, 1::2] = (packed_weight >> 4) & 0xF
+    return unpacked[:, :original_cols].contiguous()
+
+
+# Mock GPTQConfig for testing
+class GPTQConfig:
+    def __init__(self, w_bit=4, group_size=128, device="cuda"):
+        self.w_bit = w_bit
+        self.group_size = group_size
+        self.device = device
+
+
+def test_pack_unpack_weights():
+    """Test that pack/unpack operations are lossless"""
+    print("Testing pack/unpack operations...")
+
+    # Test with even columns
+    weight_even = torch.randint(0, 16, (4, 8), dtype=torch.uint8)
+    packed, original_cols = pack_weight(weight_even)
+    unpacked = unpack_weight(packed, original_cols)
+
+    assert torch.equal(weight_even, unpacked), "Pack/unpack failed for even columns"
+    assert packed.shape[1] == weight_even.shape[1] // 2, "Packed size incorrect"
+
+    # Test with odd columns
+    weight_odd = torch.randint(0, 16, (4, 7), dtype=torch.uint8)
+    packed, original_cols = pack_weight(weight_odd)
+    unpacked = unpack_weight(packed, original_cols)
+
+    assert torch.equal(weight_odd, unpacked), "Pack/unpack failed for odd columns"
+    assert original_cols == 7, "Original columns not preserved"
+
+    print("✓ Pack/unpack tests passed")
+
+
+def test_quantize_dequantize_cycle():
+    """Test the complete quantize -> dequantize cycle"""
+    print("Testing quantize -> dequantize cycle...")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    config = GPTQConfig(w_bit=4, group_size=32, device=device)
+
+    gptq = GPTQ(config)
+
+    # Test cases with different weight characteristics
+    test_cases = [
+        {
+            "name": "Normal weights",
+            "weight": torch.randn(128, 256, device=device, dtype=torch.float32) * 0.1
+        },
+        {
+            "name": "Small weights",
+            "weight": torch.randn(64, 128, device=device, dtype=torch.float32) * 0.001
+        },
+        {
+            "name": "Large weights",
+            "weight": torch.randn(32, 64, device=device, dtype=torch.float32) * 10.0
+        },
+        {
+            "name": "Mixed scale weights",
+            "weight": torch.cat([
+                torch.randn(16, 32, device=device) * 0.001,
+                torch.randn(16, 32, device=device) * 1.0
+            ], dim=0)
+        },
+        {
+            "name": "Odd columns",
+            "weight": torch.randn(32, 63, device=device, dtype=torch.float32) * 0.1
+        }
+    ]
+
+    results = []
+
+    for test_case in test_cases:
+        print(f"\n  Testing: {test_case['name']}")
+        weight = test_case["weight"]
+
+        # Quantize
+        start_time = time.time()
+        packed_qweight, qzeros, scales, original_cols = gptq.quantize(weight)
+        quantize_time = time.time() - start_time
+
+        # Dequantize
+        start_time = time.time()
+        reconstructed = gptq.dequantize_packed(packed_qweight, qzeros, scales, original_cols)
+        dequantize_time = time.time() - start_time
+
+        # Check shapes
+        assert reconstructed.shape == weight.shape, f"Shape mismatch: {reconstructed.shape} vs {weight.shape}"
+
+        # Calculate errors
+        mse = torch.mean((weight - reconstructed) ** 2).item()
+        relative_error = gptq.relative_error_loss(weight, reconstructed).item()
+        max_abs_error = torch.max(torch.abs(weight - reconstructed)).item()
+
+        # Memory efficiency check
+        original_memory = weight.numel() * 4  # float32
+        quantized_memory = (packed_qweight.numel() + qzeros.numel() * 2 + scales.numel() * 2)  # uint8 + float16
+        compression_ratio = original_memory / quantized_memory
+
+        result = {
+            "name": test_case["name"],
+            "shape": weight.shape,
+            "mse": mse,
+            "relative_error": relative_error,
+            "max_abs_error": max_abs_error,
+            "compression_ratio": compression_ratio,
+            "quantize_time": quantize_time,
+            "dequantize_time": dequantize_time,
+            "original_cols": original_cols
+        }
+        results.append(result)
+
+        print(f"    Shape: {weight.shape}")
+        print(f"    MSE: {mse:.6f}")
+        print(f"    Relative Error: {relative_error:.6f}")
+        print(f"    Max Abs Error: {max_abs_error:.6f}")
+        print(f"    Compression Ratio: {compression_ratio:.2f}x")
+        print(f"    Original Cols: {original_cols}")
+
+        # Assertions for quality
+        assert relative_error < 1.0, f"Relative error too high: {relative_error}"
+        assert compression_ratio > 1.5, f"Compression ratio too low: {compression_ratio}"
+
+        # Test that packing actually reduced size
+        original_qweight_size = weight.shape[0] * weight.shape[1]  # unpacked size
+        packed_qweight_size = packed_qweight.numel()
+        expected_packed_size = (weight.shape[1] + 1) // 2 * weight.shape[0]  # ceil(cols/2) * rows
+
+        assert packed_qweight_size <= expected_packed_size, "Packing didn't reduce size as expected"
+
+    print("\n✓ All quantize -> dequantize tests passed")
+    return results
+
+
+
+def test_consistency_across_devices():
+    """Test that quantization is consistent across CPU and GPU"""
+    print("Testing device consistency...")
+
+    if not torch.cuda.is_available():
+        print("  CUDA not available, skipping device consistency test")
+        return
+
+    weight_cpu = torch.randn(32, 64, dtype=torch.float16) * 0.1
+    weight_gpu = weight_cpu.cuda()
+
+    # Note: This would require the actual GPTQ implementation
+    # For now, just test that weights can be moved between devices
+
+    assert weight_cpu.device.type == "cpu"
+    assert weight_gpu.device.type == "cuda"
+    assert torch.allclose(weight_cpu, weight_gpu.cpu(), atol=1e-6)
+
+    print("✓ Device consistency tests passed")
+
+
+def run_performance_benchmark():
+    """Benchmark quantization performance"""
+    print("Running performance benchmark...")
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    sizes = [(512, 512), (1024, 1024), (2048, 2048), (4096, 4096)]
+    config = GPTQConfig(w_bit=4, group_size=32, device=device)
+
+    gptq = GPTQ(config)
+    for rows, cols in sizes:
+        weight = torch.randn(rows, cols, device=device, dtype=torch.float16) * 0.1
+
+        # Time quantization (would need actual implementation)
+        start_time = time.time()
+        quantized = gptq.quantize(weight)
+        quantize_time = time.time() - start_time
+
+        print(f"  Size {rows}x{cols}: Quantization took {quantize_time:.3f}s")
+
+
+def main():
+    """Run all tests"""
+    print("=" * 60)
+    print("GPTQ Quantization Test Suite")
+    print("=" * 60)
+
+    try:
+        # Basic functionality tests
+        test_pack_unpack_weights()
+        test_quantize_dequantize_cycle()
+        test_consistency_across_devices()
+
+        # Performance benchmark
+        run_performance_benchmark()
+
+        print("\n" + "=" * 60)
+        print("ALL TESTS PASSED!")
+        print("=" * 60)
+
+    except Exception as e:
+        print(f"\nTEST FAILED: {e}")
+        raise
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 2dfb24914f5bc27a311596c579c4bdfbb31022f7 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Mon, 14 Jul 2025 04:24:49 +0930
Subject: [PATCH 28/33] update sqlinear and sq quant, TODO: awqlinear

---
 lite_llama/kernels/awq_linear.py              |   0
 .../{int4_linear.py => gptq_linear.py}        |  28 +-
 lite_llama/kernels/sq_linear.py               | 409 +++++++++++++++
 lite_llama/quantization/awq.py                | 108 +++-
 lite_llama/quantization/quant_config.py       |  21 +-
 lite_llama/quantization/sq.py                 | 481 +++++++++++++++++
 lite_llama/utils/common.py                    |   2 +-
 tests/kernels/__init__.py                     |   0
 tests/kernels/test_AWQLinear.py               |   0
 test.py => tests/kernels/test_GPTQLinear.py   |   0
 tests/kernels/test_SQLinear.py                | 488 ++++++++++++++++++
 11 files changed, 1488 insertions(+), 49 deletions(-)
 create mode 100644 lite_llama/kernels/awq_linear.py
 rename lite_llama/kernels/{int4_linear.py => gptq_linear.py} (92%)
 create mode 100644 lite_llama/kernels/sq_linear.py
 create mode 100644 lite_llama/quantization/sq.py
 create mode 100644 tests/kernels/__init__.py
 create mode 100644 tests/kernels/test_AWQLinear.py
 rename test.py => tests/kernels/test_GPTQLinear.py (100%)
 create mode 100644 tests/kernels/test_SQLinear.py

diff --git a/lite_llama/kernels/awq_linear.py b/lite_llama/kernels/awq_linear.py
new file mode 100644
index 0000000..e69de29
diff --git a/lite_llama/kernels/int4_linear.py b/lite_llama/kernels/gptq_linear.py
similarity index 92%
rename from lite_llama/kernels/int4_linear.py
rename to lite_llama/kernels/gptq_linear.py
index 70663ab..8287e16 100644
--- a/lite_llama/kernels/int4_linear.py
+++ b/lite_llama/kernels/gptq_linear.py
@@ -3,6 +3,8 @@
 import torch
 import torch.nn as nn
 import numpy as np
+import sys, os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 from lite_llama.quantization.gptq import GPTQ
 from lite_llama.quantization.quant_config import GPTQConfig
 
@@ -283,32 +285,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return output.view(*x.shape[:-1], self.out_features)
 
 
-class AWQLinear(nn.Module):
-    """AWQ Quantized Linear Layer"""
-
-    def __init__(self, in_features: int, out_features: int, bias: bool = False,
-                 w_bit: int = 4, group_size: int = 128):
-        super().__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.w_bit = w_bit
-        self.group_size = group_size
-
-        # Scales and zeros for each group
-        self.register_buffer("packed_weight", None)
-        self.register_buffer("scales", None)
-        self.register_buffer("zeros", None)
-        self.register_buffer("bias", None if not bias else torch.empty(out_features))
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward pass with dequantization"""
-        # Dequantize weights on the fly
-        weight = self.dequantize_weights()
-        return torch.nn.functional.linear(x, weight.T, self.bias)
-
-
-
-
 def test_gptqlinear_vs_nnlinear(
     in_features=2048,
     out_features=4096,
diff --git a/lite_llama/kernels/sq_linear.py b/lite_llama/kernels/sq_linear.py
new file mode 100644
index 0000000..186d5aa
--- /dev/null
+++ b/lite_llama/kernels/sq_linear.py
@@ -0,0 +1,409 @@
+import torch
+import triton
+import triton.language as tl
+from typing import Optional
+import math
+
+
+def pack_weight(weight):
+    """Pack two INT4 values into one UINT8 value"""
+    rows, cols = weight.shape
+    if cols % 2 != 0:
+        weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+        cols += 1
+
+    # Ensure weights are in INT4 range [-8, 7]
+    weight = torch.clamp(weight, min=-8, max=7)
+
+    # Convert to unsigned representation [0, 15]
+    weight_unsigned = weight + 8
+
+    # Pack two INT4 values into one UINT8
+    packed = (weight_unsigned[:, 0::2] & 0xF) | ((weight_unsigned[:, 1::2] & 0xF) << 4)
+    return packed.contiguous().to(torch.uint8)
+
+
+def unpack_weight(packed_weight, original_cols):
+    """Unpack UINT8 values back to two INT4 values"""
+    rows, packed_cols = packed_weight.shape
+    unpacked = torch.zeros((rows, packed_cols * 2), dtype=torch.uint8, device=packed_weight.device)
+    unpacked[:, 0::2] = packed_weight & 0xF
+    unpacked[:, 1::2] = (packed_weight >> 4) & 0xF
+    # Convert back to signed INT4 range [-8, 7]
+    unpacked = unpacked.to(torch.int8) - 8
+    return unpacked[:, :original_cols].contiguous()
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=2,
+                      num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3,
+                      num_warps=8),
+        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4,
+                      num_warps=8),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3,
+                      num_warps=4),
+    ],
+    key=['M', 'N', 'K'],
+)
+@triton.jit
+def smoothquant_int4_kernel(
+        # Pointers to matrices
+        x_ptr, w_ptr, bias_ptr, output_ptr,
+        # Quantization parameters
+        scale_ptr, zp_ptr, smooth_ptr,
+        # Matrix dimensions
+        M, N, K,
+        # Strides
+        stride_xm, stride_xk,
+        stride_wn, stride_wk,
+        stride_om, stride_on,
+        stride_sm, stride_sn,
+        stride_zpm, stride_zpn,
+        # Group size for quantization
+        group_size,
+        # Meta-parameters
+        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+        GROUP_SIZE_M: tl.constexpr,
+):
+    """
+    Optimized SmoothQuant linear kernel with INT4 weights and FP16 activations.
+    """
+    # Program ID
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    # Block offsets
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+
+    # Initialize accumulator
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float16)
+
+    # Main loop over K dimension
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        # Current k indices
+        k_indices = k * BLOCK_SIZE_K + offs_k
+
+        # Load input activations
+        x_ptrs = x_ptr + offs_am[:, None] * stride_xm + k_indices[None, :] * stride_xk
+        x_mask = (offs_am[:, None] < M) & (k_indices[None, :] < K)
+        x = tl.load(x_ptrs, mask=x_mask, other=0.0).to(tl.float16)
+
+        # Apply SmoothQuant inverse smoothing
+        if smooth_ptr is not None:
+            smooth_ptrs = smooth_ptr + k_indices
+            smooth_mask = k_indices < K
+            smooth_scale = tl.load(smooth_ptrs, mask=smooth_mask, other=1.0).to(tl.float16)
+            # FIX: Ensure the division result stays as fp16
+            x = (x / smooth_scale[None, :]).to(tl.float16)
+
+        # Load packed weights (each packed value contains 2 INT4 weights)
+        k_pack = k_indices // 2  # Packed dimension
+        w_pack_ptrs = w_ptr + offs_bn[None, :] * stride_wn + k_pack[:, None] * stride_wk
+        w_packed = tl.load(w_pack_ptrs, mask=(k_pack[:, None] < tl.cdiv(K, 2)) & (offs_bn[None, :] < N), other=0)
+
+        # Unpack INT4 weights
+        # For even k_indices, use lower 4 bits; for odd k_indices, use upper 4 bits
+        even_mask = (k_indices % 2 == 0)
+        w_vals = tl.where(even_mask[:, None], w_packed & 0xF, (w_packed >> 4) & 0xF)
+        w_vals = w_vals.to(tl.int8) - 8  # Convert to signed INT4 range [-8, 7]
+
+        # Load quantization parameters (group-wise)
+        group_idx = k_indices // group_size
+        scale_ptrs = scale_ptr + group_idx[:, None] * stride_sm + offs_bn[None, :] * stride_sn
+        zp_ptrs = zp_ptr + group_idx[:, None] * stride_zpm + offs_bn[None, :] * stride_zpn
+
+        scale_mask = (group_idx[:, None] < tl.cdiv(K, group_size)) & (offs_bn[None, :] < N)
+        scale = tl.load(scale_ptrs, mask=scale_mask, other=1.0).to(tl.float16)
+        zp = tl.load(zp_ptrs, mask=scale_mask, other=0.0).to(tl.float16)
+
+        # Dequantize weights: (w_vals - zero_point) * scale
+        w_vals = (w_vals.to(tl.float16) - zp) * scale
+
+        # Matrix multiplication
+        # Ensure we only process valid k values
+        valid_mask = k_indices[:, None] < K
+        # FIX: Use fp16 zero value to prevent dtype promotion
+        w_vals = tl.where(valid_mask, w_vals, tl.zeros_like(w_vals))
+
+        accumulator += tl.dot(x, w_vals, out_dtype=tl.float16)
+
+    # Convert to output precision
+    c = accumulator.to(tl.float16)
+
+    # Add bias if provided
+    if bias_ptr is not None:
+        bias_ptrs = bias_ptr + offs_bn
+        bias_mask = offs_bn < N
+        bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0).to(tl.float16)
+        c += bias[None, :]
+
+    # Store output
+    output_ptrs = output_ptr + offs_am[:, None] * stride_om + offs_bn[None, :] * stride_on
+    output_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)
+    tl.store(output_ptrs, c, mask=output_mask)
+
+
+class SmoothQuantLinear(torch.nn.Module):
+    """
+    PyTorch module wrapper for SmoothQuant INT4 linear layer.
+    """
+
+    def __init__(
+            self,
+            in_features: int,
+            out_features: int,
+            bias: bool = True,
+            group_size: int = 128,
+            alpha: float = 0.5
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.group_size = group_size
+        self.alpha = alpha
+
+        # Ensure in_features is compatible with packing
+        self.packed_in_features = (in_features + 1) // 2  # For packing 2 INT4 into 1 UINT8
+
+        # Initialize quantized weight storage (packed)
+        self.register_buffer('packed_weight',
+                             torch.zeros(out_features, self.packed_in_features, dtype=torch.uint8))
+
+        # Quantization parameters (group-wise)
+        self.num_groups = (in_features + group_size - 1) // group_size
+        self.register_buffer('weight_scale',
+                             torch.ones(self.num_groups, out_features, dtype=torch.float16))
+        self.register_buffer('weight_zero',
+                             torch.zeros(self.num_groups, out_features, dtype=torch.float16))
+
+        # SmoothQuant scaling factor
+        self.register_buffer('smooth_scale',
+                             torch.ones(in_features, dtype=torch.float16))
+
+        # Bias
+        if bias:
+            self.register_buffer('bias', torch.zeros(out_features, dtype=torch.float16))
+        else:
+            self.register_buffer('bias', None)
+
+    def quantize_weight(self, weight: torch.Tensor, act_scales: Optional[torch.Tensor] = None):
+        """
+        Quantize FP16/FP32 weight to INT4 with SmoothQuant.
+        """
+        assert weight.shape == (self.out_features, self.in_features)
+
+        # Compute smoothing scale
+        if act_scales is not None:
+            # SmoothQuant formula: s_j = (max|X_j|)^α / (max|W_j|)^(1-α)
+            weight_scales = weight.abs().max(dim=0)[0]
+            # Avoid division by zero
+            weight_scales = torch.clamp(weight_scales, min=1e-5)
+            act_scales = torch.clamp(act_scales, min=1e-5)
+
+            smooth_scale = (act_scales.pow(self.alpha) /
+                            weight_scales.pow(1 - self.alpha))
+            smooth_scale = torch.clamp(smooth_scale, min=0.01, max=100.0)
+            self.smooth_scale.copy_(smooth_scale.to(torch.float16))
+
+            # Apply smoothing to weights
+            weight = weight * self.smooth_scale.unsqueeze(0)
+
+        # Group-wise quantization
+        weight_groups = []
+        scales = []
+        zeros = []
+
+        for i in range(self.num_groups):
+            start_idx = i * self.group_size
+            end_idx = min((i + 1) * self.group_size, self.in_features)
+
+            # Extract group
+            w_group = weight[:, start_idx:end_idx]
+
+            # Compute scale and zero point for this group
+            w_max = w_group.max(dim=1, keepdim=True)[0]
+            w_min = w_group.min(dim=1, keepdim=True)[0]
+
+            # Symmetric quantization for INT4 [-8, 7]
+            scale = (w_max - w_min) / 15.0
+            scale = torch.clamp(scale, min=1e-5)
+            zero = torch.round((w_max + w_min) / 2.0 / scale) * scale
+
+            # Quantize to INT4 range
+            w_quant = torch.round((w_group - zero) / scale).clamp(-8, 7)
+
+            weight_groups.append(w_quant)
+            scales.append(scale.squeeze(1))  # [out_features]
+            zeros.append((zero / scale).squeeze(1))  # [out_features]
+
+        # Concatenate groups back
+        weight_quantized = torch.cat(weight_groups, dim=1).to(torch.int8)
+
+        # Store quantization parameters
+        self.weight_scale.copy_(torch.stack(scales, dim=0).to(torch.float16))
+        self.weight_zero.copy_(torch.stack(zeros, dim=0).to(torch.float16))
+
+        # Pack weights
+        self.packed_weight.copy_(pack_weight(weight_quantized))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass with INT4 quantized weights and FP16 activations.
+        """
+        assert x.shape[-1] == self.in_features
+        assert x.dtype == torch.float16, "Input must be FP16"
+
+        # Flatten input for matrix multiplication
+        x_shape = x.shape
+        x = x.view(-1, self.in_features)
+        M, K = x.shape
+        N = self.out_features
+
+        # Allocate output
+        output = torch.empty(M, N, dtype=torch.float16, device=x.device)
+
+        # Launch kernel
+        grid = lambda META: (
+            triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
+        )
+
+        smoothquant_int4_kernel[grid](
+            x, self.packed_weight, self.bias, output,
+            self.weight_scale, self.weight_zero, self.smooth_scale,
+            M, N, K,
+            x.stride(0), x.stride(1),
+            self.packed_weight.stride(0), self.packed_weight.stride(1),
+            output.stride(0), output.stride(1),
+            self.weight_scale.stride(0), self.weight_scale.stride(1),
+            self.weight_zero.stride(0), self.weight_zero.stride(1),
+            self.group_size
+        )
+
+        return output.view(*x_shape[:-1], N)
+
+
+import torch
+import torch.nn as nn
+import time
+import gc
+import numpy as np
+from typing import Dict, List, Tuple
+
+
+# Import the SmoothQuant implementation
+# from smoothquant_int4 import SmoothQuantLinear
+
+def get_memory_usage():
+    """Get current GPU memory usage in MB"""
+    if torch.cuda.is_available():
+        return torch.cuda.memory_allocated() / 1024 / 1024
+    return 0
+
+
+def clear_memory():
+    """Clear GPU memory cache"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+
+
+def format_table(headers: List[str], rows: List[List[str]], title: str = "") -> str:
+    """Simple table formatter without external dependencies"""
+    if not rows:
+        return ""
+
+    # Calculate column widths
+    widths = [len(header) for header in headers]
+    for row in rows:
+        for i, cell in enumerate(row):
+            if i < len(widths):
+                widths[i] = max(widths[i], len(str(cell)))
+
+    # Create format string
+    fmt = " | ".join(f"{{:<{w}}}" for w in widths)
+    separator = "-+-".join("-" * w for w in widths)
+
+    # Build table
+    result = []
+    if title:
+        total_width = sum(widths) + 3 * (len(widths) - 1)
+        result.append(f"\n{title}")
+        result.append("=" * max(len(title), total_width))
+
+    result.append(fmt.format(*headers))
+    result.append(separator)
+
+    for row in rows:
+        result.append(fmt.format(*[str(cell) for cell in row]))
+
+    return "\n".join(result)
+
+
+import torch
+import torch.nn as nn
+import time
+import gc
+import numpy as np
+from typing import Dict, List, Tuple
+
+
+# Import the SmoothQuant implementation
+# from smoothquant_int4 import SmoothQuantLinear
+
+def get_memory_usage():
+    """Get current GPU memory usage in MB"""
+    if torch.cuda.is_available():
+        return torch.cuda.memory_allocated() / 1024 / 1024
+    return 0
+
+
+def clear_memory():
+    """Clear GPU memory cache"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+
+
+def format_table(headers: List[str], rows: List[List[str]], title: str = "") -> str:
+    """Simple table formatter without external dependencies"""
+    if not rows:
+        return ""
+
+    # Calculate column widths
+    widths = [len(header) for header in headers]
+    for row in rows:
+        for i, cell in enumerate(row):
+            if i < len(widths):
+                widths[i] = max(widths[i], len(str(cell)))
+
+    # Create format string
+    fmt = " | ".join(f"{{:<{w}}}" for w in widths)
+    separator = "-+-".join("-" * w for w in widths)
+
+    # Build table
+    result = []
+    if title:
+        total_width = sum(widths) + 3 * (len(widths) - 1)
+        result.append(f"\n{title}")
+        result.append("=" * max(len(title), total_width))
+
+    result.append(fmt.format(*headers))
+    result.append(separator)
+
+    for row in rows:
+        result.append(fmt.format(*[str(cell) for cell in row]))
+
+    return "\n".join(result)
+
+
diff --git a/lite_llama/quantization/awq.py b/lite_llama/quantization/awq.py
index a7a5329..7ad27d1 100644
--- a/lite_llama/quantization/awq.py
+++ b/lite_llama/quantization/awq.py
@@ -9,27 +9,28 @@
 import triton.language as tl
 import time, gc, psutil, os, sys
 
-from lite_llama.quantization.quant_config import GPTQConfig  # Reusing config structure
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
 from lite_llama.utils.common import get_gpu_memory
 from utils import pack_weight, unpack_weight
 from lite_llama.quantization.quant_config import AWQConfig
 
+
 class AWQ:
     def __init__(
             self,
             config: AWQConfig = field(default_factory=AWQConfig),
+            wbits: int = 4,
     ):
-        self.wbits = config.w_bit
-        self.groupsize = config.group_size if config.group_size != -1 else float('inf')
-        self.device = config.device
-        self.maxq = 2 ** self.wbits - 1
+        self.config = config
+        self.wbits = self.config.w_bit
+        self.groupsize = self.config.group_size if self.config.group_size != -1 else float('inf')
+        self.device = self.config.device
+        self.maxq = 2 ** wbits - 1
         self.zero_point = config.zero_point
-        self.alpha = config.alpha
-        self.search_scale = config.search_scale
-        self.auto_scale = config.auto_scale
+        self.alpha = self.config.alpha
+        self.search_scale = self.config.search_scale
+        self.auto_scale = self.config.auto_scale
 
         # Store activation statistics
         self.activation_stats = {}
@@ -248,8 +249,26 @@ def quantize_with_scales(self, weight: torch.Tensor, scales: torch.Tensor) -> Tu
                 q = torch.clamp(torch.round(w_group / scale), -(2 ** (self.wbits - 1)), 2 ** (self.wbits - 1) - 1)
 
             qweight[:, start_col:end_col] = q.to(torch.uint8)
-            qscales[:, g] = scale.squeeze(-1)
-            qzeros[:, g] = zero.squeeze(-1)
+
+            # Ensure proper dimensions when storing scales and zeros
+            scale_flat = scale.squeeze(-1) if scale.dim() > 1 else scale.flatten()
+            zero_flat = zero.squeeze(-1) if zero.dim() > 1 else zero.flatten()
+
+            # Handle dimension mismatches
+            if scale_flat.shape[0] != rows:
+                if scale_flat.numel() == 1:
+                    scale_flat = scale_flat.expand(rows)
+                else:
+                    scale_flat = scale_flat[:rows]
+
+            if zero_flat.shape[0] != rows:
+                if zero_flat.numel() == 1:
+                    zero_flat = zero_flat.expand(rows)
+                else:
+                    zero_flat = zero_flat[:rows]
+
+            qscales[:, g] = scale_flat
+            qzeros[:, g] = zero_flat
 
         return qweight, qzeros, qscales
 
@@ -306,6 +325,37 @@ def dequantize(self, qweight: torch.Tensor, qzeros: torch.Tensor, qscales: torch
 
         weight = torch.zeros_like(qweight, dtype=torch.float16)
 
+        # Handle dimension mismatch - ensure scales and zeros are 2D
+        if qscales.dim() == 1:
+            if len(qscales) == rows:
+                # If scales is per-row, expand to per-group
+                qscales = qscales.unsqueeze(1).expand(-1, num_groups)
+            else:
+                # If scales is per-group, expand to per-row
+                qscales = qscales.unsqueeze(0).expand(rows, -1)
+
+        if qzeros.dim() == 1:
+            if len(qzeros) == rows:
+                # If zeros is per-row, expand to per-group
+                qzeros = qzeros.unsqueeze(1).expand(-1, num_groups)
+            else:
+                # If zeros is per-group, expand to per-row
+                qzeros = qzeros.unsqueeze(0).expand(rows, -1)
+
+        # Ensure we have the right number of groups
+        if qscales.shape[1] != num_groups:
+            # Repeat or truncate to match expected groups
+            if qscales.shape[1] == 1:
+                qscales = qscales.expand(-1, num_groups)
+            else:
+                qscales = qscales[:, :num_groups]
+
+        if qzeros.shape[1] != num_groups:
+            if qzeros.shape[1] == 1:
+                qzeros = qzeros.expand(-1, num_groups)
+            else:
+                qzeros = qzeros[:, :num_groups]
+
         for g in range(num_groups):
             start_col = g * groupsize
             end_col = min((g + 1) * groupsize, cols)
@@ -461,6 +511,8 @@ def demo_awq():
         "other_param": torch.randn(100)
     }
 
+    print("Starting AWQ demo...")
+
     # Quantize without calibration data (will use default scaling)
     quantized_dict = quantize_awq(
         model_state_dict=dummy_state_dict,
@@ -475,17 +527,31 @@ def demo_awq():
     config = AWQConfig(w_bit=4, group_size=128, device="cpu")
     awq = AWQ(config)
 
+    # Debug: Check dimensions of quantized tensors
+    layer_name = "layer1.q_proj"
+    print(f"\nDebugging {layer_name}:")
+    qweight = quantized_dict[f"{layer_name}.qweight"]
+    qzeros = quantized_dict[f"{layer_name}.qzeros"]
+    qscales = quantized_dict[f"{layer_name}.qscales"]
+
+    print(f"qweight shape: {qweight.shape}")
+    print(f"qzeros shape: {qzeros.shape}")
+    print(f"qscales shape: {qscales.shape}")
+
     # Dequantize one layer
     original_weight = dummy_state_dict["layer1.q_proj.weight"]
-    dequant_weight = awq.dequantize(
-        quantized_dict["layer1.q_proj.qweight"],
-        quantized_dict["layer1.q_proj.qzeros"],
-        quantized_dict["layer1.q_proj.qscales"]
-    )
-
-    print(f"Original shape: {original_weight.shape}")
-    print(f"Dequantized shape: {dequant_weight.shape}")
-    print(f"Quantization error: {(original_weight - dequant_weight).abs().mean():.6f}")
+    try:
+        dequant_weight = awq.dequantize(qweight, qzeros, qscales)
+
+        print(f"\nResults:")
+        print(f"Original shape: {original_weight.shape}")
+        print(f"Dequantized shape: {dequant_weight.shape}")
+        print(f"Quantization error: {(original_weight - dequant_weight).abs().mean():.6f}")
+        print("AWQ demo completed successfully!")
+
+    except Exception as e:
+        print(f"Error during dequantization: {e}")
+        print("This might indicate a dimension mismatch in the quantization process.")
 
 
 if __name__ == "__main__":
diff --git a/lite_llama/quantization/quant_config.py b/lite_llama/quantization/quant_config.py
index c4a1d8e..fc4bda6 100644
--- a/lite_llama/quantization/quant_config.py
+++ b/lite_llama/quantization/quant_config.py
@@ -1,4 +1,5 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List
 
 
 @dataclass
@@ -22,3 +23,21 @@ class GPTQConfig:
     w_bit: int = 4  # Weight quantization bits
     group_size: int = 128  # Group size for quantization
     device: str = "cuda"
+
+
+@dataclass
+class SmoothQuantConfig:
+    """Configuration for SmoothQuant"""
+    alpha: float = 0.5  # Smoothing factor balance between act and weight
+    w_bit: int = 8      # Weight quantization bits
+    a_bit: int = 8      # Activation quantization bits
+    device: str = "cuda"
+    symmetric_weight: bool = True    # Use symmetric quantization for weights
+    symmetric_activation: bool = False  # Use asymmetric quantization for activations
+    per_channel_weight: bool = True  # Per-channel quantization for weights
+    per_token_activation: bool = True  # Per-token quantization for activations
+    calibration_samples: int = 128   # Number of calibration samples
+    smooth_layers: List[str] = field(default_factory=lambda: [
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj"
+    ])
\ No newline at end of file
diff --git a/lite_llama/quantization/sq.py b/lite_llama/quantization/sq.py
new file mode 100644
index 0000000..efe5e4e
--- /dev/null
+++ b/lite_llama/quantization/sq.py
@@ -0,0 +1,481 @@
+from dataclasses import field
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import Dict, Tuple, Optional, Any, List
+from tqdm.auto import tqdm
+import time, os, sys, gc
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from lite_llama.quantization.quant_config import SmoothQuantConfig
+
+
+class SmoothQuantizer:
+    def __init__(self, config: SmoothQuantConfig = field(default_factory=SmoothQuantConfig)):
+        self.config = config
+        self.alpha = self.config.alpha
+        self.w_bit = self.config.w_bit
+        self.a_bit = self.config.a_bit
+        self.device = self.config.device
+
+        # Quantization ranges
+        self.w_qmax = 2 ** (self.w_bit - 1) - 1
+        self.w_qmin = -2 ** (self.w_bit - 1)
+        self.a_qmax = 2 ** self.a_bit - 1
+        self.a_qmin = 0
+
+        # Statistics storage
+        self.activation_stats = {}
+        self.weight_stats = {}
+        self.smoothing_factors = {}
+
+    def collect_activation_stats(self, model, calibration_dataloader):
+        """Collect activation statistics for smoothing factor calculation"""
+        print("Collecting activation statistics...")
+
+        # Register hooks to collect activations
+        activation_dict = {}
+        hooks = []
+
+        def make_hook(name):
+            def hook(module, input, output):
+                if isinstance(input, tuple):
+                    x = input[0]
+                else:
+                    x = input
+
+                if name not in activation_dict:
+                    activation_dict[name] = []
+
+                # Store input activations (before linear layer)
+                if x.dim() == 3:  # [batch, seq, hidden]
+                    x_flat = x.view(-1, x.size(-1))  # [batch*seq, hidden]
+                    activation_dict[name].append(x_flat.detach().cpu())
+
+            return hook
+
+        # Register hooks for target layers
+        for name, module in model.named_modules():
+            if any(layer in name for layer in self.config.smooth_layers):
+                if isinstance(module, nn.Linear):
+                    hook = module.register_forward_hook(make_hook(name))
+                    hooks.append(hook)
+
+        # Collect activations
+        model.eval()
+        with torch.no_grad():
+            for i, batch in enumerate(tqdm(calibration_dataloader, desc="Collecting stats")):
+                if i >= self.config.calibration_samples:
+                    break
+
+                # Move batch to device
+                if isinstance(batch, dict):
+                    batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
+                             for k, v in batch.items()}
+                    _ = model(**batch)
+                else:
+                    batch = batch.to(self.device)
+                    _ = model(batch)
+
+        # Remove hooks
+        for hook in hooks:
+            hook.remove()
+
+        # Compute statistics
+        for name, activations in activation_dict.items():
+            if activations:
+                all_acts = torch.cat(activations, dim=0)  # [total_tokens, hidden]
+
+                # Compute per-channel statistics
+                act_max = all_acts.abs().max(dim=0)[0]  # [hidden]
+                act_mean = all_acts.abs().mean(dim=0)  # [hidden]
+
+                self.activation_stats[name] = {
+                    'max': act_max,
+                    'mean': act_mean,
+                    'std': all_acts.std(dim=0)
+                }
+
+        print(f"Collected stats for {len(self.activation_stats)} layers")
+
+    def compute_smoothing_factors(self, model):
+        """Compute per-channel smoothing factors"""
+        print("Computing smoothing factors...")
+
+        for name, module in model.named_modules():
+            if any(layer in name for layer in self.config.smooth_layers):
+                if isinstance(module, nn.Linear) and name in self.activation_stats:
+                    weight = module.weight.data  # [out_features, in_features]
+
+                    # Get activation statistics
+                    act_stats = self.activation_stats[name]
+                    act_max = act_stats['max']  # [in_features]
+
+                    # Compute weight statistics (per input channel)
+                    weight_max = weight.abs().max(dim=0)[0]  # [in_features]
+
+                    # Compute smoothing factor s = (act_max^alpha / weight_max^(1-alpha))
+                    # To avoid division by zero
+                    weight_max = torch.clamp(weight_max, min=1e-5)
+                    act_max = torch.clamp(act_max, min=1e-5)
+
+                    smoothing_factor = (act_max.pow(self.alpha) /
+                                        weight_max.pow(1 - self.alpha))
+
+                    # Normalize to prevent extreme values
+                    smoothing_factor = torch.clamp(smoothing_factor, min=0.01, max=100.0)
+
+                    self.smoothing_factors[name] = smoothing_factor.to(self.device)
+
+                    print(f"Layer {name}: smoothing range [{smoothing_factor.min():.3f}, {smoothing_factor.max():.3f}]")
+
+    def apply_smoothing(self, model):
+        """Apply smoothing factors to model weights"""
+        print("Applying smoothing to model...")
+
+        for name, module in model.named_modules():
+            if name in self.smoothing_factors:
+                smoothing_factor = self.smoothing_factors[name]
+
+                # Apply smoothing: W' = W * diag(s), where s is smoothing factor
+                # Weight: [out_features, in_features]
+                # Smoothing: [in_features]
+                module.weight.data = module.weight.data * smoothing_factor.unsqueeze(0)
+
+                print(f"Applied smoothing to {name}")
+
+    def quantize_weight(self, weight: torch.Tensor, per_channel: bool = True) -> Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Quantize weights to INT8"""
+        if per_channel:
+            # Per output channel quantization
+            dim = 0  # Quantize along output dimension
+            w_max = weight.abs().max(dim=1, keepdim=True)[0]  # [out_features, 1]
+        else:
+            # Per tensor quantization
+            w_max = weight.abs().max()
+
+        # Compute scale
+        scale = w_max / self.w_qmax
+        scale = torch.clamp(scale, min=1e-5)
+
+        if self.config.symmetric_weight:
+            # Symmetric quantization
+            zero_point = torch.zeros_like(scale)
+            qweight = torch.round(weight / scale).clamp(self.w_qmin, self.w_qmax)
+        else:
+            # Asymmetric quantization
+            w_min = weight.min(dim=1, keepdim=True)[0] if per_channel else weight.min()
+            zero_point = torch.round(-w_min / scale).clamp(self.w_qmin, self.w_qmax)
+            qweight = torch.round(weight / scale + zero_point).clamp(self.w_qmin, self.w_qmax)
+
+        return qweight.to(torch.int8), scale, zero_point
+
+    def dequantize_weight(self, qweight: torch.Tensor, scale: torch.Tensor,
+                          zero_point: torch.Tensor) -> torch.Tensor:
+        """Dequantize weights from INT8"""
+        if self.config.symmetric_weight:
+            return qweight.float() * scale
+        else:
+            return (qweight.float() - zero_point) * scale
+
+    def quantize_activation(self, activation: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Quantize activations to INT8"""
+        original_shape = activation.shape
+
+        if self.config.per_token_activation and activation.dim() == 3:
+            # Per-token quantization: [batch, seq, hidden] -> quantize per token
+            batch_size, seq_len, hidden_size = activation.shape
+            activation_flat = activation.view(-1, hidden_size)  # [batch*seq, hidden]
+
+            # Compute per-token statistics
+            act_max = activation_flat.abs().max(dim=-1, keepdim=True)[0]  # [batch*seq, 1]
+            act_min = activation_flat.min(dim=-1, keepdim=True)[0]  # [batch*seq, 1]
+
+            # Compute scale and zero point
+            if self.config.symmetric_activation:
+                scale = act_max / self.a_qmax  # [batch*seq, 1]
+                zero_point = torch.zeros_like(scale)  # [batch*seq, 1]
+            else:
+                scale = (act_max - act_min) / self.a_qmax  # [batch*seq, 1]
+                zero_point = torch.round(-act_min / scale).clamp(self.a_qmin, self.a_qmax)  # [batch*seq, 1]
+
+            scale = torch.clamp(scale, min=1e-5)
+
+            # Quantize
+            if self.config.symmetric_activation:
+                qactivation_flat = torch.round(activation_flat / scale).clamp(-self.a_qmax, self.a_qmax)
+            else:
+                qactivation_flat = torch.round(activation_flat / scale + zero_point).clamp(self.a_qmin, self.a_qmax)
+
+            # Reshape everything back to original shape
+            qactivation = qactivation_flat.view(original_shape).to(torch.int8)
+            scale = scale.view(batch_size, seq_len, 1)  # [batch, seq, 1]
+            zero_point = zero_point.view(batch_size, seq_len, 1)  # [batch, seq, 1]
+
+        else:
+            # Per-tensor quantization
+            act_max = activation.abs().max()
+            act_min = activation.min()
+
+            # Compute scale and zero point (scalars)
+            if self.config.symmetric_activation:
+                scale = act_max / self.a_qmax
+                zero_point = torch.zeros_like(scale)
+            else:
+                scale = (act_max - act_min) / self.a_qmax
+                zero_point = torch.round(-act_min / scale).clamp(self.a_qmin, self.a_qmax)
+
+            scale = torch.clamp(scale, min=1e-5)
+
+            # Quantize
+            if self.config.symmetric_activation:
+                qactivation = torch.round(activation / scale).clamp(-self.a_qmax, self.a_qmax)
+            else:
+                qactivation = torch.round(activation / scale + zero_point).clamp(self.a_qmin, self.a_qmax)
+
+            qactivation = qactivation.to(torch.int8)
+
+        return qactivation, scale, zero_point
+
+    def dequantize_activation(self, qactivation: torch.Tensor, scale: torch.Tensor,
+                              zero_point: torch.Tensor) -> torch.Tensor:
+        """Dequantize activations from INT8"""
+        if self.config.symmetric_activation:
+            return qactivation.float() * scale
+        else:
+            return (qactivation.float() - zero_point) * scale
+
+
+class SmoothQuantLinear(nn.Module):
+    """Quantized Linear layer with SmoothQuant"""
+
+    def __init__(self, in_features: int, out_features: int, bias: bool = True,
+                 smoothing_factor: Optional[torch.Tensor] = None,
+                 config: SmoothQuantConfig = None):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.config = config or SmoothQuantConfig()
+
+        # Store quantized weights
+        self.register_buffer('qweight', torch.zeros(out_features, in_features, dtype=torch.int8))
+        self.register_buffer('weight_scale', torch.zeros(out_features, 1))
+        self.register_buffer('weight_zero_point', torch.zeros(out_features, 1))
+
+        # Store smoothing factor
+        if smoothing_factor is not None:
+            self.register_buffer('smoothing_factor', smoothing_factor)
+        else:
+            self.register_buffer('smoothing_factor', torch.ones(in_features))
+
+        # Bias
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_features))
+        else:
+            self.register_parameter('bias', None)
+
+        self.quantizer = SmoothQuantizer(config)
+
+    def set_quantized_weight(self, qweight: torch.Tensor, scale: torch.Tensor,
+                             zero_point: torch.Tensor):
+        """Set quantized weight parameters"""
+        self.qweight.copy_(qweight)
+        self.weight_scale.copy_(scale)
+        self.weight_zero_point.copy_(zero_point)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Apply inverse smoothing to input activations
+        # x_smoothed = x / smoothing_factor
+        # Ensure smoothing_factor is broadcastable
+        if x.dim() == 3:  # [batch, seq, hidden]
+            smoothing_factor = self.smoothing_factor.unsqueeze(0).unsqueeze(0)  # [1, 1, hidden]
+        else:  # [batch, hidden] or other shapes
+            smoothing_factor = self.smoothing_factor.unsqueeze(0)  # [1, hidden]
+
+        x_smooth = x / smoothing_factor
+
+        # Quantize input activations
+        qx, act_scale, act_zero_point = self.quantizer.quantize_activation(x_smooth)
+
+        # Dequantize for computation (in practice, this would be done in INT8)
+        x_dequant = self.quantizer.dequantize_activation(qx, act_scale, act_zero_point)
+        weight_dequant = self.quantizer.dequantize_weight(
+            self.qweight, self.weight_scale, self.weight_zero_point
+        )
+
+        # Linear computation
+        output = F.linear(x_dequant, weight_dequant, self.bias)
+
+        return output
+
+
+def convert_to_smoothquant(model, calibration_dataloader, config: SmoothQuantConfig = None):
+    """Convert a model to use SmoothQuant"""
+    config = config or SmoothQuantConfig()
+    quantizer = SmoothQuantizer(config)
+
+    # Step 1: Collect activation statistics
+    quantizer.collect_activation_stats(model, calibration_dataloader)
+
+    # Step 2: Compute smoothing factors
+    quantizer.compute_smoothing_factors(model)
+
+    # Step 3: Apply smoothing to weights
+    quantizer.apply_smoothing(model)
+
+    # Step 4: Convert linear layers to quantized versions
+    quantized_state_dict = {}
+
+    for name, module in model.named_modules():
+        if any(layer in name for layer in config.smooth_layers):
+            if isinstance(module, nn.Linear):
+                # Quantize the smoothed weights
+                qweight, weight_scale, weight_zero_point = quantizer.quantize_weight(
+                    module.weight.data, per_channel=config.per_channel_weight
+                )
+
+                # Get smoothing factor for this layer
+                smoothing_factor = quantizer.smoothing_factors.get(name,
+                                                                   torch.ones(module.in_features))
+
+                # Create quantized layer
+                sq_linear = SmoothQuantLinear(
+                    module.in_features,
+                    module.out_features,
+                    bias=module.bias is not None,
+                    smoothing_factor=smoothing_factor,
+                    config=config
+                )
+
+                # Set quantized parameters
+                sq_linear.set_quantized_weight(qweight, weight_scale, weight_zero_point)
+                if module.bias is not None:
+                    sq_linear.bias.data.copy_(module.bias.data)
+
+                # Store in state dict
+                base_name = name.replace(".weight", "").replace("_weight", "")
+                quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
+                quantized_state_dict[f"{base_name}.weight_scale"] = weight_scale.cpu()
+                quantized_state_dict[f"{base_name}.weight_zero_point"] = weight_zero_point.cpu()
+                quantized_state_dict[f"{base_name}.smoothing_factor"] = smoothing_factor.cpu()
+
+                if module.bias is not None:
+                    quantized_state_dict[f"{name}"] = module.bias.cpu()
+
+                print(f"Quantized layer: {name}")
+
+    return quantized_state_dict, quantizer
+
+
+def apply_smoothquant(model_state_dict: Dict[str, torch.Tensor],
+                      calibration_dataloader,
+                      config: SmoothQuantConfig = None) -> Dict[str, torch.Tensor]:
+    """
+    Apply SmoothQuant to a model state dictionary
+
+    Args:
+        model_state_dict: Original model state dictionary
+        calibration_dataloader: DataLoader for calibration data
+        config: SmoothQuant configuration
+
+    Returns:
+        Dictionary containing quantized weights and parameters
+    """
+    print("Starting SmoothQuant quantization...")
+
+    config = config or SmoothQuantConfig()
+
+    # Note: This is a simplified version. In practice, you'd need to:
+    # 1. Load the model from state_dict
+    # 2. Run calibration
+    # 3. Apply smoothing and quantization
+    # 4. Return the quantized state dict
+
+    # For demonstration, we'll show the structure:
+    quantized_state_dict = {}
+
+    # Process each layer
+    for name, param in tqdm(model_state_dict.items(), desc="Processing layers"):
+        if any(layer in name for layer in config.smooth_layers) and param.dim() == 2:
+            # This would be where you apply the full SmoothQuant pipeline
+            quantizer = SmoothQuantizer(config)
+
+            # Simulate quantization (in practice, you'd use actual calibration data)
+            weight = param.float()
+            qweight, scale, zero_point = quantizer.quantize_weight(weight)
+
+            base_name = name.replace(".weight", "")
+            quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
+            quantized_state_dict[f"{base_name}.weight_scale"] = scale.cpu()
+            quantized_state_dict[f"{base_name}.weight_zero_point"] = zero_point.cpu()
+
+        else:
+            # Keep non-quantized parameters
+            quantized_state_dict[name] = param.cpu()
+
+    print("SmoothQuant quantization completed!")
+    return quantized_state_dict
+
+
+# Example usage and testing
+if __name__ == "__main__":
+    # Example configuration
+    config = SmoothQuantConfig(
+        alpha=0.5,
+        w_bit=8,
+        a_bit=8,
+        symmetric_weight=True,
+        symmetric_activation=False,
+        per_channel_weight=True,
+        per_token_activation=True
+    )
+
+    # Test quantization
+    print("Testing SmoothQuant implementation...")
+
+    # Create a simple test case
+    test_weight = torch.randn(1024, 512) * 0.1
+    quantizer = SmoothQuantizer(config)
+
+    # Test weight quantization
+    print("Testing weight quantization...")
+    qweight, scale, zero_point = quantizer.quantize_weight(test_weight)
+    reconstructed = quantizer.dequantize_weight(qweight, scale, zero_point)
+
+    # Compute error
+    error = (test_weight - reconstructed).abs().mean()
+    print(f"Weight quantization error: {error:.6f}")
+    print(f"Weight shapes - original: {test_weight.shape}, quantized: {qweight.shape}")
+    print(f"Scale shape: {scale.shape}, Zero point shape: {zero_point.shape}")
+
+    # Test activation quantization with different shapes
+    print("\nTesting activation quantization...")
+
+    # Test 3D tensor (typical transformer input)
+    test_activation_3d = torch.randn(8, 128, 512) * 2.0
+    print(f"Input activation shape: {test_activation_3d.shape}")
+
+    qact, act_scale, act_zero_point = quantizer.quantize_activation(test_activation_3d)
+    print(f"Quantized activation shape: {qact.shape}")
+    print(f"Activation scale shape: {act_scale.shape}")
+    print(f"Activation zero point shape: {act_zero_point.shape}")
+
+    reconstructed_act = quantizer.dequantize_activation(qact, act_scale, act_zero_point)
+    print(f"Reconstructed activation shape: {reconstructed_act.shape}")
+
+    act_error = (test_activation_3d - reconstructed_act).abs().mean()
+    print(f"Activation quantization error: {act_error:.6f}")
+
+    # Test 2D tensor
+    print("\nTesting 2D activation...")
+    test_activation_2d = torch.randn(64, 512) * 2.0
+    qact_2d, act_scale_2d, act_zero_point_2d = quantizer.quantize_activation(test_activation_2d)
+    reconstructed_act_2d = quantizer.dequantize_activation(qact_2d, act_scale_2d, act_zero_point_2d)
+
+    act_error_2d = (test_activation_2d - reconstructed_act_2d).abs().mean()
+    print(f"2D Activation quantization error: {act_error_2d:.6f}")
+    print(f"2D shapes - scale: {act_scale_2d.shape}, zero_point: {act_zero_point_2d.shape}")
+
+    print("SmoothQuant implementation test completed!")
\ No newline at end of file
diff --git a/lite_llama/utils/common.py b/lite_llama/utils/common.py
index 9316369..f271290 100644
--- a/lite_llama/utils/common.py
+++ b/lite_llama/utils/common.py
@@ -206,7 +206,7 @@ def get_model_dtype(checkpoints_dir: str):
 def quantization(mode: str = None):
     quantized_linear_cls = None
     if mode == 'gptq.int4':
-        from ..kernels.int4_linear import GPTQLinear
+        from ..kernels.gptq_linear import GPTQLinear
         quantized_linear_cls = functools.partial(GPTQLinear, bits=4, tile_cols=-1)
     elif mode is not None:
         raise ValueError(f"Unknown quantization mode: {mode}")
diff --git a/tests/kernels/__init__.py b/tests/kernels/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/kernels/test_AWQLinear.py b/tests/kernels/test_AWQLinear.py
new file mode 100644
index 0000000..e69de29
diff --git a/test.py b/tests/kernels/test_GPTQLinear.py
similarity index 100%
rename from test.py
rename to tests/kernels/test_GPTQLinear.py
diff --git a/tests/kernels/test_SQLinear.py b/tests/kernels/test_SQLinear.py
new file mode 100644
index 0000000..d2e6531
--- /dev/null
+++ b/tests/kernels/test_SQLinear.py
@@ -0,0 +1,488 @@
+import torch
+import torch.nn as nn
+
+import numpy as np
+from typing import Dict, List, Tuple
+import time, os, sys, gc
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+from lite_llama.kernels.sq_linear import SmoothQuantLinear
+
+
+# Import the SmoothQuant implementation
+# from smoothquant_int4 import SmoothQuantLinear
+
+def get_memory_usage():
+    """Get current GPU memory usage in MB"""
+    if torch.cuda.is_available():
+        return torch.cuda.memory_allocated() / 1024 / 1024
+    return 0
+
+
+def clear_memory():
+    """Clear GPU memory cache"""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    gc.collect()
+
+
+def format_table(headers: List[str], rows: List[List[str]], title: str = "") -> str:
+    """Simple table formatter without external dependencies"""
+    if not rows:
+        return ""
+
+    # Calculate column widths
+    widths = [len(header) for header in headers]
+    for row in rows:
+        for i, cell in enumerate(row):
+            if i < len(widths):
+                widths[i] = max(widths[i], len(str(cell)))
+
+    # Create format string
+    fmt = " | ".join(f"{{:<{w}}}" for w in widths)
+    separator = "-+-".join("-" * w for w in widths)
+
+    # Build table
+    result = []
+    if title:
+        total_width = sum(widths) + 3 * (len(widths) - 1)
+        result.append(f"\n{title}")
+        result.append("=" * max(len(title), total_width))
+
+    result.append(fmt.format(*headers))
+    result.append(separator)
+
+    for row in rows:
+        result.append(fmt.format(*[str(cell) for cell in row]))
+
+    return "\n".join(result)
+
+
+class PerformanceComparison:
+    """Class to compare SmoothQuant INT4 with nn.Linear"""
+
+    def __init__(self, device='cuda' if torch.cuda.is_available() else 'cpu'):
+        self.device = device
+        self.results = []
+
+    def create_layers(self, in_features: int, out_features: int, group_size: int = 128):
+        """Create both SmoothQuant and nn.Linear layers with same weights"""
+        # Create standard linear layer
+        linear_layer = nn.Linear(in_features, out_features, bias=True, dtype=torch.float16)
+        linear_layer = linear_layer.to(self.device)
+
+        # Create SmoothQuant layer
+        sq_layer = SmoothQuantLinear(in_features, out_features, bias=True, group_size=group_size)
+        sq_layer = sq_layer.to(self.device)
+
+        # Copy weights and bias from linear to SmoothQuant
+        with torch.no_grad():
+            weight = linear_layer.weight.data.clone()
+            bias = linear_layer.bias.data.clone() if linear_layer.bias is not None else None
+
+            # Generate some sample activations to compute scales for SmoothQuant
+            sample_input = torch.randn(32, 128, in_features, dtype=torch.float16, device=self.device)
+            act_scales = sample_input.abs().amax(dim=(0, 1))
+
+            # Quantize the weights
+            sq_layer.quantize_weight(weight, act_scales)
+            if bias is not None:
+                sq_layer.bias.copy_(bias)
+
+        return linear_layer, sq_layer
+
+    def measure_memory(self, layer, input_tensor, layer_name: str):
+        """Measure memory usage of a layer"""
+        clear_memory()
+
+        # Baseline memory
+        baseline_memory = get_memory_usage()
+
+        # Model memory (parameters) - handle both regular and buffer parameters
+        model_memory = 0
+        for param in layer.parameters():
+            model_memory += param.numel() * param.element_size()
+
+        # Also count registered buffers (important for SmoothQuant)
+        for buffer in layer.buffers():
+            model_memory += buffer.numel() * buffer.element_size()
+
+        model_memory = model_memory / 1024 / 1024  # Convert to MB
+
+        # Ensure we have a minimum memory value to avoid division by zero
+        model_memory = max(model_memory, 0.001)  # At least 1KB
+
+        # Forward pass memory
+        with torch.no_grad():
+            _ = layer(input_tensor)
+            peak_memory = get_memory_usage()
+
+        activation_memory = peak_memory - baseline_memory - model_memory
+
+        return {
+            'layer_name': layer_name,
+            'model_memory_mb': model_memory,
+            'activation_memory_mb': max(0, activation_memory),
+            'total_memory_mb': peak_memory - baseline_memory
+        }
+
+    def measure_speed(self, layer, input_tensor, num_warmup: int = 10, num_runs: int = 100):
+        """Measure inference speed of a layer"""
+        # Warmup
+        with torch.no_grad():
+            for _ in range(num_warmup):
+                _ = layer(input_tensor)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+
+        # Timing
+        times = []
+        with torch.no_grad():
+            for _ in range(num_runs):
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                start = time.time()
+
+                _ = layer(input_tensor)
+
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+                end = time.time()
+
+                times.append((end - start) * 1000)  # Convert to ms
+
+        return {
+            'mean_time_ms': np.mean(times),
+            'std_time_ms': np.std(times),
+            'min_time_ms': np.min(times),
+            'max_time_ms': np.max(times)
+        }
+
+    def measure_accuracy(self, linear_layer, sq_layer, input_tensor):
+        """Compare numerical accuracy between layers"""
+        with torch.no_grad():
+            # Get outputs
+            linear_output = linear_layer(input_tensor)
+            sq_output = sq_layer(input_tensor)
+
+            # Calculate differences
+            abs_diff = (linear_output - sq_output).abs()
+            rel_diff = abs_diff / (linear_output.abs() + 1e-8)
+
+            return {
+                'mean_abs_error': abs_diff.mean().item(),
+                'max_abs_error': abs_diff.max().item(),
+                'mean_rel_error': rel_diff.mean().item(),
+                'max_rel_error': rel_diff.max().item(),
+                'mse': ((linear_output - sq_output) ** 2).mean().item(),
+                'cosine_similarity': torch.nn.functional.cosine_similarity(
+                    linear_output.flatten(), sq_output.flatten(), dim=0
+                ).item()
+            }
+
+    def run_comparison(self, test_configs: List[Dict]):
+        """Run comparison across multiple configurations"""
+        print(f"Running comparison on {self.device}")
+        print("=" * 80)
+
+        for config in test_configs:
+            print(f"\nTesting: {config['name']}")
+            print("-" * 40)
+
+            # Create input
+            input_tensor = torch.randn(
+                config['batch_size'],
+                config['seq_len'],
+                config['in_features'],
+                dtype=torch.float16,
+                device=self.device
+            )
+
+            # Create layers
+            linear_layer, sq_layer = self.create_layers(
+                config['in_features'],
+                config['out_features'],
+                config.get('group_size', 128)
+            )
+
+            # Measure memory
+            linear_memory = self.measure_memory(linear_layer, input_tensor, 'nn.Linear')
+            sq_memory = self.measure_memory(sq_layer, input_tensor, 'SmoothQuant')
+
+            # Measure speed
+            linear_speed = self.measure_speed(linear_layer, input_tensor)
+            sq_speed = self.measure_speed(sq_layer, input_tensor)
+
+            # Measure accuracy
+            accuracy = self.measure_accuracy(linear_layer, sq_layer, input_tensor)
+
+            # Calculate throughput (tokens/sec)
+            total_tokens = config['batch_size'] * config['seq_len']
+            linear_throughput = total_tokens / (linear_speed['mean_time_ms'] / 1000)
+            sq_throughput = total_tokens / (sq_speed['mean_time_ms'] / 1000)
+
+            # Store results
+            result = {
+                'config': config,
+                'linear_memory': linear_memory,
+                'sq_memory': sq_memory,
+                'linear_speed': linear_speed,
+                'sq_speed': sq_speed,
+                'accuracy': accuracy,
+                'linear_throughput': linear_throughput,
+                'sq_throughput': sq_throughput,
+                'speedup': linear_speed['mean_time_ms'] / sq_speed['mean_time_ms'],
+                'memory_reduction': linear_memory['model_memory_mb'] / sq_memory['model_memory_mb']
+            }
+
+            self.results.append(result)
+
+            # Print summary for this config
+            self.print_config_summary(result)
+
+    def print_config_summary(self, result):
+        """Print summary for a single configuration"""
+        config = result['config']
+
+        print(f"Input shape: [{config['batch_size']}, {config['seq_len']}, {config['in_features']}]")
+        print(f"Output features: {config['out_features']}")
+
+        # Speed comparison
+        print(f"\n🕐 Speed Comparison:")
+        print(
+            f"  nn.Linear:    {result['linear_speed']['mean_time_ms']:.3f} ± {result['linear_speed']['std_time_ms']:.3f} ms")
+        print(f"  SmoothQuant:  {result['sq_speed']['mean_time_ms']:.3f} ± {result['sq_speed']['std_time_ms']:.3f} ms")
+        print(f"  Speedup:      {result['speedup']:.2f}x")
+
+        # Throughput
+        print(f"\n🚀 Throughput:")
+        print(f"  nn.Linear:    {result['linear_throughput']:.0f} tokens/sec")
+        print(f"  SmoothQuant:  {result['sq_throughput']:.0f} tokens/sec")
+
+        # Memory comparison
+        print(f"\n💾 Memory Usage (Model Parameters):")
+        print(f"  nn.Linear:    {result['linear_memory']['model_memory_mb']:.2f} MB")
+        print(f"  SmoothQuant:  {result['sq_memory']['model_memory_mb']:.2f} MB")
+        print(f"  Reduction:    {result['memory_reduction']:.2f}x")
+
+        # Accuracy
+        print(f"\n📊 Accuracy:")
+        print(f"  Mean Abs Error:     {result['accuracy']['mean_abs_error']:.6f}")
+        print(f"  Max Abs Error:      {result['accuracy']['max_abs_error']:.6f}")
+        print(f"  Mean Rel Error:     {result['accuracy']['mean_rel_error']:.4f}")
+        print(f"  Cosine Similarity:  {result['accuracy']['cosine_similarity']:.6f}")
+
+    def generate_report(self):
+        """Generate comprehensive report with tables"""
+        if not self.results:
+            print("No results to report!")
+            return
+
+        print("\n" + "=" * 80)
+        print("COMPREHENSIVE COMPARISON REPORT")
+        print("=" * 80)
+
+        # Summary table
+        self.print_summary_table()
+
+        # Additional detailed analysis
+        self.print_detailed_analysis()
+
+    def print_summary_table(self):
+        """Print summary table of all results"""
+        headers = [
+            "Config", "Batch×Seq", "Features",
+            "Linear (ms)", "SmoothQ (ms)", "Speedup",
+            "Linear (MB)", "SmoothQ (MB)", "Mem Reduction",
+            "Mean Abs Err", "Cos Sim"
+        ]
+
+        rows = []
+        for result in self.results:
+            config = result['config']
+            rows.append([
+                config['name'],
+                f"{config['batch_size']}×{config['seq_len']}",
+                f"{config['in_features']}→{config['out_features']}",
+                f"{result['linear_speed']['mean_time_ms']:.2f}",
+                f"{result['sq_speed']['mean_time_ms']:.2f}",
+                f"{result['speedup']:.2f}x",
+                f"{result['linear_memory']['model_memory_mb']:.1f}",
+                f"{result['sq_memory']['model_memory_mb']:.1f}",
+                f"{result['memory_reduction']:.2f}x",
+                f"{result['accuracy']['mean_abs_error']:.4f}",
+                f"{result['accuracy']['cosine_similarity']:.4f}"
+            ])
+
+        print(format_table(headers, rows, "📋 Summary Table"))
+
+        # Overall statistics
+        speedups = [r['speedup'] for r in self.results]
+        memory_reductions = [r['memory_reduction'] for r in self.results]
+        accuracies = [r['accuracy']['cosine_similarity'] for r in self.results]
+
+        print(f"\n📈 Overall Statistics:")
+        print(f"  Average Speedup:       {np.mean(speedups):.2f}x (±{np.std(speedups):.2f})")
+        print(f"  Average Memory Reduction: {np.mean(memory_reductions):.2f}x (±{np.std(memory_reductions):.2f})")
+        print(f"  Average Cosine Similarity: {np.mean(accuracies):.4f} (±{np.std(accuracies):.4f})")
+
+    def print_detailed_analysis(self):
+        """Print detailed analysis without plots"""
+        if not self.results:
+            return
+
+        print("\n" + "=" * 60)
+        print("DETAILED ANALYSIS")
+        print("=" * 60)
+
+        # Performance analysis
+        print("\n🚀 Performance Analysis:")
+        print("-" * 30)
+
+        best_speedup = max(self.results, key=lambda x: x['speedup'])
+        worst_speedup = min(self.results, key=lambda x: x['speedup'])
+
+        print(f"Best speedup:  {best_speedup['speedup']:.2f}x ({best_speedup['config']['name']})")
+        print(f"Worst speedup: {worst_speedup['speedup']:.2f}x ({worst_speedup['config']['name']})")
+
+        # Memory analysis
+        print("\n💾 Memory Analysis:")
+        print("-" * 30)
+
+        best_memory = max(self.results, key=lambda x: x['memory_reduction'])
+        worst_memory = min(self.results, key=lambda x: x['memory_reduction'])
+
+        print(f"Best memory reduction:  {best_memory['memory_reduction']:.2f}x ({best_memory['config']['name']})")
+        print(f"Worst memory reduction: {worst_memory['memory_reduction']:.2f}x ({worst_memory['config']['name']})")
+
+        # Accuracy analysis
+        print("\n📊 Accuracy Analysis:")
+        print("-" * 30)
+
+        best_accuracy = max(self.results, key=lambda x: x['accuracy']['cosine_similarity'])
+        worst_accuracy = min(self.results, key=lambda x: x['accuracy']['cosine_similarity'])
+
+        print(
+            f"Best accuracy:  {best_accuracy['accuracy']['cosine_similarity']:.6f} ({best_accuracy['config']['name']})")
+        print(
+            f"Worst accuracy: {worst_accuracy['accuracy']['cosine_similarity']:.6f} ({worst_accuracy['config']['name']})")
+
+        # Efficiency analysis
+        print("\n⚡ Efficiency Analysis:")
+        print("-" * 30)
+
+        for result in self.results:
+            config = result['config']
+            efficiency_score = result['speedup'] * result['memory_reduction'] * result['accuracy']['cosine_similarity']
+            print(f"{config['name']:12}: Efficiency Score = {efficiency_score:.3f}")
+
+        # Scaling analysis
+        print("\n📈 Scaling Analysis:")
+        print("-" * 30)
+
+        # Sort by problem size (total parameters)
+        sorted_results = sorted(self.results, key=lambda x: x['config']['in_features'] * x['config']['out_features'])
+
+        for result in sorted_results:
+            config = result['config']
+            total_params = config['in_features'] * config['out_features']
+            print(
+                f"{config['name']:12}: {total_params:>10,} params, {result['speedup']:.2f}x speedup, {result['memory_reduction']:.2f}x memory")
+
+        # Recommendations
+        print("\n💡 Recommendations:")
+        print("-" * 30)
+
+        avg_speedup = np.mean([r['speedup'] for r in self.results])
+        avg_memory = np.mean([r['memory_reduction'] for r in self.results])
+        avg_accuracy = np.mean([r['accuracy']['cosine_similarity'] for r in self.results])
+
+        if avg_speedup > 1.5:
+            print("✅ SmoothQuant shows significant speed improvements")
+        else:
+            print("⚠️  SmoothQuant speed improvements are marginal")
+
+        if avg_memory > 3.0:
+            print("✅ SmoothQuant provides excellent memory savings")
+        else:
+            print("⚠️  SmoothQuant memory savings are lower than expected")
+
+        if avg_accuracy > 0.99:
+            print("✅ SmoothQuant maintains high numerical accuracy")
+        elif avg_accuracy > 0.95:
+            print("⚠️  SmoothQuant shows moderate accuracy degradation")
+        else:
+            print("❌ SmoothQuant shows significant accuracy degradation")
+
+
+def run_comprehensive_test():
+    """Run comprehensive comparison test"""
+
+    # Test configurations
+    test_configs = [
+        {
+            'name': 'Small',
+            'batch_size': 1,
+            'seq_len': 128,
+            'in_features': 512,
+            'out_features': 512,
+            'group_size': 128
+        },
+        {
+            'name': 'Medium',
+            'batch_size': 4,
+            'seq_len': 256,
+            'in_features': 1024,
+            'out_features': 1024,
+            'group_size': 128
+        },
+        {
+            'name': 'Large',
+            'batch_size': 8,
+            'seq_len': 512,
+            'in_features': 2048,
+            'out_features': 2048,
+            'group_size': 128
+        },
+        {
+            'name': 'Very Large',
+            'batch_size': 16,
+            'seq_len': 1024,
+            'in_features': 4096,
+            'out_features': 4096,
+            'group_size': 128
+        },
+        {
+            'name': 'LLaMA-like',
+            'batch_size': 1,
+            'seq_len': 2048,
+            'in_features': 4096,
+            'out_features': 11008,  # Typical MLP dimension
+            'group_size': 128
+        }
+    ]
+
+    # Run comparison
+    comparison = PerformanceComparison()
+    comparison.run_comparison(test_configs)
+    comparison.generate_report()
+
+    return comparison
+
+
+if __name__ == "__main__":
+    print("SmoothQuant INT4 vs nn.Linear Comprehensive Comparison")
+    print("=" * 60)
+
+    # Check if CUDA is available
+    if torch.cuda.is_available():
+        print(f"Using GPU: {torch.cuda.get_device_name()}")
+        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024 ** 3:.1f} GB")
+    else:
+        print("Using CPU (GPU not available)")
+
+    print()
+
+    # Run the comprehensive test
+    comparison = run_comprehensive_test()
+
+    print(f"\n🎉 Comparison complete!")
\ No newline at end of file

From f30154fcc305a999ffaac794698ec6ce27653118 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Tue, 15 Jul 2025 01:33:18 +0930
Subject: [PATCH 29/33] update sqlinear and awqlinear

---
 lite_llama/kernels/awq_linear.py | 290 ++++++++++++++++++
 lite_llama/kernels/sq_linear.py  | 112 -------
 lite_llama/quantization/awq.py   |  88 +-----
 tests/kernels/test_AWQLinear.py  | 492 +++++++++++++++++++++++++++++++
 4 files changed, 786 insertions(+), 196 deletions(-)

diff --git a/lite_llama/kernels/awq_linear.py b/lite_llama/kernels/awq_linear.py
index e69de29..7a1363b 100644
--- a/lite_llama/kernels/awq_linear.py
+++ b/lite_llama/kernels/awq_linear.py
@@ -0,0 +1,290 @@
+import torch
+import triton
+import triton.language as tl
+from typing import Optional
+import psutil, os, sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+from lite_llama.quantization.utils import pack_weight
+
+
+@triton.jit
+def awq_linear_kernel(
+        input_ptr, qweight_ptr, qscales_ptr, qzeros_ptr, output_ptr, bias_ptr,
+        M, N, K, group_size,
+        stride_input_m, stride_input_k,
+        stride_qweight_n, stride_qweight_k,
+        stride_qscales_n, stride_qscales_g,
+        stride_qzeros_n, stride_qzeros_g,
+        stride_output_m, stride_output_n,
+        BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+        GROUP_SIZE: tl.constexpr, HAS_BIAS: tl.constexpr,
+):
+    """Ultra-simplified AWQ linear kernel"""
+
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    # Only process one element per thread for maximum compatibility
+    m_idx = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    n_idx = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    m_mask = m_idx < M
+    n_mask = n_idx < N
+
+    # Initialize accumulator
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    # Simple loop over all K without chunking
+    for k in range(K):
+        # Load input values [BLOCK_M]
+        input_ptrs = input_ptr + m_idx * stride_input_m + k * stride_input_k
+        input_vals = tl.load(input_ptrs, mask=m_mask, other=0.0)
+
+        # Load and process weights [BLOCK_N]
+        packed_k = k // 2
+        is_high = k % 2
+
+        qweight_ptrs = qweight_ptr + n_idx * stride_qweight_n + packed_k * stride_qweight_k
+        packed_weights = tl.load(qweight_ptrs, mask=n_mask, other=0)
+
+        # Unpack 4-bit weights
+        if is_high == 1:
+            weights_int4 = (packed_weights >> 4) & 0xF
+        else:
+            weights_int4 = packed_weights & 0xF
+
+        # Get quantization parameters
+        group_idx = k // GROUP_SIZE
+
+        qscales_ptrs = qscales_ptr + n_idx * stride_qscales_n + group_idx * stride_qscales_g
+        qzeros_ptrs = qzeros_ptr + n_idx * stride_qzeros_n + group_idx * stride_qzeros_g
+
+        scales = tl.load(qscales_ptrs, mask=n_mask, other=1.0)
+        zeros = tl.load(qzeros_ptrs, mask=n_mask, other=0.0)
+
+        # Dequantize
+        weights_fp = (weights_int4.to(tl.float32) - zeros) * scales
+
+        # Accumulate outer product: input[m] * weight[n] -> acc[m, n]
+        acc += input_vals[:, None] * weights_fp[None, :]
+
+    # Add bias
+    if HAS_BIAS:
+        bias_ptrs = bias_ptr + n_idx
+        bias_vals = tl.load(bias_ptrs, mask=n_mask, other=0.0)
+        acc += bias_vals[None, :]
+
+    # Store result
+    output_ptrs = output_ptr + m_idx[:, None] * stride_output_m + n_idx[None, :] * stride_output_n
+    tl.store(output_ptrs, acc.to(tl.float16), mask=m_mask[:, None] & n_mask[None, :])
+
+
+def awq_linear_triton(
+        input: torch.Tensor,
+        qweight: torch.Tensor,
+        qscales: torch.Tensor,
+        qzeros: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+        group_size: int = 128
+) -> torch.Tensor:
+    """
+    AWQ quantized linear layer using Triton
+
+    Args:
+        input: Input tensor [*, in_features] in fp16
+        qweight: Packed quantized weights [out_features, in_features//2] in int8
+        qscales: Quantization scales [out_features, in_features//group_size]
+        qzeros: Quantization zeros [out_features, in_features//group_size]
+        bias: Optional bias [out_features]
+        group_size: Group size for quantization
+
+    Returns:
+        Output tensor [*, out_features] in fp16
+    """
+
+    # Reshape input to 2D
+    input_shape = input.shape
+    input_2d = input.view(-1, input.shape[-1])
+    M, K = input_2d.shape
+    N = qweight.shape[0]
+
+    # Ensure input is fp16
+    if input_2d.dtype != torch.float16:
+        input_2d = input_2d.to(torch.float16)
+
+    # Create output tensor
+    output = torch.empty((M, N), dtype=torch.float16, device=input.device)
+
+    # Block sizes - smaller for better compatibility
+    BLOCK_M = 16
+    BLOCK_N = 16
+    BLOCK_K = 16
+
+    # Grid configuration
+    grid = (
+        triton.cdiv(M, BLOCK_M),
+        triton.cdiv(N, BLOCK_N),
+    )
+
+    # Launch kernel
+    awq_linear_kernel[grid](
+        input_2d, qweight, qscales, qzeros, output, bias,
+        M, N, K, group_size,
+        # Input strides
+        input_2d.stride(0), input_2d.stride(1),
+        # QWeight strides
+        qweight.stride(0), qweight.stride(1),
+        # QScales strides
+        qscales.stride(0), qscales.stride(1),
+        # QZeros strides
+        qzeros.stride(0), qzeros.stride(1),
+        # Output strides
+        output.stride(0), output.stride(1),
+        # Block sizes
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        BLOCK_K=BLOCK_K,
+        GROUP_SIZE=group_size,
+        HAS_BIAS=bias is not None,
+    )
+
+    # Reshape output back to original shape
+    output_shape = input_shape[:-1] + (N,)
+    return output.view(output_shape)
+
+
+class AWQLinear(torch.nn.Module):
+    """
+    AWQ Quantized Linear Layer using Triton
+    """
+
+    def __init__(
+            self,
+            in_features: int,
+            out_features: int,
+            bias: bool = True,
+            group_size: int = 128,
+            wbits: int = 4,
+    ):
+        super().__init__()
+
+        self.in_features = in_features
+        self.out_features = out_features
+        self.group_size = group_size
+        self.wbits = wbits
+
+        # Calculate number of groups
+        self.num_groups = (in_features + group_size - 1) // group_size
+
+        # Register quantized weight parameters
+        # Packed weights: 2 int4 values per int8
+        packed_width = (in_features + 1) // 2
+        self.register_buffer('qweight', torch.zeros((out_features, packed_width), dtype=torch.uint8))
+        self.register_buffer('qscales', torch.zeros((out_features, self.num_groups), dtype=torch.float16))
+        self.register_buffer('qzeros', torch.zeros((out_features, self.num_groups), dtype=torch.float16))
+
+        if bias:
+            self.register_parameter('bias', torch.nn.Parameter(torch.zeros(out_features, dtype=torch.float16)))
+        else:
+            self.register_buffer('bias', None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward pass using Triton kernel"""
+        return awq_linear_triton(
+            input=x,
+            qweight=self.qweight,
+            qscales=self.qscales,
+            qzeros=self.qzeros,
+            bias=self.bias,
+            group_size=self.group_size
+        )
+
+    @classmethod
+    def from_float(
+            cls,
+            linear: torch.nn.Linear,
+            qweight: torch.Tensor,
+            qscales: torch.Tensor,
+            qzeros: torch.Tensor,
+            group_size: int = 128,
+    ):
+        """
+        Create AWQLinear from a regular Linear layer and quantization parameters
+
+        Args:
+            linear: Original torch.nn.Linear layer
+            qweight: Packed quantized weights
+            qscales: Quantization scales
+            qzeros: Quantization zeros
+            group_size: Group size used for quantization
+        """
+
+        awq_linear = cls(
+            in_features=linear.in_features,
+            out_features=linear.out_features,
+            bias=linear.bias is not None,
+            group_size=group_size,
+        )
+
+        # Copy quantized parameters
+        with torch.no_grad():
+            awq_linear.qweight.copy_(qweight)
+            awq_linear.qscales.copy_(qscales)
+            awq_linear.qzeros.copy_(qzeros)
+
+            if linear.bias is not None:
+                awq_linear.bias.copy_(linear.bias.to(torch.float16))
+
+        return awq_linear
+
+
+def demo_awq_triton():
+    """Demo function for AWQ Triton linear layer"""
+
+    # Set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if device.type == "cpu":
+        print("CUDA not available, demo will run on CPU (Triton requires CUDA)")
+        return
+
+    # Create test data
+    batch_size, seq_len = 2, 128
+    in_features, out_features = 768, 768
+    group_size = 128
+
+    # Create original linear layer
+    linear = torch.nn.Linear(in_features, out_features, bias=True).to(device)
+
+
+    # Mock quantized weights (in practice, these come from AWQ.quantize())
+    weight_int4 = torch.randint(0, 16, (out_features, in_features), dtype=torch.uint8, device=device)
+    qweight = pack_weight(weight_int4)
+
+    num_groups = (in_features + group_size - 1) // group_size
+    qscales = torch.randn((out_features, num_groups), dtype=torch.float16, device=device).abs() + 0.1
+    qzeros = torch.randint(0, 16, (out_features, num_groups), dtype=torch.float16, device=device)
+
+    # Create AWQ linear layer
+    awq_linear = AWQLinear.from_float(linear, qweight, qscales, qzeros, group_size)
+    awq_linear = awq_linear.to(device)
+
+    # Test input
+    x = torch.randn(batch_size, seq_len, in_features, dtype=torch.float16, device=device)
+
+    print(f"Input shape: {x.shape}")
+    print(f"QWeight shape: {qweight.shape}")
+    print(f"QScales shape: {qscales.shape}")
+    print(f"QZeros shape: {qzeros.shape}")
+
+    # Forward pass
+    with torch.no_grad():
+        output = awq_linear(x)
+
+    print(f"Output shape: {output.shape}")
+    print(f"Output dtype: {output.dtype}")
+    print("AWQ Triton linear demo completed successfully!")
+
+
+if __name__ == "__main__":
+    demo_awq_triton()
\ No newline at end of file
diff --git a/lite_llama/kernels/sq_linear.py b/lite_llama/kernels/sq_linear.py
index 186d5aa..a9ef948 100644
--- a/lite_llama/kernels/sq_linear.py
+++ b/lite_llama/kernels/sq_linear.py
@@ -293,117 +293,5 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return output.view(*x_shape[:-1], N)
 
 
-import torch
-import torch.nn as nn
-import time
-import gc
-import numpy as np
-from typing import Dict, List, Tuple
-
-
-# Import the SmoothQuant implementation
-# from smoothquant_int4 import SmoothQuantLinear
-
-def get_memory_usage():
-    """Get current GPU memory usage in MB"""
-    if torch.cuda.is_available():
-        return torch.cuda.memory_allocated() / 1024 / 1024
-    return 0
-
-
-def clear_memory():
-    """Clear GPU memory cache"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-
-
-def format_table(headers: List[str], rows: List[List[str]], title: str = "") -> str:
-    """Simple table formatter without external dependencies"""
-    if not rows:
-        return ""
-
-    # Calculate column widths
-    widths = [len(header) for header in headers]
-    for row in rows:
-        for i, cell in enumerate(row):
-            if i < len(widths):
-                widths[i] = max(widths[i], len(str(cell)))
-
-    # Create format string
-    fmt = " | ".join(f"{{:<{w}}}" for w in widths)
-    separator = "-+-".join("-" * w for w in widths)
-
-    # Build table
-    result = []
-    if title:
-        total_width = sum(widths) + 3 * (len(widths) - 1)
-        result.append(f"\n{title}")
-        result.append("=" * max(len(title), total_width))
-
-    result.append(fmt.format(*headers))
-    result.append(separator)
-
-    for row in rows:
-        result.append(fmt.format(*[str(cell) for cell in row]))
-
-    return "\n".join(result)
-
-
-import torch
-import torch.nn as nn
-import time
-import gc
-import numpy as np
-from typing import Dict, List, Tuple
-
-
-# Import the SmoothQuant implementation
-# from smoothquant_int4 import SmoothQuantLinear
-
-def get_memory_usage():
-    """Get current GPU memory usage in MB"""
-    if torch.cuda.is_available():
-        return torch.cuda.memory_allocated() / 1024 / 1024
-    return 0
-
-
-def clear_memory():
-    """Clear GPU memory cache"""
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    gc.collect()
-
-
-def format_table(headers: List[str], rows: List[List[str]], title: str = "") -> str:
-    """Simple table formatter without external dependencies"""
-    if not rows:
-        return ""
-
-    # Calculate column widths
-    widths = [len(header) for header in headers]
-    for row in rows:
-        for i, cell in enumerate(row):
-            if i < len(widths):
-                widths[i] = max(widths[i], len(str(cell)))
-
-    # Create format string
-    fmt = " | ".join(f"{{:<{w}}}" for w in widths)
-    separator = "-+-".join("-" * w for w in widths)
-
-    # Build table
-    result = []
-    if title:
-        total_width = sum(widths) + 3 * (len(widths) - 1)
-        result.append(f"\n{title}")
-        result.append("=" * max(len(title), total_width))
-
-    result.append(fmt.format(*headers))
-    result.append(separator)
-
-    for row in rows:
-        result.append(fmt.format(*[str(cell) for cell in row]))
-
-    return "\n".join(result)
 
 
diff --git a/lite_llama/quantization/awq.py b/lite_llama/quantization/awq.py
index 7ad27d1..90a943d 100644
--- a/lite_llama/quantization/awq.py
+++ b/lite_llama/quantization/awq.py
@@ -7,12 +7,11 @@
 from tqdm.auto import tqdm
 import triton
 import triton.language as tl
-import time, gc, psutil, os, sys
+import psutil, os, sys
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
 
-from lite_llama.utils.common import get_gpu_memory
-from utils import pack_weight, unpack_weight
+from lite_llama.quantization.utils import pack_weight
 from lite_llama.quantization.quant_config import AWQConfig
 
 
@@ -314,71 +313,10 @@ def quantize(self, weight: torch.Tensor, layer_name: str = "") -> Tuple[torch.Te
 
         # Quantize with computed scales
         qweight, qzeros, qscales = self.quantize_with_scales(weight, scales)
+        packed_qweight = pack_weight(qweight)
 
-        return qweight, qzeros, qscales
-
-    def dequantize(self, qweight: torch.Tensor, qzeros: torch.Tensor, qscales: torch.Tensor) -> torch.Tensor:
-        """Dequantize weights back to floating point"""
-        rows, cols = qweight.shape
-        groupsize = min(int(self.groupsize), cols) if self.groupsize != float('inf') else cols
-        num_groups = (cols + groupsize - 1) // groupsize
+        return packed_qweight, qzeros.to(torch.float16), qscales.to(torch.float16)
 
-        weight = torch.zeros_like(qweight, dtype=torch.float16)
-
-        # Handle dimension mismatch - ensure scales and zeros are 2D
-        if qscales.dim() == 1:
-            if len(qscales) == rows:
-                # If scales is per-row, expand to per-group
-                qscales = qscales.unsqueeze(1).expand(-1, num_groups)
-            else:
-                # If scales is per-group, expand to per-row
-                qscales = qscales.unsqueeze(0).expand(rows, -1)
-
-        if qzeros.dim() == 1:
-            if len(qzeros) == rows:
-                # If zeros is per-row, expand to per-group
-                qzeros = qzeros.unsqueeze(1).expand(-1, num_groups)
-            else:
-                # If zeros is per-group, expand to per-row
-                qzeros = qzeros.unsqueeze(0).expand(rows, -1)
-
-        # Ensure we have the right number of groups
-        if qscales.shape[1] != num_groups:
-            # Repeat or truncate to match expected groups
-            if qscales.shape[1] == 1:
-                qscales = qscales.expand(-1, num_groups)
-            else:
-                qscales = qscales[:, :num_groups]
-
-        if qzeros.shape[1] != num_groups:
-            if qzeros.shape[1] == 1:
-                qzeros = qzeros.expand(-1, num_groups)
-            else:
-                qzeros = qzeros[:, :num_groups]
-
-        for g in range(num_groups):
-            start_col = g * groupsize
-            end_col = min((g + 1) * groupsize, cols)
-
-            scale = qscales[:, g].unsqueeze(1)
-            zero = qzeros[:, g].unsqueeze(1)
-
-            q = qweight[:, start_col:end_col].float()
-
-            if self.zero_point:
-                weight[:, start_col:end_col] = (q - zero) * scale
-            else:
-                weight[:, start_col:end_col] = q * scale
-
-        return weight
-
-    def dequantize_packed(self, packed_qweight: torch.Tensor, qzeros: torch.Tensor,
-                          qscales: torch.Tensor, original_cols: int) -> torch.Tensor:
-        """Dequantize packed weights"""
-        # Unpack the weights first
-        qweight = unpack_weight(packed_qweight, original_cols)
-        # Then dequantize normally
-        return self.dequantize(qweight, qzeros, qscales)
 
 
 def quantize_awq(
@@ -523,10 +461,6 @@ def demo_awq():
 
     print("Quantized keys:", list(quantized_dict.keys()))
 
-    # Test dequantization
-    config = AWQConfig(w_bit=4, group_size=128, device="cpu")
-    awq = AWQ(config)
-
     # Debug: Check dimensions of quantized tensors
     layer_name = "layer1.q_proj"
     print(f"\nDebugging {layer_name}:")
@@ -538,20 +472,6 @@ def demo_awq():
     print(f"qzeros shape: {qzeros.shape}")
     print(f"qscales shape: {qscales.shape}")
 
-    # Dequantize one layer
-    original_weight = dummy_state_dict["layer1.q_proj.weight"]
-    try:
-        dequant_weight = awq.dequantize(qweight, qzeros, qscales)
-
-        print(f"\nResults:")
-        print(f"Original shape: {original_weight.shape}")
-        print(f"Dequantized shape: {dequant_weight.shape}")
-        print(f"Quantization error: {(original_weight - dequant_weight).abs().mean():.6f}")
-        print("AWQ demo completed successfully!")
-
-    except Exception as e:
-        print(f"Error during dequantization: {e}")
-        print("This might indicate a dimension mismatch in the quantization process.")
 
 
 if __name__ == "__main__":
diff --git a/tests/kernels/test_AWQLinear.py b/tests/kernels/test_AWQLinear.py
index e69de29..8e36ad1 100644
--- a/tests/kernels/test_AWQLinear.py
+++ b/tests/kernels/test_AWQLinear.py
@@ -0,0 +1,492 @@
+import torch
+import torch.nn as nn
+import time
+import psutil
+import os
+import numpy as np
+from typing import Dict, List, Tuple
+from dataclasses import dataclass
+
+import sys, os, time
+import torch
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+try:
+    from lite_llama.quantization.utils import pack_weight, unpack_weight
+    from lite_llama.quantization.quant_config import AWQConfig
+    from lite_llama.quantization.awq import AWQ, quantize_awq
+    from lite_llama.kernels.awq_linear import AWQLinear
+
+    UTILS_AVAILABLE = True
+except ImportError:
+    UTILS_AVAILABLE = False
+
+
+@dataclass
+class BenchmarkResults:
+    """Store benchmark results for comparison"""
+    layer_size: Tuple[int, int]
+    batch_size: int
+    sequence_length: int
+
+    # Speed metrics (milliseconds)
+    fp16_time: float
+    awq_time: float
+    speedup: float
+
+    # Accuracy metrics
+    max_error: float
+    mean_error: float
+    rmse: float
+    cosine_similarity: float
+
+    # Memory metrics (MB)
+    fp16_memory: float
+    awq_memory: float
+    memory_saving: float
+
+
+class AWQBenchmark:
+    """Comprehensive benchmark for AWQ vs nn.Linear"""
+
+    def __init__(self, device: str = "cuda"):
+        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
+        self.results: List[BenchmarkResults] = []
+
+    def get_memory_usage(self) -> float:
+        """Get current GPU memory usage in MB"""
+        if self.device.type == "cuda":
+            return torch.cuda.memory_allocated() / 1024 / 1024
+        else:
+            process = psutil.Process(os.getpid())
+            return process.memory_info().rss / 1024 / 1024
+
+    def measure_model_memory(self, model: nn.Module) -> float:
+        """Measure memory footprint of a model in MB"""
+        param_size = 0
+        buffer_size = 0
+
+        for param in model.parameters():
+            param_size += param.nelement() * param.element_size()
+
+        for buffer in model.buffers():
+            buffer_size += buffer.nelement() * buffer.element_size()
+
+        return (param_size + buffer_size) / 1024 / 1024
+
+    def create_test_data(self, batch_size: int, seq_len: int, hidden_dim: int) -> torch.Tensor:
+        """Create test input data"""
+        return torch.randn(batch_size, seq_len, hidden_dim, dtype=torch.float16, device=self.device)
+
+    def quantize_linear_layer(self, linear_layer: nn.Linear, group_size: int = 128) -> AWQLinear:
+        """Quantize a linear layer using AWQ"""
+        # Create state dict for the layer
+        state_dict = {"test_layer.weight": linear_layer.weight.data}
+        if linear_layer.bias is not None:
+            state_dict["test_layer.bias"] = linear_layer.bias.data
+
+        # Quantize using AWQ
+        quantized_dict = quantize_awq(
+            model_state_dict=state_dict,
+            wbits=4,
+            groupsize=group_size,
+            target_layers=["test_layer.weight"],
+            device=str(self.device)
+        )
+
+        # Extract quantized parameters
+        qweight = quantized_dict["test_layer.qweight"]
+        qscales = quantized_dict["test_layer.qscales"]
+        qzeros = quantized_dict["test_layer.qzeros"]
+
+        # Create AWQ linear layer
+        awq_layer = AWQLinear.from_float(
+            linear_layer, qweight, qscales, qzeros, group_size
+        )
+
+        return awq_layer.to(self.device)
+
+    def warmup_triton_kernel(self, model: nn.Module, input_data: torch.Tensor, warmup_runs: int = 50):
+        """Extensive warmup for Triton kernels to ensure compilation and caching"""
+        model.eval()
+        print(f"    Warming up Triton kernels ({warmup_runs} runs)...")
+
+        with torch.no_grad():
+            # First few runs trigger compilation
+            for i in range(warmup_runs):
+                _ = model(input_data)
+                if i < 10 and self.device.type == "cuda":
+                    # Extra synchronization for first few runs to handle compilation
+                    torch.cuda.synchronize()
+
+    def measure_inference_time(self, model: nn.Module, input_data: torch.Tensor,
+                               warmup_runs: int = 20, test_runs: int = 100,
+                               is_triton: bool = False) -> float:
+        """Measure average inference time in milliseconds"""
+        model.eval()
+
+        # Extended warmup for Triton kernels
+        if is_triton:
+            self.warmup_triton_kernel(model, input_data, warmup_runs=50)
+        else:
+            # Regular warmup for PyTorch
+            with torch.no_grad():
+                for _ in range(warmup_runs):
+                    _ = model(input_data)
+
+        # Clear cache and synchronize
+        if self.device.type == "cuda":
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+
+        # Measure time
+        start_time = time.perf_counter()
+
+        with torch.no_grad():
+            for _ in range(test_runs):
+                _ = model(input_data)
+
+        if self.device.type == "cuda":
+            torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+
+        avg_time_ms = (end_time - start_time) * 1000 / test_runs
+        return avg_time_ms
+
+    def compute_accuracy_metrics(self, fp16_output: torch.Tensor,
+                                 awq_output: torch.Tensor) -> Dict[str, float]:
+        """Compute accuracy metrics between FP16 and AWQ outputs"""
+        # Flatten tensors for easier computation
+        fp16_flat = fp16_output.view(-1).float()
+        awq_flat = awq_output.view(-1).float()
+
+        # Error metrics
+        error = (fp16_flat - awq_flat).abs()
+        max_error = error.max().item()
+        mean_error = error.mean().item()
+        rmse = torch.sqrt(((fp16_flat - awq_flat) ** 2).mean()).item()
+
+        # Cosine similarity
+        cos_sim = torch.nn.functional.cosine_similarity(
+            fp16_flat.unsqueeze(0), awq_flat.unsqueeze(0)
+        ).item()
+
+        return {
+            "max_error": max_error,
+            "mean_error": mean_error,
+            "rmse": rmse,
+            "cosine_similarity": cos_sim
+        }
+
+    def benchmark_layer_size(self, in_features: int, out_features: int,
+                             batch_size: int = 16, seq_len: int = 128) -> BenchmarkResults:
+        """Benchmark a specific layer configuration"""
+        print(f"\nBenchmarking layer [{in_features} -> {out_features}], "
+              f"batch_size={batch_size}, seq_len={seq_len}")
+
+        # Create original FP16 linear layer
+        fp16_layer = nn.Linear(in_features, out_features, bias=True)
+        fp16_layer = fp16_layer.to(self.device).half()
+
+        # Quantize to AWQ
+        print("  Quantizing layer...")
+        awq_layer = self.quantize_linear_layer(fp16_layer)
+
+        # Create test input
+        input_data = self.create_test_data(batch_size, seq_len, in_features)
+
+        # Measure memory usage
+        fp16_memory = self.measure_model_memory(fp16_layer)
+        awq_memory = self.measure_model_memory(awq_layer)
+        memory_saving = (fp16_memory - awq_memory) / fp16_memory * 100
+
+        print(f"  Memory: FP16={fp16_memory:.2f}MB, AWQ={awq_memory:.2f}MB, "
+              f"Saving={memory_saving:.1f}%")
+
+        # Measure inference speed with proper warmup
+        print("  Measuring FP16 speed...")
+        fp16_time = self.measure_inference_time(fp16_layer, input_data, is_triton=False)
+
+        print("  Measuring AWQ speed (with Triton warmup)...")
+        awq_time = self.measure_inference_time(awq_layer, input_data, is_triton=True)
+
+        speedup = fp16_time / awq_time if awq_time > 0 else 0.0
+
+        print(f"  Speed: FP16={fp16_time:.3f}ms, AWQ={awq_time:.3f}ms, "
+              f"Speedup={speedup:.2f}x")
+
+        # Measure accuracy
+        with torch.no_grad():
+            fp16_output = fp16_layer(input_data)
+            awq_output = awq_layer(input_data)
+
+        accuracy_metrics = self.compute_accuracy_metrics(fp16_output, awq_output)
+
+        print(f"  Accuracy: RMSE={accuracy_metrics['rmse']:.6f}, "
+              f"CosSim={accuracy_metrics['cosine_similarity']:.6f}")
+
+        # Store results
+        result = BenchmarkResults(
+            layer_size=(in_features, out_features),
+            batch_size=batch_size,
+            sequence_length=seq_len,
+            fp16_time=fp16_time,
+            awq_time=awq_time,
+            speedup=speedup,
+            max_error=accuracy_metrics["max_error"],
+            mean_error=accuracy_metrics["mean_error"],
+            rmse=accuracy_metrics["rmse"],
+            cosine_similarity=accuracy_metrics["cosine_similarity"],
+            fp16_memory=fp16_memory,
+            awq_memory=awq_memory,
+            memory_saving=memory_saving
+        )
+
+        self.results.append(result)
+        return result
+
+    def run_comprehensive_benchmark(self):
+        """Run benchmark across different layer sizes and configurations"""
+        print("Starting comprehensive AWQ vs FP16 benchmark...")
+        print(f"Device: {self.device}")
+
+        # Test configurations
+        layer_configs = [
+            (768, 768),  # Small transformer layer
+            (768, 3072),  # FFN up projection
+            (3072, 768),  # FFN down projection
+            (1024, 1024),  # Medium layer
+            (2048, 2048),  # Large layer
+            (4096, 4096),  # Very large layer
+        ]
+
+        batch_configs = [
+            (1, 128),  # Single sequence
+            (8, 128),  # Small batch
+            (16, 512),  # Medium batch + longer sequence
+            (32, 128),  # Large batch
+        ]
+
+        # Run benchmarks
+        for in_features, out_features in layer_configs:
+            for batch_size, seq_len in batch_configs:
+                try:
+                    self.benchmark_layer_size(
+                        in_features, out_features, batch_size, seq_len
+                    )
+                except Exception as e:
+                    print(f"  Error: {e}")
+                    continue
+
+        print(f"\nCompleted {len(self.results)} benchmark tests")
+
+    def analyze_results(self):
+        """Analyze and summarize benchmark results"""
+        if not self.results:
+            print("No results to analyze")
+            return
+
+        print("\n" + "=" * 80)
+        print("BENCHMARK SUMMARY")
+        print("=" * 80)
+
+        # Speed analysis
+        avg_speedup = np.mean([r.speedup for r in self.results])
+        max_speedup = max([r.speedup for r in self.results])
+        min_speedup = min([r.speedup for r in self.results])
+
+        print(f"\nSPEED ANALYSIS:")
+        print(f"  Average speedup: {avg_speedup:.2f}x")
+        print(f"  Max speedup: {max_speedup:.2f}x")
+        print(f"  Min speedup: {min_speedup:.2f}x")
+
+        # Memory analysis
+        avg_memory_saving = np.mean([r.memory_saving for r in self.results])
+        max_memory_saving = max([r.memory_saving for r in self.results])
+        min_memory_saving = min([r.memory_saving for r in self.results])
+
+        print(f"\nMEMORY ANALYSIS:")
+        print(f"  Average memory saving: {avg_memory_saving:.1f}%")
+        print(f"  Max memory saving: {max_memory_saving:.1f}%")
+        print(f"  Min memory saving: {min_memory_saving:.1f}%")
+
+        # Accuracy analysis
+        avg_rmse = np.mean([r.rmse for r in self.results])
+        max_rmse = max([r.rmse for r in self.results])
+        avg_cosine_sim = np.mean([r.cosine_similarity for r in self.results])
+        min_cosine_sim = min([r.cosine_similarity for r in self.results])
+
+        print(f"\nACCURACY ANALYSIS:")
+        print(f"  Average RMSE: {avg_rmse:.6f}")
+        print(f"  Max RMSE: {max_rmse:.6f}")
+        print(f"  Average cosine similarity: {avg_cosine_sim:.6f}")
+        print(f"  Min cosine similarity: {min_cosine_sim:.6f}")
+
+        # Find best and worst cases
+        best_speedup_idx = np.argmax([r.speedup for r in self.results])
+        worst_accuracy_idx = np.argmax([r.rmse for r in self.results])
+
+        print(f"\nBEST SPEEDUP:")
+        best_result = self.results[best_speedup_idx]
+        print(f"  Layer size: {best_result.layer_size}")
+        print(f"  Batch config: {best_result.batch_size}x{best_result.sequence_length}")
+        print(f"  Speedup: {best_result.speedup:.2f}x")
+
+        print(f"\nWORST ACCURACY:")
+        worst_result = self.results[worst_accuracy_idx]
+        print(f"  Layer size: {worst_result.layer_size}")
+        print(f"  Batch config: {worst_result.batch_size}x{worst_result.sequence_length}")
+        print(f"  RMSE: {worst_result.rmse:.6f}")
+        print(f"  Cosine similarity: {worst_result.cosine_similarity:.6f}")
+
+    def export_results_csv(self, filename: str = "awq_benchmark_results.csv"):
+        """Export results to CSV file"""
+        if not self.results:
+            print("No results to export")
+            return
+
+        import csv
+
+        with open(filename, 'w', newline='') as csvfile:
+            fieldnames = [
+                'layer_size', 'batch_size', 'sequence_length',
+                'fp16_time_ms', 'awq_time_ms', 'speedup',
+                'max_error', 'mean_error', 'rmse', 'cosine_similarity',
+                'fp16_memory_mb', 'awq_memory_mb', 'memory_saving_percent'
+            ]
+            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+            writer.writeheader()
+            for result in self.results:
+                writer.writerow({
+                    'layer_size': f"{result.layer_size[0]}x{result.layer_size[1]}",
+                    'batch_size': result.batch_size,
+                    'sequence_length': result.sequence_length,
+                    'fp16_time_ms': result.fp16_time,
+                    'awq_time_ms': result.awq_time,
+                    'speedup': result.speedup,
+                    'max_error': result.max_error,
+                    'mean_error': result.mean_error,
+                    'rmse': result.rmse,
+                    'cosine_similarity': result.cosine_similarity,
+                    'fp16_memory_mb': result.fp16_memory,
+                    'awq_memory_mb': result.awq_memory,
+                    'memory_saving_percent': result.memory_saving
+                })
+
+        print(f"Results exported to {filename}")
+
+
+def quick_demo():
+    """Quick demonstration of AWQ vs FP16 comparison"""
+    print("Quick AWQ vs FP16 Demo")
+    print("=" * 50)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Device: {device}")
+
+    # Create a simple test case
+    in_features, out_features = 768, 768
+    batch_size, seq_len = 16, 128
+
+    print(f"Testing {in_features}→{out_features} layer with batch={batch_size}, seq_len={seq_len}")
+
+    # Create FP16 linear layer
+    fp16_layer = nn.Linear(in_features, out_features, bias=True)
+    fp16_layer = fp16_layer.to(device).half()
+
+    # Create test input
+    input_data = torch.randn(batch_size, seq_len, in_features, dtype=torch.float16, device=device)
+
+    # Create benchmark instance
+    benchmark = AWQBenchmark(device=str(device))
+
+    # Get FP16 timing
+    print("Measuring FP16 performance...")
+    fp16_time = benchmark.measure_inference_time(fp16_layer, input_data, is_triton=False)
+
+    # Create quantized version
+    print("Quantizing layer with AWQ...")
+    awq_layer = benchmark.quantize_linear_layer(fp16_layer)
+
+    # Get AWQ timing with proper Triton warmup
+    print("Measuring AWQ performance (with Triton warmup)...")
+    awq_time = benchmark.measure_inference_time(awq_layer, input_data, is_triton=True)
+
+    # Get outputs for accuracy measurement
+    with torch.no_grad():
+        fp16_output = fp16_layer(input_data)
+        awq_output = awq_layer(input_data)
+
+    # Calculate metrics
+    fp16_memory = benchmark.measure_model_memory(fp16_layer)
+    awq_memory = benchmark.measure_model_memory(awq_layer)
+    memory_saving = (fp16_memory - awq_memory) / fp16_memory * 100
+    speedup = fp16_time / awq_time if awq_time > 0 else 0
+
+    # Accuracy metrics
+    accuracy_metrics = benchmark.compute_accuracy_metrics(fp16_output, awq_output)
+
+    # Print results
+    print(f"\nResults:")
+    print(f"  Speed:")
+    print(f"    FP16: {fp16_time:.3f}ms")
+    print(f"    AWQ:  {awq_time:.3f}ms")
+    print(f"    Speedup: {speedup:.2f}x")
+    print(f"  Memory:")
+    print(f"    FP16: {fp16_memory:.2f}MB")
+    print(f"    AWQ:  {awq_memory:.2f}MB")
+    print(f"    Saving: {memory_saving:.1f}%")
+    print(f"  Accuracy:")
+    print(f"    RMSE: {accuracy_metrics['rmse']:.6f}")
+    print(f"    Cosine Similarity: {accuracy_metrics['cosine_similarity']:.6f}")
+    print(f"    Max Error: {accuracy_metrics['max_error']:.6f}")
+
+
+def main():
+    """Main function to run the comprehensive benchmark"""
+
+    # Check if CUDA is available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Running benchmark on: {device}")
+
+    if device == "cpu":
+        print("Warning: Running on CPU. Triton kernels require CUDA for optimal performance.")
+
+    # Ask user which test to run
+    print("\nChoose test type:")
+    print("1. Quick demo (single layer test)")
+    print("2. Comprehensive benchmark (multiple configurations)")
+
+    try:
+        choice = input("Enter choice (1 or 2, default=1): ").strip()
+        if choice == "2":
+            # Create benchmark instance
+            benchmark = AWQBenchmark(device=device)
+
+            # Run comprehensive benchmark
+            benchmark.run_comprehensive_benchmark()
+
+            # Analyze results
+            benchmark.analyze_results()
+
+            # Export results
+            benchmark.export_results_csv()
+
+            print("\nBenchmark completed!")
+        else:
+            # Run quick demo
+            quick_demo()
+
+    except KeyboardInterrupt:
+        print("\nBenchmark interrupted by user")
+    except Exception as e:
+        print(f"Error running benchmark: {e}")
+        # Fallback to quick demo
+        print("Running quick demo instead...")
+        quick_demo()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From d96b8a039c591655cc098be6b64bf51f317e0753 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Fri, 18 Jul 2025 15:02:36 +0930
Subject: [PATCH 30/33] remove unnecessary method

---
 lite_llama/kernels/gptq_linear.py | 338 +++++++++++++-----------------
 lite_llama/quantization/gptq.py   |  21 --
 lite_llama/quantization/sq.py     |  76 -------
 3 files changed, 151 insertions(+), 284 deletions(-)

diff --git a/lite_llama/kernels/gptq_linear.py b/lite_llama/kernels/gptq_linear.py
index 8287e16..e23ecef 100644
--- a/lite_llama/kernels/gptq_linear.py
+++ b/lite_llama/kernels/gptq_linear.py
@@ -4,130 +4,69 @@
 import torch.nn as nn
 import numpy as np
 import sys, os
+
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-from lite_llama.quantization.gptq import GPTQ
-from lite_llama.quantization.quant_config import GPTQConfig
+from lite_llama.quantization.utils import pack_weight
 
 
 @triton.autotune(
-        configs=[
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 256,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=3,
-                num_warps=8,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 256,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 256,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 128,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 128,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=4,
-                num_warps=4,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 64,
-                    "BLOCK_SIZE_N": 32,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=5,
-                num_warps=2,
-            ),
-            triton.Config(
-                {
-                    "BLOCK_SIZE_M": 32,
-                    "BLOCK_SIZE_N": 64,
-                    "BLOCK_SIZE_K": 32,
-                    "GROUP_SIZE_M": 8,
-                },
-                num_stages=5,
-                num_warps=2,
-            ),
-        ],
-        key=["M", "N", "K"],
-    )
-
+    configs=[
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 256,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 64,
+                "BLOCK_SIZE_N": 256,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_SIZE_M": 128,
+                "BLOCK_SIZE_N": 128,
+                "BLOCK_SIZE_K": 32,
+                "GROUP_SIZE_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+    ],
+    key=["M", "N", "K"],
+)
 @triton.jit
 def int4_gemm_kernel(
-    a_ptr, b_ptr, c_ptr,
-    bscales_ptr, bzeros_ptr,
-    M, N, K,
-    stride_am, stride_ak,
-    stride_bk, stride_bn,
-    stride_cm, stride_cn,
-    GROUP_SIZE: tl.constexpr,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-    BLOCK_SIZE_K: tl.constexpr,
-    GROUP_SIZE_M: tl.constexpr,
+        a_ptr, b_ptr, c_ptr,
+        bscales_ptr, bzeros_ptr,
+        M, N, K,
+        stride_am, stride_ak,
+        stride_bk, stride_bn,
+        stride_cm, stride_cn,
+        GROUP_SIZE: tl.constexpr,
+        BLOCK_SIZE_M: tl.constexpr,
+        BLOCK_SIZE_N: tl.constexpr,
+        BLOCK_SIZE_K: tl.constexpr,
+        GROUP_SIZE_M: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
@@ -153,16 +92,15 @@ def int4_gemm_kernel(
 
     for k in range(0, K, BLOCK_SIZE_K):
         b_q = tl.load(b_ptrs, mask=b_mask)
-
         a = tl.load(a_ptrs, mask=a_mask).to(tl.float16)
 
         # Compute per-group index
-        k_offset = k + offs_k  # shape: [BLOCK_SIZE_K]
-        group_idx = k_offset // GROUP_SIZE  # [BLOCK_SIZE_K]
+        k_offset = k + offs_k
+        group_idx = k_offset // GROUP_SIZE
 
         # Load scale and zero for each [N, G]
-        scale = tl.load(bscales_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)  # [BLOCK_SIZE_K, BLOCK_SIZE_N]
-        zero = tl.load(bzeros_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)   # same shape
+        scale = tl.load(bscales_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)
+        zero = tl.load(bzeros_ptr + stride_bn * offs_bn[None, :] + group_idx[:, None]).to(tl.float16)
 
         # Extract int4 values from uint8
         shift = (k_offset[:, None] % 2) * 4
@@ -183,15 +121,13 @@ def int4_gemm_kernel(
 
 
 def triton_int4_gemm(
-    inp: torch.Tensor,
-    weight: torch.Tensor,
-    scales: torch.Tensor,
-    zeros: torch.Tensor,
-    group_size: int = 64
+        inp: torch.Tensor,
+        weight: torch.Tensor,
+        scales: torch.Tensor,
+        zeros: torch.Tensor,
+        group_size: int = 64
 ) -> torch.Tensor:
-
-
-    weight = weight.t().contiguous()  # [K/2, N]
+    weight = weight.t().contiguous()
     c_shape = inp.shape[:-1] + weight.shape[-1:]
     inp = inp.view(-1, inp.shape[-1]).contiguous()
 
@@ -207,8 +143,6 @@ def triton_int4_gemm(
     M, K = inp.shape
     N = weight.shape[1]
 
-
-
     c = torch.empty((M, N), device=inp.device, dtype=torch.float16)
 
     grid = lambda META: (
@@ -227,50 +161,58 @@ def triton_int4_gemm(
 
     return c[:c_crop] if c_crop is not None else c.view(c_shape)
 
+
 class GPTQLinear(nn.Module):
     """
-    4-bit quantized linear layer using Triton kernels
+    4-bit quantized linear layer using Triton kernels (修复版本)
     """
 
-    def __init__(self, in_features, out_features, bias=True, dtype=torch.float16, bits=4, groupsize=64, device="cuda", tile_cols=None,):
+    def __init__(self, in_features, out_features, bias=True, dtype=torch.float16, bits=4, groupsize=64, device="cuda",
+                 tile_cols=None):
         super().__init__()
         self.in_features = in_features
         self.out_features = out_features
         self.groupsize = groupsize
         self.device = device
-        self.dtype = dtype  # optional
-        self.bits = bits  # optional
+        self.dtype = dtype
+        self.bits = bits
         self.tile_cols = groupsize
         self.original_out_features = out_features
 
-        # Quantized params (assigned later)
-        self.register_buffer("packed_weight", None)
-        self.register_buffer("scales", None)
-        self.register_buffer("zeros", None)
-        self.register_buffer("bias", None if not bias else torch.empty(out_features))
-
-    @staticmethod
-    def pack_weight(weight):
-        rows, cols = weight.shape
-        if cols % 2 != 0:
-            weight = torch.nn.functional.pad(weight, (0, 1), value=0)
-            cols += 1
-        packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
-        return packed.contiguous()
-
-    def get_weight(self, packed: torch.Tensor) -> torch.Tensor:
-        """
-        [rows, ceil(cols/2)] uint8 -> [rows, cols] uint8 in [0, 15]
-        """
-        rows, packed_cols = packed.shape
-        qweight = torch.empty((rows, packed_cols * 2), dtype=torch.uint8, device=packed.device)
-        qweight[:, 0::2] = packed & 0xF
-        qweight[:, 1::2] = (packed >> 4) & 0xF
-        return qweight[:, :self.in_features].contiguous()
+        # 计算量化参数的形状
+        self.num_groups = (in_features + groupsize - 1) // groupsize
+        packed_width = (in_features + 1) // 2  # 2个int4打包成1个uint8
+
+        # 注册量化参数缓冲区 - 修复：确保所有缓冲区都被正确初始化
+        self.register_buffer("packed_weight", torch.zeros(out_features, packed_width, dtype=torch.uint8))
+        self.register_buffer("scales", torch.ones(out_features, self.num_groups, dtype=torch.float16))
+        self.register_buffer("zeros", torch.zeros(out_features, self.num_groups, dtype=torch.float16))
+
+        # Bias参数
+        if bias:
+            self.register_parameter("bias", nn.Parameter(torch.zeros(out_features, dtype=torch.float16)))
+        else:
+            self.register_buffer("bias", None)
+
+
+    def set_quantized_params(self, packed_weight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor):
+        """设置量化参数 - 新增方法"""
+        with torch.no_grad():
+            if packed_weight is not None:
+                self.packed_weight.copy_(packed_weight)
+            if scales is not None:
+                self.scales.copy_(scales)
+            if zeros is not None:
+                self.zeros.copy_(zeros)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         x_flat = x.view(-1, self.in_features)
-        # Compute quantized matmul
+
+        # 确保所有参数都已正确设置
+        if self.packed_weight is None or self.scales is None or self.zeros is None:
+            raise RuntimeError("Quantized parameters not properly initialized. Call set_quantized_params() first.")
+
+        # 使用Triton优化的int4 GEMM
         output = triton_int4_gemm(
             x_flat.float(),
             self.packed_weight,
@@ -284,13 +226,34 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         return output.view(*x.shape[:-1], self.out_features)
 
+    @classmethod
+    def from_linear(cls, linear_layer: nn.Linear, groupsize: int = 128, bits: int = 4):
+        """从标准线性层创建GPTQ层"""
+        gptq_layer = cls(
+            in_features=linear_layer.in_features,
+            out_features=linear_layer.out_features,
+            bias=linear_layer.bias is not None,
+            groupsize=groupsize,
+            bits=bits,
+            device=linear_layer.weight.device
+        )
+
+        # 复制bias
+        if linear_layer.bias is not None:
+            gptq_layer.bias.data.copy_(linear_layer.bias.data)
+
+        return gptq_layer
+
+    def __repr__(self):
+        return f"GPTQLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}, bits={self.bits}, groupsize={self.groupsize})"
+
 
 def test_gptqlinear_vs_nnlinear(
-    in_features=2048,
-    out_features=4096,
-    groupsize=64,
-    wbits=4,
-    device="cuda"
+        in_features=2048,
+        out_features=4096,
+        groupsize=64,
+        wbits=4,
+        device="cuda"
 ):
     torch.manual_seed(42)
     np.random.seed(42)
@@ -302,20 +265,24 @@ def test_gptqlinear_vs_nnlinear(
     weight = linear.weight.detach().to(device).float()
     bias = linear.bias.detach().to(device).float() if linear.bias is not None else None
 
-    # --- Quantize using GPTQ ---
-    config = GPTQConfig(
-        w_bit=wbits,
-        group_size=groupsize,
-    )
-    gptq = GPTQ(config)
-    qweight, qzeros, qscales, _ = gptq.quantize(weight)
-    packed_weight = GPTQLinear.pack_weight(qweight)
-
+    # --- 创建GPTQ层并模拟量化参数 ---
     gptqlinear = GPTQLinear(in_features, out_features, bias=True, groupsize=groupsize, device=device).to(device)
-    gptqlinear.packed_weight = packed_weight
-    gptqlinear.scales = qscales
-    gptqlinear.zeros = qzeros
-    gptqlinear.bias = bias if bias is not None else None
+
+    # 模拟量化参数（实际使用中这些来自GPTQ量化算法）
+    num_groups = (in_features + groupsize - 1) // groupsize
+    packed_width = (in_features + 1) // 2
+
+    # 创建模拟的量化参数
+    mock_packed_weight = torch.randint(0, 255, (out_features, packed_width), dtype=torch.uint8, device=device)
+    mock_scales = torch.randn(out_features, num_groups, dtype=torch.float16, device=device).abs() + 0.1
+    mock_zeros = torch.randint(0, 15, (out_features, num_groups), dtype=torch.float16, device=device)
+
+    # 设置量化参数
+    gptqlinear.set_quantized_params(mock_packed_weight, mock_scales, mock_zeros)
+
+    if bias is not None:
+        gptqlinear.bias.data.copy_(bias)
+
     gptqlinear.eval()
 
     print("\n== Latency ==")
@@ -324,15 +291,12 @@ def test_gptqlinear_vs_nnlinear(
     print(f"nn.Linear (fp16): {time_fp:.3f} ms")
     print(f"GPTQLinear:       {time_q:.3f} ms")
 
-    # print(torch.allclose(linear(x), gptqlinear(x), atol=1e-3))  # True / False
+    # 测试输出形状
     a = linear(x)
     b = gptqlinear(x)
-    abs_error = torch.abs(a - b)
-    rel_error = abs_error / (torch.abs(b) + 1e-8)
-    print("Mean abs error:", abs_error.mean().item())
-    print("Max abs error:", abs_error.max().item())
-    print("Mean rel error:", rel_error.mean().item())
-    print("Max rel error:", rel_error.max().item())
+    print(f"Output shapes - Linear: {a.shape}, GPTQ: {b.shape}")
+    print("GPTQ layer test completed successfully!")
+
 
 if __name__ == "__main__":
     test_gptqlinear_vs_nnlinear()
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
index af219dd..d609c58 100755
--- a/lite_llama/quantization/gptq.py
+++ b/lite_llama/quantization/gptq.py
@@ -254,27 +254,6 @@ def quantize(self, W: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.T
         return packed_qweight, zeros.to(torch.float16), scales.to(torch.float16), original_cols
 
 
-    def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
-        """
-        [O, I] int4, [O, num_groups] zero, [O, num_groups] scale => [O, I] float16
-        """
-        rows, cols = qweight.shape
-        # Use same effective groupsize as quantization
-        effective_groupsize = min(self.groupsize, 8)
-        effective_groupsize = max(effective_groupsize, 4)
-        num_groups = (cols + effective_groupsize - 1) // effective_groupsize
-        W = torch.zeros_like(qweight, dtype=torch.float16)
-
-        for g in range(num_groups):
-            start = g * effective_groupsize
-            end = min((g + 1) * effective_groupsize, cols)
-            scale = scales[:, g].unsqueeze(1)  # [O, 1]
-            zero = zeros[:, g].unsqueeze(1)  # [O, 1]
-            q = qweight[:, start:end].float()
-            W[:, start:end] = (q - zero) * scale
-
-        return W
-
 def quantize_gptq(
         model_state_dict: Dict[str, torch.Tensor],
         target_layers: Optional[list] = None,
diff --git a/lite_llama/quantization/sq.py b/lite_llama/quantization/sq.py
index efe5e4e..20ef2a5 100644
--- a/lite_llama/quantization/sq.py
+++ b/lite_llama/quantization/sq.py
@@ -248,69 +248,6 @@ def dequantize_activation(self, qactivation: torch.Tensor, scale: torch.Tensor,
             return (qactivation.float() - zero_point) * scale
 
 
-class SmoothQuantLinear(nn.Module):
-    """Quantized Linear layer with SmoothQuant"""
-
-    def __init__(self, in_features: int, out_features: int, bias: bool = True,
-                 smoothing_factor: Optional[torch.Tensor] = None,
-                 config: SmoothQuantConfig = None):
-        super().__init__()
-        self.in_features = in_features
-        self.out_features = out_features
-        self.config = config or SmoothQuantConfig()
-
-        # Store quantized weights
-        self.register_buffer('qweight', torch.zeros(out_features, in_features, dtype=torch.int8))
-        self.register_buffer('weight_scale', torch.zeros(out_features, 1))
-        self.register_buffer('weight_zero_point', torch.zeros(out_features, 1))
-
-        # Store smoothing factor
-        if smoothing_factor is not None:
-            self.register_buffer('smoothing_factor', smoothing_factor)
-        else:
-            self.register_buffer('smoothing_factor', torch.ones(in_features))
-
-        # Bias
-        if bias:
-            self.bias = nn.Parameter(torch.zeros(out_features))
-        else:
-            self.register_parameter('bias', None)
-
-        self.quantizer = SmoothQuantizer(config)
-
-    def set_quantized_weight(self, qweight: torch.Tensor, scale: torch.Tensor,
-                             zero_point: torch.Tensor):
-        """Set quantized weight parameters"""
-        self.qweight.copy_(qweight)
-        self.weight_scale.copy_(scale)
-        self.weight_zero_point.copy_(zero_point)
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Apply inverse smoothing to input activations
-        # x_smoothed = x / smoothing_factor
-        # Ensure smoothing_factor is broadcastable
-        if x.dim() == 3:  # [batch, seq, hidden]
-            smoothing_factor = self.smoothing_factor.unsqueeze(0).unsqueeze(0)  # [1, 1, hidden]
-        else:  # [batch, hidden] or other shapes
-            smoothing_factor = self.smoothing_factor.unsqueeze(0)  # [1, hidden]
-
-        x_smooth = x / smoothing_factor
-
-        # Quantize input activations
-        qx, act_scale, act_zero_point = self.quantizer.quantize_activation(x_smooth)
-
-        # Dequantize for computation (in practice, this would be done in INT8)
-        x_dequant = self.quantizer.dequantize_activation(qx, act_scale, act_zero_point)
-        weight_dequant = self.quantizer.dequantize_weight(
-            self.qweight, self.weight_scale, self.weight_zero_point
-        )
-
-        # Linear computation
-        output = F.linear(x_dequant, weight_dequant, self.bias)
-
-        return output
-
-
 def convert_to_smoothquant(model, calibration_dataloader, config: SmoothQuantConfig = None):
     """Convert a model to use SmoothQuant"""
     config = config or SmoothQuantConfig()
@@ -340,19 +277,6 @@ def convert_to_smoothquant(model, calibration_dataloader, config: SmoothQuantCon
                 smoothing_factor = quantizer.smoothing_factors.get(name,
                                                                    torch.ones(module.in_features))
 
-                # Create quantized layer
-                sq_linear = SmoothQuantLinear(
-                    module.in_features,
-                    module.out_features,
-                    bias=module.bias is not None,
-                    smoothing_factor=smoothing_factor,
-                    config=config
-                )
-
-                # Set quantized parameters
-                sq_linear.set_quantized_weight(qweight, weight_scale, weight_zero_point)
-                if module.bias is not None:
-                    sq_linear.bias.data.copy_(module.bias.data)
 
                 # Store in state dict
                 base_name = name.replace(".weight", "").replace("_weight", "")

From ec28d6502299a4b2f8b1b0679ec9141813d041d4 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Wed, 23 Jul 2025 03:00:29 +0930
Subject: [PATCH 31/33] add quant into generate stream

---
 generate.py                              | 191 +++++++++++-
 lite_llama/executor/model_executor.py    | 124 ++++----
 lite_llama/generate_stream.py            |  74 ++---
 lite_llama/llava_generate_stream.py      | 150 ++++------
 lite_llama/models/__init__.py            |   0
 lite_llama/models/quantized_models.py    | 356 ++++++++++++++++++++++
 lite_llama/quantization/awq.py           |  35 +--
 lite_llama/quantization/quant_config.py  |  24 +-
 lite_llama/quantization/quant_manager.py | 266 +++++++++++++++++
 lite_llama/quantization/sq.py            |   1 -
 lite_llama/quantization/utils.py         |   4 +-
 quantize_lite_llama.py                   | 363 +++++++++++++++++++++++
 12 files changed, 1327 insertions(+), 261 deletions(-)
 create mode 100644 lite_llama/models/__init__.py
 create mode 100644 lite_llama/models/quantized_models.py
 create mode 100644 lite_llama/quantization/quant_manager.py
 create mode 100644 quantize_lite_llama.py

diff --git a/generate.py b/generate.py
index ba205e1..7275461 100644
--- a/generate.py
+++ b/generate.py
@@ -1,13 +1,15 @@
+# 对原有的generate.py进行修改，添加量化支持
+
 import torch
-from typing import Optional
+from typing import Optional, List
 from lite_llama.utils.prompt_templates import get_prompter, get_image_token
-from lite_llama.generate_stream import GenerateStreamText  # import GenerateText
+from lite_llama.generate_stream import GenerateStreamText
 from lite_llama.utils.image_process import vis_images
 
 import warnings
 
 warnings.filterwarnings("ignore", category=UserWarning, module="torch._utils")
-from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type, quantization
+from lite_llama.utils.common import get_gpu_memory, detect_device, count_tokens, get_model_type
 from lite_llama.llava_generate_stream import LlavaGeneratorStream
 
 import sys, os, time
@@ -18,6 +20,9 @@
 import psutil
 from lite_llama.utils.logger import log
 
+# 新增导入
+from lite_llama.quantization.quant_manager import quantization_manager, QuantizationType
+
 process = psutil.Process(os.getpid())
 
 def report_resource_usage(ram_before, vram_before) -> None:
@@ -37,11 +42,9 @@ def report_resource_usage(ram_before, vram_before) -> None:
     log.info(f"GPU VRAM Used: {vram_text}")
 
 
-# Add these modifications to generate.py
-
 def generate_llama(
         prompt: str = "Hello, my name is",
-        quantize: Optional[str] = None,
+        quantization: Optional[str] = None,  # 新增参数
         *,
         temperature: float = 0.6,
         top_p: float = 0.9,
@@ -56,6 +59,12 @@ def generate_llama(
     assert checkpoint_path.is_dir(), checkpoint_path
     checkpoint_path = str(checkpoint_path)
 
+    # 检测量化类型
+    if quantization is None:
+        quantization = quantization_manager.detect_quantization_type(checkpoint_path)
+        if quantization != QuantizationType.NONE:
+            log.info(f"自动检测到量化类型: {quantization}")
+
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -66,6 +75,7 @@ def generate_llama(
     ram_before = process.memory_info().rss
     vram_before = get_gpu_memory(gpu_type)
 
+    # 创建生成器，传入量化参数
     generator = GenerateStreamText(
         checkpoints_dir=checkpoint_path,
         tokenizer_path=checkpoint_path,
@@ -73,6 +83,7 @@ def generate_llama(
         max_seq_len=max_seq_len,
         compiled_model=compiled_model,
         device=device,
+        quantization=quantization,  # 新增参数
     )
 
     model_prompter.insert_prompt(prompt)
@@ -109,7 +120,7 @@ def generate_llava(
         checkpoint_path: Path = Path("checkpoints/lit-llama/7B/"),
         figure_path: Path = Path("figures/lit-llama/"),
         gpu_type: str = "nvidia",
-        quantize: Optional[str] = None,
+        quantization: Optional[str] = None,  # 新增参数
         temperature: float = 0.6,
         top_p: float = 0.9,
         max_seq_len: int = 2048,
@@ -118,6 +129,13 @@ def generate_llava(
         compiled_model: bool = False,
 ):
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    
+    # 检测量化类型
+    if quantization is None:
+        quantization = quantization_manager.detect_quantization_type(str(checkpoint_path))
+        if quantization != QuantizationType.NONE:
+            log.info(f"自动检测到量化类型: {quantization}")
+    
     if max_seq_len <= 1024:
         short_prompt = True
     else:
@@ -146,6 +164,7 @@ def generate_llava(
             max_seq_len=max_seq_len,
             compiled_model=compiled_model,
             device=device,
+            quantization=quantization,  # 新增参数
         )
     except Exception as e:
         log.error(f"Model loading failure: {e}")
@@ -195,37 +214,179 @@ def main(
             prompt: str = "Hello, my name is",
             checkpoint_path: Path = Path("checkpoints/lite-llama/7B/"),
             figure_path: Optional[Path] = None,
-            quant: Optional[str] = None,
+            quantization: Optional[str] = None,  # 新增参数
     ):
         """
-        Generate text using lite_llama with automatic GPTQ detection
+        Generate text using lite_llama with optional quantization support
 
         Args:
             prompt: Input prompt text
             checkpoint_path: Path to model checkpoint directory
             figure_path: Path to Image file for LLaVA generation, optional
-            quant: Legacy quantization mode (ignored if use_gptq=True)
-            use_gptq: Whether to use GPTQ quantization
-            gptq_groupsize: Group size for GPTQ quantization
+            quantization: Quantization method ('gptq', 'awq', 'smoothquant', or None for auto-detection)
         """
         gpu_type = detect_device()
         model_path = os.path.abspath(checkpoint_path)
 
+        # 验证量化参数
+        if quantization and quantization not in ['gptq', 'awq', 'smoothquant']:
+            log.error(f"不支持的量化方法: {quantization}")
+            log.info("支持的量化方法: gptq, awq, smoothquant")
+            return
+
         if figure_path:
             generate_llava(
                 prompt=prompt,
                 checkpoint_path=Path(model_path),
                 figure_path=Path(figure_path),
                 gpu_type=gpu_type,
-                quantize=quant,
+                quantization=quantization,
             )
         else:
             generate_llama(
                 prompt=prompt,
                 checkpoint_path=Path(model_path),
                 gpu_type=gpu_type,
-                quantize=quant
+                quantization=quantization
             )
 
 
-    CLI(main)
\ No newline at end of file
+    CLI(main)
+
+
+# 新增量化推理的便捷函数
+def run_quantized_inference(
+    model_path: str,
+    prompt: str,
+    quantization_method: Optional[str] = None,
+    **kwargs
+):
+    """
+    运行量化推理的便捷函数
+    
+    Args:
+        model_path: 模型路径
+        prompt: 输入提示
+        quantization_method: 量化方法，None为自动检测
+        **kwargs: 其他推理参数
+    """
+    
+    # 检查模型是否存在
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"模型路径不存在: {model_path}")
+    
+    # 获取模型类型
+    model_type = get_model_type(model_path)
+    
+    # 设置默认参数
+    default_params = {
+        'temperature': 0.6,
+        'top_p': 0.9,
+        'max_seq_len': 2048,
+        'max_gen_len': 1024,
+        'compiled_model': False,
+    }
+    default_params.update(kwargs)
+    
+    if model_type == 'llava':
+        # LLaVA模型需要图像输入
+        figure_path = kwargs.get('figure_path')
+        if not figure_path:
+            log.warning("LLaVA模型需要图像输入，将使用默认图像")
+            # 这里可以设置一个默认图像路径
+        
+        generate_llava(
+            prompt=prompt,
+            checkpoint_path=Path(model_path),
+            figure_path=Path(figure_path) if figure_path else None,
+            quantization=quantization_method,
+            **default_params
+        )
+    else:
+        generate_llama(
+            prompt=prompt,
+            checkpoint_path=Path(model_path),
+            quantization=quantization_method,
+            **default_params
+        )
+
+
+# 量化性能测试函数
+def benchmark_quantized_model(
+    model_path: str,
+    quantization_methods: Optional[List[str]] = None,
+    test_prompts: Optional[List[str]] = None,
+    num_runs: int = 3
+):
+    """
+    对量化模型进行性能基准测试
+    
+    Args:
+        model_path: 模型路径
+        quantization_methods: 要测试的量化方法列表
+        test_prompts: 测试提示列表
+        num_runs: 每个配置的运行次数
+    """
+    
+    if quantization_methods is None:
+        quantization_methods = ['gptq', 'awq', 'smoothquant', None]  # None代表无量化
+    
+    if test_prompts is None:
+        test_prompts = [
+            "What is artificial intelligence?",
+            "Explain quantum computing in simple terms.",
+            "Write a short story about a robot."
+        ]
+    
+    results = {}
+    
+    for method in quantization_methods:
+        method_name = method or "no_quantization"
+        log.info(f"测试量化方法: {method_name}")
+        
+        method_results = []
+        
+        for prompt in test_prompts:
+            prompt_results = []
+            
+            for run in range(num_runs):
+                log.info(f"运行 {run + 1}/{num_runs}: {prompt[:50]}...")
+                
+                start_time = time.time()
+                try:
+                    run_quantized_inference(
+                        model_path=model_path,
+                        prompt=prompt,
+                        quantization_method=method,
+                        max_gen_len=256  # 限制生成长度以便快速测试
+                    )
+                    end_time = time.time()
+                    prompt_results.append(end_time - start_time)
+                    
+                except Exception as e:
+                    log.error(f"测试失败 ({method_name}, run {run + 1}): {e}")
+                    prompt_results.append(float('inf'))
+            
+            method_results.append(prompt_results)
+        
+        results[method_name] = method_results
+    
+    # 打印结果摘要
+    log.info("=" * 60)
+    log.info("基准测试结果摘要")
+    log.info("=" * 60)
+    
+    for method_name, method_results in results.items():
+        avg_times = []
+        for prompt_results in method_results:
+            valid_times = [t for t in prompt_results if t != float('inf')]
+            if valid_times:
+                avg_times.append(sum(valid_times) / len(valid_times))
+        
+        if avg_times:
+            overall_avg = sum(avg_times) / len(avg_times)
+            log.info(f"{method_name:15}: {overall_avg:.2f}s 平均响应时间")
+        else:
+            log.info(f"{method_name:15}: 测试失败")
+    
+    return results
\ No newline at end of file
diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 92c6ce7..10d5750 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -1,9 +1,11 @@
+# 在原有的model_executor.py基础上添加以下修改
+
 import torch
 import torch.nn as nn
 
 import json, time
 from pathlib import Path
-from typing import Callable, Type
+from typing import Callable, Type, Optional
 
 from transformers import LlavaConfig
 from accelerate import init_empty_weights, load_checkpoint_and_dispatch
@@ -17,10 +19,9 @@
 from ..kernels import update_kv_index
 from ..utils.logger import log
 
-
-# -----------------------------------------------------------------------------
-# Registry helpers (avoid long if/elif chains)
-# -----------------------------------------------------------------------------
+# 新增导入
+from ..quantization.quant_manager import quantization_manager, QuantizationType
+from ..models.quantized_models import create_quantized_model
 
 CONFIG_CLASS_MAP: dict[str, Type] = {
     "llama": LlamaConfig,
@@ -29,6 +30,7 @@
     "llava": LlavaConfig,
 }
 
+
 class ModelExecutor:
     # 定义类属性
     model_config = None
@@ -38,26 +40,39 @@ class ModelExecutor:
     # 通过静态方法 build 将类属性当作默认配置使用
     @staticmethod
     def build(
-        checkpoints_dir: str,
-        max_seq_len: int,
-        max_gpu_num_blocks: None,
-        compiled_model: bool = False,
-        device: str = "cuda",
+            checkpoints_dir: str,
+            max_seq_len: int,
+            max_gpu_num_blocks: None,
+            compiled_model: bool = False,
+            device: str = "cuda",
+            quantization: Optional[str] = None,  # 新增参数
     ):
         """
         构建 ModelExecutor 实例, 加载模型、分词器和初始化推理信息结构体 atten_info。
 
         参数:
             checkpoints_dir (str): 模型检查点目录路径。
-            load_model (bool): 是否加载模型权重。
             max_seq_len (int): 最大序列长度。
+            max_gpu_num_blocks: GPU块数量限制。
+            compiled_model (bool): 是否使用编译模型。
             device (str): 设备类型（'cuda'或'cpu'）。
+            quantization (str, optional): 量化类型，如'gptq', 'awq', 'smoothquant'等。
 
         返回:
             ModelExecutor: 初始化后的 ModelExecutor 实例。
         """
         model_config = ModelExecutor._load_model_config(checkpoints_dir, max_seq_len)
-        model = ModelExecutor._load_model_weight(model_config, checkpoints_dir, device=device)
+
+        # 检测或使用指定的量化类型
+        if quantization is None:
+            quantization = quantization_manager.detect_quantization_type(checkpoints_dir)
+            log.info(f"自动检测到量化类型: {quantization}")
+        else:
+            log.info(f"使用指定的量化类型: {quantization}")
+
+        model = ModelExecutor._load_model_weight(
+            model_config, checkpoints_dir, device=device, quantization=quantization
+        )
 
         return ModelExecutor(
             model_config, model, max_gpu_num_blocks, compiled_model, device
@@ -71,43 +86,36 @@ def _load_model_config(checkpoints_dir: str, max_seq_len: int):
 
         params = json.loads(cfg_path.read_text())
         cfg_cls = CONFIG_CLASS_MAP.get(params["model_type"].lower())
-        
+
         if cfg_cls is None:
             raise ValueError(f"Unsupported model_type {params['model_type']!r}")
-        
-        return cfg_cls.from_dict(params)
-    
-    @staticmethod
-    def _accelerate_load_weight(
-        model_config,
-        checkpoints_dir,
-        device="cuda",
-    ):
-        with init_empty_weights():
-            model = ModelExecutor._initialize_model(model_config, device=device)
 
-        # 假设 model 是使用 init_empty_weights 初始化的空模型
-        model = load_checkpoint_and_dispatch(
-            model, checkpoints_dir, device_map="auto", dtype=torch.float16
-        )
-
-        # 将模型转换为半精度, 并验证抓换
-        model.to(device)
-        model.half()
-        for param in model.parameters():
-            assert param.dtype == torch.float16, "Model parameters are not in FP16"
-        log.info("Converted model to half precision (FP16)")
-
-        return model
+        config = cfg_cls.from_dict(params)
+        config.max_seq_len = max_seq_len  # 确保设置正确的max_seq_len
+        return config
 
     @staticmethod
     def _load_model_weight(
-        model_config,
-        checkpoints_dir,
-        device="cuda",
+            model_config,
+            checkpoints_dir,
+            device="cuda",
+            quantization: Optional[str] = None,
     ):
         start_time = time.time()
 
+        if quantization and quantization != QuantizationType.NONE:
+            # 加载量化模型
+            log.info(f"加载量化模型: {quantization}")
+            model = quantization_manager.load_quantized_model(
+                model_path=checkpoints_dir,
+                model_config=model_config,
+                device=device
+            )
+
+            log.info(f"量化模型加载完成，耗时 {time.time() - start_time:.2f}s")
+            return model
+
+        # 原有的非量化模型加载逻辑
         # 初始化模型
         with init_empty_weights():
             model = ModelExecutor._initialize_model(model_config, device=device)
@@ -174,12 +182,12 @@ def _initialize_model(model_config, device: str) -> nn.Module:
         return model
 
     def __init__(
-        self,
-        model_config,
-        model,
-        max_gpu_num_blocks=None,
-        compiled_model=False,
-        device="cuda",
+            self,
+            model_config,
+            model,
+            max_gpu_num_blocks=None,
+            compiled_model=False,
+            device="cuda",
     ):
         self.model_config = model_config
         self.device = device
@@ -219,6 +227,8 @@ def __init__(
         if self.compiled_model:
             self.apply_cuda_graph()  # 调用 cuda graph 优化
 
+    # ... 其余方法保持不变 ...
+
     def _get_max_avaliable_tokens(self, gpu_memory_utilization=0.9, block_size=1):
         avaliable_blocks = ComputeMaxAvailableBlocks(
             num_layers=self.llm_config.num_layers,
@@ -234,7 +244,7 @@ def _get_max_avaliable_tokens(self, gpu_memory_utilization=0.9, block_size=1):
         return max_gpu_num_blocks, max_gpu_num_tokens
 
     def _init_mem_manager(
-        self, gpu_num_blocks, block_size=1, dtype=torch.float16, device="cuda"
+            self, gpu_num_blocks, block_size=1, dtype=torch.float16, device="cuda"
     ):
         kv_mem_manager = KVCacheMemoryManager(
             num_layers=self.llm_config.num_layers,
@@ -249,7 +259,7 @@ def _init_mem_manager(
         return kv_mem_manager
 
     def apply_cuda_graph(
-        self,
+            self,
     ):
         """应用 cuda graph 优化
         参数:
@@ -266,7 +276,7 @@ def apply_cuda_graph(
         self.model_runner.capture_decode_graph()
 
     def init_req_to_tokens_table(
-        self, b_req_tokens_table, b_req_idx, b_seq_len, alloc_mem_index
+            self, b_req_tokens_table, b_req_idx, b_seq_len, alloc_mem_index
     ):
         """
         初始化 prefill 阶段已分配的批次请求项的 kv cache 所用 tokens 索引
@@ -282,19 +292,19 @@ def init_req_to_tokens_table(
                 b_start_loc[i] = start_index
             cur_seq_len = b_seq_len_numpy[i]
             b_req_tokens_table[b_req_idx_numpy[i], :cur_seq_len] = alloc_mem_index[
-                start_index : start_index + cur_seq_len
-            ]
+                                                                   start_index: start_index + cur_seq_len
+                                                                   ]
             start_index += cur_seq_len
 
         return b_start_loc
 
     def prefill_alloc_kv_cache(
-        self,
-        max_prompt_len,
-        actual_prompt_lens,
-        b_req_idx,
-        image_batch_size=None,
-        debug_mode=False,
+            self,
+            max_prompt_len,
+            actual_prompt_lens,
+            b_req_idx,
+            image_batch_size=None,
+            debug_mode=False,
     ):
         """
         start_index:        tensor([  0, 270, 540, 810], device='cuda:0', dtype=torch.int32)
diff --git a/lite_llama/generate_stream.py b/lite_llama/generate_stream.py
index e60220a..0d318ef 100644
--- a/lite_llama/generate_stream.py
+++ b/lite_llama/generate_stream.py
@@ -7,6 +7,9 @@
 
 from transformers import AutoTokenizer
 
+# 新增导入
+from .quantization.quant_manager import QuantizationType
+
 # 设置日志
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -18,44 +21,27 @@ class CompletionPrediction(TypedDict, total=False):
     logprobs: list[float]  # not required
 
 
+# 保持采样函数不变
 @torch.inference_mode()
 def sample_top_p(probs, p):
     """
     执行 Top-p (Nucleus) 采样, 从概率分布中采样下一个词。
-
-    参数：
-        probs (torch.Tensor): 概率分布张量，形状为 `[batch_size, vocab_size]`。
-        p (float): 累积概率阈值，取值范围在 0 到 1 之间。
-    返回：
-        torch.Tensor: 采样得到的词索引，形状为 `[batch_size, 1]`。
-
-    说明：
-        Top-p 采样算法: 选择概率累积和超过阈值 p 的最小集合，将这些词的概率重新归一化后进行采样。
     """
-    # 对概率分布进行降序排序。probs_sort: 排序后的概率值，形状与 probs 相同。probs_idx: 排序后的索引，用于映射回原始词汇表。
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    # 计算排序后概率的累积和. 返回的 probs_sum 是累积概率分布。
     probs_sum = torch.cumsum(probs_sort, dim=-1)
-    # 保留累积概率未超过阈值 p 的词汇的概率，其余词汇的概率被置为 0.0。
-    mask = (
-        probs_sum - probs_sort > p
-    )  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
-    probs_sort[mask] = 0.0  # 将累积概率超过阈值 p 的词的概率置零。
+    mask = (probs_sum - probs_sort > p)
+    probs_sort[mask] = 0.0
 
-    # 对剩余的概率重新归一化, 确保总和为 1。
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    # 从重新归一化的概率分布中采样下一个词. 返回的 next_token 是采样得到的词在排序后概率分布中的索引。
     next_token_sorted_idx = torch.multinomial(probs_sort, num_samples=1)
-    # 在 probs_idx 的最后一维（dim=-1）中，使用 next_token_sorted_idx 作为索引，提取对应的值。沿着 dim=1（列）进行索引提取
-    # NOTE: torch.gather 函数按照给定的索引张量 index，从输入张量中收集 (获取) 数据，并返回一个与索引张量形状一致的张量。
     next_token = torch.gather(probs_idx, -1, index=next_token_sorted_idx)
 
-    return next_token  # 返回采样得到的下一个词的索引
+    return next_token
 
 
 class GenerateStreamText:
     """
-    GenerateText 类用于加载LLaMA模型并执行迭代式生成式推理 (文本生成)。
+    支持量化的GenerateStreamText类
     """
 
     def __init__(
@@ -66,20 +52,28 @@ def __init__(
         max_seq_len=1024,
         compiled_model=False,
         device="cuda",
+        quantization: Optional[str] = None,  # 新增参数
     ):
         self.checkpoints_dir = checkpoints_dir
+        self.quantization = quantization  # 存储量化类型
 
+        # 创建ModelExecutor时传入量化参数
         self.model_executor = ModelExecutor.build(
             checkpoints_dir=checkpoints_dir,
             max_gpu_num_blocks=max_gpu_num_blocks,
             max_seq_len=max_seq_len,
             compiled_model=compiled_model,
             device=device,
+            quantization=quantization,  # 新增参数
         )
         self.tokenizer = self.load_tokenizer(tokenizer_path)
         self.model_config = self.model_executor.model_config
         self.device = device
 
+        # 记录量化信息
+        if self.quantization and self.quantization != QuantizationType.NONE:
+            logger.info(f"使用量化推理: {self.quantization}")
+
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
 
@@ -106,22 +100,9 @@ def generate_stream(
     ) -> Generator[tuple[list[str], Optional[list[float]]], None, None]:
         """
         基于提供的 prompt_tokens, 使用语言生成模型逐个生成 token, 并在生成时立即输出。
-
-        参数：
-            prompt_tokens (list[list[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
-            max_gen_len (int): 生成的最大长度。
-            temperature (float, optional): 控制采样随机性的温度值。默认为 0.6。
-            top_p (float, optional): 用于 nucleus sampling 的概率阈值。默认为 0.9。
-            logprobs (bool, optional): 是否计算生成 token 的对数概率。默认为 False。
-            echo (bool, optional): 是否在输出中包含 prompt_tokens。默认为 False。
-
-        generator 输出：
-            tuple[list[str], Optional[list[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
-        说明：
-            该方法在生成循环中，每生成一个新 token, 就立即输出对应的文本和概率(如果需要）。
+        支持量化模型推理。
         """
         bsz = len(prompt_tokens)
-        # min_prompt_len = min(len(t) for t in prompt_tokens)
         max_prompt_len = max(len(t) for t in prompt_tokens)
         assert max_prompt_len <= self.model_config.max_seq_len
         total_len = min(self.model_config.max_seq_len, max_gen_len + max_prompt_len)
@@ -141,7 +122,7 @@ def generate_stream(
         prev_pos = 0
         last_yielded_pos = [
             len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)
-        ]  # 初始化每个样本已输出的位置
+        ]
 
         # 填充提示词到 tokens 张量
         for k, t in enumerate(prompt_tokens):
@@ -164,24 +145,19 @@ def generate_stream(
                 .repeat(batch_size, 1)  # shape: [batch_size, seq_len], 不分配额外内存
             )
 
+            # 使用量化模型进行前向推理
             logits = self.model_executor.forward(input_ids, position_ids)
             decode_select_index = self.model_executor.decode_alloc_kv_cache(bsz)
             all_select_index_list.append(decode_select_index)
 
             if temperature > 0:
-                # NOTE: logits[:, -1] 表示选择的是最后一个位置（seq_len 维度的最后一项）对应的 logits。
-                # NOTE: 在生成模型中的 prefill 阶段，我们只关心当前生成的最后一个 token 的分布。
                 probs = softmax_split(logits[:, -1] / temperature)
-                # NOTE: 使用核采样方法，从高概率的候选 token 中选择下一个 token 索引. top_p 控制采样范围（候选 token 的概率累积值）。
                 next_token = sample_top_p(probs, top_p)
             else:
                 next_token = torch.argmax(logits[:, -1], dim=-1)
 
             input_ids = next_token  # [batch_size, 1]
 
-            # 仅在需要生成的情况下替换 token
-            # NOTE: input_text_mask[:, cur_pos]：获取掩码中当前列的布尔值，表示每个序列在当前位置是否为实际输入词元。
-            # NOTE: tokens[:, cur_pos]：获取 tokens 中当前列的值。next_token：包含当前生成的词元 ID。
             mask = ~input_text_mask[:, cur_pos]  # [batch_size]
             tokens[:, cur_pos] = torch.where(
                 mask, next_token.reshape(-1), tokens[:, cur_pos]
@@ -192,12 +168,6 @@ def generate_stream(
             )
             prev_pos = cur_pos
 
-            # eos_reached 是一个布尔张量，记录每个序列是否到达了终止状态, 形状为 [batch_size, 1]。
-            # NOTE: ～input_text_mask[:, cur_pos] 标记当前生成位置是否是模型生成的部分（非输入部分）。True 表示当前列是待生成的部分。False 表示当前列是输入部分。
-            # NOTE: next_token == self.tokenizer.eos_token_id 表示检测当前生成的 next_token 是否等于 eos_token_id，即模型生成了终止标记。
-            # NOTE: & 表示按位与操作，确保当前位置是非输入部分且生成了终止标记。
-            # NOTE: 使用 |= 按位或更新，表示如果某个序列已经到达 eos_token_id，则保持 True 状态，不会被后续重置为 False。
-
             # 为整个批次收集输出
             batch_outputs = []
             for i in range(bsz):
@@ -207,11 +177,11 @@ def generate_stream(
                     token = tokens[i, start:end].tolist()
                     text = self.tokenizer.decode(
                         token, skip_special_tokens=True
-                    )  # 解码时跳过特殊标记。
+                    )
                     batch_outputs.append(text)
                     last_yielded_pos[i] = end
                 else:
-                    batch_outputs.append("")  # 如果没有新生成的内容，添加空字符串
+                    batch_outputs.append("")
 
             # 将整个批次的输出一次性 yield
             yield batch_outputs
@@ -251,4 +221,4 @@ def text_completion_stream(
         for batch_outputs in stream:
             for i, text in enumerate(batch_outputs):
                 completions[i]["generation"] += text
-            yield completions.copy()
+            yield completions.copy()
\ No newline at end of file
diff --git a/lite_llama/llava_generate_stream.py b/lite_llama/llava_generate_stream.py
index 91b664c..ba5f292 100644
--- a/lite_llama/llava_generate_stream.py
+++ b/lite_llama/llava_generate_stream.py
@@ -9,6 +9,9 @@
 
 from transformers import AutoTokenizer, AutoProcessor
 
+# 新增导入
+from .quantization.quant_manager import QuantizationType
+
 # 设置日志
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -21,85 +24,73 @@ class CompletionPrediction(TypedDict, total=False):
 
 
 def tokenizer_image_token(
-    prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
+        prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
 ):
     """
     处理包含特殊标记 <image> 的文本提示, 将其转换为相应的 token 序列，并在 <image> 位置插入指定的图像 token 索引。
-
-    "A cat <image> is sitting <image> on the mat."
-    [65,32,99,97,116,32000,32,105,115,32,115,105,116,116,105,110,103,32000,32,111,110,32,116,104,101,32,109,97,116,46]
-
-    参数:
-        prompt (str): 包含 <image> 标记的文本。
-        tokenizer: 分词器对象，需支持调用 tokenizer(chunk).input_ids。
-        image_token_index (int): 用于替换 <image> 标记的图像 token 索引。
-        return_tensors (str, optional): 指定返回的张量类型，例如 'pt' 表示 PyTorch 张量。
-
-    返回:
-        list 或 torch.Tensor: 生成的 token 序列。
     """
-    # 使用正则表达式分割，移除 '<image>' 前的空格，但保留后的空格
     prompt_chunks = re.split(r"\s?<image>", prompt)
-    # 不过滤空片段，以处理多个连续的 '<image>' 标记
     token_chunks = [tokenizer(chunk).input_ids for chunk in prompt_chunks]
 
     input_ids = []
     offset = 0
-    # 检查第一个片段是否以 BOS token 开始
     if (
-        len(token_chunks) > 0
-        and len(token_chunks[0]) > 0
-        and token_chunks[0][0] == tokenizer.bos_token_id
+            len(token_chunks) > 0
+            and len(token_chunks[0]) > 0
+            and token_chunks[0][0] == tokenizer.bos_token_id
     ):
         offset = 1
         input_ids.append(token_chunks[0][0])
 
-    # 插入图像 token
     for i, chunk in enumerate(token_chunks):
-        input_ids.extend(
-            chunk[offset:]
-        )  # 添加当前片段的 token，跳过 BOS token（如果已添加）
-        offset = 0  # 仅适用于第一个片段
-        if i < len(token_chunks) - 1:  # 如果不是最后一个片段，插入图像 token
+        input_ids.extend(chunk[offset:])
+        offset = 0
+        if i < len(token_chunks) - 1:
             input_ids.append(image_token_index)
 
     if return_tensors is not None:
         if return_tensors == "pt":
             return torch.tensor(input_ids, dtype=torch.long)
         raise ValueError(f"Unsupported tensor type: {return_tensors}")
-    """
-    [1, 3148, 1001, 29901, 32000, 1, 29871, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901]
-    """
+
     return input_ids
 
 
 class LlavaGeneratorStream:
     """
-    GenerateText 类用于加载LLaMA模型并执行迭代式生成式推理 (文本生成)。
+    支持量化的LlavaGeneratorStream类
     """
 
     def __init__(
-        self,
-        checkpoints_dir: str,
-        tokenizer_path: str,
-        max_gpu_num_blocks=None,
-        max_seq_len=2048,
-        compiled_model=False,
-        device="cuda",
+            self,
+            checkpoints_dir: str,
+            tokenizer_path: str,
+            max_gpu_num_blocks=None,
+            max_seq_len=2048,
+            compiled_model=False,
+            device="cuda",
+            quantization: Optional[str] = None,  # 新增参数
     ):
         self.checkpoints_dir = checkpoints_dir
         self.compiled_model = compiled_model
         self.max_seq_len = max_seq_len
         self.device = device
+        self.quantization = quantization  # 存储量化类型
 
+        # 创建ModelExecutor时传入量化参数
         self.model_executor = ModelExecutor.build(
             checkpoints_dir=checkpoints_dir,
             max_gpu_num_blocks=max_gpu_num_blocks,
             max_seq_len=max_seq_len,
             device=device,
+            quantization=quantization,  # 新增参数
         )
         self.tokenizer = self.load_tokenizer(tokenizer_path)
 
+        # 记录量化信息
+        if self.quantization and self.quantization != QuantizationType.NONE:
+            logger.info(f"使用量化推理 (LLaVA): {self.quantization}")
+
     def load_tokenizer(self, pretrained_model_name_or_path):
         model_name = get_model_name_from_path(pretrained_model_name_or_path)
 
@@ -143,32 +134,18 @@ def encode_images(self, image_items: list[Union[str, Image.Image]]):
 
     @torch.inference_mode()
     def generate_stream(
-        self,
-        prompt_tokens: list[list[int]],
-        image_tensors: Optional[torch.FloatTensor] = None,
-        max_gen_len: int = 2048,
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        echo: bool = False,
+            self,
+            prompt_tokens: list[list[int]],
+            image_tensors: Optional[torch.FloatTensor] = None,
+            max_gen_len: int = 2048,
+            temperature: float = 0.6,
+            top_p: float = 0.9,
+            echo: bool = False,
     ) -> Generator[tuple[list[str], Optional[list[float]]], None, None]:
         """
-        基于提供的 prompt_tokens, 使用语言生成模型逐个生成 token, 并在生成时立即输出。
-
-        参数：
-            prompt_tokens (list[list[int]]): 已经进行分词的 prompt, 每个 prompt 是一个整数列表。
-            max_gen_len (int): 生成的最大长度。
-            temperature (float, optional): 控制采样随机性的温度值。默认为 0.6。
-            top_p (float, optional): 用于 nucleus sampling 的概率阈值。默认为 0.9。
-            logprobs (bool, optional): 是否计算生成 token 的对数概率。默认为 False。
-            echo (bool, optional): 是否在输出中包含 prompt_tokens。默认为 False。
-
-        generator 输出：
-            tuple[list[str], Optional[list[float]]]: 包含生成的文本和对应的对数概率(如果 logprobs 为 True)。
-        说明：
-            该方法在生成循环中，每生成一个新 token, 就立即输出对应的文本和概率(如果需要）。
+        基于提供的 prompt_tokens, 使用量化的LLaVA模型逐个生成 token, 并在生成时立即输出。
         """
         bsz = len(prompt_tokens)
-        # min_prompt_len = min(len(t) for t in prompt_tokens)
         max_prompt_len = max(len(t) for t in prompt_tokens)
         assert max_prompt_len <= self.max_seq_len
         total_seq_len = min(self.max_seq_len, max_gen_len + max_prompt_len)
@@ -185,16 +162,14 @@ def generate_stream(
         tokens = torch.full(
             (bsz, total_seq_len), pad_id, dtype=torch.long, device=self.device
         )
-        # 生成一个布尔张量，它的值为 True 的位置表示输入序列的实际内容（即非填充部分）, 形状为 (batch_size, total_seq_len)
         input_text_mask = tokens != pad_id
         eos_reached = torch.tensor([False] * bsz, device=self.device)
         last_yielded_pos = [
             len(prompt_tokens[i]) if not echo else 0 for i in range(bsz)
-        ]  # 初始化每个样本已输出的位置
+        ]
 
         # 填充提示词到 tokens 张量
         for seq_id, token_ids in enumerate(prompt_tokens):
-            # NOTE: torch.long 等同于 torch.int64
             tokens[seq_id, : len(token_ids)] = (
                 token_ids.clone().detach().to(dtype=torch.long, device=self.device)
             )
@@ -213,9 +188,11 @@ def generate_stream(
         input_ids = tokens[:, :max_prompt_len]  # [batch_size, seq_len]
         for cur_pos in range(max_prompt_len, total_seq_len):
             batch_size, _ = input_ids.shape
+
+            # 使用量化模型进行前向推理
             logits = self.model_executor.forward(
                 input_ids, position_ids, image_tensors
-            )  # step 0: position_ids 由 llava 模型类给出
+            )
 
             start_pos += bsz
             position_ids = (
@@ -240,7 +217,7 @@ def generate_stream(
             )
 
             eos_reached = eos_reached | (
-                mask & (next_token == self.tokenizer.eos_token_id)
+                    mask & (next_token == self.tokenizer.eos_token_id)
             )
 
             # 为整个批次收集输出
@@ -254,7 +231,7 @@ def generate_stream(
                     batch_outputs.append(text)
                     last_yielded_pos[i] = end
                 else:
-                    batch_outputs.append("")  # 如果没有新生成的内容，添加空字符串
+                    batch_outputs.append("")
 
             # 将整个批次的输出一次性 yield
             yield batch_outputs
@@ -267,13 +244,13 @@ def generate_stream(
         self.model_executor.kv_mem_manager.release_ref(all_select_indexs)
 
     def text_completion_stream(
-        self,
-        prompts: list[str],
-        image_items: list[Union[str, Image.Image]],
-        temperature: float = 0.6,
-        top_p: float = 0.9,
-        max_gen_len: Optional[int] = None,
-        echo: bool = False,
+            self,
+            prompts: list[str],
+            image_items: list[Union[str, Image.Image]],
+            temperature: float = 0.6,
+            top_p: float = 0.9,
+            max_gen_len: Optional[int] = None,
+            echo: bool = False,
     ) -> Generator[list[CompletionPrediction], None, None]:
         """每次迭代时，生成器返回一个包含多个 CompletionPrediction 字典的列表。"""
 
@@ -285,11 +262,8 @@ def text_completion_stream(
                 x, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
             )
             for x in prompts
-        ]  # torch.Size([1, 22])
-        image_tensors = self.encode_images(
-            image_items
-        )  # image_tensors shape is torch.Size([1, 3, 336, 336])
-        # print(f"prompt 0 shape: {prompt_tokens[0].shape}, image_tensors shape: {image_tensors.shape}")
+        ]
+        image_tensors = self.encode_images(image_items)
 
         stream = self.generate_stream(
             prompt_tokens=prompt_tokens,
@@ -311,32 +285,14 @@ def text_completion_stream(
 def sample_top_p(probs, p):
     """
     执行 Top-p (Nucleus) 采样, 从概率分布中采样下一个词。
-
-    参数：
-        probs (torch.Tensor): 概率分布张量，形状为 `[batch_size, vocab_size]`。
-        p (float): 累积概率阈值，取值范围在 0 到 1 之间。
-    返回：
-        torch.Tensor: 采样得到的词索引，形状为 `[batch_size, 1]`。
-
-    说明：
-        Top-p 采样算法: 选择概率累积和超过阈值 p 的最小集合，将这些词的概率重新归一化后进行采样。
     """
-    # 对概率分布进行降序排序。probs_sort: 排序后的概率值，形状与 probs 相同。probs_idx: 排序后的索引，用于映射回原始词汇表。
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    # 计算排序后概率的累积和. 返回的 probs_sum 是累积概率分布。
     probs_sum = torch.cumsum(probs_sort, dim=-1)
-    # 保留累积概率未超过阈值 p 的词汇的概率，其余词汇的概率被置为 0.0。
-    mask = (
-        probs_sum - probs_sort > p
-    )  # 创建掩码，对于每个位置，计算累积概率（不包括当前词）是否超过阈值 p。
-    probs_sort[mask] = 0.0  # 将累积概率超过阈值 p 的词的概率置零。
+    mask = (probs_sum - probs_sort > p)
+    probs_sort[mask] = 0.0
 
-    # 对剩余的概率重新归一化, 确保总和为 1。
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
-    # 从重新归一化的概率分布中采样下一个词. 返回的 next_token 是采样得到的词在排序后概率分布中的索引。
     next_token_sorted_idx = torch.multinomial(probs_sort, num_samples=1)
-    # 在 probs_idx 的最后一维（dim=-1）中，使用 next_token_sorted_idx 作为索引，提取对应的值。沿着 dim=1（列）进行索引提取
-    # NOTE: torch.gather 函数按照给定的索引张量 index，从输入张量中收集 (获取) 数据，并返回一个与索引张量形状一致的张量。
     next_token = torch.gather(probs_idx, -1, index=next_token_sorted_idx)
 
-    return next_token  # 返回采样得到的下一个词的索引
+    return next_token
\ No newline at end of file
diff --git a/lite_llama/models/__init__.py b/lite_llama/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lite_llama/models/quantized_models.py b/lite_llama/models/quantized_models.py
new file mode 100644
index 0000000..5eb5d91
--- /dev/null
+++ b/lite_llama/models/quantized_models.py
@@ -0,0 +1,356 @@
+"""
+Quantized Model Builder for lite_llama
+Creates quantized versions of supported models
+"""
+import torch
+import torch.nn as nn
+from typing import Dict, Any, Optional, Union
+import copy
+
+from .llama import LlamaModel, FusedAttention as LlamaAttention, FusedMLP as LlamaMLP
+from .qwen2 import Qwen2Model, Qwen2Attention, FusedMLP as Qwen2MLP
+from .qwen3 import Qwen3Model, Qwen3Attention, FusedMLP as Qwen3MLP
+from .llava import LlavaLlama
+from .model_config import LlamaConfig, Qwen2Config, Qwen3Config
+
+# Import quantized layers
+from lite_llama.kernels.awq_linear import AWQLinear
+from lite_llama.kernels.gptq_linear import GPTQLinear
+from lite_llama.kernels.sq_linear import SmoothQuantLinear
+
+from ..quantization.quant_manager import QuantizationType
+
+
+class QuantizedAttentionMixin:
+    """量化Attention层的Mixin"""
+
+    def replace_linear_with_quantized(self, quantization_method: str, config: Dict[str, Any]):
+        """替换线性层为量化层"""
+
+        if quantization_method == QuantizationType.GPTQ:
+            # 替换投影层为GPTQ量化层
+            if hasattr(self, 'q_proj'):
+                self.q_proj = self._create_gptq_linear(self.q_proj, config)
+            if hasattr(self, 'k_proj'):
+                self.k_proj = self._create_gptq_linear(self.k_proj, config)
+            if hasattr(self, 'v_proj'):
+                self.v_proj = self._create_gptq_linear(self.v_proj, config)
+            if hasattr(self, 'o_proj'):
+                self.o_proj = self._create_gptq_linear(self.o_proj, config)
+            # 处理融合的kv_proj权重
+            if hasattr(self, 'kv_proj_weight'):
+                # 需要特殊处理融合权重
+                pass
+
+        elif quantization_method == QuantizationType.AWQ:
+            # 替换为AWQ量化层
+            if hasattr(self, 'q_proj'):
+                self.q_proj = self._create_awq_linear(self.q_proj, config)
+            if hasattr(self, 'k_proj'):
+                self.k_proj = self._create_awq_linear(self.k_proj, config)
+            if hasattr(self, 'v_proj'):
+                self.v_proj = self._create_awq_linear(self.v_proj, config)
+            if hasattr(self, 'o_proj'):
+                self.o_proj = self._create_awq_linear(self.o_proj, config)
+
+        elif quantization_method == QuantizationType.SMOOTHQUANT:
+            # 替换为SmoothQuant量化层
+            if hasattr(self, 'q_proj'):
+                self.q_proj = self._create_sq_linear(self.q_proj, config)
+            if hasattr(self, 'k_proj'):
+                self.k_proj = self._create_sq_linear(self.k_proj, config)
+            if hasattr(self, 'v_proj'):
+                self.v_proj = self._create_sq_linear(self.v_proj, config)
+            if hasattr(self, 'o_proj'):
+                self.o_proj = self._create_sq_linear(self.o_proj, config)
+
+    def _create_gptq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> GPTQLinear:
+        """创建GPTQ量化线性层"""
+        gptq_layer = GPTQLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            dtype=torch.float16,
+            bits=config.get('w_bit', 4),
+            groupsize=config.get('group_size', 128),
+            device=config.get('device', 'cuda')
+        )
+        return gptq_layer
+
+    def _create_awq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> AWQLinear:
+        """创建AWQ量化线性层"""
+        awq_layer = AWQLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            group_size=config.get('group_size', 128),
+            wbits=config.get('w_bit', 4)
+        )
+        return awq_layer
+
+    def _create_sq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> SmoothQuantLinear:
+        """创建SmoothQuant量化线性层"""
+        from ..quantization.quant_config import SmoothQuantConfig
+        sq_config = SmoothQuantConfig(**config)
+
+        sq_layer = SmoothQuantLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            config=sq_config
+        )
+        return sq_layer
+
+
+class QuantizedMLPMixin:
+    """量化MLP层的Mixin"""
+
+    def replace_linear_with_quantized(self, quantization_method: str, config: Dict[str, Any]):
+        """替换线性层为量化层"""
+
+        if quantization_method == QuantizationType.GPTQ:
+            self.gate_proj = self._create_gptq_linear(self.gate_proj, config)
+            self.up_proj = self._create_gptq_linear(self.up_proj, config)
+            self.down_proj = self._create_gptq_linear(self.down_proj, config)
+
+        elif quantization_method == QuantizationType.AWQ:
+            self.gate_proj = self._create_awq_linear(self.gate_proj, config)
+            self.up_proj = self._create_awq_linear(self.up_proj, config)
+            self.down_proj = self._create_awq_linear(self.down_proj, config)
+
+        elif quantization_method == QuantizationType.SMOOTHQUANT:
+            self.gate_proj = self._create_sq_linear(self.gate_proj, config)
+            self.up_proj = self._create_sq_linear(self.up_proj, config)
+            self.down_proj = self._create_sq_linear(self.down_proj, config)
+
+    def _create_gptq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> GPTQLinear:
+        """创建GPTQ量化线性层"""
+        gptq_layer = GPTQLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            dtype=torch.float16,
+            bits=config.get('w_bit', 4),
+            groupsize=config.get('group_size', 128),
+            device=config.get('device', 'cuda')
+        )
+        return gptq_layer
+
+    def _create_awq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> AWQLinear:
+        """创建AWQ量化线性层"""
+        awq_layer = AWQLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            group_size=config.get('group_size', 128),
+            wbits=config.get('w_bit', 4)
+        )
+        return awq_layer
+
+    def _create_sq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> SmoothQuantLinear:
+        """创建SmoothQuant量化线性层"""
+        from ..quantization.quant_config import SmoothQuantConfig
+        sq_config = SmoothQuantConfig(**config)
+
+        sq_layer = SmoothQuantLinear(
+            in_features=original_layer.in_features,
+            out_features=original_layer.out_features,
+            bias=original_layer.bias is not None,
+            config=sq_config
+        )
+        return sq_layer
+
+
+# 创建量化版本的Attention层
+class QuantizedLlamaAttention(LlamaAttention, QuantizedAttentionMixin):
+    def __init__(self, config: LlamaConfig, quantization_method: str, quantization_config: Dict[str, Any]):
+        super().__init__(config)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+class QuantizedQwen2Attention(Qwen2Attention, QuantizedAttentionMixin):
+    def __init__(self, hidden_size: int, num_heads: int, num_kv_heads: int,
+                 quantization_method: str, quantization_config: Dict[str, Any], dtype=torch.float16):
+        super().__init__(hidden_size, num_heads, num_kv_heads, dtype)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+class QuantizedQwen3Attention(Qwen3Attention, QuantizedAttentionMixin):
+    def __init__(self, config: Qwen3Config, quantization_method: str, quantization_config: Dict[str, Any]):
+        super().__init__(config)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+# 创建量化版本的MLP层
+class QuantizedLlamaMLP(LlamaMLP, QuantizedMLPMixin):
+    def __init__(self, config: LlamaConfig, quantization_method: str, quantization_config: Dict[str, Any]):
+        super().__init__(config)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+class QuantizedQwen2MLP(Qwen2MLP, QuantizedMLPMixin):
+    def __init__(self, config: Qwen2Config, quantization_method: str, quantization_config: Dict[str, Any]):
+        super().__init__(config)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+class QuantizedQwen3MLP(Qwen3MLP, QuantizedMLPMixin):
+    def __init__(self, config: Qwen3Config, quantization_method: str, quantization_config: Dict[str, Any]):
+        super().__init__(config)
+        self.replace_linear_with_quantized(quantization_method, quantization_config)
+
+
+def create_quantized_model(
+        model_config: Union[LlamaConfig, Qwen2Config, Qwen3Config],
+        quantization_method: str,
+        quantization_config: Dict[str, Any],
+        device: str = "cuda"
+) -> torch.nn.Module:
+    """创建量化模型"""
+
+    model_type = model_config.model_type.lower()
+
+    if model_type == "llama":
+        model = create_quantized_llama(model_config, quantization_method, quantization_config, device)
+    elif model_type == "qwen2":
+        model = create_quantized_qwen2(model_config, quantization_method, quantization_config, device)
+    elif model_type == "qwen3":
+        model = create_quantized_qwen3(model_config, quantization_method, quantization_config, device)
+    elif model_type == "llava":
+        model = create_quantized_llava(model_config, quantization_method, quantization_config, device)
+    else:
+        raise ValueError(f"不支持的模型类型: {model_type}")
+
+    return model.to(device)
+
+
+def create_quantized_llama(
+        config: LlamaConfig,
+        quantization_method: str,
+        quantization_config: Dict[str, Any],
+        device: str
+) -> LlamaModel:
+    """创建量化的Llama模型"""
+
+    # 创建基础模型
+    model = LlamaModel(config)
+
+    # 替换层为量化版本
+    for i, layer in enumerate(model.layers):
+        # 替换attention
+        quantized_attention = QuantizedLlamaAttention(
+            config, quantization_method, quantization_config
+        )
+
+        # 复制权重信息（在实际加载时会被覆盖）
+        layer.self_attn = quantized_attention
+
+        # 替换MLP
+        quantized_mlp = QuantizedLlamaMLP(
+            config, quantization_method, quantization_config
+        )
+        layer.mlp = quantized_mlp
+
+    # 替换lm_head如果需要
+    if quantization_method in [QuantizationType.GPTQ, QuantizationType.AWQ]:
+        if quantization_method == QuantizationType.GPTQ:
+            quantized_lm_head = GPTQLinear(
+                in_features=model.lm_head.in_features,
+                out_features=model.lm_head.out_features,
+                bias=model.lm_head.bias is not None,
+                dtype=torch.float16,
+                bits=quantization_config.get('w_bit', 4),
+                groupsize=quantization_config.get('group_size', 128),
+                device=device
+            )
+        else:  # AWQ
+            quantized_lm_head = AWQLinear(
+                in_features=model.lm_head.in_features,
+                out_features=model.lm_head.out_features,
+                bias=model.lm_head.bias is not None,
+                group_size=quantization_config.get('group_size', 128),
+                wbits=quantization_config.get('w_bit', 4)
+            )
+
+        model.lm_head = quantized_lm_head
+
+    return model
+
+
+def create_quantized_qwen2(
+        config: Qwen2Config,
+        quantization_method: str,
+        quantization_config: Dict[str, Any],
+        device: str
+) -> Qwen2Model:
+    """创建量化的Qwen2模型"""
+
+    # 创建基础模型
+    model = Qwen2Model(config)
+
+    # 替换层为量化版本
+    for i, layer in enumerate(model.layers):
+        # 替换attention
+        quantized_attention = QuantizedQwen2Attention(
+            config.hidden_size, config.num_heads, config.num_kv_heads,
+            quantization_method, quantization_config
+        )
+        layer.self_attn = quantized_attention
+
+        # 替换MLP
+        quantized_mlp = QuantizedQwen2MLP(
+            config, quantization_method, quantization_config
+        )
+        layer.mlp = quantized_mlp
+
+    return model
+
+
+def create_quantized_qwen3(
+        config: Qwen3Config,
+        quantization_method: str,
+        quantization_config: Dict[str, Any],
+        device: str
+) -> Qwen3Model:
+    """创建量化的Qwen3模型"""
+
+    # 创建基础模型
+    model = Qwen3Model(config)
+
+    # 替换层为量化版本
+    for i, layer in enumerate(model.layers):
+        # 替换attention
+        quantized_attention = QuantizedQwen3Attention(
+            config, quantization_method, quantization_config
+        )
+        layer.self_attn = quantized_attention
+
+        # 替换MLP
+        quantized_mlp = QuantizedQwen3MLP(
+            config, quantization_method, quantization_config
+        )
+        layer.mlp = quantized_mlp
+
+    return model
+
+
+def create_quantized_llava(
+        config: Any,  # LlavaConfig
+        quantization_method: str,
+        quantization_config: Dict[str, Any],
+        device: str
+) -> LlavaLlama:
+    """创建量化的LLaVA模型"""
+
+    # 创建基础模型
+    model = LlavaLlama(config)
+
+    # 量化language_model部分
+    llama_config = model.llama_config
+    quantized_language_model = create_quantized_llama(
+        llama_config, quantization_method, quantization_config, device
+    )
+
+    model.language_model = quantized_language_model
+
+    return model
\ No newline at end of file
diff --git a/lite_llama/quantization/awq.py b/lite_llama/quantization/awq.py
index 90a943d..cc8f047 100644
--- a/lite_llama/quantization/awq.py
+++ b/lite_llama/quantization/awq.py
@@ -323,9 +323,8 @@ def quantize_awq(
         model_state_dict: Dict[str, torch.Tensor],
         calibration_loader: Optional[Any] = None,
         model: Optional[torch.nn.Module] = None,
-        wbits: int = 4,
-        groupsize: int = 128,
         target_layers: Optional[List[str]] = None,
+        config: AWQConfig = None,
         device: str = "cuda"
 ) -> Dict[str, torch.Tensor]:
     """
@@ -343,12 +342,7 @@ def quantize_awq(
     Returns:
         Dictionary containing quantized weights and quantization parameters
     """
-    config = AWQConfig(
-        w_bit=wbits,
-        group_size=groupsize,
-        device=device
-    )
-
+    wbits = config.w_bit
     awq = AWQ(config)
     quantized_state_dict = {}
 
@@ -385,29 +379,6 @@ def hook_fn(module, input, output):
                 hook = module.register_forward_hook(make_hook(name))
                 hooks.append(hook)
 
-        # Run calibration
-        model.eval()
-        with torch.no_grad():
-            for i, batch in enumerate(tqdm(calibration_loader, desc="Calibration")):
-                if i >= 32:  # Limit calibration samples
-                    break
-
-                # Move batch to device
-                if isinstance(batch, dict):
-                    batch = {k: v.to(device) if torch.is_tensor(v) else v for k, v in batch.items()}
-                    outputs = model(**batch)
-                elif isinstance(batch, (list, tuple)):
-                    batch = [b.to(device) if torch.is_tensor(b) else b for b in batch]
-                    outputs = model(*batch)
-                else:
-                    batch = batch.to(device)
-                    outputs = model(batch)
-
-        # Remove hooks
-        for hook in hooks:
-            hook.remove()
-
-        print(f"Collected statistics for {len(awq.activation_stats)} layers")
 
     print(f"Quantizing {len(target_layers)} layers to {wbits} bits with AWQ...")
 
@@ -415,7 +386,7 @@ def hook_fn(module, input, output):
     for name, param in tqdm(model_state_dict.items(), desc="Quantizing layers"):
         if name in target_layers and param.dim() == 2:
             # Move weight to device
-            weight = param.to(device).float()
+            weight = param.to(device)
 
             # Get layer name without .weight suffix for activation lookup
             layer_name = name.replace(".weight", "").replace("_weight", "")
diff --git a/lite_llama/quantization/quant_config.py b/lite_llama/quantization/quant_config.py
index fc4bda6..cb35004 100644
--- a/lite_llama/quantization/quant_config.py
+++ b/lite_llama/quantization/quant_config.py
@@ -14,15 +14,20 @@ class AWQConfig:
     search_scale: bool = False  # Whether to search for optimal scales
     auto_scale: bool = True  # Automatic scaling
     device: str = "cuda"
-    alpha = 0.5
+    alpha: float = 0.5
 
 
 @dataclass
 class GPTQConfig:
-    """Configuration for AWQ quantization"""
-    w_bit: int = 4  # Weight quantization bits
-    group_size: int = 128  # Group size for quantization
+    """GPTQ量化配置"""
+    w_bit: int = 4
+    group_size: int = 64  # 减少组大小提高压缩率
     device: str = "cuda"
+    quantize_embedding: bool = True
+    quantize_lm_head: bool = True
+    adaptive_group_size: bool = True  # 自适应组大小
+    optimize_for_compression: bool = True  # 优化压缩率
+
 
 
 @dataclass
@@ -40,4 +45,13 @@ class SmoothQuantConfig:
     smooth_layers: List[str] = field(default_factory=lambda: [
         "q_proj", "k_proj", "v_proj", "o_proj",
         "gate_proj", "up_proj", "down_proj"
-    ])
\ No newline at end of file
+    ])
+
+@dataclass
+class QuantLayerConfig:
+    """Configuration for QuantLayer"""
+    quant_layers: List[str] = field(default_factory=lambda: [
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj",
+        "kv_proj", "lm_head"
+    ])
diff --git a/lite_llama/quantization/quant_manager.py b/lite_llama/quantization/quant_manager.py
new file mode 100644
index 0000000..b536709
--- /dev/null
+++ b/lite_llama/quantization/quant_manager.py
@@ -0,0 +1,266 @@
+"""
+Quantization Manager for lite_llama
+Provides unified interface for GPTQ, AWQ, and SmoothQuant
+"""
+import os
+import json
+import torch
+import torch.nn as nn
+from typing import Dict, Optional, Union, Any, List
+from pathlib import Path
+from tqdm import tqdm
+
+from .awq import AWQ, quantize_awq
+from .gptq import GPTQ, quantize_gptq
+from .sq import SmoothQuantizer, apply_smoothquant
+from .quant_config import AWQConfig, GPTQConfig, SmoothQuantConfig, QuantLayerConfig
+
+# Import quantized linear layers
+from ..kernels.awq_linear import AWQLinear
+from ..kernels.gptq_linear import GPTQLinear
+from ..kernels.sq_linear import SmoothQuantLinear
+
+
+class QuantizationType:
+    NONE = "none"
+    GPTQ = "gptq"
+    AWQ = "awq"
+    SMOOTHQUANT = "smoothquant"
+    INT4 = "int4"
+    INT8 = "int8"
+
+
+class QuantizationManager:
+    """统一的量化管理器"""
+
+    def __init__(self):
+        self.supported_methods = {
+            QuantizationType.GPTQ: self._load_gptq,
+            QuantizationType.AWQ: self._load_awq,
+            QuantizationType.SMOOTHQUANT: self._load_smoothquant,
+        }
+
+    def detect_quantization_type(self, model_path: str) -> str:
+        """自动检测模型的量化类型"""
+        model_path = Path(model_path)
+
+        # 检查量化配置文件
+        quant_config_path = model_path / "quantization_config.json"
+        if quant_config_path.exists():
+            with open(quant_config_path, 'r') as f:
+                config = json.load(f)
+                return config.get("quantization_method", QuantizationType.NONE)
+
+        # 通过权重文件名检测
+        weight_files = list(model_path.glob("*.pth"))
+        if weight_files:
+            state_dict = torch.load(weight_files[0], map_location="cpu")
+
+            # 检查是否有量化相关的键
+            for key in state_dict.keys():
+                if "qweight" in key and "qzeros" in key:
+                    if "qscales" in key:
+                        return QuantizationType.AWQ
+                    elif "scales" in key:
+                        return QuantizationType.GPTQ
+                elif "weight_scale" in key and "smoothing_factor" in key:
+                    return QuantizationType.SMOOTHQUANT
+
+        return QuantizationType.NONE
+
+    def quantize_model(
+            self,
+            model_path: str,
+            output_path: str,
+            method: str,
+            config: Optional[Dict] = None,
+            calibration_data: Optional[Any] = None,
+            model: Optional[torch.nn.Module] = None
+    ) -> str:
+        """量化模型"""
+        print(f"开始使用 {method} 方法量化模型...")
+
+        # 加载原始模型状态字典
+        model_path = Path(model_path)
+        weight_files = list(model_path.glob("*.pth"))
+        if not weight_files:
+            raise ValueError(f"在 {model_path} 中未找到权重文件")
+
+        state_dict = torch.load(weight_files[0], map_location="cpu")
+
+        # 根据方法进行量化
+        if method == QuantizationType.GPTQ:
+            config = config or {}
+            gptq_config = GPTQConfig(**config)
+            quantized_state_dict = quantize_gptq(
+                model_state_dict=state_dict,
+                target_layers=self._get_target_layers(state_dict),
+                device=gptq_config.device
+            )
+
+        elif method == QuantizationType.AWQ:
+            config = config or {}
+            awq_config = AWQConfig(**config)
+            quantized_state_dict = quantize_awq(
+                model_state_dict=state_dict,
+                calibration_loader=calibration_data,
+                model=model,
+                config=awq_config,
+                target_layers=self._get_target_layers(state_dict),
+                device=awq_config.device
+            )
+
+        elif method == QuantizationType.SMOOTHQUANT:
+            config = config or {}
+            config.smooth_layers = self._get_target_layers(state_dict)
+            sq_config = SmoothQuantConfig(**config)
+            quantized_state_dict = apply_smoothquant(
+                model_state_dict=state_dict,
+                config=sq_config,
+            )
+
+        else:
+            raise ValueError(f"不支持的量化方法: {method}")
+
+        # 保存量化后的模型
+        output_path = Path(output_path)
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        # 保存权重
+        torch.save(
+            quantized_state_dict,
+            output_path / f"{model_path.name}.pth",
+            _use_new_zipfile_serialization=True
+        )
+
+        # 复制其他文件
+        for file in model_path.glob("*.json"):
+            if file.name != "quantization_config.json":
+                import shutil
+                shutil.copy2(file, output_path)
+
+        # 复制tokenizer文件
+        for file in model_path.glob("tokenizer*"):
+            import shutil
+            shutil.copy2(file, output_path)
+
+        # 保存量化配置
+        quant_config = {
+            "quantization_method": method,
+            "config": config,
+            "quantized_at": torch.cuda.get_device_name() if torch.cuda.is_available() else "cpu"
+        }
+
+        with open(output_path / "quantization_config.json", 'w') as f:
+            json.dump(quant_config, f, indent=2)
+
+        print(f"量化完成! 输出保存至: {output_path}")
+        return str(output_path)
+
+    def load_quantized_model(
+            self,
+            model_path: str,
+            model_config: Any,
+            device: str = "cuda"
+    ) -> torch.nn.Module:
+        """加载量化后的模型"""
+        quant_type = self.detect_quantization_type(model_path)
+
+        if quant_type == QuantizationType.NONE:
+            # 正常加载非量化模型
+            return self._load_normal_model(model_path, model_config, device)
+
+        if quant_type in self.supported_methods:
+            return self.supported_methods[quant_type](model_path, model_config, device)
+        else:
+            raise ValueError(f"不支持的量化类型: {quant_type}")
+
+    def _load_gptq(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
+        """加载GPTQ量化模型"""
+        from ..models.quantized_models import create_quantized_model
+
+        # 读取量化配置
+        quant_config_path = Path(model_path) / "quantization_config.json"
+        with open(quant_config_path, 'r') as f:
+            quant_config = json.load(f)
+
+        # 创建量化模型
+        model = create_quantized_model(
+            model_config=model_config,
+            quantization_method=QuantizationType.GPTQ,
+            quantization_config=quant_config.get("config", {}),
+            device=device
+        )
+
+        # 加载量化权重
+        weight_files = list(Path(model_path).glob("*.pth"))
+        state_dict = torch.load(weight_files[0], map_location=device)
+        model.load_state_dict(state_dict, strict=False)
+
+        return model
+
+    def _load_awq(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
+        """加载AWQ量化模型"""
+        from ..models.quantized_models import create_quantized_model
+
+        # 读取量化配置
+        quant_config_path = Path(model_path) / "quantization_config.json"
+        with open(quant_config_path, 'r') as f:
+            quant_config = json.load(f)
+
+        # 创建量化模型
+        model = create_quantized_model(
+            model_config=model_config,
+            quantization_method=QuantizationType.AWQ,
+            quantization_config=quant_config.get("config", {}),
+            device=device
+        )
+
+        # 加载量化权重
+        weight_files = list(Path(model_path).glob("*.pth"))
+        state_dict = torch.load(weight_files[0], map_location=device)
+        model.load_state_dict(state_dict, strict=False)
+
+        return model
+
+    def _load_smoothquant(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
+        """加载SmoothQuant量化模型"""
+        from ..models.quantized_models import create_quantized_model
+
+        # 读取量化配置
+        quant_config_path = Path(model_path) / "quantization_config.json"
+        with open(quant_config_path, 'r') as f:
+            quant_config = json.load(f)
+
+        # 创建量化模型
+        model = create_quantized_model(
+            model_config=model_config,
+            quantization_method=QuantizationType.SMOOTHQUANT,
+            quantization_config=quant_config.get("config", {}),
+            device=device
+        )
+
+        # 加载量化权重
+        weight_files = list(Path(model_path).glob("*.pth"))
+        state_dict = torch.load(weight_files[0], map_location=device)
+        model.load_state_dict(state_dict, strict=False)
+
+        return model
+
+    def _load_normal_model(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
+        """加载非量化模型 - 这里需要调用原有的模型加载逻辑"""
+        # 这里应该调用现有的模型加载逻辑
+        # 需要根据具体的模型架构来实现
+        pass
+
+    def _get_target_layers(self, state_dict: Dict[str, torch.Tensor]) -> List[str]:
+        """获取需要量化的层"""
+        target_layers = []
+        for name in state_dict.keys():
+            if any(pattern in name for pattern in QuantLayerConfig.quant_layers):
+                target_layers.append(name)
+        return target_layers
+
+
+# 全局量化管理器实例
+quantization_manager = QuantizationManager()
\ No newline at end of file
diff --git a/lite_llama/quantization/sq.py b/lite_llama/quantization/sq.py
index 20ef2a5..e823ea8 100644
--- a/lite_llama/quantization/sq.py
+++ b/lite_llama/quantization/sq.py
@@ -294,7 +294,6 @@ def convert_to_smoothquant(model, calibration_dataloader, config: SmoothQuantCon
 
 
 def apply_smoothquant(model_state_dict: Dict[str, torch.Tensor],
-                      calibration_dataloader,
                       config: SmoothQuantConfig = None) -> Dict[str, torch.Tensor]:
     """
     Apply SmoothQuant to a model state dictionary
diff --git a/lite_llama/quantization/utils.py b/lite_llama/quantization/utils.py
index 2482e6e..70de35c 100644
--- a/lite_llama/quantization/utils.py
+++ b/lite_llama/quantization/utils.py
@@ -1,11 +1,11 @@
 import torch
-
+from .quant_config import GPTQConfig
 
 def pack_weight(weight):
     """Pack two 4-bit values into one uint8 value"""
     rows, cols = weight.shape
     if cols % 2 != 0:
-        weight = torch.nn.functional.pad(weight, (0, 1), value=0)
+        weight = torch.cat([weight, torch.zeros(rows, 1, dtype=weight.dtype, device=weight.device)], dim=1)
         cols += 1
     packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
     return packed.contiguous()
diff --git a/quantize_lite_llama.py b/quantize_lite_llama.py
new file mode 100644
index 0000000..2f064f5
--- /dev/null
+++ b/quantize_lite_llama.py
@@ -0,0 +1,363 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+quantize_lite_llama.py
+~~~~~~~~~~~~~~~~~~~~
+用于量化lite_llama格式模型的工具脚本
+
+支持GPTQ、AWQ、SmoothQuant三种量化方法
+
+Usage
+-----
+# GPTQ量化
+python quantize_lite_llama.py --model-path /path/to/model --output-path /path/to/output --method gptq --bits 4 --group-size 128
+
+# AWQ量化
+python quantize_lite_llama.py --model-path /path/to/model --output-path /path/to/output --method awq --bits 4 --group-size 128 --calib-data /path/to/calib.txt
+
+# SmoothQuant量化
+python quantize_lite_llama.py --model-path /path/to/model --output-path /path/to/output --method smoothquant --alpha 0.5
+
+Author: AI Assistant (2025-01-22)
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Optional, Dict, Any, List
+import torch
+from tqdm import tqdm
+
+# Add lite_llama to Python path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from lite_llama.quantization.quant_manager import quantization_manager, QuantizationType
+from lite_llama.quantization.quant_config import AWQConfig, GPTQConfig, SmoothQuantConfig
+from lite_llama.utils.common import get_model_info, check_model_compatibility
+from lite_llama.utils.logger import log
+from lite_llama.executor.model_executor import ModelExecutor
+from transformers import AutoTokenizer
+
+
+class CalibrationDataLoader:
+    """校准数据加载器"""
+
+    def __init__(self, data_path: str, tokenizer_path: str, max_samples: int = 128, max_length: int = 512):
+        self.data_path = data_path
+        self.max_samples = max_samples
+        self.max_length = max_length
+
+        # 加载分词器
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # 加载校准数据
+        self.texts = self._load_calibration_data()
+
+    def _load_calibration_data(self) -> List[str]:
+        """加载校准数据"""
+        texts = []
+
+        if self.data_path.endswith('.txt'):
+            # 纯文本文件，每行一个样本
+            with open(self.data_path, 'r', encoding='utf-8') as f:
+                texts = [line.strip() for line in f if line.strip()]
+
+        elif self.data_path.endswith('.json'):
+            # JSON文件
+            with open(self.data_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+                if isinstance(data, list):
+                    # 假设是文本列表
+                    texts = [item if isinstance(item, str) else item.get('text', '') for item in data]
+                else:
+                    # 假设是包含文本字段的对象
+                    texts = [data.get('text', '')]
+
+        elif self.data_path.endswith('.jsonl'):
+            # JSONL文件
+            with open(self.data_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    item = json.loads(line.strip())
+                    texts.append(item.get('text', ''))
+
+        else:
+            raise ValueError(f"不支持的文件格式: {self.data_path}")
+
+        # 限制样本数量
+        texts = texts[:self.max_samples]
+        log.info(f"加载了 {len(texts)} 个校准样本")
+
+        return texts
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __iter__(self):
+        """返回批次数据的迭代器"""
+        for text in self.texts:
+            # 编码文本
+            encoding = self.tokenizer(
+                text,
+                return_tensors='pt',
+                max_length=self.max_length,
+                truncation=True,
+                padding=True
+            )
+
+            yield encoding
+
+
+def create_default_calibration_data(tokenizer_path: str, num_samples: int = 32) -> List[str]:
+    """创建默认的校准数据"""
+    default_texts = [
+        "The quick brown fox jumps over the lazy dog.",
+        "Artificial intelligence is transforming the world.",
+        "Machine learning models require careful optimization.",
+        "Deep neural networks can learn complex patterns.",
+        "Natural language processing enables human-computer interaction.",
+        "Computer vision systems can understand visual content.",
+        "Quantization reduces model size while maintaining accuracy.",
+        "Large language models demonstrate emergent capabilities.",
+        "Transformer architectures have revolutionized AI.",
+        "Self-attention mechanisms capture long-range dependencies."
+    ]
+
+    # 重复样本以达到所需数量
+    texts = (default_texts * ((num_samples // len(default_texts)) + 1))[:num_samples]
+    log.info(f"使用默认校准数据，共 {len(texts)} 个样本")
+
+    return texts
+
+
+def validate_quantization_config(method: str, config: Dict[str, Any]) -> Dict[str, Any]:
+    """验证和标准化量化配置"""
+
+    if method == QuantizationType.GPTQ:
+        validated_config = {
+            'w_bit': config.get('bits', 4),
+            'group_size': config.get('group_size', 128),
+            'device': config.get('device', 'cuda')
+        }
+
+        # 验证参数范围
+        if validated_config['w_bit'] not in [2, 3, 4, 8]:
+            raise ValueError(f"GPTQ不支持的位数: {validated_config['w_bit']}")
+
+    elif method == QuantizationType.AWQ:
+        validated_config = {
+            'w_bit': config.get('bits', 4),
+            'group_size': config.get('group_size', 128),
+            'zero_point': config.get('zero_point', True),
+            'search_scale': config.get('search_scale', False),
+            'auto_scale': config.get('auto_scale', True),
+            'alpha': config.get('alpha', 0.5),
+            'device': config.get('device', 'cuda')
+        }
+
+        if validated_config['w_bit'] not in [4, 8]:
+            raise ValueError(f"AWQ不支持的位数: {validated_config['w_bit']}")
+
+    elif method == QuantizationType.SMOOTHQUANT:
+        validated_config = {
+            'alpha': config.get('alpha', 0.5),
+            'w_bit': config.get('w_bits', 8),
+            'a_bit': config.get('a_bits', 8),
+            'symmetric_weight': config.get('symmetric_weight', True),
+            'symmetric_activation': config.get('symmetric_activation', False),
+            'per_channel_weight': config.get('per_channel_weight', True),
+            'per_token_activation': config.get('per_token_activation', True),
+            'calibration_samples': config.get('calibration_samples', 128),
+            'device': config.get('device', 'cuda')
+        }
+
+        if not (0.0 <= validated_config['alpha'] <= 1.0):
+            raise ValueError(f"SmoothQuant的alpha参数必须在0-1之间: {validated_config['alpha']}")
+
+    else:
+        raise ValueError(f"不支持的量化方法: {method}")
+
+    return validated_config
+
+
+def main():
+    parser = argparse.ArgumentParser(description="量化lite_llama格式的模型")
+
+    # 基本参数
+    parser.add_argument("--model-path", type=str, required=True,
+                        help="输入模型路径")
+    parser.add_argument("--output-path", type=str, required=True,
+                        help="输出模型路径")
+    parser.add_argument("--method", type=str, required=True,
+                        choices=['gptq', 'awq', 'smoothquant'],
+                        help="量化方法")
+
+    # 量化参数
+    parser.add_argument("--bits", type=int, default=4,
+                        help="量化位数 (default: 4)")
+    parser.add_argument("--group-size", type=int, default=128,
+                        help="组大小 (default: 128)")
+
+    # AWQ特有参数
+    parser.add_argument("--alpha", type=float, default=0.5,
+                        help="AWQ/SmoothQuant的alpha参数 (default: 0.5)")
+    parser.add_argument("--search-scale", action='store_true',
+                        help="AWQ是否搜索最优缩放因子")
+    parser.add_argument("--auto-scale", action='store_true', default=True,
+                        help="AWQ是否自动缩放")
+
+    # SmoothQuant特有参数
+    parser.add_argument("--w-bits", type=int, default=8,
+                        help="权重量化位数 (SmoothQuant, default: 8)")
+    parser.add_argument("--a-bits", type=int, default=8,
+                        help="激活量化位数 (SmoothQuant, default: 8)")
+
+    # 校准数据
+    parser.add_argument("--calib-data", type=str, default=None,
+                        help="校准数据文件路径 (.txt/.json/.jsonl)")
+    parser.add_argument("--calib-samples", type=int, default=128,
+                        help="校准样本数量 (default: 128)")
+    parser.add_argument("--max-length", type=int, default=512,
+                        help="校准数据最大长度 (default: 512)")
+
+    # 其他参数
+    parser.add_argument("--device", type=str, default="cuda",
+                        choices=['cuda', 'cpu'],
+                        help="设备 (default: cuda)")
+    parser.add_argument("--no-verify", action='store_true',
+                        help="跳过量化验证")
+
+    args = parser.parse_args()
+
+    # 检查模型兼容性
+    is_compatible, message = check_model_compatibility(args.model_path)
+    if not is_compatible:
+        log.error(f"模型兼容性检查失败: {message}")
+        return 1
+
+    # 获取模型信息
+    model_info = get_model_info(args.model_path)
+    log.info(f"模型信息: {model_info}")
+
+    # 准备量化配置
+    config = {
+        'bits': args.bits,
+        'group_size': args.group_size,
+        'alpha': args.alpha,
+        'search_scale': args.search_scale,
+        'auto_scale': args.auto_scale,
+        'w_bits': args.w_bits,
+        'a_bits': args.a_bits,
+        'device': args.device,
+        'calibration_samples': args.calib_samples
+    }
+
+    # 验证配置
+    try:
+        validated_config = validate_quantization_config(args.method, config)
+        log.info(f"量化配置: {validated_config}")
+    except ValueError as e:
+        log.error(f"配置验证失败: {e}")
+        return 1
+
+    # 准备校准数据
+    calibration_data = None
+    model = None
+
+    if args.method in ['awq', 'smoothquant']:
+        log.info("准备校准数据...")
+
+        if args.calib_data:
+            # 使用用户提供的校准数据
+            try:
+                calibration_data = CalibrationDataLoader(
+                    args.calib_data,
+                    args.model_path,
+                    args.calib_samples,
+                    args.max_length
+                )
+                log.info(f"加载校准数据: {len(calibration_data)} 个样本")
+            except Exception as e:
+                log.error(f"加载校准数据失败: {e}")
+                log.info("将使用默认校准数据")
+                calibration_data = create_default_calibration_data(
+                    args.model_path, args.calib_samples
+                )
+        else:
+            # 使用默认校准数据
+            calibration_data = create_default_calibration_data(
+                args.model_path, args.calib_samples
+            )
+
+        # 如果需要，加载原始模型用于校准
+        if args.method == 'awq':
+            log.info("加载原始模型用于AWQ校准...")
+            try:
+                model_executor = ModelExecutor.build(
+                    checkpoints_dir=args.model_path,
+                    max_seq_len=2048,
+                    max_gpu_num_blocks=None,
+                    compiled_model=False,
+                    device=args.device
+                )
+                model = model_executor.model
+                log.info("模型加载成功")
+            except Exception as e:
+                log.error(f"模型加载失败: {e}")
+                return 1
+
+    # 执行量化
+    log.info(f"开始使用 {args.method.upper()} 方法量化模型...")
+    start_time = time.time()
+
+    try:
+        output_path = quantization_manager.quantize_model(
+            model_path=args.model_path,
+            output_path=args.output_path,
+            method=args.method,
+            config=validated_config,
+            calibration_data=calibration_data,
+            model=model
+        )
+
+        quantization_time = time.time() - start_time
+        log.info(f"量化完成! 耗时: {quantization_time:.2f}s")
+        log.info(f"量化模型保存至: {output_path}")
+
+    except Exception as e:
+        log.error(f"量化失败: {e}")
+        return 1
+
+    # 验证量化结果
+    if not args.no_verify:
+        log.info("验证量化结果...")
+        try:
+            # 检测量化类型
+            detected_type = quantization_manager.detect_quantization_type(output_path)
+            if detected_type == args.method:
+                log.info(f"✅ 量化类型验证通过: {detected_type}")
+            else:
+                log.warning(f"⚠️ 量化类型不匹配: 期望 {args.method}, 检测到 {detected_type}")
+
+            # 检查文件大小
+            original_size = sum(f.stat().st_size for f in Path(args.model_path).glob("*.pth"))
+            quantized_size = sum(f.stat().st_size for f in Path(output_path).glob("*.pth"))
+            compression_ratio = original_size / quantized_size if quantized_size > 0 else 1.0
+
+            log.info(f"原始模型大小: {original_size / (1024 ** 3):.2f} GB")
+            log.info(f"量化模型大小: {quantized_size / (1024 ** 3):.2f} GB")
+            log.info(f"压缩比: {compression_ratio:.2f}x")
+
+        except Exception as e:
+            log.warning(f"量化验证失败: {e}")
+
+    log.info("量化任务完成!")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file

From de0b18eaca58deeb3424c02412e6351b54855cf0 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Thu, 24 Jul 2025 22:04:27 +0930
Subject: [PATCH 32/33] refactor quant test

---
 generate.py                                 |   4 +-
 lite_llama/executor/model_executor.py       |   8 +-
 lite_llama/models/quantized_models.py       |   3 -
 lite_llama/quantization/awq.py              | 333 +++++++-------------
 lite_llama/quantization/gptq.py             | 329 ++++++-------------
 lite_llama/quantization/quant_manager.py    |  30 +-
 lite_llama/quantization/utils.py            |  44 ++-
 quantize_lite_llama.py                      |  92 +++---
 test.py                                     | 317 +++++++++++++++++++
 tests/quant/__init__.py                     |   0
 tests/{kernels => quant}/test_AWQLinear.py  |   0
 tests/{kernels => quant}/test_GPTQLinear.py |   0
 tests/{kernels => quant}/test_SQLinear.py   |   0
 tests/quant/test_awq.py                     | 317 +++++++++++++++++++
 tests/{ => quant}/test_gptq.py              |   0
 15 files changed, 945 insertions(+), 532 deletions(-)
 create mode 100644 test.py
 create mode 100644 tests/quant/__init__.py
 rename tests/{kernels => quant}/test_AWQLinear.py (100%)
 rename tests/{kernels => quant}/test_GPTQLinear.py (100%)
 rename tests/{kernels => quant}/test_SQLinear.py (100%)
 create mode 100644 tests/quant/test_awq.py
 rename tests/{ => quant}/test_gptq.py (100%)

diff --git a/generate.py b/generate.py
index 7275461..4b26115 100644
--- a/generate.py
+++ b/generate.py
@@ -63,7 +63,7 @@ def generate_llama(
     if quantization is None:
         quantization = quantization_manager.detect_quantization_type(checkpoint_path)
         if quantization != QuantizationType.NONE:
-            log.info(f"自动检测到量化类型: {quantization}")
+            log.info(f"Automatically detect the quantization type: {quantization}")
 
     if max_seq_len <= 1024:
         short_prompt = True
@@ -134,7 +134,7 @@ def generate_llava(
     if quantization is None:
         quantization = quantization_manager.detect_quantization_type(str(checkpoint_path))
         if quantization != QuantizationType.NONE:
-            log.info(f"自动检测到量化类型: {quantization}")
+            log.info(f"Automatically detect the quantization type: {quantization}")
     
     if max_seq_len <= 1024:
         short_prompt = True
diff --git a/lite_llama/executor/model_executor.py b/lite_llama/executor/model_executor.py
index 10d5750..d933b17 100644
--- a/lite_llama/executor/model_executor.py
+++ b/lite_llama/executor/model_executor.py
@@ -66,9 +66,9 @@ def build(
         # 检测或使用指定的量化类型
         if quantization is None:
             quantization = quantization_manager.detect_quantization_type(checkpoints_dir)
-            log.info(f"自动检测到量化类型: {quantization}")
+            log.info(f"Automatically detect the quantization type: {quantization}")
         else:
-            log.info(f"使用指定的量化类型: {quantization}")
+            log.info(f"Use the specified quantization type: {quantization}")
 
         model = ModelExecutor._load_model_weight(
             model_config, checkpoints_dir, device=device, quantization=quantization
@@ -105,14 +105,14 @@ def _load_model_weight(
 
         if quantization and quantization != QuantizationType.NONE:
             # 加载量化模型
-            log.info(f"加载量化模型: {quantization}")
+            log.info(f"Load quantitative model: {quantization}")
             model = quantization_manager.load_quantized_model(
                 model_path=checkpoints_dir,
                 model_config=model_config,
                 device=device
             )
 
-            log.info(f"量化模型加载完成，耗时 {time.time() - start_time:.2f}s")
+            log.info(f"The quantitative model has been loaded successfully, taking {time.time() - start_time:.2f}s")
             return model
 
         # 原有的非量化模型加载逻辑
diff --git a/lite_llama/models/quantized_models.py b/lite_llama/models/quantized_models.py
index 5eb5d91..8139409 100644
--- a/lite_llama/models/quantized_models.py
+++ b/lite_llama/models/quantized_models.py
@@ -97,7 +97,6 @@ def _create_sq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -
             in_features=original_layer.in_features,
             out_features=original_layer.out_features,
             bias=original_layer.bias is not None,
-            config=sq_config
         )
         return sq_layer
 
@@ -150,13 +149,11 @@ def _create_awq_linear(self, original_layer: nn.Linear, config: Dict[str, Any])
     def _create_sq_linear(self, original_layer: nn.Linear, config: Dict[str, Any]) -> SmoothQuantLinear:
         """创建SmoothQuant量化线性层"""
         from ..quantization.quant_config import SmoothQuantConfig
-        sq_config = SmoothQuantConfig(**config)
 
         sq_layer = SmoothQuantLinear(
             in_features=original_layer.in_features,
             out_features=original_layer.out_features,
             bias=original_layer.bias is not None,
-            config=sq_config
         )
         return sq_layer
 
diff --git a/lite_llama/quantization/awq.py b/lite_llama/quantization/awq.py
index cc8f047..a28dad7 100644
--- a/lite_llama/quantization/awq.py
+++ b/lite_llama/quantization/awq.py
@@ -1,31 +1,19 @@
-from dataclasses import field
-
 import torch
 import torch.nn as nn
 import numpy as np
 from typing import Dict, Tuple, Optional, Any, List
 from tqdm.auto import tqdm
-import triton
-import triton.language as tl
-import psutil, os, sys
-
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-
 from lite_llama.quantization.utils import pack_weight
 from lite_llama.quantization.quant_config import AWQConfig
 
 
 class AWQ:
-    def __init__(
-            self,
-            config: AWQConfig = field(default_factory=AWQConfig),
-            wbits: int = 4,
-    ):
+    def __init__(self, config: AWQConfig):
         self.config = config
         self.wbits = self.config.w_bit
         self.groupsize = self.config.group_size if self.config.group_size != -1 else float('inf')
         self.device = self.config.device
-        self.maxq = 2 ** wbits - 1
+        self.maxq = 2 ** self.wbits - 1  # For 4-bit: 0-15
         self.zero_point = config.zero_point
         self.alpha = self.config.alpha
         self.search_scale = self.config.search_scale
@@ -33,7 +21,6 @@ def __init__(
 
         # Store activation statistics
         self.activation_stats = {}
-        self.collected_inputs = {}
 
     def collect_activations(self, layer_name: str, input_tensor: torch.Tensor):
         """Collect activation statistics for AWQ calibration"""
@@ -44,22 +31,18 @@ def collect_activations(self, layer_name: str, input_tensor: torch.Tensor):
                 'inputs': []
             }
 
-        # Store input activations
-        if len(self.activation_stats[layer_name]['inputs']) < 128:  # Limit storage
+        # Store input activations (limit storage to prevent OOM)
+        if len(self.activation_stats[layer_name]['inputs']) < 32:
             self.activation_stats[layer_name]['inputs'].append(input_tensor.detach().cpu())
 
-        # Compute statistics across the sequence dimension
-        # Input shape is typically [batch, seq_len, hidden_dim]
-        if input_tensor.dim() == 3:
-            # Average across batch and sequence dimensions
+        # Compute per-channel statistics
+        if input_tensor.dim() == 3:  # [batch, seq, hidden]
             channel_means = input_tensor.abs().mean(dim=(0, 1))
             channel_maxs = input_tensor.abs().max(dim=1)[0].max(dim=0)[0]
-        elif input_tensor.dim() == 2:
-            # Average across batch dimension
+        elif input_tensor.dim() == 2:  # [batch, hidden]
             channel_means = input_tensor.abs().mean(dim=0)
             channel_maxs = input_tensor.abs().max(dim=0)[0]
         else:
-            # Flatten and compute
             channel_means = input_tensor.abs().view(-1, input_tensor.shape[-1]).mean(dim=0)
             channel_maxs = input_tensor.abs().view(-1, input_tensor.shape[-1]).max(dim=0)[0]
 
@@ -72,8 +55,6 @@ def get_salient_channels(self, layer_name: str, top_k: float = 0.01) -> torch.Te
             return None
 
         stats = self.activation_stats[layer_name]
-
-        # Aggregate statistics across all collected samples
         if stats['mean']:
             mean_activations = torch.stack(stats['mean']).mean(dim=0)
             max_activations = torch.stack(stats['max']).mean(dim=0)
@@ -84,114 +65,57 @@ def get_salient_channels(self, layer_name: str, top_k: float = 0.01) -> torch.Te
             # Select top-k% most salient channels
             num_salient = max(1, int(len(saliency_score) * top_k))
             _, salient_indices = torch.topk(saliency_score, num_salient)
-
             return salient_indices
 
         return None
 
-    def pseudo_quantize_tensor(self, w: torch.Tensor, n_bit: int = 4, zero_point: bool = True,
-                               q_group_size: int = -1, inplace: bool = False):
-        """Pseudo-quantize tensor to simulate quantization effects"""
-        org_w_shape = w.shape
-        if q_group_size > 0:
-            assert org_w_shape[-1] % q_group_size == 0
-            w = w.reshape(-1, q_group_size)
-
-        assert w.dim() == 2
-        if zero_point:
-            max_val = w.amax(dim=1, keepdim=True)
-            min_val = w.amin(dim=1, keepdim=True)
-            max_int = 2 ** n_bit - 1
-            min_int = 0
-            scales = (max_val - min_val).clamp(min=1e-5) / max_int
-            zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
-        else:
-            max_val = w.abs().amax(dim=1, keepdim=True)
-            max_val = max_val.clamp(min=1e-5)
-            max_int = 2 ** (n_bit - 1) - 1
-            min_int = -(2 ** (n_bit - 1))
-            scales = max_val / max_int
-            zeros = torch.zeros_like(scales)
-
-        assert torch.isnan(scales).sum() == 0
-        assert torch.isnan(w).sum() == 0
-
-        if inplace:
-            ((w.div_(scales).round_().add_(zeros)).clamp_(min_int, max_int).sub_(zeros)).mul_(scales)
-            return w
-        else:
-            w_sim = ((w / scales).round() + zeros).clamp(min_int, max_int)
-            w_sim = (w_sim - zeros) * scales
-            return w_sim.reshape(org_w_shape)
-
     def search_best_scale(self, layer_name: str, weight: torch.Tensor, input_feat: torch.Tensor) -> torch.Tensor:
         """Search for the best per-channel scaling factors"""
         device = weight.device
         org_out = torch.matmul(input_feat, weight.t())
 
-        if org_out.abs().max() < 0.2:
+        if org_out.abs().max() < 0.01:  # Very small activations
             return torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
 
-        w_abs_max = weight.abs().max(dim=1)[0].clamp(min=1e-5)
-
-        # Get salient channels for this layer
+        # Get salient channels
         salient_channels = self.get_salient_channels(layer_name)
 
-        # Grid search for best scaling factors
         best_error = float('inf')
-        best_scales = torch.ones_like(w_abs_max)
+        best_scales = torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
 
-        # Different alpha values for grid search
+        # Grid search for best alpha
         alpha_candidates = [0.0, 0.1, 0.25, 0.5, 0.75, 1.0] if self.search_scale else [self.alpha]
 
         for alpha in alpha_candidates:
-            # Compute scales based on activation statistics
-            if salient_channels is not None and len(salient_channels) > 0:
-                # Protect salient channels with different scaling
-                scales = torch.ones_like(w_abs_max)
-
-                # For salient channels, use more conservative scaling
-                if layer_name in self.activation_stats:
-                    stats = self.activation_stats[layer_name]
-                    if stats['mean']:
-                        mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
-
-                        # Scale based on activation magnitude
-                        activation_scales = mean_activations.pow(alpha)
-                        activation_scales = activation_scales / activation_scales.max()
-
-                        # Apply different scaling to salient vs non-salient channels
-                        scales = activation_scales.clamp(min=0.1, max=1.0)
-
-                        # Give salient channels more protection (higher scale values)
-                        scales[salient_channels] = scales[salient_channels].clamp(min=0.5)
-                else:
-                    # Fallback to weight-based scaling
-                    scales = w_abs_max.pow(alpha)
-                    scales = scales / scales.max()
-            else:
-                # Standard AWQ scaling without saliency
-                if layer_name in self.activation_stats and self.activation_stats[layer_name]['mean']:
-                    stats = self.activation_stats[layer_name]
-                    mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
-                    scales = mean_activations.pow(alpha)
-                    scales = scales / scales.max()
-                else:
-                    scales = w_abs_max.pow(alpha)
-                    scales = scales / scales.max()
+            # Compute channel-wise scaling factors
+            if layer_name in self.activation_stats and self.activation_stats[layer_name]['mean']:
+                stats = self.activation_stats[layer_name]
+                mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
+
+                # AWQ scaling: s_j = (max|X_j|^alpha) / (max|W_j|^(1-alpha))
+                weight_max = weight.abs().max(dim=0)[0].clamp(min=1e-5)
+                act_max = mean_activations.clamp(min=1e-5)
 
-            scales = scales.clamp(min=0.1, max=1.0)
+                scales = act_max.pow(alpha) / weight_max.pow(1 - alpha)
+                scales = scales.clamp(min=0.1, max=10.0)  # Prevent extreme values
 
-            # Apply scaling and quantize
+                # Protect salient channels with higher scale values
+                if salient_channels is not None:
+                    scales[salient_channels] = scales[salient_channels].clamp(min=0.5)
+            else:
+                # Fallback to weight-based scaling
+                weight_max = weight.abs().max(dim=0)[0]
+                scales = weight_max.pow(alpha).clamp(min=0.1, max=10.0)
+                scales = scales / scales.max()  # Normalize
+
+            # Test this scaling
             weight_scaled = weight * scales.view(-1, 1)
-            weight_sim = self.pseudo_quantize_tensor(
-                weight_scaled,
-                n_bit=self.wbits,
-                zero_point=self.zero_point,
-                q_group_size=self.groupsize if self.groupsize != float('inf') else -1
-            )
-
-            # Compute error
+            qweight, qzeros, qscales = self.quantize_with_scales(weight_scaled, torch.ones_like(scales))
+
+            # Dequantize to test reconstruction quality
+            weight_sim = self.dequantize_weight(qweight, qzeros, qscales, weight.shape[1])
+
+            # Compute reconstruction error
             out_sim = torch.matmul(input_feat, weight_sim.t())
             loss = (org_out - out_sim).float().pow(2).mean().item()
 
@@ -203,11 +127,11 @@ def search_best_scale(self, layer_name: str, weight: torch.Tensor, input_feat: t
 
     def quantize_with_scales(self, weight: torch.Tensor, scales: torch.Tensor) -> Tuple[
         torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Quantize weight with given per-channel scales"""
+        """quantization with proper scaling"""
         device = weight.device
         rows, cols = weight.shape
 
-        # Apply per-channel scaling
+        # Apply per-output-channel scaling
         weight_scaled = weight * scales.view(-1, 1)
 
         # Group-wise quantization
@@ -225,72 +149,67 @@ def quantize_with_scales(self, weight: torch.Tensor, scales: torch.Tensor) -> Tu
         for g in range(num_groups):
             start_col = g * groupsize
             end_col = min((g + 1) * groupsize, cols)
-
             w_group = weight_scaled[:, start_col:end_col]
 
             if self.zero_point:
+                # Asymmetric quantization: map [w_min, w_max] to [0, 2^bits-1]
                 w_min = w_group.min(dim=1, keepdim=True)[0]
                 w_max = w_group.max(dim=1, keepdim=True)[0]
 
+                # Compute scale and zero point
                 range_val = (w_max - w_min).clamp(min=1e-5)
                 scale = range_val / self.maxq
-                zero = torch.round(-w_min / scale).clamp(0, self.maxq)
+                zero = (-w_min / scale).round().clamp(0, self.maxq)
 
+                # Quantize: q = round((w - w_min) / scale) = round(w/scale + zero)
+                q = torch.round(w_group / scale + zero).clamp(0, self.maxq)
             else:
+                # Symmetric quantization around zero
                 w_max = w_group.abs().max(dim=1, keepdim=True)[0].clamp(min=1e-5)
-                scale = w_max / (2 ** (self.wbits - 1) - 1)
-                zero = torch.zeros_like(scale)
+                scale = w_max / (self.maxq // 2)  # Use half range for signed values
+                zero = torch.full_like(scale, self.maxq // 2)  # Midpoint as zero
 
-            # Quantize
-            if self.zero_point:
-                q = torch.clamp(torch.round(w_group / scale + zero), 0, self.maxq)
-            else:
-                q = torch.clamp(torch.round(w_group / scale), -(2 ** (self.wbits - 1)), 2 ** (self.wbits - 1) - 1)
+                # Quantize: q = round(w / scale) + zero_point
+                q = torch.round(w_group / scale + zero).clamp(0, self.maxq)
 
             qweight[:, start_col:end_col] = q.to(torch.uint8)
+            qscales[:, g] = scale.squeeze(-1)
+            qzeros[:, g] = zero.squeeze(-1)
 
-            # Ensure proper dimensions when storing scales and zeros
-            scale_flat = scale.squeeze(-1) if scale.dim() > 1 else scale.flatten()
-            zero_flat = zero.squeeze(-1) if zero.dim() > 1 else zero.flatten()
+        return qweight, qzeros.to(torch.float16), qscales.to(torch.float16)
 
-            # Handle dimension mismatches
-            if scale_flat.shape[0] != rows:
-                if scale_flat.numel() == 1:
-                    scale_flat = scale_flat.expand(rows)
-                else:
-                    scale_flat = scale_flat[:rows]
+    def dequantize_weight(self, qweight: torch.Tensor, qzeros: torch.Tensor,
+                          qscales: torch.Tensor, original_cols: int) -> torch.Tensor:
+        """Dequantize weights for testing"""
+        rows, _ = qweight.shape
+        num_groups = qzeros.shape[1]
+        groupsize = (original_cols + num_groups - 1) // num_groups
 
-            if zero_flat.shape[0] != rows:
-                if zero_flat.numel() == 1:
-                    zero_flat = zero_flat.expand(rows)
-                else:
-                    zero_flat = zero_flat[:rows]
+        weight = torch.zeros((rows, original_cols), dtype=torch.float16, device=qweight.device)
+
+        for g in range(num_groups):
+            start_col = g * groupsize
+            end_col = min((g + 1) * groupsize, original_cols)
+
+            q = qweight[:, start_col:end_col].float()
+            scale = qscales[:, g:g + 1]
+            zero = qzeros[:, g:g + 1]
 
-            qscales[:, g] = scale_flat
-            qzeros[:, g] = zero_flat
+            # Dequantize: w = (q - zero) * scale
+            weight[:, start_col:end_col] = ((q - zero) * scale).to(torch.float16)
 
-        return qweight, qzeros, qscales
+        return weight
 
     def quantize(self, weight: torch.Tensor, layer_name: str = "") -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Main AWQ quantization function
-        Args:
-            weight: Weight tensor to quantize [out_features, in_features]
-            layer_name: Name of the layer for activation lookup
-        Returns:
-            Tuple of (quantized_weight, zeros, scales)
-        """
+        """Main AWQ quantization function with fixes"""
         assert weight.ndim == 2
         device = weight.device
 
         # Get representative input if available
         input_feat = None
         if layer_name in self.activation_stats and self.activation_stats[layer_name]['inputs']:
-            # Use first few inputs for calibration
-            inputs = self.activation_stats[layer_name]['inputs'][:5]
+            inputs = self.activation_stats[layer_name]['inputs'][:3]  # Use first few
             input_feat = torch.cat([inp.to(device) for inp in inputs], dim=0)
-
-            # Reshape if needed: [batch*seq, hidden] -> [batch*seq, hidden]
             if input_feat.dim() == 3:
                 input_feat = input_feat.view(-1, input_feat.shape[-1])
 
@@ -298,14 +217,17 @@ def quantize(self, weight: torch.Tensor, layer_name: str = "") -> Tuple[torch.Te
         if input_feat is not None and self.search_scale:
             scales = self.search_best_scale(layer_name, weight, input_feat)
         else:
-            # Fallback to uniform scaling or activation-based scaling
+            # Use activation statistics for scaling
             if self.auto_scale and layer_name in self.activation_stats:
                 stats = self.activation_stats[layer_name]
                 if stats['mean']:
                     mean_activations = torch.stack(stats['mean']).mean(dim=0).to(device)
-                    scales = mean_activations.pow(self.alpha)
-                    scales = scales / scales.max()
-                    scales = scales.clamp(min=0.1, max=1.0)
+                    weight_max = weight.abs().max(dim=0)[0].clamp(min=1e-5)
+                    act_max = mean_activations.clamp(min=1e-5)
+
+                    scales = act_max.pow(self.alpha) / weight_max.pow(1 - self.alpha)
+                    scales = scales.clamp(min=0.1, max=10.0)
+                    scales = scales / scales.max()  # Normalize
                 else:
                     scales = torch.ones(weight.shape[0], device=device, dtype=weight.dtype)
             else:
@@ -313,10 +235,11 @@ def quantize(self, weight: torch.Tensor, layer_name: str = "") -> Tuple[torch.Te
 
         # Quantize with computed scales
         qweight, qzeros, qscales = self.quantize_with_scales(weight, scales)
-        packed_qweight = pack_weight(qweight)
 
-        return packed_qweight, qzeros.to(torch.float16), qscales.to(torch.float16)
+        # Pack weights consistently
+        packed_qweight = pack_weight(qweight)
 
+        return packed_qweight, qzeros, qscales
 
 
 def quantize_awq(
@@ -327,26 +250,12 @@ def quantize_awq(
         config: AWQConfig = None,
         device: str = "cuda"
 ) -> Dict[str, torch.Tensor]:
-    """
-    Quantize model weights using AWQ algorithm
-
-    Args:
-        model_state_dict: Original model state dictionary
-        calibration_loader: DataLoader for calibration data
-        model: Original model for activation collection
-        wbits: Number of bits for quantization
-        groupsize: Group size for quantization
-        target_layers: List of layer names to quantize
-        device: Device to perform quantization on
-
-    Returns:
-        Dictionary containing quantized weights and quantization parameters
-    """
-    wbits = config.w_bit
+    """AWQ quantization function"""
+
     awq = AWQ(config)
     quantized_state_dict = {}
 
-    # Default target layers if not specified
+    # Default target layers
     if target_layers is None:
         target_layers = []
         for name in model_state_dict.keys():
@@ -361,7 +270,6 @@ def quantize_awq(
     if calibration_loader is not None and model is not None:
         print("Collecting activation statistics for AWQ...")
 
-        # Register hooks to collect activations
         hooks = []
 
         def make_hook(layer_name):
@@ -373,22 +281,35 @@ def hook_fn(module, input, output):
 
             return hook_fn
 
-        # Register hooks for target layers
+        # Register hooks
         for name, module in model.named_modules():
             if name in target_layers and isinstance(module, torch.nn.Linear):
                 hook = module.register_forward_hook(make_hook(name))
                 hooks.append(hook)
 
-
-    print(f"Quantizing {len(target_layers)} layers to {wbits} bits with AWQ...")
+        # Run calibration
+        model.eval()
+        with torch.no_grad():
+            for i, batch in enumerate(tqdm(calibration_loader, desc="Calibrating")):
+                if i >= 32:  # Limit calibration samples
+                    break
+                try:
+                    if hasattr(model, 'forward'):
+                        _ = model(batch)
+                except Exception as e:
+                    print(f"Calibration batch {i} failed: {e}")
+                    continue
+
+        # Remove hooks
+        for hook in hooks:
+            hook.remove()
+
+    print(f"Quantizing {len(target_layers)} layers to {config.w_bit} bits with AWQ...")
 
     # Quantize each target layer
     for name, param in tqdm(model_state_dict.items(), desc="Quantizing layers"):
         if name in target_layers and param.dim() == 2:
-            # Move weight to device
             weight = param.to(device)
-
-            # Get layer name without .weight suffix for activation lookup
             layer_name = name.replace(".weight", "").replace("_weight", "")
 
             # Quantize using AWQ
@@ -399,51 +320,9 @@ def hook_fn(module, input, output):
             quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
             quantized_state_dict[f"{base_name}.qzeros"] = qzeros.cpu()
             quantized_state_dict[f"{base_name}.qscales"] = qscales.cpu()
-
         else:
-            # Keep non-quantized parameters as is
+            # Keep non-quantized parameters
             quantized_state_dict[name] = param.cpu()
 
     print("AWQ quantization completed!")
-    return quantized_state_dict
-
-
-# Example usage function
-def demo_awq():
-    """Demo function showing how to use AWQ"""
-    # Create a dummy model state dict
-    dummy_state_dict = {
-        "layer1.q_proj.weight": torch.randn(768, 768),
-        "layer1.k_proj.weight": torch.randn(768, 768),
-        "layer1.v_proj.weight": torch.randn(768, 768),
-        "layer1.o_proj.weight": torch.randn(768, 768),
-        "other_param": torch.randn(100)
-    }
-
-    print("Starting AWQ demo...")
-
-    # Quantize without calibration data (will use default scaling)
-    quantized_dict = quantize_awq(
-        model_state_dict=dummy_state_dict,
-        wbits=4,
-        groupsize=128,
-        device="cpu"
-    )
-
-    print("Quantized keys:", list(quantized_dict.keys()))
-
-    # Debug: Check dimensions of quantized tensors
-    layer_name = "layer1.q_proj"
-    print(f"\nDebugging {layer_name}:")
-    qweight = quantized_dict[f"{layer_name}.qweight"]
-    qzeros = quantized_dict[f"{layer_name}.qzeros"]
-    qscales = quantized_dict[f"{layer_name}.qscales"]
-
-    print(f"qweight shape: {qweight.shape}")
-    print(f"qzeros shape: {qzeros.shape}")
-    print(f"qscales shape: {qscales.shape}")
-
-
-
-if __name__ == "__main__":
-    demo_awq()
\ No newline at end of file
+    return quantized_state_dict
\ No newline at end of file
diff --git a/lite_llama/quantization/gptq.py b/lite_llama/quantization/gptq.py
index d609c58..d2273bc 100755
--- a/lite_llama/quantization/gptq.py
+++ b/lite_llama/quantization/gptq.py
@@ -1,257 +1,136 @@
-from dataclasses import field
-
 import torch
 import torch.nn as nn
 import numpy as np
 from typing import Dict, Tuple, Optional, Any
 from tqdm.auto import tqdm
-import triton
-import triton.language as tl
 import time, gc, psutil, os, sys
 
-
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
 from lite_llama.quantization.quant_config import GPTQConfig
-
-from lite_llama.utils.common import get_gpu_memory  # Replace with actual GPU mem check if needed
+from lite_llama.utils.common import get_gpu_memory
 from lite_llama.quantization.utils import pack_weight, unpack_weight
 
+
 class GPTQ:
-    def __init__(
-            self,
-            config: GPTQConfig = field(default_factory=GPTQConfig),
-    ):
+    def __init__(self, config: GPTQConfig):
         self.wbits = config.w_bit
         self.groupsize = config.group_size if config.group_size != -1 else float('inf')
         self.device = config.device
         self.maxq = 2 ** self.wbits - 1
 
-    def relative_error_loss(self, w_original: torch.Tensor, w_reconstructed: torch.Tensor,
-                            eps: float = 1e-5) -> torch.Tensor:
-        """Compute relative error loss with better handling of small weights"""
-        abs_diff = (w_original - w_reconstructed).abs()
-
-        # Use adaptive epsilon based on weight magnitude distribution
-        w_abs = w_original.abs()
-        adaptive_eps = torch.maximum(
-            torch.tensor(eps, device=w_original.device),
-            0.01 * w_abs.median()  # Use median as robust estimate
-        )
-
-        rel_err = abs_diff / (w_abs + adaptive_eps)
-
-        # Use robust loss to handle outliers
-        return rel_err.mean() + 0.1 * rel_err.pow(2).mean()
-
-    def optimize_for_relative_error(self, w_group: torch.Tensor, max_iter: int = 200) -> Tuple[
-        torch.Tensor, torch.Tensor]:
-        """Optimize scale and zero specifically for minimal relative error"""
-        device = w_group.device
-
-        # Separate handling for near-zero and normal weights
-        w_abs = w_group.abs()
-        w_median = w_abs.median()
-        small_weight_threshold = 0.1 * w_median
-
-        # Initialize with better starting points
-        w_min = w_group.min(dim=-1, keepdim=True)[0]
-        w_max = w_group.max(dim=-1, keepdim=True)[0]
-
-        # For groups with many small weights, use tighter bounds
-        if (w_abs < small_weight_threshold).float().mean() > 0.3:
-            # Use percentile-based bounds for groups with many small weights
-            w_flat = w_group.view(w_group.shape[0], -1)
-            w_sorted = torch.sort(w_flat, dim=-1)[0]
-            n = w_sorted.shape[-1]
-            w_min = w_sorted[:, max(0, int(0.05 * n)):max(1, int(0.05 * n) + 1)]
-            w_max = w_sorted[:, min(n - 1, int(0.95 * n)):min(n, int(0.95 * n) + 1)]
-
-        range_val = w_max - w_min
-        range_val = torch.where(range_val < 1e-8, torch.tensor(1e-6, device=device), range_val)
+    def find_params(self, x, weight):
+        """Standard min-max quantization parameter calculation"""
+        self.maxq = torch.tensor(2 ** self.wbits - 1)
 
-        # Initialize parameters
-        scale = nn.Parameter((range_val / self.maxq).clamp(min=1e-8))
-        zero = nn.Parameter(torch.round(-w_min / scale).clamp(0, self.maxq))
-
-        optimizer = torch.optim.AdamW([scale, zero], lr=0.005, weight_decay=1e-6)
-        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter)
-
-        best_loss = float('inf')
-        best_scale = scale.data.clone()
-        best_zero = zero.data.clone()
-        patience = 20
-        no_improve = 0
-
-        for i in range(max_iter):
-            optimizer.zero_grad()
+        shape = weight.shape
+        if self.groupsize != float('inf'):
+            groupsize = min(int(self.groupsize), shape[1])
+        else:
+            groupsize = shape[1]
 
-            # Ensure valid range
-            scale.data.clamp_(min=1e-8, max=1e3)
-            zero.data.clamp_(0, self.maxq)
+        weight = weight.float()
+        weight = weight.reshape((-1, groupsize))
 
-            # Quantize and dequantize
-            q = torch.clamp(torch.round(w_group / scale + zero), 0, self.maxq)
-            w_rec = (q - zero) * scale
+        # Calculate min/max for each group
+        tmp = torch.zeros(weight.shape[0], device=self.device)
+        xmin = torch.minimum(weight.min(1)[0], tmp)
+        xmax = torch.maximum(weight.max(1)[0], tmp)
 
-            # Use relative error loss
-            loss = self.relative_error_loss(w_group, w_rec)
+        # Symmetric quantization around zero
+        tmp = (xmin == 0) & (xmax == 0)
+        xmin[tmp] = -1
+        xmax[tmp] = +1
 
-            if loss.item() < best_loss:
-                best_loss = loss.item()
-                best_scale = scale.data.clone()
-                best_zero = zero.data.clone()
-                no_improve = 0
-            else:
-                no_improve += 1
-                if no_improve >= patience:
-                    break
+        # Calculate scale and zero point
+        scale = (xmax - xmin) / self.maxq
+        zero = torch.round(-xmin / scale)
 
-            loss.backward()
+        # Clamp zero point to valid range
+        zero = torch.clamp(zero, 0, self.maxq)
 
-            # Gradient clipping for stability
-            torch.nn.utils.clip_grad_norm_([scale, zero], 1.0)
+        # Handle edge cases
+        scale = torch.clamp(scale, min=1e-8)
 
-            optimizer.step()
-            scheduler.step()
+        return scale.reshape(shape[0], -1), zero.reshape(shape[0], -1)
 
-        return best_scale.detach(), best_zero.detach()
+    def quantize(self, W: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+        """
+        Improved GPTQ quantization with better numerical stability
+        """
+        assert W.ndim == 2
+        rows, cols = W.shape
+        device = W.device
+        original_cols = cols
 
-    def magnitude_aware_quantization(self, w_group: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Use different strategies based on weight magnitudes"""
-        device = w_group.device
-        w_abs = w_group.abs()
+        # Determine groupsize
+        if self.groupsize == float('inf'):
+            groupsize = cols
+        else:
+            groupsize = min(int(self.groupsize), cols)
 
-        dynamic_range = w_abs.max(dim=-1, keepdim=True)[0] / (w_abs.min(dim=-1, keepdim=True)[0] + 1e-8)
+        num_groups = (cols + groupsize - 1) // groupsize
 
-        if dynamic_range.mean() > 100:  # High dynamic range
-            # Compute equivalent linear scale and zero
-            scale = (w_group.max(dim=-1, keepdim=True)[0] - w_group.min(dim=-1, keepdim=True)[0]) / self.maxq
-            zero = torch.round(-w_group.min(dim=-1, keepdim=True)[0] / scale).clamp(0, self.maxq)
+        # Initialize output tensors
+        qweight = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
+        scales = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
+        zeros = torch.zeros((rows, num_groups), dtype=torch.float32, device=device)
 
-        else:
-            # Use robust statistics to set bounds
-            median = w_group.median(dim=-1, keepdim=True)[0]
-            mad = (w_group - median).abs().median(dim=-1, keepdim=True)[0]  # Median Absolute Deviation
+        # Process each group
+        for g in range(num_groups):
+            start_col = g * groupsize
+            end_col = min((g + 1) * groupsize, cols)
 
-            # Set bounds using robust statistics
-            bound = 3.0 * mad
-            w_min = torch.maximum(w_group.min(dim=-1, keepdim=True)[0], median - bound)
-            w_max = torch.minimum(w_group.max(dim=-1, keepdim=True)[0], median + bound)
+            W_group = W[:, start_col:end_col].clone()
 
-            range_val = w_max - w_min
-            range_val = torch.where(range_val < 1e-8, torch.tensor(1e-6, device=device), range_val)
+            # Calculate quantization parameters for this group
+            scale, zero = self.find_params(None, W_group)
 
-            scale = range_val / self.maxq
-            zero = torch.round(-w_min / scale).clamp(0, self.maxq)
+            # Store parameters
+            scales[:, g] = scale.squeeze(-1)
+            zeros[:, g] = zero.squeeze(-1)
 
-        return scale, zero
+            # Quantize the group
+            q = torch.clamp(
+                torch.round(W_group / scale + zero),
+                0, self.maxq
+            )
+            qweight[:, start_col:end_col] = q.to(torch.uint8)
 
-    def dequantize_packed(self, packed_qweight: torch.Tensor, zeros: torch.Tensor,
-                          scales: torch.Tensor, original_cols: int) -> torch.Tensor:
-        """
-        Dequantize packed weights
-        Args:
-            packed_qweight: Packed quantized weights [O, I//2]
-            zeros: Zero points [O, num_groups]
-            scales: Scales [O, num_groups]
-            original_cols: Original number of columns before packing
-        Returns:
-            Dequantized weights [O, I]
-        """
-        # Unpack the weights first
-        qweight = unpack_weight(packed_qweight, original_cols)
+        # Pack the weights
+        packed_qweight = pack_weight(qweight)
 
-        # Then dequantize normally
-        return self.dequantize(qweight, zeros, scales)
+        return (
+            packed_qweight,
+            zeros.to(torch.float16),
+            scales.to(torch.float16),
+            original_cols
+        )
 
-    def quantize(self, W: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
-        """
-        Quantization optimized specifically for minimal relative error
-        Returns: [O, I//2] packed int4, [O, num_groups] zero, [O, num_groups] scale, original_cols
-        """
-        assert W.ndim == 2
-        rows, cols = W.shape
-        device = W.device
-        original_cols = cols
+    def dequantize(self, qweight: torch.Tensor, zeros: torch.Tensor,
+                   scales: torch.Tensor) -> torch.Tensor:
+        """Dequantize packed weights"""
+        # Unpack weights first
+        original_cols = qweight.shape[1] * 2  # Assuming 4-bit packing
+        weight = unpack_weight(qweight, original_cols)
 
-        # Use very small groups for maximum precision
-        effective_groupsize = min(int(self.groupsize), 8) if self.groupsize != float('inf') else 8
-        effective_groupsize = max(effective_groupsize, 4)  # Minimum 4 for 4-bit
-        num_groups = (cols + effective_groupsize - 1) // effective_groupsize
+        rows, cols = weight.shape
+        groupsize = min(int(self.groupsize), cols) if self.groupsize != float('inf') else cols
+        num_groups = (cols + groupsize - 1) // groupsize
 
-        qweight = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
-        scales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
-        zeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+        dequantized = torch.zeros_like(weight, dtype=torch.float16)
 
-        # Process each group with relative error optimization
         for g in range(num_groups):
-            start_col = g * effective_groupsize
-            end_col = min((g + 1) * effective_groupsize, cols)
+            start_col = g * groupsize
+            end_col = min((g + 1) * groupsize, cols)
 
-            # Get current group
-            W_group = W[:, start_col:end_col].clone()
+            group_weight = weight[:, start_col:end_col].float()
+            group_scale = scales[:, g].unsqueeze(-1)
+            group_zero = zeros[:, g].unsqueeze(-1)
 
-            # Try different methods and pick best for relative error
-            methods = []
-
-            # Method 1: Relative error optimization
-            try:
-                scale_rel, zero_rel = self.optimize_for_relative_error(W_group, max_iter=100)
-                q_rel = torch.clamp(torch.round(W_group / scale_rel + zero_rel), 0, self.maxq)
-                w_rec_rel = (q_rel - zero_rel) * scale_rel
-                rel_error_rel = self.relative_error_loss(W_group, w_rec_rel).item()
-                methods.append(('rel_opt', scale_rel, zero_rel, q_rel, rel_error_rel))
-            except Exception as e:
-                print(f"Relative opt failed for group {g}: {e}")
-
-            # Method 2: Magnitude-aware quantization
-            try:
-                scale_mag, zero_mag = self.magnitude_aware_quantization(W_group)
-                q_mag = torch.clamp(torch.round(W_group / scale_mag + zero_mag), 0, self.maxq)
-                w_rec_mag = (q_mag - zero_mag) * scale_mag
-                rel_error_mag = self.relative_error_loss(W_group, w_rec_mag).item()
-                methods.append(('mag_aware', scale_mag, zero_mag, q_mag, rel_error_mag))
-            except Exception as e:
-                print(f"Magnitude aware failed for group {g}: {e}")
-
-            # Method 3: Ultra-conservative approach for small weights
-            w_abs = W_group.abs()
-            if w_abs.max() < 0.01:  # Very small weights
-                # Use much finer quantization resolution
-                w_min = W_group.min(dim=-1, keepdim=True)[0]
-                w_max = W_group.max(dim=-1, keepdim=True)[0]
-
-                # Tighten the range for small weights
-                range_val = w_max - w_min
-                range_val = torch.where(range_val < 1e-8, torch.tensor(1e-8, device=device), range_val)
-
-                scale_small = range_val / self.maxq * 0.8  # Use 80% of range for safety
-                zero_small = torch.round(-w_min / scale_small).clamp(0, self.maxq)
-
-                q_small = torch.clamp(torch.round(W_group / scale_small + zero_small), 0, self.maxq)
-                w_rec_small = (q_small - zero_small) * scale_small
-                rel_error_small = self.relative_error_loss(W_group, w_rec_small).item()
-                methods.append(('small_weights', scale_small, zero_small, q_small, rel_error_small))
-
-            # Pick the method with lowest relative error
-            if methods:
-                best_method = min(methods, key=lambda x: x[4])
-                method_name, scale_best, zero_best, q_best, _ = best_method
-
-                qweight[:, start_col:end_col] = q_best.to(torch.uint8)
-                scales[:, g] = scale_best.squeeze(-1)
-                zeros[:, g] = zero_best.squeeze(-1)
-            else:
-                # Ultimate fallback
-                print(f"All methods failed for group {g}, using zero quantization")
-                qweight[:, start_col:end_col] = 0
-                scales[:, g] = 1.0
-                zeros[:, g] = 0
-
-        # Pack the weights before returning
-        packed_qweight = pack_weight(qweight)
+            # Dequantize: (q - zero) * scale
+            dequantized[:, start_col:end_col] = ((group_weight - group_zero) * group_scale).to(torch.float16)
 
-        return packed_qweight, zeros.to(torch.float16), scales.to(torch.float16), original_cols
+        return dequantized
 
 
 def quantize_gptq(
@@ -260,20 +139,10 @@ def quantize_gptq(
         device: str = "cuda"
 ) -> Dict[str, torch.Tensor]:
     """
-    Quantize model weights using GPTQ algorithm
-
-    Args:
-        model_state_dict: Original model state dictionary
-        calibration_data: Optional calibration data for computing Hessian
-        wbits: Number of bits for quantization (default: 4)
-        groupsize: Group size for quantization (default: 128)
-        target_layers: List of layer names to quantize (if None, quantize all linear layers)
-        device: Device to perform quantization on
-
-    Returns:
-        Dictionary containing quantized weights and quantization parameters
+    Improved GPTQ quantization function
     """
     quantized_state_dict = {}
+    config = GPTQConfig()
 
     # Default target layers if not specified
     if target_layers is None:
@@ -285,28 +154,32 @@ def quantize_gptq(
                 "kv_proj", "lm_head"
             ]):
                 target_layers.append(name)
-    config = GPTQConfig()
+
+    print(f"Quantizing {len(target_layers)} layers...")
 
     for name, param in tqdm(model_state_dict.items(), desc="Processing layers"):
         if name in target_layers and param.dim() == 2:
             # Create GPTQ quantizer for this layer
             gptq = GPTQ(config)
 
-            # Move weight to device
+            # Move weight to device and ensure float32 for quantization
             weight = param.to(device).float()
-
             # Quantize the weight
-            qweight, qzeros, scales, _ = gptq.quantize(weight)
+            qweight, qzeros, scales, original_cols = gptq.quantize(weight)
 
             # Store quantized parameters
             base_name = name.replace(".weight", "").replace("_weight", "")
             quantized_state_dict[f"{base_name}.qweight"] = qweight.cpu()
             quantized_state_dict[f"{base_name}.qzeros"] = qzeros.cpu()
             quantized_state_dict[f"{base_name}.scales"] = scales.cpu()
+            quantized_state_dict[f"{base_name}.original_cols"] = torch.tensor(original_cols)
+
+            # Verify quantization quality
+            dequantized = gptq.dequantize(qweight, qzeros, scales)
+            error = (weight.half() - dequantized).abs().mean().item()
 
         else:
             # Keep non-quantized parameters as is
             quantized_state_dict[name] = param.cpu()
 
-    return quantized_state_dict
-
+    return quantized_state_dict
\ No newline at end of file
diff --git a/lite_llama/quantization/quant_manager.py b/lite_llama/quantization/quant_manager.py
index b536709..65af049 100644
--- a/lite_llama/quantization/quant_manager.py
+++ b/lite_llama/quantization/quant_manager.py
@@ -19,7 +19,7 @@
 from ..kernels.awq_linear import AWQLinear
 from ..kernels.gptq_linear import GPTQLinear
 from ..kernels.sq_linear import SmoothQuantLinear
-
+from ..utils.logger import log
 
 class QuantizationType:
     NONE = "none"
@@ -41,7 +41,7 @@ def __init__(self):
         }
 
     def detect_quantization_type(self, model_path: str) -> str:
-        """自动检测模型的量化类型"""
+        """Automatically detect the quantization type of the model"""
         model_path = Path(model_path)
 
         # 检查量化配置文件
@@ -77,14 +77,14 @@ def quantize_model(
             calibration_data: Optional[Any] = None,
             model: Optional[torch.nn.Module] = None
     ) -> str:
-        """量化模型"""
-        print(f"开始使用 {method} 方法量化模型...")
+        """Quantitative model"""
+        log.info(f"Using the {method} method to quantify the model...")
 
         # 加载原始模型状态字典
         model_path = Path(model_path)
         weight_files = list(model_path.glob("*.pth"))
         if not weight_files:
-            raise ValueError(f"在 {model_path} 中未找到权重文件")
+            raise ValueError(f"The weight file was not found in {model_path}")
 
         state_dict = torch.load(weight_files[0], map_location="cpu")
 
@@ -103,7 +103,6 @@ def quantize_model(
             awq_config = AWQConfig(**config)
             quantized_state_dict = quantize_awq(
                 model_state_dict=state_dict,
-                calibration_loader=calibration_data,
                 model=model,
                 config=awq_config,
                 target_layers=self._get_target_layers(state_dict),
@@ -120,7 +119,7 @@ def quantize_model(
             )
 
         else:
-            raise ValueError(f"不支持的量化方法: {method}")
+            raise ValueError(f"Unsupported quantitative methods: {method}")
 
         # 保存量化后的模型
         output_path = Path(output_path)
@@ -154,7 +153,7 @@ def quantize_model(
         with open(output_path / "quantization_config.json", 'w') as f:
             json.dump(quant_config, f, indent=2)
 
-        print(f"量化完成! 输出保存至: {output_path}")
+        log.info(f"Quantification completed! Saved to: {output_path}")
         return str(output_path)
 
     def load_quantized_model(
@@ -163,7 +162,7 @@ def load_quantized_model(
             model_config: Any,
             device: str = "cuda"
     ) -> torch.nn.Module:
-        """加载量化后的模型"""
+        """Load the quantized model"""
         quant_type = self.detect_quantization_type(model_path)
 
         if quant_type == QuantizationType.NONE:
@@ -173,10 +172,10 @@ def load_quantized_model(
         if quant_type in self.supported_methods:
             return self.supported_methods[quant_type](model_path, model_config, device)
         else:
-            raise ValueError(f"不支持的量化类型: {quant_type}")
+            raise ValueError(f"Unsupported quantization types: {quant_type}")
 
     def _load_gptq(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
-        """加载GPTQ量化模型"""
+        """Load the GPTQ quantitative model"""
         from ..models.quantized_models import create_quantized_model
 
         # 读取量化配置
@@ -200,7 +199,7 @@ def _load_gptq(self, model_path: str, model_config: Any, device: str) -> torch.n
         return model
 
     def _load_awq(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
-        """加载AWQ量化模型"""
+        """Load the AWQ quantification model"""
         from ..models.quantized_models import create_quantized_model
 
         # 读取量化配置
@@ -224,7 +223,7 @@ def _load_awq(self, model_path: str, model_config: Any, device: str) -> torch.nn
         return model
 
     def _load_smoothquant(self, model_path: str, model_config: Any, device: str) -> torch.nn.Module:
-        """加载SmoothQuant量化模型"""
+        """Load the SmoothQuant quantitative model"""
         from ..models.quantized_models import create_quantized_model
 
         # 读取量化配置
@@ -254,10 +253,11 @@ def _load_normal_model(self, model_path: str, model_config: Any, device: str) ->
         pass
 
     def _get_target_layers(self, state_dict: Dict[str, torch.Tensor]) -> List[str]:
-        """获取需要量化的层"""
+        """Obtain the layers that need to be quantified"""
         target_layers = []
+        quant_layer = QuantLayerConfig()
         for name in state_dict.keys():
-            if any(pattern in name for pattern in QuantLayerConfig.quant_layers):
+            if any(pattern in name for pattern in quant_layer.quant_layers):
                 target_layers.append(name)
         return target_layers
 
diff --git a/lite_llama/quantization/utils.py b/lite_llama/quantization/utils.py
index 70de35c..8b3f65f 100644
--- a/lite_llama/quantization/utils.py
+++ b/lite_llama/quantization/utils.py
@@ -1,19 +1,49 @@
 import torch
-from .quant_config import GPTQConfig
+
 
 def pack_weight(weight):
-    """Pack two 4-bit values into one uint8 value"""
+    """
+    Pack two 4-bit values into one uint8 value consistently
+
+    Args:
+        weight: Tensor of shape [out_features, in_features] with values in [0, 15]
+
+    Returns:
+        packed: Tensor of shape [out_features, in_features//2] with packed values
+    """
     rows, cols = weight.shape
+
+    # Ensure even number of columns for packing
     if cols % 2 != 0:
         weight = torch.cat([weight, torch.zeros(rows, 1, dtype=weight.dtype, device=weight.device)], dim=1)
         cols += 1
+
+    # Pack: lower 4 bits from even indices, upper 4 bits from odd indices
+    # Format: [odd_value << 4] | even_value
     packed = (weight[:, 0::2] & 0xF) | ((weight[:, 1::2] & 0xF) << 4)
-    return packed.contiguous()
+
+    return packed.contiguous().to(torch.uint8)
+
 
 def unpack_weight(packed_weight, original_cols):
-    """Unpack uint8 values back to two 4-bit values"""
+    """
+    Unpack uint8 values back to two 4-bit values consistently
+
+    Args:
+        packed_weight: Packed tensor of shape [out_features, packed_cols]
+        original_cols: Original number of columns before packing
+
+    Returns:
+        unpacked: Tensor of shape [out_features, original_cols] with unpacked values
+    """
     rows, packed_cols = packed_weight.shape
+
+    # Allocate unpacked tensor
     unpacked = torch.zeros((rows, packed_cols * 2), dtype=torch.uint8, device=packed_weight.device)
-    unpacked[:, 0::2] = packed_weight & 0xF
-    unpacked[:, 1::2] = (packed_weight >> 4) & 0xF
-    return unpacked[:, :original_cols].contiguous()
\ No newline at end of file
+
+    # Unpack: even positions get lower 4 bits, odd positions get upper 4 bits
+    unpacked[:, 0::2] = packed_weight & 0xF  # Lower 4 bits
+    unpacked[:, 1::2] = (packed_weight >> 4) & 0xF  # Upper 4 bits
+
+    # Trim to original size
+    return unpacked[:, :original_cols].contiguous()
diff --git a/quantize_lite_llama.py b/quantize_lite_llama.py
index 2f064f5..30b7248 100644
--- a/quantize_lite_llama.py
+++ b/quantize_lite_llama.py
@@ -86,11 +86,11 @@ def _load_calibration_data(self) -> List[str]:
                     texts.append(item.get('text', ''))
 
         else:
-            raise ValueError(f"不支持的文件格式: {self.data_path}")
+            raise ValueError(f"Unsupported file formats: {self.data_path}")
 
         # 限制样本数量
         texts = texts[:self.max_samples]
-        log.info(f"加载了 {len(texts)} 个校准样本")
+        log.info(f"{len(texts)} calibration samples were loaded")
 
         return texts
 
@@ -129,7 +129,7 @@ def create_default_calibration_data(tokenizer_path: str, num_samples: int = 32)
 
     # 重复样本以达到所需数量
     texts = (default_texts * ((num_samples // len(default_texts)) + 1))[:num_samples]
-    log.info(f"使用默认校准数据，共 {len(texts)} 个样本")
+    log.info(f"Using the default calibration data, there are a total of {len(texts)} samples")
 
     return texts
 
@@ -146,7 +146,7 @@ def validate_quantization_config(method: str, config: Dict[str, Any]) -> Dict[st
 
         # 验证参数范围
         if validated_config['w_bit'] not in [2, 3, 4, 8]:
-            raise ValueError(f"GPTQ不支持的位数: {validated_config['w_bit']}")
+            raise ValueError(f"The number of bits not supported by GPTQ: {validated_config['w_bit']}")
 
     elif method == QuantizationType.AWQ:
         validated_config = {
@@ -160,7 +160,7 @@ def validate_quantization_config(method: str, config: Dict[str, Any]) -> Dict[st
         }
 
         if validated_config['w_bit'] not in [4, 8]:
-            raise ValueError(f"AWQ不支持的位数: {validated_config['w_bit']}")
+            raise ValueError(f"The number of bits not supported by AWQ: {validated_config['w_bit']}")
 
     elif method == QuantizationType.SMOOTHQUANT:
         validated_config = {
@@ -176,72 +176,72 @@ def validate_quantization_config(method: str, config: Dict[str, Any]) -> Dict[st
         }
 
         if not (0.0 <= validated_config['alpha'] <= 1.0):
-            raise ValueError(f"SmoothQuant的alpha参数必须在0-1之间: {validated_config['alpha']}")
+            raise ValueError(f"The alpha parameter of SmoothQuant must be between 0 and 1: {validated_config['alpha']}")
 
     else:
-        raise ValueError(f"不支持的量化方法: {method}")
+        raise ValueError(f"Unsupported quantitative methods: {method}")
 
     return validated_config
 
 
 def main():
-    parser = argparse.ArgumentParser(description="量化lite_llama格式的模型")
+    parser = argparse.ArgumentParser(description="Quantify the model in lite_llama format")
 
     # 基本参数
     parser.add_argument("--model-path", type=str, required=True,
-                        help="输入模型路径")
+                        help="Input model path")
     parser.add_argument("--output-path", type=str, required=True,
-                        help="输出模型路径")
+                        help="Output model path")
     parser.add_argument("--method", type=str, required=True,
                         choices=['gptq', 'awq', 'smoothquant'],
-                        help="量化方法")
+                        help="Quantitative method")
 
     # 量化参数
     parser.add_argument("--bits", type=int, default=4,
-                        help="量化位数 (default: 4)")
+                        help="Quantification bit number (default: 4)")
     parser.add_argument("--group-size", type=int, default=128,
-                        help="组大小 (default: 128)")
+                        help="Group size (default: 128)")
 
     # AWQ特有参数
     parser.add_argument("--alpha", type=float, default=0.5,
-                        help="AWQ/SmoothQuant的alpha参数 (default: 0.5)")
+                        help="The alpha parameter of AWQ/SmoothQuant (default: 0.5)")
     parser.add_argument("--search-scale", action='store_true',
-                        help="AWQ是否搜索最优缩放因子")
+                        help="Does AWQ search for the optimal scaling factor")
     parser.add_argument("--auto-scale", action='store_true', default=True,
-                        help="AWQ是否自动缩放")
+                        help="Does AWQ scale automatically")
 
     # SmoothQuant特有参数
     parser.add_argument("--w-bits", type=int, default=8,
-                        help="权重量化位数 (SmoothQuant, default: 8)")
+                        help="Weighted quantification number of bits (SmoothQuant, default: 8)")
     parser.add_argument("--a-bits", type=int, default=8,
-                        help="激活量化位数 (SmoothQuant, default: 8)")
+                        help="Activation quantization bit number (SmoothQuant, default: 8)")
 
     # 校准数据
     parser.add_argument("--calib-data", type=str, default=None,
-                        help="校准数据文件路径 (.txt/.json/.jsonl)")
+                        help="Calibrate the data file path (.txt/.json/.jsonl)")
     parser.add_argument("--calib-samples", type=int, default=128,
-                        help="校准样本数量 (default: 128)")
+                        help="Calibration sample quantity (default: 128)")
     parser.add_argument("--max-length", type=int, default=512,
-                        help="校准数据最大长度 (default: 512)")
+                        help="The maximum length of the calibration data (default: 512)")
 
     # 其他参数
     parser.add_argument("--device", type=str, default="cuda",
                         choices=['cuda', 'cpu'],
-                        help="设备 (default: cuda)")
+                        help="device (default: cuda)")
     parser.add_argument("--no-verify", action='store_true',
-                        help="跳过量化验证")
+                        help="Skip quantitative validation")
 
     args = parser.parse_args()
 
     # 检查模型兼容性
     is_compatible, message = check_model_compatibility(args.model_path)
     if not is_compatible:
-        log.error(f"模型兼容性检查失败: {message}")
+        log.error(f"The model compatibility check failed: {message}")
         return 1
 
     # 获取模型信息
     model_info = get_model_info(args.model_path)
-    log.info(f"模型信息: {model_info}")
+    log.info(f"Model information: {model_info}")
 
     # 准备量化配置
     config = {
@@ -259,9 +259,9 @@ def main():
     # 验证配置
     try:
         validated_config = validate_quantization_config(args.method, config)
-        log.info(f"量化配置: {validated_config}")
+        log.info(f"Quantitative configuration: {validated_config}")
     except ValueError as e:
-        log.error(f"配置验证失败: {e}")
+        log.error(f"Configuration verification failed: {e}")
         return 1
 
     # 准备校准数据
@@ -269,7 +269,7 @@ def main():
     model = None
 
     if args.method in ['awq', 'smoothquant']:
-        log.info("准备校准数据...")
+        log.info("Prepare calibration data...")
 
         if args.calib_data:
             # 使用用户提供的校准数据
@@ -280,10 +280,10 @@ def main():
                     args.calib_samples,
                     args.max_length
                 )
-                log.info(f"加载校准数据: {len(calibration_data)} 个样本")
+                log.info(f"Load calibration data: {len(calibration_data)} samples")
             except Exception as e:
-                log.error(f"加载校准数据失败: {e}")
-                log.info("将使用默认校准数据")
+                log.error(f"Failed to load calibration data: {e}")
+                log.info("The default calibration data will be used")
                 calibration_data = create_default_calibration_data(
                     args.model_path, args.calib_samples
                 )
@@ -295,7 +295,7 @@ def main():
 
         # 如果需要，加载原始模型用于校准
         if args.method == 'awq':
-            log.info("加载原始模型用于AWQ校准...")
+            log.info("Load the original model for AWQ calibration...")
             try:
                 model_executor = ModelExecutor.build(
                     checkpoints_dir=args.model_path,
@@ -305,13 +305,13 @@ def main():
                     device=args.device
                 )
                 model = model_executor.model
-                log.info("模型加载成功")
+                log.info("The model has been loaded successfully.")
             except Exception as e:
-                log.error(f"模型加载失败: {e}")
+                log.error(f"Model loading failed: {e}")
                 return 1
 
     # 执行量化
-    log.info(f"开始使用 {args.method.upper()} 方法量化模型...")
+    log.info(f"Quantifying the model using the {args.method.upper()} method...")
     start_time = time.time()
 
     try:
@@ -325,37 +325,37 @@ def main():
         )
 
         quantization_time = time.time() - start_time
-        log.info(f"量化完成! 耗时: {quantization_time:.2f}s")
-        log.info(f"量化模型保存至: {output_path}")
+        log.info(f"Quantification completed! Time consumption: {quantization_time:.2f}s")
+        log.info(f"The quantitative model saved to: {output_path}")
 
     except Exception as e:
-        log.error(f"量化失败: {e}")
+        log.error(f"Quantitative failure: {e}")
         return 1
 
     # 验证量化结果
     if not args.no_verify:
-        log.info("验证量化结果...")
+        log.info("Verify the quantification results...")
         try:
             # 检测量化类型
             detected_type = quantization_manager.detect_quantization_type(output_path)
             if detected_type == args.method:
-                log.info(f"✅ 量化类型验证通过: {detected_type}")
+                log.info(f"The quantitative type verification has been passed: {detected_type}")
             else:
-                log.warning(f"⚠️ 量化类型不匹配: 期望 {args.method}, 检测到 {detected_type}")
+                log.warning(f"Quantization type mismatch: expected {args.method}, detected {detected_type}")
 
             # 检查文件大小
             original_size = sum(f.stat().st_size for f in Path(args.model_path).glob("*.pth"))
             quantized_size = sum(f.stat().st_size for f in Path(output_path).glob("*.pth"))
             compression_ratio = original_size / quantized_size if quantized_size > 0 else 1.0
 
-            log.info(f"原始模型大小: {original_size / (1024 ** 3):.2f} GB")
-            log.info(f"量化模型大小: {quantized_size / (1024 ** 3):.2f} GB")
-            log.info(f"压缩比: {compression_ratio:.2f}x")
+            log.info(f"Original model size: {original_size / (1024 ** 3):.2f} GB")
+            log.info(f"Quantitative model size: {quantized_size / (1024 ** 3):.2f} GB")
+            log.info(f"Compression ratio: {compression_ratio:.2f}x")
 
         except Exception as e:
-            log.warning(f"量化验证失败: {e}")
+            log.warning(f"Quantitative verification failed: {e}")
 
-    log.info("量化任务完成!")
+    log.info("Quantitative task completion!")
     return 0
 
 
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..dfd0e2b
--- /dev/null
+++ b/test.py
@@ -0,0 +1,317 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from typing import Dict, Tuple
+import sys
+import os
+
+# Add the path to access lite_llama modules
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from lite_llama.kernels.awq_linear import AWQLinear
+from lite_llama.quantization.awq import AWQ
+from lite_llama.quantization.quant_config import AWQConfig
+from lite_llama.quantization.utils import pack_weight, unpack_weight
+
+
+class DummyModel(nn.Module):
+    """A simple model with multiple linear layers for testing"""
+
+    def __init__(self, hidden_size=768, num_layers=3):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            nn.Linear(hidden_size, hidden_size, bias=True)
+            for _ in range(num_layers)
+        ])
+        self.final = nn.Linear(hidden_size, hidden_size // 2, bias=True)
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+            x = torch.relu(x)
+        x = self.final(x)
+        return x
+
+
+def quantize_weight_manual(weight: torch.Tensor, wbits: int = 4, groupsize: int = 128) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Manually quantize weight to ensure correct packing"""
+    assert weight.ndim == 2
+    rows, cols = weight.shape
+    device = weight.device
+
+    maxq = 2 ** wbits - 1
+
+    # Calculate number of groups
+    if groupsize == -1 or groupsize >= cols:
+        groupsize = cols
+    num_groups = (cols + groupsize - 1) // groupsize
+
+    # Initialize tensors
+    qweight_unpacked = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
+    qzeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+    qscales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+
+    # Quantize each group
+    for g in range(num_groups):
+        start_col = g * groupsize
+        end_col = min((g + 1) * groupsize, cols)
+
+        # Get weight group
+        w_group = weight[:, start_col:end_col]
+
+        # Compute min/max per row
+        w_min = w_group.min(dim=1, keepdim=True)[0]
+        w_max = w_group.max(dim=1, keepdim=True)[0]
+
+        # Compute scale and zero point
+        scale = (w_max - w_min).clamp(min=1e-5) / maxq
+        zero = torch.round(-w_min / scale).clamp(0, maxq)
+
+        # Quantize
+        q = torch.clamp(torch.round(w_group / scale + zero), 0, maxq)
+
+        # Store
+        qweight_unpacked[:, start_col:end_col] = q.to(torch.uint8)
+        qscales[:, g] = scale.squeeze(1)
+        qzeros[:, g] = zero.squeeze(1)
+
+    # Pack the weights
+    qweight_packed = pack_weight(qweight_unpacked)
+
+    print(f"      Unpacked shape: {qweight_unpacked.shape} -> Packed shape: {qweight_packed.shape}")
+
+    return qweight_packed, qzeros, qscales
+
+
+def compare_awq_with_linear():
+    """Compare outputs between nn.Linear and AWQLinear"""
+
+    print("=" * 80)
+    print("AWQ Linear vs nn.Linear Comparison")
+    print("=" * 80)
+
+    # Configuration
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    if device.type == "cpu":
+        print("WARNING: AWQLinear uses Triton kernels which require CUDA.")
+        print("         The demo may fail on CPU. Please use a CUDA-enabled device.")
+        print("         Attempting to continue anyway...")
+
+    batch_size = 4
+    seq_len = 128
+    hidden_size = 768
+    group_size = 128
+    wbits = 4
+
+    print(f"\nConfiguration:")
+    print(f"Device: {device}")
+    print(f"Batch size: {batch_size}")
+    print(f"Sequence length: {seq_len}")
+    print(f"Hidden size: {hidden_size}")
+    print(f"Quantization bits: {wbits}")
+    print(f"Group size: {group_size}")
+
+    # Create dummy model
+    print("\n1. Creating dummy model...")
+    model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
+    model.eval()
+
+    # Create test input
+    test_input = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16, device=device)
+
+    # Get original output
+    print("\n2. Getting original model output...")
+    with torch.no_grad():
+        original_output = model(test_input)
+    print(f"Original output shape: {original_output.shape}")
+
+    # Create quantized model
+    print("\n3. Creating quantized model...")
+    quantized_model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
+
+    # Quantize and replace each linear layer
+    layer_errors = {}
+
+    for (orig_name, orig_module), (quant_name, quant_module) in zip(
+            model.named_modules(), quantized_model.named_modules()
+    ):
+        if isinstance(orig_module, nn.Linear):
+            print(f"\n   Quantizing layer: {orig_name}")
+
+            # Get original weight
+            orig_weight = orig_module.weight.data
+            print(f"   Original weight shape: {orig_weight.shape}")
+
+            # Manually quantize to ensure correct packing
+            qweight, qzeros, qscales = quantize_weight_manual(orig_weight, wbits=wbits, groupsize=group_size)
+
+            print(f"   Quantized weight packed shape: {qweight.shape}")
+            print(f"   Scales shape: {qscales.shape}")
+            print(f"   Zeros shape: {qzeros.shape}")
+
+            # Create AWQLinear layer manually
+            awq_layer = AWQLinear(
+                in_features=orig_module.in_features,
+                out_features=orig_module.out_features,
+                bias=orig_module.bias is not None,
+                group_size=group_size,
+                wbits=wbits
+            ).to(device)
+
+            # Copy quantized parameters
+            with torch.no_grad():
+                awq_layer.qweight.copy_(qweight)
+                awq_layer.qscales.copy_(qscales)
+                awq_layer.qzeros.copy_(qzeros)
+
+                if orig_module.bias is not None:
+                    awq_layer.bias.copy_(orig_module.bias.to(torch.float16))
+
+            # Replace the layer in quantized model
+            parent_name = quant_name.rsplit('.', 1)[0] if '.' in quant_name else ''
+            child_name = quant_name.rsplit('.', 1)[1] if '.' in quant_name else quant_name
+
+            if parent_name:
+                parent = quantized_model
+                for part in parent_name.split('.'):
+                    parent = getattr(parent, part)
+                setattr(parent, child_name, awq_layer)
+            else:
+                setattr(quantized_model, child_name, awq_layer)
+
+            # Test individual layer error
+            with torch.no_grad():
+                test_layer_input = torch.randn(batch_size, seq_len, orig_module.in_features,
+                                               dtype=torch.float16, device=device)
+                orig_layer_output = orig_module(test_layer_input)
+
+                try:
+                    quant_layer_output = awq_layer(test_layer_input)
+
+                    layer_error = (orig_layer_output - quant_layer_output).abs().mean().item()
+                    layer_rel_error = layer_error / (orig_layer_output.abs().mean().item() + 1e-6)
+
+                    layer_errors[orig_name] = {
+                        'absolute_error': layer_error,
+                        'relative_error': layer_rel_error
+                    }
+
+                    print(f"   Layer absolute error: {layer_error:.6f}")
+                    print(f"   Layer relative error: {layer_rel_error:.2%}")
+                except Exception as e:
+                    print(f"   ERROR testing layer: {e}")
+                    if device.type == "cpu":
+                        print("   This is expected on CPU as Triton kernels require CUDA")
+
+    # Get quantized output
+    print("\n4. Getting quantized model output...")
+    quantized_model.eval()
+
+    try:
+        with torch.no_grad():
+            quantized_output = quantized_model(test_input)
+        print(f"Quantized output shape: {quantized_output.shape}")
+
+        # Compare outputs
+        print("\n5. Comparing outputs...")
+        print("=" * 80)
+
+        # Compute errors
+        absolute_error = (original_output - quantized_output).abs()
+        relative_error = absolute_error / (original_output.abs() + 1e-6)
+
+        print(f"\nOutput Statistics:")
+        print(f"Original output - Mean: {original_output.mean().item():.6f}, "
+              f"Std: {original_output.std().item():.6f}")
+        print(f"Quantized output - Mean: {quantized_output.mean().item():.6f}, "
+              f"Std: {quantized_output.std().item():.6f}")
+
+        print(f"\nError Metrics:")
+        print(f"Mean Absolute Error: {absolute_error.mean().item():.6f}")
+        print(f"Max Absolute Error: {absolute_error.max().item():.6f}")
+        print(f"Mean Relative Error: {relative_error.mean().item():.2%}")
+        print(f"Max Relative Error: {relative_error.max().item():.2%}")
+
+    except Exception as e:
+        print(f"\nERROR during quantized model forward pass: {e}")
+        if device.type == "cpu":
+            print("This is expected on CPU as AWQLinear requires CUDA for Triton kernels")
+        quantized_output = None
+
+    # Per-layer error summary (if we have any)
+    if layer_errors:
+        print("\nPer-Layer Error Summary:")
+        print("-" * 60)
+        print(f"{'Layer Name':<30} {'Abs Error':<15} {'Rel Error':<15}")
+        print("-" * 60)
+        for name, errors in layer_errors.items():
+            print(f"{name:<30} {errors['absolute_error']:<15.6f} {errors['relative_error']:<15.2%}")
+
+    # Memory comparison
+    print("\n6. Memory Usage Comparison:")
+    print("=" * 80)
+
+    # Calculate original model size
+    orig_params = sum(p.numel() * p.element_size() for p in model.parameters())
+    orig_size_mb = orig_params / (1024 * 1024)
+
+    # Calculate quantized model size (approximation)
+    quant_params = 0
+    for name, module in quantized_model.named_modules():
+        if isinstance(module, AWQLinear):
+            # qweight is packed int4 (half the size)
+            quant_params += module.qweight.numel() * module.qweight.element_size()
+            # scales and zeros
+            quant_params += module.qscales.numel() * module.qscales.element_size()
+            quant_params += module.qzeros.numel() * module.qzeros.element_size()
+            # bias if present
+            if module.bias is not None:
+                quant_params += module.bias.numel() * module.bias.element_size()
+
+    quant_size_mb = quant_params / (1024 * 1024)
+    compression_ratio = orig_size_mb / quant_size_mb if quant_size_mb > 0 else 0
+
+    print(f"Original model size: {orig_size_mb:.2f} MB")
+    print(f"Quantized model size: {quant_size_mb:.2f} MB")
+    print(f"Compression ratio: {compression_ratio:.2f}x")
+
+    print("\n" + "=" * 80)
+    print("Comparison completed!")
+
+    return {
+        'original_output': original_output,
+        'quantized_output': quantized_output,
+        'layer_errors': layer_errors,
+        'compression_ratio': compression_ratio
+    }
+
+
+if __name__ == "__main__":
+    # Run the comparison
+    results = compare_awq_with_linear()
+
+    # Additional analysis if needed
+    print("\n\nAdditional Analysis:")
+    print("=" * 80)
+
+    # Check if CUDA is available for better performance
+    if not torch.cuda.is_available():
+        print("Note: Running on CPU. CUDA is required for AWQLinear to work properly.")
+        print("      Triton kernels do not support CPU execution.")
+
+    # Success criteria
+    if results['quantized_output'] is not None:
+        mean_rel_error = ((results['original_output'] - results['quantized_output']).abs() /
+                          (results['original_output'].abs() + 1e-6)).mean().item()
+
+        if mean_rel_error < 0.05:  # Less than 5% error
+            print("✓ Quantization successful! Error is within acceptable range.")
+        else:
+            print("⚠ Warning: Quantization error is higher than expected.")
+
+    if results['compression_ratio'] > 0:
+        print(f"\nCompression achieved: {results['compression_ratio']:.2f}x")
+        print("This means the quantized model uses approximately "
+              f"{100 / results['compression_ratio']:.1f}% of the original model's memory.")
\ No newline at end of file
diff --git a/tests/quant/__init__.py b/tests/quant/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/kernels/test_AWQLinear.py b/tests/quant/test_AWQLinear.py
similarity index 100%
rename from tests/kernels/test_AWQLinear.py
rename to tests/quant/test_AWQLinear.py
diff --git a/tests/kernels/test_GPTQLinear.py b/tests/quant/test_GPTQLinear.py
similarity index 100%
rename from tests/kernels/test_GPTQLinear.py
rename to tests/quant/test_GPTQLinear.py
diff --git a/tests/kernels/test_SQLinear.py b/tests/quant/test_SQLinear.py
similarity index 100%
rename from tests/kernels/test_SQLinear.py
rename to tests/quant/test_SQLinear.py
diff --git a/tests/quant/test_awq.py b/tests/quant/test_awq.py
new file mode 100644
index 0000000..dfd0e2b
--- /dev/null
+++ b/tests/quant/test_awq.py
@@ -0,0 +1,317 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from typing import Dict, Tuple
+import sys
+import os
+
+# Add the path to access lite_llama modules
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from lite_llama.kernels.awq_linear import AWQLinear
+from lite_llama.quantization.awq import AWQ
+from lite_llama.quantization.quant_config import AWQConfig
+from lite_llama.quantization.utils import pack_weight, unpack_weight
+
+
+class DummyModel(nn.Module):
+    """A simple model with multiple linear layers for testing"""
+
+    def __init__(self, hidden_size=768, num_layers=3):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            nn.Linear(hidden_size, hidden_size, bias=True)
+            for _ in range(num_layers)
+        ])
+        self.final = nn.Linear(hidden_size, hidden_size // 2, bias=True)
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+            x = torch.relu(x)
+        x = self.final(x)
+        return x
+
+
+def quantize_weight_manual(weight: torch.Tensor, wbits: int = 4, groupsize: int = 128) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Manually quantize weight to ensure correct packing"""
+    assert weight.ndim == 2
+    rows, cols = weight.shape
+    device = weight.device
+
+    maxq = 2 ** wbits - 1
+
+    # Calculate number of groups
+    if groupsize == -1 or groupsize >= cols:
+        groupsize = cols
+    num_groups = (cols + groupsize - 1) // groupsize
+
+    # Initialize tensors
+    qweight_unpacked = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
+    qzeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+    qscales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
+
+    # Quantize each group
+    for g in range(num_groups):
+        start_col = g * groupsize
+        end_col = min((g + 1) * groupsize, cols)
+
+        # Get weight group
+        w_group = weight[:, start_col:end_col]
+
+        # Compute min/max per row
+        w_min = w_group.min(dim=1, keepdim=True)[0]
+        w_max = w_group.max(dim=1, keepdim=True)[0]
+
+        # Compute scale and zero point
+        scale = (w_max - w_min).clamp(min=1e-5) / maxq
+        zero = torch.round(-w_min / scale).clamp(0, maxq)
+
+        # Quantize
+        q = torch.clamp(torch.round(w_group / scale + zero), 0, maxq)
+
+        # Store
+        qweight_unpacked[:, start_col:end_col] = q.to(torch.uint8)
+        qscales[:, g] = scale.squeeze(1)
+        qzeros[:, g] = zero.squeeze(1)
+
+    # Pack the weights
+    qweight_packed = pack_weight(qweight_unpacked)
+
+    print(f"      Unpacked shape: {qweight_unpacked.shape} -> Packed shape: {qweight_packed.shape}")
+
+    return qweight_packed, qzeros, qscales
+
+
+def compare_awq_with_linear():
+    """Compare outputs between nn.Linear and AWQLinear"""
+
+    print("=" * 80)
+    print("AWQ Linear vs nn.Linear Comparison")
+    print("=" * 80)
+
+    # Configuration
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    if device.type == "cpu":
+        print("WARNING: AWQLinear uses Triton kernels which require CUDA.")
+        print("         The demo may fail on CPU. Please use a CUDA-enabled device.")
+        print("         Attempting to continue anyway...")
+
+    batch_size = 4
+    seq_len = 128
+    hidden_size = 768
+    group_size = 128
+    wbits = 4
+
+    print(f"\nConfiguration:")
+    print(f"Device: {device}")
+    print(f"Batch size: {batch_size}")
+    print(f"Sequence length: {seq_len}")
+    print(f"Hidden size: {hidden_size}")
+    print(f"Quantization bits: {wbits}")
+    print(f"Group size: {group_size}")
+
+    # Create dummy model
+    print("\n1. Creating dummy model...")
+    model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
+    model.eval()
+
+    # Create test input
+    test_input = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16, device=device)
+
+    # Get original output
+    print("\n2. Getting original model output...")
+    with torch.no_grad():
+        original_output = model(test_input)
+    print(f"Original output shape: {original_output.shape}")
+
+    # Create quantized model
+    print("\n3. Creating quantized model...")
+    quantized_model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
+
+    # Quantize and replace each linear layer
+    layer_errors = {}
+
+    for (orig_name, orig_module), (quant_name, quant_module) in zip(
+            model.named_modules(), quantized_model.named_modules()
+    ):
+        if isinstance(orig_module, nn.Linear):
+            print(f"\n   Quantizing layer: {orig_name}")
+
+            # Get original weight
+            orig_weight = orig_module.weight.data
+            print(f"   Original weight shape: {orig_weight.shape}")
+
+            # Manually quantize to ensure correct packing
+            qweight, qzeros, qscales = quantize_weight_manual(orig_weight, wbits=wbits, groupsize=group_size)
+
+            print(f"   Quantized weight packed shape: {qweight.shape}")
+            print(f"   Scales shape: {qscales.shape}")
+            print(f"   Zeros shape: {qzeros.shape}")
+
+            # Create AWQLinear layer manually
+            awq_layer = AWQLinear(
+                in_features=orig_module.in_features,
+                out_features=orig_module.out_features,
+                bias=orig_module.bias is not None,
+                group_size=group_size,
+                wbits=wbits
+            ).to(device)
+
+            # Copy quantized parameters
+            with torch.no_grad():
+                awq_layer.qweight.copy_(qweight)
+                awq_layer.qscales.copy_(qscales)
+                awq_layer.qzeros.copy_(qzeros)
+
+                if orig_module.bias is not None:
+                    awq_layer.bias.copy_(orig_module.bias.to(torch.float16))
+
+            # Replace the layer in quantized model
+            parent_name = quant_name.rsplit('.', 1)[0] if '.' in quant_name else ''
+            child_name = quant_name.rsplit('.', 1)[1] if '.' in quant_name else quant_name
+
+            if parent_name:
+                parent = quantized_model
+                for part in parent_name.split('.'):
+                    parent = getattr(parent, part)
+                setattr(parent, child_name, awq_layer)
+            else:
+                setattr(quantized_model, child_name, awq_layer)
+
+            # Test individual layer error
+            with torch.no_grad():
+                test_layer_input = torch.randn(batch_size, seq_len, orig_module.in_features,
+                                               dtype=torch.float16, device=device)
+                orig_layer_output = orig_module(test_layer_input)
+
+                try:
+                    quant_layer_output = awq_layer(test_layer_input)
+
+                    layer_error = (orig_layer_output - quant_layer_output).abs().mean().item()
+                    layer_rel_error = layer_error / (orig_layer_output.abs().mean().item() + 1e-6)
+
+                    layer_errors[orig_name] = {
+                        'absolute_error': layer_error,
+                        'relative_error': layer_rel_error
+                    }
+
+                    print(f"   Layer absolute error: {layer_error:.6f}")
+                    print(f"   Layer relative error: {layer_rel_error:.2%}")
+                except Exception as e:
+                    print(f"   ERROR testing layer: {e}")
+                    if device.type == "cpu":
+                        print("   This is expected on CPU as Triton kernels require CUDA")
+
+    # Get quantized output
+    print("\n4. Getting quantized model output...")
+    quantized_model.eval()
+
+    try:
+        with torch.no_grad():
+            quantized_output = quantized_model(test_input)
+        print(f"Quantized output shape: {quantized_output.shape}")
+
+        # Compare outputs
+        print("\n5. Comparing outputs...")
+        print("=" * 80)
+
+        # Compute errors
+        absolute_error = (original_output - quantized_output).abs()
+        relative_error = absolute_error / (original_output.abs() + 1e-6)
+
+        print(f"\nOutput Statistics:")
+        print(f"Original output - Mean: {original_output.mean().item():.6f}, "
+              f"Std: {original_output.std().item():.6f}")
+        print(f"Quantized output - Mean: {quantized_output.mean().item():.6f}, "
+              f"Std: {quantized_output.std().item():.6f}")
+
+        print(f"\nError Metrics:")
+        print(f"Mean Absolute Error: {absolute_error.mean().item():.6f}")
+        print(f"Max Absolute Error: {absolute_error.max().item():.6f}")
+        print(f"Mean Relative Error: {relative_error.mean().item():.2%}")
+        print(f"Max Relative Error: {relative_error.max().item():.2%}")
+
+    except Exception as e:
+        print(f"\nERROR during quantized model forward pass: {e}")
+        if device.type == "cpu":
+            print("This is expected on CPU as AWQLinear requires CUDA for Triton kernels")
+        quantized_output = None
+
+    # Per-layer error summary (if we have any)
+    if layer_errors:
+        print("\nPer-Layer Error Summary:")
+        print("-" * 60)
+        print(f"{'Layer Name':<30} {'Abs Error':<15} {'Rel Error':<15}")
+        print("-" * 60)
+        for name, errors in layer_errors.items():
+            print(f"{name:<30} {errors['absolute_error']:<15.6f} {errors['relative_error']:<15.2%}")
+
+    # Memory comparison
+    print("\n6. Memory Usage Comparison:")
+    print("=" * 80)
+
+    # Calculate original model size
+    orig_params = sum(p.numel() * p.element_size() for p in model.parameters())
+    orig_size_mb = orig_params / (1024 * 1024)
+
+    # Calculate quantized model size (approximation)
+    quant_params = 0
+    for name, module in quantized_model.named_modules():
+        if isinstance(module, AWQLinear):
+            # qweight is packed int4 (half the size)
+            quant_params += module.qweight.numel() * module.qweight.element_size()
+            # scales and zeros
+            quant_params += module.qscales.numel() * module.qscales.element_size()
+            quant_params += module.qzeros.numel() * module.qzeros.element_size()
+            # bias if present
+            if module.bias is not None:
+                quant_params += module.bias.numel() * module.bias.element_size()
+
+    quant_size_mb = quant_params / (1024 * 1024)
+    compression_ratio = orig_size_mb / quant_size_mb if quant_size_mb > 0 else 0
+
+    print(f"Original model size: {orig_size_mb:.2f} MB")
+    print(f"Quantized model size: {quant_size_mb:.2f} MB")
+    print(f"Compression ratio: {compression_ratio:.2f}x")
+
+    print("\n" + "=" * 80)
+    print("Comparison completed!")
+
+    return {
+        'original_output': original_output,
+        'quantized_output': quantized_output,
+        'layer_errors': layer_errors,
+        'compression_ratio': compression_ratio
+    }
+
+
+if __name__ == "__main__":
+    # Run the comparison
+    results = compare_awq_with_linear()
+
+    # Additional analysis if needed
+    print("\n\nAdditional Analysis:")
+    print("=" * 80)
+
+    # Check if CUDA is available for better performance
+    if not torch.cuda.is_available():
+        print("Note: Running on CPU. CUDA is required for AWQLinear to work properly.")
+        print("      Triton kernels do not support CPU execution.")
+
+    # Success criteria
+    if results['quantized_output'] is not None:
+        mean_rel_error = ((results['original_output'] - results['quantized_output']).abs() /
+                          (results['original_output'].abs() + 1e-6)).mean().item()
+
+        if mean_rel_error < 0.05:  # Less than 5% error
+            print("✓ Quantization successful! Error is within acceptable range.")
+        else:
+            print("⚠ Warning: Quantization error is higher than expected.")
+
+    if results['compression_ratio'] > 0:
+        print(f"\nCompression achieved: {results['compression_ratio']:.2f}x")
+        print("This means the quantized model uses approximately "
+              f"{100 / results['compression_ratio']:.1f}% of the original model's memory.")
\ No newline at end of file
diff --git a/tests/test_gptq.py b/tests/quant/test_gptq.py
similarity index 100%
rename from tests/test_gptq.py
rename to tests/quant/test_gptq.py

From 67905cac784f7cc985eb91f7f6c39f85254870a9 Mon Sep 17 00:00:00 2001
From: TATAXIMU <tataximu@gmail.com>
Date: Thu, 24 Jul 2025 22:16:49 +0930
Subject: [PATCH 33/33] update awq test

---
 test.py                       | 317 ----------------------------------
 tests/quant/test_AWQLinear.py |   5 +-
 tests/quant/test_awq.py       | 317 ----------------------------------
 3 files changed, 3 insertions(+), 636 deletions(-)
 delete mode 100644 test.py
 delete mode 100644 tests/quant/test_awq.py

diff --git a/test.py b/test.py
deleted file mode 100644
index dfd0e2b..0000000
--- a/test.py
+++ /dev/null
@@ -1,317 +0,0 @@
-import torch
-import torch.nn as nn
-import numpy as np
-from typing import Dict, Tuple
-import sys
-import os
-
-# Add the path to access lite_llama modules
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-
-from lite_llama.kernels.awq_linear import AWQLinear
-from lite_llama.quantization.awq import AWQ
-from lite_llama.quantization.quant_config import AWQConfig
-from lite_llama.quantization.utils import pack_weight, unpack_weight
-
-
-class DummyModel(nn.Module):
-    """A simple model with multiple linear layers for testing"""
-
-    def __init__(self, hidden_size=768, num_layers=3):
-        super().__init__()
-        self.layers = nn.ModuleList([
-            nn.Linear(hidden_size, hidden_size, bias=True)
-            for _ in range(num_layers)
-        ])
-        self.final = nn.Linear(hidden_size, hidden_size // 2, bias=True)
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-            x = torch.relu(x)
-        x = self.final(x)
-        return x
-
-
-def quantize_weight_manual(weight: torch.Tensor, wbits: int = 4, groupsize: int = 128) -> Tuple[
-    torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Manually quantize weight to ensure correct packing"""
-    assert weight.ndim == 2
-    rows, cols = weight.shape
-    device = weight.device
-
-    maxq = 2 ** wbits - 1
-
-    # Calculate number of groups
-    if groupsize == -1 or groupsize >= cols:
-        groupsize = cols
-    num_groups = (cols + groupsize - 1) // groupsize
-
-    # Initialize tensors
-    qweight_unpacked = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
-    qzeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
-    qscales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
-
-    # Quantize each group
-    for g in range(num_groups):
-        start_col = g * groupsize
-        end_col = min((g + 1) * groupsize, cols)
-
-        # Get weight group
-        w_group = weight[:, start_col:end_col]
-
-        # Compute min/max per row
-        w_min = w_group.min(dim=1, keepdim=True)[0]
-        w_max = w_group.max(dim=1, keepdim=True)[0]
-
-        # Compute scale and zero point
-        scale = (w_max - w_min).clamp(min=1e-5) / maxq
-        zero = torch.round(-w_min / scale).clamp(0, maxq)
-
-        # Quantize
-        q = torch.clamp(torch.round(w_group / scale + zero), 0, maxq)
-
-        # Store
-        qweight_unpacked[:, start_col:end_col] = q.to(torch.uint8)
-        qscales[:, g] = scale.squeeze(1)
-        qzeros[:, g] = zero.squeeze(1)
-
-    # Pack the weights
-    qweight_packed = pack_weight(qweight_unpacked)
-
-    print(f"      Unpacked shape: {qweight_unpacked.shape} -> Packed shape: {qweight_packed.shape}")
-
-    return qweight_packed, qzeros, qscales
-
-
-def compare_awq_with_linear():
-    """Compare outputs between nn.Linear and AWQLinear"""
-
-    print("=" * 80)
-    print("AWQ Linear vs nn.Linear Comparison")
-    print("=" * 80)
-
-    # Configuration
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    if device.type == "cpu":
-        print("WARNING: AWQLinear uses Triton kernels which require CUDA.")
-        print("         The demo may fail on CPU. Please use a CUDA-enabled device.")
-        print("         Attempting to continue anyway...")
-
-    batch_size = 4
-    seq_len = 128
-    hidden_size = 768
-    group_size = 128
-    wbits = 4
-
-    print(f"\nConfiguration:")
-    print(f"Device: {device}")
-    print(f"Batch size: {batch_size}")
-    print(f"Sequence length: {seq_len}")
-    print(f"Hidden size: {hidden_size}")
-    print(f"Quantization bits: {wbits}")
-    print(f"Group size: {group_size}")
-
-    # Create dummy model
-    print("\n1. Creating dummy model...")
-    model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
-    model.eval()
-
-    # Create test input
-    test_input = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16, device=device)
-
-    # Get original output
-    print("\n2. Getting original model output...")
-    with torch.no_grad():
-        original_output = model(test_input)
-    print(f"Original output shape: {original_output.shape}")
-
-    # Create quantized model
-    print("\n3. Creating quantized model...")
-    quantized_model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
-
-    # Quantize and replace each linear layer
-    layer_errors = {}
-
-    for (orig_name, orig_module), (quant_name, quant_module) in zip(
-            model.named_modules(), quantized_model.named_modules()
-    ):
-        if isinstance(orig_module, nn.Linear):
-            print(f"\n   Quantizing layer: {orig_name}")
-
-            # Get original weight
-            orig_weight = orig_module.weight.data
-            print(f"   Original weight shape: {orig_weight.shape}")
-
-            # Manually quantize to ensure correct packing
-            qweight, qzeros, qscales = quantize_weight_manual(orig_weight, wbits=wbits, groupsize=group_size)
-
-            print(f"   Quantized weight packed shape: {qweight.shape}")
-            print(f"   Scales shape: {qscales.shape}")
-            print(f"   Zeros shape: {qzeros.shape}")
-
-            # Create AWQLinear layer manually
-            awq_layer = AWQLinear(
-                in_features=orig_module.in_features,
-                out_features=orig_module.out_features,
-                bias=orig_module.bias is not None,
-                group_size=group_size,
-                wbits=wbits
-            ).to(device)
-
-            # Copy quantized parameters
-            with torch.no_grad():
-                awq_layer.qweight.copy_(qweight)
-                awq_layer.qscales.copy_(qscales)
-                awq_layer.qzeros.copy_(qzeros)
-
-                if orig_module.bias is not None:
-                    awq_layer.bias.copy_(orig_module.bias.to(torch.float16))
-
-            # Replace the layer in quantized model
-            parent_name = quant_name.rsplit('.', 1)[0] if '.' in quant_name else ''
-            child_name = quant_name.rsplit('.', 1)[1] if '.' in quant_name else quant_name
-
-            if parent_name:
-                parent = quantized_model
-                for part in parent_name.split('.'):
-                    parent = getattr(parent, part)
-                setattr(parent, child_name, awq_layer)
-            else:
-                setattr(quantized_model, child_name, awq_layer)
-
-            # Test individual layer error
-            with torch.no_grad():
-                test_layer_input = torch.randn(batch_size, seq_len, orig_module.in_features,
-                                               dtype=torch.float16, device=device)
-                orig_layer_output = orig_module(test_layer_input)
-
-                try:
-                    quant_layer_output = awq_layer(test_layer_input)
-
-                    layer_error = (orig_layer_output - quant_layer_output).abs().mean().item()
-                    layer_rel_error = layer_error / (orig_layer_output.abs().mean().item() + 1e-6)
-
-                    layer_errors[orig_name] = {
-                        'absolute_error': layer_error,
-                        'relative_error': layer_rel_error
-                    }
-
-                    print(f"   Layer absolute error: {layer_error:.6f}")
-                    print(f"   Layer relative error: {layer_rel_error:.2%}")
-                except Exception as e:
-                    print(f"   ERROR testing layer: {e}")
-                    if device.type == "cpu":
-                        print("   This is expected on CPU as Triton kernels require CUDA")
-
-    # Get quantized output
-    print("\n4. Getting quantized model output...")
-    quantized_model.eval()
-
-    try:
-        with torch.no_grad():
-            quantized_output = quantized_model(test_input)
-        print(f"Quantized output shape: {quantized_output.shape}")
-
-        # Compare outputs
-        print("\n5. Comparing outputs...")
-        print("=" * 80)
-
-        # Compute errors
-        absolute_error = (original_output - quantized_output).abs()
-        relative_error = absolute_error / (original_output.abs() + 1e-6)
-
-        print(f"\nOutput Statistics:")
-        print(f"Original output - Mean: {original_output.mean().item():.6f}, "
-              f"Std: {original_output.std().item():.6f}")
-        print(f"Quantized output - Mean: {quantized_output.mean().item():.6f}, "
-              f"Std: {quantized_output.std().item():.6f}")
-
-        print(f"\nError Metrics:")
-        print(f"Mean Absolute Error: {absolute_error.mean().item():.6f}")
-        print(f"Max Absolute Error: {absolute_error.max().item():.6f}")
-        print(f"Mean Relative Error: {relative_error.mean().item():.2%}")
-        print(f"Max Relative Error: {relative_error.max().item():.2%}")
-
-    except Exception as e:
-        print(f"\nERROR during quantized model forward pass: {e}")
-        if device.type == "cpu":
-            print("This is expected on CPU as AWQLinear requires CUDA for Triton kernels")
-        quantized_output = None
-
-    # Per-layer error summary (if we have any)
-    if layer_errors:
-        print("\nPer-Layer Error Summary:")
-        print("-" * 60)
-        print(f"{'Layer Name':<30} {'Abs Error':<15} {'Rel Error':<15}")
-        print("-" * 60)
-        for name, errors in layer_errors.items():
-            print(f"{name:<30} {errors['absolute_error']:<15.6f} {errors['relative_error']:<15.2%}")
-
-    # Memory comparison
-    print("\n6. Memory Usage Comparison:")
-    print("=" * 80)
-
-    # Calculate original model size
-    orig_params = sum(p.numel() * p.element_size() for p in model.parameters())
-    orig_size_mb = orig_params / (1024 * 1024)
-
-    # Calculate quantized model size (approximation)
-    quant_params = 0
-    for name, module in quantized_model.named_modules():
-        if isinstance(module, AWQLinear):
-            # qweight is packed int4 (half the size)
-            quant_params += module.qweight.numel() * module.qweight.element_size()
-            # scales and zeros
-            quant_params += module.qscales.numel() * module.qscales.element_size()
-            quant_params += module.qzeros.numel() * module.qzeros.element_size()
-            # bias if present
-            if module.bias is not None:
-                quant_params += module.bias.numel() * module.bias.element_size()
-
-    quant_size_mb = quant_params / (1024 * 1024)
-    compression_ratio = orig_size_mb / quant_size_mb if quant_size_mb > 0 else 0
-
-    print(f"Original model size: {orig_size_mb:.2f} MB")
-    print(f"Quantized model size: {quant_size_mb:.2f} MB")
-    print(f"Compression ratio: {compression_ratio:.2f}x")
-
-    print("\n" + "=" * 80)
-    print("Comparison completed!")
-
-    return {
-        'original_output': original_output,
-        'quantized_output': quantized_output,
-        'layer_errors': layer_errors,
-        'compression_ratio': compression_ratio
-    }
-
-
-if __name__ == "__main__":
-    # Run the comparison
-    results = compare_awq_with_linear()
-
-    # Additional analysis if needed
-    print("\n\nAdditional Analysis:")
-    print("=" * 80)
-
-    # Check if CUDA is available for better performance
-    if not torch.cuda.is_available():
-        print("Note: Running on CPU. CUDA is required for AWQLinear to work properly.")
-        print("      Triton kernels do not support CPU execution.")
-
-    # Success criteria
-    if results['quantized_output'] is not None:
-        mean_rel_error = ((results['original_output'] - results['quantized_output']).abs() /
-                          (results['original_output'].abs() + 1e-6)).mean().item()
-
-        if mean_rel_error < 0.05:  # Less than 5% error
-            print("✓ Quantization successful! Error is within acceptable range.")
-        else:
-            print("⚠ Warning: Quantization error is higher than expected.")
-
-    if results['compression_ratio'] > 0:
-        print(f"\nCompression achieved: {results['compression_ratio']:.2f}x")
-        print("This means the quantized model uses approximately "
-              f"{100 / results['compression_ratio']:.1f}% of the original model's memory.")
\ No newline at end of file
diff --git a/tests/quant/test_AWQLinear.py b/tests/quant/test_AWQLinear.py
index 8e36ad1..e2efd5c 100644
--- a/tests/quant/test_AWQLinear.py
+++ b/tests/quant/test_AWQLinear.py
@@ -86,10 +86,11 @@ def quantize_linear_layer(self, linear_layer: nn.Linear, group_size: int = 128)
             state_dict["test_layer.bias"] = linear_layer.bias.data
 
         # Quantize using AWQ
+        from lite_llama.quantization.quant_config import AWQConfig
+        awq_config = AWQConfig()
         quantized_dict = quantize_awq(
             model_state_dict=state_dict,
-            wbits=4,
-            groupsize=group_size,
+            config=awq_config,
             target_layers=["test_layer.weight"],
             device=str(self.device)
         )
diff --git a/tests/quant/test_awq.py b/tests/quant/test_awq.py
deleted file mode 100644
index dfd0e2b..0000000
--- a/tests/quant/test_awq.py
+++ /dev/null
@@ -1,317 +0,0 @@
-import torch
-import torch.nn as nn
-import numpy as np
-from typing import Dict, Tuple
-import sys
-import os
-
-# Add the path to access lite_llama modules
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
-
-from lite_llama.kernels.awq_linear import AWQLinear
-from lite_llama.quantization.awq import AWQ
-from lite_llama.quantization.quant_config import AWQConfig
-from lite_llama.quantization.utils import pack_weight, unpack_weight
-
-
-class DummyModel(nn.Module):
-    """A simple model with multiple linear layers for testing"""
-
-    def __init__(self, hidden_size=768, num_layers=3):
-        super().__init__()
-        self.layers = nn.ModuleList([
-            nn.Linear(hidden_size, hidden_size, bias=True)
-            for _ in range(num_layers)
-        ])
-        self.final = nn.Linear(hidden_size, hidden_size // 2, bias=True)
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-            x = torch.relu(x)
-        x = self.final(x)
-        return x
-
-
-def quantize_weight_manual(weight: torch.Tensor, wbits: int = 4, groupsize: int = 128) -> Tuple[
-    torch.Tensor, torch.Tensor, torch.Tensor]:
-    """Manually quantize weight to ensure correct packing"""
-    assert weight.ndim == 2
-    rows, cols = weight.shape
-    device = weight.device
-
-    maxq = 2 ** wbits - 1
-
-    # Calculate number of groups
-    if groupsize == -1 or groupsize >= cols:
-        groupsize = cols
-    num_groups = (cols + groupsize - 1) // groupsize
-
-    # Initialize tensors
-    qweight_unpacked = torch.zeros((rows, cols), dtype=torch.uint8, device=device)
-    qzeros = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
-    qscales = torch.zeros((rows, num_groups), dtype=torch.float16, device=device)
-
-    # Quantize each group
-    for g in range(num_groups):
-        start_col = g * groupsize
-        end_col = min((g + 1) * groupsize, cols)
-
-        # Get weight group
-        w_group = weight[:, start_col:end_col]
-
-        # Compute min/max per row
-        w_min = w_group.min(dim=1, keepdim=True)[0]
-        w_max = w_group.max(dim=1, keepdim=True)[0]
-
-        # Compute scale and zero point
-        scale = (w_max - w_min).clamp(min=1e-5) / maxq
-        zero = torch.round(-w_min / scale).clamp(0, maxq)
-
-        # Quantize
-        q = torch.clamp(torch.round(w_group / scale + zero), 0, maxq)
-
-        # Store
-        qweight_unpacked[:, start_col:end_col] = q.to(torch.uint8)
-        qscales[:, g] = scale.squeeze(1)
-        qzeros[:, g] = zero.squeeze(1)
-
-    # Pack the weights
-    qweight_packed = pack_weight(qweight_unpacked)
-
-    print(f"      Unpacked shape: {qweight_unpacked.shape} -> Packed shape: {qweight_packed.shape}")
-
-    return qweight_packed, qzeros, qscales
-
-
-def compare_awq_with_linear():
-    """Compare outputs between nn.Linear and AWQLinear"""
-
-    print("=" * 80)
-    print("AWQ Linear vs nn.Linear Comparison")
-    print("=" * 80)
-
-    # Configuration
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    if device.type == "cpu":
-        print("WARNING: AWQLinear uses Triton kernels which require CUDA.")
-        print("         The demo may fail on CPU. Please use a CUDA-enabled device.")
-        print("         Attempting to continue anyway...")
-
-    batch_size = 4
-    seq_len = 128
-    hidden_size = 768
-    group_size = 128
-    wbits = 4
-
-    print(f"\nConfiguration:")
-    print(f"Device: {device}")
-    print(f"Batch size: {batch_size}")
-    print(f"Sequence length: {seq_len}")
-    print(f"Hidden size: {hidden_size}")
-    print(f"Quantization bits: {wbits}")
-    print(f"Group size: {group_size}")
-
-    # Create dummy model
-    print("\n1. Creating dummy model...")
-    model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
-    model.eval()
-
-    # Create test input
-    test_input = torch.randn(batch_size, seq_len, hidden_size, dtype=torch.float16, device=device)
-
-    # Get original output
-    print("\n2. Getting original model output...")
-    with torch.no_grad():
-        original_output = model(test_input)
-    print(f"Original output shape: {original_output.shape}")
-
-    # Create quantized model
-    print("\n3. Creating quantized model...")
-    quantized_model = DummyModel(hidden_size=hidden_size).to(device).to(torch.float16)
-
-    # Quantize and replace each linear layer
-    layer_errors = {}
-
-    for (orig_name, orig_module), (quant_name, quant_module) in zip(
-            model.named_modules(), quantized_model.named_modules()
-    ):
-        if isinstance(orig_module, nn.Linear):
-            print(f"\n   Quantizing layer: {orig_name}")
-
-            # Get original weight
-            orig_weight = orig_module.weight.data
-            print(f"   Original weight shape: {orig_weight.shape}")
-
-            # Manually quantize to ensure correct packing
-            qweight, qzeros, qscales = quantize_weight_manual(orig_weight, wbits=wbits, groupsize=group_size)
-
-            print(f"   Quantized weight packed shape: {qweight.shape}")
-            print(f"   Scales shape: {qscales.shape}")
-            print(f"   Zeros shape: {qzeros.shape}")
-
-            # Create AWQLinear layer manually
-            awq_layer = AWQLinear(
-                in_features=orig_module.in_features,
-                out_features=orig_module.out_features,
-                bias=orig_module.bias is not None,
-                group_size=group_size,
-                wbits=wbits
-            ).to(device)
-
-            # Copy quantized parameters
-            with torch.no_grad():
-                awq_layer.qweight.copy_(qweight)
-                awq_layer.qscales.copy_(qscales)
-                awq_layer.qzeros.copy_(qzeros)
-
-                if orig_module.bias is not None:
-                    awq_layer.bias.copy_(orig_module.bias.to(torch.float16))
-
-            # Replace the layer in quantized model
-            parent_name = quant_name.rsplit('.', 1)[0] if '.' in quant_name else ''
-            child_name = quant_name.rsplit('.', 1)[1] if '.' in quant_name else quant_name
-
-            if parent_name:
-                parent = quantized_model
-                for part in parent_name.split('.'):
-                    parent = getattr(parent, part)
-                setattr(parent, child_name, awq_layer)
-            else:
-                setattr(quantized_model, child_name, awq_layer)
-
-            # Test individual layer error
-            with torch.no_grad():
-                test_layer_input = torch.randn(batch_size, seq_len, orig_module.in_features,
-                                               dtype=torch.float16, device=device)
-                orig_layer_output = orig_module(test_layer_input)
-
-                try:
-                    quant_layer_output = awq_layer(test_layer_input)
-
-                    layer_error = (orig_layer_output - quant_layer_output).abs().mean().item()
-                    layer_rel_error = layer_error / (orig_layer_output.abs().mean().item() + 1e-6)
-
-                    layer_errors[orig_name] = {
-                        'absolute_error': layer_error,
-                        'relative_error': layer_rel_error
-                    }
-
-                    print(f"   Layer absolute error: {layer_error:.6f}")
-                    print(f"   Layer relative error: {layer_rel_error:.2%}")
-                except Exception as e:
-                    print(f"   ERROR testing layer: {e}")
-                    if device.type == "cpu":
-                        print("   This is expected on CPU as Triton kernels require CUDA")
-
-    # Get quantized output
-    print("\n4. Getting quantized model output...")
-    quantized_model.eval()
-
-    try:
-        with torch.no_grad():
-            quantized_output = quantized_model(test_input)
-        print(f"Quantized output shape: {quantized_output.shape}")
-
-        # Compare outputs
-        print("\n5. Comparing outputs...")
-        print("=" * 80)
-
-        # Compute errors
-        absolute_error = (original_output - quantized_output).abs()
-        relative_error = absolute_error / (original_output.abs() + 1e-6)
-
-        print(f"\nOutput Statistics:")
-        print(f"Original output - Mean: {original_output.mean().item():.6f}, "
-              f"Std: {original_output.std().item():.6f}")
-        print(f"Quantized output - Mean: {quantized_output.mean().item():.6f}, "
-              f"Std: {quantized_output.std().item():.6f}")
-
-        print(f"\nError Metrics:")
-        print(f"Mean Absolute Error: {absolute_error.mean().item():.6f}")
-        print(f"Max Absolute Error: {absolute_error.max().item():.6f}")
-        print(f"Mean Relative Error: {relative_error.mean().item():.2%}")
-        print(f"Max Relative Error: {relative_error.max().item():.2%}")
-
-    except Exception as e:
-        print(f"\nERROR during quantized model forward pass: {e}")
-        if device.type == "cpu":
-            print("This is expected on CPU as AWQLinear requires CUDA for Triton kernels")
-        quantized_output = None
-
-    # Per-layer error summary (if we have any)
-    if layer_errors:
-        print("\nPer-Layer Error Summary:")
-        print("-" * 60)
-        print(f"{'Layer Name':<30} {'Abs Error':<15} {'Rel Error':<15}")
-        print("-" * 60)
-        for name, errors in layer_errors.items():
-            print(f"{name:<30} {errors['absolute_error']:<15.6f} {errors['relative_error']:<15.2%}")
-
-    # Memory comparison
-    print("\n6. Memory Usage Comparison:")
-    print("=" * 80)
-
-    # Calculate original model size
-    orig_params = sum(p.numel() * p.element_size() for p in model.parameters())
-    orig_size_mb = orig_params / (1024 * 1024)
-
-    # Calculate quantized model size (approximation)
-    quant_params = 0
-    for name, module in quantized_model.named_modules():
-        if isinstance(module, AWQLinear):
-            # qweight is packed int4 (half the size)
-            quant_params += module.qweight.numel() * module.qweight.element_size()
-            # scales and zeros
-            quant_params += module.qscales.numel() * module.qscales.element_size()
-            quant_params += module.qzeros.numel() * module.qzeros.element_size()
-            # bias if present
-            if module.bias is not None:
-                quant_params += module.bias.numel() * module.bias.element_size()
-
-    quant_size_mb = quant_params / (1024 * 1024)
-    compression_ratio = orig_size_mb / quant_size_mb if quant_size_mb > 0 else 0
-
-    print(f"Original model size: {orig_size_mb:.2f} MB")
-    print(f"Quantized model size: {quant_size_mb:.2f} MB")
-    print(f"Compression ratio: {compression_ratio:.2f}x")
-
-    print("\n" + "=" * 80)
-    print("Comparison completed!")
-
-    return {
-        'original_output': original_output,
-        'quantized_output': quantized_output,
-        'layer_errors': layer_errors,
-        'compression_ratio': compression_ratio
-    }
-
-
-if __name__ == "__main__":
-    # Run the comparison
-    results = compare_awq_with_linear()
-
-    # Additional analysis if needed
-    print("\n\nAdditional Analysis:")
-    print("=" * 80)
-
-    # Check if CUDA is available for better performance
-    if not torch.cuda.is_available():
-        print("Note: Running on CPU. CUDA is required for AWQLinear to work properly.")
-        print("      Triton kernels do not support CPU execution.")
-
-    # Success criteria
-    if results['quantized_output'] is not None:
-        mean_rel_error = ((results['original_output'] - results['quantized_output']).abs() /
-                          (results['original_output'].abs() + 1e-6)).mean().item()
-
-        if mean_rel_error < 0.05:  # Less than 5% error
-            print("✓ Quantization successful! Error is within acceptable range.")
-        else:
-            print("⚠ Warning: Quantization error is higher than expected.")
-
-    if results['compression_ratio'] > 0:
-        print(f"\nCompression achieved: {results['compression_ratio']:.2f}x")
-        print("This means the quantized model uses approximately "
-              f"{100 / results['compression_ratio']:.1f}% of the original model's memory.")
\ No newline at end of file