From 66fdd68ad6bcd0a86605f23a55824ca609da2751 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 20 Feb 2025 16:37:19 +0800 Subject: [PATCH 01/15] initial --- .../llm/example/GPU/DeepSeek-R1/generate.py | 317 ++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 python/llm/example/GPU/DeepSeek-R1/generate.py diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py new file mode 100644 index 00000000000..2c14163cd80 --- /dev/null +++ b/python/llm/example/GPU/DeepSeek-R1/generate.py @@ -0,0 +1,317 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple, Union +import warnings +import os + +import torch +from torch import nn +import torch.nn.functional as F +import time +import argparse +import ipex_llm + +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.models.common import scaled_dot_product_attention +from transformers import AutoTokenizer, GenerationConfig +from transformers.cache_utils import Cache, DynamicCache +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13 +from transformers.utils.import_utils import is_torch_fx_available + + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + + +deepseek_prompt = """ +A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . User: Question: If \( a > 1 \), then the sum of the real solutions of \( \sqrt{a} - \sqrt{a + x} = x \) is equal to:. 
Assistant: +""" + +def convert_forward_to_xpu(m, target_m, new_forward): + # print(m.__class__.__name__) + if m.__class__.__name__ == target_m: + bound_method = new_forward.__get__(m, m.__class__) + setattr(m, "forward", bound_method) + # m = m.to(device="xpu", dtype=torch.float16) + for _, sub_m in m.named_children(): + convert_forward_to_xpu(sub_m, target_m, new_forward) + + +def hybrid_DeepseekV3MoE_forward(self, hidden_states): + # convert1_start = time.time() + hidden_states = hidden_states.to(device="cpu")#, dtype=torch.bfloat16) + # convert1_end = time.time() + # moe_start = time.time() + identity = hidden_states + orig_shape = hidden_states.shape + topk_idx, topk_weight = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + if not self.training: + y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape) + if self.config.n_shared_experts is not None: + y = y + self.shared_experts(identity) + # moe_end = time.time() + # convert2_start = time.time() + y = y.to(device="xpu")#, dtype=torch.float16) + # convert2_end = time.time() + # print("convert to cpu time: ", (convert1_end - convert1_start)*1000) + # print("moe time: ", (moe_end - moe_start) * 1000) + # print("convert to xpu time: ", (convert2_end - convert2_start) * 1000) + return y + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def hybrid_DeepseekV3Attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split( + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + ) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + attn_weights = None + # TODO: pad value_states [1, 128, 130, 128] to query/key_states [1, 128, 130, 192] for correctness + attn_output = scaled_dot_product_attention( + query_states, key_states, value_states, + attention_mask, q_len == kv_seq_len, scale=self.softmax_scale + ) + + # attn_weights = ( + # torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale + # ) + # + # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + # raise ValueError( + # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + # f" {attn_weights.size()}" + # ) + # assert attention_mask is not None + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + # attn_weights = attn_weights + attention_mask + # + # # upcast attention to fp32 + # attn_weights = nn.functional.softmax( + # attn_weights, dim=-1, dtype=torch.float32 + # ).to(query_states.dtype) + # attn_weights = nn.functional.dropout( + # attn_weights, p=self.attention_dropout, training=self.training + # ) + # attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", + help='The huggingface repo id for the Llama2 (e.g. 
`meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--prompt', type=str, default="What is AI?", + help='Prompt to infer') + parser.add_argument('--n-predict', type=int, default=32, + help='Max tokens to predict') + parser.add_argument('--load-path', type=str, default=None, + help='The path to load the low-bit model.') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + load_path = args.load_path + if load_path: + model = AutoModelForCausalLM.load_low_bit(load_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(load_path, + trust_remote_code=True) + else: + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # model = model.bfloat16() + print(model) + convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) + convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + for i in range(0, model.config.num_hidden_layers): + model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) + model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) + model.model.layers[i].post_attention_layernorm = model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16) + if i < model.config.first_k_dense_replace: + model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16) + # else: + # model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16) + # model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16) + model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16) + model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16) + model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16) + from ipex_llm.transformers.models.common import rms_norm_forward + from ipex_llm.transformers.models.common import mlp_silu_forward + convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward) + convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward) + + print("load completed") + # model = BenchmarkWrapper(model, do_print=True) + # Generate predicted tokens + with torch.inference_mode(): + prompt = deepseek_prompt + input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + # ipex_llm model needs a warmup, then inference time can be accurate + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + + # start inference + st = time.time() + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + torch.xpu.synchronize() + end = time.time() + output = output.cpu() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + print(f'Inference time: {end-st} s') + print('-'*20, 'Prompt', '-'*20) + print(prompt) + print('-'*20, 'Output', '-'*20) + print(output_str) From 74969ceac6f8e8811b67c2c4b263faec4bf7d6e6 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Thu, 20 Feb 2025 18:30:22 +0800 Subject: [PATCH 02/15] update benchmark --- .../llm/example/GPU/DeepSeek-R1/generate.py | 87 +++++++++++-------- 1 file changed, 49 insertions(+), 38 
deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py index 2c14163cd80..e1496cec994 100644 --- a/python/llm/example/GPU/DeepSeek-R1/generate.py +++ b/python/llm/example/GPU/DeepSeek-R1/generate.py @@ -17,16 +17,21 @@ from typing import List, Optional, Tuple, Union import warnings import os +import numpy as np import torch from torch import nn import torch.nn.functional as F import time import argparse -import ipex_llm from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.convert import convert_forward from ipex_llm.transformers.models.common import scaled_dot_product_attention +from ipex_llm.transformers.models.common import rms_norm_forward +from ipex_llm.transformers.models.common import mlp_silu_forward +from ipex_llm.utils.benchmark_util_deepseek import BenchmarkWrapper + from transformers import AutoTokenizer, GenerationConfig from transformers.cache_utils import Cache, DynamicCache from torch.nn import CrossEntropyLoss @@ -36,19 +41,13 @@ from transformers.utils.import_utils import is_torch_fx_available -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. -# It means that the function will not be traced through and simply appear as a node in the graph. -if is_torch_fx_available(): - if not is_torch_greater_or_equal_than_1_13: - import torch.fx - - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - -deepseek_prompt = """ -A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . User: Question: If \( a > 1 \), then the sum of the real solutions of \( \sqrt{a} - \sqrt{a + x} = x \) is equal to:. Assistant: +PROMPT_FORMAT = """ +A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . +User: {prompt}. +Assistant: """ + def convert_forward_to_xpu(m, target_m, new_forward): # print(m.__class__.__name__) if m.__class__.__name__ == target_m: @@ -224,11 +223,11 @@ def hybrid_DeepseekV3Attention_forward( # ) # attn_output = torch.matmul(attn_weights, value_states) - if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" - f" {attn_output.size()}" - ) + # if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): + # raise ValueError( + # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" + # f" {attn_output.size()}" + # ) attn_output = attn_output.transpose(1, 2).contiguous() @@ -247,12 +246,16 @@ def hybrid_DeepseekV3Attention_forward( parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", help='The huggingface repo id for the Llama2 (e.g. 
`meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded' ', or the path to the huggingface checkpoint folder') - parser.add_argument('--prompt', type=str, default="What is AI?", + parser.add_argument('--prompt', type=str, default="If \( a > 1 \), then the sum of the real solutions of \( \sqrt{a} - \sqrt{a + x} = x \) is equal to:", help='Prompt to infer') parser.add_argument('--n-predict', type=int, default=32, help='Max tokens to predict') parser.add_argument('--load-path', type=str, default=None, help='The path to load the low-bit model.') + parser.add_argument('--warm-up', type=int, default=1, + help='Num of warm-up trials.') + parser.add_argument('--num-trials', type=int, default=1, + help='Num of trials to run.') args = parser.parse_args() model_path = args.repo_id_or_model_path @@ -274,7 +277,7 @@ def hybrid_DeepseekV3Attention_forward( # model = model.bfloat16() print(model) convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) - convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) for i in range(0, model.config.num_hidden_layers): model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) @@ -287,31 +290,39 @@ def hybrid_DeepseekV3Attention_forward( model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16) model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16) model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16) - from ipex_llm.transformers.models.common import rms_norm_forward - from ipex_llm.transformers.models.common import mlp_silu_forward convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward) convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward) print("load completed") - # model = BenchmarkWrapper(model, do_print=True) + model = BenchmarkWrapper(model) + e2e_time_list = [] + prefill_time_list = [] + rest_cost_mean_list = [] + # Generate predicted tokens with torch.inference_mode(): - prompt = deepseek_prompt - input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu') + prompt = PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt").to("xpu") # ipex_llm model needs a warmup, then inference time can be accurate - output = model.generate(input_ids, - max_new_tokens=args.n_predict) + for i in range(args.warm_up): + output = model.generate(input_ids, + max_new_tokens=args.n_predict, + min_new_tokens=args.n_predict) # start inference - st = time.time() - output = model.generate(input_ids, - max_new_tokens=args.n_predict) - torch.xpu.synchronize() - end = time.time() - output = output.cpu() - output_str = tokenizer.decode(output[0], skip_special_tokens=True) - print(f'Inference time: {end-st} s') - print('-'*20, 'Prompt', '-'*20) - print(prompt) - print('-'*20, 'Output', '-'*20) - print(output_str) + for i in range(args.num_trials): + st = time.time() + output = model.generate(input_ids, + max_new_tokens=args.n_predict, + min_new_tokens=args.n_predict) + torch.xpu.synchronize() + end = time.time() + output = output.cpu() + e2e_time_list.append(end - st) + prefill_time_list.append(model.first_cost) + rest_cost_mean_list.append(model.rest_cost_mean) + + print('-' * 20, 
'Performance', '-' * 20) + print(f"End-to-end time: {np.mean(e2e_time_list)} s") + print(f"Prefill time: {np.mean(prefill_time_list)} s") + print(f"Rest cost mean: {np.mean(rest_cost_mean_list) * 1000} ms") From fa6c0d99aaff3f905c4cf6689adb358c1bbf02f6 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 24 Feb 2025 17:10:51 +0800 Subject: [PATCH 03/15] try sdpa --- python/llm/example/GPU/DeepSeek-R1/generate.py | 10 ++++++---- python/llm/src/ipex_llm/transformers/convert.py | 3 +++ .../llm/src/ipex_llm/transformers/models/minicpm3.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py index e1496cec994..8e6a659301f 100644 --- a/python/llm/example/GPU/DeepSeek-R1/generate.py +++ b/python/llm/example/GPU/DeepSeek-R1/generate.py @@ -157,12 +157,12 @@ def hybrid_DeepseekV3Attention_forward( k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) kv = ( self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.q_head_dim) .transpose(1, 2) ) k_nope, value_states = torch.split( - kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + kv, [self.qk_nope_head_dim, self.q_head_dim], dim=-1 ) kv_seq_len = value_states.shape[-2] if past_key_value is not None: @@ -191,11 +191,13 @@ def hybrid_DeepseekV3Attention_forward( ) attn_weights = None - # TODO: pad value_states [1, 128, 130, 128] to query/key_states [1, 128, 130, 192] for correctness + # import pdb + # breakpoint() attn_output = scaled_dot_product_attention( query_states, key_states, value_states, attention_mask, q_len == kv_seq_len, scale=self.softmax_scale ) + attn_output = attn_output[:, :, :, :self.v_head_dim] # attn_weights = ( # torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale @@ -277,7 +279,7 @@ def hybrid_DeepseekV3Attention_forward( # model = model.bfloat16() print(model) convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) - # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) for i in range(0, model.config.num_hidden_layers): model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index bbc7547a8a5..3b069c5e18f 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -1012,6 +1012,9 @@ def _optimize_pre(model, qtype=None): model.apply(pre_compute_inv_freq) from ipex_llm.transformers.models.minicpm3 import padding_v_head_dim model.apply(padding_v_head_dim) + elif model.config.model_type == "deepseek_v3": + from ipex_llm.transformers.models.minicpm3 import padding_v_head_dim + model.apply(padding_v_head_dim) elif model.config.model_type == "minicpmv": from ipex_llm.transformers.models.minicpmv import merge_qkv model.vpm.apply(merge_qkv) diff --git a/python/llm/src/ipex_llm/transformers/models/minicpm3.py b/python/llm/src/ipex_llm/transformers/models/minicpm3.py index 03e45912a58..d73033277e2 100644 --- 
a/python/llm/src/ipex_llm/transformers/models/minicpm3.py +++ b/python/llm/src/ipex_llm/transformers/models/minicpm3.py @@ -24,7 +24,7 @@ def pre_compute_inv_freq(module: torch.nn.Module): def padding_v_head_dim(module: torch.nn.Module): - if module.__class__.__name__ == "MiniCPMAttention": + if module.__class__.__name__ == "MiniCPMAttention" or module.__class__.__name__ == "DeepseekV3Attention": k_head_dim = module.q_head_dim v_head_dim = module.v_head_dim invalidInputError(k_head_dim >= v_head_dim, From addf7e20c37f480abf7c900b5d94455954475aeb Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 24 Feb 2025 18:34:24 +0800 Subject: [PATCH 04/15] finish sdpa --- python/llm/example/GPU/DeepSeek-R1/generate.py | 9 ++++++--- python/llm/src/ipex_llm/transformers/convert.py | 6 +++--- python/llm/src/ipex_llm/transformers/models/minicpm3.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py index 8e6a659301f..0e784dd51c1 100644 --- a/python/llm/example/GPU/DeepSeek-R1/generate.py +++ b/python/llm/example/GPU/DeepSeek-R1/generate.py @@ -157,12 +157,12 @@ def hybrid_DeepseekV3Attention_forward( k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) kv = ( self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.q_head_dim) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) .transpose(1, 2) ) k_nope, value_states = torch.split( - kv, [self.qk_nope_head_dim, self.q_head_dim], dim=-1 + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 ) kv_seq_len = value_states.shape[-2] if past_key_value is not None: @@ -186,8 +186,11 @@ def hybrid_DeepseekV3Attention_forward( key_states[:, :, :, self.qk_nope_head_dim :] = k_pe if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + padded_value_states = torch.zeros([value_states.shape[0], value_states.shape[1], value_states.shape[2], key_states.shape[-1]], + dtype=value_states.dtype, device=value_states.device) + padded_value_states[:, :, :, :value_states.shape[-1]] = value_states key_states, value_states = past_key_value.update( - key_states, value_states, self.layer_idx, cache_kwargs + key_states, padded_value_states, self.layer_idx, cache_kwargs ) attn_weights = None diff --git a/python/llm/src/ipex_llm/transformers/convert.py b/python/llm/src/ipex_llm/transformers/convert.py index 3b069c5e18f..299324485e3 100644 --- a/python/llm/src/ipex_llm/transformers/convert.py +++ b/python/llm/src/ipex_llm/transformers/convert.py @@ -1012,9 +1012,9 @@ def _optimize_pre(model, qtype=None): model.apply(pre_compute_inv_freq) from ipex_llm.transformers.models.minicpm3 import padding_v_head_dim model.apply(padding_v_head_dim) - elif model.config.model_type == "deepseek_v3": - from ipex_llm.transformers.models.minicpm3 import padding_v_head_dim - model.apply(padding_v_head_dim) + # elif model.config.model_type == "deepseek_v3": + # from ipex_llm.transformers.models.minicpm3 import padding_v_head_dim + # model.apply(padding_v_head_dim) elif model.config.model_type == "minicpmv": from ipex_llm.transformers.models.minicpmv import merge_qkv model.vpm.apply(merge_qkv) diff --git a/python/llm/src/ipex_llm/transformers/models/minicpm3.py b/python/llm/src/ipex_llm/transformers/models/minicpm3.py index d73033277e2..03e45912a58 100644 --- a/python/llm/src/ipex_llm/transformers/models/minicpm3.py +++ 
b/python/llm/src/ipex_llm/transformers/models/minicpm3.py @@ -24,7 +24,7 @@ def pre_compute_inv_freq(module: torch.nn.Module): def padding_v_head_dim(module: torch.nn.Module): - if module.__class__.__name__ == "MiniCPMAttention" or module.__class__.__name__ == "DeepseekV3Attention": + if module.__class__.__name__ == "MiniCPMAttention": k_head_dim = module.q_head_dim v_head_dim = module.v_head_dim invalidInputError(k_head_dim >= v_head_dim, From 6e114f433bd41ef67f155ce418dff27797b7a14a Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Mon, 24 Feb 2025 20:17:11 +0800 Subject: [PATCH 05/15] breakdown initial --- .../llm/example/GPU/DeepSeek-R1/breakdown.py | 320 ++++++++++++++++++ 1 file changed, 320 insertions(+) create mode 100644 python/llm/example/GPU/DeepSeek-R1/breakdown.py diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py new file mode 100644 index 00000000000..f4ddd1e59bd --- /dev/null +++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py @@ -0,0 +1,320 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple, Union +import warnings +import os +import numpy as np + +import torch +from torch import nn +import torch.nn.functional as F +import time +import argparse + +from ipex_llm.transformers import AutoModelForCausalLM +from ipex_llm.transformers.convert import convert_forward +from ipex_llm.transformers.models.common import scaled_dot_product_attention +from ipex_llm.transformers.models.common import rms_norm_forward +from ipex_llm.transformers.models.common import mlp_silu_forward +from ipex_llm.utils.benchmark_util_deepseek import BenchmarkWrapper + +from transformers import AutoTokenizer, GenerationConfig +from transformers.cache_utils import Cache, DynamicCache +from torch.nn import CrossEntropyLoss +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13 +from transformers.utils.import_utils import is_torch_fx_available + + +PROMPT_FORMAT = """ +A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . +User: {prompt}. 
+Assistant: +""" + + +def convert_forward_to_xpu(m, target_m, new_forward): + # print(m.__class__.__name__) + if m.__class__.__name__ == target_m: + bound_method = new_forward.__get__(m, m.__class__) + setattr(m, "forward", bound_method) + # m = m.to(device="xpu", dtype=torch.float16) + for _, sub_m in m.named_children(): + convert_forward_to_xpu(sub_m, target_m, new_forward) + + +def hybrid_DeepseekV3MoE_forward(self, hidden_states): + # convert1_start = time.time() + hidden_states = hidden_states.to(device="cpu")#, dtype=torch.bfloat16) + # convert1_end = time.time() + # moe_start = time.time() + identity = hidden_states + orig_shape = hidden_states.shape + topk_idx, topk_weight = self.gate(hidden_states) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + if not self.training: + y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(*orig_shape) + if self.config.n_shared_experts is not None: + y = y + self.shared_experts(identity) + # moe_end = time.time() + # convert2_start = time.time() + y = y.to(device="xpu")#, dtype=torch.float16) + # convert2_end = time.time() + # print("convert to cpu time: ", (convert1_end - convert1_start)*1000) + # print("moe time: ", (moe_end - moe_start) * 1000) + # print("convert to xpu time: ", (convert2_end - convert2_start) * 1000) + return y + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + + b, h, s, d = q.shape + q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + b, h, s, d = k.shape + k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def hybrid_DeepseekV3Attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + if self.q_lora_rank is None: + q = self.q_proj(hidden_states) + else: + q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))) + q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2) + q_nope, q_pe = torch.split( + q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1 + ) + + compressed_kv = self.kv_a_proj_with_mqa(hidden_states) + compressed_kv, k_pe = torch.split( + compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1 + ) + k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2) + kv = ( + self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) + .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + .transpose(1, 2) + ) + + k_nope, value_states = torch.split( + kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1 + ) + kv_seq_len = value_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids) + + query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + query_states[:, :, :, : self.qk_nope_head_dim] = q_nope + query_states[:, :, :, self.qk_nope_head_dim :] = q_pe + + key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim) + key_states[:, :, :, : self.qk_nope_head_dim] = k_nope + key_states[:, :, :, self.qk_nope_head_dim :] = k_pe + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + padded_value_states = torch.zeros([value_states.shape[0], value_states.shape[1], value_states.shape[2], key_states.shape[-1]], + dtype=value_states.dtype, device=value_states.device) + padded_value_states[:, :, :, :value_states.shape[-1]] = value_states + key_states, value_states = past_key_value.update( + key_states, padded_value_states, self.layer_idx, cache_kwargs + ) + + attn_weights = None + # import pdb + # breakpoint() + attn_output = scaled_dot_product_attention( + query_states, key_states, value_states, + attention_mask, q_len == kv_seq_len, scale=self.softmax_scale + ) + attn_output = attn_output[:, :, :, :self.v_head_dim] + + # attn_weights = ( + # torch.matmul(query_states, key_states.transpose(2, 3)) * self.softmax_scale + # ) + # + # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + # raise ValueError( + # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + # f" {attn_weights.size()}" + # ) + # assert attention_mask is not None + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + # attn_weights = attn_weights + attention_mask + # + # # upcast attention to fp32 + # attn_weights = nn.functional.softmax( + # attn_weights, dim=-1, dtype=torch.float32 + # ).to(query_states.dtype) + # attn_weights = nn.functional.dropout( + # attn_weights, p=self.attention_dropout, training=self.training + # ) + # attn_output = torch.matmul(attn_weights, value_states) + + # if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim): + # raise ValueError( + # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is" + # f" {attn_output.size()}" + # ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +def do_benchmark(layer, input_data, num_warmup=3, num_trials=10): + for i in range(num_warmup): + layer(input_data) + + start_time = time.time() + for i in range(num_trials): + output = layer(input_data) + end_time = time.time() + average = (end_time-start_time)*1000 / num_trials + print("{} latency: {} ms".format(str(layer), average)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model') + parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf", + help='The huggingface repo id for the Llama2 (e.g. 
`meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Llama-2-13b-chat-hf`) to be downloaded' + ', or the path to the huggingface checkpoint folder') + parser.add_argument('--load-path', type=str, default=None, + help='The path to load the low-bit model.') + parser.add_argument('--warm-up', type=int, default=3, + help='Num of warm-up trials.') + parser.add_argument('--num-trials', type=int, default=10, + help='Num of trials to run.') + + args = parser.parse_args() + model_path = args.repo_id_or_model_path + + load_path = args.load_path + if load_path: + model = AutoModelForCausalLM.load_low_bit(load_path, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(load_path, + trust_remote_code=True) + else: + model = AutoModelForCausalLM.from_pretrained(model_path, + load_in_4bit=True, + optimize_model=True, + trust_remote_code=True, + use_cache=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # model = model.bfloat16() + print(model) + convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) + convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + for i in range(0, model.config.num_hidden_layers): + model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) + model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) + model.model.layers[i].post_attention_layernorm = model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16) + if i < model.config.first_k_dense_replace: + model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16) + # else: + # model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16) + # model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16) + model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16) + model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16) + model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16) + convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward) + convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward) + + # device = "cpu" + device = "xpu" + embed = model.model.embed_tokens + input_ids = torch.tensor([[1128]]).to(device) + do_benchmark(embed, input_ids, args.warm_up, args.num_trials) + + norm = model.model.norm + hidden_states = torch.randn(1, 1, 7168).to(device) + do_benchmark(norm, hidden_states, args.warm_up, args.num_trials) + + lm_head = model.lm_head + do_benchmark(lm_head, hidden_states, args.warm_up, args.num_trials) From 5f50f361d8bf279bb3178a1fd00ab7f0726acab1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 25 Feb 2025 11:15:30 +0800 Subject: [PATCH 06/15] update --- .../llm/example/GPU/DeepSeek-R1/breakdown.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py index f4ddd1e59bd..973eb0b1b3f 100644 --- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py +++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py @@ -246,13 +246,17 @@ def hybrid_DeepseekV3Attention_forward( return attn_output, attn_weights, past_key_value -def do_benchmark(layer, input_data, num_warmup=3, num_trials=10): +def do_benchmark(layer, num_warmup=3, 
num_trials=10, device="xpu", **kwargs): for i in range(num_warmup): - layer(input_data) + layer(**kwargs) + if device == "xpu": + torch.xpu.synchronize() start_time = time.time() for i in range(num_trials): - output = layer(input_data) + output = layer(**kwargs) + if device == "xpu": + torch.xpu.synchronize() end_time = time.time() average = (end_time-start_time)*1000 / num_trials print("{} latency: {} ms".format(str(layer), average)) @@ -310,11 +314,13 @@ def do_benchmark(layer, input_data, num_warmup=3, num_trials=10): device = "xpu" embed = model.model.embed_tokens input_ids = torch.tensor([[1128]]).to(device) - do_benchmark(embed, input_ids, args.warm_up, args.num_trials) + do_benchmark(embed, args.warm_up, args.num_trials, device, input=input_ids) norm = model.model.norm hidden_states = torch.randn(1, 1, 7168).to(device) - do_benchmark(norm, hidden_states, args.warm_up, args.num_trials) + do_benchmark(norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states) lm_head = model.lm_head - do_benchmark(lm_head, hidden_states, args.warm_up, args.num_trials) + do_benchmark(lm_head, args.warm_up, args.num_trials, device, x=hidden_states) + + # dense_decoder = model.model.layers[0] From 0e0d73a963e02f02c27ed433f13fa18f335b7d24 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 25 Feb 2025 16:04:23 +0800 Subject: [PATCH 07/15] add --- .../llm/example/GPU/DeepSeek-R1/breakdown.py | 46 +++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py index 973eb0b1b3f..d5a040bc828 100644 --- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py +++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py @@ -259,7 +259,32 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs): torch.xpu.synchronize() end_time = time.time() average = (end_time-start_time)*1000 / num_trials - print("{} latency: {} ms".format(str(layer), average)) + print("{} latency: {} ms".format(layer.__class__.__name__, average)) + + +# kvcache will increment after each run, can't reuse the same input to run multiple trials +def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): + hidden_states = torch.randn(1, 1, 7168).to(device) + kv_seq_length = 128 + 1000 # Simulate the last few tokens of 128-1024 + past_key = torch.randn(1, 128, kv_seq_length, 192).to(device) + past_value = torch.randn(1, 128, kv_seq_length, 128).to(device) # Not padded + past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value)]) + total_time = 0 + for i in range(num_warmup+num_trials): + position_ids = torch.tensor([[kv_seq_length]]).to(device) + attention_mask = torch.zeros([1, 1, 1, kv_seq_length + 1]).to(device) + start_time = time.time() + output = layer(hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, + past_key_value=past_key_values, use_cache=True) + if device == "xpu": + torch.xpu.synchronize() + end_time = time.time() + kv_seq_length += 1 + if i >= num_warmup: + total_time += (end_time-start_time) + # print((end_time-start_time)*1000) + average = total_time * 1000 / num_trials + print("{} latency: {} ms".format(layer.__class__.__name__, average)) if __name__ == '__main__': @@ -294,7 +319,7 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs): # model = model.bfloat16() print(model) convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) - convert_forward_to_xpu(model.model, 
"DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) for i in range(0, model.config.num_hidden_layers): model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) @@ -323,4 +348,19 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs): lm_head = model.lm_head do_benchmark(lm_head, args.warm_up, args.num_trials, device, x=hidden_states) - # dense_decoder = model.model.layers[0] + dense_decoder = model.model.layers[0] + do_benchmark_attn(dense_decoder, args.warm_up, args.num_trials, device) + self_attn = model.model.layers[0].self_attn + do_benchmark_attn(self_attn, args.warm_up, args.num_trials, device) + + mlp = model.model.layers[0].mlp + do_benchmark(mlp, args.warm_up, args.num_trials, device, x=hidden_states) + input_norm = model.model.layers[0].input_layernorm + do_benchmark(input_norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states) + post_norm = model.model.layers[0].post_attention_layernorm + do_benchmark(post_norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states) + + moe_decoder = model.model.layers[1] + do_benchmark_attn(moe_decoder, args.warm_up, args.num_trials, device) + moe = model.model.layers[1].mlp + do_benchmark(moe, args.warm_up, args.num_trials, device, hidden_states=hidden_states) \ No newline at end of file From 0830a5b43ad2fc0e14878c4cf7eb65c919b90dc9 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 25 Feb 2025 16:11:30 +0800 Subject: [PATCH 08/15] update --- python/llm/example/GPU/DeepSeek-R1/breakdown.py | 4 ++-- python/llm/example/GPU/DeepSeek-R1/generate.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py index d5a040bc828..a3d079dd9b8 100644 --- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py +++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py @@ -268,7 +268,7 @@ def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): kv_seq_length = 128 + 1000 # Simulate the last few tokens of 128-1024 past_key = torch.randn(1, 128, kv_seq_length, 192).to(device) past_value = torch.randn(1, 128, kv_seq_length, 128).to(device) # Not padded - past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value)]) + past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value), (past_key, past_value)]) # kv for 2 layers total_time = 0 for i in range(num_warmup+num_trials): position_ids = torch.tensor([[kv_seq_length]]).to(device) @@ -362,5 +362,5 @@ def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): moe_decoder = model.model.layers[1] do_benchmark_attn(moe_decoder, args.warm_up, args.num_trials, device) - moe = model.model.layers[1].mlp + moe = model.model.layers[1].mlp # including cpu/xpu data conversion do_benchmark(moe, args.warm_up, args.num_trials, device, hidden_states=hidden_states) \ No newline at end of file diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py index 0e784dd51c1..3ad401b55ef 100644 --- a/python/llm/example/GPU/DeepSeek-R1/generate.py +++ b/python/llm/example/GPU/DeepSeek-R1/generate.py @@ -282,7 +282,7 @@ def hybrid_DeepseekV3Attention_forward( # model = model.bfloat16() print(model) 
convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) - convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) + # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) for i in range(0, model.config.num_hidden_layers): model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) From 759757853bf7398927aee1674616fdc1fff7a16f Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Tue, 25 Feb 2025 17:28:26 +0800 Subject: [PATCH 09/15] update --- .../llm/example/GPU/DeepSeek-R1/breakdown.py | 82 ++++++++++++------- 1 file changed, 52 insertions(+), 30 deletions(-) diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py index a3d079dd9b8..6e637a193df 100644 --- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py +++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py @@ -260,19 +260,24 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs): end_time = time.time() average = (end_time-start_time)*1000 / num_trials print("{} latency: {} ms".format(layer.__class__.__name__, average)) + return average # kvcache will increment after each run, can't reuse the same input to run multiple trials -def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): - hidden_states = torch.randn(1, 1, 7168).to(device) +def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device="xpu"): kv_seq_length = 128 + 1000 # Simulate the last few tokens of 128-1024 past_key = torch.randn(1, 128, kv_seq_length, 192).to(device) past_value = torch.randn(1, 128, kv_seq_length, 128).to(device) # Not padded + if device == "cpu": + past_key = past_key.bfloat16() + past_value = past_value.bfloat16() past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value), (past_key, past_value)]) # kv for 2 layers total_time = 0 for i in range(num_warmup+num_trials): position_ids = torch.tensor([[kv_seq_length]]).to(device) attention_mask = torch.zeros([1, 1, 1, kv_seq_length + 1]).to(device) + if device == "cpu": + attention_mask = attention_mask.bfloat16() start_time = time.time() output = layer(hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_values, use_cache=True) @@ -285,6 +290,7 @@ def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): # print((end_time-start_time)*1000) average = total_time * 1000 / num_trials print("{} latency: {} ms".format(layer.__class__.__name__, average)) + return average if __name__ == '__main__': @@ -316,51 +322,67 @@ def do_benchmark_attn(layer, num_warmup=3, num_trials=10, device="xpu"): tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - # model = model.bfloat16() print(model) - convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward) - # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward) - for i in range(0, model.config.num_hidden_layers): - model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16) - model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16) - model.model.layers[i].post_attention_layernorm = 
model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16)
-        if i < model.config.first_k_dense_replace:
-            model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16)
-        # else:
-        #     model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16)
-        #     model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16)
-    model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16)
-    model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16)
-    model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
-    convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
-    convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
     # device = "cpu"
     device = "xpu"
+    input_ids = torch.tensor([[1128]])
+    hidden_states = torch.randn(1, 1, 7168)
+    if device == "xpu":
+        convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward)
+        # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward)
+        for i in range(0, model.config.num_hidden_layers):
+            model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16)
+            model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16)
+            model.model.layers[i].post_attention_layernorm = model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16)
+            if i < model.config.first_k_dense_replace:
+                model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16)
+            # else:
+            #     model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16)
+            #     model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16)
+        model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16)
+        model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16)
+        model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
+        convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
+        convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
+    else:  # cpu
+        model = model.bfloat16()
+        hidden_states = hidden_states.bfloat16()
+    input_ids = input_ids.to(device)
+    hidden_states = hidden_states.to(device)
 
+    # Breakdown of e2e
     embed = model.model.embed_tokens
-    input_ids = torch.tensor([[1128]]).to(device)
-    do_benchmark(embed, args.warm_up, args.num_trials, device, input=input_ids)
+    embed_time = do_benchmark(embed, args.warm_up, args.num_trials, device, input=input_ids)
+
+    dense_decoder = model.model.layers[0]
+    dense_decoder_time = do_benchmark_attn(dense_decoder, hidden_states, args.warm_up, args.num_trials, device)
+
+    moe_decoder = model.model.layers[1]
+    moe_decoder_time = do_benchmark_attn(moe_decoder, hidden_states, args.warm_up, args.num_trials, device)
 
     norm = model.model.norm
-    hidden_states = torch.randn(1, 1, 7168).to(device)
-    do_benchmark(norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
+    norm_time = do_benchmark(norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
 
     lm_head = model.lm_head
-    do_benchmark(lm_head, args.warm_up, args.num_trials, device, x=hidden_states)
+    lm_head_time = do_benchmark(lm_head, args.warm_up, args.num_trials, device, x=hidden_states)
 
-    dense_decoder = model.model.layers[0]
-    do_benchmark_attn(dense_decoder, args.warm_up, args.num_trials, device)
+    total_time = embed_time + dense_decoder_time + moe_decoder_time + norm_time + lm_head_time
+    print("Overall latency: {} ms".format(total_time))
+    print("==================")
+
+    # Breakdown of decoder layer
     self_attn = model.model.layers[0].self_attn
-    do_benchmark_attn(self_attn, args.warm_up, args.num_trials, device)
+    do_benchmark_attn(self_attn, hidden_states, args.warm_up, args.num_trials, device)
 
     mlp = model.model.layers[0].mlp
     do_benchmark(mlp, args.warm_up, args.num_trials, device, x=hidden_states)
+
     input_norm = model.model.layers[0].input_layernorm
     do_benchmark(input_norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
+
     post_norm = model.model.layers[0].post_attention_layernorm
     do_benchmark(post_norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
 
-    moe_decoder = model.model.layers[1]
-    do_benchmark_attn(moe_decoder, args.warm_up, args.num_trials, device)
     moe = model.model.layers[1].mlp  # including cpu/xpu data conversion
-    do_benchmark(moe, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
\ No newline at end of file
+    do_benchmark(moe, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
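
Note: the breakdown above sums one instance of each timed component (embedding, one dense decoder layer, one MoE decoder layer, final norm, lm_head). A rough whole-model per-token estimate could scale the two decoder-layer numbers by the corresponding layer counts. The helper below is only an illustrative sketch under that assumption; `estimate_decode_latency` and its arguments are not part of the patch.

# Illustrative sketch: extrapolate the printed component latencies (in ms) to a
# whole-model decode estimate, assuming `config` exposes `num_hidden_layers`
# and `first_k_dense_replace` as used elsewhere in this script.
def estimate_decode_latency(config, embed_ms, dense_layer_ms, moe_layer_ms, norm_ms, lm_head_ms):
    num_dense = config.first_k_dense_replace          # leading dense decoder layers
    num_moe = config.num_hidden_layers - num_dense    # remaining MoE decoder layers
    return (embed_ms
            + num_dense * dense_layer_ms
            + num_moe * moe_layer_ms
            + norm_ms
            + lm_head_ms)
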
From fed3f87db179dbba071bbe799e12d7ce56eb1191 Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Tue, 25 Feb 2025 18:52:25 +0800
Subject: [PATCH 10/15] update

---
 .../llm/example/GPU/DeepSeek-R1/breakdown.py  | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
index 6e637a193df..80d51231380 100644
--- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py
+++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
@@ -252,20 +252,24 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs):
 
     if device == "xpu":
         torch.xpu.synchronize()
-    start_time = time.time()
+    total_time = 0
     for i in range(num_trials):
+        start_time = time.time()
         output = layer(**kwargs)
         if device == "xpu":
             torch.xpu.synchronize()
-    end_time = time.time()
-    average = (end_time-start_time)*1000 / num_trials
+        end_time = time.time()
+        total_time += (end_time - start_time)
+    average = total_time * 1000 / num_trials
     print("{} latency: {} ms".format(layer.__class__.__name__, average))
+    if device == "xpu":  # TODO: need to empty cache after each run?
+        torch.xpu.empty_cache()
     return average
 
 
 # kvcache will increment after each run, can't reuse the same input to run multiple trials
-def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device="xpu"):
-    kv_seq_length = 128 + 1000  # Simulate the last few tokens of 128-1024
+def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device="xpu"):
+    kv_seq_length = 128 - num_warmup  # Simulate the average of 128-128
     past_key = torch.randn(1, 128, kv_seq_length, 192).to(device)
     past_value = torch.randn(1, 128, kv_seq_length, 128).to(device)  # Not padded
     if device == "cpu":
@@ -290,6 +294,8 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device=
         # print((end_time-start_time)*1000)
     average = total_time * 1000 / num_trials
     print("{} latency: {} ms".format(layer.__class__.__name__, average))
+    if device == "xpu":
+        torch.xpu.empty_cache()
     return average
 
 
@@ -302,7 +308,7 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device=
                         help='The path to load the low-bit model.')
     parser.add_argument('--warm-up', type=int, default=3,
                         help='Num of warm-up trials.')
-    parser.add_argument('--num-trials', type=int, default=10,
+    parser.add_argument('--num-trials', type=int, default=128,
                         help='Num of trials to run.')
 
     args = parser.parse_args()
@@ -356,10 +362,10 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device=
     embed_time = do_benchmark(embed, args.warm_up, args.num_trials, device, input=input_ids)
 
     dense_decoder = model.model.layers[0]
-    dense_decoder_time = do_benchmark_attn(dense_decoder, hidden_states, args.warm_up, args.num_trials, device)
+    dense_decoder_time = do_benchmark_attn(dense_decoder, hidden_states, args.warm_up, num_trials=128, device=device)
 
     moe_decoder = model.model.layers[1]
-    moe_decoder_time = do_benchmark_attn(moe_decoder, hidden_states, args.warm_up, args.num_trials, device)
+    moe_decoder_time = do_benchmark_attn(moe_decoder, hidden_states, args.warm_up, num_trials=128, device=device)
 
     norm = model.model.norm
     norm_time = do_benchmark(norm, args.warm_up, args.num_trials, device, hidden_states=hidden_states)
@@ -373,7 +379,7 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=10, device=
 
     # Breakdown of decoder layer
     self_attn = model.model.layers[0].self_attn
-    do_benchmark_attn(self_attn, hidden_states, args.warm_up, args.num_trials, device)
+    do_benchmark_attn(self_attn, hidden_states, args.warm_up, num_trials=128, device=device)
 
     mlp = model.model.layers[0].mlp
     do_benchmark(mlp, args.warm_up, args.num_trials, device, x=hidden_states)
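
Note: PATCH 10 moves the timer inside the trial loop and synchronizes the XPU queue before each reading, so kernels queued by one trial cannot leak into the next trial's measurement. A minimal, self-contained sketch of the same pattern follows; `timed_call` is illustrative only and not part of the patch.

import time
import torch

def timed_call(fn, device="xpu", num_warmup=3, num_trials=10):
    # Warm-up runs are excluded from the average.
    for _ in range(num_warmup):
        fn()
    if device == "xpu":
        torch.xpu.synchronize()  # drain queued kernels before timing starts
    total = 0.0
    for _ in range(num_trials):
        start = time.time()
        fn()
        if device == "xpu":
            torch.xpu.synchronize()  # wait for this trial's kernels to finish
        total += time.time() - start
    return total * 1000 / num_trials  # average latency in ms
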
From 6190ba6ae6949a01229e63bf9720eda14422e435 Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Wed, 26 Feb 2025 10:17:13 +0800
Subject: [PATCH 11/15] update

---
 .../llm/example/GPU/DeepSeek-R1/breakdown.py  | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
index 80d51231380..9a4b138dc2e 100644
--- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py
+++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
@@ -270,21 +270,16 @@ def do_benchmark(layer, num_warmup=3, num_trials=10, device="xpu", **kwargs):
 # kvcache will increment after each run, can't reuse the same input to run multiple trials
 def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device="xpu"):
     kv_seq_length = 128 - num_warmup  # Simulate the average of 128-128
-    past_key = torch.randn(1, 128, kv_seq_length, 192).to(device)
-    past_value = torch.randn(1, 128, kv_seq_length, 128).to(device)  # Not padded
-    if device == "cpu":
-        past_key = past_key.bfloat16()
-        past_value = past_value.bfloat16()
+    past_key = torch.randn(1, 128, kv_seq_length, 192, dtype=hidden_states.dtype).to(device)
+    past_value = torch.randn(1, 128, kv_seq_length, 128, dtype=hidden_states.dtype).to(device)  # Not padded
     past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value), (past_key, past_value)])  # kv for 2 layers
     total_time = 0
     for i in range(num_warmup+num_trials):
         position_ids = torch.tensor([[kv_seq_length]]).to(device)
-        attention_mask = torch.zeros([1, 1, 1, kv_seq_length + 1]).to(device)
-        if device == "cpu":
-            attention_mask = attention_mask.bfloat16()
+        attention_mask = torch.zeros([1, 1, 1, kv_seq_length + 1], dtype=hidden_states.dtype).to(device)
         start_time = time.time()
         output = layer(hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids,
-                       past_key_value=past_key_values, use_cache=True)
+                       past_key_value=past_key_values, output_attention=False, use_cache=True)
         if device == "xpu":
             torch.xpu.synchronize()
         end_time = time.time()
@@ -332,7 +327,7 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device=
 
     # device = "cpu"
     device = "xpu"
-    input_ids = torch.tensor([[1128]])
+    input_ids = torch.tensor([[1128]]).to(device)
     hidden_states = torch.randn(1, 1, 7168)
     if device == "xpu":
         convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward)
@@ -351,10 +346,9 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device=
         model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
         convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
         convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
-    else:  # cpu
+    else:  # cpu, bf16
        model = model.bfloat16()
        hidden_states = hidden_states.bfloat16()
-    input_ids = input_ids.to(device)
     hidden_states = hidden_states.to(device)
 
     # Breakdown of e2e
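
Note: PATCH 11 derives the dtype of the synthetic KV cache and attention mask from `hidden_states` instead of special-casing the CPU/bfloat16 path. A small helper in the same spirit is sketched below; `make_attn_inputs` is illustrative only, and the 128-head / 192-vs-128 head-dim shapes simply mirror the dummy tensors used in this script.

import torch
from transformers.cache_utils import DynamicCache

def make_attn_inputs(hidden_states, kv_seq_length, num_heads=128, k_dim=192, v_dim=128):
    # All benchmark inputs follow the dtype and device of the hidden states.
    dtype, device = hidden_states.dtype, hidden_states.device
    past_key = torch.randn(1, num_heads, kv_seq_length, k_dim, dtype=dtype, device=device)
    past_value = torch.randn(1, num_heads, kv_seq_length, v_dim, dtype=dtype, device=device)
    # Same dummy KV for two layers, matching the benchmark above.
    past_key_values = DynamicCache.from_legacy_cache(
        [(past_key, past_value), (past_key, past_value)])
    position_ids = torch.tensor([[kv_seq_length]], device=device)
    attention_mask = torch.zeros(1, 1, 1, kv_seq_length + 1, dtype=dtype, device=device)
    return past_key_values, position_ids, attention_mask
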
From 49879b1eb15899728140d470eaa08c08cc8c496b Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Wed, 26 Feb 2025 16:21:43 +0800
Subject: [PATCH 12/15] update

---
 python/llm/example/GPU/DeepSeek-R1/breakdown.py          | 1 +
 python/llm/src/ipex_llm/transformers/models/minicpm3.py  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
index 9a4b138dc2e..5bf2e9532dc 100644
--- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py
+++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
@@ -344,6 +344,7 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device=
         model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16)
         model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16)
         model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
+        # hidden_states = hidden_states.half()
         convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
         convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
     else:  # cpu, bf16
diff --git a/python/llm/src/ipex_llm/transformers/models/minicpm3.py b/python/llm/src/ipex_llm/transformers/models/minicpm3.py
index 03e45912a58..e5a1ecc526e 100644
--- a/python/llm/src/ipex_llm/transformers/models/minicpm3.py
+++ b/python/llm/src/ipex_llm/transformers/models/minicpm3.py
@@ -24,7 +24,7 @@ def pre_compute_inv_freq(module: torch.nn.Module):
 
 
 def padding_v_head_dim(module: torch.nn.Module):
-    if module.__class__.__name__ == "MiniCPMAttention":
+    if module.__class__.__name__ == "MiniCPMAttention":  # or module.__class__.__name__ == "DeepseekV3Attention":
         k_head_dim = module.q_head_dim
         v_head_dim = module.v_head_dim
         invalidInputError(k_head_dim >= v_head_dim,
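
Note: the `padding_v_head_dim` hunk hints at reusing the MiniCPM3 path for `DeepseekV3Attention`, whose value head dim (128) is smaller than the key/query head dim (192). The padding logic itself is not shown in this hunk; the snippet below is only a guess at the general idea behind the helper's name, i.e. padding V up to the K head dim so both can share one cache layout. It is not the repository's implementation.

import torch.nn.functional as F

def pad_value_states(value_states, k_head_dim):
    # value_states: [batch, heads, seq, v_head_dim]; pad the last dim up to k_head_dim.
    v_head_dim = value_states.size(-1)
    if v_head_dim < k_head_dim:
        value_states = F.pad(value_states, [0, k_head_dim - v_head_dim])
    return value_states
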
From f7d7a5041d717e70bdff3ccab914c3b03ed16120 Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Wed, 26 Feb 2025 17:47:49 +0800
Subject: [PATCH 13/15] update generate

---
 .../llm/example/GPU/DeepSeek-R1/generate.py   | 90 +++++++++++++++----
 1 file changed, 71 insertions(+), 19 deletions(-)

diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py
index 3ad401b55ef..1caa93c0329 100644
--- a/python/llm/example/GPU/DeepSeek-R1/generate.py
+++ b/python/llm/example/GPU/DeepSeek-R1/generate.py
@@ -18,6 +18,7 @@
 import warnings
 import os
 import numpy as np
+import importlib.util
 
 import torch
 from torch import nn
@@ -30,6 +31,7 @@
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
 from ipex_llm.transformers.models.common import rms_norm_forward
 from ipex_llm.transformers.models.common import mlp_silu_forward
+from ipex_llm.transformers.kv import DynamicNormalCache
 from ipex_llm.utils.benchmark_util_deepseek import BenchmarkWrapper
 
 from transformers import AutoTokenizer, GenerationConfig
@@ -246,6 +248,38 @@ def hybrid_DeepseekV3Attention_forward(
     return attn_output, attn_weights, past_key_value
 
 
+def deepseek_model_forward_wrapper(origin_forward):
+    def deepseek_model_forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # IPEX-LLM OPT: kv cache
+        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+
+        return origin_forward(
+            self=self,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+    return deepseek_model_forward
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for Llama2 model')
     parser.add_argument('--repo-id-or-model-path', type=str, default="meta-llama/Llama-2-7b-chat-hf",
@@ -279,24 +313,34 @@ def hybrid_DeepseekV3Attention_forward(
 
     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-    # model = model.bfloat16()
     print(model)
-    convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward)
-    # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward)
-    for i in range(0, model.config.num_hidden_layers):
-        model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16)
-        model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16)
-        model.model.layers[i].post_attention_layernorm = model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16)
-        if i < model.config.first_k_dense_replace:
-            model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16)
-        # else:
-        #     model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16)
-        #     model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16)
-    model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16)
-    model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16)
-    model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
-    convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
-    convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
+
+    # device = "cpu"
+    device = "xpu"
+
+    # modeling_module_name = model.__class__.__module__
+    # module = importlib.import_module(modeling_module_name)
+    # deepseek_model_forward = deepseek_model_forward_wrapper(module.DeepseekV3Model.forward)
+    # convert_forward(model, module.DeepseekV3Model, deepseek_model_forward)
+    if device == "xpu":
+        convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward)
+        # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward)
+        for i in range(0, model.config.num_hidden_layers):
+            model.model.layers[i].input_layernorm = model.model.layers[i].input_layernorm.to(device="xpu")#, dtype=torch.float16)
+            model.model.layers[i].self_attn = model.model.layers[i].self_attn.to(device="xpu")#, dtype=torch.float16)
+            model.model.layers[i].post_attention_layernorm = model.model.layers[i].post_attention_layernorm.to(device="xpu")#, dtype=torch.float16)
+            if i < model.config.first_k_dense_replace:
+                model.model.layers[i].mlp = model.model.layers[i].mlp.to(device="xpu")#, dtype=torch.float16)
+            # else:
+            #     model.model.layers[i].mlp.gate = model.model.layers[i].mlp.gate.to(device="xpu", dtype=torch.float16)
+            #     model.model.layers[i].mlp.shared_experts = model.model.layers[i].mlp.shared_experts.to(device="xpu", dtype=torch.float16)
+        model.model.embed_tokens = model.model.embed_tokens.to(device="xpu")#, dtype=torch.float16)
+        model.model.norm = model.model.norm.to(device="xpu")#, dtype=torch.float16)
+        model.lm_head = model.lm_head.to(device="xpu")#, dtype=torch.float16)
+        convert_forward_to_xpu(model, "DeepseekV3RMSNorm", rms_norm_forward)
+        convert_forward_to_xpu(model, "DeepseekV3MLP", mlp_silu_forward)
+    else:
+        model = model.bfloat16()
 
     print("load completed")
     model = BenchmarkWrapper(model)
@@ -307,7 +351,9 @@ def hybrid_DeepseekV3Attention_forward(
     # Generate predicted tokens
     with torch.inference_mode():
         prompt = PROMPT_FORMAT.format(prompt=args.prompt)
-        input_ids = tokenizer.encode(prompt, return_tensors="pt").to("xpu")
+        input_ids = tokenizer.encode(prompt, return_tensors="pt")
+        if device == "xpu":
+            input_ids = input_ids.to("xpu")
         # ipex_llm model needs a warmup, then inference time can be accurate
         for i in range(args.warm_up):
            output = model.generate(input_ids,
@@ -320,7 +366,8 @@ def hybrid_DeepseekV3Attention_forward(
             output = model.generate(input_ids,
                                     max_new_tokens=args.n_predict,
                                     min_new_tokens=args.n_predict)
-            torch.xpu.synchronize()
+            if device == "xpu":
+                torch.xpu.synchronize()
             end = time.time()
             output = output.cpu()
             e2e_time_list.append(end - st)
@@ -331,3 +378,8 @@ def hybrid_DeepseekV3Attention_forward(
     print(f"End-to-end time: {np.mean(e2e_time_list)} s")
     print(f"Prefill time: {np.mean(prefill_time_list)} s")
     print(f"Rest cost mean: {np.mean(rest_cost_mean_list) * 1000} ms")
+    output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+    print('-'*20, 'Prompt', '-'*20)
+    print(prompt)
+    print('-'*20, 'Output', '-'*20)
+    print(output_str)
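
Note: PATCH 13 adds `deepseek_model_forward_wrapper` but leaves the hook-up commented out. If one wanted to enable it inside generate.py, the commented lines suggest something along these lines. This is only a sketch: it assumes `convert_forward` is importable from `ipex_llm.transformers.convert` (the import is not shown in this patch), and it reuses names defined in the script.

import importlib
from ipex_llm.transformers.convert import convert_forward  # assumed import path

def enable_deepseek_model_forward(model):
    # Mirrors the commented-out lines in generate.py: patch DeepseekV3Model.forward
    # so the legacy kv cache is converted to DynamicNormalCache.
    modeling_module_name = model.__class__.__module__
    module = importlib.import_module(modeling_module_name)
    patched_forward = deepseek_model_forward_wrapper(module.DeepseekV3Model.forward)
    convert_forward(model, module.DeepseekV3Model, patched_forward)
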
From 25787807ac0c3aaa7e1424ee5092e8982c76f9ca Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Wed, 26 Feb 2025 19:24:21 +0800
Subject: [PATCH 14/15] update

---
 .../llm/example/GPU/DeepSeek-R1/generate.py    |  4 +++-
 .../src/ipex_llm/transformers/models/utils.py  | 22 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/python/llm/example/GPU/DeepSeek-R1/generate.py b/python/llm/example/GPU/DeepSeek-R1/generate.py
index 1caa93c0329..078272b98de 100644
--- a/python/llm/example/GPU/DeepSeek-R1/generate.py
+++ b/python/llm/example/GPU/DeepSeek-R1/generate.py
@@ -262,7 +262,8 @@ def deepseek_model_forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         # IPEX-LLM OPT: kv cache
-        past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
+        if use_cache and not isinstance(past_key_values, DynamicNormalCache):
+            past_key_values = DynamicNormalCache.from_legacy_cache(past_key_values)
 
         return origin_forward(
             self=self,
@@ -322,6 +323,7 @@ def deepseek_model_forward(
     # module = importlib.import_module(modeling_module_name)
     # deepseek_model_forward = deepseek_model_forward_wrapper(module.DeepseekV3Model.forward)
     # convert_forward(model, module.DeepseekV3Model, deepseek_model_forward)
+    # model = model.bfloat16()
     if device == "xpu":
         convert_forward_to_xpu(model.model, "DeepseekV3MoE", hybrid_DeepseekV3MoE_forward)
         # convert_forward_to_xpu(model.model, "DeepseekV3Attention", hybrid_DeepseekV3Attention_forward)
diff --git a/python/llm/src/ipex_llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py
index 0e3e897c975..0a912460b92 100644
--- a/python/llm/src/ipex_llm/transformers/models/utils.py
+++ b/python/llm/src/ipex_llm/transformers/models/utils.py
@@ -41,7 +41,7 @@ def init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, d
                                     max_length, head_dim,
                                     dtype=dtype, device=device)
     value_cache_storage = torch.empty(batch_size, num_heads,
-                                      max_length, head_dim,
+                                      max_length, 128,
                                       dtype=dtype, device=device)
 
     key_cache = key_cache_storage.as_strided((batch_size, num_heads,
@@ -49,7 +49,7 @@ def init_kv_cache(batch_size, num_heads, head_dim, current_length, max_length, d
                                               key_cache_storage.stride(), storage_offset=0)
     value_cache = value_cache_storage.as_strided((batch_size, num_heads,
-                                                  current_length, head_dim),
+                                                  current_length, 128),
                                                   value_cache_storage.stride(), storage_offset=0)
     return key_cache, value_cache
 
@@ -63,14 +63,18 @@ def extend_kv_cache(batch_size, num_heads, head_dim, current_length, max_length,
 
 
 def append_kv_cache(cache_k, cache_v, key_states, value_states):
-    new_size = (cache_k.size(0),
-                cache_k.size(1),
-                cache_k.size(2) + key_states.size(2),
-                cache_k.size(3))
-    new_cache_k = cache_k.as_strided(new_size, cache_k.stride(), storage_offset=0)
+    new_k_size = (cache_k.size(0),
+                  cache_k.size(1),
+                  cache_k.size(2) + key_states.size(2),
+                  cache_k.size(3))
+    new_v_size = (cache_v.size(0),
+                  cache_v.size(1),
+                  cache_v.size(2) + value_states.size(2),
+                  cache_v.size(3))
+    new_cache_k = cache_k.as_strided(new_k_size, cache_k.stride(), storage_offset=0)
     new_cache_k[:, :, cache_k.size(2):cache_k.size(2) + key_states.size(2), :] = key_states
-    new_cache_v = cache_v.as_strided(new_size, cache_v.stride(), storage_offset=0)
-    new_cache_v[:, :, cache_v.size(2):cache_v.size(2) + key_states.size(2), :] = value_states
+    new_cache_v = cache_v.as_strided(new_v_size, cache_v.stride(), storage_offset=0)
+    new_cache_v[:, :, cache_v.size(2):cache_v.size(2) + value_states.size(2), :] = value_states
 
     return new_cache_k, new_cache_v
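
Note: PATCH 14 lets the kv-cache helpers grow K and V caches whose head dims differ (192 for keys vs. 128 for values in the DeepSeek MLA layout used here). A toy shape check of the patched helpers is sketched below; the exact parameter order of `init_kv_cache` follows the hunk above, and the shapes are illustrative only.

import torch
from ipex_llm.transformers.models.utils import init_kv_cache, append_kv_cache

# Allocate a cache with room to grow (current_length=16, max_length=32).
# With the patched init_kv_cache, the value cache's last dim is 128 even when head_dim=192.
cache_k, cache_v = init_kv_cache(1, 128, 192, 16, 32, torch.float32, "cpu")

key_states = torch.randn(1, 128, 1, 192)    # key head dim 192
value_states = torch.randn(1, 128, 1, 128)  # value head dim 128 (differs from key)
cache_k, cache_v = append_kv_cache(cache_k, cache_v, key_states, value_states)
print(cache_k.shape, cache_v.shape)  # expected: (1, 128, 17, 192) and (1, 128, 17, 128)
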
From a012584dcef5ecb070e0798cb0da795254263668 Mon Sep 17 00:00:00 2001
From: Kai Huang
Date: Wed, 26 Feb 2025 20:39:12 +0800
Subject: [PATCH 15/15] add dynamic normal cache

---
 python/llm/example/GPU/DeepSeek-R1/breakdown.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/llm/example/GPU/DeepSeek-R1/breakdown.py b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
index 5bf2e9532dc..4f5dffcc0bf 100644
--- a/python/llm/example/GPU/DeepSeek-R1/breakdown.py
+++ b/python/llm/example/GPU/DeepSeek-R1/breakdown.py
@@ -30,6 +30,7 @@
 from ipex_llm.transformers.models.common import scaled_dot_product_attention
 from ipex_llm.transformers.models.common import rms_norm_forward
 from ipex_llm.transformers.models.common import mlp_silu_forward
+from ipex_llm.transformers.kv import DynamicNormalCache
 from ipex_llm.utils.benchmark_util_deepseek import BenchmarkWrapper
 
 from transformers import AutoTokenizer, GenerationConfig
@@ -272,6 +273,9 @@ def do_benchmark_attn(layer, hidden_states, num_warmup=3, num_trials=128, device=
     kv_seq_length = 128 - num_warmup  # Simulate the average of 128-128
     past_key = torch.randn(1, 128, kv_seq_length, 192, dtype=hidden_states.dtype).to(device)
     past_value = torch.randn(1, 128, kv_seq_length, 128, dtype=hidden_states.dtype).to(device)  # Not padded
+    # past_key_values = DynamicNormalCache()
+    # past_key_values.update(past_key, past_value, 0)
+    # past_key_values.update(past_key, past_value, 1)
     past_key_values = DynamicCache.from_legacy_cache([(past_key, past_value), (past_key, past_value)])  # kv for 2 layers
     total_time = 0
     for i in range(num_warmup+num_trials):
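
Note: the commented-out lines in PATCH 15 sketch how the benchmark could feed IPEX-LLM's `DynamicNormalCache` instead of Hugging Face's `DynamicCache`. Based only on those comments (the `update(key, value, layer_idx)` call is assumed to mirror the transformers cache API), the swap would look roughly like the following.

from ipex_llm.transformers.kv import DynamicNormalCache

# Build the two-layer dummy cache with IPEX-LLM's cache class instead of DynamicCache.
past_key_values = DynamicNormalCache()
past_key_values.update(past_key, past_value, 0)  # layer 0
past_key_values.update(past_key, past_value, 1)  # layer 1
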