
Commit 8ed81a4

xwang365 authored and Ubuntu committed
support batch_size>1
clean
1 parent 700ff84 commit 8ed81a4


5 files changed: +365 -97 lines


medusa/inference/inference_test.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+# Adapted from: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/cli.py
+"""
+Chat with a model with command line interface.
+
+Usage:
+python3 -m medusa.inference.cli --model <model_name_or_path>
+Other commands:
+- Type "!!exit" or an empty line to exit.
+- Type "!!reset" to start a new conversation.
+- Type "!!remove" to remove the last prompt.
+- Type "!!regen" to regenerate the last message.
+- Type "!!save <filename>" to save the conversation history to a json file.
+- Type "!!load <filename>" to load a conversation history from a json file.
+"""
+import argparse
+import os
+import re
+import sys
+import torch
+from fastchat.serve.cli import SimpleChatIO, RichChatIO, ProgrammaticChatIO
+from fastchat.model.model_adapter import get_conversation_template
+from fastchat.conversation import get_conv_template
+import json
+from medusa.model.medusa_model import MedusaModel
+import pdb
+
+def main(args):
+    prefix = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {0} ASSISTANT:"
+    # prompt = ["你叫什么名字"]  # single-prompt variant: "What is your name?"
+    prompt = ["你叫什么名字", "中国的首都是哪里呢?"]  # "What is your name?", "What is the capital of China?"
+    prompt = [prefix.format(p) for p in prompt]
+    model = MedusaModel.from_pretrained(
+        args.model,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        device_map="auto",
+        load_in_8bit=args.load_in_8bit,
+        load_in_4bit=args.load_in_4bit,
+    )
+    tokenizer = model.get_tokenizer()
+    # Tokenize the whole batch of prompts at once, with padding
+    encoded_inputs = tokenizer(prompt, padding=True, truncation=True, return_tensors="pt")
+    # Move the encoded inputs to the model's device
+    input_ids = encoded_inputs['input_ids'].to(model.base_model.device)
+    attention_mask = encoded_inputs['attention_mask'].to(model.base_model.device)
+    for output in model.medusa_generate(
+        input_ids,
+        attention_mask=attention_mask,
+        temperature=args.temperature,
+        max_steps=args.max_steps,
+    ):
+        print(output['text'])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, required=True, help="Model name or path.")
+    parser.add_argument(
+        "--load-in-8bit", action="store_true", help="Use 8-bit quantization"
+    )
+    parser.add_argument(
+        "--load-in-4bit", action="store_true", help="Use 4-bit quantization"
+    )
+    parser.add_argument(
+        "--conv-template", type=str, default=None, help="Conversation prompt template."
+    )
+    parser.add_argument(
+        "--conv-system-msg", type=str, default=None, help="Conversation system message."
+    )
+    parser.add_argument("--temperature", type=float, default=0.7)
+    parser.add_argument("--max-steps", type=int, default=512)
+    parser.add_argument("--no-history", action="store_true")
+    parser.add_argument(
+        "--style",
+        type=str,
+        default="simple",
+        choices=["simple", "rich", "programmatic"],
+        help="Display style.",
+    )
+    parser.add_argument(
+        "--multiline",
+        action="store_true",
+        help="Enable multiline input. Use ESC+Enter for newline.",
+    )
+    parser.add_argument(
+        "--mouse",
+        action="store_true",
+        help="[Rich Style]: Enable mouse support for cursor positioning.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Print useful debug information (e.g., prompts)",
+    )
+    args = parser.parse_args()
+    main(args)
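
For context, medusa_generate now streams a dict whose "text" entry is a list of partial completions, one per batch element. Below is a minimal sketch of a caller that drains the stream and keeps only the final texts; it reuses only the API visible in this file (MedusaModel.from_pretrained, get_tokenizer, medusa_generate), and the checkpoint path and prompts are placeholders, not values from the commit.

import torch
from medusa.model.medusa_model import MedusaModel

model = MedusaModel.from_pretrained(
    "path/to/medusa-checkpoint",  # placeholder checkpoint path
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = model.get_tokenizer()
# Llama-style tokenizers often ship without a pad token; padding a batch needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

prompts = ["Tell me a joke.", "Name the largest planet in the solar system."]
enc = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")

final_texts = None
for step in model.medusa_generate(
    enc["input_ids"].to(model.base_model.device),
    attention_mask=enc["attention_mask"].to(model.base_model.device),
    temperature=0.7,
    max_steps=512,
):
    final_texts = step["text"]  # list with one partial completion per prompt

for prompt, completion in zip(prompts, final_texts or []):
    print(prompt, "->", completion)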

medusa/model/kv_cache.py

Lines changed: 33 additions & 16 deletions
@@ -1,5 +1,5 @@
 import torch
-
+import copy
 
 class KVCache:
     """
@@ -41,32 +41,51 @@ def copy(self, indices: torch.Tensor, prev_length: int, dim: int = 2):
 
         Args:
             indices (torch.Tensor): Indices of the data tensor to be copied.
-            prev_length (int): Previous length before adding new data.
+            prev_length (int): Previous lengths before adding new data.
             dim (int, optional): Dimension along which copying should be performed. Default is 2.
         """
+        # Select the data that needs to be copied
         tgt = self.data.index_select(dim, indices)
-        dst = self.data.narrow(dim, prev_length, tgt.shape[dim])
-        dst.copy_(tgt, non_blocking=True)
-        self.current_length.fill_(prev_length + tgt.shape[dim])
+        prev_len = prev_length
+        start_index = prev_len
+        end_index = start_index + tgt.shape[dim]
+        # Pick the destination region along the requested dimension and copy the data in
+        if dim == 2:
+            dst = self.data[:, :, :, start_index:end_index, :]
+        elif dim == 3:
+            dst = self.data[:, :, :, :, start_index:end_index]
+        else:
+            raise ValueError("Unsupported dimension for copying.")
+        dst.copy_(tgt[:, :], non_blocking=True)
+        self.current_length.fill_(prev_length + tgt.shape[dim])
 
     def cat(self, tensor: torch.Tensor, dim: int = 2):
         """
-        Concatenate the given tensor with the current data.
+        Concatenate the given tensor with the current data for batch_size > 1, and return the tensor
+        truncated to the maximum current length across all batches.
 
         Args:
-            tensor (torch.Tensor): The tensor to be concatenated.
+            tensor (torch.Tensor): The tensor to be concatenated, assuming the first dimension is the batch size.
             dim (int, optional): The dimension along which concatenation should be done. Default is 2.
 
         Returns:
-            torch.Tensor: The data tensor after concatenation up to the current length.
+            torch.Tensor: The data tensor after concatenation and truncation to the maximum current length.
         """
-        dst = self.data.narrow(dim, self.current_length, tensor.shape[dim])
-        dst.copy_(tensor)
+        cur_len = copy.deepcopy(self.current_length)
+        new_len = cur_len + tensor.size(dim)
         self.current_length.add_(tensor.shape[dim])
-        return torch.narrow(self.data, 2, 0, self.current_length)
-
-
-def initialize_past_key_values(model):
+        if dim == 2:
+            self.data[:, :, cur_len:new_len, :] = tensor[:, :, :, :]
+            truncated_data = self.data[:, :, :self.current_length, :]
+        elif dim == 3:
+            self.data[:, :, :, cur_len:new_len] = tensor[:, :, :, :]
+            truncated_data = self.data[:, :, :, :self.current_length]
+        else:
+            raise ValueError("Unsupported dimension for concatenation.")
+        return truncated_data
+
+
+def initialize_past_key_values(model, batch_size=1):
     """
     Initialize past key and value states for a given transformer model.
 
@@ -84,8 +103,6 @@ def initialize_past_key_values(model):
     """
     # Extracting configuration from the model
    config = model.config
-    # Initializing the batch size to 1, this can be modified if different batch sizes are required
-    batch_size = 1
     # Initializing a tensor to store past keys and values for all layers
     past_key_values_data = torch.zeros(
         config.num_hidden_layers * 2,
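
Conceptually, the rewritten cat() swaps the narrow/copy_ pair for a slice assignment along the shared sequence dimension and then returns the buffer truncated to the updated length. The toy snippet below reproduces that pattern on a stand-alone pre-allocated buffer; it is an illustration only, and the shapes (batch, heads, max_seq_len, head_dim) are assumptions rather than the repository's actual cache layout.

import torch

# Pre-allocated toy cache: (batch, heads, max_seq_len, head_dim)
data = torch.zeros(2, 4, 16, 8)
current_length = torch.zeros(1, dtype=torch.long)

def toy_cat(tensor, dim=2):
    # Write incoming keys/values right after the current length,
    # then hand back the buffer truncated to the new length.
    cur_len = int(current_length)
    new_len = cur_len + tensor.size(dim)
    data[:, :, cur_len:new_len, :] = tensor
    current_length.fill_(new_len)
    return data[:, :, :new_len, :]

chunk = torch.randn(2, 4, 5, 8)  # 5 new positions for every batch element
print(toy_cat(chunk).shape)      # torch.Size([2, 4, 5, 8])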

medusa/model/medusa_model.py

Lines changed: 28 additions & 16 deletions
@@ -3,7 +3,7 @@
 from .modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
 from .modeling_mistral_kv import MistralForCausalLM as KVMistralForCausalLM
 # import transformers
-
+import pdb
 # # monkey patch
 # transformers.models.llama.modeling_llama.LlamaForCausalLM = KVLlamaForCausalLM
 # transformers.models.mistral.modeling_mistral.MistralForCausalLM = KVMistralForCausalLM
@@ -121,6 +121,7 @@ def __init__(
     @property
     def base_model(self):
         return self
+
     @classmethod
     def from_pretrained(
         cls,
@@ -219,6 +220,7 @@ def forward(
         if output_orig:
             return torch.stack(medusa_logits, dim=0), outputs, orig
         return torch.stack(medusa_logits, dim=0)
+
     def get_medusa_choice(self, model_name):
         if 'vicuna' in model_name:
             if '7b' in model_name:
@@ -264,10 +266,11 @@ def medusa_generate(
 
         Warning: Only support batch size 1 for now!!
         """
-        assert input_ids.shape[0] == 1, "Only support batch size 1 for now!!"
+        # assert input_ids.shape[0] == 1, "Only support batch size 1 for now!!"
+        batch_size = input_ids.shape[0]
+        valid_length = attention_mask.sum(dim=1)
         # Avoid modifying the input_ids in-place
         input_ids = input_ids.clone()
-
         # Cache medusa buffers (the fixed patterns for tree attention)
         if medusa_choices is None:
             medusa_choices = self.get_medusa_choice(self.base_model_name_or_path)
@@ -295,7 +298,7 @@ def medusa_generate(
                past_key_values,
                past_key_values_data,
                current_length_data,
-            ) = initialize_past_key_values(self.base_model)
+            ) = initialize_past_key_values(self.base_model, batch_size)
             self.past_key_values = past_key_values
             self.past_key_values_data = past_key_values_data
             self.current_length_data = current_length_data
@@ -305,12 +308,11 @@ def medusa_generate(
         reset_medusa_mode(self)
         # Initialize tree attention mask and process prefill tokens
         medusa_logits, logits = initialize_medusa(
-            input_ids, self, medusa_buffers["medusa_attn_mask"], past_key_values
+            input_ids, self, medusa_buffers["medusa_attn_mask"], past_key_values, attention_mask
         )
-
         new_token = 0
         last_round_token = 0
-
+        ends = [input_len] * batch_size
         for idx in range(max_steps):
             # Generate candidates with topk predictions from Medusa heads
             candidates, tree_candidates = generate_candidates(
@@ -324,8 +326,8 @@ def medusa_generate(
                top_p=top_p,
                sampling=sampling,
                fast=fast,
+                valid_length=valid_length
            )
-
             # Use tree attention to verify the candidates and get predictions
             medusa_logits, logits, outputs = tree_decoding(
                 self,
@@ -334,15 +336,14 @@ def medusa_generate(
                medusa_buffers["medusa_position_ids"],
                input_ids,
                medusa_buffers["retrieve_indices"],
+                attention_mask=attention_mask
            )
-
             # Evaluate the posterior of the candidates to select the accepted candidate prefix
             best_candidate, accept_length = evaluate_posterior(
                 logits, candidates, temperature, posterior_threshold, posterior_alpha, top_p=top_p, sampling=sampling, fast=fast
             )
-
             # Update the input_ids and logits
-            input_ids, logits, medusa_logits, new_token = update_inference_inputs(
+            input_ids, logits, medusa_logits, new_token, valid_length, attention_mask = update_inference_inputs(
                 input_ids,
                 candidates,
                 best_candidate,
@@ -354,18 +355,29 @@ def medusa_generate(
                new_token,
                past_key_values_data,
                current_length_data,
+                attention_mask=attention_mask,
+                padding_idx=self.tokenizer.pad_token_id
            )
 
-            yield {
-                "text": self.tokenizer.decode(
-                    input_ids[0, input_len:],
+            decoded_texts = []
+            eos_encountered = [False] * batch_size
+            for i in range(batch_size):
+                # Check whether this batch element already contains an EOS token
+                if self.tokenizer.eos_token_id in input_ids[i, input_len:]:
+                    eos_encountered[i] = True
+                else:
+                    ends[i] = len(input_ids[i])
+                decoded_text = self.tokenizer.decode(
+                    input_ids[i, input_len:ends[i]],
                    skip_special_tokens=True,
                    spaces_between_special_tokens=False,
                    clean_up_tokenization_spaces=True,
                )
-            }
+                decoded_texts.append(decoded_text)
+            yield {"text": decoded_texts}
 
-            if self.tokenizer.eos_token_id in input_ids[0, input_len:]:
+            # Stop once every batch element has produced an EOS token
+            if all(eos_encountered):
                 break
 
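
The new stopping logic keeps two per-element records: ends[i], the position up to which element i is decoded (it stops advancing once that element has emitted EOS), and eos_encountered[i], which gates the early break once every element has finished. The stand-alone sketch below mirrors that bookkeeping on made-up token lists; eos_id, the prompt length, and the sequences are invented for illustration.

eos_id = 2
prompt_len = 3
batch_size = 2

# Toy per-element sequences after two decoding steps
steps = [
    [[1, 4, 9, 5, 7],       [1, 3, 8, 2]],     # step 1: element 1 emits EOS
    [[1, 4, 9, 5, 7, 6, 2], [1, 3, 8, 2, 2]],  # step 2: element 0 emits EOS
]

ends = [prompt_len] * batch_size
for seqs in steps:
    eos_encountered = [False] * batch_size
    for i, seq in enumerate(seqs):
        if eos_id in seq[prompt_len:]:
            eos_encountered[i] = True   # freeze this element's decode window
        else:
            ends[i] = len(seq)          # still generating: decode everything so far
    print(ends, eos_encountered)        # step 1: [5, 3] [False, True]; step 2: [5, 3] [True, True]
    if all(eos_encountered):
        break                           # every element has finished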

medusa/model/modeling_llama_kv.py

Lines changed: 3 additions & 4 deletions
@@ -32,7 +32,7 @@
 if is_flash_attn_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-
+import pdb
 
 logger = logging.get_logger(__name__)
 
@@ -315,7 +315,6 @@ def forward(
         padding_mask: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
-
         if self.config.pretraining_tp > 1:
             key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
             query_slices = self.q_proj.weight.split(
@@ -815,6 +814,8 @@ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_em
         # [MODIFIED] add medusa mask
         if hasattr(self, "medusa_mask") and self.medusa_mask is not None:
             medusa_mask = self.medusa_mask
+            bs = combined_attention_mask.shape[0]
+            medusa_mask = medusa_mask.repeat(bs, 1, 1, 1)
             medusa_len = medusa_mask.size(-1)
             combined_attention_mask[:, :, -medusa_len:, -medusa_len:][
                 medusa_mask == 0
@@ -886,7 +887,6 @@ def forward(
             padding_mask = attention_mask
         else:
             padding_mask = None
-
         attention_mask = self._prepare_decoder_attention_mask(
             attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
         )
@@ -1038,7 +1038,6 @@ def forward(
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
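
The functional change in this file is that the cached tree ("medusa") attention mask, previously shaped for a single sequence, is tiled across the batch dimension before being merged into the combined attention mask. The shape-level sketch below illustrates the same tiling and masked assignment; the mask sizes and the fill value (torch.finfo(...).min) are illustrative assumptions, not values taken from the file.

import torch

batch_size, medusa_len, seq_len = 2, 4, 10

# Tree mask for one sequence: nonzero where a tree position may attend
medusa_mask = torch.tril(torch.ones(medusa_len, medusa_len))[None, None]  # (1, 1, L, L)

# Combined causal/padding mask for the whole batch
combined = torch.zeros(batch_size, 1, seq_len, seq_len)

# Tile the per-sequence tree mask across the batch, as the commit does
medusa_mask = medusa_mask.repeat(batch_size, 1, 1, 1)                     # (B, 1, L, L)
combined[:, :, -medusa_len:, -medusa_len:][medusa_mask == 0] = torch.finfo(combined.dtype).min

print(medusa_mask.shape, combined.shape)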
