
Commit 2b02994

Author: xidianwym (committed)

Fix the bug of the code (#7)

1 parent: cecd774, commit: 2b02994

7 files changed: 197 additions, 76 deletions


README.md

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,5 @@
 # Large Language Model Accelerator
 
-[![Bors enabled](https://bors.tech/images/badge_small.svg)](https://app.bors.tech/repositories/65566)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 
 LLMA is an end-to-end optimizing framework for large language models.
@@ -23,7 +22,7 @@ LLMA supports several optimizing technologies like model fine-tuning and model q
 
 ## Getting Started
 
-The following tutorial demonstrates how to use LLAM to deploy LLaMA-7B.
+This example demonstrates how to use LLMA to deploy LLaMA-7B on Cloudblazer Yunsui t20.
 
 - [Tutorial](example/TUTORIAL.md)

example/TUTORIAL.md

Lines changed: 29 additions & 13 deletions
@@ -1,37 +1,53 @@
 # TUTORIAL
 
-This example demonstrates how to use LLAM to deploy LLaMA-7B on Cloudblazer Yunsui t20.
+## Preparation
 
-## Setup
+### Pull the docker image
 
-In a conda env with pytorch available, run:
+In the server with Yunsui t20, run:
 
 ```sh
-cd LLMA
+docker pull artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_1804_gcc7:latest
+```
+
+### Load LLaMA-7B
 
-pip install -e .
+```sh
+cd llma && \
+mkdir tmp/llama-7b/7B && \
+cd tmp/llama-7b/7B && \
+wget https://llama-7b.oss-cn-beijing.aliyuncs.com/7B/ && \
+cd .. && \
+wget https://llama-7b.oss-cn-beijing.aliyuncs.com/tokenizer.model
 ```
 
-## Deploy LLaMA-7B
+### Run the image
 
-In the deployment environment, run:
+Run the following command:
 
 ```sh
-cd LLMA/examples/llama-7b
+cd llma
+docker run -it -v $PWD:/home/join/model --privileged -p 7999:8080 artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_1804_gcc7:latest bash
+```
 
-bash ./run.sh $CKPT_DIR $TOKENIZER_PATH
+## Deploy LLaMA-7B
+
+In the docker container, run:
+
+```sh
+cd /home/join/model/example/llama-7b
+bash ./run.sh ../tmp/llama-7b/7B/ ../tmp/llama-7b/tokenizer.model
 ```
 
 ## Do inference
 
-Examples of model inference are as follows:
-
 Infer with the python script.
 
-In the LLMA/examples/llama-7b directory, run the command:
+Outside the docker container in the server, run the command:
 
 ```sh
-python3 client.py -u $url
+cd llma/examples/llama-7b
+python3 client.py -u 'http://localhost:7999/chat' -p 'I believe the meaning of life'
 ```
 
 The results will be as follows:

example/llama-7b/client.py

Lines changed: 8 additions & 9 deletions
@@ -6,16 +6,14 @@
 import requests
 
 
-MODEL_NAME = 'LLaMA-7B'
-PROMPT = 'I believe the meaning of life is'
-info = {
-    "instruction": PROMPT,
-    "model": MODEL_NAME
-}
-headers = {"Content-Type": "application/json"}
-
-
 def get_response():
+    model_name = 'LLaMA-7B'
+    prompt = FLAGS.prompt
+    info = {
+        "instruction": prompt,
+        "model": model_name
+    }
+    headers = {"Content-Type": "application/json"}
     response = requests.post(FLAGS.url, json=json.dumps(info), headers=headers)
     if response.status_code == 200:
         for chunk in response.iter_content(chunk_size=1):
@@ -28,5 +26,6 @@ def get_response():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-u', '--url', type=str, required=False)
+    parser.add_argument('-p', '--prompt', type=str, required=False)
     FLAGS = parser.parse_args()
     get_response()
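For context, the refactored client boils down to a single streaming POST. The sketch below reproduces that flow as a plain function, without argparse; the payload keys and the byte-by-byte streaming loop come from the diff above, while the URL, port, and `/chat` path are taken from the tutorial and should be treated as deployment-specific assumptions.

```python
import json
import requests

def stream_completion(url: str, prompt: str, model: str = "LLaMA-7B") -> str:
    """Minimal sketch of what client.py's get_response() does."""
    info = {"instruction": prompt, "model": model}
    headers = {"Content-Type": "application/json"}
    # client.py sends the JSON document as a string body (json=json.dumps(info)),
    # so the same shape is kept here.
    response = requests.post(url, json=json.dumps(info), headers=headers)
    response.raise_for_status()
    body = bytearray()
    # The service streams the generated text, which client.py reads byte by byte.
    for chunk in response.iter_content(chunk_size=1):
        body.extend(chunk)
    return body.decode("utf-8")

if __name__ == "__main__":
    # Assumes the service from the tutorial is listening on port 7999.
    print(stream_completion("http://localhost:7999/chat", "I believe the meaning of life"))
```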
llama_inference_service/llama/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,17 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 
+import importlib.util
+import sys
+import os
+
 from .generation import LLaMA
 from .model import ModelArgs, Transformer
 from .tokenizer import Tokenizer
+
+def is_torch_gcu_available():
+    if importlib.util.find_spec("torch_gcu") is None:
+        return False
+    if importlib.util.find_spec("torch_gcu.core") is None:
+        return False
+    return importlib.util.find_spec("torch_gcu.core.model") is not None
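The helper added above is what generation.py (next section) uses to decide whether to run on an Enflame GCU or fall back to stock PyTorch. A minimal sketch of that guard pattern follows; `torch_gcu`, `torch_gcu.gcu_device`, and `torch_gcu.distributed.get_rank()` are taken from the diff below, while the CUDA/CPU fallback branch is an illustrative assumption rather than the repository's exact code.

```python
import os
import torch

from llama import is_torch_gcu_available

if is_torch_gcu_available():
    # Enflame (Yunsui t20) path: pick the GCU that matches this process's rank.
    import torch_gcu
    import torch_gcu.distributed as dist
    local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else dist.get_rank()
    device = torch_gcu.gcu_device(local_rank)
else:
    # Fallback for machines without torch_gcu installed (assumption, not the repo's code).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tensors are created on CPU and moved explicitly, as generation.py does with .to(gcu_device).
x = torch.zeros(1, 4, dtype=torch.long).to(device)
```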

llama_inference_service/llama/generation.py

Lines changed: 68 additions & 14 deletions
@@ -2,17 +2,48 @@
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 
 from typing import List
-
+import time
+import numpy as np
+import os
 import torch
 
 from llama.tokenizer import Tokenizer
 from llama.model import Transformer
+from llama import is_torch_gcu_available
+
+torch.autograd.set_detect_anomaly(True)
+
+if is_torch_gcu_available():
+    import torch_gcu
+    import torch_gcu.distributed as dist
+    local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else dist.get_rank()
+    #gcu_device = torch_gcu.gcu_device(local_rank * int(os.getenv("LEO_CLUSTER_NUM", '1')))
+    gcu_device = torch_gcu.gcu_device(local_rank)
+else:
+    import torch as torch_gcu
 
 
 class LLaMA:
     def __init__(self, model: Transformer, tokenizer: Tokenizer):
         self.model = model
         self.tokenizer = tokenizer
+        self.max_prompts_len = 32
+        self.max_seq_len = 512
+
+    def gen_mask_stage_0(self, tokens: torch.Tensor, pad_id: int):
+        temp = torch.full((1, 1, self.max_prompts_len, self.max_prompts_len), -65500.0, device="cpu")
+        temp = torch.triu(temp, diagonal=1)
+        expand_tokens = tokens[:, None, None, :].expand(1, 1, self.max_prompts_len, self.max_prompts_len)
+        temp.masked_fill_(expand_tokens == pad_id, -65500.0)
+        temp[0,0,:,:].fill_diagonal_(fill_value = 0., wrap = False).reshape(1,1,self.max_prompts_len,self.max_prompts_len)
+        mask = torch.full((1, 1, self.max_prompts_len, self.max_seq_len), -65500.0, device="cpu")
+        mask[0, 0, :, -self.max_prompts_len:] = temp
+        return mask.to(gcu_device)
+
+    def gen_mask_stage_1(self, cur_pos: int):
+        mask = torch.full((1, 1, 1, self.max_seq_len), -65500.0, device="cpu")
+        mask[:, :, :, self.max_seq_len-cur_pos:] = 0
+        return mask.to(gcu_device)
 
     def generate(
         self,
@@ -31,46 +62,69 @@ def generate(
         max_prompt_size = max([len(t) for t in prompt_tokens])
 
         total_len = min(params.max_seq_len, max_gen_len + max_prompt_size)
+        total_padding_len = params.max_seq_len
+        if not is_torch_gcu_available():
+            tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long()
+
+        else:
+            tokens = torch.full((bsz, total_padding_len), 0,device="cpu")
+            tokens = tokens.long()
 
-        tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long()
         for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t).long()
-        input_text_mask = tokens != self.tokenizer.pad_id
+            assert len(t) <= self.max_prompts_len, \
+                f"prompt size of {prompts[k]}({len(t)}) is greater than max_prompts_len: {self.max_prompts_len}"
+            if not is_torch_gcu_available():
+                tokens[k, : len(t)] = torch.tensor(t).long()
+            else:
+                tokens[k, -len(t):] = torch.tensor(t).long()
         start_pos = min_prompt_size
        prev_pos = 0
+        token_time_list = list()
         for cur_pos in range(start_pos, total_len):
-            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
+            start_time = time.time()
+            if prev_pos == 0:
+                mask = self.gen_mask_stage_0(tokens[:, -self.max_prompts_len:], 0);
+                logits = self.model.forward(tokens[:, -self.max_prompts_len:].to(gcu_device), start_pos = prev_pos, mask=mask)
+            else:
+                mask = self.gen_mask_stage_1(cur_pos)
+                logits = self.model.forward(tokens[:, -1:].to(gcu_device), start_pos = prev_pos, mask=mask)
             if temperature > 0:
                 probs = torch.softmax(logits / temperature, dim=-1)
                 next_token = sample_top_p(probs, top_p)
             else:
                 next_token = torch.argmax(logits, dim=-1)
-            next_token = next_token.reshape(-1)
+
+            next_token = next_token.reshape(tokens.shape[0],-1).cpu()
             # only replace token if prompt has already been generated
-            next_token = torch.where(
-                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
-            )
-            tokens[:, cur_pos] = next_token
+            tokens = torch.cat([tokens,next_token],dim = 1)
+            tokens = tokens[:, 1:]
             prev_pos = cur_pos
+            end_time = time.time()
+            token_time_list.append(end_time - start_time)
 
         decoded = []
         for i, t in enumerate(tokens.tolist()):
             # cut to max gen len
-            t = t[: len(prompt_tokens[i]) + max_gen_len]
+            t = t[-len(prompt_tokens[i]) - max_gen_len :]
             # cut to eos tok if any
             try:
                 t = t[: t.index(self.tokenizer.eos_id)]
             except ValueError:
                 pass
             decoded.append(self.tokenizer.decode(t))
-        return decoded
+        return decoded, token_time_list
 
 
 def sample_top_p(probs, p):
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    # call sync_lived_tensor to avoid repeat computing in different subgraphs
+    torch_gcu.sync_lived_tensor()
+    itemp = probs_sort.cpu()
+    probs_sum = torch.cumsum(itemp, dim=-1)
+    probs_sum = probs_sum.to(gcu_device)
     mask = probs_sum - probs_sort > p
-    probs_sort[mask] = 0.0
+    #probs_sort[mask] = 0.0
+    probs_sort.masked_fill_(mask, 0.0)
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = torch.multinomial(probs_sort, num_samples=1)
     next_token = torch.gather(probs_idx, -1, next_token)
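The main behavioral change in `generate()` is the token bookkeeping: instead of writing into position `cur_pos` of a `(bsz, total_len)` buffer, the GCU path keeps a fixed-width `(bsz, max_seq_len)` tensor, left-pads the prompt into its last `max_prompts_len` columns, then appends each sampled token on the right and drops the leftmost column, so the slices handed to the model (`[:, -self.max_prompts_len:]` on the first step, `[:, -1:]` afterwards) always have the same width. The toy sketch below (CPU only, tiny sizes, a dummy "prediction" in place of the model and masks) illustrates just that rolling-buffer logic; it is not the project's code.

```python
import torch

# Toy sizes; the real values in LLaMA.__init__ are max_prompts_len=32, max_seq_len=512.
max_prompts_len, max_seq_len, pad_id = 4, 8, 0
prompt = [5, 6, 7]  # token ids of a single prompt (must fit in max_prompts_len)

tokens = torch.full((1, max_seq_len), pad_id, dtype=torch.long)  # fixed-width buffer
tokens[0, -len(prompt):] = torch.tensor(prompt)                  # left-pad into the tail

for step in range(3):
    # generate() would call the model here, with gen_mask_stage_0 on the first
    # step (full prompt window) and gen_mask_stage_1 afterwards (single token).
    next_token = torch.tensor([[100 + step]])  # stand-in for the sampled token
    # Append on the right, drop the leftmost column: shape stays (1, max_seq_len).
    tokens = torch.cat([tokens, next_token], dim=1)[:, 1:]

print(tokens.tolist())  # [[0, 0, 5, 6, 7, 100, 101, 102]]
# The prompt and generated ids always sit in the trailing columns, which is why
# decoding now slices from the end: t[-len(prompt_tokens[i]) - max_gen_len :].
```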

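For reference, `sample_top_p` implements standard top-p (nucleus) sampling. A plain-PyTorch version is sketched below for comparison: the GCU build above additionally synchronizes live tensors (`torch_gcu.sync_lived_tensor()`), detours the cumulative sum through the CPU, and uses `masked_fill_` instead of in-place boolean indexing, but the sampling math is the same.

```python
import torch

def sample_top_p_reference(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Plain top-p sampling over the last dimension (illustrative sketch)."""
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Drop every token whose preceding cumulative mass already exceeds p.
    probs_sort.masked_fill_(probs_sum - probs_sort > p, 0.0)
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))  # renormalize
    next_token = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, -1, next_token)

# Example: only the two most probable tokens survive with p = 0.6.
print(sample_top_p_reference(torch.tensor([[0.4, 0.3, 0.2, 0.05, 0.05]]), 0.6))
```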