
Commit 2b02994

Author: xidianwym (committed)

Fix the bug of the code (#7)

1 parent: cecd774, commit: 2b02994

7 files changed: 197 additions, 76 deletions


README.md

Lines changed: 1 addition & 2 deletions
@@ -1,6 +1,5 @@
 # Large Language Model Accelerator
 
-[![Bors enabled](https://bors.tech/images/badge_small.svg)](https://app.bors.tech/repositories/65566)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 
 LLMA is an end-to-end optimizing framework for large language models.
@@ -23,7 +22,7 @@ LLMA supports several optimizing technologies like model fine-tuning and model q
 
 ## Getting Started
 
-The following tutorial demonstrates how to use LLAM to deploy LLaMA-7B.
+This example demonstrates how to use LLMA to deploy LLaMA-7B on Cloudblazer Yunsui t20.
 
 - [Tutorial](example/TUTORIAL.md)

example/TUTORIAL.md

Lines changed: 29 additions & 13 deletions
@@ -1,37 +1,53 @@
 # TUTORIAL
 
-This example demonstrates how to use LLAM to deploy LLaMA-7B on Cloudblazer Yunsui t20.
+## Preparation
 
-## Setup
+### Pull the docker image
 
-In a conda env with pytorch available, run:
+In the server with Yunsui t20, run:
 
 ```sh
-cd LLMA
+docker pull artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_1804_gcc7:latest
+```
+
+### Load LLaMA-7B
 
-pip install -e .
+```sh
+cd llma && \
+mkdir tmp/llama-7b/7B && \
+cd tmp/llama-7b/7B && \
+wget https://llama-7b.oss-cn-beijing.aliyuncs.com/7B/ && \
+cd .. && \
+wget https://llama-7b.oss-cn-beijing.aliyuncs.com/tokenizer.model
 ```
 
-## Deploy LLaMA-7B
+### Run the image
 
-In the deployment environment, run:
+Run the following command:
 
 ```sh
-cd LLMA/examples/llama-7b
+cd llma
+docker run -it -v $PWD:/home/join/model --privileged -p 7999:8080 artifact.enflame.cn/enflame_docker_images/ubuntu/qic_ubuntu_1804_gcc7:latest bash
+```
 
-bash ./run.sh $CKPT_DIR $TOKENIZER_PATH
+## Deploy LLaMA-7B
+
+In the docker container, run:
+
+```sh
+cd /home/join/model/example/llama-7b
+bash ./run.sh ../tmp/llama-7b/7B/ ../tmp/llama-7b/tokenizer.model
 ```
 
 ## Do inference
 
-Examples of model inference are as follows:
-
 Infer with the python script.
 
-In the LLMA/examples/llama-7b directory, run the command:
+Outside the docker container in the server, run the command:
 
 ```sh
-python3 client.py -u $url
+cd llma/examples/llama-7b
+python3 client.py -u 'http://localhost:7999/chat' -p 'I believe the meaning of life'
 ```
 
 The results will be as follows:

example/llama-7b/client.py

Lines changed: 8 additions & 9 deletions
@@ -6,16 +6,14 @@
 import requests
 
 
-MODEL_NAME = 'LLaMA-7B'
-PROMPT = 'I believe the meaning of life is'
-info = {
-    "instruction": PROMPT,
-    "model": MODEL_NAME
-}
-headers = {"Content-Type": "application/json"}
-
-
 def get_response():
+    model_name = 'LLaMA-7B'
+    prompt = FLAGS.prompt
+    info = {
+        "instruction": prompt,
+        "model": model_name
+    }
+    headers = {"Content-Type": "application/json"}
     response = requests.post(FLAGS.url, json=json.dumps(info), headers=headers)
     if response.status_code == 200:
         for chunk in response.iter_content(chunk_size=1):
@@ -28,5 +26,6 @@ def get_response():
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-u', '--url', type=str, required=False)
+    parser.add_argument('-p', '--prompt', type=str, required=False)
     FLAGS = parser.parse_args()
     get_response()
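For context, the refactored client boils down to a single streaming POST. The sketch below reproduces that flow as a plain function, without argparse; the payload keys and the byte-by-byte streaming loop come from the diff above, while the URL, port, and `/chat` path are taken from the tutorial and should be treated as deployment-specific assumptions.

```python
import json
import requests

def stream_completion(url: str, prompt: str, model: str = "LLaMA-7B") -> str:
    """Minimal sketch of what client.py's get_response() does."""
    info = {"instruction": prompt, "model": model}
    headers = {"Content-Type": "application/json"}
    # client.py sends the JSON document as a string body (json=json.dumps(info)),
    # so the same shape is kept here.
    response = requests.post(url, json=json.dumps(info), headers=headers)
    response.raise_for_status()
    body = bytearray()
    # The service streams the generated text, which client.py reads byte by byte.
    for chunk in response.iter_content(chunk_size=1):
        body.extend(chunk)
    return body.decode("utf-8")

if __name__ == "__main__":
    # Assumes the service from the tutorial is listening on port 7999.
    print(stream_completion("http://localhost:7999/chat", "I believe the meaning of life"))
```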
llama_inference_service/llama/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,17 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 
+import importlib.util
+import sys
+import os
+
 from .generation import LLaMA
 from .model import ModelArgs, Transformer
 from .tokenizer import Tokenizer
+
+def is_torch_gcu_available():
+    if importlib.util.find_spec("torch_gcu") is None:
+        return False
+    if importlib.util.find_spec("torch_gcu.core") is None:
+        return False
+    return importlib.util.find_spec("torch_gcu.core.model") is not None
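The helper added above is what generation.py (next section) uses to decide whether to run on an Enflame GCU or fall back to stock PyTorch. A minimal sketch of that guard pattern follows; `torch_gcu`, `torch_gcu.gcu_device`, and `torch_gcu.distributed.get_rank()` are taken from the diff below, while the CUDA/CPU fallback branch is an illustrative assumption rather than the repository's exact code.

```python
import os
import torch

from llama import is_torch_gcu_available

if is_torch_gcu_available():
    # Enflame (Yunsui t20) path: pick the GCU that matches this process's rank.
    import torch_gcu
    import torch_gcu.distributed as dist
    local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else dist.get_rank()
    device = torch_gcu.gcu_device(local_rank)
else:
    # Fallback for machines without torch_gcu installed (assumption, not the repo's code).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tensors are created on CPU and moved explicitly, as generation.py does with .to(gcu_device).
x = torch.zeros(1, 4, dtype=torch.long).to(device)
```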

llama_inference_service/llama/generation.py

Lines changed: 68 additions & 14 deletions
@@ -2,17 +2,48 @@
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 
 from typing import List
-
+import time
+import numpy as np
+import os
 import torch
 
 from llama.tokenizer import Tokenizer
 from llama.model import Transformer
+from llama import is_torch_gcu_available
+
+torch.autograd.set_detect_anomaly(True)
+
+if is_torch_gcu_available():
+    import torch_gcu
+    import torch_gcu.distributed as dist
+    local_rank = int(os.environ['LOCAL_RANK']) if 'LOCAL_RANK' in os.environ else dist.get_rank()
+    #gcu_device = torch_gcu.gcu_device(local_rank * int(os.getenv("LEO_CLUSTER_NUM", '1')))
+    gcu_device = torch_gcu.gcu_device(local_rank)
+else:
+    import torch as torch_gcu
 
 
 class LLaMA:
     def __init__(self, model: Transformer, tokenizer: Tokenizer):
         self.model = model
         self.tokenizer = tokenizer
+        self.max_prompts_len = 32
+        self.max_seq_len = 512
+
+    def gen_mask_stage_0(self, tokens: torch.Tensor, pad_id: int):
+        temp = torch.full((1, 1, self.max_prompts_len, self.max_prompts_len), -65500.0, device="cpu")
+        temp = torch.triu(temp, diagonal=1)
+        expand_tokens = tokens[:, None, None, :].expand(1, 1, self.max_prompts_len, self.max_prompts_len)
+        temp.masked_fill_(expand_tokens == pad_id, -65500.0)
+        temp[0,0,:,:].fill_diagonal_(fill_value = 0., wrap = False).reshape(1,1,self.max_prompts_len,self.max_prompts_len)
+        mask = torch.full((1, 1, self.max_prompts_len, self.max_seq_len), -65500.0, device="cpu")
+        mask[0, 0, :, -self.max_prompts_len:] = temp
+        return mask.to(gcu_device)
+
+    def gen_mask_stage_1(self, cur_pos: int):
+        mask = torch.full((1, 1, 1, self.max_seq_len), -65500.0, device="cpu")
+        mask[:, :, :, self.max_seq_len-cur_pos:] = 0
+        return mask.to(gcu_device)
 
     def generate(
         self,
@@ -31,46 +62,69 @@ def generate(
         max_prompt_size = max([len(t) for t in prompt_tokens])
 
         total_len = min(params.max_seq_len, max_gen_len + max_prompt_size)
+        total_padding_len = params.max_seq_len
+        if not is_torch_gcu_available():
+            tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long()
+
+        else:
+            tokens = torch.full((bsz, total_padding_len), 0,device="cpu")
+            tokens = tokens.long()
 
-        tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long()
         for k, t in enumerate(prompt_tokens):
-            tokens[k, : len(t)] = torch.tensor(t).long()
-        input_text_mask = tokens != self.tokenizer.pad_id
+            assert len(t) <= self.max_prompts_len, \
+                f"prompt size of {prompts[k]}({len(t)}) is greater than max_prompts_len: {self.max_prompts_len}"
+            if not is_torch_gcu_available():
+                tokens[k, : len(t)] = torch.tensor(t).long()
+            else:
+                tokens[k, -len(t):] = torch.tensor(t).long()
         start_pos = min_prompt_size
        prev_pos = 0
+        token_time_list = list()
         for cur_pos in range(start_pos, total_len):
-            logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos)
+            start_time = time.time()
+            if prev_pos == 0:
+                mask = self.gen_mask_stage_0(tokens[:, -self.max_prompts_len:], 0);
+                logits = self.model.forward(tokens[:, -self.max_prompts_len:].to(gcu_device), start_pos = prev_pos, mask=mask)
+            else:
+                mask = self.gen_mask_stage_1(cur_pos)
+                logits = self.model.forward(tokens[:, -1:].to(gcu_device), start_pos = prev_pos, mask=mask)
             if temperature > 0:
                 probs = torch.softmax(logits / temperature, dim=-1)
                 next_token = sample_top_p(probs, top_p)
             else:
                 next_token = torch.argmax(logits, dim=-1)
-            next_token = next_token.reshape(-1)
+
+            next_token = next_token.reshape(tokens.shape[0],-1).cpu()
             # only replace token if prompt has already been generated
-            next_token = torch.where(
-                input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token
-            )
-            tokens[:, cur_pos] = next_token
+            tokens = torch.cat([tokens,next_token],dim = 1)
+            tokens = tokens[:, 1:]
             prev_pos = cur_pos
+            end_time = time.time()
+            token_time_list.append(end_time - start_time)
 
         decoded = []
         for i, t in enumerate(tokens.tolist()):
             # cut to max gen len
-            t = t[: len(prompt_tokens[i]) + max_gen_len]
+            t = t[-len(prompt_tokens[i]) - max_gen_len :]
             # cut to eos tok if any
             try:
                 t = t[: t.index(self.tokenizer.eos_id)]
             except ValueError:
                 pass
             decoded.append(self.tokenizer.decode(t))
-        return decoded
+        return decoded, token_time_list
 
 
 def sample_top_p(probs, p):
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    # call sync_lived_tensor to avoid repeat computing in different subgraphs
+    torch_gcu.sync_lived_tensor()
+    itemp = probs_sort.cpu()
+    probs_sum = torch.cumsum(itemp, dim=-1)
+    probs_sum = probs_sum.to(gcu_device)
     mask = probs_sum - probs_sort > p
-    probs_sort[mask] = 0.0
+    #probs_sort[mask] = 0.0
+    probs_sort.masked_fill_(mask, 0.0)
     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
     next_token = torch.multinomial(probs_sort, num_samples=1)
     next_token = torch.gather(probs_idx, -1, next_token)
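The main behavioral change in `generate()` is the token bookkeeping: instead of writing into position `cur_pos` of a `(bsz, total_len)` buffer, the GCU path keeps a fixed-width `(bsz, max_seq_len)` tensor, left-pads the prompt into its last `max_prompts_len` columns, then appends each sampled token on the right and drops the leftmost column, so the slices handed to the model (`[:, -self.max_prompts_len:]` on the first step, `[:, -1:]` afterwards) always have the same width. The toy sketch below (CPU only, tiny sizes, a dummy "prediction" in place of the model and masks) illustrates just that rolling-buffer logic; it is not the project's code.

```python
import torch

# Toy sizes; the real values in LLaMA.__init__ are max_prompts_len=32, max_seq_len=512.
max_prompts_len, max_seq_len, pad_id = 4, 8, 0
prompt = [5, 6, 7]  # token ids of a single prompt (must fit in max_prompts_len)

tokens = torch.full((1, max_seq_len), pad_id, dtype=torch.long)  # fixed-width buffer
tokens[0, -len(prompt):] = torch.tensor(prompt)                  # left-pad into the tail

for step in range(3):
    # generate() would call the model here, with gen_mask_stage_0 on the first
    # step (full prompt window) and gen_mask_stage_1 afterwards (single token).
    next_token = torch.tensor([[100 + step]])  # stand-in for the sampled token
    # Append on the right, drop the leftmost column: shape stays (1, max_seq_len).
    tokens = torch.cat([tokens, next_token], dim=1)[:, 1:]

print(tokens.tolist())  # [[0, 0, 5, 6, 7, 100, 101, 102]]
# The prompt and generated ids always sit in the trailing columns, which is why
# decoding now slices from the end: t[-len(prompt_tokens[i]) - max_gen_len :].
```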

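For reference, `sample_top_p` implements standard top-p (nucleus) sampling. A plain-PyTorch version is sketched below for comparison: the GCU build above additionally synchronizes live tensors (`torch_gcu.sync_lived_tensor()`), detours the cumulative sum through the CPU, and uses `masked_fill_` instead of in-place boolean indexing, but the sampling math is the same.

```python
import torch

def sample_top_p_reference(probs: torch.Tensor, p: float) -> torch.Tensor:
    """Plain top-p sampling over the last dimension (illustrative sketch)."""
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Drop every token whose preceding cumulative mass already exceeds p.
    probs_sort.masked_fill_(probs_sum - probs_sort > p, 0.0)
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))  # renormalize
    next_token = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, -1, next_token)

# Example: only the two most probable tokens survive with p = 0.6.
print(sample_top_p_reference(torch.tensor([[0.4, 0.3, 0.2, 0.05, 0.05]]), 0.6))
```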