infer_compare.py
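"""Compare the inference throughput of HuggingFace Transformers and LMDeploy
TurboMind on internlm-chat-7b by generating four short Chinese essays and
counting output characters per second.

Usage:
    python infer_compare.py hf        # HuggingFace Transformers backend
    python infer_compare.py lmdeploy  # LMDeploy TurboMind backend (any value other than "hf")
"""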
import sys
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from lmdeploy import turbomind as tm

def get_prompt(query: str) -> str:
    """Wrap the query in InternLM's <|System|>/<|User|>/<|Bot|> chat template."""
    prompt = f"""<|System|>:You are an AI assistant whose name is InternLM (书生·浦语).
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
<|User|>:{query}
<|Bot|>:"""
    return prompt

def gen_lmdeploy(query: str, tokenizer, model) -> str:
    """Generate with an LMDeploy TurboMind instance."""
    prompt = get_prompt(query)
    input_ids = tokenizer.encode(prompt)
    for outputs in model.stream_infer(
        session_id=0,
        input_ids=[input_ids],
        request_output_len=512,
        temperature=0.0,
    ):
        ...  # drain the stream; `outputs` ends up holding the complete result
    res, _tokens = outputs[0]
    output = tokenizer.decode(res)
    return output

def gen_transformers(query: str, tokenizer, model) -> str:
    """Generate with a HuggingFace Transformers model."""
    prompt = get_prompt(query)
    inputs = tokenizer([prompt], return_tensors="pt").to(device)
    res = model.generate(
        **inputs,
        max_new_tokens=512,
    )
    output = tokenizer.decode(res[0])
    # Strip the echoed prompt and InternLM's <eoa> end-of-answer token.
    output = output.replace(prompt, "").replace("<eoa>", "").strip()
    return output

device = "cuda:0"
model_path = "/root/share/temp/model_repos/internlm-chat-7b/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

engine = sys.argv[1]  # "hf" selects Transformers; anything else selects LMDeploy
if engine == "hf":
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, trust_remote_code=True
    )
    model.to(device).eval()
    gen_func = gen_transformers
else:
    tm_model = tm.TurboMind.from_pretrained("./workspace/")
    model = tm_model.create_instance()
    gen_func = gen_lmdeploy
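# Note (assumed setup): "./workspace/" is the directory produced by LMDeploy's
# offline conversion step for this model, e.g.
#   lmdeploy convert internlm-chat-7b /root/share/temp/model_repos/internlm-chat-7b/
# If your workspace lives elsewhere, adjust the path above.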
# Benchmark: four generations, then report character throughput.
count = 0
start = time.time()
for season in ["春天", "夏天", "秋天", "冬天"]:  # spring, summer, autumn, winter
    q = f"写一篇关于{season}的300字小作文。"  # "Write a ~300-character short essay about {season}."
    output = gen_func(q, tokenizer, model)
    count += len(output)
end = time.time()
cost = end - start
throughput = round(count / cost)
print(f"{engine}: {cost:.2f} s, {throughput} chars/s")