Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vllm 接口支持vision(minicpm-v) #306

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
28 changes: 28 additions & 0 deletions .env
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
PORT=8080

# model related
MODEL_NAME=minicpm-v
MODEL_PATH=/input1/
PROMPT_NAME=minicpm-v

# rag related
EMBEDDING_NAME=
RERANK_NAME=

# api related
API_PREFIX=/v1

# vllm related
ENGINE=vllm
TRUST_REMOTE_CODE=true
TOKENIZE_MODE=auto
TENSOR_PARALLEL_SIZE=2
DTYPE=half

MAX_NUM_SEQS=10
MAX_NUM_BATCHED_TOKENS=-1
GPU_MEMORY_UTILIZATION=0.9
# 使用internvl模型必须设置CONTEXT_LEN=4096
CONTEXT_LEN=4096
TASKS=llm
# TASKS=llm,rag
89 changes: 89 additions & 0 deletions README_OPENBAYES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
## 环境配置

### 本地环境

安装依赖,确保安装顺序严格按照下面的命令:

```shell
pip install "vllm>=0.4.3"
pip install -r requirements.txt
```

## 启动模型

### 环境变量含义


+ `MODEL_NAME`: 模型名称,如 `chatglm4`、`qwen2`、`llama3`等


+ `PROMPT_NAME`: 使用的对话模板名称,如果不指定,则将根据 `tokenizer` 找到对应的模板


+ `MODEL_PATH`: 开源大模型的文件所在路径


+ `TRUST_REMOTE_CODE`: 是否使用外部代码


+ `TOKENIZE_MODE`(可选项): `tokenizer` 的模式,默认为 `auto`


+ `TENSOR_PARALLEL_SIZE`(可选项): `GPU` 数量,默认为 `1`


+ `EMBEDDING_NAME`(可选项): 嵌入模型的文件所在路径,推荐使用 `moka-ai/m3e-base` 或者 `BAAI/bge-large-zh`


+ `GPU_MEMORY_UTILIZATION`(可选项): `GPU` 占用率


+ `MAX_NUM_BATCHED_TOKENS`(可选项): 每个批处理的最大 `token` 数量


+ `MAX_NUM_SEQS`(可选项): 批量大小


+ `TASKS`(可选项): `llm` 表示启动对话大模型,`rag` 表示启动文档相关接口,比如`embedding`、`rerank`


### 启动方式

#### 本地启动

根据需求修改 `.env` 文件中的环境变量

```shell
python server.py
```
#### 调用样例
```shell
curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer YOUR_API_KEY" \
-d '{
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": "这张图像有什么东西?"},
{
"type": "image_url",
"image_url": {
"url": "https://github.com/ByungKwanLee/TroL/blob/master/figures/demo.png?raw=true"
}
}
]
}
],
"model": "minicpm-v"
}'
```
### 说明
目前只支持minicpm-v、internvl模型,下面是minicpm-v最大并发量测试结果(RTX_8000 x 2):
GPU_MEMORY_UTILIZATION=0.9 并发量10
GPU_MEMORY_UTILIZATION=0.8 并发量14
GPU_MEMORY_UTILIZATION=0.7 并发量20
GPU_MEMORY_UTILIZATION=0.6 并发量28
GPU_MEMORY_UTILIZATION=0.5 并发量30
GPU_MEMORY_UTILIZATION=0.4 并发量36
Binary file added api/.DS_Store
Binary file not shown.
67 changes: 66 additions & 1 deletion api/vllm_routes/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,44 @@
def get_engine():
    """FastAPI dependency provider: yield the shared module-level vLLM engine.

    Implemented as a generator so it can be used with ``Depends``; every
    request receives the same ``LLM_ENGINE`` instance (defined elsewhere
    in this module).
    """
    yield LLM_ENGINE

def load_image(image_url: str):
    """Fetch an image from an HTTP(S) URL or a ``data:`` URI as an RGB PIL image.

    Args:
        image_url: Either a ``data:[<mediatype>][;base64],<payload>`` URI or a
            plain URL that ``urllib`` can open.

    Returns:
        PIL.Image.Image: the decoded image, converted to RGB.

    Raises:
        ValueError: if a ``data:`` URI is missing the ``,`` payload separator.
        urllib.error.URLError: if the remote fetch fails or times out.
    """
    from io import BytesIO

    if image_url.startswith("data:"):
        import base64

        # data URI format: data:[<mediatype>][;base64],<payload>
        _, sep, payload = image_url.partition(",")
        if not sep:
            raise ValueError(f"malformed data URI (no ',' separator): {image_url[:64]}")
        image_bytes = base64.b64decode(payload)
    else:
        import urllib.request

        # Bound the fetch so a dead or slow host cannot hang the request handler.
        with urllib.request.urlopen(image_url, timeout=10) as f:
            image_bytes = f.read()

    # Import PIL lazily, and only once the bytes are in hand.
    from PIL import Image

    return Image.open(BytesIO(image_bytes)).convert("RGB")

def process_messages(messages):
    """Normalize OpenAI-style chat messages into plain content lists.

    Each message's ``content`` becomes a list: a bare string is wrapped as a
    single-element list; a structured list keeps only ``{"type": ...}`` dict
    parts, appending ``text`` parts in order and inserting each loaded image
    (via ``load_image``) at the front of the list. Unrecognized parts are
    silently dropped.

    Args:
        messages: iterable of ``{"role": ..., "content": str | list}`` dicts.

    Returns:
        list[dict]: messages with ``role`` preserved and ``content`` normalized.
    """
    normalized = []
    for message in messages:
        raw = message["content"]
        if isinstance(raw, str):
            parts = [raw]
        else:
            parts = []
            for item in raw:
                if not (isinstance(item, dict) and "type" in item):
                    continue  # skip non-dict or untyped entries
                kind = item["type"]
                if kind == "text":
                    parts.append(item["text"])
                elif kind == "image_url":
                    ref = item["image_url"]
                    # Accept both {"url": ...} objects and bare URL strings.
                    url = ref["url"] if isinstance(ref, dict) and "url" in ref else ref
                    # Images go first so downstream code can pop them off the front.
                    parts.insert(0, load_image(image_url=url))
        normalized.append({"role": message["role"], "content": parts})
    return normalized

@chat_router.post(
"/completions",
Expand Down Expand Up @@ -74,6 +112,22 @@ async def create_chat_completion(
logger.debug(f"==== request ====\n{params}")

request_id: str = f"chatcmpl-{str(uuid.uuid4())}"

# 使用minicpm-v模型
minicpmv_messages = process_messages(request.messages)
image = minicpmv_messages[0]['content'][0]
question = minicpmv_messages[0]['content'][1]
minicpmv_messages[0]['content'] = f'(<image>./</image>)\n{question}'
request.messages = minicpmv_messages

# 使用internvl模型需要解注释
# internvl_messages = process_messages(request.messages)
# image = internvl_messages[0]['content'][0]
# question = internvl_messages[0]['content'][1]
# internvl_messages[0]['content'] = f"<image>\n{question}\n"
# request.messages = internvl_messages
# stop_token_ids = [0, 92543, 92542, 0]

token_ids = engine.template.convert_messages_to_ids(
messages=request.messages,
tools=request.tools,
Expand All @@ -97,13 +151,21 @@ async def create_chat_completion(
"spaces_between_special_tokens",
}
kwargs = dictify(request, include=include)
# 使用minicpm-v模型
sampling_params = SamplingParams(
stop=request.stop or [],
stop_token_ids=request.stop_token_ids or [],
max_tokens=request.max_tokens,
**kwargs,
)

# 使用internvl模型需要解注释
# sampling_params = SamplingParams(
# stop=request.stop or [],
# stop_token_ids=stop_token_ids or [],
# max_tokens=request.max_tokens,
# **kwargs,
# )

# Todo: support for lora
lora_request = None
try:
Expand Down Expand Up @@ -136,6 +198,9 @@ async def create_chat_completion(
{
"prompt": None,
"prompt_token_ids": token_ids,
"multi_modal_data": {
"image": image
}
},
sampling_params,
request_id,
Expand Down
49 changes: 49 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Application entry point: mount feature routers on the FastAPI app.

Routers are registered conditionally based on which models/engines were
loaded at import time by ``api.models``, then the server is started with
uvicorn when run as a script.
"""
from api.config import SETTINGS
from api.models import (
    app,
    EMBEDDING_MODEL,
    LLM_ENGINE,
    RERANK_MODEL,
)


# Common URL prefix for every mounted route (e.g. "/v1").
prefix = SETTINGS.api_prefix

# Embedding endpoints are only exposed when an embedding model was loaded.
if EMBEDDING_MODEL is not None:
    from api.routes.embedding import embedding_router

    app.include_router(embedding_router, prefix=prefix, tags=["Embedding"])

    # File endpoints are optional; skip silently when the module (or its
    # extra dependencies) is unavailable.
    try:
        from api.routes.file import file_router

        app.include_router(file_router, prefix=prefix, tags=["File"])
    except ImportError:
        pass

# Rerank endpoints are only exposed when a rerank model was loaded.
if RERANK_MODEL is not None:
    from api.routes.rerank import rerank_router

    app.include_router(rerank_router, prefix=prefix, tags=["Rerank"])


# Chat/completion endpoints are only exposed when an LLM engine was loaded;
# the concrete router implementation depends on the configured engine.
if LLM_ENGINE is not None:
    from api.routes import model_router

    app.include_router(model_router, prefix=prefix, tags=["Model"])

    if SETTINGS.engine == "vllm":
        from api.vllm_routes import chat_router as chat_router
        from api.vllm_routes import completion_router as completion_router

    else:
        from api.routes.chat import chat_router as chat_router
        from api.routes.completion import completion_router as completion_router

    app.include_router(chat_router, prefix=prefix, tags=["Chat Completion"])
    app.include_router(completion_router, prefix=prefix, tags=["Completion"])


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=SETTINGS.host, port=SETTINGS.port, log_level="info")