diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..f953888
Binary files /dev/null and b/.DS_Store differ
diff --git a/.env b/.env
new file mode 100644
index 0000000..c8ba9e6
--- /dev/null
+++ b/.env
@@ -0,0 +1,28 @@
+PORT=8080
+
+# model related
+MODEL_NAME=minicpm-v
+MODEL_PATH=/input1/
+PROMPT_NAME=minicpm-v
+
+# rag related
+EMBEDDING_NAME=
+RERANK_NAME=
+
+# api related
+API_PREFIX=/v1
+
+# vllm related
+ENGINE=vllm
+TRUST_REMOTE_CODE=true
+TOKENIZE_MODE=auto
+TENSOR_PARALLEL_SIZE=2
+DTYPE=half
+
+MAX_NUM_SEQS=10
+MAX_NUM_BATCHED_TOKENS=-1
+GPU_MEMORY_UTILIZATION=0.9
+# 使用internvl模型必须设置CONTEXT_LEN=4096
+CONTEXT_LEN=4096
+TASKS=llm
+# TASKS=llm,rag
\ No newline at end of file
diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md
new file mode 100644
index 0000000..b33ea42
--- /dev/null
+++ b/README_OPENBAYES.md
@@ -0,0 +1,89 @@
+## 环境配置
+
+### 本地环境
+
+安装依赖,确保安装顺序严格按照下面的命令:
+
+```shell
+pip install "vllm>=0.4.3"
+pip install -r requirements.txt
+```
+
+## 启动模型
+
+### 环境变量含义
+
+
++ `MODEL_NAME`: 模型名称,如 `chatglm4`、`qwen2`、`llama3`等
+
+
++ `PROMPT_NAME`: 使用的对话模板名称,如果不指定,则将根据 `tokenizer` 找到对应的模板
+
+
++ `MODEL_PATH`: 开源大模型的文件所在路径
+
+
++ `TRUST_REMOTE_CODE`: 是否使用外部代码
+
+
++ `TOKENIZE_MODE`(可选项): `tokenizer` 的模式,默认为 `auto`
+
+
++ `TENSOR_PARALLEL_SIZE`(可选项): `GPU` 数量,默认为 `1`
+
+
++ `EMBEDDING_NAME`(可选项): 嵌入模型的文件所在路径,推荐使用 `moka-ai/m3e-base` 或者 `BAAI/bge-large-zh`
+
+
++ `GPU_MEMORY_UTILIZATION`(可选项): `GPU` 占用率
+
+
++ `MAX_NUM_BATCHED_TOKENS`(可选项): 每个批处理的最大 `token` 数量
+
+
++ `MAX_NUM_SEQS`(可选项): 批量大小
+
+
++ `TASKS`(可选项): `llm` 表示启动对话大模型,`rag` 表示启动文档相关接口,比如`embedding`、`rerank`
+
+
+### 启动方式
+
+#### 本地启动
+
+根据需求修改 `.env` 文件中的环境变量
+
+```shell
+python server.py
+```
+#### 调用样例
+```shell
+curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer YOUR_API_KEY" \
+-d '{
+ "messages": [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "这张图像有什么东西?"},
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://github.com/ByungKwanLee/TroL/blob/master/figures/demo.png?raw=true"
+ }
+ }
+ ]
+ }
+ ],
+ "model": "minicpm-v"
+}'
+```
+### 说明
+目前只支持minicpm-v、internvl模型,下面是minicpm-v最大并发量测试结果(RTX_8000 x 2):
+GPU_MEMORY_UTILIZATION=0.9 并发量10
+GPU_MEMORY_UTILIZATION=0.8 并发量14
+GPU_MEMORY_UTILIZATION=0.7 并发量20
+GPU_MEMORY_UTILIZATION=0.6 并发量28
+GPU_MEMORY_UTILIZATION=0.5 并发量30
+GPU_MEMORY_UTILIZATION=0.4 并发量36
diff --git a/api/.DS_Store b/api/.DS_Store
new file mode 100644
index 0000000..642dd58
Binary files /dev/null and b/api/.DS_Store differ
diff --git a/api/vllm_routes/chat.py b/api/vllm_routes/chat.py
index 31e31fe..d28f8cd 100644
--- a/api/vllm_routes/chat.py
+++ b/api/vllm_routes/chat.py
@@ -45,6 +45,44 @@
def get_engine():
yield LLM_ENGINE
+def load_image(image_url: str):
+ from PIL import Image
+ from io import BytesIO
+
+ if image_url.startswith("data:"):
+ import base64
+
+ image_bytes = base64.b64decode(image_url.split(",")[1])
+ else:
+ import urllib.request
+
+ with urllib.request.urlopen(image_url) as f:
+ image_bytes = f.read()
+
+ return Image.open(BytesIO(image_bytes)).convert("RGB")
+
+def process_messages(messages):
+ _messages = []
+ for message in messages:
+ if isinstance(message["content"], str):
+ _content = [message["content"]]
+ else:
+ _content = []
+ for c in message["content"]:
+ if isinstance(c, dict) and "type" in c:
+ if c["type"] == "text":
+ _content.append(c["text"])
+ elif c["type"] == "image_url":
+ if (
+ isinstance(c["image_url"], dict)
+ and "url" in c["image_url"]
+ ):
+ image = load_image(image_url=c["image_url"]["url"])
+ else:
+ image = load_image(image_url=c["image_url"])
+ _content.insert(0, image)
+ _messages.append({"role": message["role"], "content": _content})
+ return _messages
@chat_router.post(
"/completions",
@@ -74,6 +112,22 @@ async def create_chat_completion(
logger.debug(f"==== request ====\n{params}")
request_id: str = f"chatcmpl-{str(uuid.uuid4())}"
+
+ # 使用minicpm-v模型
+ minicpmv_messages = process_messages(request.messages)
+ image = minicpmv_messages[0]['content'][0]
+ question = minicpmv_messages[0]['content'][1]
+ minicpmv_messages[0]['content'] = f'(./)\n{question}'
+ request.messages = minicpmv_messages
+
+ # 使用internvl模型需要解注释
+ # internvl_messages = process_messages(request.messages)
+ # image = internvl_messages[0]['content'][0]
+ # question = internvl_messages[0]['content'][1]
+ # internvl_messages[0]['content'] = f"\n{question}\n"
+ # request.messages = internvl_messages
+ # stop_token_ids = [0, 92543, 92542, 0]
+
token_ids = engine.template.convert_messages_to_ids(
messages=request.messages,
tools=request.tools,
@@ -97,13 +151,21 @@ async def create_chat_completion(
"spaces_between_special_tokens",
}
kwargs = dictify(request, include=include)
+ # 使用minicpm-v模型
sampling_params = SamplingParams(
stop=request.stop or [],
stop_token_ids=request.stop_token_ids or [],
max_tokens=request.max_tokens,
**kwargs,
)
-
+ # 使用internvl模型需要解注释
+ # sampling_params = SamplingParams(
+ # stop=request.stop or [],
+ # stop_token_ids=stop_token_ids or [],
+ # max_tokens=request.max_tokens,
+ # **kwargs,
+ # )
+
# Todo: support for lora
lora_request = None
try:
@@ -136,6 +198,9 @@ async def create_chat_completion(
{
"prompt": None,
"prompt_token_ids": token_ids,
+ "multi_modal_data": {
+ "image": image
+ }
},
sampling_params,
request_id,
diff --git a/server.py b/server.py
new file mode 100644
index 0000000..a24c3a6
--- /dev/null
+++ b/server.py
@@ -0,0 +1,49 @@
+from api.config import SETTINGS
+from api.models import (
+ app,
+ EMBEDDING_MODEL,
+ LLM_ENGINE,
+ RERANK_MODEL,
+)
+
+
+prefix = SETTINGS.api_prefix
+
+if EMBEDDING_MODEL is not None:
+ from api.routes.embedding import embedding_router
+
+ app.include_router(embedding_router, prefix=prefix, tags=["Embedding"])
+
+ try:
+ from api.routes.file import file_router
+
+ app.include_router(file_router, prefix=prefix, tags=["File"])
+ except ImportError:
+ pass
+
+if RERANK_MODEL is not None:
+ from api.routes.rerank import rerank_router
+
+ app.include_router(rerank_router, prefix=prefix, tags=["Rerank"])
+
+
+if LLM_ENGINE is not None:
+ from api.routes import model_router
+
+ app.include_router(model_router, prefix=prefix, tags=["Model"])
+
+ if SETTINGS.engine == "vllm":
+ from api.vllm_routes import chat_router as chat_router
+ from api.vllm_routes import completion_router as completion_router
+
+ else:
+ from api.routes.chat import chat_router as chat_router
+ from api.routes.completion import completion_router as completion_router
+
+ app.include_router(chat_router, prefix=prefix, tags=["Chat Completion"])
+ app.include_router(completion_router, prefix=prefix, tags=["Completion"])
+
+
+if __name__ == "__main__":
+ import uvicorn
+ uvicorn.run(app, host=SETTINGS.host, port=SETTINGS.port, log_level="info")