diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..f953888 Binary files /dev/null and b/.DS_Store differ diff --git a/.env b/.env new file mode 100644 index 0000000..c8ba9e6 --- /dev/null +++ b/.env @@ -0,0 +1,28 @@ +PORT=8080 + +# model related +MODEL_NAME=minicpm-v +MODEL_PATH=/input1/ +PROMPT_NAME=minicpm-v + +# rag related +EMBEDDING_NAME= +RERANK_NAME= + +# api related +API_PREFIX=/v1 + +# vllm related +ENGINE=vllm +TRUST_REMOTE_CODE=true +TOKENIZE_MODE=auto +TENSOR_PARALLEL_SIZE=2 +DTYPE=half + +MAX_NUM_SEQS=10 +MAX_NUM_BATCHED_TOKENS=-1 +GPU_MEMORY_UTILIZATION=0.9 +# 使用internvl模型必须设置CONTEXT_LEN=4096 +CONTEXT_LEN=4096 +TASKS=llm +# TASKS=llm,rag \ No newline at end of file diff --git a/README_OPENBAYES.md b/README_OPENBAYES.md new file mode 100644 index 0000000..b33ea42 --- /dev/null +++ b/README_OPENBAYES.md @@ -0,0 +1,89 @@ +## 环境配置 + +### 本地环境 + +安装依赖,确保安装顺序严格按照下面的命令: + +```shell +pip install "vllm>=0.4.3" +pip install -r requirements.txt +``` + +## 启动模型 + +### 环境变量含义 + + ++ `MODEL_NAME`: 模型名称,如 `chatglm4`、`qwen2`、`llama3`等 + + ++ `PROMPT_NAME`: 使用的对话模板名称,如果不指定,则将根据 `tokenizer` 找到对应的模板 + + ++ `MODEL_PATH`: 开源大模型的文件所在路径 + + ++ `TRUST_REMOTE_CODE`: 是否使用外部代码 + + ++ `TOKENIZE_MODE`(可选项): `tokenizer` 的模式,默认为 `auto` + + ++ `TENSOR_PARALLEL_SIZE`(可选项): `GPU` 数量,默认为 `1` + + ++ `EMBEDDING_NAME`(可选项): 嵌入模型的文件所在路径,推荐使用 `moka-ai/m3e-base` 或者 `BAAI/bge-large-zh` + + ++ `GPU_MEMORY_UTILIZATION`(可选项): `GPU` 占用率 + + ++ `MAX_NUM_BATCHED_TOKENS`(可选项): 每个批处理的最大 `token` 数量 + + ++ `MAX_NUM_SEQS`(可选项): 批量大小 + + ++ `TASKS`(可选项): `llm` 表示启动对话大模型,`rag` 表示启动文档相关接口,比如`embedding`、`rerank` + + +### 启动方式 + +#### 本地启动 + +根据需求修改 `.env` 文件中的环境变量 + +```shell +python server.py +``` +#### 调用样例 +```shell +curl -X POST "http://127.0.0.1:8080/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer YOUR_API_KEY" \ +-d '{ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "这张图像有什么东西?"}, + { + "type": "image_url",
def load_image(image_url: str, *, timeout: float = 30.0):
    """Load an image from a ``data:`` URL or an HTTP(S) URL as an RGB PIL image.

    Args:
        image_url: Either a ``data:`` URL whose payload (the part after the
            first comma) is base64-encoded image bytes, or an ``http``/``https``
            URL to download the image from.
        timeout: Socket timeout in seconds for the remote download, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        The decoded image converted to ``"RGB"`` mode.

    Raises:
        ValueError: If a ``data:`` URL has no ``,`` separator or an invalid
            base64 payload, or if a remote URL uses a scheme other than
            ``http``/``https``.
        urllib.error.URLError: If the remote download fails.
    """
    from io import BytesIO

    if image_url.startswith("data:"):
        import base64

        # Everything after the first comma is the payload; the header before
        # it (media type, ";base64") is not needed to decode the bytes.
        _header, separator, payload = image_url.partition(",")
        if not separator:
            raise ValueError("Malformed data URL: missing ',' before the payload")
        image_bytes = base64.b64decode(payload)
    else:
        import urllib.parse
        import urllib.request

        # The URL comes straight from the request body. urlopen() also accepts
        # file:// and ftp://, which would let a client read local files, so
        # only plain web schemes are allowed.
        # NOTE(review): http(s) URLs can still point at internal hosts; add an
        # allow-list here if this service is exposed to untrusted clients.
        scheme = urllib.parse.urlsplit(image_url).scheme.lower()
        if scheme not in ("http", "https"):
            raise ValueError(
                f"Unsupported image URL scheme {scheme!r}; expected http, https or data"
            )
        with urllib.request.urlopen(image_url, timeout=timeout) as response:
            image_bytes = response.read()

    # Imported last so that input validation errors surface even when Pillow
    # is not installed.
    from PIL import Image

    return Image.open(BytesIO(image_bytes)).convert("RGB")


def process_messages(messages, image_loader=None):
    """Flatten OpenAI-style chat messages into ``{"role", "content": [...]}`` dicts.

    For every message the ``content`` becomes a flat list: all loaded images
    first (in the order they appear in the request), followed by all text
    parts (in order). A plain string content becomes a one-element list.

    Args:
        messages: Iterable of mappings with ``"role"`` and ``"content"`` keys.
            ``content`` may be a string, ``None``, or a list of parts shaped
            like ``{"type": "text", "text": ...}`` or
            ``{"type": "image_url", "image_url": {"url": ...} | "<url>"}``.
            Parts that are not dicts or have another ``type`` are ignored.
        image_loader: Optional callable invoked as ``image_loader(image_url=...)``
            to turn an image reference into an image object. Defaults to
            :func:`load_image`.

    Returns:
        A new list of ``{"role": ..., "content": [...]}`` dicts; the input
        messages are not modified.
    """
    loader = load_image if image_loader is None else image_loader

    processed = []
    for message in messages:
        content = message["content"]
        if isinstance(content, str):
            parts = [content]
        else:
            images = []
            texts = []
            # `content` may be None (e.g. a message without a body); treat
            # that as "no parts" instead of failing on iteration.
            for part in content or ():
                if not isinstance(part, dict):
                    continue
                part_type = part.get("type")
                if part_type == "text":
                    texts.append(part["text"])
                elif part_type == "image_url":
                    reference = part["image_url"]
                    # Accept both {"url": "..."} and a bare URL string.
                    if isinstance(reference, dict) and "url" in reference:
                        reference = reference["url"]
                    images.append(loader(image_url=reference))
            # Images go before text, and keep their request order (inserting
            # each one at index 0 would reverse multiple images).
            parts = images + texts
        processed.append({"role": message["role"], "content": parts})
    return processed
"""Application entry point: mount the API routers and start the HTTP server.

Routers are registered on the shared ``app`` only for the components that
``api.models`` reports as available (``EMBEDDING_MODEL``, ``RERANK_MODEL``,
``LLM_ENGINE`` not being ``None``). Running this file as a script serves the
app with uvicorn on ``SETTINGS.host`` / ``SETTINGS.port``.
"""
from api.config import SETTINGS
from api.models import (
    app,
    EMBEDDING_MODEL,
    LLM_ENGINE,
    RERANK_MODEL,
)


# Common URL prefix shared by every router mounted below.
prefix = SETTINGS.api_prefix


def _mount(router, tag):
    """Attach *router* to the app under the shared prefix with one tag."""
    app.include_router(router, prefix=prefix, tags=[tag])


if EMBEDDING_MODEL is not None:
    from api.routes.embedding import embedding_router

    _mount(embedding_router, "Embedding")

    # The file routes are optional: if their module (or one of its
    # dependencies) cannot be imported, they are simply not mounted.
    try:
        from api.routes.file import file_router

        _mount(file_router, "File")
    except ImportError:
        pass

if RERANK_MODEL is not None:
    from api.routes.rerank import rerank_router

    _mount(rerank_router, "Rerank")


if LLM_ENGINE is not None:
    from api.routes import model_router

    _mount(model_router, "Model")

    # Pick the chat/completion routers that match the configured engine.
    if SETTINGS.engine == "vllm":
        from api.vllm_routes import chat_router, completion_router
    else:
        from api.routes.chat import chat_router
        from api.routes.completion import completion_router

    _mount(chat_router, "Chat Completion")
    _mount(completion_router, "Completion")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=SETTINGS.host, port=SETTINGS.port, log_level="info")