
Commit 6d06244

Add README and scripts that run successfully.
1 parent e694856 commit 6d06244

File tree

4 files changed: +194 −1 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
/.idea

README.md

Lines changed: 31 additions & 1 deletion
@@ -1,2 +1,32 @@
# chat2db-chatglm-6b-deploy
-It shows how to deploy your own chatglm-6b and use it in chat2db

## 📖 Introduction

This project shows how to deploy ChatGLM-6B to free cloud resources or to your local machine, and how to use the deployed model from the Chat2DB client.

## 📦 Prerequisites

| Model | GPU (Inference) | GPU (Fine-tune) |
| :----: | :----: | :----: |
| ChatGLM-6B-int4 | 6 GB | 7 GB |
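
Before deploying, it may help to confirm your GPU actually has enough memory for int4 inference. A minimal check with PyTorch (already a dependency of the scripts in this commit):

```python
import torch

# Report total memory of GPU 0 in GiB; ChatGLM-6B-int4 needs about 6 GB for inference.
if torch.cuda.is_available():
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU 0 total memory: {total_gib:.1f} GiB")
else:
    print("No CUDA device found.")
```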

## 📦 Deploy

### 📦 Deploy to Google Colab

1. Open [chatglm-6b-int4-deploy.ipynb](https://colab.research.google.com/drive/1-jKsKISmlMCWTbaV-3HYBWbrTWxNLzOo?usp=sharing) in Google Colab. In our case, the model runs on Google Colab completely free.
2. Run the code from step 1 to step 6 in the notebook.
3. After step 6, you will get a public demo URL for your model, such as `https://3cef73d65765afdfea.gradio.live`. Click the URL to check that the model deployed successfully, and experiment with the model as you like. Click the stop button to stop the web demo.

<img src="https://alidocs.oss-cn-zhangjiakou.aliyuncs.com/res/4j6OJdYA60Y7n3p8/img/4bc2c26f-fa57-44be-a336-3e5729a2d104.png?x-oss-process=image/resize,w_640,m_lfit,limit_1">

4. Run the code from step 7 to step 9 in the notebook.
5. After step 9, you will get an API URL for your model, such as `https://dfb1-34-87-2-137.ngrok.io`. Run the command below on your local machine to check that the model deployed successfully.

<img src="https://alidocs.oss-cn-zhangjiakou.aliyuncs.com/res/4j6OJdYA60Y7n3p8/img/bec200c8-a343-45ff-b9a0-2bd21985da9a.png?x-oss-process=image/resize,w_640,m_lfit,limit_1">

```bash
curl -X POST "your api url" \
    -H 'Content-Type: application/json' \
    -d '{"prompt": "Hello", "history": []}'
```
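
If the call succeeds, the endpoint replies with the JSON object assembled in `api.py` below: `response` (the model's reply), `history` (the updated chat history to send with your next request), `status` (200), and `time` (a server-side timestamp).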

6. After you get the result, copy the API URL and use it in the Chat2DB client. Set the URL in the client as below:

<img src="https://alidocs.oss-cn-zhangjiakou.aliyuncs.com/res/4j6OJdYA60Y7n3p8/img/ca844185-2744-49e0-ab75-245e19b872d6.png?x-oss-process=image/resize,w_640,m_lfit,limit_1">

7. Now you can chat with the model in the Chat2DB client. Enjoy!

* Note: Google Colab disconnects after 12 hours; rerun the notebook to get a new public demo URL and API URL. Also, Colab's network speed is not very fast, so downloading and running the model may take a long time. Please be patient.

### 📦 Deploy to the local machine

* Since the network in Google Colab is not very fast, you can also deploy the model to your local machine. The local deployment script is similar to the Colab one; just follow the steps in [chatglm-6b-int4-deploy.ipynb](https://colab.research.google.com/drive/1-jKsKISmlMCWTbaV-3HYBWbrTWxNLzOo?usp=sharing).
* Note: when you deploy the model on your local machine, change the model path from `/content/chatglm-6b-int4` to a path on your machine, as sketched below. You also need to change the API URL in the Chat2DB client to the URL of your local machine.
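
A minimal sketch of that change, assuming you downloaded the weights to a local directory (the path below is a placeholder):

```python
from transformers import AutoModel, AutoTokenizer

# Placeholder: point this at your local copy of the chatglm-6b-int4 weights.
MODEL_PATH = "/path/to/chatglm-6b-int4"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().cuda()
model = model.eval()
```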

api.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn, json, datetime
import torch
import nest_asyncio
from pyngrok import ngrok

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    # Free cached GPU memory between requests so a long-running server does not OOM.
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


app = FastAPI()


@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    # Parse the JSON body; the sampling parameters fall back to defaults below.
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    history = json_post_list.get('history')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')
    response, history = model.chat(tokenizer,
                                   prompt,
                                   history=history,
                                   max_length=max_length if max_length else 2048,
                                   top_p=top_p if top_p else 0.7,
                                   temperature=temperature if temperature else 0.95)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": time
    }
    log = "[" + time + "] " + 'prompt:"' + prompt + '", response:"' + repr(response) + '"'
    print(log)
    torch_gc()
    # Return the full answer object so the caller also gets the updated history.
    return answer


if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained("/content/chatglm-6b-int4", trust_remote_code=True)
    model = AutoModel.from_pretrained("/content/chatglm-6b-int4", trust_remote_code=True).half().cuda()
    model.eval()
    # Expose the local FastAPI server through an ngrok tunnel so it is reachable from outside Colab.
    ngrok_tunnel = ngrok.connect(8000)
    print('Public URL:', ngrok_tunnel.public_url)
    nest_asyncio.apply()
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
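
A quick way to exercise this endpoint from Python rather than curl might look like the sketch below, assuming the `requests` package is installed; the URL is a placeholder for your ngrok or local address, and the fields follow the `answer` object above:

```python
import requests

API_URL = "http://localhost:8000"  # placeholder: your ngrok URL or local address

payload = {"prompt": "Hello", "history": []}
resp = requests.post(API_URL, json=payload, timeout=120)
data = resp.json()
print(data["response"])  # the model's reply
print(data["history"])   # pass this back as "history" on the next turn
```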

web_demo.py

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import mdtex2html

tokenizer = AutoTokenizer.from_pretrained("/content/chatglm-6b-int4", trust_remote_code=True)
model = AutoModel.from_pretrained("/content/chatglm-6b-int4", trust_remote_code=True).half().cuda()
model = model.eval()

"""Override Chatbot.postprocess"""


def postprocess(self, y):
    # Render both sides of each chat turn (user message, model response) as HTML.
    if y is None:
        return []
    for i, (message, response) in enumerate(y):
        y[i] = (
            None if message is None else mdtex2html.convert(message),
            None if response is None else mdtex2html.convert(response),
        )
    return y


gr.Chatbot.postprocess = postprocess


def parse_text(text):
    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split('`')
            if count % 2 == 1:
                # Opening fence: start an HTML code block, keeping the language tag.
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                # Closing fence.
                lines[i] = f'<br></code></pre>'
        else:
            if i > 0:
                if count % 2 == 1:
                    # Inside a code block: escape characters that markdown would mangle.
                    line = line.replace("`", "\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    text = "".join(lines)
    return text


def predict(input, chatbot, max_length, top_p, temperature, history):
    # Stream partial responses so the UI updates as tokens are generated.
    chatbot.append((parse_text(input), ""))
    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
                                               temperature=temperature):
        chatbot[-1] = (parse_text(input), parse_text(response))

        yield chatbot, history


def reset_user_input():
    return gr.update(value='')


def reset_state():
    return [], []


with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">ChatGLM</h1>""")

    chatbot = gr.Chatbot()
    with gr.Row():
        with gr.Column(scale=4):
            with gr.Column(scale=12):
                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
                    container=False)
            with gr.Column(min_width=32, scale=1):
                submitBtn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=1):
            emptyBtn = gr.Button("Clear History")
            max_length = gr.Slider(0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
            top_p = gr.Slider(0, 1, value=0.7, step=0.01, label="Top P", interactive=True)
            temperature = gr.Slider(0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)

    history = gr.State([])

    submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history],
                    show_progress=True)
    submitBtn.click(reset_user_input, [], [user_input])

    emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)

demo.queue().launch(share=True, inbrowser=True)
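
`share=True` is what creates the temporary `*.gradio.live` URL used in the Colab steps above. For a purely local run, a plain local launch works too; a sketch using gradio's standard launch options:

```python
# Local-only alternative: serve on your own machine without a public gradio.live tunnel.
demo.queue().launch(server_name="0.0.0.0", server_port=7860)
```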

0 commit comments
