
Commit 00dbb6f

Improve the state in gradio web server (lm-sys#1348)
1 parent 7329f94 commit 00dbb6f

15 files changed, +244 −127 lines

README.md

Lines changed: 1 addition & 1 deletion
@@ -111,13 +111,13 @@ The following models are tested:
 - [BlinkDL/RWKV-4-Raven](https://huggingface.co/BlinkDL/rwkv-4-raven)
 - [databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b)
 - [FreedomIntelligence/phoenix-inst-chat-7b](https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b)
+- [h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2](https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2)
 - [mosaicml/mpt-7b-chat](https://huggingface.co/mosaicml/mpt-7b-chat)
 - [OpenAssistant/oasst-sft-1-pythia-12b](https://huggingface.co/OpenAssistant/oasst-sft-1-pythia-12b)
 - [project-baize/baize-lora-7B](https://huggingface.co/project-baize/baize-lora-7B)
 - [StabilityAI/stablelm-tuned-alpha-7b](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
 - [THUDM/chatglm-6b](https://huggingface.co/THUDM/chatglm-6b)
 - [Neutralzz/BiLLa-7B-SFT](https://huggingface.co/Neutralzz/BiLLa-7B-SFT)
-- [h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2](https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2)
 
 Help us [add more](https://github.com/lm-sys/FastChat/blob/main/docs/arena.md#how-to-add-a-new-model).
 

docs/arena.md

Lines changed: 4 additions & 2 deletions
@@ -17,8 +17,10 @@ If you want to see a specific model in the arena, you can follow the steps below
 ```
 
 Some major files you need to modify include
-- Implement a conversation template for the new model at https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py. You can follow existing examples and use `register_conv_template` to add a new one.
-- Implement a model adapter for the new model at https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py. You can follow existing examples and use `register_model_adapter` to add a new one.
+- Implement a conversation template for the new model at [fastchat/conversation.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py). You can follow existing examples and use `register_conv_template` to add a new one.
+- Implement a model adapter for the new model at [fastchat/model/model_adapter.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py). You can follow existing examples and use `register_model_adapter` to add a new one.
+- (Optional) add the model name to the "Supported Models" section in [README.md](https://github.com/lm-sys/FastChat#supported-models) and add more information in [fastchat/model/model_registry.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_registry.py).
+
 2. After the model is supported, we will try to schedule some computing resources to host the model in the arena.
 However, due to the limited resources we have, we may not be able to serve every model.
 We will select the models based on popularity, quality, diversity, and other factors.
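For reference, the template-registration step reads roughly like this. This is a minimal sketch: the field names follow the `Conversation` dataclass shown in the fastchat/conversation.py diff below, but the template name "my-model", system prompt, roles, and separators are illustrative placeholders, not part of this commit.

```python
from fastchat.conversation import (
    Conversation,
    SeparatorStyle,
    register_conv_template,
)

# Register a template under a hypothetical name "my-model"; the prompt,
# roles, and stop string are placeholders for a real model's values.
register_conv_template(
    Conversation(
        name="my-model",
        system="A chat between a curious user and an assistant.",
        roles=["Human", "Assistant"],
        messages=[],
        offset=0,
        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
        sep="\n",
        stop_str="Human:",
    )
)
```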

docs/openai_api.md

Lines changed: 2 additions & 2 deletions
@@ -102,8 +102,8 @@ curl http://localhost:8000/v1/embeddings \
 }'
 ```
 
-## Tunning
-Runner should answer within 20 seconds. If your model/hardware is slower, you wil get Timeout errors. You can change this timeout through ENV variables : "export WORKER_API_TIMEOUT=<larger timeout in seconds>"
+## Adjusting Timeout
+By default, a timeout error will occur if a model worker does not respond within 20 seconds. If your model/hardware is slower, you can change this timeout through an environment variable: `export FASTCHAT_WORKER_API_TIMEOUT=<larger timeout in seconds>`
 
 ## Todos
 Some features to be implemented:

fastchat/constants.py

Lines changed: 5 additions & 3 deletions
@@ -12,9 +12,11 @@
 LOGDIR = "."
 
 # For the controller and workers(could be overwritten through ENV variables.)
-CONTROLLER_HEART_BEAT_EXPIRATION = int(os.getenv("CONTROLLER_HEART_BEAT_EXPIRATION", 90))
-WORKER_HEART_BEAT_INTERVAL = int(os.getenv("WORKER_HEART_BEAT_INTERVAL", 30))
-WORKER_API_TIMEOUT = int(os.getenv("WORKER_API_TIMEOUT", 20))
+CONTROLLER_HEART_BEAT_EXPIRATION = int(
+    os.getenv("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
+)
+WORKER_HEART_BEAT_INTERVAL = int(os.getenv("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 30))
+WORKER_API_TIMEOUT = int(os.getenv("FASTCHAT_WORKER_API_TIMEOUT", 20))
 
 
 class ErrorCode(IntEnum):
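Note that these constants are evaluated once at import time (module-level `int(os.getenv(...))`), so the renamed `FASTCHAT_*` variables must be set before any fastchat module is imported. A quick sketch:

```python
import os

# Must happen before importing fastchat.constants, since the module reads
# the environment exactly once, at import time.
os.environ["FASTCHAT_WORKER_API_TIMEOUT"] = "120"

from fastchat.constants import WORKER_API_TIMEOUT

print(WORKER_API_TIMEOUT)  # -> 120
```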

fastchat/conversation.py

Lines changed: 4 additions & 14 deletions
@@ -27,7 +27,7 @@ class Conversation:
 
     # The name of this template
     name: str
-    # System prompts
+    # The System prompt
     system: str
     # Two roles
     roles: List[str]
@@ -44,12 +44,6 @@ class Conversation:
     # Stops generation if meeting any token in this list
     stop_token_ids: List[int] = None
 
-    # Used for the state in the gradio servers.
-    # TODO(lmzheng): move this out of this class.
-    conv_id: Any = None
-    skip_next: bool = False
-    model_name: str = None
-
     def get_prompt(self) -> str:
         """Get the prompt for generation."""
         if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
@@ -174,8 +168,6 @@ def copy(self):
             sep2=self.sep2,
             stop_str=self.stop_str,
             stop_token_ids=self.stop_token_ids,
-            conv_id=self.conv_id,
-            model_name=self.model_name,
         )
 
     def dict(self):
@@ -185,8 +177,6 @@ def dict(self):
             "roles": self.roles,
             "messages": self.messages,
             "offset": self.offset,
-            "conv_id": self.conv_id,
-            "model_name": self.model_name,
         }
 
 
@@ -479,8 +469,8 @@ def get_conv_template(name: str) -> Conversation:
         offset=0,
         sep_style=SeparatorStyle.ADD_COLON_SINGLE,
         sep="\n",
-        stop_str="<human>:",
-    )
+        stop_str="<human>",
+    )
 )
 
 # h2oGPT default template
@@ -493,10 +483,10 @@ def get_conv_template(name: str) -> Conversation:
         offset=0,
         sep_style=SeparatorStyle.NO_COLON_SINGLE,
         sep="</s>",
-        stop_str="</s>",
     )
 )
 
+
 if __name__ == "__main__":
     conv = get_conv_template("vicuna_v1.1")
     conv.append_message(conv.roles[0], "Hello!")
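The three fields removed here (`conv_id`, `skip_next`, `model_name`) move into a `State` wrapper in fastchat/serve/gradio_web_server.py, whose diff is not part of this excerpt. A plausible minimal shape, inferred only from how `State` is constructed and used in the hunks below (`State(model_name)`, `state.conv`, `state.skip_next`, `state.to_gradio_chatbot()`):

```python
import uuid

from fastchat.model.model_adapter import get_conversation_template


class State:
    def __init__(self, model_name):
        # The conversation itself, now free of UI bookkeeping.
        self.conv = get_conversation_template(model_name)
        self.conv_id = uuid.uuid4().hex
        self.skip_next = False
        self.model_name = model_name

    def to_gradio_chatbot(self):
        # The arena code calls this on State, so it presumably delegates
        # to the underlying conversation's renderer.
        return self.conv.to_gradio_chatbot()
```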

fastchat/model/model_adapter.py

Lines changed: 2 additions & 2 deletions
@@ -445,7 +445,7 @@ class ClaudeAdapter(BaseAdapter):
     """The model adapter for Claude."""
 
     def match(self, model_path: str):
-        return model_path in ["claude-v1", "claude-instant-v1.1"]
+        return model_path in ["claude-v1", "claude-instant-v1"]
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()
@@ -495,7 +495,7 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
     def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("redpajama-incite")
 
-
+
 class H2OGPTAdapter(BaseAdapter):
     """The model adapter for h2oGPT."""
 
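The adapter-registration step from docs/arena.md follows the same `BaseAdapter` pattern visible above. A minimal sketch; the class, the "my-model" path match, and the template name are illustrative, and `load_model` is assumed to fall back to `BaseAdapter`'s default Hugging Face loading:

```python
from fastchat.conversation import Conversation, get_conv_template
from fastchat.model.model_adapter import BaseAdapter, register_model_adapter


class MyModelAdapter(BaseAdapter):
    """A hypothetical adapter for models served from a 'my-model' path."""

    def match(self, model_path: str):
        return "my-model" in model_path

    def get_default_conv_template(self, model_path: str) -> Conversation:
        # Reuse the template registered earlier in fastchat/conversation.py.
        return get_conv_template("my-model")


register_model_adapter(MyModelAdapter)
```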
fastchat/model/model_registry.py

Lines changed: 6 additions & 5 deletions
@@ -1,3 +1,4 @@
+"""Additional information of the models."""
 from collections import namedtuple
 from typing import List
 
@@ -37,7 +38,7 @@ def get_model_info(name: str) -> ModelInfo:
     "Claude by Anthropic",
 )
 register_model_info(
-    ["claude-instant-v1.1"],
+    ["claude-instant-v1"],
     "Claude Instant",
     "https://www.anthropic.com/index/introducing-claude",
     "Claude Instant by Anthropic",
@@ -49,7 +50,7 @@ def get_model_info(name: str) -> ModelInfo:
     "Bard based on the PaLM 2 Chat API by Google",
 )
 register_model_info(
-    ["vicuna-13b"],
+    ["vicuna-13b", "vicuna-7b"],
     "Vicuna",
     "https://lmsys.org/blog/2023-03-30-vicuna/",
     "a chat assistant fine-tuned from LLaMA on user-shared conversations by LMSYS",
@@ -62,7 +63,7 @@ def get_model_info(name: str) -> ModelInfo:
 )
 register_model_info(
     ["oasst-pythia-12b"],
-    "OpenAssistant",
+    "OpenAssistant (oasst)",
     "https://open-assistant.io",
     "an Open Assistant for everyone by LAION",
 )
@@ -124,11 +125,11 @@ def get_model_info(name: str) -> ModelInfo:
     ["billa-7b-sft"],
     "BiLLa-7B-SFT",
     "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
-    "an instruction-tuned bilingual llama with enhanced reasoning ability by an independent researcher",
+    "an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
 )
 register_model_info(
     ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
     "h2oGPT-GM-7b",
     "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
-    "an instruction-tuned Apache 2.0 licensed llama with enhanced conversational ability by H2O.ai",
+    "an instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
 )
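The optional third step from docs/arena.md uses the same four-argument pattern shown in this diff. A sketch completing the hypothetical "my-model" example from the earlier sketches; the link and description are placeholders:

```python
from fastchat.model.model_registry import register_model_info

# Hypothetical entry matching the "my-model" template and adapter above.
register_model_info(
    ["my-model"],
    "MyModel",
    "https://example.com/my-model",
    "a hypothetical chat model used here only for illustration",
)
```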

fastchat/serve/api_provider.py

Lines changed: 4 additions & 3 deletions
@@ -70,6 +70,7 @@ def bard_api_stream_iter(state):
     # TODO: we will use the official PaLM 2 API sooner or later,
     # and we will update this function accordingly. So here we just hard code the
     # Bard worker address. It is going to be deprecated anyway.
+    conv = state.conv
 
     # Make requests
     gen_params = {
@@ -81,14 +82,14 @@ def bard_api_stream_iter(state):
     response = requests.post(
         "http://localhost:18900/chat",
         json={
-            "content": state.messages[-2][-1],
-            "state": state.session_state,
+            "content": conv.messages[-2][-1],
+            "state": state.bard_session_state,
         },
         stream=False,
         timeout=WORKER_API_TIMEOUT,
     )
     resp_json = response.json()
-    state.session_state = resp_json["state"]
+    state.bard_session_state = resp_json["state"]
     content = resp_json["content"]
     # The Bard Web API does not support streaming yet. Here we have to simulate
     # the streaming behavior by adding some time.sleep().
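As the closing comment notes, the Bard backend returns a complete response, so streaming is simulated. A minimal sketch of that pattern; the chunk size and delay here are illustrative, not the commit's values:

```python
import time


def simulate_stream(content, chunk_size=8, delay=0.02):
    """Yield progressively longer prefixes so the UI appears to stream."""
    for i in range(chunk_size, len(content) + chunk_size, chunk_size):
        time.sleep(delay)  # pace the output
        yield content[:i]
```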

fastchat/serve/cli.py

Lines changed: 5 additions & 1 deletion
@@ -150,6 +150,10 @@ def main(args):
         choices=["simple", "rich"],
         help="Display style.",
     )
-    parser.add_argument("--debug", action="store_true", help="Print debug information")
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Print useful debug information (e.g., prompts)",
+    )
     args = parser.parse_args()
     main(args)

fastchat/serve/gradio_block_arena_anony.py

Lines changed: 25 additions & 22 deletions
@@ -18,6 +18,7 @@
 from fastchat.model.model_adapter import get_conversation_template
 from fastchat.serve.gradio_patch import Chatbot as grChatbot
 from fastchat.serve.gradio_web_server import (
+    State,
     http_bot,
     get_conv_log_filename,
     no_change_btn,
@@ -138,8 +139,7 @@ def regenerate(state0, state1, request: gr.Request):
     logger.info(f"regenerate (anony). ip: {request.client.host}")
     states = [state0, state1]
     for i in range(num_models):
-        states[i].messages[-1][-1] = None
-        states[i].skip_next = False
+        states[i].conv.messages[-1][-1] = None
     return states + [x.to_gradio_chatbot() for x in states] + [""] + [disable_btn] * 6
 
 
@@ -166,13 +166,14 @@ def share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
     "gpt-4": 1.5,
     "gpt-3.5-turbo": 1.5,
     "claude-v1": 1.5,
-    "claude-instant-v1.1": 1.5,
+    "claude-instant-v1": 1.5,
     "bard": 1.5,
     "vicuna-13b": 1.5,
     "koala-13b": 1.5,
-    "RWKV-4-Raven-14B": 1.2,
-    "oasst-pythia-12b": 1.2,
+    "vicuna-7b": 1.2,
     "mpt-7b-chat": 1.2,
+    "oasst-pythia-12b": 1.2,
+    "RWKV-4-Raven-14B": 1.2,
     "fastchat-t5-3b": 1,
     "alpaca-13b": 1,
     "chatglm-6b": 1,
@@ -182,9 +183,12 @@ share_click(state0, state1, model_selector0, model_selector1, request: gr.Re
 }
 
 
-def add_text(state0, state1, text, request: gr.Request):
+def add_text(
+    state0, state1, model_selector0, model_selector1, text, request: gr.Request
+):
     logger.info(f"add_text (anony). ip: {request.client.host}. len: {len(text)}")
     states = [state0, state1]
+    model_selectors = [model_selector0, model_selector1]
 
     if states[0] is None:
         assert states[1] is None
@@ -198,11 +202,9 @@ def add_text(state0, state1, text, request: gr.Request):
         model_left = model_right = models[0]
 
     states = [
-        get_conversation_template("vicuna"),
-        get_conversation_template("vicuna"),
+        State(model_left),
+        State(model_right),
     ]
-    states[0].model_name = model_left
-    states[1].model_name = model_right
 
     if len(text) <= 0:
         for i in range(num_models):
@@ -235,7 +237,8 @@ def add_text(state0, state1, text, request: gr.Request):
         * 6
     )
 
-    if (len(states[0].messages) - states[0].offset) // 2 >= CONVERSATION_LEN_LIMIT:
+    conv = states[0].conv
+    if (len(conv.messages) - conv.offset) // 2 >= CONVERSATION_LEN_LIMIT:
         logger.info(
             f"hit conversation length limit. ip: {request.client.host}. text: {text}"
         )
@@ -253,8 +256,8 @@ def add_text(state0, state1, text, request: gr.Request):
 
     text = text[:INPUT_CHAR_LEN_LIMIT]  # Hard cut-off
     for i in range(num_models):
-        states[i].append_message(states[i].roles[0], text)
-        states[i].append_message(states[i].roles[1], None)
+        states[i].conv.append_message(states[i].conv.roles[0], text)
+        states[i].conv.append_message(states[i].conv.roles[1], None)
         states[i].skip_next = False
 
     return (
@@ -271,8 +274,6 @@ def add_text(state0, state1, text, request: gr.Request):
 def http_bot_all(
     state0,
     state1,
-    model_selector0,
-    model_selector1,
     temperature,
     top_p,
     max_new_tokens,
@@ -291,13 +292,11 @@ def http_bot_all(
         return
 
     states = [state0, state1]
-    model_selector = [state0.model_name, state1.model_name]
     gen = []
     for i in range(num_models):
        gen.append(
             http_bot(
                 states[i],
-                model_selector[i],
                 temperature,
                 top_p,
                 max_new_tokens,
@@ -447,7 +446,7 @@ def build_side_by_side_ui_anony(models):
         regenerate, states, states + chatbots + [textbox] + btn_list
     ).then(
         http_bot_all,
-        states + model_selectors + [temperature, top_p, max_output_tokens],
+        states + [temperature, top_p, max_output_tokens],
         states + chatbots + btn_list,
     )
     clear_btn.click(
@@ -477,17 +476,21 @@ def build_side_by_side_ui_anony(models):
     share_btn.click(share_click, states + model_selectors, [], _js=share_js)
 
     textbox.submit(
-        add_text, states + [textbox], states + chatbots + [textbox] + btn_list
+        add_text,
+        states + model_selectors + [textbox],
+        states + chatbots + [textbox] + btn_list,
     ).then(
         http_bot_all,
-        states + model_selectors + [temperature, top_p, max_output_tokens],
+        states + [temperature, top_p, max_output_tokens],
         states + chatbots + btn_list,
     )
     send_btn.click(
-        add_text, states + [textbox], states + chatbots + [textbox] + btn_list
+        add_text,
+        states + model_selectors + [textbox],
+        states + chatbots + [textbox] + btn_list,
    ).then(
         http_bot_all,
-        states + model_selectors + [temperature, top_p, max_output_tokens],
+        states + [temperature, top_p, max_output_tokens],
         states + chatbots + btn_list,
     )
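The per-model weights in the dict above presumably bias which models are paired in the anonymous arena; the actual selection logic is not shown in this excerpt. A minimal sketch of weighted pair sampling under standard `random.choices` semantics (the truncated dict and the function are illustrative, not part of the commit):

```python
import random

# An excerpt of the weights from the hunk above, for illustration only.
SAMPLING_WEIGHTS = {"gpt-4": 1.5, "vicuna-13b": 1.5, "vicuna-7b": 1.2}


def sample_model_pair(models):
    weights = [SAMPLING_WEIGHTS.get(m, 1.0) for m in models]
    model_left = random.choices(models, weights=weights, k=1)[0]
    # Resample until the two sides differ, so a model never battles itself.
    model_right = model_left
    while model_right == model_left and len(models) > 1:
        model_right = random.choices(models, weights=weights, k=1)[0]
    return model_left, model_right
```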
