Add experimental feature: multi-stage search copilot command
yym68686 committed Feb 1, 2024
1 parent a12b5f6 commit 0db07ff
Showing 7 changed files with 120 additions and 75 deletions.
49 changes: 28 additions & 21 deletions bot.py
@@ -1,5 +1,7 @@
import re
import os
import sys
sys.dont_write_bytecode = True
import config
import logging
import traceback
@@ -90,9 +92,6 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,
if message:
if "claude" in config.GPT_ENGINE and config.ClaudeAPI:
robot = config.claudeBot
# if not config.API or config.PLUGINS["USE_G4F"]:
# import utils.gpt4free as gpt4free
# robot = gpt4free
if image_url:
robot = config.GPT4visionbot
title = "`🤖️ gpt-4-vision-preview`\n\n"
@@ -130,10 +129,11 @@ async def reset_chat(update, context):
)

async def getChatGPT(update, context, title, robot, message, chatid, messageid):
result = title
result = ""
text = message
modifytime = 0
lastresult = ''
time_out = 600
lastresult = title

message = await context.bot.send_message(
chat_id=chatid,
@@ -151,23 +151,29 @@ async def getChatGPT(update, context, title, robot, message, chatid, messageid):
for data in get_answer(text, convo_id=str(chatid), pass_history=pass_history):
if "🌐" not in data:
result = result + data
tmpresult = result
tmpresult = title + result
modifytime = modifytime + 1
if re.sub(r"```", '', result).count("`") % 2 != 0:
tmpresult = result + "`"
tmpresult = title + result + "`"
if result.count("```") % 2 != 0:
tmpresult = result + "\n```"
tmpresult = title + result + "\n```"
if 'claude2' in title:
tmpresult = re.sub(r",", ',', tmpresult)
if "🌐" in data:
tmpresult = data
if "answer:" in result:
tmpresult = re.sub(r"thought:[\S\s]+?answer:\s", '', tmpresult)
tmpresult = re.sub(r"action:[\S\s]+?answer:\s", '', tmpresult)
tmpresult = re.sub(r"answer:\s", '', tmpresult)
tmpresult = re.sub(r"thought:[\S\s]+", '', tmpresult)
tmpresult = re.sub(r"action:[\S\s]+", '', tmpresult)
else:
tmpresult = re.sub(r"thought:[\S\s]+", '', tmpresult)
if (modifytime % 20 == 0 and lastresult != tmpresult) or "🌐" in data:
if 'claude2' in title:
tmpresult = re.sub(r",", ',', tmpresult)
if "🌐" in data:
tmpresult = data
await context.bot.edit_message_text(chat_id=chatid, message_id=messageid, text=escape(tmpresult), parse_mode='MarkdownV2', disable_web_page_preview=True)
await context.bot.edit_message_text(chat_id=chatid, message_id=messageid, text=escape(tmpresult), parse_mode='MarkdownV2', disable_web_page_preview=True, read_timeout=time_out, write_timeout=time_out, pool_timeout=time_out, connect_timeout=time_out)
lastresult = tmpresult
except Exception as e:
print('\033[31m')
print("response_msg", result)
print("error", e)
traceback.print_exc()
print('\033[0m')
if config.API:
@@ -177,12 +183,12 @@ async def getChatGPT(update, context, title, robot, message, chatid, messageid):
await context.bot.delete_message(chat_id=chatid, message_id=messageid)
messageid = ''
config.API = ''
result += f"`出错啦!{e}`"
print(result)
if lastresult != result and messageid:
tmpresult = f"`{e}`"
print(tmpresult)
if lastresult != tmpresult and messageid:
if 'claude2' in title:
result = re.sub(r",", ',', result)
await context.bot.edit_message_text(chat_id=chatid, message_id=messageid, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True)
tmpresult = re.sub(r",", ',', tmpresult)
await context.bot.edit_message_text(chat_id=chatid, message_id=messageid, text=escape(tmpresult), parse_mode='MarkdownV2', disable_web_page_preview=True, read_timeout=time_out, write_timeout=time_out, pool_timeout=time_out, connect_timeout=time_out)

@decorators.GroupAuthorization
@decorators.Authorization
@@ -491,10 +497,10 @@ async def post_init(application: Application) -> None:
await application.bot.set_my_commands([
BotCommand('info', 'basic information'),
BotCommand('pic', 'Generate image'),
BotCommand('copilot', 'Advanced search mode'),
BotCommand('search', 'search Google or duckduckgo'),
BotCommand('en2zh', 'translate to Chinese'),
BotCommand('zh2en', 'translate to English'),
# BotCommand('qa', 'Document Q&A with Embedding Database Search'),
BotCommand('start', 'Start the bot'),
BotCommand('reset', 'Reset the bot'),
])
@@ -520,6 +526,7 @@ async def post_init(application: Application) -> None:
application.add_handler(CommandHandler("reset", reset_chat))
application.add_handler(CommandHandler("en2zh", lambda update, context: command_bot(update, context, "Simplified Chinese", robot=config.translate_bot)))
application.add_handler(CommandHandler("zh2en", lambda update, context: command_bot(update, context, "english", robot=config.translate_bot)))
application.add_handler(CommandHandler("copilot", lambda update, context: command_bot(update, context, None, None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.copilot_bot)))
application.add_handler(CommandHandler("info", info))
application.add_handler(InlineQueryHandler(inlinequery))
# application.add_handler(CommandHandler("qa", qa))
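The new /copilot replies stream back with a ReAct-style scratchpad (thought: / action: / answer:), and the regexes added to getChatGPT above strip that scratchpad so only the answer text is shown while streaming. A minimal standalone trace of that stripping, using a made-up streamed string (not taken from the repository):

import re

def strip_scratchpad(text: str) -> str:
    # Sketch only: mirrors the answer-extraction regexes added to getChatGPT.
    if "answer:" in text:
        text = re.sub(r"thought:[\S\s]+?answer:\s", '', text)
        text = re.sub(r"action:[\S\s]+?answer:\s", '', text)
        text = re.sub(r"answer:\s", '', text)
        text = re.sub(r"thought:[\S\s]+", '', text)
        text = re.sub(r"action:[\S\s]+", '', text)
    else:
        # While the answer has not started yet, hide the scratchpad entirely.
        text = re.sub(r"thought:[\S\s]+", '', text)
    return text

# Hypothetical partial stream, not taken from the repository:
streamed = "thought: need current data\naction: get_search_results\nanswer: The latest stable release is 1.2.3."
print(strip_scratchpad(streamed))  # -> The latest stable release is 1.2.3.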
4 changes: 3 additions & 1 deletion config.py
@@ -1,6 +1,7 @@
import os
from dotenv import load_dotenv
load_dotenv()
import utils.prompt as prompt

WEB_HOOK = os.environ.get('WEB_HOOK', None)
BOT_TOKEN = os.environ.get('BOT_TOKEN', None)
@@ -22,7 +23,7 @@
from datetime import datetime
current_date = datetime.now()
Current_Date = current_date.strftime("%Y-%m-%d")
systemprompt = os.environ.get('SYSTEMPROMPT', f"You are ChatGPT, a large language model trained by OpenAI. Respond conversationally in {LANGUAGE}. Knowledge cutoff: 2023-04. Current date: [ {Current_Date} ]")
systemprompt = os.environ.get('SYSTEMPROMPT', prompt.system_prompt.format(LANGUAGE, Current_Date))

from utils.chatgpt2api import Chatbot as GPT
from utils.chatgpt2api import Imagebot, claudebot
@@ -37,6 +38,7 @@
except:
print("无法使用 gpt-4-vision-preview 模型")
translate_bot = GPT(api_key=f"{API}", engine=GPT_ENGINE, system_prompt=systemprompt, temperature=temperature)
copilot_bot = GPT(api_key=f"{API}", engine=GPT_ENGINE, system_prompt=prompt.search_system_prompt.format(LANGUAGE), temperature=temperature)
dallbot = Imagebot(api_key=f"{API}")
else:
ChatGPTbot = None
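config.py now builds both system prompts from a new utils/prompt.py module, one of the files whose diff did not load on this page. Its exact contents are not shown here; a plausible minimal shape, inferred only from the two .format(...) call sites above, might be:

# utils/prompt.py -- hypothetical sketch; the real file is not part of the loaded diff.
# config.py formats system_prompt with (LANGUAGE, Current_Date) and
# search_system_prompt with (LANGUAGE,), so both are assumed to be plain templates.
system_prompt = (
    "You are ChatGPT, a large language model trained by OpenAI. "
    "Respond conversationally in {}. Knowledge cutoff: 2023-04. "
    "Current date: [ {} ]"
)
search_system_prompt = (
    "You are a search assistant. Think step by step, decide whether to call "
    "the search tools, and answer the user's question in {}."
)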
6 changes: 3 additions & 3 deletions test/test_Web_crawler.py
@@ -80,7 +80,7 @@ def Web_crawler(url: str, isSearch=False) -> str:
print("Skipping large file:", url)
return result
soup = BeautifulSoup(response.text.encode(response.encoding), 'lxml', from_encoding='utf-8')

table_contents = ""
tables = soup.find_all('table')
for table in tables:
@@ -133,7 +133,8 @@ def Web_crawler(url: str, isSearch=False) -> str:
# for url in ['https://s.weibo.com/top/summary?cate=realtimehot']:
# for url in ['https://tophub.today/n/KqndgxeLl9']:
# for url in ['https://support.apple.com/zh-cn/HT213931']:
for url in ['https://www.usnews.com/news/entertainment/articles/2023-12-22/china-drafts-new-rules-proposing-restrictions-on-online-gaming']:
for url in ['https://finance.sina.com.cn/stock/roll/2023-06-26/doc-imyyrexk4053724.shtml', 'https://s.weibo.com/top/summary?cate=realtimehot', 'https://tophub.today/n/KqndgxeLl9', 'https://www.whatsonweibo.com/', 'https://www.trendingonweibo.com/?ref=producthunt', 'https://www.trendingonweibo.com/', 'https://www.statista.com/statistics/1377073/china-most-popular-news-on-weibo/']:
# for url in ['https://www.usnews.com/news/entertainment/articles/2023-12-22/china-drafts-new-rules-proposing-restrictions-on-online-gaming']:
# for url in ['https://developer.aliyun.com/article/721836']:
# for url in ['https://cn.aliyun.com/page-source/price/detail/machinelearning_price']:
# for url in ['https://mp.weixin.qq.com/s/Itad7Y-QBcr991JkF3SrIg']:
@@ -148,4 +149,3 @@ def Web_crawler(url: str, isSearch=False) -> str:
run_time = end_time - start_time
# 打印运行时间
print(f"程序运行时间:{run_time}秒")

9 changes: 9 additions & 0 deletions test/test_re_agent.py
@@ -0,0 +1,9 @@
import re
matches = re.finditer(r"answer: (.*)", test_str, re.MULTILINE)
result = []
for matchNum, match in enumerate(matches, start=1):
for groupNum in range(0, len(match.groups())):
groupNum = groupNum + 1
result.append(match.group(groupNum))

print("\n\n".join(result))
63 changes: 26 additions & 37 deletions utils/chatgpt2api.py
@@ -254,7 +254,7 @@ def __init__(
api_key: str,
engine: str = os.environ.get("GPT_ENGINE") or "gpt-3.5-turbo",
proxy: str = None,
timeout: float = None,
timeout: float = 600,
max_tokens: int = None,
temperature: float = 0.5,
top_p: float = 1.0,
@@ -285,7 +285,7 @@ def __init__(
)
# context max tokens
self.truncate_limit: int = truncate_limit or (
16000
32000
# 126500 Control the number of search characters to prevent excessive spending
if "gpt-4-1106-preview" in engine or "gpt-4-0125-preview" in engine or "gpt-4-turbo-preview" in engine or self.engine == "gpt-4-vision-preview"
else 30500
@@ -337,7 +337,7 @@ def __init__(
],
}
self.function_calls_counter = {}
self.function_call_max_loop = 3
self.function_call_max_loop = 10
# self.encode_web_text_list = []

if self.get_token_count("default") > self.max_tokens:
@@ -362,7 +362,7 @@ def add_to_conversation(
else:
print('\033[31m')
print("error: add_to_conversation message is None or empty")
print(self.conversation[convo_id])
print("role", role, "function_name", function_name, "message", message)
print('\033[0m')

def __truncate_conversation(self, convo_id: str = "default") -> None:
@@ -593,6 +593,7 @@ def ask_stream(
)
response_role: str or None = None
full_response: str = ""
function_full_response: str = ""
function_call_name: str = ""
need_function_call: bool = False
for line in response.iter_lines():
@@ -609,6 +610,7 @@
if line == "[DONE]":
break
resp: dict = json.loads(line)
# print("resp", resp)
choices = resp.get("choices")
if not choices:
continue
@@ -627,12 +629,12 @@
function_call_content = delta["function_call"]["arguments"]
if "name" in delta["function_call"]:
function_call_name = delta["function_call"]["name"]
full_response += function_call_content
if full_response.count("\\n") > 2 or "}" in full_response:
function_full_response += function_call_content
if function_full_response.count("\\n") > 2 or "}" in function_full_response:
break
if need_function_call:
full_response = check_json(full_response)
print("full_response", full_response)
function_full_response = check_json(function_full_response)
print("function_full_response", function_full_response)
if not self.function_calls_counter.get(function_call_name):
self.function_calls_counter[function_call_name] = 1
else:
@@ -641,51 +643,31 @@
function_call_max_tokens = self.truncate_limit - message_token["total"] - 1000
if function_call_max_tokens <= 0:
function_call_max_tokens = int(self.truncate_limit / 2)
print("function_call_max_tokens", function_call_max_tokens)
print("\033[32m function_call", function_call_name, "max token:", function_call_max_tokens, "\033[0m")
if function_call_name == "get_search_results":
# g4t 提取的 prompt 有问题
# prompt = json.loads(full_response)["prompt"]
for index in range(len(self.conversation[convo_id])):
if self.conversation[convo_id][-1 - index]["role"] == "user":
self.conversation[convo_id][-1 - index]["content"][0]["text"] = self.conversation[convo_id][-1 - index]["content"][0]["text"].replace("search: ", "")
prompt = self.conversation[convo_id][-1 - index]["content"][0]["text"]
if json.loads(full_response)["prompt"].strip() != prompt:
prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
print("\n\nprompt", prompt)
break
tiktoken.get_encoding("cl100k_base")
encoding = tiktoken.encoding_for_model(config.GPT_ENGINE)
web_result = yield from get_url_text_list(prompt)

encode_web_text_list = encoding.encode(" ".join(web_result))
print("search len", len(encode_web_text_list))
function_response = encoding.decode(encode_web_text_list[:function_call_max_tokens])
# if self.encode_web_text_list == []:
# self.encode_web_text_list = encoding.encode(" ".join(get_url_text_list(prompt)))
# print("search len", len(self.encode_web_text_list))
# function_response = encoding.decode(self.encode_web_text_list[:function_call_max_tokens])
# self.encode_web_text_list = self.encode_web_text_list[function_call_max_tokens:]

# function_response = eval(function_call_name)(prompt, function_call_max_tokens)
prompt = json.loads(function_full_response)["prompt"]
function_response = eval(function_call_name)(prompt)
function_response, text_len = cut_message(function_response, function_call_max_tokens)
function_response = (
f"You need to response the following question: {prompt}. Search results is provided inside <Search_results></Search_results> XML tags. Your task is to think about the question step by step and then answer the above question in {config.LANGUAGE} based on the Search results provided. Please response in {config.LANGUAGE} and adopt a style that is logical, in-depth, and detailed. Note: In order to make the answer appear highly professional, you should be an expert in textual analysis, aiming to make the answer precise and comprehensive. Directly response markdown format, without using markdown code blocks"
"Here is the Search results, inside <Search_results></Search_results> XML tags:"
"<Search_results>"
"{}"
"</Search_results>"
).format(function_response)
user_prompt = f"You need to response the following question: {prompt}. Search results is provided inside <Search_results></Search_results> XML tags. Your task is to think about the question step by step and then answer the above question in {config.LANGUAGE} based on the Search results provided. Please response in {config.LANGUAGE} and adopt a style that is logical, in-depth, and detailed. Note: In order to make the answer appear highly professional, you should be an expert in textual analysis, aiming to make the answer precise and comprehensive. Directly response markdown format, without using markdown code blocks"
self.add_to_conversation(user_prompt, "user", convo_id=convo_id)
# user_prompt = f"You need to response the following question: {prompt}. Search results is provided inside <Search_results></Search_results> XML tags. Your task is to think about the question step by step and then answer the above question in {config.LANGUAGE} based on the Search results provided. Please response in {config.LANGUAGE} and adopt a style that is logical, in-depth, and detailed. Note: In order to make the answer appear highly professional, you should be an expert in textual analysis, aiming to make the answer precise and comprehensive. Directly response markdown format, without using markdown code blocks"
# self.add_to_conversation(user_prompt, "user", convo_id=convo_id)
if function_call_name == "get_url_content":
url = json.loads(full_response)["url"]
url = json.loads(function_full_response)["url"]
print("\n\nurl", url)
function_response = Web_crawler(url)
function_response, text_len = cut_message(function_response, function_call_max_tokens)
function_response = (
"Here is the documentation, inside <document></document> XML tags:"
"<document>"
"{}"
"</document>"
).format(function_response)
function_response, text_len = cut_message(function_response, function_call_max_tokens)
if function_call_name == "get_date_time_weekday":
function_response = eval(function_call_name)()
function_response, text_len = cut_message(function_response, function_call_max_tokens)
@@ -695,8 +677,15 @@ def ask_stream(
else:
function_response = "抱歉,直接告诉用户,无法找到相关信息"
response_role = "function"
# print(self.conversation[convo_id][-1])
if self.conversation[convo_id][-1]["role"] == "function" and self.conversation[convo_id][-1]["name"] == "get_search_results":
mess = self.conversation[convo_id].pop(-1)
# print("Truncate message:", mess)
self.add_to_conversation(full_response, "assistant", convo_id=convo_id)
yield from self.ask_stream(function_response, response_role, convo_id=convo_id, function_name=function_call_name)
else:
if self.conversation[convo_id][-1]["role"] == "function" and self.conversation[convo_id][-1]["name"] == "get_search_results":
mess = self.conversation[convo_id].pop(-1)
self.add_to_conversation(full_response, response_role, convo_id=convo_id)
self.function_calls_counter = {}
# self.clear_function_call(convo_id=convo_id)
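The new code path calls cut_message(...) to fit search results and crawled pages into the remaining token budget, but that helper's definition is not part of this diff. A minimal sketch of what it might look like, assuming tiktoken-based counting and the (text, token_count) return shape implied by the call sites:

import tiktoken

def cut_message(message: str, max_tokens: int, model: str = "gpt-3.5-turbo"):
    # Hypothetical sketch: trim message to at most max_tokens tokens and return
    # the trimmed text together with its token count, matching the
    # `function_response, text_len = cut_message(...)` call sites above.
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(message)[:max_tokens]
    return encoding.decode(tokens), len(tokens)

# Example: keep only the first 50 tokens of a long crawled page.
trimmed, text_len = cut_message("very long search result " * 200, 50)
print(text_len)  # 50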
(Diffs for the remaining two changed files did not load and are not shown here.)
