Updated prompts a bit

whitead · whitead · commit 8f8cfbaa9dd1 · 2023-03-02T00:42:45.000-05:00
diff --git a/README.md b/README.md
@@ -52,17 +52,17 @@ Make sure you have set your OPENAI_API_KEY environment variable to your [openai
 
 To use paper-qa, you need to have a list of paths (valid extensions include: .pdf, .txt) and a list of citations (strings) that correspond to the paths. You can then use the `Docs` class to add the documents and then query them.
 
-*This uses a lot of tokens!! About 5-10k tokens per answer + embedding cost (negligible unless many documents used). That is up to $0.20 per answer with current GPT-3 pricing. Use wisely.*
+*This uses a lot of tokens!! About 5-10k tokens per answer + embedding cost (negligible unless many documents used). That is up to $0.02 per answer with current GPT-3 pricing. Use wisely.*
 
 ```python
 
 from paperqa import Docs
 
-# get a list of paths, citations
+# get a list of paths
 
 docs = Docs()
-for d, c in zip(my_docs, my_citations):
-    docs.add(d, c)
+for d in my_docs:
+    docs.add(d)
 
 # takes ~ 1 min and costs $0.10-$0.20 to execute this line
 answer = docs.query("What manufacturing challenges are unique to bispecific antibodies?")
diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -22,6 +22,7 @@
 from langchain.callbacks import get_openai_callback
 from langchain.cache import SQLiteCache
 import langchain
+from datetime import datetime
 
 CACHE_PATH = Path.home() / ".paperqa" / "llm_cache.db"
 os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
@@ -115,7 +116,8 @@ def add(
             texts, _ = read_doc(path, "", "", chunk_chars=chunk_chars)
             with get_openai_callback() as cb:
                 citation = self.cite_chain.run(texts[0])
-            print(f"Guessed citation {citation} for {cb.total_tokens} tokens")
+            if len(citation) < 3 or "Unknown" in citation or "insufficient" in citation:
+                citation = f"Unknown, {os.path.basename(path)}, {datetime.now().year}"
 
         if path in self.docs:
             raise ValueError(f"Document {path} already in collection.")
diff --git a/paperqa/qaprompts.py b/paperqa/qaprompts.py
@@ -3,11 +3,11 @@
 
 summary_prompt = prompts.PromptTemplate(
     input_variables=["question", "context_str"],
-    template="Summarize the text below to help answer a question. "
-    "Do not directly answer the question, instead provide a summary with the context of the question. "
+    template="Summarize and provide direct quotes from the text below to help answer a question. "
+    "Do not directly answer the question, instead provide a summary and quotes with the context of the question. "
     "Do not use outside sources. "
     'Reply with "Not applicable" if the text is unrelated to the question. '
-    "Use 75 or less words. Include quotations if possible."
+    "Use 75 or less words."
     "\n\n"
     "{context_str}\n"
     "\n"
@@ -18,13 +18,13 @@
 
 qa_prompt = prompts.PromptTemplate(
     input_variables=["question", "context_str", "length"],
-    template="Write a comprehensive answer ({length}) "
+    template="Write an answer ({length}) "
     "for the question below solely based on the provided context. "
     "If the context is irrelevant, "
     'reply "I cannot answer". '
     "For each sentence in your answer, indicate which sources most support it "
     "via valid citation markers at the end of sentences, like (Example2012). "
-    "Answer in an unbiased, balanced, and scientific tone. "
+    "Answer in an unbiased and scholarly tone. Make clear what is your opinion. "
     "Use Markdown for formatting code or text, and try to use direct quotes to support arguments.\n\n"
     "{context_str}\n"
     "Question: {question}\n"