Fixing used_images usage in summary_json_system_prompt (#1109)

jamesbraza · web-flow · commit 68af51944dc5 · 2025-09-25T22:09:03.000-07:00
diff --git a/README.md b/README.md
@@ -751,6 +751,12 @@ When creating contextual summaries on a given chunk (a `Text`),
 the summary LLM is passed both the chunk's text and the chunk's associated media,
 but the output contextual summary itself remains text-only.
 
+If you would like,
+setting `prompts.summary_json_system`
+to the prompt `paperqa.prompts.summary_json_multimodal_system_prompt`
+will add a `used_images` flag to the contextual summary output,
+attributing usage of images in any contextual summarizations.
+
 ### Using External DB/Vector DB and Caching
 
 You may want to cache parsed texts and embeddings in an external database or file.
diff --git a/src/paperqa/prompts.py b/src/paperqa/prompts.py
@@ -112,7 +112,19 @@
     " Your summary, combined with many others,"
     " will be given to the model to generate an answer."
     " Respond with the following JSON format:"
-    '\n\n{{\n  "summary": "...",\n  "relevance_score": 0-10,\n  "used_images"\n}}'
+    '\n\n{{\n  "summary": "...",\n  "relevance_score": 0-10\n}}'
+    "\n\nwhere `summary` is relevant information from the text - {summary_length} words."
+    " `relevance_score` is an integer 0-10 for the relevance of `summary` to the question."
+    "\n\nThe excerpt may or may not contain relevant information."
+    " If not, leave `summary` empty, and make `relevance_score` be 0."
+)
+summary_json_multimodal_system_prompt = (
+    "Provide a summary of the relevant information"
+    " that could help answer the question based on the excerpt."
+    " Your summary, combined with many others,"
+    " will be given to the model to generate an answer."
+    " Respond with the following JSON format:"
+    '\n\n{{\n  "summary": "...",\n  "relevance_score": 0-10,\n  "used_images": "..."\n}}'
     "\n\nwhere `summary` is relevant information from the text - {summary_length} words."
     " `relevance_score` is an integer 0-10 for the relevance of `summary` to the question."
     " `used_images` is a boolean flag indicating"
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -61,7 +61,7 @@
     llm_parse_json,
     map_fxn_summary,
 )
-from paperqa.prompts import CANNOT_ANSWER_PHRASE
+from paperqa.prompts import CANNOT_ANSWER_PHRASE, summary_json_multimodal_system_prompt
 from paperqa.prompts import qa_prompt as default_qa_prompt
 from paperqa.readers import PDFParserFn, parse_image, read_doc
 from paperqa.settings import AsyncContextSerializer
@@ -1594,6 +1594,7 @@ async def test_images(stub_data_dir: Path) -> None:
     # We don't support image embeddings yet, so disable embedding
     settings.answer.evidence_retrieval = False
     settings.parsing.defer_embedding = True
+    settings.prompts.summary_json_system = summary_json_multimodal_system_prompt
 
     docs = Docs()
     districts_docname = await docs.aadd(
@@ -1625,7 +1626,7 @@ async def test_images(stub_data_dir: Path) -> None:
         if c.id in session.used_contexts and c.text.doc == districts_doc
     ]
     assert contexts_used
-    assert all(c.used_images for c in contexts_used)  # type: ignore[attr-defined]
+    assert all(bool(c.used_images) for c in contexts_used)  # type: ignore[attr-defined]
 
 
 @pytest.mark.asyncio
@@ -1636,6 +1637,7 @@ async def test_images_corrupt(stub_data_dir: Path) -> None:
     # We don't support image embeddings yet, so disable embedding
     settings.answer.evidence_retrieval = False
     settings.parsing.defer_embedding = True
+    settings.prompts.summary_json_system = summary_json_multimodal_system_prompt
 
     docs = Docs()
     districts_docname = await docs.aadd(
@@ -1687,7 +1689,7 @@ async def test_images_corrupt(stub_data_dir: Path) -> None:
         if c.id in session.used_contexts and c.text.doc == districts_doc
     ]
     assert contexts_used
-    assert all(not c.used_images for c in contexts_used)  # type: ignore[attr-defined]
+    assert all(not bool(c.used_images) for c in contexts_used)  # type: ignore[attr-defined]
 
 
 def test_zotero() -> None: