From e64d5aac2d751f30b3ca3f6a4608466bd5c952a3 Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 26 Sep 2025 16:33:27 -0700 Subject: [PATCH 1/3] Removing irrelevant contexts, instead of filtering them out for 'Best evidence(s)' --- src/paperqa/agents/tools.py | 11 +++++------ src/paperqa/docs.py | 3 ++- tests/test_paperqa.py | 39 ++++++++++++++----------------------- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/paperqa/agents/tools.py b/src/paperqa/agents/tools.py index adfcd6fa1..63a34fc8d 100644 --- a/src/paperqa/agents/tools.py +++ b/src/paperqa/agents/tools.py @@ -252,13 +252,12 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str: status = state.status logger.info(status) # only show top n contexts for this particular question to the agent - # only show context above score 0, because 0 is a sentinel for irrelevance - sorted_relevant_contexts = sorted( - [ + sorted_contexts = sorted( + ( c for c in state.session.contexts - if ((c.question is None or c.question == question) and c.score > 0) - ], + if c.question is None or c.question == question + ), key=lambda x: x.score, reverse=True, ) @@ -267,7 +266,7 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str: [ f"{n + 1}. {sc.context}\n" for n, sc in enumerate( - sorted_relevant_contexts[: self.settings.agent.agent_evidence_n] + sorted_contexts[: self.settings.agent.agent_evidence_n] ) ] ) diff --git a/src/paperqa/docs.py b/src/paperqa/docs.py index 35947612d..c4c591a60 100644 --- a/src/paperqa/docs.py +++ b/src/paperqa/docs.py @@ -682,7 +682,8 @@ async def aget_evidence( for r in llm_results: session.add_tokens(r) - session.contexts += [c for c, _ in results if c is not None] + # Filter out failed context creations or irrelevant contexts + session.contexts += [c for c, _ in results if c is not None and c.score > 0] return session def query( diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py index 0a200256b..7557ea715 100644 --- a/tests/test_paperqa.py +++ b/tests/test_paperqa.py @@ -6,7 +6,6 @@ import os import pathlib import pickle -import random import re import sys from collections.abc import AsyncIterable, Sequence @@ -1077,17 +1076,21 @@ async def test_unrelated_context( assert await docs.aadd( stub_data_dir / "bates.txt", "WikiMedia Foundation, 2023, Accessed now" ) + assert docs.texts, "Test requires at least one text" session = await docs.aget_evidence( "What do scientist estimate as the planetary composition of Jupyter?", settings=agent_test_settings, ) - assert session.contexts, "Test relies on some contexts being added" + session.contexts.append( # Give a context so the rest of the test can run + Context( + context="George Washington is a founding father", + question="What do scientist estimate as the planetary composition of Jupyter?", + text=docs.texts[0], + score=1, + ) + ) for c in session.contexts: assert c.score <= 2, "Expected contexts to be considered irrelevant" - if c.score <= 0: - # Now, let's trick the system into thinking the context - # was at least somewhat relevant - c.score = random.randint(1, 2) session = await docs.aquery(session, settings=agent_test_settings) assert unsure_sentinel in session.answer @@ -1652,6 +1655,8 @@ async def test_images_corrupt(stub_data_dir: Path) -> None: ) assert districts_docname, "Expected successful image addition" (districts_doc,) = (d for d in docs.docs.values() if d.docname == districts_docname) + (districts_text,) = docs.texts + assert not districts_text.text, "Test expects no text content from image addition" for media in (t.media for t in docs.texts if t.doc == districts_doc and t.media): for m in media: # Validate the image, then chop the image in half (breaking it), and @@ -1669,27 +1674,13 @@ async def test_images_corrupt(stub_data_dir: Path) -> None: # By suppressing the use of images, we can actually gather evidence now settings.answer.evidence_text_only_fallback = True - # The answer will be garbage, but let's make sure we didn't claim to use images session = await docs.aget_evidence( "What districts neighbor the Western Addition?", settings=settings ) - assert session.contexts, "Test relies on some contexts being added" - for c in session.contexts: - assert c.score <= 2, "Expected contexts to be considered irrelevant" - if c.score <= 0: - # Now, let's trick the system into thinking the context - # was at least somewhat relevant - c.score = random.randint(1, 2) - await docs.aquery(session, settings=settings) - assert session.used_contexts - assert session.cost > 0 - contexts_used = [ - c - for c in session.contexts - if c.id in session.used_contexts and c.text.doc == districts_doc - ] - assert contexts_used - assert all(not bool(c.used_images) for c in contexts_used) # type: ignore[attr-defined] + assert ( + not session.contexts + ), "Expected no contexts to be made from a bad image that has no text" + assert session.cost > 0, "Expected some costs to have been incurred in our attempt" def test_zotero() -> None: From d26682ec602d5d622aade05d286485ecf30fc8dc Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 26 Sep 2025 16:34:26 -0700 Subject: [PATCH 2/3] Moved 'Best evidence(s)' to bulleted list, to avoid implying an order --- src/paperqa/agents/tools.py | 4 ++-- tests/test_agents.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/paperqa/agents/tools.py b/src/paperqa/agents/tools.py index 63a34fc8d..b508ee5d9 100644 --- a/src/paperqa/agents/tools.py +++ b/src/paperqa/agents/tools.py @@ -262,9 +262,9 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str: reverse=True, ) - top_contexts = "\n".join( + top_contexts = "\n\n".join( [ - f"{n + 1}. {sc.context}\n" + f"- {sc.context}" for n, sc in enumerate( sorted_contexts[: self.settings.agent.agent_evidence_n] ) diff --git a/tests/test_agents.py b/tests/test_agents.py index b6e7f116a..54553e8bd 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -711,11 +711,9 @@ def new_status(state: EnvironmentState) -> str: total_added_1 = int(split[1]) assert total_added_1 > 0, "Expected non-negative added evidence count" assert len(env_state.get_relevant_contexts()) == total_added_1 - # ensure 1 piece of top evidence is returned - assert "\n1." in response, "gather_evidence did not return any results" assert ( - "\n2." not in response - ), "gather_evidence should return only 1 context, not 2" + response.count("\n- ") == 1 + ), "Expected exactly one best evidence to be shown" # now adjust to give the agent 2x pieces of evidence gather_evidence_tool.settings.agent.agent_evidence_n = 2 @@ -745,9 +743,9 @@ def new_status(state: EnvironmentState) -> str: total_added_2 = int(split[1]) assert total_added_2 > 0, "Expected non-negative added evidence count" assert len(env_state.get_relevant_contexts()) == total_added_1 + total_added_2 - # ensure both evidences are returned - assert "\n1." in response, "gather_evidence did not return any results" - assert "\n2." in response, "gather_evidence should return 2 contexts" + assert ( + response.count("\n- ") == 2 + ), "Expected both evidences to be shown as best evidences" assert session.contexts, "Evidence did not return any results" assert not session.answer, "Expected no answer yet" From c6e2f3943e900fc4774841d5ef7497015f2303f0 Mon Sep 17 00:00:00 2001 From: James Braza Date: Fri, 26 Sep 2025 16:35:07 -0700 Subject: [PATCH 3/3] Added 'for the current' to 'Best evidence(s)' to help agent understand they vary by question --- src/paperqa/agents/tools.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/paperqa/agents/tools.py b/src/paperqa/agents/tools.py index b508ee5d9..e47f031a1 100644 --- a/src/paperqa/agents/tools.py +++ b/src/paperqa/agents/tools.py @@ -271,7 +271,11 @@ async def gather_evidence(self, question: str, state: EnvironmentState) -> str: ] ) - best_evidence = f" Best evidence(s):\n\n{top_contexts}" if top_contexts else "" + best_evidence = ( + f" Best evidence(s) for the current question:\n\n{top_contexts}" + if top_contexts + else "" + ) if f"{self.TOOL_FN_NAME}_completed" in self.settings.agent.callbacks: await asyncio.gather(