
Commit 8d26e24

Generalizing tests for smarter LLMs (#1149)

1 parent a9c177d commit 8d26e24

File tree

6 files changed: +1803 −1752 lines changed

tests/cassettes/test_partitioning_fn_docs[False].yaml

Lines changed: 635 additions & 617 deletions
Some generated files are not rendered by default.

tests/cassettes/test_partitioning_fn_docs[True].yaml

Lines changed: 1142 additions & 1120 deletions
Some generated files are not rendered by default.

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ def agent_stub_session() -> PQASession:
     # > are already imported: paperqa
     from paperqa.types import PQASession

-    return PQASession(question="What is is a self-explanatory model?")
+    return PQASession(question="What is a self-explanatory model?")


 @pytest.fixture

tests/test_agents.py

Lines changed: 11 additions & 5 deletions
@@ -516,17 +516,23 @@ async def test_propagate_options(agent_test_settings: Settings) -> None:
     agent_test_settings.answer.evidence_skip_summary = True

     response = await agent_query(
-        query="What is is a self-explanatory model?",
+        query="What is a self-explanatory model?",
         settings=agent_test_settings,
         agent_type=FAKE_AGENT_TYPE,
     )
     assert response.status == AgentStatus.SUCCESS, "Agent did not succeed"
     result = response.session
     assert len(result.answer) > 200, "Answer did not return any results"
     assert "###" in result.answer, "Answer did not propagate system prompt"
+    assert len(result.contexts) >= 2, "Test expects a few contexts"
     # Subtract 2 to allow tolerance for chunks with leading/trailing whitespace
+    num_contexts_sufficient_length = sum(
+        len(c.context) >= agent_test_settings.parsing.chunk_size - 2
+        for c in result.contexts
+    )
+    # Check most contexts have the expected length
     assert (
-        len(result.contexts[0].context) >= agent_test_settings.parsing.chunk_size - 2
+        num_contexts_sufficient_length >= len(result.contexts) - 1
     ), "Summary was not skipped"


@@ -622,7 +628,7 @@ def files_filter(f) -> bool:

     agent_test_settings.agent.callbacks = callbacks

-    session = PQASession(question="What is is a self-explanatory model?")
+    session = PQASession(question="What is a self-explanatory model?")
     env_state = EnvironmentState(docs=Docs(), session=session)
     built_index = await get_directory_index(settings=agent_test_settings)
     assert await built_index.count, "Index build did not work"

@@ -730,11 +736,11 @@ def new_status(state: EnvironmentState) -> str:
     for context in session.contexts:
         if context.question != new_question:
             assert (
-                context.context[:20] not in response
+                context.context[:30] not in response
             ), "gather_evidence should not return any contexts for the old question"
     assert (
         sum(
-            (1 if (context.context[:20] in response) else 0)
+            (1 if (context.context[:30] in response) else 0)
             for context in session.contexts
             if context.question == new_question
         )
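Aside on the first hunk: the reworked length check now tolerates a single short chunk instead of requiring contexts[0] alone to be full-length. A toy recap of what the assertion accepts, with made-up numbers that are not from the test suite:

# Hypothetical context lengths; 37 models a document's short trailing chunk
chunk_size = 100
context_lengths = [100, 99, 37]

num_sufficient = sum(length >= chunk_size - 2 for length in context_lengths)
# Passes: all but at most one context meets the length bound
assert num_sufficient >= len(context_lengths) - 1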

tests/test_configs.py

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ def test_settings_default_instantiation(tmpdir, subtests: SubTests) -> None:
     # Also let's check our default settings work fine with round-trip JSON serialization
     serde_default_settings = Settings(**default_settings.model_dump(mode="json"))
     for setting in (default_settings, serde_default_settings):
-        assert "gpt-" in setting.llm
+        assert any(x in setting.llm for x in ("gpt-", "claude-"))
         assert setting.answer.evidence_k == 10
         assert HOME_DIR in str(setting.agent.index.index_directory)
         assert ".pqa" in str(setting.agent.index.index_directory)

tests/test_paperqa.py

Lines changed: 13 additions & 8 deletions
@@ -1635,7 +1635,8 @@ async def test_querying_tables(stub_data_dir: Path) -> None:
     assert all(
         [m.data for m in t.media] for t in used_texts
     ), "Expected image data to be present in the used contexts"
-    assert any(x in session.answer for x in ("1.0 mm", "1.0-mm"))
+    # Check for 1.0mm, 1.0-mm, 1.0 mm
+    assert re.search(r"1\.0[ -]?mm", session.answer)
     assert session.cost > 0

     # Filter contexts for HTTP requests, and ensure no images are present
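As a quick sanity check (illustrative, not part of the commit), the new pattern covers all three spellings, including the bare "1.0mm" that the old tuple-based check missed:

import re

# [ -]? allows an optional space or hyphen between number and unit
PATTERN = re.compile(r"1\.0[ -]?mm")

for candidate in ("1.0mm", "1.0 mm", "1.0-mm"):
    assert PATTERN.search(candidate), candidate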
@@ -2239,12 +2240,12 @@ def test_docdetails_doc_id_roundtrip() -> None:
 @pytest.mark.asyncio
 async def test_partitioning_fn_docs(use_partition: bool) -> None:
     settings = Settings.from_name("fast")
-    settings.answer.evidence_k = 2  # limit to only 2
+    settings.answer.evidence_k = 2  # Match positive or negative statement count below

     # imagine we have some special selection we want to
     # embedding rank by itself
     def partition_by_citation(t: Embeddable) -> int:
-        if isinstance(t, Text) and "special" in t.doc.citation:
+        if isinstance(t, Text) and "negative" in t.doc.citation:
             return 1
         return 0

@@ -2257,9 +2258,11 @@ def partition_by_citation(t: Embeddable) -> int:
     ), "We want this test to cover NumpyVectorStore"

     # add docs that we can use our partitioning function on
-    positive_statements_doc = Doc(docname="stub", citation="stub", dockey="stub")
+    positive_statements_doc = Doc(
+        docname="positive", citation="positive", dockey="positive"
+    )
     negative_statements_doc = Doc(
-        docname="special", citation="special", dockey="special"
+        docname="negative", citation="negative", dockey="negative"
     )
     texts = []
     for i, (statement, doc) in enumerate(

@@ -2275,10 +2278,11 @@ def partition_by_citation(t: Embeddable) -> int:
             await settings.get_embedding_model().embed_documents([texts[-1].text])
         )[0]
     await docs.aadd_texts(
-        texts=[t for t in texts if t.doc.docname == "stub"], doc=positive_statements_doc
+        texts=[t for t in texts if t.doc.docname == "positive"],
+        doc=positive_statements_doc,
     )
     await docs.aadd_texts(
-        texts=[t for t in texts if t.doc.docname == "special"],
+        texts=[t for t in texts if t.doc.docname == "negative"],
         doc=negative_statements_doc,
     )

@@ -2330,10 +2334,11 @@ def partition_by_citation(t: Embeddable) -> int:
     # with partitioning, we are forcing them to be interleaved, thus
     # at least one "I don't like X" statements will be in the top 2
     session = await docs.aget_evidence(
-        "What do I like?", settings=settings, partitioning_fn=partitioning_fn
+        "What do I like or dislike?", settings=settings, partitioning_fn=partitioning_fn
     )
     assert docs.texts_index.texts == docs.texts == texts

+    assert session.contexts, "Test requires contexts to be made"
     if use_partition:
         assert any(
             "don't" in c.text.text for c in session.contexts
