6 changes: 5 additions & 1 deletion .gitignore
@@ -161,7 +161,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
@@ -185,3 +185,7 @@ pyrightconfig.json
#chainlit
.chainlit/

#deepeval
.deepeval/
src/tests/.deepeval/

24 changes: 24 additions & 0 deletions src/tests/conftest.py
@@ -0,0 +1,24 @@
""" pytest fixtures:
pytest_generate_tests is a list of questions for testing with expect_success [True,False] and a min_score

Other potential queries
queries = [
"The sky is green during a storm.",
"Grass is usually yellow.",
"Water normally boils at 90°C.",
"The Great Wall of China is visible from space with the naked eye."
]
Can potentially add testing with "strict" in [False, True]
"""

def pytest_generate_tests(metafunc):
    if {"query", "expect_success", "min_score"}.issubset(metafunc.fixturenames):
        test_data = [
            ("Grass is usually yellow.", False, 0.3),
            ("The sun rises in the west.", False, 0.0),
            ("Mount Everest is the tallest mountain in the world.", True, 0.7),
        ]
        metafunc.parametrize(
            argnames=("query", "expect_success", "min_score"),
            argvalues=test_data
        )
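
The "strict" dimension mentioned in the conftest docstring is not wired up yet. If it were added, the parametrization could be extended roughly as below (a sketch only, assuming the tests also accept a strict argument and pass it on, for example to FaithfulnessMetric(strict_mode=strict); none of this is part of the PR):

def pytest_generate_tests(metafunc):
    if {"query", "expect_success", "min_score", "strict"}.issubset(metafunc.fixturenames):
        base_cases = [
            ("Grass is usually yellow.", False, 0.3),
            ("The sun rises in the west.", False, 0.0),
            ("Mount Everest is the tallest mountain in the world.", True, 0.7),
        ]
        # Cross every question with strict in [False, True].
        test_data = [
            (query, expect_success, min_score, strict)
            for (query, expect_success, min_score) in base_cases
            for strict in (False, True)
        ]
        metafunc.parametrize(
            argnames=("query", "expect_success", "min_score", "strict"),
            argvalues=test_data,
        )
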
28 changes: 28 additions & 0 deletions src/tests/mocks/mock_agents.py
@@ -0,0 +1,28 @@
from typing import Any
from src.verifact_agents.base import Agent

class MockResult:
    def __init__(self, output: Any):
        self.output = output

    def final_output_as(self, _type):
        return self.output

class MockAgent(Agent):
    def __init__(self, output, name="MockAgent"):
        self._output = output
        self.handoffs = []
        self._name = name
        self.mcp_config = {"prompt": "mock-prompt"}
        self.mcp_servers = []
        self.tools = []
        self.input_guardrails = []
        self.output_guardrails = []
        self.model_settings = []

    async def process(self, input_data):
        return MockResult(self._output)

    @property
    def name(self):
        return self._name
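
For reference, the two mocks are meant to be driven the way the manager presumably drives real agents: an agent run yields a result object whose final_output_as() is asked for a typed output, and MockResult simply ignores the requested type and returns whatever the MockAgent was constructed with. A minimal sketch (the import path mirrors the one used in test_verifact_manager.py; the call pattern is an assumption, not taken from the manager code):

import asyncio

from tests.mocks.mock_agents import MockAgent

async def demo():
    agent = MockAgent(output=["claim 1", "claim 2"], name="DemoAgent")
    result = await agent.process("any input")   # returns a MockResult
    claims = result.final_output_as(list)       # requested type is ignored
    assert claims == ["claim 1", "claim 2"]

asyncio.run(demo())
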
63 changes: 63 additions & 0 deletions src/tests/test_faithfulness.py
@@ -0,0 +1,63 @@
import pytest
from deepeval.evaluate import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from dotenv import load_dotenv

from src.verifact_manager import VerifactManager

load_dotenv()


def validate_metrics(metric_data, expect_success, min_score):
    assert metric_data.score > min_score, "Score should exceed the min_score specified in the test data"
    assert metric_data.success is expect_success
    assert metric_data.reason is not None
    assert "Truths" in metric_data.verbose_logs
    assert "Claims" in metric_data.verbose_logs
    assert "Verdicts" in metric_data.verbose_logs
    print(f"Metric reason: {metric_data.reason}")
    print(f"Verbose logs: {metric_data.verbose_logs}")


def handle_evaluation_results(evaluation_results, expect_success, min_score):
    for result in evaluation_results:
        _, test_result_list = result
        if test_result_list:
            for test_result in test_result_list:
                for metric_data in test_result.metrics_data:
                    validate_metrics(metric_data, expect_success, min_score)
        else:
            print("Test result list is None or empty")


def process_results(results, query, expect_success, min_score):
    for _claim, evidence_list, verdict in results:
        if not verdict or not evidence_list:
            continue

        test_case = LLMTestCase(
            input=query,
            actual_output=verdict.explanation,
            retrieval_context=[str(ev.content) for ev in evidence_list if ev.content],
        )

        metric = FaithfulnessMetric(include_reason=True, strict_mode=True)
        evaluation_results = evaluate(test_cases=[test_case], metrics=[metric])
        handle_evaluation_results(evaluation_results, expect_success, min_score)

Comment on lines +34 to +48

🛠️ Refactor suggestion

Add docstring and improve robustness.

The function needs documentation and should handle potential None values more defensively.

 def process_results(results, query, expect_success, min_score):
+    """
+    Process VerifactManager results and evaluate faithfulness.
+    
+    Args:
+        results: List of (claim, evidence_list, verdict) tuples
+        query: Original query string
+        expect_success: Expected success boolean value
+        min_score: Minimum score threshold
+    """
     for _claim, evidence_list, verdict in results:
-        if not verdict or not evidence_list:
+        if not verdict or not evidence_list or not verdict.explanation:
             continue

         test_case = LLMTestCase(
             input=query,
             actual_output=verdict.explanation,
             retrieval_context=[str(ev.content) for ev in evidence_list if ev.content],
         )

         metric = FaithfulnessMetric(include_reason=True, strict_mode=True)
         evaluation_results = evaluate(test_cases=[test_case], metrics=[metric])
         handle_evaluation_results(evaluation_results, expect_success, min_score)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def process_results(results, query, expect_success, min_score):
    for _claim, evidence_list, verdict in results:
        if not verdict or not evidence_list:
            continue

        test_case = LLMTestCase(
            input=query,
            actual_output=verdict.explanation,
            retrieval_context=[str(ev.content) for ev in evidence_list if ev.content],
        )

        metric = FaithfulnessMetric(include_reason=True, strict_mode=True)
        evaluation_results = evaluate(test_cases=[test_case], metrics=[metric])
        handle_evaluation_results(evaluation_results, expect_success, min_score)

def process_results(results, query, expect_success, min_score):
    """
    Process VerifactManager results and evaluate faithfulness.

    Args:
        results: List of (claim, evidence_list, verdict) tuples
        query: Original query string
        expect_success: Expected success boolean value
        min_score: Minimum score threshold
    """
    for _claim, evidence_list, verdict in results:
        if not verdict or not evidence_list or not verdict.explanation:
            continue

        test_case = LLMTestCase(
            input=query,
            actual_output=verdict.explanation,
            retrieval_context=[str(ev.content) for ev in evidence_list if ev.content],
        )

        metric = FaithfulnessMetric(include_reason=True, strict_mode=True)
        evaluation_results = evaluate(test_cases=[test_case], metrics=[metric])
        handle_evaluation_results(evaluation_results, expect_success, min_score)
🧰 Tools
🪛 Ruff (0.11.9)

34-34: Missing docstring in public function

(D103)

🪛 Pylint (3.3.7)

[convention] 34-34: Missing function or method docstring

(C0116)

🤖 Prompt for AI Agents
In src/tests/test_faithfulness.py around lines 34 to 48, add a clear docstring
to the process_results function explaining its purpose, parameters, and
behavior. Improve robustness by adding checks to handle potential None values
for results, query, and verdict.explanation before processing. Also, ensure
evidence_list and its contents are validated to avoid errors during list
comprehension.
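
For the last point, the extra validation could look roughly like this (a sketch only; the helper name and the empty-list fallback are assumptions, not part of the committed suggestion):

def build_retrieval_context(evidence_list):
    """Return non-empty evidence strings, tolerating a None list and None items."""
    if not evidence_list:
        return []
    return [str(ev.content) for ev in evidence_list if ev is not None and ev.content]

process_results would then pass retrieval_context=build_retrieval_context(evidence_list) to LLMTestCase and could skip the test case entirely when the helper returns an empty list.
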


@pytest.mark.asyncio
async def test_faithfulness_real_output(query, expect_success, min_score):
"""Test the faithfulness metric against expected thresholds.

:param query: test question
:param expect_success: expected success [True, False]
:param min_score: minimum score expected
"""
manager = VerifactManager()
results = await manager.run(query)
assert results, "No output from VerifactManager."
process_results(results, query, expect_success, min_score)


87 changes: 87 additions & 0 deletions src/tests/test_verifact_manager.py
@@ -0,0 +1,87 @@
"""
Basic pipeline execution:
uses monkeypatched agents (claim_detector_agent, evidence_hunter_agent, verdict_writer_agent) and Runner.run()
instead of real agents.

1. Testing if the full VerifactManager.run() pipeline executes successfully with mocked agents.
2. Testing if a claim goes through detection, evidence gathering, and verdict generation using mocked data
with a range of verdict cases: "false", "partially true", "unverifiable", and "true".
a. The verdict text returned matches the expected result.
b. The sources in the verdict match the mock evidence source.
c. The evidence content and source returned from the mock agent match what was injected

"""
import pytest
from tests.mocks.mock_agents import MockAgent
from src.verifact_agents.claim_detector import Claim
from src.verifact_agents.evidence_hunter import Evidence
from src.verifact_agents.verdict_writer import Verdict

@pytest.mark.asyncio
@pytest.mark.parametrize("claim_text, verdict_text, evidence_content, evidence_source", [
    (
        "The moon is made of cheese",
        "false",
        "Scientific consensus disproves this",
        "https://nasa.gov"
    ),
    (
        "The Great Wall of China is visible from space",
        "partially true",
        "Astronauts report visibility depends on conditions",
        "https://esa.int"
    ),
    (
        "Aliens built the pyramids",
        "unverifiable",
        "There is no direct evidence confirming or denying alien involvement",
        "https://historychannel.com"
    ),
    (
        "Water boils at 100 degrees Celsius at sea level",
        "true",
        "This is a well-documented scientific fact",
        "https://science.org"
    ),
])
async def test_factcheck_pipeline(monkeypatch, claim_text, verdict_text, evidence_content, evidence_source):
    # Prepare mock data
    claims = [Claim(text=claim_text)]
    evidence = [Evidence(content=evidence_content, source=evidence_source, relevance=0.9, stance="neutral")]
    verdict = Verdict(
        claim=claim_text,
        verdict=verdict_text,
        confidence=0.85,
        explanation=f"Mock explanation for verdict '{verdict_text}'.",
        sources=[evidence_source]
    )

    # Patch agent instances
    monkeypatch.setattr("src.verifact_manager.claim_detector_agent", MockAgent(claims))
    monkeypatch.setattr("src.verifact_manager.evidence_hunter_agent", MockAgent(evidence))
    monkeypatch.setattr("src.verifact_manager.verdict_writer_agent", MockAgent(verdict))

    # Import manager after monkeypatching
    import src.verifact_manager as vm

    # Patch Runner.run
    async def mock_runner_run(agent, input_data):
        return await agent.process(input_data)

    monkeypatch.setattr(vm.Runner, "run", mock_runner_run)

    # Run pipeline
    manager = vm.VerifactManager()
    result = await manager.run(claim_text)

    _, evidence_result, verdict_result = result[0]

    # Verdict checks
    assert verdict_result.verdict == verdict_text
    assert evidence_source in verdict_result.sources

    # Evidence checks
    assert evidence_result is not None
    assert len(evidence_result) > 0
    assert evidence_result[0].content == evidence_content
    assert evidence_result[0].source == evidence_source
Comment on lines +77 to +87

💡 Verification agent

🧩 Analysis chain

Verify the return structure assumption.

The test assumes a specific return structure from manager.run() where result[0] contains a tuple with at least 3 elements. Consider adding defensive checks or documentation about the expected return format.
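
A defensive version of the unpacking in the test could look like this (a sketch only; the assertion messages are illustrative and not from the PR, and the lines assume the test body's 4-space indentation):

    # Sketch: make the assumed return shape explicit before unpacking.
    assert isinstance(result, list) and result, "run() should return a non-empty list"
    first = result[0]
    assert len(first) == 3, "each item should be a (claim, evidence_list, verdict) tuple"
    _claim, evidence_result, verdict_result = first
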

Run this script to verify the return structure of VerifactManager.run():


🏁 Script executed:

#!/bin/bash
# Description: Check the return structure of VerifactManager.run method
# Expected: Find the method definition and understand its return format

ast-grep --pattern $'class VerifactManager {
  $$$
  async def run($$$) {
    $$$
    return $$$
  }
  $$$
}'

# Also search for any documentation about the return format
rg -A 10 -B 5 "def run.*:" --type py | grep -A 15 -B 5 "VerifactManager"

Length of output: 1144


🏁 Script executed:

#!/bin/bash
# Inspect the implementation and return statements of VerifactManager.run
rg -n "async def run" -A 50 src/verifact_manager.py

# Locate the Verdict class definition to understand its structure
rg -n "class Verdict" -A 30 src

# Preview the test file to see how run() is used and unpacked
sed -n '1,200p' src/tests/test_verifact_manager.py

Length of output: 9172


🏁 Script executed:

#!/bin/bash
# Extract the full run method to examine its return structure
sed -n '41,200p' src/verifact_manager.py

# Locate all return statements within VerifactManager.run
rg -n "return" -B 5 -A 5 src/verifact_manager.py

Length of output: 11671


Update VerifactManager.run return signature and docstring

The test’s unpacking of result[0] as (claim, evidence_list, verdict) is correct—the method returns a List[tuple[Claim, List[Evidence], Verdict]], not just List[Verdict] (and it’s currently annotated as returning None). Please update the code to make this explicit:

• In src/verifact_manager.py (Line 41):

- async def run(self, query: str, progress_callback=None, progress_msg=None) -> None:
+ async def run(self, query: str, progress_callback=None, progress_msg=None) -> list[tuple[Claim, list[Evidence], Verdict]]:

• In the same docstring’s Returns: section:

- Returns:
-     List[Verdict]: A list of verdicts for claims in the text
+ Returns:
+     List[tuple[Claim, List[Evidence], Verdict]]: 
+         A list of (claim, evidence_list, verdict) tuples for each detected claim

This will align the signature and documentation with the actual return structure and remove any confusion for future maintainers.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
    _, evidence_result, verdict_result = result[0]

    # Verdict checks
    assert verdict_result.verdict == verdict_text
    assert evidence_source in verdict_result.sources

    # Evidence checks
    assert evidence_result is not None
    assert len(evidence_result) > 0
    assert evidence_result[0].content == evidence_content
    assert evidence_result[0].source == evidence_source

async def run(self, query: str, progress_callback=None, progress_msg=None) -> list[tuple[Claim, list[Evidence], Verdict]]:
    """
    Execute the verifact process on the given query.

    Args:
        query: The input text to analyze for claims.
        progress_callback: Optional callback for progress updates.
        progress_msg: Optional base message for progress updates.

    Returns:
        List[tuple[Claim, List[Evidence], Verdict]]:
            A list of (claim, evidence_list, verdict) tuples for each detected claim
    """
    # … rest of implementation …
🤖 Prompt for AI Agents
In src/verifact_manager.py around line 41, update the async def run method's
return type annotation to reflect that it returns a List of tuples containing
Claim, List of Evidence, and Verdict, instead of None. Also, revise the method's
docstring Returns section to explicitly describe this return structure,
clarifying that it returns List[tuple[Claim, List[Evidence], Verdict]]. This
will align the code and documentation with the actual return format used in the
tests and improve maintainability.