diff --git a/.gitignore b/.gitignore
index e038b7a..fff16ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,7 +161,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 ### Python Patch ###
 # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
@@ -185,3 +185,7 @@ pyrightconfig.json
 
 #chainlit
 .chainlit/
+#deepeval
+.deepeval/
+src/tests/.deepeval/
+
diff --git a/src/tests/conftest.py b/src/tests/conftest.py
new file mode 100644
index 0000000..7929349
--- /dev/null
+++ b/src/tests/conftest.py
@@ -0,0 +1,24 @@
+""" pytest fixtures:
+    pytest_generate_tests parametrizes each test with a question, an expect_success flag [True, False], and a min_score.
+
+Other potential queries:
+    queries = [
+        "The sky is green during a storm.",
+        "Grass is usually yellow.",
+        "Water normally boils at 90°C.",
+        "The Great Wall of China is visible from space with the naked eye."
+    ]
+Testing with "strict" in [False, True] could also be added.
+"""
+
+def pytest_generate_tests(metafunc):
+    if {"query", "expect_success", "min_score"}.issubset(metafunc.fixturenames):
+        test_data = [
+            ("Grass is usually yellow.", False, 0.3),
+            ("The sun rises in the west.", False, 0.0),
+            ("Mount Everest is the tallest mountain in the world.", True, 0.7),
+        ]
+        metafunc.parametrize(
+            argnames=("query", "expect_success", "min_score"),
+            argvalues=test_data
+        )
diff --git a/src/tests/mocks/mock_agents.py b/src/tests/mocks/mock_agents.py
new file mode 100644
index 0000000..f9e456b
--- /dev/null
+++ b/src/tests/mocks/mock_agents.py
@@ -0,0 +1,30 @@
+from typing import Any
+from src.verifact_agents.base import Agent
+
+class MockResult:
+    """Minimal stand-in for an agent run result: final_output_as() returns the stored output unchanged."""
+    def __init__(self, output: Any):
+        self.output = output
+
+    def final_output_as(self, _type):
+        return self.output
+
+class MockAgent(Agent):
+    """Agent double that always returns a canned output and stubs the attributes the pipeline touches."""
+    def __init__(self, output, name="MockAgent"):
+        self._output = output
+        self.handoffs = []
+        self._name = name
+        self.mcp_config = {"prompt": "mock-prompt"}
+        self.mcp_servers = []
+        self.tools = []
+        self.input_guardrails = []
+        self.output_guardrails = []
+        self.model_settings = []
+
+    async def process(self, input_data):
+        return MockResult(self._output)
+
+    @property
+    def name(self):
+        return self._name
diff --git a/src/tests/test_faithfulness.py b/src/tests/test_faithfulness.py
new file mode 100644
index 0000000..641c805
--- /dev/null
+++ b/src/tests/test_faithfulness.py
@@ -0,0 +1,63 @@
+import pytest
+from deepeval.evaluate import evaluate
+from deepeval.metrics import FaithfulnessMetric
+from deepeval.test_case import LLMTestCase
+from dotenv import load_dotenv
+
+from src.verifact_manager import VerifactManager
+
+load_dotenv()
+
+
+def validate_metrics(metric_data, expect_success, min_score):
+    assert metric_data.score > min_score, "Score should be greater than the minimum specified in the test data"
+    assert metric_data.success is expect_success
+    assert metric_data.reason is not None
+    assert "Truths" in metric_data.verbose_logs
+    assert "Claims" in metric_data.verbose_logs
+    assert "Verdicts" in metric_data.verbose_logs
+    print(f"Metric reason: {metric_data.reason}")
+    print(f"Verbose logs: {metric_data.verbose_logs}")
+
+
+def handle_evaluation_results(evaluation_results, expect_success, min_score):
+    for result in evaluation_results:
+        _, test_result_list = result
+        if test_result_list:
+            for test_result in test_result_list:
+                for metric_data in test_result.metrics_data:
+                    validate_metrics(metric_data, expect_success, min_score)
+        else:
+            print("Test result list is None or empty")
+
+
+def process_results(results, query, expect_success, min_score):
+    for _claim, evidence_list, verdict in results:
+        if not verdict or not evidence_list:
+            continue
+
+        test_case = LLMTestCase(
+            input=query,
+            actual_output=verdict.explanation,
+            retrieval_context=[str(ev.content) for ev in evidence_list if ev.content],
+        )
+
+        metric = FaithfulnessMetric(include_reason=True, strict_mode=True)
+        evaluation_results = evaluate(test_cases=[test_case], metrics=[metric])
+        handle_evaluation_results(evaluation_results, expect_success, min_score)
+
+
+@pytest.mark.asyncio
+async def test_faithfulness_real_output(query, expect_success, min_score):
+    """Test the faithfulness metric against expected thresholds.
+
+    :param query: test question
+    :param expect_success: expected success [True, False]
+    :param min_score: minimum score expected
+    """
+    manager = VerifactManager()
+    results = await manager.run(query)
+    assert results, "No output from VerifactManager."
+    process_results(results, query, expect_success, min_score)
+
+
diff --git a/src/tests/test_verifact_manager.py b/src/tests/test_verifact_manager.py
new file mode 100644
index 0000000..344fec6
--- /dev/null
+++ b/src/tests/test_verifact_manager.py
@@ -0,0 +1,87 @@
+"""
+Basic pipeline execution tests:
+the real agents (claim_detector_agent, evidence_hunter_agent, verdict_writer_agent) and Runner.run()
+are replaced with monkeypatched mocks.
+
+1. Verifies that the full VerifactManager.run() pipeline executes successfully with mocked agents.
+2. Verifies that a claim goes through detection, evidence gathering, and verdict generation using mocked data
+across a range of verdict cases: "false", "partially true", "unverifiable", and "true":
+a. the verdict text returned matches the expected result;
+b. the sources in the verdict match the mock evidence source;
+c. the evidence content and source returned from the mock agent match what was injected.
+
+"""
+import pytest
+from tests.mocks.mock_agents import MockAgent
+from src.verifact_agents.claim_detector import Claim
+from src.verifact_agents.evidence_hunter import Evidence
+from src.verifact_agents.verdict_writer import Verdict
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("claim_text, verdict_text, evidence_content, evidence_source", [
+    (
+        "The moon is made of cheese",
+        "false",
+        "Scientific consensus disproves this",
+        "https://nasa.gov"
+    ),
+    (
+        "The Great Wall of China is visible from space",
+        "partially true",
+        "Astronauts report visibility depends on conditions",
+        "https://esa.int"
+    ),
+    (
+        "Aliens built the pyramids",
+        "unverifiable",
+        "There is no direct evidence confirming or denying alien involvement",
+        "https://historychannel.com"
+    ),
+    (
+        "Water boils at 100 degrees Celsius at sea level",
+        "true",
+        "This is a well-documented scientific fact",
+        "https://science.org"
+    ),
+])
+async def test_factcheck_pipeline(monkeypatch, claim_text, verdict_text, evidence_content, evidence_source):
+    # Prepare mock data
+    claims = [Claim(text=claim_text)]
+    evidence = [Evidence(content=evidence_content, source=evidence_source, relevance=0.9, stance="neutral")]
+    verdict = Verdict(
+        claim=claim_text,
+        verdict=verdict_text,
+        confidence=0.85,
+        explanation=f"Mock explanation for verdict '{verdict_text}'.",
+        sources=[evidence_source]
+    )
+
+    # Patch agent instances
+    monkeypatch.setattr("src.verifact_manager.claim_detector_agent", MockAgent(claims))
+    monkeypatch.setattr("src.verifact_manager.evidence_hunter_agent", MockAgent(evidence))
+    monkeypatch.setattr("src.verifact_manager.verdict_writer_agent", MockAgent(verdict))
+
+    # Import manager after monkeypatching
+    import src.verifact_manager as vm
+
+    # Patch Runner.run
+    async def mock_runner_run(agent, input_data):
+        return await agent.process(input_data)
+
+    monkeypatch.setattr(vm.Runner, "run", mock_runner_run)
+
+    # Run pipeline
+    manager = vm.VerifactManager()
+    result = await manager.run(claim_text)
+
+    _, evidence_result, verdict_result = result[0]
+
+    # Verdict checks
+    assert verdict_result.verdict == verdict_text
+    assert evidence_source in verdict_result.sources
+
+    # Evidence checks
+    assert evidence_result is not None
+    assert len(evidence_result) > 0
+    assert evidence_result[0].content == evidence_content
+    assert evidence_result[0].source == evidence_source
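
For reference, a minimal sketch of how the MockAgent/MockResult pair behaves on its own, outside the pipeline. The asyncio driver and the sys.path assumptions here are illustrative only; the tests above instead patch these objects into src.verifact_manager and route calls through the patched Runner.run.

    import asyncio

    # Assumes the same sys.path the test suite uses, so that
    # tests.mocks.mock_agents (and the Agent base class it imports) resolve.
    from tests.mocks.mock_agents import MockAgent

    async def demo():
        # MockAgent ignores its input and always returns the canned output,
        # wrapped in a MockResult so final_output_as() mirrors the real result API.
        agent = MockAgent(output=["The moon is made of cheese"], name="demo-agent")
        result = await agent.process("any input text")
        print(result.final_output_as(list))  # -> ['The moon is made of cheese']

    asyncio.run(demo())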