
Commit 745b345

Merge pull request #1 from neo4j-field/add-local-eval
Add local eval
2 parents f843deb + 438a077 commit 745b345

17 files changed (+2641, -5 lines)

Makefile

Lines changed: 14 additions & 2 deletions
@@ -8,10 +8,22 @@ format:
 	uv run ruff check .

 run-agent-uv:
-	uv run python3 agent.py
+	uv run python3 single_file_agent.py

 run-agent:
-	python3 agent.py
+	python3 single_file_agent.py
+
+run-eval-uv:
+	uv run python3 eval.py
+
+run-eval:
+	python3 eval.py
+
+generate-report-uv:
+	uv run python3 scripts/generate_report.py $(csv-name)
+
+generate-report:
+	python3 scripts/generate_report.py $(csv-name)

 help:
 	@echo '----'

README.md

Lines changed: 34 additions & 2 deletions
@@ -97,14 +97,14 @@ A conversational AI agent that connects to a Neo4j Movies database and can answe
 ```bash
 make run-agent-uv
 # or
-uv run python3 agent.py
+uv run python3 single_file_agent.py
 ```

 ### Using pip/standard Python
 ```bash
 make run-agent
 # or
-python3 agent.py
+python3 single_file_agent.py
 ```

 ## Usage Examples
@@ -120,6 +120,38 @@ To exit the agent, type any of:
 - `quit`
 - `q`

+## Evaluation
+
+This repo also contains a simple local evaluation suite. This may be used and extended to evaluate your own agents.
+
+### Running Evaluations
+
+To run an evaluation:
+1. Configure the `eval.py` file with the LLM name, tools and prompt you would like to use
+2. Ensure you have populated the `questions.yaml` file with your eval question set
+3. Run `make run-eval-uv` or `make run-eval` depending on your package manager
+4. The eval results CSV will be saved to `evals/output/<file-name>.csv`
+5. View the contents with `review.ipynb`
+6. Generate a `.txt` report with `make generate-report-uv csv-name=<file-name>` or `make generate-report csv-name=<file-name>` depending on your package manager
+
+### Eval CSV Structure
+
+The resulting evaluation CSV will contain the following columns:
+
+* question_id: str
+* question: str
+* expected_answer: str
+* agent_final_answer: Optional[str]
+* generated_cypher: list[ReadNeo4jCypherToolInput]
+* model: str
+* available_tools: list[str]
+* called_tools: list[str]
+* num_messages: Optional[int]
+* num_llm_calls: Optional[int]
+* num_tool_calls: Optional[int]
+* response_time: Optional[float]
+* error: Optional[str]
+
 ## Development

 ### Code Formatting
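
As a quick orientation to the CSV structure documented above, here is a minimal sketch (not part of this commit) of loading a results file with pandas; the file name is hypothetical, and `review.ipynb` remains the intended way to review results:

```python
# Editor's sketch, not part of the commit: load an eval results CSV and peek
# at a few of the documented columns. The file name below is made up; use the
# actual <file-name> written to evals/output/ by `make run-eval`.
import pandas as pd

df = pd.read_csv("evals/output/eval_benchmark_results_2025-01-01_12-00-00.csv")
print(df[["question_id", "num_tool_calls", "response_time", "error"]].head())
print(f"Errors: {df['error'].notna().sum()} of {len(df)} questions")
```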

agent.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+"""This is a single file containing all the code to create a simple agent that can answer questions about the movie database."""
+
 import asyncio
 import os
 from typing import Any

eval.py

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
import asyncio
import os
from datetime import datetime
from math import ceil
from time import perf_counter
from uuid import uuid4

import pandas as pd
from dotenv import load_dotenv
from langchain_core.messages import AIMessage
from langchain_core.tools import StructuredTool
from langchain_mcp_adapters.tools import load_mcp_tools
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.prebuilt import create_react_agent
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

from evals.models import QuestionRecord, ResponseTableRecord
from prompt import get_movies_system_prompt
from tools.find_movie_recommendations import find_movie_recommendations_tool
from utils import get_questions_from_yaml, pre_model_hook

if load_dotenv():
    print("Loaded .env file")
else:
    print("No .env file found")

neo4j_cypher_mcp = StdioServerParameters(
    command="uvx",
    args=["[email protected]", "--transport", "stdio"],
    env={
        "NEO4J_URI": os.getenv("NEO4J_URI"),
        "NEO4J_USERNAME": os.getenv("NEO4J_USERNAME"),
        "NEO4J_PASSWORD": os.getenv("NEO4J_PASSWORD"),
        "NEO4J_DATABASE": os.getenv("NEO4J_DATABASE"),
    },
)

evals_loc = "evals/output/"
eval_results = list()


async def evaluate_single_question(
    question_dict: dict[str, str],
    prompt: str,
    tools: list[StructuredTool],
    model: str = "openai:gpt-4.1",
) -> ResponseTableRecord:
    """
    Initialize a fresh agent and evaluate a single question.
    """
    try:
        assert question_dict.get("question") is not None, "Question not found"

        # create the thread id for the agent eval
        # use the question id if it exists, otherwise generate a random uuid
        thread_id = "eval-" + question_dict.get("id", str(uuid4()))
        config = {"configurable": {"thread_id": thread_id}}

        agent = create_react_agent(
            model=model,
            pre_model_hook=pre_model_hook,
            checkpointer=InMemorySaver(),
            tools=tools,
            prompt=prompt,
        )

        response_time_start = perf_counter()
        response = await agent.ainvoke({"messages": question_dict["question"]}, config=config)
        response_time = perf_counter() - response_time_start

        tool_calls = [
            tool_call
            for message in response["messages"]
            if isinstance(message, AIMessage)
            and hasattr(message, "tool_calls")
            and message.tool_calls
            for tool_call in message.tool_calls
        ]

        # capture all text2cypher queries
        cyphers = [c.get("args") for c in tool_calls if c.get("name") == "read_neo4j_cypher"]

        return ResponseTableRecord(
            question_id=question_dict.get("id"),
            question=question_dict.get("question"),
            expected_answer=question_dict.get("answer"),
            agent_final_answer=response["messages"][-1].content,
            generated_cypher=cyphers,
            model=model,
            available_tools=[t.name for t in tools],
            called_tools=tool_calls,
            num_messages=len(response["messages"]),
            num_llm_calls=len([m for m in response["messages"] if isinstance(m, AIMessage)]),
            num_tool_calls=len(tool_calls),
            response_time=response_time,
            error=None,
        )

    except Exception as e:
        print(f"Error: {e}")
        return ResponseTableRecord(
            question_id=question_dict.get("id"),
            question=question_dict.get("question"),
            expected_answer=question_dict.get("answer"),
            agent_final_answer=None,
            generated_cypher=list(),
            model=model,
            available_tools=[t.name for t in tools],
            called_tools=list(),
            num_messages=None,
            num_llm_calls=None,
            num_tool_calls=None,
            response_time=None,
            error=str(e),
        )


async def _evaluate_single_batch(
    batch: list[QuestionRecord],
    prompt: str,
    tools: list[StructuredTool],
    model: str = "openai:gpt-4.1",
) -> list[ResponseTableRecord]:
    """
    Evaluate a batch of questions asynchronously.

    Parameters
    ----------
    batch : list[QuestionRecord]
        A list of question records containing the question, expected answer and the question id.

    Returns
    -------
    list[ResponseTableRecord]
        A list of response table records containing the agent response and associated metadata.
    """

    tasks = [
        evaluate_single_question(question_dict, prompt, tools, model) for question_dict in batch
    ]
    return await asyncio.gather(*tasks)


async def _evaluate_batches(
    questions: list[QuestionRecord],
    prompt: str,
    tools: list[StructuredTool],
    model: str = "openai:gpt-4.1",
    batch_size: int = 10,
) -> list[ResponseTableRecord]:
    """
    Evaluate a list of questions in batches.

    Parameters
    ----------
    questions : list[QuestionRecord]
        A list of question records containing the question, expected answer and the question id.
    prompt : str
        The system prompt to use.
    tools : list[StructuredTool]
        The tools to use.
    model : str
        The model to use.
    batch_size : int
        The number of questions to process in each batch.

    Returns
    -------
    list[ResponseTableRecord]
        A list of response table records containing the agent response and associated metadata.
    """

    results = list()
    for batch_idx, i in enumerate(range(0, len(questions), batch_size)):
        print(
            f"Processing batch {batch_idx + 1} of {ceil(len(questions) / (batch_size))} \n",
            end="\r",
        )
        if i + batch_size >= len(questions):
            batch = questions[i:]
        else:
            batch = questions[i : i + batch_size]
        batch_results = await _evaluate_single_batch(batch, prompt, tools, model)

        # Add extracted records to the results list
        results.extend(batch_results)

    return results


async def main():
    """
    Main function to run the evaluation.

    Based on the documentation:
    https://github.com/langchain-ai/langchain-mcp-adapters?tab=readme-ov-file#client
    """

    questions = get_questions_from_yaml("questions.yaml")
    print(f"Retrieved {len(questions)} questions for evaluation.")

    async with stdio_client(neo4j_cypher_mcp) as (read, write):
        async with ClientSession(read, write) as session:
            # Initialize the connection
            await session.initialize()

            # Get tools
            mcp_tools = await load_mcp_tools(session)

            # We only need to get schema and execute read queries from the Cypher MCP server
            allowed_tools = [
                tool for tool in mcp_tools if tool.name in {"get_neo4j_schema", "read_neo4j_cypher"}
            ]

            # We can also add non-mcp tools for our agent to use
            allowed_tools.append(find_movie_recommendations_tool)

            prompt = get_movies_system_prompt()

            model = "openai:gpt-4.1"
            batch_size = 10

            eval_results = await _evaluate_batches(
                questions, prompt, allowed_tools, model, batch_size
            )

            df = pd.DataFrame(eval_results)

            df.to_csv(
                f"{evals_loc}eval_benchmark_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv",
                index=False,
            )


if __name__ == "__main__":
    asyncio.run(main())
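
Since each question is evaluated with a fresh agent, `evaluate_single_question` can also be used on its own for a quick spot-check. A minimal sketch (not part of this commit), assuming it is run from the repo root with the same environment eval.py expects and skipping the MCP Cypher tools; the question id and text are invented:

```python
# Editor's sketch, not part of the commit: spot-check one question with only
# the non-MCP recommendation tool. Requires the repo's .env / OpenAI key.
import asyncio

from eval import evaluate_single_question
from prompt import get_movies_system_prompt
from tools.find_movie_recommendations import find_movie_recommendations_tool

record = asyncio.run(
    evaluate_single_question(
        {"id": "smoke-1", "question": "Recommend a movie similar to The Matrix."},
        prompt=get_movies_system_prompt(),
        tools=[find_movie_recommendations_tool],
    )
)
print(record["agent_final_answer"], record["num_tool_calls"])
```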

evals/models.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
from typing import Any, Optional, TypedDict


class ReadNeo4jCypherToolInput(TypedDict):
    query: str
    params: Optional[dict[str, Any]]


class ResponseTableRecord(TypedDict):
    "A record created for the response table. The contents of this record may be used for further evaluation."

    question_id: str
    question: str
    expected_answer: str
    agent_final_answer: Optional[str]
    generated_cypher: list[ReadNeo4jCypherToolInput]
    model: str
    available_tools: list[str]
    called_tools: list[str]
    num_messages: Optional[int]
    num_llm_calls: Optional[int]
    num_tool_calls: Optional[int]
    response_time: Optional[float]
    error: Optional[str]


class QuestionRecord(TypedDict):
    "A record read from the questions yaml file."

    id: Optional[str]
    question: str
    answer: Optional[str]
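
Since `QuestionRecord` mirrors the entries expected in `questions.yaml`, here is an illustrative record (editor's example, not part of the commit; the values are invented):

```python
# Editor's example, not part of the commit: a QuestionRecord as defined above.
# Each entry loaded from questions.yaml is expected to carry these fields.
from evals.models import QuestionRecord

example_question: QuestionRecord = {
    "id": "movies-001",  # optional; eval.py also uses it to build the thread id
    "question": "Which actors appeared in The Matrix?",
    "answer": "Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss and Hugo Weaving",
}
```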

evals/output/README.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Evaluation Outputs

This folder contains the output files from the evaluation process.
