fix(evals): evals API supports input + config, generate mbt functions #3534
New file (agent tool trajectory experiment example in the sample app):

@@ -0,0 +1,85 @@
"""
Agent Tool Trajectory Experiment
This example demonstrates Traceloop's agent tool trajectory evaluator:
- Agent Tool Trajectory: Validates the agent tool trajectory
This evaluator helps ensure your AI agents perform optimally and follow the expected tool trajectory.
"""

import asyncio
from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition

# Initialize Traceloop
client = Traceloop.init()


def agent_evaluators_task(row):
    executed_tool_calls = row.get("actual", "")
    default_expected = (
        "[{'name': 'search', 'input': {'query': 'weather'}}, "
        "{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, "
        "{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]"
    )
    expected_tool_calls = row.get("expected", default_expected)

    return {
        "executed_tool_calls": executed_tool_calls,
        "expected_tool_calls": expected_tool_calls,
    }


async def run_agent_tool_trajectory_experiment():
    print("\n" + "="*80)
    print("AGENT TOOL TRAJECTORY EXPERIMENT")
    print("="*80 + "\n")
    print("This experiment will test the agent tool trajectory with the agent tool trajectory evaluator:\n")
    print("1. Agent Tool Trajectory - Validates the agent tool trajectory")
    print("\n" + "-"*80 + "\n")

    # Configure agent evaluators
    evaluators = [
        EvaluatorMadeByTraceloopDefinition.agent_tool_trajectory(
            input_params_sensitive=True,
            mismatch_sensitive=False,
            order_sensitive=False,
            threshold=0.7,
        ),
    ]

    print("Running experiment with evaluators:")
    for evaluator in evaluators:
        print(f" - {evaluator.slug}")

    print("\n" + "-"*80 + "\n")

    # Run the experiment
    # Note: You'll need to create a dataset with appropriate test cases for agents
    results, errors = await client.experiment.run(
        dataset_slug="agent-tool-trajectory",  # Set a dataset slug that exists in the traceloop platform
        dataset_version="v1",
        task=agent_evaluators_task,
        evaluators=evaluators,
        experiment_slug="agent-tool-trajectory-exp",
        stop_on_error=False,
        wait_for_results=True,
    )

    print("\n" + "="*80)
    print("Agent tool trajectory experiment completed!")
    print("="*80 + "\n")

    print("Results summary:")
    print(f" - Total rows processed: {len(results) if results else 0}")
    print(f" - Errors encountered: {len(errors) if errors else 0}")

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")


if __name__ == "__main__":
    print("\nAgent Tool Trajectory Experiment\n")

    asyncio.run(run_agent_tool_trajectory_experiment())
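For orientation, here is a hypothetical dataset row this task would consume. The "actual" and "expected" column names come from the row.get calls above; the concrete values are illustrative only, formatted as JSON strings since that is what the evaluator expects:

# Illustrative row shape for the "agent-tool-trajectory" dataset (not from the PR).
row = {
    "actual": '[{"name": "search", "input": {"query": "weather"}}]',
    "expected": '[{"name": "search", "input": {"query": "weather"}}, '
                '{"name": "book_flight", "input": {"flight": "NYC to Paris"}}]',
}

# The task simply forwards the two columns to the evaluator.
print(agent_evaluators_task(row))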
packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py

@@ -15,7 +15,7 @@
 import os
 from openai import AsyncOpenAI
 from traceloop.sdk import Traceloop
-from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop
+from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition

 # Initialize Traceloop
 client = Traceloop.init()
@@ -135,11 +135,11 @@ async def run_agents_experiment():

     # Configure agent evaluators
     evaluators = [
-        EvaluatorMadeByTraceloop.agent_goal_accuracy(),
-        EvaluatorMadeByTraceloop.agent_tool_error_detector(),
-        EvaluatorMadeByTraceloop.agent_flow_quality(),
-        EvaluatorMadeByTraceloop.agent_efficiency(),
-        EvaluatorMadeByTraceloop.agent_goal_completeness(),
+        EvaluatorMadeByTraceloopDefinition.agent_goal_accuracy(),
+        EvaluatorMadeByTraceloopDefinition.agent_tool_error_detector(),
+        EvaluatorMadeByTraceloopDefinition.agent_flow_quality(),
agent_flow_quality() is called without its required conditions and threshold arguments. Suggested change:

-        EvaluatorMadeByTraceloopDefinition.agent_flow_quality(),
+        EvaluatorMadeByTraceloopDefinition.agent_flow_quality(
+            conditions=["Agent should not repeat questions", "Agent should complete task efficiently"],
+            threshold=0.7,
+        ),
🤖 Prompt for AI Agents
In @packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py at line 140: the call to EvaluatorMadeByTraceloopDefinition.agent_flow_quality() is missing its required parameters and will raise a TypeError. Update the call to pass a list of condition strings for the first argument (conditions) and a float for the second (threshold), per the signature defined in definitions.py (agent_flow_quality(conditions: list[str], threshold: float)); either supply literal values that match the evaluator expectations (e.g., ["conditionA", "conditionB"], 0.8) or pass through appropriately named variables (e.g., conditions, threshold) defined earlier in the module.
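A minimal sketch of that second option, assuming only the signature quoted above and reusing the values from the suggested change, would look like this inside the sample module:

# Sketch: define the required arguments once and pass them through.
conditions = [
    "Agent should not repeat questions",
    "Agent should complete task efficiently",
]
threshold = 0.7

evaluators = [
    EvaluatorMadeByTraceloopDefinition.agent_flow_quality(
        conditions=conditions,
        threshold=threshold,
    ),
]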
Missing required parameter for agent_goal_completeness().
According to the method signature in definitions.py (lines 76-94), agent_goal_completeness() requires a mandatory threshold (float) parameter. Calling it without arguments will cause a TypeError.
🐛 Proposed fix
Add the required parameter:
-        EvaluatorMadeByTraceloopDefinition.agent_goal_completeness(),
+        EvaluatorMadeByTraceloopDefinition.agent_goal_completeness(
+            threshold=0.8,
+        ),
🤖 Prompt for AI Agents
In @packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py at line 142: the call to EvaluatorMadeByTraceloopDefinition.agent_goal_completeness() is missing its required threshold parameter. Update the call to pass a float threshold (e.g., 0.8, or whatever value fits your tests) so it matches the signature defined in definitions.py and avoids the TypeError.
packages/sample-app/sample_app/guardrail_travel_agent_example.py

@@ -11,7 +11,7 @@
 sys.path.insert(0, str(agents_dir))

 from traceloop.sdk.guardrails.guardrails import guardrail  # noqa: E402
-from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop  # noqa: E402
+from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition  # noqa: E402
Suggested change:

-from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition  # noqa: E402
+from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition
🧰 Tools
🪛 Ruff (0.14.10)
14-14: Unused noqa directive (non-enabled: E402)
Remove unused noqa directive
(RUF100)
🤖 Prompt for AI Agents
In @packages/sample-app/sample_app/guardrail_travel_agent_example.py at line 14: the import of EvaluatorMadeByTraceloopDefinition carries an unnecessary noqa directive. Remove the trailing "# noqa: E402" so the line becomes a normal import without the unused suppression comment.
traceloop/sdk/evaluator/__init__.py

@@ -1,10 +1,9 @@
 from .evaluator import Evaluator
 from .config import EvaluatorDetails
-from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop, create_evaluator
+from ..generated.evaluators.definitions import EvaluatorMadeByTraceloopDefinition

 __all__ = [
     "Evaluator",
     "EvaluatorDetails",
-    "EvaluatorMadeByTraceloop",
-    "create_evaluator",
+    "EvaluatorMadeByTraceloopDefinition",
 ]
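For orientation, a minimal sketch of how callers pick up the re-exported name after this change; the agent_goal_accuracy call mirrors the sample diff above and is illustrative only:

# Sketch: import the generated definitions via the evaluator package re-export.
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloopDefinition

evaluators = [
    EvaluatorMadeByTraceloopDefinition.agent_goal_accuracy(),
]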
Fix JSON string format for tool calls to use valid JSON syntax.

Lines 20-24 define default_expected as a string, which is correct. However, the string uses single quotes instead of double quotes, creating invalid JSON that will fail parsing; it should be updated to proper JSON format. The agent-tool-trajectory evaluator expects both executed_tool_calls and expected_tool_calls as JSON strings (not Python lists).
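A minimal sketch of that correction, reusing the default trajectory from the sample task (json.dumps guarantees double-quoted, parseable JSON):

import json

# Sketch: build the default expected trajectory as valid JSON rather than a
# repr-style string with single quotes.
default_expected = json.dumps([
    {"name": "search", "input": {"query": "weather"}},
    {"name": "book_flight", "input": {"flight": "NYC to Paris"}},
    {"name": "get_confirmation", "input": {"confirmation": "flight booked"}},
])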