85 changes: 85 additions & 0 deletions packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py
@@ -0,0 +1,85 @@
"""
Agent Tool Trajectory Experiment

This example demonstrates Traceloop's agent tool trajectory evaluator:
- Agent Tool Trajectory: Validates the agent tool trajectory

This evaluator helps ensure your AI agents perform optimally and follow the expected tool trajectory.
"""

import asyncio
from traceloop.sdk import Traceloop
from traceloop.sdk.evaluator import EvaluatorMadeByTraceloop

# Initialize Traceloop
client = Traceloop.init()


def agent_evaluators_task(row):
    executed_tool_calls = row.get("actual", "")
    default_expected = (
        "[{'name': 'search', 'input': {'query': 'weather'}}, "
        "{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, "
        "{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]"
    )
    expected_tool_calls = row.get("expected", default_expected)

    return {
        "executed_tool_calls": executed_tool_calls,
        "expected_tool_calls": expected_tool_calls,
    }
Comment on lines +18 to +30

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

rg -n -A5 -B5 "tool_calls" --type=py

Repository: traceloop/openllmetry

Length of output: 50378


🏁 Script executed:

# First, let's look at the file in question to understand the context
cat -n packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py

# Search for the evaluator that uses agent_evaluators_task
rg -n "agent_evaluators_task\|AgentToolTrajectoryEvaluator\|agent_tool_trajectory" --type=py packages/sample-app/

Repository: traceloop/openllmetry

Length of output: 3431


🏁 Script executed:

# Search for the agent_tool_trajectory evaluator implementation
rg -n "agent_tool_trajectory" --type=py -A10 -B5 | head -100

# Also search for EvaluatorMadeByTraceloopDefinition
fd -e py -x grep -l "agent_tool_trajectory\|AgentToolTrajectory" {} \;

Repository: traceloop/openllmetry

Length of output: 6137


🏁 Script executed:

# Get the full definition of agent_tool_trajectory evaluator
sed -n '109,135p' packages/traceloop-sdk/traceloop/sdk/generated/evaluators/definitions.py

# Also check the response.py to understand how it processes tool calls
cat packages/traceloop-sdk/traceloop/sdk/generated/evaluators/response.py

Repository: traceloop/openllmetry

Length of output: 6133


🏁 Script executed:

# Search for other examples of agent_tool_trajectory evaluator usage
rg -n "executed_tool_calls\|expected_tool_calls" --type=py -B5 -A5

# Also search for any tests or documentation
fd -e md -e txt -x grep -l "agent.*tool.*trajectory\|executed_tool_calls" {} \;

Repository: traceloop/openllmetry

Length of output: 47


🏁 Script executed:

# Search for any README or documentation files related to experiments
fd -e md "README\|GUIDE\|EXAMPLE" packages/sample-app/ | head -20

# Check if there are any other evaluator examples in sample-app
ls -la packages/sample-app/sample_app/experiment/made_by_traceloop/

# Search for how row data is structured or passed
rg -n "agent-tool-trajectory" --type=py -B3 -A3

Repository: traceloop/openllmetry

Length of output: 5601


🏁 Script executed:

# Check other agent experiment files to see how they handle tool calls
cat packages/sample-app/sample_app/experiment/made_by_traceloop/agents_exp.py

# Also check travel_agent_exp.py
cat packages/sample-app/sample_app/experiment/made_by_traceloop/travel_agent_exp.py

Repository: traceloop/openllmetry

Length of output: 12825


🏁 Script executed:

# Search for the actual evaluator implementation or any processing logic
find packages/traceloop-sdk -name "*.py" -type f -exec grep -l "agent_tool_trajectory\|AgentToolTrajectory" {} \;

# Check if there's any validation or parsing code for tool_calls
rg -n "json.loads\|ast.literal_eval\|parse.*tool" --type=py packages/traceloop-sdk/ | head -50

Repository: traceloop/openllmetry

Length of output: 416


🏁 Script executed:

# Check the request.py to see AgentToolTrajectoryRequest structure
cat packages/traceloop-sdk/traceloop/sdk/generated/evaluators/request.py

Repository: traceloop/openllmetry

Length of output: 13294


🏁 Script executed:

# Check if there are any examples or tests showing proper format
rg -n "executed_tool_calls\|expected_tool_calls" --type=py -B2 -A2 packages/

Repository: traceloop/openllmetry

Length of output: 47


Fix JSON string format for tool calls to use valid JSON syntax.

Lines 20-24 define default_expected as a string, which is correct. However, the string uses single quotes instead of double quotes, creating invalid JSON that will fail parsing. Update to proper JSON format:

Corrected code
default_expected = (
    '[{"name": "search", "input": {"query": "weather"}}, '
    '{"name": "book_flight", "input": {"flight": "NYC to Paris"}}, '
    '{"name": "get_confirmation", "input": {"confirmation": "flight booked"}}]'
)

The agent-tool-trajectory evaluator expects both executed_tool_calls and expected_tool_calls as JSON strings (not Python lists).
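To make the parsing failure concrete, here is a small illustrative sketch (editorial, not part of the PR) using only the standard library: Python's json module rejects the single-quoted form but accepts the double-quoted one.

import json

python_style = "[{'name': 'search', 'input': {'query': 'weather'}}]"
json_style = '[{"name": "search", "input": {"query": "weather"}}]'

try:
    json.loads(python_style)   # single quotes -> json.JSONDecodeError
except json.JSONDecodeError as e:
    print(f"invalid JSON: {e}")

print(json.loads(json_style))  # parses cleanly to a list of dicts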

🤖 Prompt for AI Agents
In
@packages/sample-app/sample_app/experiment/made_by_traceloop/agent_tool_trajectory.py
around lines 18-30, the default_expected string in agent_evaluators_task uses
Python-style single quotes and will not be valid JSON; update the
default_expected value to a JSON-valid string using double quotes (e.g., change
the list/dict literals to use double quotes for keys and string values) so that
expected_tool_calls (and executed_tool_calls) remain JSON strings parsable by
downstream evaluators; ensure you only modify the default_expected literal
inside the agent_evaluators_task function.



async def run_agent_tool_trajectory_experiment():
    print("\n" + "="*80)
    print("AGENT TOOL TRAJECTORY EXPERIMENT")
    print("="*80 + "\n")
    print("This experiment will test the agent tool trajectory with the agent tool trajectory evaluator:\n")
    print("1. Agent Tool Trajectory - Validates the agent tool trajectory")
    print("\n" + "-"*80 + "\n")

    # Configure agent evaluators
    evaluators = [
        EvaluatorMadeByTraceloop.agent_tool_trajectory(
            input_params_sensitive=True,
            mismatch_sensitive=False,
            order_sensitive=False,
            threshold=0.7,
        ),
    ]

    print("Running experiment with evaluators:")
    for evaluator in evaluators:
        print(f" - {evaluator.slug}")

    print("\n" + "-"*80 + "\n")

    # Run the experiment
    # Note: You'll need to create a dataset with appropriate test cases for agents
    results, errors = await client.experiment.run(
        dataset_slug="agent-tool-trajectory",  # Set a dataset slug that exists in the traceloop platform
        dataset_version="v1",
        task=agent_evaluators_task,
        evaluators=evaluators,
        experiment_slug="agent-tool-trajectory-exp",
        stop_on_error=False,
        wait_for_results=True,
    )

    print("\n" + "="*80)
    print("Agent tool trajectory experiment completed!")
    print("="*80 + "\n")

    print("Results summary:")
    print(f" - Total rows processed: {len(results) if results else 0}")
    print(f" - Errors encountered: {len(errors) if errors else 0}")

    if errors:
        print("\nErrors:")
        for error in errors:
            print(f" - {error}")


if __name__ == "__main__":
    print("\nAgent Tool Trajectory Experiment\n")

    asyncio.run(run_agent_tool_trajectory_experiment())
3 changes: 1 addition & 2 deletions packages/traceloop-sdk/traceloop/sdk/evaluator/__init__.py
@@ -1,10 +1,9 @@
 from .evaluator import Evaluator
 from .config import EvaluatorDetails
-from .evaluators_made_by_traceloop import EvaluatorMadeByTraceloop, create_evaluator
+from ..generated.evaluators.definitions import EvaluatorMadeByTraceloop

 __all__ = [
     "Evaluator",
     "EvaluatorDetails",
     "EvaluatorMadeByTraceloop",
-    "create_evaluator",
 ]
3 changes: 2 additions & 1 deletion packages/traceloop-sdk/traceloop/sdk/evaluator/evaluator.py
@@ -28,7 +28,8 @@ def _validate_evaluator_input(slug: str, input: Dict[str, str]) -> None:
     request_model = get_request_model(slug)
     if request_model:
         try:
-            request_model(**input)
+            # Request models expect data nested under 'input' field
+            request_model(input=input)
         except ValidationError as e:
             raise ValueError(f"Invalid input for '{slug}': {e}") from e

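For context, a minimal sketch (editorial, not part of the PR) of why the call shape changed, assuming a generated Pydantic request model that nests the evaluator's fields under an input attribute; the class and field names below are hypothetical stand-ins for the generated models.

from pydantic import BaseModel, ValidationError

class AgentToolTrajectoryInput(BaseModel):    # hypothetical nested schema
    executed_tool_calls: str
    expected_tool_calls: str

class AgentToolTrajectoryRequest(BaseModel):  # hypothetical request model
    input: AgentToolTrajectoryInput

data = {"executed_tool_calls": "[]", "expected_tool_calls": "[]"}

try:
    AgentToolTrajectoryRequest(**data)   # old call shape: fields land at the top level
except ValidationError:
    print("rejected: the model has no top-level executed_tool_calls field")

print(AgentToolTrajectoryRequest(input=data))  # new call shape: dict is coerced into the nested model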

This file was deleted.
