Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions pydantic_ai_slim/pydantic_ai/history_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Built-in history processor functions for common message history repair tasks.

These functions can be passed directly to `Agent(history_processors=[...])` or
used with `capabilities.HistoryProcessor(processor=...)`.
"""

from __future__ import annotations

import logging
from dataclasses import replace

from pydantic_ai import messages as _messages

__all__ = ('repair_orphaned_tool_parts',)

logger = logging.getLogger(__name__)


def repair_orphaned_tool_parts(
    messages: list[_messages.ModelMessage],
) -> list[_messages.ModelMessage]:
    """Remove orphaned tool call/return parts from message history.

    Multi-turn agent conversations can accumulate structurally invalid history
    when tool calls and their corresponding results become mismatched. Common
    causes include streaming timeouts, deferred tool result drops, and history
    trimming by other processors.

    Providers like Anthropic strictly enforce that every `ToolCallPart` has a
    matching `ToolReturnPart` (or `RetryPromptPart`) and vice versa; orphaned
    entries cause 400 errors.

    This processor performs a two-pass repair:

    1. **Orphaned returns/retries**: `ToolReturnPart` or `RetryPromptPart` whose
       `tool_call_id` does not match any `ToolCallPart` in the history are removed.
    2. **Orphaned calls**: `ToolCallPart` whose `tool_call_id` does not match
       any `ToolReturnPart` or `RetryPromptPart` in the history are removed.

    Matching is by `tool_call_id` across the whole history, not by message
    ordering. Parts with an empty `tool_call_id` are never removed, and a
    `RetryPromptPart` with ``tool_name is None`` (a plain retry not tied to a
    tool call) is always kept.

    Empty messages (all parts removed) are dropped entirely.

    Example:
    ```python
    from pydantic_ai import Agent
    from pydantic_ai.history_processors import repair_orphaned_tool_parts

    agent = Agent('openai:gpt-4o', history_processors=[repair_orphaned_tool_parts])
    ```

    Args:
        messages: The message history to repair. The input list and its
            messages are not mutated; modified messages are rebuilt via
            `dataclasses.replace`.

    Returns:
        A new list of messages with orphaned tool parts (and any messages left
        empty by their removal) dropped.
    """
    # Single scan to collect every tool_call_id seen on each side. Both sets
    # are built before any removal, so pruning one side cannot cascade into
    # newly-orphaned parts on the other.
    call_ids: set[str] = set()
    return_ids: set[str] = set()
    for message in messages:
        if isinstance(message, _messages.ModelResponse):
            for part in message.parts:
                if isinstance(part, _messages.ToolCallPart) and part.tool_call_id:
                    call_ids.add(part.tool_call_id)
        elif isinstance(message, _messages.ModelRequest):
            for part in message.parts:
                if isinstance(part, (_messages.ToolReturnPart, _messages.RetryPromptPart)):
                    if part.tool_call_id:
                        return_ids.add(part.tool_call_id)

    repaired: list[_messages.ModelMessage] = []
    for message in messages:
        if isinstance(message, _messages.ModelRequest):
            kept_parts: list[_messages.ModelRequestPart] = []
            for part in message.parts:
                if isinstance(part, _messages.ToolReturnPart):
                    if part.tool_call_id and part.tool_call_id not in call_ids:
                        logger.debug(
                            'Removing orphaned ToolReturnPart with tool_call_id=%r (no matching ToolCallPart)',
                            part.tool_call_id,
                        )
                        continue
                elif isinstance(part, _messages.RetryPromptPart):
                    # tool_name is None means a plain (non-tool) retry prompt;
                    # those are never orphaned regardless of tool_call_id.
                    if part.tool_name is not None and part.tool_call_id and part.tool_call_id not in call_ids:
                        logger.debug(
                            'Removing orphaned RetryPromptPart with tool_call_id=%r (no matching ToolCallPart)',
                            part.tool_call_id,
                        )
                        continue
                kept_parts.append(part)

            if kept_parts:
                # Only rebuild the message when something was actually removed.
                if len(kept_parts) != len(message.parts):
                    repaired.append(replace(message, parts=kept_parts))
                else:
                    repaired.append(message)

        elif isinstance(message, _messages.ModelResponse):
            kept_response_parts: list[_messages.ModelResponsePart] = []
            for part in message.parts:
                if isinstance(part, _messages.ToolCallPart):
                    if part.tool_call_id and part.tool_call_id not in return_ids:
                        logger.debug(
                            'Removing orphaned ToolCallPart with tool_call_id=%r (no matching return)',
                            part.tool_call_id,
                        )
                        continue
                kept_response_parts.append(part)

            if kept_response_parts:
                if len(kept_response_parts) != len(message.parts):
                    repaired.append(replace(message, parts=kept_response_parts))
                else:
                    repaired.append(message)

    return repaired
51 changes: 34 additions & 17 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
Comment thread
anmolg1997 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,37 @@
class GradingOutput(BaseModel, populate_by_name=True):
"""The output of a grading operation."""

reason: str
reason: str = Field(
description='A concise 1-2 sentence explanation of why the output passed or failed.',
)
pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
score: float


_JUDGE_REASON_INSTRUCTION = (
'The "reason" field must be a concise 1-2 sentence summary of your verdict. '
'Do not include your reasoning process, self-corrections, or re-checking in the reason. '
'State only the final conclusion.'
)


_judge_output_agent = Agent(
name='judge_output',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Output>Hello world</Output>
<Rubric>Content contains a greeting</Rubric>
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
{{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}}

<Output>Avast ye swabs, repel the invaders!</Output>
<Rubric>Does not speak like a pirate</Rubric>
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
{{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand All @@ -73,20 +84,22 @@ async def judge_output(
_judge_input_output_agent = Agent(
name='judge_input_output',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Input>Hello world</Input>
<Output>Hello</Output>
<Rubric>Content contains a greeting word which is present in the input</Rubric>
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
{{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}}

<Input>Pirate</Input>
<Output>Avast ye swabs, repel the invaders!</Output>
<Rubric>Does not speak in the style described by the input</Rubric>
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
{{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down Expand Up @@ -115,22 +128,24 @@ async def judge_input_output(
_judge_input_output_expected_agent = Agent(
name='judge_input_output_expected',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Input>What color is the sky?</Input>
<ExpectedOutput>Blue</ExpectedOutput>
<Output>Cerulean</Output>
<Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
{{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}}

<Input>How many legs does a spider have?</Input>
<ExpectedOutput>8</ExpectedOutput>
<Output>Six</Output>
<Rubric>The output is factually consistent with the expected output</Rubric>
{"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
{{"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down Expand Up @@ -162,20 +177,22 @@ async def judge_input_output_expected(
_judge_output_expected_agent = Agent(
name='judge_output_expected',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<ExpectedOutput>Blue</ExpectedOutput>
<Output>Cerulean</Output>
<Rubric>The output should be a shade of the expected output color</Rubric>
{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
{{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}}

<ExpectedOutput>8</ExpectedOutput>
<Output>Six</Output>
<Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
{"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
{{"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down
Loading
Loading