Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions pydantic_ai_slim/pydantic_ai/history_processors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Built-in history processor functions for common message history repair tasks.

These functions can be passed directly to `Agent(history_processors=[...])` or
used with `capabilities.HistoryProcessor(processor=...)`.
"""

from __future__ import annotations

import logging
from dataclasses import replace

from pydantic_ai import messages as _messages

__all__ = ('repair_orphaned_tool_parts',)

logger = logging.getLogger(__name__)


def repair_orphaned_tool_parts(
    messages: list[_messages.ModelMessage],
) -> list[_messages.ModelMessage]:
    """Remove orphaned tool call/return parts from message history.

    Multi-turn agent conversations can accumulate structurally invalid history
    when tool calls and their corresponding results become mismatched. Common
    causes include streaming timeouts, deferred tool result drops, and history
    trimming by other processors.

    Providers like Anthropic strictly enforce that every `ToolCallPart` has a
    matching `ToolReturnPart` (or `RetryPromptPart`) and vice versa; orphaned
    entries cause 400 errors.

    This processor performs a two-pass repair:

    1. **Orphaned returns/retries**: `ToolReturnPart` or `RetryPromptPart` whose
       `tool_call_id` does not match any `ToolCallPart` in the history are removed.
    2. **Orphaned calls**: `ToolCallPart` whose `tool_call_id` does not match
       any `ToolReturnPart` or `RetryPromptPart` in the history are removed.

    Matching is by `tool_call_id` across the whole history, not by message
    ordering. Parts with an empty `tool_call_id` are never removed, and a
    `RetryPromptPart` with ``tool_name is None`` (a plain retry not tied to a
    tool call) is always kept.

    Empty messages (all parts removed) are dropped entirely.

    Example:
    ```python
    from pydantic_ai import Agent
    from pydantic_ai.history_processors import repair_orphaned_tool_parts

    agent = Agent('openai:gpt-4o', history_processors=[repair_orphaned_tool_parts])
    ```

    Args:
        messages: The message history to repair. The input list and its
            messages are not mutated; modified messages are rebuilt via
            `dataclasses.replace`.

    Returns:
        A new list of messages with orphaned tool parts (and any messages left
        empty by their removal) dropped.
    """
    # Single scan to collect every tool_call_id seen on each side. Both sets
    # are built before any removal, so pruning one side cannot cascade into
    # newly-orphaned parts on the other.
    call_ids: set[str] = set()
    return_ids: set[str] = set()
    for message in messages:
        if isinstance(message, _messages.ModelResponse):
            for part in message.parts:
                if isinstance(part, _messages.ToolCallPart) and part.tool_call_id:
                    call_ids.add(part.tool_call_id)
        elif isinstance(message, _messages.ModelRequest):
            for part in message.parts:
                if isinstance(part, (_messages.ToolReturnPart, _messages.RetryPromptPart)):
                    if part.tool_call_id:
                        return_ids.add(part.tool_call_id)

    repaired: list[_messages.ModelMessage] = []
    for message in messages:
        if isinstance(message, _messages.ModelRequest):
            kept_parts: list[_messages.ModelRequestPart] = []
            for part in message.parts:
                if isinstance(part, _messages.ToolReturnPart):
                    if part.tool_call_id and part.tool_call_id not in call_ids:
                        logger.debug(
                            'Removing orphaned ToolReturnPart with tool_call_id=%r (no matching ToolCallPart)',
                            part.tool_call_id,
                        )
                        continue
                elif isinstance(part, _messages.RetryPromptPart):
                    # tool_name is None means a plain (non-tool) retry prompt;
                    # those are never orphaned regardless of tool_call_id.
                    if part.tool_name is not None and part.tool_call_id and part.tool_call_id not in call_ids:
                        logger.debug(
                            'Removing orphaned RetryPromptPart with tool_call_id=%r (no matching ToolCallPart)',
                            part.tool_call_id,
                        )
                        continue
                kept_parts.append(part)

            if kept_parts:
                # Only rebuild the message when something was actually removed.
                if len(kept_parts) != len(message.parts):
                    repaired.append(replace(message, parts=kept_parts))
                else:
                    repaired.append(message)

        elif isinstance(message, _messages.ModelResponse):
            kept_response_parts: list[_messages.ModelResponsePart] = []
            for part in message.parts:
                if isinstance(part, _messages.ToolCallPart):
                    if part.tool_call_id and part.tool_call_id not in return_ids:
                        logger.debug(
                            'Removing orphaned ToolCallPart with tool_call_id=%r (no matching return)',
                            part.tool_call_id,
                        )
                        continue
                kept_response_parts.append(part)

            if kept_response_parts:
                if len(kept_response_parts) != len(message.parts):
                    repaired.append(replace(message, parts=kept_response_parts))
                else:
                    repaired.append(message)

    return repaired
51 changes: 34 additions & 17 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
Comment thread
anmolg1997 marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,37 @@
class GradingOutput(BaseModel, populate_by_name=True):
"""The output of a grading operation."""

reason: str
reason: str = Field(
description='A concise 1-2 sentence explanation of why the output passed or failed.',
)
pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
score: float


_JUDGE_REASON_INSTRUCTION = (
'The "reason" field must be a concise 1-2 sentence summary of your verdict. '
'Do not include your reasoning process, self-corrections, or re-checking in the reason. '
'State only the final conclusion.'
)


_judge_output_agent = Agent(
name='judge_output',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Output>Hello world</Output>
<Rubric>Content contains a greeting</Rubric>
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
{{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}}

<Output>Avast ye swabs, repel the invaders!</Output>
<Rubric>Does not speak like a pirate</Rubric>
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
{{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand All @@ -73,20 +84,22 @@ async def judge_output(
_judge_input_output_agent = Agent(
name='judge_input_output',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Input>Hello world</Input>
<Output>Hello</Output>
<Rubric>Content contains a greeting word which is present in the input</Rubric>
{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
{{"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}}

<Input>Pirate</Input>
<Output>Avast ye swabs, repel the invaders!</Output>
<Rubric>Does not speak in the style described by the input</Rubric>
{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
{{"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down Expand Up @@ -115,22 +128,24 @@ async def judge_input_output(
_judge_input_output_expected_agent = Agent(
name='judge_input_output_expected',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<Input>What color is the sky?</Input>
<ExpectedOutput>Blue</ExpectedOutput>
<Output>Cerulean</Output>
<Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
{{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}}

<Input>How many legs does a spider have?</Input>
<ExpectedOutput>8</ExpectedOutput>
<Output>Six</Output>
<Rubric>The output is factually consistent with the expected output</Rubric>
{"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
{{"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down Expand Up @@ -162,20 +177,22 @@ async def judge_input_output_expected(
_judge_output_expected_agent = Agent(
name='judge_output_expected',
system_prompt=dedent(
"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
f"""
You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {{reason: string, pass: boolean, score: number}}

{_JUDGE_REASON_INSTRUCTION}

Examples:

<ExpectedOutput>Blue</ExpectedOutput>
<Output>Cerulean</Output>
<Rubric>The output should be a shade of the expected output color</Rubric>
{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
{{"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}}

<ExpectedOutput>8</ExpectedOutput>
<Output>Six</Output>
<Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
{"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
{{"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}}
"""
),
output_type=GradingOutput,
Expand Down
Loading
Loading