Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,31 @@ jobs:
path: coverage-data/.coverage.*
include-hidden-files: true

# Verify the harness still works against the lowest pydantic-ai-slim version we
# claim compatibility with (the `>=` floor in pyproject.toml). The default `test`
# matrix above tracks pydantic-ai-slim's main branch via `[tool.uv.sources]`, so
# this is the only job that exercises the published-PyPI install path.
test-floor:
runs-on: ubuntu-latest
name: test on floor pydantic-ai-slim
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

- uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0
with:
python-version: '3.14'
enable-cache: true # zizmor: ignore[cache-poisoning] -- Job does not produce release artifacts and does not have sensitive permissions
cache-suffix: floor

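# Install all transitive deps from the lock first, then override slim to the floor
# version. `--no-deps` keeps every other package at its lock-pinned version; only
# slim itself is moved to the floor. The final step passes `--no-sync` so that
# `uv run` does not re-sync the environment and undo the override.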
- run: uv sync --frozen --group dev --all-extras
- run: uv pip install --no-deps "pydantic-ai-slim==1.80.0"
- run: uv run --no-sync pytest

coverage:
needs: [test]
runs-on: ubuntu-latest
Expand All @@ -96,7 +121,7 @@ jobs:

check:
if: always()
needs: [lint, test, coverage]
needs: [lint, test, test-floor, coverage]
runs-on: ubuntu-latest
steps:
- uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2
Expand Down
67 changes: 27 additions & 40 deletions pydantic_ai_harness/code_mode/_toolset.py
Copy link
Copy Markdown

@devin-ai-integration devin-ai-integration Bot Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Class docstring now describes behavior the PR removed

The CodeModeToolset class docstring at pydantic_ai_harness/code_mode/_toolset.py:170-171 still says "Tools that require deferred execution (kind external/unapproved) cannot be called from inside the sandbox and are dropped with a one-time UserWarning." This is now factually incorrect — the entire point of this PR is to sandbox those tools instead. The docstring was not updated because it falls in unchanged context lines, but it will be misleading to anyone reading the class documentation.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown

@devin-ai-integration devin-ai-integration Bot Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Stale docstring claims deferred-execution tools are excluded from sandbox

The CodeModeToolset class docstring at lines 188-189 states: "Tools that require deferred execution (kind external/unapproved) cannot be called from inside the sandbox and are dropped with a one-time UserWarning." This PR specifically removes that behavior — deferred-execution tools are now sandboxed like any other tool. The docstring was not updated to match, leaving incorrect documentation that contradicts the implementation.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pydantic_ai.function_signature import FunctionSignature
from pydantic_ai.messages import ToolCallPart, ToolReturn, ToolReturnContent, ToolReturnPart, is_multi_modal_content
from pydantic_ai.tool_manager import ToolManager
from pydantic_ai.tools import AgentDepsT, ToolSelector, matches_tool_selector
from pydantic_ai.tools import AgentDepsT, ToolDenied, ToolSelector, matches_tool_selector
from pydantic_ai.toolsets.abstract import SchemaValidatorProt, ToolsetTool

try:
Expand Down Expand Up @@ -218,13 +218,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
else:
native_tools[name] = tool

callable_defs, sanitized_to_original, native_fallbacks = self._partition_callable_tools(sandboxed_tools)

# Tools that matched the selector but can't run in the sandbox (deferred
# execution, deferred loading) are promoted back to native tools so
# they remain visible to the model.
for name in native_fallbacks:
native_tools[name] = sandboxed_tools[name]
callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools)

description = self._build_description(callable_defs)

Expand Down Expand Up @@ -265,7 +259,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
)
return result

async def call_tool(
async def call_tool( # noqa: C901
self, name: str, tool_args: dict[str, Any], ctx: RunContext[AgentDepsT], tool: ToolsetTool[AgentDepsT]
) -> Any:
"""Execute Python code in the sandbox, or pass through to a native tool."""
Expand Down Expand Up @@ -332,17 +326,30 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
try:
result = await tool_manager.handle_call(call_part, wrap_validation_errors=False)
except (CallDeferred, ApprovalRequired) as e:
# Approval/deferral require a round-trip back to the caller,
# which the sandbox cannot do. Raise UserError so the execution
# loop passes it into Monty as an ExternalException; Monty
# re-raises it as MontyRuntimeError, which we catch and convert
# to ModelRetry. The error message is preserved through the chain.
# No handler resolved the deferral. The sandbox can't round-trip to the
# caller, so we convert it to a UserError that propagates through
# Monty → MontyRuntimeError → ModelRetry.
raise UserError(
'Tool approval and deferral are not supported in code mode. '
f'Tool {original_name!r} raised {type(e).__name__}; ensure wrapped '
'tools do not use approval or deferral when used with CodeMode.'
f'Tool {original_name!r} raised {type(e).__name__} inside code mode, '
'but no `HandleDeferredToolCalls` capability resolved it. Add a handler '
'capability on the agent so deferred and approval-required calls can '
'be resolved inline.'
) from e

if isinstance(result, ToolDenied):
# Handler denied the call. Record the denial with outcome='denied' so
# message history reflects it, then raise inside the sandbox. Returning the
# `ToolDenied` result to the script instead would let the denial message
# masquerade as an ordinary string tool result, and the script could not tell
# the difference because the `ToolDenied` marker class isn't exposed inside Monty.
nested_returns[tool_call_id] = ToolReturnPart(
tool_name=original_name,
content=result.message,
tool_call_id=tool_call_id,
outcome='denied',
)
raise RuntimeError(f'Tool {original_name!r} call denied: {result.message}')
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.

# Unwrap ToolReturn to get the plain value for the sandbox,
# preserving the full ToolReturn metadata on the return part.
return_metadata: Any = None
Expand Down Expand Up @@ -431,40 +438,20 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:

def _partition_callable_tools(
self, wrapped_tools: dict[str, ToolsetTool[AgentDepsT]]
) -> tuple[dict[str, ToolDefinition], dict[str, str], set[str]]:
) -> tuple[dict[str, ToolDefinition], dict[str, str]]:
"""Return tool definitions that can be called from inside the sandbox.

Tool names that are not valid Python identifiers (e.g. MCP tools with
hyphens or dots like `get-weather`, `api.call`) are sanitized to
underscored forms and mapped back to their original names for dispatch.

Tools requiring deferred execution (kind `external`/`unapproved`) cannot
run in the sandbox and are excluded from `callable_defs`. Their names
are returned in the third element so the caller can promote them back
to native tools.

Returns:
A tuple of `(callable_defs, sanitized_to_original, native_fallbacks)`
where `native_fallbacks` contains original tool names that should
be exposed as native tools instead of being sandboxed.
A tuple of `(callable_defs, sanitized_to_original)`.
"""
callable_defs: dict[str, ToolDefinition] = {}
sanitized_to_original: dict[str, str] = {}
native_fallbacks: set[str] = set()
for name, tool in wrapped_tools.items():
td = tool.tool_def
if td.defer:
if name not in self._warned_deferred:
self._warned_deferred.add(name)
warnings.warn(
f'CodeMode: tool {name!r} requires deferred execution '
f'(kind={td.kind!r}) and cannot be called from inside the '
f'sandbox; it will be exposed as a native tool instead.',
UserWarning,
stacklevel=2,
)
native_fallbacks.add(name)
continue

safe_name = _sanitize_tool_name(name)
if safe_name == _RUN_CODE_TOOL_NAME:
Expand Down Expand Up @@ -498,7 +485,7 @@ def _partition_callable_tools(
td = replace(td, name=safe_name)

callable_defs[safe_name] = td
return callable_defs, sanitized_to_original, native_fallbacks
return callable_defs, sanitized_to_original

@staticmethod
def _build_description(callable_defs: dict[str, ToolDefinition]) -> str:
Expand Down
67 changes: 50 additions & 17 deletions tests/_code_mode/test_code_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,13 @@ def build_run_context(deps: T, run_step: int = 0) -> RunContext[T]:
)


async def build_ctx(deps: T, toolset: AbstractToolset[T], run_step: int = 0) -> RunContext[T]:
async def build_ctx(
deps: T,
toolset: AbstractToolset[T],
run_step: int = 0,
*,
root_capability: Any = None,
) -> RunContext[T]:
"""Build a `RunContext` with a prepared `ToolManager`.

Use this for tests that call `call_tool` — `CodeModeToolset` requires
Expand All @@ -70,7 +76,7 @@ async def build_ctx(deps: T, toolset: AbstractToolset[T], run_step: int = 0) ->
from pydantic_ai.tool_manager import ToolManager

ctx = build_run_context(deps, run_step=run_step)
tm = ToolManager(toolset=toolset)
tm = ToolManager(toolset=toolset, root_capability=root_capability)
prepared_tm = await tm.for_run_step(ctx)
ctx.tool_manager = prepared_tm
return ctx
Expand Down Expand Up @@ -682,34 +688,28 @@ def later(x: int) -> str:
# The deferred-loading tool should NOT be exposed as a native tool
assert 'later' not in tools

async def test_deferred_execution_tools_promoted_to_native_with_warning(self) -> None:
"""Tools with `kind='external'` (deferred execution) are excluded from sandbox but promoted to native."""
async def test_deferred_execution_tools_sandboxed(self) -> None:
"""Tools with `kind='external'`/`'unapproved'` are sandboxed like any other tool; resolution happens via a `HandleDeferredToolCalls` capability."""
td_external = ToolDefinition(
name='approve_action',
description='Needs approval.',
parameters_json_schema={'type': 'object', 'properties': {'x': {'type': 'string'}}, 'required': ['x']},
return_schema={'type': 'string'},
kind='external',
)
static = _StaticToolset([_make_address_tool_def('get_user', 'Get a user.', 'street'), td_external])
wrapper = CodeMode[None]().get_wrapper_toolset(static)
assert isinstance(wrapper, CodeModeToolset)

ctx = build_run_context(None)
with pytest.warns(UserWarning, match=r"tool 'approve_action' requires deferred execution"):
tools = await wrapper.get_tools(ctx)
tools = await wrapper.get_tools(ctx)

description = tools['run_code'].tool_def.description
assert description is not None
assert 'approve_action' not in description
# External tool is promoted to native — not lost
assert 'approve_action' in tools

# Second call must not warn again.
import warnings as _warnings

with _warnings.catch_warnings():
_warnings.simplefilter('error')
await wrapper.get_tools(ctx)
# The external tool appears as a sandboxed function signature.
assert 'async def approve_action' in description
# Not exposed as a native tool.
assert 'approve_action' not in tools

async def test_tool_without_return_schema_warns(self) -> None:
"""A sandboxed tool with no return_schema triggers a one-time warning."""
Expand Down Expand Up @@ -1018,7 +1018,40 @@ def needs_approval() -> str:
ctx = await build_ctx(None, wrapper)
tools = await wrapper.get_tools(ctx)

with pytest.raises(ModelRetry, match='approval and deferral are not supported'):
with pytest.raises(ModelRetry, match='no `HandleDeferredToolCalls` capability resolved it'):
await wrapper.call_tool('run_code', {'code': 'await needs_approval()'}, ctx, tools['run_code'])

async def test_handler_denial_surfaces_as_model_retry(self) -> None:
"""A `HandleDeferredToolCalls` handler denying a sandboxed tool call surfaces the denial.

The denial raises `RuntimeError` inside the sandbox so the script can't mistake
the denial message for a regular string return. If the script doesn't catch it,
Monty re-raises as `MontyRuntimeError`, which the harness converts to `ModelRetry`
with the original denial message preserved in the trace.
"""
try:
from pydantic_ai.capabilities import HandleDeferredToolCalls
except ImportError: # pragma: no cover — only fires on floor-slim CI, which doesn't gate on coverage
pytest.skip('Requires pydantic-ai-slim with `HandleDeferredToolCalls` (next release after 1.86.1)')

from pydantic_ai.exceptions import ApprovalRequired as _ApprovalRequired
from pydantic_ai.tools import DeferredToolRequests, DeferredToolResults, ToolDenied

def needs_approval() -> str:
"""A tool that requires approval."""
raise _ApprovalRequired()

async def handler(ctx: RunContext[None], requests: DeferredToolRequests) -> DeferredToolResults:
return DeferredToolResults(
approvals={call.tool_call_id: ToolDenied(message='nope') for call in requests.approvals}
)

wrapper = CodeMode[None]().get_wrapper_toolset(_build_function_toolset(needs_approval))
assert isinstance(wrapper, CodeModeToolset)
ctx = await build_ctx(None, wrapper, root_capability=HandleDeferredToolCalls(handler=handler))
tools = await wrapper.get_tools(ctx)

with pytest.raises(ModelRetry, match=r'call denied: nope'):
await wrapper.call_tool('run_code', {'code': 'await needs_approval()'}, ctx, tools['run_code'])

async def test_model_retry_from_wrapped_tool_surfaces_as_model_retry(self) -> None:
Expand Down
Loading
Loading