pydantic · DouweM · Apr 25, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -71,6 +71,31 @@ jobs:
           path: coverage-data/.coverage.*
           include-hidden-files: true
 
+  # Verify the harness still works against the lowest pydantic-ai-slim version we
+  # claim compatibility with (the `>=` floor in pyproject.toml). The default `test`
+  # matrix above tracks pydantic-ai-slim's main branch via `[tool.uv.sources]`, so
+  # this is the only job that exercises the published-PyPI install path.
+  test-floor:
+    runs-on: ubuntu-latest
+    name: test on floor pydantic-ai-slim
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          persist-credentials: false
+
+      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0
+        with:
+          python-version: '3.14'
+          enable-cache: true # zizmor: ignore[cache-poisoning] -- Job does not produce release artifacts and does not have sensitive permissions
+          cache-suffix: floor
+
+      # Install all transitive deps from the lock first, then override slim to the floor
+      # version (keep the pin below in sync with the `>=` floor in pyproject.toml).
+      # `--no-deps` keeps everything else lock-pinned; only slim itself is swapped out.
+      - run: uv sync --frozen --group dev --all-extras
+      - run: uv pip install --no-deps "pydantic-ai-slim==1.80.0"
+      - run: uv run --no-sync pytest
+
   coverage:
     needs: [test]
     runs-on: ubuntu-latest
@@ -96,7 +121,7 @@ jobs:
 
   check:
     if: always()
-    needs: [lint, test, coverage]
+    needs: [lint, test, test-floor, coverage]
     runs-on: ubuntu-latest
     steps:
       - uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe # v1.2.2

diff --git a/pydantic_ai_harness/code_mode/_toolset.py b/pydantic_ai_harness/code_mode/_toolset.py
@@ -16,7 +16,7 @@
 from pydantic_ai.function_signature import FunctionSignature
 from pydantic_ai.messages import ToolCallPart, ToolReturn, ToolReturnContent, ToolReturnPart, is_multi_modal_content
 from pydantic_ai.tool_manager import ToolManager
-from pydantic_ai.tools import AgentDepsT, ToolSelector, matches_tool_selector
+from pydantic_ai.tools import AgentDepsT, ToolDenied, ToolSelector, matches_tool_selector
 from pydantic_ai.toolsets.abstract import SchemaValidatorProt, ToolsetTool
 
 try:
@@ -218,13 +218,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
             else:
                 native_tools[name] = tool
 
-        callable_defs, sanitized_to_original, native_fallbacks = self._partition_callable_tools(sandboxed_tools)
-
-        # Tools that matched the selector but can't run in the sandbox (deferred
-        # execution, deferred loading) are promoted back to native tools so
-        # they remain visible to the model.
-        for name in native_fallbacks:
-            native_tools[name] = sandboxed_tools[name]
+        callable_defs, sanitized_to_original = self._partition_callable_tools(sandboxed_tools)
 
         description = self._build_description(callable_defs)
 
@@ -265,7 +259,7 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
         )
         return result
 
-    async def call_tool(
+    async def call_tool(  # noqa: C901
         self, name: str, tool_args: dict[str, Any], ctx: RunContext[AgentDepsT], tool: ToolsetTool[AgentDepsT]
     ) -> Any:
         """Execute Python code in the sandbox, or pass through to a native tool."""
@@ -332,17 +326,30 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
             try:
                 result = await tool_manager.handle_call(call_part, wrap_validation_errors=False)
             except (CallDeferred, ApprovalRequired) as e:
-                # Approval/deferral require a round-trip back to the caller,
-                # which the sandbox cannot do. Raise UserError so the execution
-                # loop passes it into Monty as an ExternalException; Monty
-                # re-raises it as MontyRuntimeError, which we catch and convert
-                # to ModelRetry. The error message is preserved through the chain.
+                # No handler resolved the deferral or approval request. The sandbox can't
+                # round-trip to the caller, so we convert it to a UserError that propagates
+                # through Monty → MontyRuntimeError → ModelRetry.
                 raise UserError(
-                    'Tool approval and deferral are not supported in code mode. '
-                    f'Tool {original_name!r} raised {type(e).__name__}; ensure wrapped '
-                    'tools do not use approval or deferral when used with CodeMode.'
+                    f'Tool {original_name!r} raised {type(e).__name__} inside code mode, '
+                    'but no `HandleDeferredToolCalls` capability resolved it. Add a handler '
+                    'capability on the agent so deferred and approval-required calls can '
+                    'be resolved inline.'
                 ) from e
 
+            if isinstance(result, ToolDenied):
+                # Handler denied the call. Record the denial with outcome='denied' so
+                # message history reflects it, then raise inside the sandbox: surfacing
+                # `ToolDenied` to the user's script would let it masquerade as a string
+                # tool result, and the script has no way to introspect the marker class
+                # since `ToolDenied` isn't exposed inside Monty.
+                nested_returns[tool_call_id] = ToolReturnPart(
+                    tool_name=original_name,
+                    content=result.message,
+                    tool_call_id=tool_call_id,
+                    outcome='denied',
+                )
+                raise RuntimeError(f'Tool {original_name!r} call denied: {result.message}')
+
             # Unwrap ToolReturn to get the plain value for the sandbox,
             # preserving the full ToolReturn metadata on the return part.
             return_metadata: Any = None
@@ -431,40 +438,20 @@ async def dispatch_tool_call(original_name: str, kwargs: dict[str, Any]) -> Any:
 
     def _partition_callable_tools(
         self, wrapped_tools: dict[str, ToolsetTool[AgentDepsT]]
-    ) -> tuple[dict[str, ToolDefinition], dict[str, str], set[str]]:
+    ) -> tuple[dict[str, ToolDefinition], dict[str, str]]:
         """Return tool definitions that can be called from inside the sandbox.
 
         Tool names that are not valid Python identifiers (e.g. MCP tools with
         hyphens or dots like `get-weather`, `api.call`) are sanitized to
         underscored forms and mapped back to their original names for dispatch.
 
-        Tools requiring deferred execution (kind `external`/`unapproved`) cannot
-        run in the sandbox and are excluded from `callable_defs`. Their names
-        are returned in the third element so the caller can promote them back
-        to native tools.
-
         Returns:
-            A tuple of `(callable_defs, sanitized_to_original, native_fallbacks)`
-            where `native_fallbacks` contains original tool names that should
-            be exposed as native tools instead of being sandboxed.
+            A tuple of `(callable_defs, sanitized_to_original)`.
         """
         callable_defs: dict[str, ToolDefinition] = {}
         sanitized_to_original: dict[str, str] = {}
-        native_fallbacks: set[str] = set()
         for name, tool in wrapped_tools.items():
             td = tool.tool_def
-            if td.defer:
-                if name not in self._warned_deferred:
-                    self._warned_deferred.add(name)
-                    warnings.warn(
-                        f'CodeMode: tool {name!r} requires deferred execution '
-                        f'(kind={td.kind!r}) and cannot be called from inside the '
-                        f'sandbox; it will be exposed as a native tool instead.',
-                        UserWarning,
-                        stacklevel=2,
-                    )
-                native_fallbacks.add(name)
-                continue
 
             safe_name = _sanitize_tool_name(name)
             if safe_name == _RUN_CODE_TOOL_NAME:
@@ -498,7 +485,7 @@ def _partition_callable_tools(
                 td = replace(td, name=safe_name)
 
             callable_defs[safe_name] = td
-        return callable_defs, sanitized_to_original, native_fallbacks
+        return callable_defs, sanitized_to_original
 
     @staticmethod
     def _build_description(callable_defs: dict[str, ToolDefinition]) -> str:

diff --git a/tests/_code_mode/test_code_mode.py b/tests/_code_mode/test_code_mode.py
@@ -61,7 +61,13 @@ def build_run_context(deps: T, run_step: int = 0) -> RunContext[T]:
     )
 
 
-async def build_ctx(deps: T, toolset: AbstractToolset[T], run_step: int = 0) -> RunContext[T]:
+async def build_ctx(
+    deps: T,
+    toolset: AbstractToolset[T],
+    run_step: int = 0,
+    *,
+    root_capability: Any = None,
+) -> RunContext[T]:
     """Build a `RunContext` with a prepared `ToolManager`.
 
     Use this for tests that call `call_tool` — `CodeModeToolset` requires
@@ -70,7 +76,7 @@ async def build_ctx(deps: T, toolset: AbstractToolset[T], run_step: int = 0) ->
     from pydantic_ai.tool_manager import ToolManager
 
     ctx = build_run_context(deps, run_step=run_step)
-    tm = ToolManager(toolset=toolset)
+    tm = ToolManager(toolset=toolset, root_capability=root_capability)
     prepared_tm = await tm.for_run_step(ctx)
     ctx.tool_manager = prepared_tm
     return ctx
@@ -682,34 +688,28 @@ def later(x: int) -> str:
         # The deferred-loading tool should NOT be exposed as a native tool
         assert 'later' not in tools
 
-    async def test_deferred_execution_tools_promoted_to_native_with_warning(self) -> None:
-        """Tools with `kind='external'` (deferred execution) are excluded from sandbox but promoted to native."""
+    async def test_deferred_execution_tools_sandboxed(self) -> None:
+        """Tools with `kind='external'`/`'unapproved'` are sandboxed like any other tool; resolution happens via a `HandleDeferredToolCalls` capability."""
         td_external = ToolDefinition(
             name='approve_action',
             description='Needs approval.',
             parameters_json_schema={'type': 'object', 'properties': {'x': {'type': 'string'}}, 'required': ['x']},
+            return_schema={'type': 'string'},
             kind='external',
         )
         static = _StaticToolset([_make_address_tool_def('get_user', 'Get a user.', 'street'), td_external])
         wrapper = CodeMode[None]().get_wrapper_toolset(static)
         assert isinstance(wrapper, CodeModeToolset)
 
         ctx = build_run_context(None)
-        with pytest.warns(UserWarning, match=r"tool 'approve_action' requires deferred execution"):
-            tools = await wrapper.get_tools(ctx)
+        tools = await wrapper.get_tools(ctx)
 
         description = tools['run_code'].tool_def.description
         assert description is not None
-        assert 'approve_action' not in description
-        # External tool is promoted to native — not lost
-        assert 'approve_action' in tools
-
-        # Second call must not warn again.
-        import warnings as _warnings
-
-        with _warnings.catch_warnings():
-            _warnings.simplefilter('error')
-            await wrapper.get_tools(ctx)
+        # The external tool appears as a sandboxed function signature.
+        assert 'async def approve_action' in description
+        # Not exposed as a native tool.
+        assert 'approve_action' not in tools
 
     async def test_tool_without_return_schema_warns(self) -> None:
         """A sandboxed tool with no return_schema triggers a one-time warning."""
@@ -1018,7 +1018,40 @@ def needs_approval() -> str:
         ctx = await build_ctx(None, wrapper)
         tools = await wrapper.get_tools(ctx)
 
-        with pytest.raises(ModelRetry, match='approval and deferral are not supported'):
+        with pytest.raises(ModelRetry, match='no `HandleDeferredToolCalls` capability resolved it'):
+            await wrapper.call_tool('run_code', {'code': 'await needs_approval()'}, ctx, tools['run_code'])
+
+    async def test_handler_denial_surfaces_as_model_retry(self) -> None:
+        """A `HandleDeferredToolCalls` handler denying a sandboxed tool call surfaces the denial.
+
+        The denial raises `RuntimeError` inside the sandbox so the script can't mistake
+        the denial message for a regular string return. If the script doesn't catch it,
+        Monty re-raises as `MontyRuntimeError`, which the harness converts to `ModelRetry`
+        with the original denial message preserved in the trace.
+        """
+        try:
+            from pydantic_ai.capabilities import HandleDeferredToolCalls
+        except ImportError:  # pragma: no cover — only fires on floor-slim CI, which doesn't gate on coverage
+            pytest.skip('Requires pydantic-ai-slim with `HandleDeferredToolCalls` (next release after 1.86.1)')
+
+        from pydantic_ai.exceptions import ApprovalRequired as _ApprovalRequired
+        from pydantic_ai.tools import DeferredToolRequests, DeferredToolResults, ToolDenied
+
+        def needs_approval() -> str:
+            """A tool that requires approval."""
+            raise _ApprovalRequired()
+
+        async def handler(ctx: RunContext[None], requests: DeferredToolRequests) -> DeferredToolResults:
+            return DeferredToolResults(
+                approvals={call.tool_call_id: ToolDenied(message='nope') for call in requests.approvals}
+            )
+
+        wrapper = CodeMode[None]().get_wrapper_toolset(_build_function_toolset(needs_approval))
+        assert isinstance(wrapper, CodeModeToolset)
+        ctx = await build_ctx(None, wrapper, root_capability=HandleDeferredToolCalls(handler=handler))
+        tools = await wrapper.get_tools(ctx)
+
+        with pytest.raises(ModelRetry, match=r'call denied: nope'):
             await wrapper.call_tool('run_code', {'code': 'await needs_approval()'}, ctx, tools['run_code'])
 
     async def test_model_retry_from_wrapped_tool_surfaces_as_model_retry(self) -> None: