From 15455d5c2c5057ba9dc62a5dc4116795c262e3a0 Mon Sep 17 00:00:00 2001 From: marcelo Date: Wed, 11 Mar 2026 08:25:43 -0700 Subject: [PATCH 1/3] add stopWhen --- docs/sdk/concepts/testing-with-llms.mdx | 19 ++++ docs/sdk/reference/test-agent.mdx | 36 +++++++ sdk/README.md | 10 ++ sdk/skills/create-mcp-eval/SKILL.md | 9 +- sdk/src/EvalAgent.ts | 27 ++++++ sdk/src/TestAgent.ts | 26 +++-- sdk/src/index.ts | 4 + .../TestAgent.stopWhen.integration.test.ts | 91 +++++++++++++++++ sdk/tests/TestAgent.test.ts | 97 +++++++++++++++++++ 9 files changed, 312 insertions(+), 7 deletions(-) create mode 100644 sdk/tests/TestAgent.stopWhen.integration.test.ts diff --git a/docs/sdk/concepts/testing-with-llms.mdx b/docs/sdk/concepts/testing-with-llms.mdx index 6605c360c..1b5556d80 100644 --- a/docs/sdk/concepts/testing-with-llms.mdx +++ b/docs/sdk/concepts/testing-with-llms.mdx @@ -143,6 +143,25 @@ const agent = new TestAgent({ }); ``` +## Control Multi-Step Loops with stopWhen + +Use `stopWhen` to pass AI SDK stop conditions into `TestAgent.prompt()` when you want to stop the multi-step loop after a particular step completes: + +```typescript +import { hasToolCall } from "@mcpjam/sdk"; + +// Stop after the step where the tool is called +const result = await agent.prompt("Search for open tasks", { + stopWhen: hasToolCall("search_tasks"), +}); + +expect(result.hasToolCall("search_tasks")).toBe(true); +``` + + +`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. 
+ + ## Writing Assertions Use validators to assert tool call behavior: diff --git a/docs/sdk/reference/test-agent.mdx b/docs/sdk/reference/test-agent.mdx index bea039067..1dda2f9d4 100644 --- a/docs/sdk/reference/test-agent.mdx +++ b/docs/sdk/reference/test-agent.mdx @@ -76,6 +76,7 @@ prompt( | Property | Type | Description | |----------|------|-------------| | `context` | `PromptResult \| PromptResult[]` | Previous result(s) for multi-turn conversations | +| `stopWhen` | `StopCondition \| Array>` | Additional AI SDK stop conditions for the multi-step loop. Tools still execute normally. `TestAgent` always applies `stepCountIs(maxSteps)` as a safety guard. | #### Returns @@ -84,6 +85,8 @@ prompt( #### Example ```typescript +import { hasToolCall } from "@mcpjam/sdk"; + // Simple prompt const result = await agent.prompt("Add 2 and 3"); @@ -93,6 +96,12 @@ const r2 = await agent.prompt("Mark it complete", { context: r1 }); // Multiple context items const r3 = await agent.prompt("Show summary", { context: [r1, r2] }); + +// Stop the loop after the step where a tool is called +const r4 = await agent.prompt("Search for tasks", { + stopWhen: hasToolCall("search_tasks"), +}); +console.log(r4.hasToolCall("search_tasks")); ``` @@ -233,6 +242,33 @@ Setting `maxSteps` too low may prevent complex tasks from completing. Setting it --- +## Control Multi-Step Loops with stopWhen + +Use `stopWhen` to pass raw AI SDK stop conditions into `TestAgent.prompt()`. +This controls whether the agent starts another generation step after the current step completes. 
+ +```typescript +import { hasToolCall } from "@mcpjam/sdk"; + +// Stop after the step where "search_tasks" is called +const result = await agent.prompt("Find my open tasks", { + stopWhen: hasToolCall("search_tasks"), +}); + +expect(result.hasToolCall("search_tasks")).toBe(true); + +// Stop after any of multiple conditions +const result2 = await agent.prompt("Do something", { + stopWhen: [hasToolCall("tool_a"), hasToolCall("tool_b")], +}); +``` + + +`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. + + +--- + ## Complete Example ```typescript diff --git a/sdk/README.md b/sdk/README.md index cd7597c95..18e71f8b4 100644 --- a/sdk/README.md +++ b/sdk/README.md @@ -197,13 +197,23 @@ const agent = new TestAgent({ }); // Run a prompt +import { hasToolCall } from "@mcpjam/sdk"; + const result = await agent.prompt("Add 2 and 3"); // Multi-turn with context const r1 = await agent.prompt("Who am I?"); const r2 = await agent.prompt("List my projects", { context: [r1] }); + +// Stop the loop after the step where a tool is called +const r3 = await agent.prompt("Search tasks", { + stopWhen: hasToolCall("search_tasks"), +}); +r3.hasToolCall("search_tasks"); // true ``` +`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. 
+ **Supported providers:** `openai`, `anthropic`, `azure`, `google`, `mistral`, `deepseek`, `ollama`, `openrouter`, `xai` diff --git a/sdk/skills/create-mcp-eval/SKILL.md b/sdk/skills/create-mcp-eval/SKILL.md index ac48df0ff..643539da5 100644 --- a/sdk/skills/create-mcp-eval/SKILL.md +++ b/sdk/skills/create-mcp-eval/SKILL.md @@ -194,12 +194,20 @@ const agent = new TestAgent({ }); // Single prompt +import { hasToolCall } from "@mcpjam/sdk"; + const result = await agent.prompt("List all projects"); // Multi-turn with context const r1 = await agent.prompt("Get my user profile"); const r2 = await agent.prompt("List workspaces for that user", { context: r1 }); +// Stop the loop after the step where a tool is called +const r3 = await agent.prompt("Search tasks", { + stopWhen: hasToolCall("search_tasks"), +}); +r3.hasToolCall("search_tasks"); // true + // Mock agent for deterministic tests (no LLM needed) const mockAgent = TestAgent.mock(async (message) => PromptResult.from({ @@ -998,4 +1006,3 @@ it("selects search_tasks", async () => { expect(result.hasToolCall("search_tasks")).toBe(true); }, 90_000); ``` - diff --git a/sdk/src/EvalAgent.ts b/sdk/src/EvalAgent.ts index 0945fc55b..59606dd4b 100644 --- a/sdk/src/EvalAgent.ts +++ b/sdk/src/EvalAgent.ts @@ -1,3 +1,4 @@ +import type { StopCondition, ToolSet } from "ai"; import type { PromptResult } from "./PromptResult.js"; /** @@ -6,6 +7,32 @@ import type { PromptResult } from "./PromptResult.js"; export interface PromptOptions { /** Previous PromptResult(s) to include as conversation context for multi-turn conversations */ context?: PromptResult | PromptResult[]; + + /** + * Additional AI SDK stop conditions for the agentic loop. + * Evaluated after each step completes (tools execute normally). + * `stepCountIs(maxSteps)` is always applied as a safety guard + * in addition to any conditions provided here. + * + * Import helpers like `hasToolCall` and `stepCountIs` from `"@mcpjam/sdk"`. 
+ * + * @example + * ```typescript + * import { hasToolCall } from "@mcpjam/sdk"; + * + * // Stop the loop after the step where "search_tasks" is called + * const result = await agent.prompt("Find my tasks", { + * stopWhen: hasToolCall("search_tasks"), + * }); + * expect(result.hasToolCall("search_tasks")).toBe(true); + * + * // Multiple conditions (any one being true stops the loop) + * const result = await agent.prompt("Do something", { + * stopWhen: [hasToolCall("tool_a"), hasToolCall("tool_b")], + * }); + * ``` + */ + stopWhen?: StopCondition | Array>; } /** diff --git a/sdk/src/TestAgent.ts b/sdk/src/TestAgent.ts index 8e1c80cb8..5e3a9eb58 100644 --- a/sdk/src/TestAgent.ts +++ b/sdk/src/TestAgent.ts @@ -3,7 +3,7 @@ */ import { generateText, stepCountIs, dynamicTool, jsonSchema } from "ai"; -import type { ToolSet, ModelMessage, UserModelMessage } from "ai"; +import type { StopCondition, ToolSet, ModelMessage, UserModelMessage } from "ai"; import { CallToolResultSchema } from "@modelcontextprotocol/sdk/types.js"; import { createModelFromString, parseLLMString } from "./model-factory.js"; import type { CreateModelOptions } from "./model-factory.js"; @@ -318,6 +318,17 @@ export class TestAgent implements EvalAgent { return instrumented; } + private resolveStopWhen( + stopWhen?: PromptOptions["stopWhen"] + ): StopCondition | Array> { + if (stopWhen == null) { + return stepCountIs(this.maxSteps); + } + + const conditions = Array.isArray(stopWhen) ? stopWhen : [stopWhen]; + return [stepCountIs(this.maxSteps), ...conditions]; + } + /** * Build an array of ModelMessages from previous PromptResult(s) for multi-turn context. 
* @param context - Single PromptResult or array of PromptResults to include as context @@ -383,10 +394,13 @@ export class TestAgent implements EvalAgent { const model = createModelFromString(this.model, modelOptions); // Instrument tools to track MCP execution time - const instrumentedTools = this.createInstrumentedTools((ms) => { - totalMcpMs += ms; - stepMcpMs += ms; // Accumulate per-step for LLM calculation - }, widgetSnapshots); + const instrumentedTools = this.createInstrumentedTools( + (ms) => { + totalMcpMs += ms; + stepMcpMs += ms; // Accumulate per-step for LLM calculation + }, + widgetSnapshots + ); // Build messages array if context is provided for multi-turn const contextMessages = this.buildContextMessages(options?.context); @@ -407,7 +421,7 @@ export class TestAgent implements EvalAgent { }), // Use stopWhen with stepCountIs for controlling max agentic steps // AI SDK v6+ uses this instead of maxSteps - stopWhen: stepCountIs(this.maxSteps), + stopWhen: this.resolveStopWhen(options?.stopWhen), onStepFinish: () => { const now = Date.now(); const stepDuration = now - lastStepEndTime; diff --git a/sdk/src/index.ts b/sdk/src/index.ts index 2ec293a72..93ce1b894 100644 --- a/sdk/src/index.ts +++ b/sdk/src/index.ts @@ -82,6 +82,10 @@ export { EvalReportingError, SdkError } from "./errors.js"; // EvalAgent interface (for deterministic testing without concrete TestAgent) export type { EvalAgent, PromptOptions } from "./EvalAgent.js"; +// AI SDK stop condition helpers re-exported for TestAgent.prompt() +export { hasToolCall, stepCountIs } from "ai"; +export type { StopCondition } from "ai"; + // TestAgent export { TestAgent } from "./TestAgent.js"; export type { TestAgentConfig } from "./TestAgent.js"; diff --git a/sdk/tests/TestAgent.stopWhen.integration.test.ts b/sdk/tests/TestAgent.stopWhen.integration.test.ts new file mode 100644 index 000000000..f192e3952 --- /dev/null +++ b/sdk/tests/TestAgent.stopWhen.integration.test.ts @@ -0,0 +1,91 @@ +import { 
dynamicTool, hasToolCall, jsonSchema } from "ai"; +import { MockLanguageModelV3 } from "ai/test"; +import { TestAgent } from "../src/TestAgent"; + +let currentModel: MockLanguageModelV3; +const mockCreateModelFromString = jest.fn(() => currentModel); + +jest.mock("../src/model-factory", () => { + const actual = jest.requireActual("../src/model-factory"); + return { + ...actual, + createModelFromString: (...args: any[]) => mockCreateModelFromString(...args), + }; +}); + +describe("TestAgent stopWhen integration", () => { + beforeEach(() => { + mockCreateModelFromString.mockClear(); + }); + + it("executes the tool and stops before the next generation step", async () => { + const toolExecutions: Array> = []; + let stepNumber = 0; + + currentModel = new MockLanguageModelV3({ + doGenerate: async () => { + stepNumber += 1; + + if (stepNumber === 1) { + return { + content: [ + { + type: "tool-call" as const, + toolCallId: "call-1", + toolName: "add", + input: JSON.stringify({ a: 2, b: 3 }), + }, + ], + finishReason: "tool-calls", + usage: { inputTokens: 5, outputTokens: 3, totalTokens: 8 }, + warnings: [], + }; + } + + return { + content: [{ type: "text" as const, text: "The result is 5" }], + finishReason: "stop", + usage: { inputTokens: 4, outputTokens: 2, totalTokens: 6 }, + warnings: [], + }; + }, + }); + + const agent = new TestAgent({ + tools: { + add: dynamicTool({ + description: "Add two numbers", + inputSchema: jsonSchema({ + type: "object", + properties: { + a: { type: "number" }, + b: { type: "number" }, + }, + required: ["a", "b"], + }), + execute: async (input) => { + const args = input as { a: number; b: number }; + toolExecutions.push(args); + return args.a + args.b; + }, + }), + }, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + const result = await agent.prompt("Add 2 and 3", { + stopWhen: hasToolCall("add"), + }); + + expect(toolExecutions).toEqual([{ a: 2, b: 3 }]); + expect(result.hasToolCall("add")).toBe(true); + 
expect(result.getToolArguments("add")).toEqual({ a: 2, b: 3 }); + expect(result.text).toBe(""); + expect(currentModel.doGenerateCalls).toHaveLength(1); + expect(mockCreateModelFromString).toHaveBeenCalledWith( + "openai/gpt-4o", + expect.objectContaining({ apiKey: "test-key" }) + ); + }); +}); diff --git a/sdk/tests/TestAgent.test.ts b/sdk/tests/TestAgent.test.ts index b0b66dfad..1e0eb7b85 100644 --- a/sdk/tests/TestAgent.test.ts +++ b/sdk/tests/TestAgent.test.ts @@ -894,6 +894,103 @@ describe("TestAgent", () => { }); }); + describe("stopWhen", () => { + it("should merge a single stop condition with maxSteps and still execute tools", async () => { + const stopCondition = jest.fn(() => false); + + mockGenerateText.mockImplementationOnce(async (params: any) => { + const result = await params.tools.add.execute( + { a: 2, b: 3 }, + { abortSignal: { throwIfAborted: jest.fn() } } + ); + expect(result).toBe(5); + params.onStepFinish?.(); + return { + text: "Done", + steps: [ + { + toolCalls: [ + { + type: "tool-call", + toolCallId: "1", + toolName: "add", + input: { a: 2, b: 3 }, + }, + ], + }, + ], + usage: { inputTokens: 5, outputTokens: 3, totalTokens: 8 }, + } as any; + }); + + const agent = new TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + const result = await agent.prompt("Add 2 and 3", { + stopWhen: stopCondition as any, + }); + + expect(result.hasToolCall("add")).toBe(true); + expect(result.getToolArguments("add")).toEqual({ a: 2, b: 3 }); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs.stopWhen).toEqual([ + { type: "stepCount", value: 10 }, + stopCondition, + ]); + }); + + it("should merge multiple stop conditions with maxSteps", async () => { + const stopA = jest.fn(() => false); + const stopB = jest.fn(() => true); + + mockGenerateText.mockResolvedValueOnce({ + text: "Done", + steps: [], + usage: { inputTokens: 5, outputTokens: 3, totalTokens: 8 }, + } as any); + + const agent = new 
TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + await agent.prompt("Do math", { + stopWhen: [stopA as any, stopB as any], + }); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs.stopWhen).toEqual([ + { type: "stepCount", value: 10 }, + stopA, + stopB, + ]); + }); + + it("should default to stepCountIs when stopWhen is not set", async () => { + mockGenerateText.mockResolvedValueOnce({ + text: "OK", + steps: [], + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + } as any); + + const agent = new TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + await agent.prompt("Test"); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs.stopWhen).toEqual({ type: "stepCount", value: 10 }); + }); + }); + describe("app-only tool filtering", () => { // Helper to create a mock Tool with visibility const createMockTool = ( From 974ecdac593c00b0274ecb1676ef4213d74d51fb Mon Sep 17 00:00:00 2001 From: marcelo Date: Wed, 11 Mar 2026 08:44:15 -0700 Subject: [PATCH 2/3] add timeout --- docs/sdk/concepts/connecting-servers.mdx | 2 +- docs/sdk/concepts/testing-with-llms.mdx | 22 +++- docs/sdk/reference/llm-providers.mdx | 2 +- docs/sdk/reference/prompt-result.mdx | 2 +- docs/sdk/reference/test-agent.mdx | 43 +++++- sdk/README.md | 12 +- sdk/skills/create-mcp-eval/SKILL.md | 12 +- sdk/src/EvalAgent.ts | 17 ++- sdk/src/TestAgent.ts | 3 + sdk/tests/TestAgent.test.ts | 65 ++++++++++ .../TestAgent.timeout.integration.test.ts | 122 ++++++++++++++++++ 11 files changed, 287 insertions(+), 15 deletions(-) create mode 100644 sdk/tests/TestAgent.timeout.integration.test.ts diff --git a/docs/sdk/concepts/connecting-servers.mdx b/docs/sdk/concepts/connecting-servers.mdx index d83fd905c..eef38d276 100644 --- a/docs/sdk/concepts/connecting-servers.mdx +++ b/docs/sdk/concepts/connecting-servers.mdx @@ -176,7 +176,7 @@ const manager = new MCPClientManager({ 
await manager.connectToServer("myServer"); -// Get tools in AI SDK format +// Get tools for TestAgent const tools = await manager.getTools(); // Create agent with those tools diff --git a/docs/sdk/concepts/testing-with-llms.mdx b/docs/sdk/concepts/testing-with-llms.mdx index 1b5556d80..228c42e9c 100644 --- a/docs/sdk/concepts/testing-with-llms.mdx +++ b/docs/sdk/concepts/testing-with-llms.mdx @@ -145,7 +145,7 @@ const agent = new TestAgent({ ## Control Multi-Step Loops with stopWhen -Use `stopWhen` to pass AI SDK stop conditions into `TestAgent.prompt()` when you want to stop the multi-step loop after a particular step completes: +Use `stopWhen` when you want to stop the multi-step loop after a particular step completes: ```typescript import { hasToolCall } from "@mcpjam/sdk"; @@ -159,7 +159,25 @@ expect(result.hasToolCall("search_tasks")).toBe(true); ``` -`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. +`stopWhen` does not skip tool execution. It controls whether the prompt loop continues after the current step completes, and `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. + + +## Bound Prompt Runtime with timeout + +Use `timeout` when you want to bound how long `TestAgent.prompt()` can run: + +```typescript +const result = await agent.prompt("Run a long workflow", { + timeout: { totalMs: 10_000, stepMs: 2_500 }, +}); + +if (result.hasError()) { + console.error(result.getError()); +} +``` + + +`timeout` accepts `number`, `totalMs`, `stepMs`, and `chunkMs`. In practice, `number` and `totalMs` cap the full prompt, `stepMs` caps each step, and `chunkMs` mainly matters in streaming flows. The runtime creates an internal abort signal, so tools can stop early if their implementation respects the provided `abortSignal`. 
## Writing Assertions diff --git a/docs/sdk/reference/llm-providers.mdx b/docs/sdk/reference/llm-providers.mdx index bd8ff88d7..424e781af 100644 --- a/docs/sdk/reference/llm-providers.mdx +++ b/docs/sdk/reference/llm-providers.mdx @@ -384,7 +384,7 @@ const { provider, modelId } = parseLLMString("anthropic/claude-sonnet-4-20250514 ### createModelFromString() -Create a Vercel AI SDK model directly. +Create a provider model instance directly. ```typescript import { createModelFromString } from "@mcpjam/sdk"; diff --git a/docs/sdk/reference/prompt-result.mdx b/docs/sdk/reference/prompt-result.mdx index 0c015ac5a..b1e24daf4 100644 --- a/docs/sdk/reference/prompt-result.mdx +++ b/docs/sdk/reference/prompt-result.mdx @@ -328,7 +328,7 @@ getMessages(): CoreMessage[] #### Returns -`CoreMessage[]` - Vercel AI SDK message format. +`CoreMessage[]` - The full conversation message format used by the SDK. #### Example diff --git a/docs/sdk/reference/test-agent.mdx b/docs/sdk/reference/test-agent.mdx index 1dda2f9d4..ae74533b3 100644 --- a/docs/sdk/reference/test-agent.mdx +++ b/docs/sdk/reference/test-agent.mdx @@ -4,7 +4,7 @@ description: "API reference for TestAgent" icon: "book" --- -The `TestAgent` class wraps LLM providers via the Vercel AI SDK, enabling you to run prompts with MCP tools. It handles the agentic loop and returns rich result objects. +The `TestAgent` class runs prompts with MCP tools enabled. It handles the multi-step prompt loop and returns rich result objects. ## Import @@ -76,7 +76,8 @@ prompt( | Property | Type | Description | |----------|------|-------------| | `context` | `PromptResult \| PromptResult[]` | Previous result(s) for multi-turn conversations | -| `stopWhen` | `StopCondition \| Array>` | Additional AI SDK stop conditions for the multi-step loop. Tools still execute normally. `TestAgent` always applies `stepCountIs(maxSteps)` as a safety guard. | +| `stopWhen` | `StopCondition \| Array>` | Additional conditions for the multi-step prompt loop. 
Tools still execute normally. `TestAgent` always applies `stepCountIs(maxSteps)` as a safety guard. | +| `timeout` | `number \| { totalMs?: number; stepMs?: number; chunkMs?: number }` | Bounds prompt runtime. `number` and `totalMs` cap the full prompt, `stepMs` caps each generation step, and `chunkMs` is accepted for parity but is mainly relevant to streaming APIs. | #### Returns @@ -102,12 +103,25 @@ const r4 = await agent.prompt("Search for tasks", { stopWhen: hasToolCall("search_tasks"), }); console.log(r4.hasToolCall("search_tasks")); + +// Bound prompt runtime +const r5 = await agent.prompt("Run a long workflow", { + timeout: { totalMs: 10_000, stepMs: 2_500 }, +}); + +if (r5.hasError()) { + console.error(r5.getError()); +} ``` `prompt()` never throws exceptions. Errors are captured in the `PromptResult`. Check `result.hasError()` to detect failures. + +`timeout` bounds prompt runtime. The runtime creates an internal abort signal, so tools can stop early if their implementation respects the provided `abortSignal`. If a tool ignores that signal, its underlying work may continue briefly after the prompt returns an error result. + + --- ## Model String Format @@ -244,8 +258,7 @@ Setting `maxSteps` too low may prevent complex tasks from completing. Setting it ## Control Multi-Step Loops with stopWhen -Use `stopWhen` to pass raw AI SDK stop conditions into `TestAgent.prompt()`. -This controls whether the agent starts another generation step after the current step completes. +Use `stopWhen` to control whether the agent starts another step after the current step completes. ```typescript import { hasToolCall } from "@mcpjam/sdk"; @@ -264,7 +277,27 @@ const result2 = await agent.prompt("Do something", { ``` -`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. +`stopWhen` does not skip tool execution. 
It controls whether the prompt loop continues after the current step completes, and `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. + + +--- + +## Bound Prompt Runtime with timeout + +Use `timeout` when you want to bound how long `TestAgent.prompt()` can run: + +```typescript +const result = await agent.prompt("Run a long workflow", { + timeout: 10_000, +}); + +const result2 = await agent.prompt("Run a long workflow", { + timeout: { totalMs: 10_000, stepMs: 2_500, chunkMs: 1_000 }, +}); +``` + + +`chunkMs` is accepted for parity, but it is mainly useful for streaming APIs. For `TestAgent.prompt()`, `number`, `totalMs`, and `stepMs` are the main settings to focus on. --- diff --git a/sdk/README.md b/sdk/README.md index 18e71f8b4..116397a3a 100644 --- a/sdk/README.md +++ b/sdk/README.md @@ -163,7 +163,7 @@ await manager.connectToServer("asana", { }, }); -// Get tools for AI SDK integration +// Get tools for TestAgent const tools = await manager.getToolsForAiSdk(["everything", "asana"]); // Direct MCP operations @@ -210,9 +210,17 @@ const r3 = await agent.prompt("Search tasks", { stopWhen: hasToolCall("search_tasks"), }); r3.hasToolCall("search_tasks"); // true + +// Bound prompt runtime +const r4 = await agent.prompt("Run a long workflow", { + timeout: { totalMs: 10_000, stepMs: 2_500 }, +}); +r4.hasError(); // true if the prompt timed out ``` -`stopWhen` does not skip tool execution. AI SDK evaluates stop conditions after the current step completes and tool results are available. `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. +`stopWhen` does not skip tool execution. It controls whether the prompt loop continues after the current step completes, and `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. + +`timeout` bounds prompt runtime. `number` and `totalMs` cap the full prompt, `stepMs` caps each step, and `chunkMs` is accepted for parity but mainly matters in streaming flows. 
The runtime creates an internal abort signal, so tools can stop early if their implementation respects the provided `abortSignal`. **Supported providers:** `openai`, `anthropic`, `azure`, `google`, `mistral`, `deepseek`, `ollama`, `openrouter`, `xai` diff --git a/sdk/skills/create-mcp-eval/SKILL.md b/sdk/skills/create-mcp-eval/SKILL.md index 643539da5..caba61cd9 100644 --- a/sdk/skills/create-mcp-eval/SKILL.md +++ b/sdk/skills/create-mcp-eval/SKILL.md @@ -172,7 +172,7 @@ await manager.connectToServer("server-id", { env: { API_KEY: "..." }, }); -// Get AI SDK-compatible tools for TestAgent +// Get tools for TestAgent const tools = await manager.getToolsForAiSdk(["server-id"]); // Cleanup @@ -208,6 +208,12 @@ const r3 = await agent.prompt("Search tasks", { }); r3.hasToolCall("search_tasks"); // true +// Bound prompt runtime +const r4 = await agent.prompt("Run a long workflow", { + timeout: { totalMs: 10_000, stepMs: 2_500 }, +}); +r4.hasError(); // true if the prompt timed out + // Mock agent for deterministic tests (no LLM needed) const mockAgent = TestAgent.mock(async (message) => PromptResult.from({ @@ -224,6 +230,10 @@ const mockAgent = TestAgent.mock(async (message) => ); ``` +`stopWhen` does not skip tool execution. It controls whether the prompt loop continues after the current step completes, and `TestAgent` also applies `stepCountIs(maxSteps)` as a safety guard. + +`timeout` bounds prompt runtime. `number` and `totalMs` cap the full prompt, `stepMs` caps each step, and `chunkMs` is accepted for parity but mainly matters in streaming flows. The runtime creates an internal abort signal, so tools can stop early if their implementation respects the provided `abortSignal`. 
+ ### PromptResult — Inspect Agent Responses ```typescript diff --git a/sdk/src/EvalAgent.ts b/sdk/src/EvalAgent.ts index 59606dd4b..8b9fa1c19 100644 --- a/sdk/src/EvalAgent.ts +++ b/sdk/src/EvalAgent.ts @@ -1,4 +1,4 @@ -import type { StopCondition, ToolSet } from "ai"; +import type { StopCondition, TimeoutConfiguration, ToolSet } from "ai"; import type { PromptResult } from "./PromptResult.js"; /** @@ -9,7 +9,7 @@ export interface PromptOptions { context?: PromptResult | PromptResult[]; /** - * Additional AI SDK stop conditions for the agentic loop. + * Additional stop conditions for the agentic loop. * Evaluated after each step completes (tools execute normally). * `stepCountIs(maxSteps)` is always applied as a safety guard * in addition to any conditions provided here. @@ -33,6 +33,19 @@ export interface PromptOptions { * ``` */ stopWhen?: StopCondition | Array>; + + /** + * Timeout for the prompt runtime. + * + * - `number`: total timeout for the entire prompt call in milliseconds + * - `{ totalMs }`: total timeout across all steps + * - `{ stepMs }`: timeout for each generation step + * - `{ chunkMs }`: accepted for parity and primarily relevant to streaming APIs + * + * The runtime creates an internal abort signal. Tools can stop early if they + * respect the `abortSignal` passed to `execute()`. 
+ */ + timeout?: TimeoutConfiguration; } /** diff --git a/sdk/src/TestAgent.ts b/sdk/src/TestAgent.ts index 5e3a9eb58..8bf9e66ae 100644 --- a/sdk/src/TestAgent.ts +++ b/sdk/src/TestAgent.ts @@ -419,6 +419,9 @@ export class TestAgent implements EvalAgent { ...(this.temperature !== undefined && { temperature: this.temperature, }), + ...(options?.timeout !== undefined && { + timeout: options.timeout, + }), // Use stopWhen with stepCountIs for controlling max agentic steps // AI SDK v6+ uses this instead of maxSteps stopWhen: this.resolveStopWhen(options?.stopWhen), diff --git a/sdk/tests/TestAgent.test.ts b/sdk/tests/TestAgent.test.ts index 1e0eb7b85..0f7460ec6 100644 --- a/sdk/tests/TestAgent.test.ts +++ b/sdk/tests/TestAgent.test.ts @@ -991,6 +991,71 @@ describe("TestAgent", () => { }); }); + describe("timeout", () => { + it("should pass through a numeric timeout", async () => { + mockGenerateText.mockResolvedValueOnce({ + text: "OK", + steps: [], + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + } as any); + + const agent = new TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + await agent.prompt("Test", { timeout: 5000 }); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs.timeout).toBe(5000); + }); + + it("should pass through an object timeout", async () => { + mockGenerateText.mockResolvedValueOnce({ + text: "OK", + steps: [], + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + } as any); + + const agent = new TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + await agent.prompt("Test", { + timeout: { totalMs: 5000, stepMs: 1000, chunkMs: 250 }, + }); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs.timeout).toEqual({ + totalMs: 5000, + stepMs: 1000, + chunkMs: 250, + }); + }); + + it("should omit timeout when it is not set", async () => { + mockGenerateText.mockResolvedValueOnce({ + text: "OK", 
+ steps: [], + usage: { inputTokens: 1, outputTokens: 1, totalTokens: 2 }, + } as any); + + const agent = new TestAgent({ + tools: mockToolSet, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + await agent.prompt("Test"); + + const callArgs = mockGenerateText.mock.calls[0][0] as any; + expect(callArgs).not.toHaveProperty("timeout"); + }); + }); + describe("app-only tool filtering", () => { // Helper to create a mock Tool with visibility const createMockTool = ( diff --git a/sdk/tests/TestAgent.timeout.integration.test.ts b/sdk/tests/TestAgent.timeout.integration.test.ts new file mode 100644 index 000000000..205b4cc65 --- /dev/null +++ b/sdk/tests/TestAgent.timeout.integration.test.ts @@ -0,0 +1,122 @@ +import { dynamicTool, jsonSchema } from "ai"; +import { MockLanguageModelV3 } from "ai/test"; +import { TestAgent } from "../src/TestAgent"; + +let currentModel: MockLanguageModelV3; +const mockCreateModelFromString = jest.fn(() => currentModel); + +jest.mock("../src/model-factory", () => { + const actual = jest.requireActual("../src/model-factory"); + return { + ...actual, + createModelFromString: (...args: any[]) => + mockCreateModelFromString(...args), + }; +}); + +function toError(reason: unknown): Error { + if (reason instanceof Error) { + return reason; + } + + return new Error(String(reason ?? 
"aborted")); +} + +describe("TestAgent timeout integration", () => { + beforeEach(() => { + mockCreateModelFromString.mockClear(); + }); + + it("returns an error result when AI SDK timeout aborts a tool cooperatively", async () => { + let sawAbortSignal = false; + let abortObserved = false; + let stepNumber = 0; + + currentModel = new MockLanguageModelV3({ + doGenerate: async ({ abortSignal }) => { + stepNumber += 1; + + if (stepNumber === 1) { + return { + content: [ + { + type: "tool-call" as const, + toolCallId: "call-1", + toolName: "wait", + input: JSON.stringify({}), + }, + ], + finishReason: "tool-calls", + usage: { inputTokens: 5, outputTokens: 3, totalTokens: 8 }, + warnings: [], + }; + } + + if (abortSignal?.aborted) { + throw toError(abortSignal.reason); + } + + return { + content: [{ type: "text" as const, text: "unexpected follow-up" }], + finishReason: "stop", + usage: { inputTokens: 4, outputTokens: 2, totalTokens: 6 }, + warnings: [], + }; + }, + }); + + const agent = new TestAgent({ + tools: { + wait: dynamicTool({ + description: "Wait until the abort signal fires", + inputSchema: jsonSchema({ + type: "object", + properties: {}, + }), + execute: async (_input, { abortSignal }) => { + sawAbortSignal = abortSignal != null; + + if (abortSignal == null) { + throw new Error("missing abort signal"); + } + + if (abortSignal.aborted) { + abortObserved = true; + throw toError(abortSignal.reason); + } + + await new Promise((_, reject) => { + abortSignal.addEventListener( + "abort", + () => { + abortObserved = true; + reject(toError(abortSignal.reason)); + }, + { once: true } + ); + }); + + throw new Error("unreachable"); + }, + }), + }, + model: "openai/gpt-4o", + apiKey: "test-key", + }); + + const startedAt = Date.now(); + const result = await agent.prompt("Run the long tool", { timeout: 25 }); + const elapsedMs = Date.now() - startedAt; + + expect(sawAbortSignal).toBe(true); + expect(abortObserved).toBe(true); + expect(result.hasError()).toBe(true); + 
expect(result.getError()).toEqual(expect.any(String)); + expect(elapsedMs).toBeLessThan(1000); + expect(currentModel.doGenerateCalls).toHaveLength(2); + expect(mockCreateModelFromString).toHaveBeenCalledWith( + "openai/gpt-4o", + expect.objectContaining({ apiKey: "test-key" }) + ); + }); +}); From 8bfc602706b8c107d5468723e24dde92894cc3db Mon Sep 17 00:00:00 2001 From: marcelo Date: Wed, 11 Mar 2026 09:01:35 -0700 Subject: [PATCH 3/3] nits --- sdk/README.md | 4 ++-- sdk/skills/create-mcp-eval/SKILL.md | 3 +-- sdk/src/TestAgent.ts | 8 +++++--- sdk/tests/TestAgent.test.ts | 32 ++++++++++++++++++----------- 4 files changed, 28 insertions(+), 19 deletions(-) diff --git a/sdk/README.md b/sdk/README.md index 116397a3a..b01c0c24d 100644 --- a/sdk/README.md +++ b/sdk/README.md @@ -187,6 +187,8 @@ await manager.disconnectServer("everything"); Runs LLM prompts with MCP tool access. ```ts +import { hasToolCall } from "@mcpjam/sdk"; + const agent = new TestAgent({ tools: await manager.getToolsForAiSdk(), model: "openai/gpt-4o", // provider/model format @@ -197,8 +199,6 @@ const agent = new TestAgent({ }); // Run a prompt -import { hasToolCall } from "@mcpjam/sdk"; - const result = await agent.prompt("Add 2 and 3"); // Multi-turn with context diff --git a/sdk/skills/create-mcp-eval/SKILL.md b/sdk/skills/create-mcp-eval/SKILL.md index caba61cd9..22f326256 100644 --- a/sdk/skills/create-mcp-eval/SKILL.md +++ b/sdk/skills/create-mcp-eval/SKILL.md @@ -185,6 +185,7 @@ await manager.disconnectAllServers(); ```typescript import { TestAgent } from "@mcpjam/sdk"; +import { hasToolCall } from "@mcpjam/sdk"; const agent = new TestAgent({ tools, // from manager.getToolsForAiSdk() @@ -194,8 +195,6 @@ const agent = new TestAgent({ }); // Single prompt -import { hasToolCall } from "@mcpjam/sdk"; - const result = await agent.prompt("List all projects"); // Multi-turn with context diff --git a/sdk/src/TestAgent.ts b/sdk/src/TestAgent.ts index 8bf9e66ae..53400f208 100644 --- 
a/sdk/src/TestAgent.ts +++ b/sdk/src/TestAgent.ts @@ -320,13 +320,15 @@ export class TestAgent implements EvalAgent { private resolveStopWhen( stopWhen?: PromptOptions["stopWhen"] - ): StopCondition<ToolSet> | Array<StopCondition<ToolSet>> { + ): Array<StopCondition<ToolSet>> { + const base = [stepCountIs(this.maxSteps)]; + if (stopWhen == null) { - return stepCountIs(this.maxSteps); + return base; } const conditions = Array.isArray(stopWhen) ? stopWhen : [stopWhen]; - return [stepCountIs(this.maxSteps), ...conditions]; + return [...base, ...conditions]; } /** diff --git a/sdk/tests/TestAgent.test.ts b/sdk/tests/TestAgent.test.ts index 0f7460ec6..7c1045fdf 100644 --- a/sdk/tests/TestAgent.test.ts +++ b/sdk/tests/TestAgent.test.ts @@ -19,12 +19,13 @@ jest.mock("../src/model-factory", () => ({ createModelFromString: jest.fn(() => ({})), })); -import { generateText, jsonSchema } from "ai"; +import { generateText, jsonSchema, stepCountIs } from "ai"; import { createModelFromString } from "../src/model-factory"; const mockGenerateText = generateText as jest.MockedFunction< typeof generateText >; +const mockStepCountIs = stepCountIs as jest.MockedFunction<typeof stepCountIs>; const mockCreateModel = createModelFromString as jest.MockedFunction< typeof createModelFromString >; @@ -531,6 +532,9 @@ describe("TestAgent", () => { }); it("should pass system prompt and temperature to generateText", async () => { + const guard = { kind: "max-step-guard" } as any; + mockStepCountIs.mockReturnValueOnce(guard); + mockGenerateText.mockResolvedValueOnce({ text: "OK", steps: [], @@ -548,12 +552,13 @@ describe("TestAgent", () => { await agent.prompt("What is 2+2?"); + expect(mockStepCountIs).toHaveBeenCalledWith(15); expect(mockGenerateText).toHaveBeenCalledWith( expect.objectContaining({ system: "You are a math tutor.", prompt: "What is 2+2?", temperature: 0.3, - stopWhen: { type: "stepCount", value: 15 }, + stopWhen: [guard], }) ); @@ -897,6 +902,8 @@ describe("TestAgent", () => { describe("stopWhen", () => { it("should merge a single stop condition with 
maxSteps and still execute tools", async () => { const stopCondition = jest.fn(() => false); + const guard = { kind: "max-step-guard" } as any; + mockStepCountIs.mockReturnValueOnce(guard); mockGenerateText.mockImplementationOnce(async (params: any) => { const result = await params.tools.add.execute( @@ -933,19 +940,19 @@ describe("TestAgent", () => { stopWhen: stopCondition as any, }); + expect(mockStepCountIs).toHaveBeenCalledWith(10); expect(result.hasToolCall("add")).toBe(true); expect(result.getToolArguments("add")).toEqual({ a: 2, b: 3 }); const callArgs = mockGenerateText.mock.calls[0][0] as any; - expect(callArgs.stopWhen).toEqual([ - { type: "stepCount", value: 10 }, - stopCondition, - ]); + expect(callArgs.stopWhen).toEqual([guard, stopCondition]); }); it("should merge multiple stop conditions with maxSteps", async () => { const stopA = jest.fn(() => false); const stopB = jest.fn(() => true); + const guard = { kind: "max-step-guard" } as any; + mockStepCountIs.mockReturnValueOnce(guard); mockGenerateText.mockResolvedValueOnce({ text: "Done", @@ -963,15 +970,15 @@ describe("TestAgent", () => { stopWhen: [stopA as any, stopB as any], }); + expect(mockStepCountIs).toHaveBeenCalledWith(10); const callArgs = mockGenerateText.mock.calls[0][0] as any; - expect(callArgs.stopWhen).toEqual([ - { type: "stepCount", value: 10 }, - stopA, - stopB, - ]); + expect(callArgs.stopWhen).toEqual([guard, stopA, stopB]); }); it("should default to stepCountIs when stopWhen is not set", async () => { + const guard = { kind: "max-step-guard" } as any; + mockStepCountIs.mockReturnValueOnce(guard); + mockGenerateText.mockResolvedValueOnce({ text: "OK", steps: [], @@ -986,8 +993,9 @@ describe("TestAgent", () => { await agent.prompt("Test"); + expect(mockStepCountIs).toHaveBeenCalledWith(10); const callArgs = mockGenerateText.mock.calls[0][0] as any; - expect(callArgs.stopWhen).toEqual({ type: "stepCount", value: 10 }); + expect(callArgs.stopWhen).toEqual([guard]); }); });