diff --git a/.gitignore b/.gitignore index 9a941be..0e235ba 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ temp/ # Jest cache .jest/ +.codex diff --git a/CLAUDE.md b/CLAUDE.md index 11ae7fa..36aa673 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -73,7 +73,7 @@ Sessions enable multi-turn conversations with Codex: 2. **Subsequent requests with same sessionId**: Uses `codex exec resume ` (native Codex resume) 3. **Fallback**: If no conversation ID exists, manually builds enhanced prompt from conversation history -**Important**: When resuming sessions, `sandbox`, `fullAuto`, and `workingDirectory` parameters are NOT applied (Codex CLI limitation). +**Important**: When resuming sessions, `sandbox` and `workingDirectory` are not applied. This server does forward `fullAuto` and `bypassApprovals` on resume, but those flags still depend on the installed Codex CLI supporting them. ### Streaming Progress @@ -113,16 +113,18 @@ codex exec --model X --sandbox Y [-c config=value] --skip-git-repo-check "prompt **Resume mode** (existing conversations): ``` -codex exec --skip-git-repo-check -c model="X" -c model_reasoning_effort="Y" resume "prompt" +codex exec --skip-git-repo-check -c model="X" -c model_reasoning_effort="Y" [--full-auto] [--dangerously-bypass-approvals-and-sandbox] resume "prompt" ``` Note: Config flags (`-c`) must come BEFORE the subcommand (`exec` or `resume`). ### Environment Variables -- `CODEX_DEFAULT_MODEL`: Default model for codex/review tools (default: `gpt-5.3-codex`) +- `CODEX_DEFAULT_MODEL`: Default model for codex/review tools (default: `gpt-5.4`) - `CODEX_MCP_CALLBACK_URI`: Static MCP callback URI passed to Codex (override via tool arg) - `STRUCTURED_CONTENT_ENABLED`: Enable `structuredContent` in responses (default: false) +- `CODEX_MCP_DEBUG_STARTUP`: Emit startup banner for debugging (default: false) +- `CODEX_MCP_DEBUG_COMMANDS`: Emit command argv/stderr for debugging (default: false) ## TypeScript Configuration diff --git a/README.md b/README.md index d297ef1..e78f70e 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,8 @@ Use review with uncommitted true to review my local changes ``` Use codex with model "o3" and reasoningEffort "high" for complex analysis Use codex with fullAuto true and sandbox "workspace-write" for automated tasks +Use codex with bypassApprovals true only inside an externally sandboxed environment +Use codex with timeoutMs 300000 for long-running tasks Use codex with callbackUri "http://localhost:1234/callback" for static callbacks Use codex to return structuredContent with threadId metadata when available ``` @@ -107,9 +109,17 @@ Use websearch with query "React Server Components" and searchDepth "full" - **[API Reference](docs/api-reference.md)** — Full tool parameters and response formats - **[Session Management](docs/session-management.md)** — How conversations work - **[Codex CLI Integration](docs/codex-cli-integration.md)** — Version compatibility and CLI details +- **[Claude Code Lifecycle Investigation](docs/claude-code-lifecycle-investigation.md)** — Reproductions, fixes, and what Claude tears down itself ## Environment Variables +- `CODEX_DEFAULT_MODEL`: Override the default model for `codex` and `review` (default `gpt-5.4`) - `CODEX_MCP_CALLBACK_URI`: Static MCP callback URI passed to Codex when set (overridden by `callbackUri` tool arg) +- `CODEX_TOOL_TIMEOUT_MS`: Timeout in milliseconds for serialized tool calls (default `120000`) +- `STRUCTURED_CONTENT_ENABLED`: Emit `structuredContent` alongside text metadata when truthy +- `CODEX_MCP_DEBUG_STARTUP`: Set to `1`, `true`, `yes`, or `on` to emit the startup banner on stderr for debugging +- `CODEX_MCP_DEBUG_COMMANDS`: Set to `1`, `true`, `yes`, or `on` to emit command argv and stderr for debugging + +`timeoutMs` on the `codex` tool overrides `CODEX_TOOL_TIMEOUT_MS` for a single request. ## Development diff --git a/docs/TODO.md b/docs/TODO.md index 759943f..9472624 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -1,5 +1,9 @@ # Codex MCP Server - TODO +This file is backlog-oriented. Do not treat the "Implemented" section below as +the current runtime source of truth; check `README.md`, `docs/api-reference.md`, +and the code first. + ## Features from Codex CLI v0.98.0 These features were introduced/stabilized in Codex CLI v0.98.0 but are not yet implemented in this MCP server. @@ -50,11 +54,11 @@ These features were introduced/stabilized in Codex CLI v0.98.0 but are not yet i ## Implemented in v1.3.4+ -### ✅ GPT-5.3-Codex Model +### ✅ GPT-5.4 Default Model - **Status**: Implemented - **Description**: New default model - **Changes**: - - Updated `DEFAULT_CODEX_MODEL` constant to `'gpt-5.3-codex'` + - Updated `DEFAULT_CODEX_MODEL` constant to `'gpt-5.4'` - Updated tool definitions to reflect new default - Single source of truth for model updates diff --git a/docs/api-reference.md b/docs/api-reference.md index 666a298..0131efc 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -1,442 +1,157 @@ # API Reference -## Overview -Complete reference for the Codex MCP Server tools and interfaces. +Current reference for the public MCP surface exposed by `codex-mcp-server`. -This server implements the **MCP 2025-11-25 specification**, including tool annotations and progress notifications. +## Requirements -## Installation Options +- Codex CLI `v0.75.0+` +- An authenticated Codex CLI (`codex login --api-key "..."`) +- For resumed-session `fullAuto` or `bypassApprovals`, use a Codex CLI build whose `codex exec resume --help` lists those flags -### Claude Code -```bash -claude mcp add codex-cli -- npx -y codex-mcp-server -``` - -### Claude Desktop -Add to your configuration file: - -**macOS:** `~/Library/Application Support/Claude/claude_desktop_config.json` - -**Windows:** `%APPDATA%/Claude/claude_desktop_config.json` - -```json -{ - "mcpServers": { - "codex-cli": { - "command": "npx", - "args": ["-y", "codex-mcp-server"] - } - } -} -``` - -## MCP Protocol Features - -### Tool Annotations -All tools include annotations that provide hints to MCP clients about tool behavior: - -| Annotation | Type | Description | -|------------|------|-------------| -| `title` | string | Human-readable tool name | -| `readOnlyHint` | boolean | Tool doesn't modify state (safe to call) | -| `destructiveHint` | boolean | Tool may modify files or external state | -| `idempotentHint` | boolean | Multiple calls produce same result | -| `openWorldHint` | boolean | Tool interacts with external services (network, APIs) | - -#### Tool Annotation Matrix -| Tool | `title` | `readOnlyHint` | `destructiveHint` | `idempotentHint` | `openWorldHint` | -|------|---------|---------------|-------------------|------------------|-----------------| -| `codex` | Execute Codex CLI | `false` | `true` | `false` | `true` | -| `review` | Code Review | `true` | `false` | `true` | `true` | -| `ping` | Ping Server | `true` | `false` | `true` | `false` | -| `help` | Get Help | `true` | `false` | `true` | `false` | -| `listSessions` | List Sessions | `true` | `false` | `true` | `false` | - -### Progress Notifications -For long-running operations, the server sends `notifications/progress` messages when the client includes a `progressToken` in the request `_meta`. - -**Request with Progress Token:** -```json -{ - "jsonrpc": "2.0", - "id": 1, - "method": "tools/call", - "params": { - "name": "codex", - "arguments": { "prompt": "Analyze this codebase" }, - "_meta": { "progressToken": "unique-token-123" } - } -} -``` - -**Progress Notification (sent during execution):** -```json -{ - "jsonrpc": "2.0", - "method": "notifications/progress", - "params": { - "progressToken": "unique-token-123", - "progress": 1, - "message": "Processing output from Codex..." - } -} -``` - -**Supported Tools:** `codex`, `review` (long-running operations) - -> **Note:** Progress notifications are streamed in real-time from CLI stdout/stderr. Client support for displaying these notifications varies. - -## Tools - -### `codex` - AI Coding Assistant - -Execute Codex CLI with advanced session management and model control. - -**Annotations:** `readOnlyHint: false`, `destructiveHint: true`, `idempotentHint: false`, `openWorldHint: true` - -#### Parameters - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `prompt` | string | ✅ | - | The coding task, question, or analysis request | -| `sessionId` | string | ❌ | - | Session ID for conversational context | -| `resetSession` | boolean | ❌ | `false` | Reset session history before processing | -| `model` | string | ❌ | `gpt-5.2-codex` | Model to use for processing | -| `reasoningEffort` | enum | ❌ | - | Control reasoning depth | -| `sandbox` | enum | ❌ | - | Sandbox policy: `read-only`, `workspace-write`, `danger-full-access` | -| `fullAuto` | boolean | ❌ | `false` | Enable full-auto mode (sandboxed automatic execution) | -| `workingDirectory` | string | ❌ | - | Working directory for the agent | -| `callbackUri` | string | ❌ | - | Static MCP callback URI passed via env to Codex | - -#### Model Options -- `gpt-5.2-codex` (default) - Latest specialized coding model optimized for agentic tasks -- `gpt-5.1-codex` - Previous coding model version -- `gpt-5.1-codex-max` - Enhanced coding model for complex tasks -- `gpt-5-codex` - Base GPT-5 coding model -- `gpt-4o` - Fast multimodal model -- `gpt-4` - Advanced reasoning capabilities - -#### Reasoning Effort Levels -- `low` - Quick responses, minimal processing -- `medium` - Balanced quality and speed -- `high` - Thorough analysis and comprehensive responses - -#### Response Format -```typescript -interface CodexToolResponse { - content: Array<{ - type: 'text'; - text: string; - _meta?: { - threadId?: string; - model?: string; - sessionId?: string; - callbackUri?: string; - }; - }>; - structuredContent?: { - threadId?: string; - model?: string; - sessionId?: string; - callbackUri?: string; - }; -} -``` - -**Note:** `structuredContent` is only emitted when `STRUCTURED_CONTENT_ENABLED` is set to a truthy value (`1`, `true`, `yes`, `on`). It is **disabled by default**. `_meta` remains available in `content` for Claude Code compatibility. - -#### Output Schema (structuredContent) -The `codex` tool advertises an `outputSchema` that describes the structure of `structuredContent` returned in tool results when enabled. -```json -{ - "type": "object", - "properties": { - "threadId": { "type": "string" } - } -} -``` - -#### Examples - -**Basic Usage:** -```json -{ - "prompt": "Explain this Python function and suggest improvements" -} -``` - -**With Model Selection:** -```json -{ - "prompt": "Perform complex architectural analysis", - "model": "gpt-4", - "reasoningEffort": "high" -} -``` - -**Session Management:** -```json -{ - "prompt": "Continue our previous discussion", - "sessionId": "my-coding-session" -} -``` - -**Reset Session:** -```json -{ - "prompt": "Start fresh analysis", - "sessionId": "my-coding-session", - "resetSession": true -} -``` +## Tool Matrix ---- +| Tool | Purpose | Progress-capable | Spawns Codex CLI | +| --- | --- | --- | --- | +| `codex` | General Codex execution with sessions and execution controls | Yes | Yes | +| `review` | Repository review against working tree, base branch, or commit | Yes | Yes | +| `websearch` | Search-backed Codex run via `codex --search exec` | Yes | Yes | +| `help` | Return `codex --help` output | Yes | Yes | +| `listSessions` | List active in-memory sessions | No | No | +| `ping` | Echo/health check | No | No | -### `review` - Code Review +## `codex` -Run AI-powered code reviews against your repository using Codex CLI. +Execute Codex CLI in non-interactive mode. -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true`, `openWorldHint: true` +| Parameter | Type | Required | Default | Notes | +| --- | --- | --- | --- | --- | +| `prompt` | string | Yes | - | Main Codex task | +| `sessionId` | string | No | - | Caller-supplied session key, 1-256 chars, `[A-Za-z0-9_-]+` | +| `resetSession` | boolean | No | `false` | Clears stored server-side history before this request | +| `model` | string | No | `gpt-5.4` | Free-form Codex model string | +| `reasoningEffort` | enum | No | - | `none`, `minimal`, `low`, `medium`, `high`, `xhigh` | +| `sandbox` | enum | No | - | `read-only`, `workspace-write`, `danger-full-access` | +| `fullAuto` | boolean | No | `false` | Sandboxed automatic execution without approval prompts | +| `bypassApprovals` | boolean | No | `false` | Disables prompts and sandboxing entirely; use only in externally sandboxed environments | +| `workingDirectory` | string | No | - | Passed via `-C` for new executions | +| `callbackUri` | string | No | - | Overrides `CODEX_MCP_CALLBACK_URI` for this request | +| `timeoutMs` | integer | No | `CODEX_TOOL_TIMEOUT_MS` or `120000` | Per-request override for this tool only | -#### Parameters +Important behavior: -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `prompt` | string | ❌ | - | Custom review instructions or focus areas | -| `uncommitted` | boolean | ❌ | `false` | Review staged, unstaged, and untracked changes | -| `base` | string | ❌ | - | Review changes against a specific base branch | -| `commit` | string | ❌ | - | Review changes introduced by a specific commit SHA | -| `title` | string | ❌ | - | Title to display in the review summary | -| `model` | string | ❌ | `gpt-5.2-codex` | Model to use for the review (passed via `-c model="..."`) | -| `workingDirectory` | string | ❌ | - | Working directory to run the review in (passed via `-C`) | +- `bypassApprovals` is mutually exclusive with `sandbox`. +- `bypassApprovals` is mutually exclusive with `fullAuto`. +- On resumed sessions, `sandbox` and `workingDirectory` are not applied. +- On resumed sessions, this server still forwards `fullAuto` and `bypassApprovals` when requested. If your local Codex CLI does not support those resume flags, the CLI itself will reject the command. -#### Examples +Response behavior: -**Review Uncommitted Changes:** -```json -{ - "uncommitted": true -} -``` +- The primary text result is returned in `content[0].text`. +- `content[0]._meta` includes `model`, plus `sessionId`, `callbackUri`, and `threadId` when available. +- `structuredContent` is only included when `STRUCTURED_CONTENT_ENABLED` is truthy. -**Review Against Main Branch:** -```json -{ - "base": "main", - "prompt": "Focus on security vulnerabilities" -} -``` +Example: -**Review Specific Commit:** ```json { - "commit": "abc123def", - "title": "Security Audit" + "prompt": "Review this refactor for race conditions", + "sessionId": "race-audit", + "model": "gpt-5.4", + "reasoningEffort": "high", + "timeoutMs": 300000 } ``` -#### Response Format -```typescript -interface ReviewToolResponse { - content: Array<{ - type: 'text'; - text: string; // Review output from Codex - _meta?: { - model: string; - base?: string; - commit?: string; - }; - }>; - structuredContent?: { - model: string; - base?: string; - commit?: string; - }; -} -``` +## `review` -**Note:** `structuredContent` is only emitted when `STRUCTURED_CONTENT_ENABLED` is set to a truthy value (`1`, `true`, `yes`, `on`). It is **disabled by default**. `_meta` remains available in `content` for Claude Code compatibility. +Run `codex review`. ---- +| Parameter | Type | Required | Default | Notes | +| --- | --- | --- | --- | --- | +| `prompt` | string | No | - | Extra review instructions | +| `uncommitted` | boolean | No | `false` | Review staged, unstaged, and untracked changes | +| `base` | string | No | - | Review against a branch or ref | +| `commit` | string | No | - | Review a single commit | +| `title` | string | No | - | Optional review title | +| `model` | string | No | `gpt-5.4` | Passed via `-c model="..."` | +| `workingDirectory` | string | No | - | Passed via global `-C` and spawn `cwd` | -### `listSessions` - Session Management +Notes: -List all active conversation sessions with metadata. +- `prompt` cannot be combined with `uncommitted: true`. +- Global `CODEX_TOOL_TIMEOUT_MS` still applies. +- Timed-out reviews now honor abort and unblock the serialized queue promptly. -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true`, `openWorldHint: false` +## `websearch` -#### Parameters -No parameters required. +Run `codex --search exec`. -#### Response Format -```typescript -interface SessionInfo { - id: string; - createdAt: string; // ISO 8601 timestamp - lastAccessedAt: string; // ISO 8601 timestamp - turnCount: number; -} -``` +| Parameter | Type | Required | Default | Notes | +| --- | --- | --- | --- | --- | +| `query` | string | Yes | - | Search request | +| `numResults` | integer | No | `10` | `1` through `50` | +| `searchDepth` | enum | No | `basic` | `basic` or `full` | -#### Example Response -```json -{ - "content": [{ - "type": "text", - "text": "[\n {\n \"id\": \"abc-123-def\",\n \"createdAt\": \"2025-01-01T12:00:00.000Z\",\n \"lastAccessedAt\": \"2025-01-01T12:30:00.000Z\",\n \"turnCount\": 5\n }\n]" - }] -} -``` +Notes: ---- +- Global `CODEX_TOOL_TIMEOUT_MS` applies. +- Timed-out web searches honor abort and unblock the serialized queue promptly. -### `ping` - Connection Test +## `help` -Test MCP server connection and responsiveness. +Returns `codex --help`. -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true`, `openWorldHint: false` +- No arguments +- Global `CODEX_TOOL_TIMEOUT_MS` applies +- Timed-out help calls honor abort and unblock the serialized queue promptly -#### Parameters +## `listSessions` -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `message` | string | ❌ | `pong` | Message to echo back | +Returns active in-memory session metadata for the current server process. -#### Example -```json -{ - "message": "Hello, server!" -} -``` +Example payload shape: -#### Response ```json -{ - "content": [{ - "type": "text", - "text": "Hello, server!" - }] -} -``` - ---- - -### `help` - Codex CLI Help - -Get information about Codex CLI capabilities and commands. - -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true`, `openWorldHint: false` - -#### Parameters -No parameters required. - -#### Response -Returns the output of `codex --help` command, providing comprehensive CLI documentation. - -## Session Management - -### Session Lifecycle - -1. **Creation**: Sessions are created automatically or explicitly via `sessionId` -2. **Activity**: Each interaction updates `lastAccessedAt` timestamp -3. **Persistence**: Sessions persist for 24 hours of inactivity -4. **Cleanup**: Automatic removal of expired sessions -5. **Limits**: Maximum 100 concurrent sessions - -### Session Data Structure - -```typescript -interface SessionData { - id: string; // UUID-based session identifier - createdAt: Date; // Session creation timestamp - lastAccessedAt: Date; // Last interaction timestamp - turns: ConversationTurn[]; // Conversation history - codexConversationId?: string; // Native Codex conversation ID -} - -interface ConversationTurn { - prompt: string; // User's original prompt - response: string; // Codex response - timestamp: Date; // Turn timestamp -} -``` - -### Resume Functionality - -The server leverages Codex CLI v0.50.0+ native resume functionality: - -1. **Conversation ID Extraction**: Automatically captures conversation IDs from Codex output (supports both "session id" and "conversation id" formats) -2. **Native Resume**: Uses `codex exec resume ` for optimal continuity -3. **Fallback Context**: Manual context building when native resume unavailable -4. **Seamless Integration**: Transparent to end users - -## Error Handling - -### Error Response Format -```typescript -interface ErrorResponse { - content: Array<{ - type: 'text'; - text: string; // Error description - }>; - isError: true; -} +[ + { + "id": "refactor-thread", + "createdAt": "2026-04-26T08:00:00.000Z", + "lastAccessedAt": "2026-04-26T08:15:00.000Z", + "turnCount": 3 + } +] ``` -### Common Error Scenarios - -#### Authentication Errors -- **Cause**: Codex CLI not authenticated -- **Message**: "Authentication failed: Please run `codex login`" -- **Resolution**: Run `codex login --api-key "your-key"` - -#### Model Errors -- **Cause**: Invalid or unavailable model specified -- **Message**: "Invalid model: " -- **Resolution**: Use supported model or omit for default - -#### Session Errors -- **Cause**: Corrupted session data or invalid session ID -- **Behavior**: Graceful degradation, continues with fresh context -- **Impact**: Minimal - system auto-recovers - -#### CLI Errors -- **Cause**: Codex CLI not installed or network issues -- **Message**: "Failed to execute codex command" -- **Resolution**: Install CLI and check network connectivity - -## Performance Considerations +## `ping` -### Memory Management -- **Session TTL**: 24-hour automatic cleanup -- **Session Limits**: Maximum 100 concurrent sessions -- **Context Optimization**: Recent turns only (last 2) for fallback context +Echo/health check. -### Response Optimization -- **Model Selection**: Default `gpt-5.2-codex` optimized for agentic coding -- **Reasoning Control**: Adjust effort based on task complexity -- **Native Resume**: Preferred over manual context building +| Parameter | Type | Required | Default | +| --- | --- | --- | --- | +| `message` | string | No | `pong` | -### Scalability -- **Stateless Design**: Core functionality works without sessions -- **Graceful Degradation**: Continues operation despite component failures -- **Resource Cleanup**: Automatic management of memory and storage +## Sessions -## Configuration +- Sessions are server-process local and stored only in memory. +- A session is created on first `codex` call that supplies `sessionId`. +- The server stores recent turns plus the Codex conversation ID when it can extract one from CLI output. +- Sessions expire after 24 hours of inactivity. +- The store keeps at most 100 sessions. +- If the server restarts, sessions are lost. -### Environment Variables -None required - authentication handled by Codex CLI. +## Environment Variables -Optional: -- `CODEX_MCP_CALLBACK_URI`: Static MCP callback URI passed to Codex CLI when invoking tools. +| Variable | Purpose | Default | +| --- | --- | --- | +| `CODEX_DEFAULT_MODEL` | Override the default model for `codex` and `review` | `gpt-5.4` | +| `CODEX_MCP_CALLBACK_URI` | Default callback URI for `codex` requests | unset | +| `CODEX_TOOL_TIMEOUT_MS` | Timeout for serialized tool calls | `120000` | +| `STRUCTURED_CONTENT_ENABLED` | Emit `structuredContent` alongside `_meta` | unset / false | +| `CODEX_MCP_DEBUG_STARTUP` | Emit a startup banner to stderr | unset / false | +| `CODEX_MCP_DEBUG_COMMANDS` | Emit command argv and stderr to stderr | unset / false | -### Codex CLI Requirements -- **Version**: 0.36.0 or later -- **Authentication**: `codex login --api-key "your-key"` -- **Verification**: `codex --help` should execute successfully +## Current Constraints -### Optional Configuration -- **CODEX_HOME**: Custom directory for Codex CLI configuration -- **Session Limits**: Configurable in server implementation (default: 100) -- **TTL Settings**: Configurable session expiration (default: 24 hours) \ No newline at end of file +- Calls are serialized through one queue to avoid stdio races. +- `timeoutMs` is only a `codex` tool parameter; other tools use the global timeout. +- `sandbox` and `workingDirectory` are not applied on resumed Codex sessions. diff --git a/docs/architecture-and-fixes.md b/docs/architecture-and-fixes.md new file mode 100644 index 0000000..5653d6f --- /dev/null +++ b/docs/architecture-and-fixes.md @@ -0,0 +1,138 @@ +# codex-mcp-server: Current Architecture and Hardening Notes + +Current-state reference for the server architecture and the lifecycle fixes +landed so far. + +## Source Layout + +```text +src/ +├── index.ts +├── server.ts +├── runtime-config.ts +├── types.ts +├── errors.ts +├── tools/ +│ ├── definitions.ts +│ └── handlers.ts +├── utils/ +│ └── command.ts +└── session/ + └── storage.ts +``` + +## Core Execution Model + +1. The MCP stdio transport receives `tools/call`. +2. `server.ts` serializes every call through `callQueue`. +3. A per-request `AbortController` is created. +4. `withTimeout()` races the tool handler against the configured timeout. +5. On timeout, the controller aborts and the queue slot remains blocked until + the underlying operation actually settles. + +This serialization is intentional. It avoids the original stdio race where two +concurrent child processes could corrupt the JSON-RPC stream. + +## Command Execution Model + +`utils/command.ts` is the shared spawn path for command-backed tools. + +Current behavior: + +- closes child stdin immediately with `child.stdin?.end()` +- creates a detached process group when abort support is in use +- sends `SIGTERM`, then `SIGKILL` after a short grace period +- buffers stdout and stderr up to 10MB +- treats `stdout` or `stderr` output as a usable command result even if the exit + code is non-zero, because Codex often writes primary output to stderr +- keeps command argv and stderr logging disabled by default + +## Current Tool Coverage + +Command-backed tools: + +- `codex` +- `review` +- `websearch` +- `help` + +Non-command-backed tools: + +- `ping` +- `listSessions` + +All command-backed tools now honor the shared abort path. + +## Hardening Fixes Already Landed + +### Concurrent-call serialization + +`callQueue` ensures only one `tools/call` handler runs at a time. + +### Queue does not release before child exit + +The queue slot stays blocked until the underlying operation settles, including +timed-out calls. + +### Non-interactive stdin handling + +Spawned Codex children get EOF on stdin immediately so they do not hang waiting +for additional input. + +### Timeout recovery + +- global timeout via `CODEX_TOOL_TIMEOUT_MS` +- per-call timeout override via `timeoutMs` on the `codex` tool +- shared abort path now applies to `codex`, `review`, `websearch`, and `help` + +### Startup and command logging cleanup + +- startup banner is gated by `CODEX_MCP_DEBUG_STARTUP` +- command argv/stderr logging is gated by `CODEX_MCP_DEBUG_COMMANDS` + +### Default model drift fix + +Default model is `gpt-5.4`, with `CODEX_DEFAULT_MODEL` as the override. + +### `bypassApprovals` contract completion + +- exposed in the MCP tool definition +- forwarded on new and resumed Codex executions +- rejected when combined with `sandbox` +- rejected when combined with `fullAuto` + +## Session Model + +- sessions are in-memory only +- caller provides `sessionId` +- server stores recent turns plus the Codex conversation ID when available +- native Codex resume is preferred +- fallback context reconstruction uses recent turns when native resume is not + available + +## Current Environment Variables + +| Variable | Purpose | Default | +| --- | --- | --- | +| `CODEX_DEFAULT_MODEL` | Override the default model for `codex` and `review` | `gpt-5.4` | +| `CODEX_MCP_CALLBACK_URI` | Default callback URI for `codex` requests | unset | +| `CODEX_TOOL_TIMEOUT_MS` | Timeout for serialized tool calls | `120000` | +| `STRUCTURED_CONTENT_ENABLED` | Emit `structuredContent` | unset / false | +| `CODEX_MCP_DEBUG_STARTUP` | Emit startup banner | unset / false | +| `CODEX_MCP_DEBUG_COMMANDS` | Emit command argv/stderr | unset / false | + +## Intentional Constraints + +- The queue still serializes all tool calls, not only Codex-heavy ones. +- `timeoutMs` is only a `codex` tool argument. +- `sandbox` and `workingDirectory` are not applied on resumed sessions. +- Session state is not persisted across server restarts. + +## Verification Gate + +Use the current repo state, not historical green claims: + +```bash +npm run build +npm test -- --runInBand +``` diff --git a/docs/claude-code-lifecycle-investigation.md b/docs/claude-code-lifecycle-investigation.md new file mode 100644 index 0000000..353d067 --- /dev/null +++ b/docs/claude-code-lifecycle-investigation.md @@ -0,0 +1,224 @@ +# Claude Code Lifecycle Investigation + +Date: 2026-04-25 + +This document captures the end-to-end investigation of `codex-mcp-server` +behavior under Claude Code after the following fixes: + +- `ac66ca1` `fix: serialize concurrent tools/call requests to prevent stdio race` +- `358c106` `feat: add per-call timeout to prevent queue starvation on hung API calls` +- `a1cd967` `fix: block queue until timed out codex child exits` +- follow-up runtime cleanup on 2026-04-25: + - startup banner moved behind `CODEX_MCP_DEBUG_STARTUP` + - server version now resolves from `package.json` +- follow-up timeout override on 2026-04-25: + - `timeoutMs` added to the `codex` tool schema and MCP definition + - per-call timeout now overrides `CODEX_TOOL_TIMEOUT_MS` +- follow-up lifecycle hardening on 2026-04-26: + - `review`, `websearch`, and `help` now honor the shared abort path + - command argv/stderr logging is now gated behind `CODEX_MCP_DEBUG_COMMANDS` + - `bypassApprovals` is exposed and validated end-to-end + +## Executive Summary + +The current build does **not** reproduce the original "first call works, second +call hangs" failure under Claude Code. + +What is fixed: + +- Concurrent `tools/call` requests are serialized. +- A timed-out Codex child is aborted and the queue remains blocked until the + child actually exits. +- Claude can make a second `mcp__codex__codex` call after a timeout in the same + session. + +What is not a server bug: + +- Claude Code tears down stdio MCP servers at the end of `claude -p` sessions. +- Resuming a Claude conversation is path-scoped by Claude; the same session ID + may fail outside the original local path. + +## Scenario Matrix + +| Scenario | Command shape | Result | Notes | +| --- | --- | --- | --- | +| Direct stdio lifecycle tests | `npm test -- --runInBand src/__tests__/mcp-lifecycle.test.ts --runTestsByPath` | Pass | Covers sequential calls, concurrent serialization, codex timeout recovery, and review timeout recovery | +| Claude print, single live Codex call | `claude -p --allowedTools mcp__codex__codex --tools "" ...` | Pass in 20s | Returned `LIVE_OK` | +| Claude print, two sequential live Codex calls | Same as above, but force two tool calls | Pass in 34s | Returned `FIRST_OK` then `SECOND_OK` | +| Claude print, resumed session, single live Codex call | `claude -p --resume ...` from original local path | Pass in 30s | Returned `RESUMED_LIVE_OK` | +| Claude interactive resume | `claude --resume ` | Pass | Session opened normally and could invoke Codex | +| Claude print, strict stub, two fast calls | Temp MCP config with stub `codex` binary | Pass in 11s | Returned `stub stdout` twice | +| Claude print, strict stub, timeout then recovery | Temp MCP config with `CODEX_TOOL_TIMEOUT_MS=100` | Pass in 11s | First call timed out, second call still returned `stub stdout` | +| Claude print, strict stub, per-call timeout override | Temp MCP config with `CODEX_TOOL_TIMEOUT_MS=9000` and request `timeoutMs=100` | Pass in 15s | First call timed out at 100ms, second call still returned `stub stdout` | + +## Key Evidence + +### 1. Claude does not lose the second live call anymore + +In the two-call live probe, Claude debug showed two distinct successful tool +dispatches: + +- first `mcp__codex__codex` call started and ended successfully +- second `mcp__codex__codex` call started and ended successfully + +After that, Claude sent `SIGINT` and closed the stdio connection cleanly. The +server did not crash between the two calls. + +### 2. Timeout recovery now works under Claude itself + +Using a strict temporary MCP config and a stub `codex` executable that: + +- sleeps for 5 seconds for `slow-timeout` +- fails with `Not connected` on overlap +- returns quickly otherwise + +Claude produced: + +```text +Error in tool "codex": Tool call timed out after 100ms +stub stdout +``` + +That is the critical regression check. Before `a1cd967`, the timeout response +could be returned before the killed child had actually released its resources, +so the next queued call could still fail or wedge. + +### 3. Claude still tears the server down after the session + +Successful Claude debug logs consistently show this pattern after the final +response: + +- Claude sends `SIGINT` to the MCP server process +- the `codex` stdio connection closes cleanly +- Claude records `MCP server process exited cleanly` + +That supports the earlier lifecycle finding: Claude owns MCP server lifetime for +these print sessions. + +## Root Cause Breakdown + +### Bug 1: Concurrent request race + +Cause: + +- the `CallToolRequestSchema` handler allowed multiple `handler.execute()` + calls to run at once +- concurrent child processes raced on the same Codex stdio path + +Fix: + +- serialize `tools/call` execution through `callQueue` + +Status: + +- fixed by `ac66ca1` + +### Bug 2: Timeout released the queue too early + +Cause: + +- the server returned a timeout error to Claude +- but the underlying `handler.execute()` was still unwinding +- the next queued request could start before the timed-out child had fully + exited + +Fix: + +- propagate an `AbortSignal` +- terminate the Codex child on timeout +- keep the queue blocked until the aborted operation actually settles + +Status: + +- fully fixed by `a1cd967` + +### Non-bug: Claude session teardown + +Observed behavior: + +- at the end of successful `claude -p` runs, Claude sends `SIGINT` +- the MCP stdio connection closes cleanly + +Interpretation: + +- this is expected client lifecycle behavior +- it is not evidence that `codex-mcp-server` crashed after one call + +## What This Rules Out + +- The current server does not reproduce a "one successful call, then dead + server" failure under Claude Code. +- The current server does not jam the queue after a timed-out Codex child in the + tested recovery path. +- The resumed Claude session itself is not inherently broken. + +## Remaining Rough Edges + +These are real follow-up candidates, but they are not the root cause of the +hang investigated here. + +### Timeout tuning + +`CODEX_TOOL_TIMEOUT_MS` exists and works, and callers can now override it per +request with `timeoutMs`. The default `120000` can still be too short for large +real Codex tasks. + +Suggested cleanup: + +- document the env var and per-call override clearly +- consider revisiting the default timeout once more real Codex latency data is available + +## Practical Conclusion + +At this point the server-side lifecycle and queue bugs are fixed well enough for +Claude-driven use: + +- single live calls work +- sequential live calls work +- resumed-session live calls work +- timeout followed by another call works +- startup debug noise is off by default +- reported server version matches the package version + +If a user still sees a hang now, the next place to investigate is no longer the +original stdio race. It is more likely to be one of: + +- a real long-running Codex API call +- Claude-side tool selection or control-flow behavior +- an environment-specific issue outside the fixed queue/timeout path + +## Final Verification + +Final verification after the startup-noise and version-drift cleanup: + +### Repo Checks + +- `npm run lint` — PASS +- `npm run format:check` — PASS +- `npm run build` — PASS +- `npm test -- --runInBand` — PASS (`13/13` suites, `88/88` tests) + +### Focused Regression Checks + +- `npm test -- --runInBand src/__tests__/runtime-config.test.ts src/__tests__/startup-logging.test.ts src/__tests__/mcp-lifecycle.test.ts --runTestsByPath` + - PASS (`3/3` suites, `8/8` tests) + +### Claude-Facing Verification + +A final strict stub-backed Claude probe was used to avoid live Codex account +usage-limit noise while still exercising the Claude MCP client path. + +Observed result: + +- Claude received `stub stdout` +- Claude debug showed `Calling MCP tool: codex` +- Claude debug showed `Tool 'codex' completed successfully` +- the old startup line `Server stderr: codex-mcp-server started successfully` + was absent +- a strict stub-backed override probe returned: + - `Error in tool "codex": Tool call timed out after 100ms` + - `stub stdout` + +This confirms the final build still works through Claude and the startup banner +no longer pollutes Claude's debug log by default, and that `timeoutMs` +overrides the global timeout without jamming the queue. diff --git a/docs/codex-cli-integration.md b/docs/codex-cli-integration.md index a50dc07..95860cc 100644 --- a/docs/codex-cli-integration.md +++ b/docs/codex-cli-integration.md @@ -1,331 +1,108 @@ -# Codex CLI v0.75.0+ Integration Guide +# Codex CLI Integration -## Overview -This document outlines the integration with OpenAI Codex CLI v0.75.0+, highlighting breaking changes, new features, and implementation details for the MCP server wrapper. +Current integration notes for `codex-mcp-server`. -## Version Compatibility +## Supported Baseline -### Recommended Version: v0.75.0+ -This MCP server is optimized for **codex CLI v0.75.0 or later** for full feature support. +- Recommended minimum: Codex CLI `v0.75.0+` +- Authentication: `codex login --api-key "..."` +- This server expects modern `codex exec resume`, `codex review`, sandbox, and + working-directory support -**Version History:** -- **v0.75.0**: Added `codex review` command, sandbox modes, full-auto mode -- **v0.74.0**: Introduced `gpt-5.2-codex` model -- **v0.50.0**: Introduced `--skip-git-repo-check` flag, removed `--reasoning-effort` flag -- **v0.36.0-v0.49.x**: Not compatible with this MCP server version (use older MCP releases) +If you rely on resumed-session `fullAuto` or `bypassApprovals`, verify your +local CLI supports them with: -## Breaking Changes ⚠️ - -### v0.75.0 Changes (Current) -1. **Resume command moved under exec** - - **Old**: `codex resume ` - - **New**: `codex exec resume ` - - Impact: MCP server updated to use new command structure - -### v0.50.0 Changes -1. **`--skip-git-repo-check` flag now required** - - Required when running outside git repositories or in untrusted directories - - Prevents "Not inside a trusted directory" errors - - Impact: All exec-based MCP server commands include this flag - -2. **`--reasoning-effort` flag changed** - - The standalone flag was removed in codex CLI v0.50.0 - - Now passed via `-c model_reasoning_effort=` config flag - - Impact: MCP server updated to use new config-based approach - -### v0.36.0 Changes (Historical) -1. **Authentication Method Change** - - **Old Method**: `OPENAI_API_KEY` environment variable - - **New Method**: `codex login --api-key "your-api-key"` - - **Storage**: Credentials now stored in `CODEX_HOME/auth.json` - - **Impact**: Users must re-authenticate using the new login command - -## New Features Implemented - -### 1. Code Review (v0.75.0+) -- **Command**: `codex review` (top-level subcommand) -- **CLI Flags**: - - `--uncommitted`: Review staged, unstaged, and untracked changes - - `--base `: Review changes against a base branch - - `--commit `: Review changes introduced by a specific commit - - `--title `: Optional title for review summary -- **MCP Tool**: New `review` tool with all parameters exposed - -### 2. Sandbox Mode (v0.75.0+) -- **CLI Flag**: `--sandbox <mode>` -- **Modes**: - - `read-only`: No file writes allowed - - `workspace-write`: Writes only in workspace directory - - `danger-full-access`: Full system access (dangerous) -- **MCP Parameter**: `sandbox` parameter in codex tool - -### 3. Full-Auto Mode (v0.75.0+) -- **CLI Flag**: `--full-auto` -- **Description**: Sandboxed automatic execution without approval prompts -- **Equivalent to**: `-a on-request --sandbox workspace-write` -- **MCP Parameter**: `fullAuto` boolean parameter - -### 4. Working Directory (v0.75.0+) -- **CLI Flag**: `-C <dir>` -- **Description**: Set working directory for the agent (global option) -- **MCP Parameter**: `workingDirectory` parameter in codex tool and review tool - -### 5. Model Selection -- **Default Model**: `gpt-5.2-codex` (optimal for agentic coding tasks) -- **CLI Flag**: `--model <model-name>` -- **Supported Models**: - - `gpt-5.2-codex` (default, specialized for agentic coding) - - `gpt-5.1-codex` (previous coding model) - - `gpt-5.1-codex-max` (enhanced coding) - - `gpt-5-codex` (base GPT-5 coding) - - `gpt-4o` (fast multimodal) - - `gpt-4` (advanced reasoning) - - `o3` (OpenAI reasoning model) - - `o4-mini` (compact reasoning model) -- **Usage**: Model parameter available in `exec`, `resume`, and `review` modes (use `-c model="..."` for review/resume) - -### 6. Reasoning Effort Control -- **CLI Flag**: `-c model_reasoning_effort="<level>"` -- **Levels**: `minimal`, `low`, `medium`, `high` -- **MCP Parameter**: `reasoningEffort` parameter in codex tool -- **Note**: The standalone `--reasoning-effort` flag was removed in v0.50.0, now uses quoted config values for consistency - -### 7. Native Resume Functionality -- **Command**: `codex exec resume <conversation-id>` -- **Automatic ID Extraction**: Server extracts conversation IDs from CLI output (supports both "session id" and "conversation id" formats) -- **Regex Pattern**: `/(conversation|session)\s*id\s*:\s*([a-zA-Z0-9-]+)/i` -- **Fallback Strategy**: Manual context building when resume unavailable -- **Session Integration**: Seamless integration with session management - -### 8. Thread ID Metadata (v0.87.0+) -- **Output**: Codex CLI emits `threadId` in MCP responses -- **Server Behavior**: This MCP server surfaces `threadId` in tool response metadata and content element metadata when present. `structuredContent` is only emitted when `STRUCTURED_CONTENT_ENABLED` is truthy and is **disabled by default**. -- **Regex Pattern**: `/thread\s*id\s*:\s*([a-zA-Z0-9_-]+)/i` -- **Structured Output**: The codex tool advertises an `outputSchema` for `structuredContent` (currently `threadId`) when enabled. - -## Features Not Yet Supported - -The following Codex CLI features are not currently exposed through the MCP server: - -| Feature | CLI Flag | Notes | -|---------|----------|-------| -| Image Attachments | `-i, --image` | Attach images to prompts | -| OSS/Local Models | `--oss`, `--local-provider` | LMStudio/Ollama support | -| Config Profiles | `-p, --profile` | Named configuration profiles | -| Approval Policy | `-a, --ask-for-approval` | Fine-grained approval control | -| Additional Dirs | `--add-dir` | Extra writable directories | -| JSON Output | `--json` | JSONL event stream output | -| Output Schema | `--output-schema` | Structured JSON output | -| Output File | `-o, --output-last-message` | Write response to file | - -These features may be added in future versions based on user demand. - -## Web Search Implementation - -**Note:** This MCP server provides web search capability through the dedicated `websearch` tool. The tool uses `codex --search exec` to enable Codex's native web_search tool, providing: - -- Customizable result count (1-50 results) -- Search depth control (basic/full) -- Streaming progress updates -- Integration with Codex's search-enabled models - -See the Tools section in README.md for usage examples. - -## MCP Callback URI (v0.81.0+) -Codex CLI added static MCP callback URI support. This MCP server forwards the callback URI via environment variable when provided. - -- **Env Var**: `CODEX_MCP_CALLBACK_URI` -- **MCP Parameter**: `callbackUri` (takes precedence over the env var) - -## Implementation Details - -### Command Construction (v0.75.0+) - -**IMPORTANT**: All `exec` options (`--model`, `-c`, `--skip-git-repo-check`, `-C`, `--sandbox`, `--full-auto`) must come BEFORE subcommands (`resume`). - -```typescript -// Basic execution (v0.75.0+) -['exec', '--model', selectedModel, '--skip-git-repo-check', prompt] - -// Execution with new parameters (v0.75.0+) -['exec', '--model', selectedModel, '--sandbox', 'workspace-write', '--full-auto', '-C', workingDir, '--skip-git-repo-check', prompt] - -// Resume mode (v0.75.0+) - All exec options BEFORE 'resume' subcommand -['exec', '--skip-git-repo-check', '-c', 'model="modelName"', '-c', 'model_reasoning_effort="high"', 'resume', conversationId, prompt] - -// Code review (v0.75.0+) -['-C', workingDir, 'review', '-c', 'model="modelName"', '--uncommitted', '--base', 'main', prompt] - -// Code review with model config before subcommand (also accepted) -['-C', workingDir, '-c', 'model="modelName"', 'review', '--uncommitted', '--base', 'main', prompt] +```bash +codex exec resume --help ``` -**Important: Resume Mode Limitations** - -The `codex exec resume` subcommand has a **limited set of flags** compared to `codex exec`: -- ✅ `-c, --config` - Configuration overrides (use for model selection) -- ✅ `--enable/--disable` - Feature toggles -- ❌ `--model` - Not available (use `-c model="..."` instead) -- ❌ `--sandbox` - Not available in resume mode -- ❌ `--full-auto` - Not available in resume mode -- ❌ `-C` - Not available in resume mode -- ⚠️ `--skip-git-repo-check` - Must be placed on `exec` command BEFORE `resume` subcommand +## Command Shapes Used by the Server -**Important: Review Mode Limitations** +New execution: -The `codex review` subcommand also has limited flags: -- ✅ `-c, --config` - Configuration overrides (use for model selection) -- ✅ `--uncommitted`, `--base`, `--commit`, `--title` - Review-specific flags -- ✅ `--enable/--disable` - Feature toggles -- ❌ `--model` - Not available (use `-c model="..."` instead) -- ❌ `--sandbox` - Not available in review mode -- ❌ `--full-auto` - Not available in review mode -- ✅ `-C` - Global option before `review` -- ⚠️ `-c` is accepted by `codex review` and can also be passed before `review` as a global option - -**Key Changes in v0.75.0:** -- Added: `codex review` subcommand for code reviews -- Added: `--sandbox` flag for sandbox modes (exec only) -- Added: `--full-auto` flag for automatic execution (exec only) -- Changed: `codex resume` moved to `codex exec resume` -- Note: Resume subcommand and review command have limited flag support - -**Key Changes in v0.50.0:** -- Added: `--skip-git-repo-check` flag (exec only) -- Changed: `--reasoning-effort` to `-c model_reasoning_effort=<level>` - -### Conversation ID Extraction -```typescript -const conversationIdMatch = result.stderr?.match(/conversation\s*id\s*:\s*([a-zA-Z0-9-]+)/i); -if (conversationIdMatch) { - sessionStorage.setCodexConversationId(sessionId, conversationIdMatch[1]); -} +```text +codex exec --model <model> [flags...] --skip-git-repo-check <prompt> ``` -### Error Handling Enhancements -- **Authentication Errors**: Clear messaging for login requirement -- **Model Validation**: Graceful handling of invalid model names -- **Network Issues**: Proper error propagation and user feedback -- **CLI Availability**: Detection of missing Codex CLI installation - -## Migration Guide - -### For Existing Users (Upgrading to v0.75.0+) -1. **Check Current Version**: - ```bash - codex --version - ``` +Resume execution: -2. **Update Codex CLI** (if below v0.75.0): - ```bash - npm update -g @openai/codex - # or - brew upgrade codex - ``` - -3. **Verify Version** (must be v0.75.0 or later): - ```bash - codex --version # Should show v0.75.0 or higher - ``` - -4. **Test New Features**: - ```bash - # Test code review - codex review --uncommitted +```text +codex exec --skip-git-repo-check -c model="<model>" [-c model_reasoning_effort="..."] [flags...] resume <conversation-id> <prompt> +``` - # Test sandbox mode - codex exec --sandbox workspace-write --skip-git-repo-check "list files" - ``` +Review: -### For New Users -1. **Install Codex CLI** (v0.75.0+): - ```bash - npm install -g @openai/codex - # or - brew install codex - ``` +```text +codex [-C <dir>] -c model="<model>" review [review flags...] +``` -2. **Verify Version**: - ```bash - codex --version # Must be v0.75.0 or later - ``` +Web search: -3. **Authenticate**: - ```bash - codex login --api-key "your-openai-api-key" - ``` +```text +codex --search exec --skip-git-repo-check <search prompt> +``` -4. **Configure (Optional)**: - ```bash - # Edit ~/.codex/config.toml to set preferences - # Example: - # model = "gpt-5.2-codex" - # model_reasoning_effort = "medium" - ``` +## Flags the Server Exposes -5. **Test Setup**: - ```bash - codex exec --skip-git-repo-check "console.log('Hello, Codex!')" - ``` +### `codex` -## Performance Optimizations +- `--model` +- `-c model_reasoning_effort="..."` +- `--sandbox <mode>` for new executions only +- `--full-auto` +- `--dangerously-bypass-approvals-and-sandbox` +- `-C <dir>` for new executions only +- `--skip-git-repo-check` -### Smart Model Selection -- **Default to gpt-5.2-codex**: Optimal for agentic coding without configuration -- **Context-Aware Suggestions**: Better model recommendations based on task type -- **Consistent Experience**: Same model across session interactions +### `review` -### Efficient Context Management -- **Native Resume Priority**: Use Codex's built-in conversation continuity -- **Fallback Context**: Only when native resume unavailable -- **Token Optimization**: Minimal context overhead for better performance +- `-c model="..."` +- `-C <dir>` +- `--uncommitted` +- `--base <ref>` +- `--commit <sha>` +- `--title <text>` -### Error Recovery -- **Graceful Degradation**: Continue operation despite CLI issues -- **Automatic Retry**: For transient network issues -- **Clear Error Messages**: Actionable feedback for user troubleshooting +### `websearch` -## Testing Strategy +- `--search` +- `exec` +- `--skip-git-repo-check` -### Integration Testing -- **CLI Command Validation**: Verify correct parameter passing -- **Conversation ID Extraction**: Test various output formats -- **Error Scenario Handling**: Comprehensive failure mode coverage +## Resume Notes -### Edge Case Coverage -- **Malformed CLI Output**: Handle unexpected response formats -- **Network Interruptions**: Graceful handling of connectivity issues -- **Model Availability**: Handle model deprecation or unavailability +- `sandbox` is not applied on resumed sessions. +- `workingDirectory` is not applied on resumed sessions. +- This server now forwards `fullAuto` and `bypassApprovals` on resume when the + caller asks for them. +- The server still uses `-c model="..."` for resume mode for consistency, + even though newer Codex CLI builds also expose `--model`. -## Best Practices +## Timeout and Abort -### For Developers -- **Always specify model explicitly** when behavior consistency is critical -- **Use appropriate reasoning effort** based on task complexity -- **Implement proper error handling** for CLI interactions -- **Monitor session lifecycle** to prevent memory leaks +- All `tools/call` requests are serialized. +- Global timeout: `CODEX_TOOL_TIMEOUT_MS`, default `120000` +- `codex` also supports per-request `timeoutMs` +- `codex`, `review`, `websearch`, and `help` now all honor the shared abort + path when they time out -### For Users -- **Start with default settings** for optimal experience -- **Use sessions for complex tasks** requiring multiple interactions -- **Choose reasoning effort wisely** to balance speed and quality -- **Keep CLI updated** for latest features and bug fixes +## Environment Variables Used by This Server -## Troubleshooting +- `CODEX_DEFAULT_MODEL` +- `CODEX_MCP_CALLBACK_URI` +- `CODEX_TOOL_TIMEOUT_MS` +- `STRUCTURED_CONTENT_ENABLED` +- `CODEX_MCP_DEBUG_STARTUP` +- `CODEX_MCP_DEBUG_COMMANDS` -### Common Issues -1. **Authentication Failures** - - Solution: Run `codex login --api-key "your-key"` - - Verify: Check `CODEX_HOME/auth.json` exists +## Current Default Model and Reasoning Values -2. **Model Not Available** - - Solution: Use default `gpt-5.2-codex` or try alternative models - - Check: Codex CLI documentation for available models +- Default model: `gpt-5.4` +- Documented reasoning values: `none`, `minimal`, `low`, `medium`, `high`, + `xhigh` -3. **Resume Functionality Not Working** - - Solution: System falls back to manual context building - - Check: Conversation ID extraction in server logs +## Current Gaps -4. **Performance Issues** - - Solution: Lower reasoning effort or use faster models - - Monitor: Response times and adjust parameters accordingly \ No newline at end of file +- `timeoutMs` is only exposed on the `codex` tool, not on `review`, `help`, or + `websearch` +- Sessions are process-local and in-memory only diff --git a/docs/plan.md b/docs/plan.md index bbe8a2c..6acb9c0 100644 --- a/docs/plan.md +++ b/docs/plan.md @@ -1,5 +1,8 @@ # Codex MCP Server Implementation Plan +Historical planning document. It does not describe the current repository +layout or current public tool surface. + ## Overview Create an MCP server wrapper for OpenAI Codex CLI that enables single-command integration with Claude, similar to gemini-mcp-tool. @@ -128,4 +131,4 @@ codex exec "Add error handling to this function" Enable single-command Claude integration: ```bash claude mcp add codex-cli -- npx -y codex-mcp-server -``` \ No newline at end of file +``` diff --git a/docs/plans/2026-04-25-timeout-followups.md b/docs/plans/2026-04-25-timeout-followups.md new file mode 100644 index 0000000..cb45574 --- /dev/null +++ b/docs/plans/2026-04-25-timeout-followups.md @@ -0,0 +1,142 @@ +# Timeout Follow-Ups Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Status:** Completed on 2026-04-25 + +**Outcome:** +- Added validated `timeoutMs` to the `codex` tool schema. +- Added per-call timeout override behavior on top of `CODEX_TOOL_TIMEOUT_MS`. +- Exposed `timeoutMs` in the MCP tool definition and public docs. +- Verified repo-wide with lint, format, build, full Jest suite, and a Claude stub probe. + +**Goal:** Add safer timeout controls for long Codex tasks without regressing the fixed queue and child-abort behavior. + +**Architecture:** Keep the existing queue and abort model intact. Add the smallest possible caller-controlled timeout override on the `codex` tool, validate it in the schema layer, and cover it with focused lifecycle tests plus one docs sweep for drifted defaults. + +**Tech Stack:** TypeScript, Jest, ts-jest, MCP SDK, Node child processes + +--- + +### Task 1: Add a validated per-call timeout argument + +**Files:** +- Modify: `src/types.ts` +- Modify: `src/tools/handlers.ts` +- Test: `src/__tests__/runtime-config.test.ts` + +**Step 1: Write the failing test** + +Add a schema-level test that accepts a positive `timeoutMs` and rejects `0`, +negative values, and non-integers. + +**Step 2: Run test to verify it fails** + +Run: + +```bash +npm test -- --runInBand src/__tests__/runtime-config.test.ts --runTestsByPath +``` + +Expected: FAIL because `timeoutMs` is not part of the codex schema yet. + +**Step 3: Write minimal implementation** + +- add optional `timeoutMs` to `CodexToolSchema` +- thread it into the server/tool execution path without changing current + defaults + +**Step 4: Run test to verify it passes** + +Run the same focused test command and confirm PASS. + +### Task 2: Prove override behavior in lifecycle tests + +**Files:** +- Modify: `src/__tests__/mcp-lifecycle.test.ts` +- Modify: `src/server.ts` + +**Step 1: Write the failing test** + +Add a focused lifecycle test where the default timeout is high, but a single +`tools/call` request passes a low `timeoutMs` override and must time out without +blocking the next request. + +**Step 2: Run test to verify it fails** + +Run: + +```bash +npm test -- --runInBand src/__tests__/mcp-lifecycle.test.ts -t "uses per-call timeout override" --runTestsByPath +``` + +Expected: FAIL because the override is ignored. + +**Step 3: Write minimal implementation** + +- prefer request `timeoutMs` over `CODEX_TOOL_TIMEOUT_MS` +- keep the existing abort-and-wait queue semantics unchanged + +**Step 4: Run test to verify it passes** + +Run the same focused test command and confirm PASS. + +### Task 3: Sweep docs for timeout and default-model drift + +**Files:** +- Modify: `README.md` +- Modify: `docs/codex-cli-integration.md` +- Modify: `docs/session-management.md` + +**Step 1: Write the failing check** + +Search for stale default model or timeout documentation: + +```bash +rg -n "gpt-5\\.2-codex|120000|timeoutMs" README.md docs +``` + +**Step 2: Verify drift exists** + +Expected: at least one stale `gpt-5.2-codex` reference and no mention of +per-call timeout override. + +**Step 3: Write minimal documentation updates** + +- align docs with the current default model +- document `CODEX_TOOL_TIMEOUT_MS` +- document the per-call timeout override once implemented + +**Step 4: Verify docs** + +Run the same `rg` command and confirm stale references are gone or explicitly +intentional. + +### Task 4: Final verification + +**Files:** +- Test: `src/__tests__/runtime-config.test.ts` +- Test: `src/__tests__/mcp-lifecycle.test.ts` +- Verify: repo-wide + +**Step 1: Run focused tests** + +```bash +npm test -- --runInBand src/__tests__/runtime-config.test.ts src/__tests__/mcp-lifecycle.test.ts --runTestsByPath +``` + +**Step 2: Run build** + +```bash +npm run build +``` + +**Step 3: Run full suite** + +```bash +npm test -- --runInBand +``` + +**Step 4: Commit** + +Commit only after all verification is green. diff --git a/docs/session-management.md b/docs/session-management.md index b899aee..7e73565 100644 --- a/docs/session-management.md +++ b/docs/session-management.md @@ -1,143 +1,86 @@ -# Session Management Implementation Guide - -## Overview -The Codex MCP Server provides advanced session management with native Codex CLI v0.50.0+ integration, enabling persistent conversational context and sophisticated AI coding assistance. - -Sessions are created on first use when a sessionId is provided. If no sessionId -is supplied, this request does not create a session (so it won't appear in -listSessions). - -## Architecture - -### Session Storage -- **In-memory Map-based storage** with automatic cleanup -- **UUID-based session IDs** for unique identification -- **TTL management** - 24 hour automatic session expiration -- **Session limit enforcement** - maximum 100 concurrent sessions - -### Enhanced Session Data Structure -```typescript -interface SessionData { - id: string; - createdAt: Date; - lastAccessedAt: Date; - turns: ConversationTurn[]; - codexConversationId?: string; // Native Codex conversation ID -} +# Session Management -interface ConversationTurn { - prompt: string; - response: string; - timestamp: Date; -} -``` +This server provides lightweight in-memory session management for the `codex` +tool. -### Native Codex Integration -- **Automatic conversation ID extraction** from Codex CLI output (supports both "session id" and "conversation id" formats) -- **Resume functionality** using `codex exec resume <conversation-id>` -- **Fallback context building** when native resume unavailable -- **Model consistency** across session interactions - -### Tool Enhancements - -#### Enhanced Codex Tool -- **sessionId** (optional): Session ID for conversational context -- **resetSession** (optional): Reset session history before processing -- **model** (optional): Model selection (defaults to `gpt-5.2-codex`) -- **reasoningEffort** (optional): Control reasoning depth (low/medium/high) -- **Smart context building**: Uses native resume or fallback context -- **Robust error handling**: Graceful degradation for various failure modes - -#### ListSessions Tool -- **Session enumeration**: Returns all active session IDs with comprehensive metadata -- **Session introspection**: Creation time, last access, turn count, conversation ID -- **Management interface**: Enables session lifecycle monitoring - -## Implementation Status ✅ - -### ✅ Completed Features -1. **Session Storage System** - - InMemorySessionStorage with TTL and cleanup - - Defensive programming against data corruption - - Conversation ID tracking and management - -2. **Enhanced Codex Tool Handler** - - Native resume functionality with fallback - - GPT-5.2-Codex as intelligent default model - - Model and reasoning effort parameter support - - Comprehensive error handling and validation - -3. **ListSessions Tool** - - Complete session metadata exposure - - JSON-formatted session information - - Real-time session status tracking - -4. **Robust Testing Suite** - - 54 comprehensive tests covering all functionality - - Edge case handling and error scenario validation - - Integration testing with Codex CLI interactions - -## Advanced Benefits -- **Native Codex Resume**: Optimal conversation continuity using Codex CLI's built-in resume feature -- **Intelligent Defaults**: GPT-5.2-Codex model selection for superior agentic coding assistance -- **Production-Ready**: Comprehensive error handling, data validation, and graceful degradation -- **Enterprise-Scale**: Session management suitable for professional development workflows -- **Flexible Configuration**: Per-request model and reasoning effort customization - -## Usage Patterns - -### Basic Session Usage -```bash -# Explicit session management (creates the session on first use) -codex --sessionId "auth-review" "Continue analysis" -codex --sessionId "auth-review" --resetSession true "Start fresh review" -``` +## What a Session Is + +- A session is keyed by a caller-supplied `sessionId`. +- Valid IDs are `1` to `256` characters and may contain only letters, numbers, + `_`, and `-`. +- Sessions exist only inside the current server process. +- If the server exits or is restarted, all sessions are lost. + +## What the Server Stores + +For each session, the server keeps: + +- `id` +- `createdAt` +- `lastAccessedAt` +- `turns` +- `codexConversationId` when Codex CLI emits one + +Limits: + +- TTL: 24 hours of inactivity +- Maximum active sessions: 100 -### Advanced Configuration -```bash -# Model and reasoning control -codex --model "gpt-4" --reasoningEffort "high" "Complex architectural analysis" +## How Resume Works -# Session with custom parameters -codex --sessionId "deep-dive" --model "gpt-4" --reasoningEffort "high" "Advanced optimization review" +1. The first `codex` call with a `sessionId` creates the session. +2. If Codex CLI emits a conversation/session ID, the server stores it. +3. Later calls with the same `sessionId` use `codex exec resume <conversation-id>`. +4. If no native Codex conversation ID exists yet, the server falls back to a + short prompt-context reconstruction using the most recent turns. -# Session management -listSessions # View all active sessions +## Resume Constraints + +- `sandbox` is not applied on resumed sessions. +- `workingDirectory` is not applied on resumed sessions. +- This server forwards `fullAuto` and `bypassApprovals` on resumed sessions. + Whether those flags are accepted depends on the installed Codex CLI version. + +## MCP Examples + +Create or continue a session: + +```json +{ + "name": "codex", + "arguments": { + "prompt": "Audit this module for race conditions", + "sessionId": "race-audit" + } +} ``` -## Technical Architecture - -### Command Flow -```mermaid -graph TD - A[User Request] --> B{Session ID provided?} - B -->|Yes| C[Ensure Session Exists] - C --> D{Reset Session?} - B -->|No| M[Execute with Default Model] - D -->|Yes| E[Clear Session History] - D -->|No| F{Codex Conversation ID exists?} - E --> G[Execute with Default Model] - F -->|Yes| H[Use Codex Resume] - F -->|No| I[Build Enhanced Context] - H --> J[Execute with Parameters] - I --> J - G --> K[Extract Conversation ID] - J --> L[Save Turn to Session] - K --> L - M --> N[Return Response] - L --> N +Reset a session: + +```json +{ + "name": "codex", + "arguments": { + "prompt": "Start over from scratch", + "sessionId": "race-audit", + "resetSession": true + } +} ``` -### Error Handling Strategy -- **Graceful Degradation**: System continues operation even with corrupted session data -- **Defensive Programming**: Validates array structures and handles null/undefined gracefully -- **Comprehensive Logging**: Error context preserved for debugging and monitoring -- **Fallback Mechanisms**: Manual context building when native resume fails +List current sessions: + +```json +{ + "name": "listSessions", + "arguments": {} +} +``` -### Performance Considerations -- **Memory Management**: In-memory, per-process sessions with automatic cleanup - of expired sessions (24hr TTL) -- **Session Limits**: Maximum 100 concurrent sessions to prevent memory exhaustion -- **Context Optimization**: Only recent turns (last 2) used for manual context building -- **Efficient Storage**: Minimal session metadata for optimal memory usage +## Operational Notes +- `listSessions` only shows sessions for the current running server. +- A caller can choose meaningful IDs like `refactor-auth-flow`; IDs are not + UUID-only. +- Native Codex resume is preferred when available because it preserves Codex’s + own conversation state, not just the server’s short turn history. diff --git a/src/__tests__/command-logging.test.ts b/src/__tests__/command-logging.test.ts new file mode 100644 index 0000000..630b068 --- /dev/null +++ b/src/__tests__/command-logging.test.ts @@ -0,0 +1,93 @@ +// Mock chalk to avoid ESM issues in Jest +jest.mock('chalk', () => ({ + __esModule: true, + default: { + blue: (text: string) => text, + yellow: (text: string) => text, + green: (text: string) => text, + red: (text: string) => text, + }, +})); + +jest.mock('child_process', () => ({ + spawn: jest.fn(), +})); + +import { EventEmitter } from 'events'; +import { Buffer } from 'node:buffer'; + +import { executeCommand } from '../utils/command.js'; +import { COMMAND_LOG_ENV_VAR } from '../runtime-config.js'; +import { spawn } from 'child_process'; + +type MockedSpawn = jest.MockedFunction<typeof spawn>; +type MockChild = ReturnType<typeof spawn> & + EventEmitter & { + stderr: EventEmitter; + stdout: EventEmitter; + }; + +function createMockChild() { + const child = new EventEmitter() as unknown as MockChild; + + Object.assign(child, { + stdout: new EventEmitter(), + stderr: new EventEmitter(), + stdin: { end: jest.fn() } as unknown as ReturnType<typeof spawn>['stdin'], + kill: jest.fn(), + }); + Object.defineProperty(child, 'pid', { value: 12345 }); + + return child; +} + +describe('command logging', () => { + const mockedSpawn = spawn as MockedSpawn; + + beforeEach(() => { + jest.restoreAllMocks(); + mockedSpawn.mockReset(); + Reflect.deleteProperty(process.env, COMMAND_LOG_ENV_VAR); + }); + + test('does not log command execution details by default', async () => { + const child = createMockChild(); + mockedSpawn.mockReturnValue(child); + const errorSpy = jest.spyOn(console, 'error').mockImplementation(() => {}); + + const commandPromise = executeCommand('codex', ['exec', 'test prompt']); + + child.stderr.emit('data', Buffer.from('stderr output')); + child.emit('close', 0); + + await expect(commandPromise).resolves.toEqual({ + stdout: '', + stderr: 'stderr output', + }); + expect(errorSpy).not.toHaveBeenCalled(); + }); + + test('logs command execution details when explicitly enabled', async () => { + process.env[COMMAND_LOG_ENV_VAR] = '1'; + + const child = createMockChild(); + mockedSpawn.mockReturnValue(child); + const errorSpy = jest.spyOn(console, 'error').mockImplementation(() => {}); + + const commandPromise = executeCommand('codex', ['exec', 'test prompt']); + + child.stderr.emit('data', Buffer.from('stderr output')); + child.emit('close', 0); + + await expect(commandPromise).resolves.toEqual({ + stdout: '', + stderr: 'stderr output', + }); + expect(errorSpy).toHaveBeenCalledWith( + 'Executing:', + 'codex', + 'exec test prompt' + ); + expect(errorSpy).toHaveBeenCalledWith('Command stderr:', 'stderr output'); + }); +}); diff --git a/src/__tests__/context-building.test.ts b/src/__tests__/context-building.test.ts index 74d81ef..b0be669 100644 --- a/src/__tests__/context-building.test.ts +++ b/src/__tests__/context-building.test.ts @@ -1,18 +1,17 @@ -import { CodexToolHandler } from '../tools/handlers.js'; +import type { CodexToolHandler as CodexToolHandlerType } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; -import { executeCommand } from '../utils/command.js'; // Mock the command execution jest.mock('../utils/command.js', () => ({ executeCommand: jest.fn(), })); -const mockedExecuteCommand = executeCommand as jest.MockedFunction< - typeof executeCommand ->; - describe('Context Building Analysis', () => { - let handler: CodexToolHandler; + let CodexToolHandler: typeof import('../tools/handlers.js').CodexToolHandler; + let handler: CodexToolHandlerType; + let mockedExecuteCommand: jest.MockedFunction< + typeof import('../utils/command.js').executeCommand + >; let sessionStorage: InMemorySessionStorage; let originalStructuredContent: string | undefined; @@ -28,7 +27,14 @@ describe('Context Building Analysis', () => { } }); - beforeEach(() => { + beforeEach(async () => { + jest.resetModules(); + process.env.STRUCTURED_CONTENT_ENABLED = '1'; + ({ CodexToolHandler } = await import('../tools/handlers.js')); + const commandModule = await import('../utils/command.js'); + mockedExecuteCommand = commandModule.executeCommand as jest.MockedFunction< + typeof commandModule.executeCommand + >; sessionStorage = new InMemorySessionStorage(); handler = new CodexToolHandler(sessionStorage); mockedExecuteCommand.mockClear(); @@ -36,7 +42,6 @@ describe('Context Building Analysis', () => { stdout: 'Test response', stderr: '', }); - process.env.STRUCTURED_CONTENT_ENABLED = '1'; }); test('should build enhanced prompt correctly', async () => { @@ -61,7 +66,7 @@ describe('Context Building Analysis', () => { // Check what prompt was sent to Codex - should be enhanced but not conversational const call = mockedExecuteCommand.mock.calls[0]; - const sentPrompt = call?.[1]?.[4]; // After exec, --model, gpt-5.3-codex, --skip-git-repo-check, prompt + const sentPrompt = call?.[1]?.[4]; // After exec, --model, gpt-5.4, --skip-git-repo-check, prompt expect(sentPrompt).toContain('Previous code context:'); expect(sentPrompt).toContain('Task: Make it more efficient'); expect(sentPrompt).not.toContain('Previous: What is recursion?'); // No conversational format diff --git a/src/__tests__/default-model.test.ts b/src/__tests__/default-model.test.ts index 4329e5f..fa4fec3 100644 --- a/src/__tests__/default-model.test.ts +++ b/src/__tests__/default-model.test.ts @@ -1,18 +1,17 @@ -import { CodexToolHandler } from '../tools/handlers.js'; +import type { CodexToolHandler as CodexToolHandlerType } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; -import { executeCommand } from '../utils/command.js'; // Mock the command execution jest.mock('../utils/command.js', () => ({ executeCommand: jest.fn(), })); -const mockedExecuteCommand = executeCommand as jest.MockedFunction< - typeof executeCommand ->; - describe('Default Model Configuration', () => { - let handler: CodexToolHandler; + let CodexToolHandler: typeof import('../tools/handlers.js').CodexToolHandler; + let handler: CodexToolHandlerType; + let mockedExecuteCommand: jest.MockedFunction< + typeof import('../utils/command.js').executeCommand + >; let sessionStorage: InMemorySessionStorage; let originalStructuredContent: string | undefined; @@ -28,7 +27,14 @@ describe('Default Model Configuration', () => { } }); - beforeEach(() => { + beforeEach(async () => { + jest.resetModules(); + process.env.STRUCTURED_CONTENT_ENABLED = '1'; + ({ CodexToolHandler } = await import('../tools/handlers.js')); + const commandModule = await import('../utils/command.js'); + mockedExecuteCommand = commandModule.executeCommand as jest.MockedFunction< + typeof commandModule.executeCommand + >; sessionStorage = new InMemorySessionStorage(); handler = new CodexToolHandler(sessionStorage); mockedExecuteCommand.mockClear(); @@ -36,22 +42,15 @@ describe('Default Model Configuration', () => { stdout: 'Test response', stderr: '', }); - process.env.STRUCTURED_CONTENT_ENABLED = '1'; delete process.env.CODEX_MCP_CALLBACK_URI; }); - test('should use gpt-5.3-codex as default model when no model specified', async () => { + test('should use gpt-5.4 as default model when no model specified', async () => { await handler.execute({ prompt: 'Test prompt' }); expect(mockedExecuteCommand).toHaveBeenCalledWith( 'codex', - [ - 'exec', - '--model', - 'gpt-5.3-codex', - '--skip-git-repo-check', - 'Test prompt', - ], + ['exec', '--model', 'gpt-5.4', '--skip-git-repo-check', 'Test prompt'], expect.any(Object) ); }); @@ -59,8 +58,8 @@ describe('Default Model Configuration', () => { test('should include default model in response metadata', async () => { const result = await handler.execute({ prompt: 'Test prompt' }); - expect(result.content[0]._meta?.model).toBe('gpt-5.3-codex'); - expect(result.structuredContent?.model).toBe('gpt-5.3-codex'); + expect(result.content[0]._meta?.model).toBe('gpt-5.4'); + expect(result.structuredContent?.model).toBe('gpt-5.4'); expect(result._meta?.callbackUri).toBeUndefined(); }); @@ -87,13 +86,7 @@ describe('Default Model Configuration', () => { expect(mockedExecuteCommand).toHaveBeenCalledWith( 'codex', - [ - 'exec', - '--model', - 'gpt-5.3-codex', - '--skip-git-repo-check', - 'Test prompt', - ], + ['exec', '--model', 'gpt-5.4', '--skip-git-repo-check', 'Test prompt'], expect.any(Object) ); }); @@ -114,7 +107,7 @@ describe('Default Model Configuration', () => { 'exec', '--skip-git-repo-check', '-c', - 'model="gpt-5.3-codex"', + 'model="gpt-5.4"', 'resume', 'existing-conv-id', 'Resume with default model', @@ -134,7 +127,7 @@ describe('Default Model Configuration', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '-c', 'model_reasoning_effort="high"', '--skip-git-repo-check', diff --git a/src/__tests__/edge-cases.test.ts b/src/__tests__/edge-cases.test.ts index 2ec8f6e..a511190 100644 --- a/src/__tests__/edge-cases.test.ts +++ b/src/__tests__/edge-cases.test.ts @@ -133,7 +133,7 @@ describe('Edge Cases and Integration Issues', () => { // Should only use recent turns, not crash with too much context const call = mockedExecuteCommand.mock.calls[0]; - const prompt = call?.[1]?.[4]; // After exec, --model, gpt-5.3-codex, --skip-git-repo-check, prompt + const prompt = call?.[1]?.[4]; // After exec, --model, gpt-5.4, --skip-git-repo-check, prompt expect(typeof prompt).toBe('string'); if (prompt) { expect(prompt.length).toBeLessThan(5000); // Reasonable limit diff --git a/src/__tests__/error-scenarios.test.ts b/src/__tests__/error-scenarios.test.ts index ba4f57b..b656c8c 100644 --- a/src/__tests__/error-scenarios.test.ts +++ b/src/__tests__/error-scenarios.test.ts @@ -111,6 +111,30 @@ describe('Error Handling Scenarios', () => { expect(mockedExecuteCommand).not.toHaveBeenCalled(); }); + test('should reject bypassApprovals when sandbox is also set', async () => { + await expect( + handler.execute({ + prompt: 'Test prompt', + sandbox: 'workspace-write', + bypassApprovals: true, + }) + ).rejects.toThrow(ValidationError); + + expect(mockedExecuteCommand).not.toHaveBeenCalled(); + }); + + test('should reject bypassApprovals when fullAuto is also set', async () => { + await expect( + handler.execute({ + prompt: 'Test prompt', + fullAuto: true, + bypassApprovals: true, + }) + ).rejects.toThrow(ValidationError); + + expect(mockedExecuteCommand).not.toHaveBeenCalled(); + }); + test('should handle corrupted session data', async () => { const sessionId = sessionStorage.createSession(); @@ -157,7 +181,7 @@ describe('Error Handling Scenarios', () => { expect(result.content[0].text).toBe('Response'); expect(mockedExecuteCommand).toHaveBeenCalledWith( 'codex', - ['exec', '--model', 'gpt-5.3-codex', '--skip-git-repo-check', longPrompt], + ['exec', '--model', 'gpt-5.4', '--skip-git-repo-check', longPrompt], expect.any(Object) ); }); diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index 1762942..c362728 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -65,6 +65,24 @@ describe('Codex MCP Server', () => { expect(codexTool?.description).toContain('Execute Codex CLI'); }); + test('codex tool should expose timeout override parameter', () => { + const codexTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.CODEX + ); + expect(codexTool?.inputSchema.properties).toHaveProperty('timeoutMs'); + }); + + test('codex tool should expose bypass approvals parameter', () => { + const codexTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.CODEX + ); + const bypassApprovals = codexTool?.inputSchema.properties + .bypassApprovals as { type?: string; description?: string } | undefined; + + expect(bypassApprovals?.type).toBe('boolean'); + expect(bypassApprovals?.description).toContain('externally sandboxed'); + }); + test('ping tool should have optional message parameter', () => { const pingTool = toolDefinitions.find((tool) => tool.name === TOOLS.PING); expect(pingTool).toBeDefined(); @@ -152,7 +170,7 @@ describe('Codex MCP Server', () => { const result = { content: [{ type: 'text', text: 'ok', _meta: { threadId: 'th_123' } }], structuredContent: { threadId: 'th_123' }, - _meta: { model: 'gpt-5.3-codex' }, + _meta: { model: 'gpt-5.4' }, }; const parsed = CallToolResultSchema.safeParse(result); diff --git a/src/__tests__/mcp-lifecycle.test.ts b/src/__tests__/mcp-lifecycle.test.ts new file mode 100644 index 0000000..776fba3 --- /dev/null +++ b/src/__tests__/mcp-lifecycle.test.ts @@ -0,0 +1,695 @@ +import { spawn } from 'child_process'; +import { chmodSync, existsSync, mkdtempSync, rmSync, writeFileSync } from 'fs'; +import { tmpdir } from 'os'; +import path from 'path'; +import { promisify } from 'util'; +import { exec } from 'child_process'; + +const execAsync = promisify(exec); + +const JSONRPC_VERSION = '2.0'; +const TEST_TIMEOUT_MS = 10000; +type ExitSignal = string | null; + +async function ensureBuild(distPath: string): Promise<void> { + if (existsSync(distPath)) return; + await execAsync('npm run build'); +} + +function createCodexStub(): string { + const stubDir = mkdtempSync(path.join(tmpdir(), 'codex-mcp-life-')); + const stubPath = path.join(stubDir, 'codex'); + const lockDir = path.join(stubDir, 'active.lock'); + const stubScript = `#!/bin/sh +LOCK_DIR=${JSON.stringify(lockDir)} +if ! mkdir "$LOCK_DIR" 2>/dev/null; then +printf "Not connected\\n" 1>&2 +exit 1 +fi +cleanup() { + rmdir "$LOCK_DIR" 2>/dev/null || true +} +trap cleanup EXIT INT TERM +sleep 0.2 +printf "stub stdout\\n" +printf "thread id: th_lifecycle_123\\n" 1>&2 +printf "session id: sess_lifecycle_123\\n" 1>&2 +exit 0 +`; + writeFileSync(stubPath, stubScript, { mode: 0o755 }); + chmodSync(stubPath, 0o755); + return stubDir; +} + +function createTimeoutCodexStub(): string { + const stubDir = mkdtempSync(path.join(tmpdir(), 'codex-mcp-timeout-')); + const stubPath = path.join(stubDir, 'codex'); + const lockDir = path.join(stubDir, 'active.lock'); + const stubScript = `#!/bin/sh +LOCK_DIR=${JSON.stringify(lockDir)} +if ! mkdir "$LOCK_DIR" 2>/dev/null; then +printf "Not connected\\n" 1>&2 +exit 1 +fi +cleanup() { + rmdir "$LOCK_DIR" 2>/dev/null || true +} +trap cleanup EXIT INT TERM +if printf "%s" "$*" | grep -q "slow-timeout"; then + sleep 5 +else + sleep 0.01 +fi +printf "stub stdout\\n" +printf "thread id: th_timeout_123\\n" 1>&2 +printf "session id: sess_timeout_123\\n" 1>&2 +exit 0 +`; + writeFileSync(stubPath, stubScript, { mode: 0o755 }); + chmodSync(stubPath, 0o755); + return stubDir; +} + +describe('MCP server lifecycle', () => { + jest.setTimeout(TEST_TIMEOUT_MS); + + let server: ReturnType<typeof spawn> | null = null; + let stubDir: string | null = null; + let buffer = ''; + let exitCode: number | null = null; + let exitSignal: ExitSignal = null; + const pending = new Map<number, (payload: unknown) => void>(); + + const sendRequest = (request: Record<string, unknown>) => + new Promise<unknown>((resolve, reject) => { + if (!server?.stdin) { + reject(new Error('Server stdin not available')); + return; + } + + const id = request.id as number; + const timer = globalThis.setTimeout(() => { + pending.delete(id); + reject(new Error(`Timed out waiting for response ${id}`)); + }, TEST_TIMEOUT_MS); + + pending.set(id, (payload) => { + globalThis.clearTimeout(timer); + resolve(payload); + }); + + server.stdin.write(`${JSON.stringify(request)}\n`); + }); + + beforeAll(async () => { + const distPath = path.join(process.cwd(), 'dist', 'index.js'); + await ensureBuild(distPath); + stubDir = createCodexStub(); + + server = spawn(process.execPath, [distPath], { + env: { + ...process.env, + PATH: `${stubDir}${path.delimiter}${process.env.PATH}`, + }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + + server.stdout?.setEncoding('utf8'); + server.stdout?.on('data', (chunk: string) => { + buffer += chunk; + let newlineIndex = buffer.indexOf('\n'); + + while (newlineIndex >= 0) { + const line = buffer.slice(0, newlineIndex).trim(); + buffer = buffer.slice(newlineIndex + 1); + + if (line) { + try { + const payload = JSON.parse(line) as { + id?: number; + result?: unknown; + }; + + if (typeof payload.id === 'number') { + const resolver = pending.get(payload.id); + if (resolver) { + resolver(payload.result ?? payload); + pending.delete(payload.id); + } + } + } catch { + // Ignore non-JSON output + } + } + + newlineIndex = buffer.indexOf('\n'); + } + }); + + server.stderr?.on('data', () => {}); + server.on('exit', (code, signal) => { + exitCode = code; + exitSignal = signal; + }); + }); + + afterAll(async () => { + if (server && exitCode === null && exitSignal === null) { + server.kill(); + await new Promise((resolve) => server?.once('exit', resolve)); + } + + if (stubDir) { + rmSync(stubDir, { recursive: true, force: true }); + } + }); + + test('stays alive across sequential codex tool calls', async () => { + await sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 1, + method: 'initialize', + params: { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'test', version: '1' }, + }, + }); + + server?.stdin?.write( + `${JSON.stringify({ + jsonrpc: JSONRPC_VERSION, + method: 'notifications/initialized', + params: {}, + })}\n` + ); + + await sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 2, + method: 'tools/list', + params: {}, + }); + + const firstResponse = (await sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 3, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'echo hello' } }, + })) as { + content: Array<{ text: string }>; + }; + + expect(firstResponse.content[0]?.text).toBe('stub stdout\n'); + expect({ exitCode, exitSignal }).toEqual({ + exitCode: null, + exitSignal: null, + }); + + await new Promise((resolve) => globalThis.setTimeout(resolve, 250)); + expect({ exitCode, exitSignal }).toEqual({ + exitCode: null, + exitSignal: null, + }); + + const secondResponse = (await sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 4, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'echo world' } }, + })) as { + content: Array<{ text: string }>; + }; + + expect(secondResponse.content[0]?.text).toBe('stub stdout\n'); + expect({ exitCode, exitSignal }).toEqual({ + exitCode: null, + exitSignal: null, + }); + }); + + test('serializes concurrent tools/call requests', async () => { + const firstResponsePromise = sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 5, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'echo concurrent one' } }, + }); + + const secondResponsePromise = sendRequest({ + jsonrpc: JSONRPC_VERSION, + id: 6, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'echo concurrent two' } }, + }); + + const [firstResponse, secondResponse] = (await Promise.all([ + firstResponsePromise, + secondResponsePromise, + ])) as Array<{ content: Array<{ text: string }> }>; + + expect(firstResponse.content[0]?.text).toBe('stub stdout\n'); + expect(secondResponse.content[0]?.text).toBe('stub stdout\n'); + expect({ exitCode, exitSignal }).toEqual({ + exitCode: null, + exitSignal: null, + }); + }); + + test('kills child process on timeout', async () => { + const distPath = path.join(process.cwd(), 'dist', 'index.js'); + await ensureBuild(distPath); + + const timeoutStubDir = createTimeoutCodexStub(); + let timeoutServer: ReturnType<typeof spawn> | null = null; + let timeoutBuffer = ''; + let timeoutExitCode: number | null = null; + let timeoutExitSignal: ExitSignal = null; + const timeoutPending = new Map<number, (payload: unknown) => void>(); + + const sendTimeoutRequest = (request: Record<string, unknown>) => + new Promise<unknown>((resolve, reject) => { + if (!timeoutServer?.stdin) { + reject(new Error('Timeout test server stdin not available')); + return; + } + + const id = request.id as number; + const timer = globalThis.setTimeout(() => { + timeoutPending.delete(id); + reject(new Error(`Timed out waiting for response ${id}`)); + }, TEST_TIMEOUT_MS); + + timeoutPending.set(id, (payload) => { + globalThis.clearTimeout(timer); + resolve(payload); + }); + + timeoutServer.stdin.write(`${JSON.stringify(request)}\n`); + }); + + try { + timeoutServer = spawn(process.execPath, [distPath], { + env: { + ...process.env, + PATH: `${timeoutStubDir}${path.delimiter}${process.env.PATH}`, + CODEX_TOOL_TIMEOUT_MS: '100', + }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + + timeoutServer.stdout?.setEncoding('utf8'); + timeoutServer.stdout?.on('data', (chunk: string) => { + timeoutBuffer += chunk; + let newlineIndex = timeoutBuffer.indexOf('\n'); + + while (newlineIndex >= 0) { + const line = timeoutBuffer.slice(0, newlineIndex).trim(); + timeoutBuffer = timeoutBuffer.slice(newlineIndex + 1); + + if (line) { + try { + const payload = JSON.parse(line) as { + id?: number; + result?: unknown; + }; + + if (typeof payload.id === 'number') { + const resolver = timeoutPending.get(payload.id); + if (resolver) { + resolver(payload.result ?? payload); + timeoutPending.delete(payload.id); + } + } + } catch { + // Ignore non-JSON output + } + } + + newlineIndex = timeoutBuffer.indexOf('\n'); + } + }); + + timeoutServer.stderr?.on('data', () => {}); + timeoutServer.on('exit', (code, signal) => { + timeoutExitCode = code; + timeoutExitSignal = signal; + }); + + await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 100, + method: 'initialize', + params: { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'test-timeout', version: '1' }, + }, + }); + + timeoutServer.stdin?.write( + `${JSON.stringify({ + jsonrpc: JSONRPC_VERSION, + method: 'notifications/initialized', + params: {}, + })}\n` + ); + + const timedOutResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 101, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'slow-timeout' } }, + })) as { + content: Array<{ text: string }>; + isError?: boolean; + }; + + expect(timedOutResponse.isError).toBe(true); + expect(timedOutResponse.content[0]?.text).toContain( + 'Tool call timed out after 100ms' + ); + + const secondResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 102, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'fast-after-timeout' } }, + })) as { + content: Array<{ text: string }>; + }; + + expect(secondResponse.content[0]?.text).toBe('stub stdout\n'); + expect({ timeoutExitCode, timeoutExitSignal }).toEqual({ + timeoutExitCode: null, + timeoutExitSignal: null, + }); + } finally { + if ( + timeoutServer && + timeoutExitCode === null && + timeoutExitSignal === null + ) { + timeoutServer.kill(); + await new Promise((resolve) => timeoutServer?.once('exit', resolve)); + } + rmSync(timeoutStubDir, { recursive: true, force: true }); + } + }); + + test('uses per-call timeout override', async () => { + const distPath = path.join(process.cwd(), 'dist', 'index.js'); + await ensureBuild(distPath); + + const timeoutStubDir = createTimeoutCodexStub(); + let timeoutServer: ReturnType<typeof spawn> | null = null; + let timeoutBuffer = ''; + let timeoutExitCode: number | null = null; + let timeoutExitSignal: ExitSignal = null; + const timeoutPending = new Map<number, (payload: unknown) => void>(); + + const sendTimeoutRequest = (request: Record<string, unknown>) => + new Promise<unknown>((resolve, reject) => { + if (!timeoutServer?.stdin) { + reject(new Error('Timeout override test server stdin not available')); + return; + } + + const id = request.id as number; + const timer = globalThis.setTimeout(() => { + timeoutPending.delete(id); + reject(new Error(`Timed out waiting for response ${id}`)); + }, TEST_TIMEOUT_MS); + + timeoutPending.set(id, (payload) => { + globalThis.clearTimeout(timer); + resolve(payload); + }); + + timeoutServer.stdin.write(`${JSON.stringify(request)}\n`); + }); + + try { + timeoutServer = spawn(process.execPath, [distPath], { + env: { + ...process.env, + PATH: `${timeoutStubDir}${path.delimiter}${process.env.PATH}`, + CODEX_TOOL_TIMEOUT_MS: '9000', + }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + + timeoutServer.stdout?.setEncoding('utf8'); + timeoutServer.stdout?.on('data', (chunk: string) => { + timeoutBuffer += chunk; + let newlineIndex = timeoutBuffer.indexOf('\n'); + + while (newlineIndex >= 0) { + const line = timeoutBuffer.slice(0, newlineIndex).trim(); + timeoutBuffer = timeoutBuffer.slice(newlineIndex + 1); + + if (line) { + try { + const payload = JSON.parse(line) as { + id?: number; + result?: unknown; + }; + + if (typeof payload.id === 'number') { + const resolver = timeoutPending.get(payload.id); + if (resolver) { + resolver(payload.result ?? payload); + timeoutPending.delete(payload.id); + } + } + } catch { + // Ignore non-JSON output + } + } + + newlineIndex = timeoutBuffer.indexOf('\n'); + } + }); + + timeoutServer.stderr?.on('data', () => {}); + timeoutServer.on('exit', (code, signal) => { + timeoutExitCode = code; + timeoutExitSignal = signal; + }); + + await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 200, + method: 'initialize', + params: { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'test-timeout-override', version: '1' }, + }, + }); + + timeoutServer.stdin?.write( + `${JSON.stringify({ + jsonrpc: JSONRPC_VERSION, + method: 'notifications/initialized', + params: {}, + })}\n` + ); + + const timedOutResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 201, + method: 'tools/call', + params: { + name: 'codex', + arguments: { prompt: 'slow-timeout', timeoutMs: 100 }, + }, + })) as { + content: Array<{ text: string }>; + isError?: boolean; + }; + + expect(timedOutResponse.isError).toBe(true); + expect(timedOutResponse.content[0]?.text).toContain( + 'Tool call timed out after 100ms' + ); + + const secondResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 202, + method: 'tools/call', + params: { name: 'codex', arguments: { prompt: 'fast-after-timeout' } }, + })) as { + content: Array<{ text: string }>; + }; + + expect(secondResponse.content[0]?.text).toBe('stub stdout\n'); + expect({ timeoutExitCode, timeoutExitSignal }).toEqual({ + timeoutExitCode: null, + timeoutExitSignal: null, + }); + } finally { + if ( + timeoutServer && + timeoutExitCode === null && + timeoutExitSignal === null + ) { + timeoutServer.kill(); + await new Promise((resolve) => timeoutServer?.once('exit', resolve)); + } + rmSync(timeoutStubDir, { recursive: true, force: true }); + } + }); + + test('unblocks queued calls after a review timeout', async () => { + const distPath = path.join(process.cwd(), 'dist', 'index.js'); + await ensureBuild(distPath); + + const timeoutStubDir = createTimeoutCodexStub(); + let timeoutServer: ReturnType<typeof spawn> | null = null; + let timeoutBuffer = ''; + let timeoutExitCode: number | null = null; + let timeoutExitSignal: ExitSignal = null; + const timeoutPending = new Map<number, (payload: unknown) => void>(); + + const sendTimeoutRequest = (request: Record<string, unknown>) => + new Promise<unknown>((resolve, reject) => { + if (!timeoutServer?.stdin) { + reject(new Error('Review timeout test server stdin not available')); + return; + } + + const id = request.id as number; + const timer = globalThis.setTimeout(() => { + timeoutPending.delete(id); + reject(new Error(`Timed out waiting for response ${id}`)); + }, TEST_TIMEOUT_MS); + + timeoutPending.set(id, (payload) => { + globalThis.clearTimeout(timer); + resolve(payload); + }); + + timeoutServer.stdin.write(`${JSON.stringify(request)}\n`); + }); + + try { + timeoutServer = spawn(process.execPath, [distPath], { + env: { + ...process.env, + PATH: `${timeoutStubDir}${path.delimiter}${process.env.PATH}`, + CODEX_TOOL_TIMEOUT_MS: '100', + }, + stdio: ['pipe', 'pipe', 'pipe'], + }); + + timeoutServer.stdout?.setEncoding('utf8'); + timeoutServer.stdout?.on('data', (chunk: string) => { + timeoutBuffer += chunk; + let newlineIndex = timeoutBuffer.indexOf('\n'); + + while (newlineIndex >= 0) { + const line = timeoutBuffer.slice(0, newlineIndex).trim(); + timeoutBuffer = timeoutBuffer.slice(newlineIndex + 1); + + if (line) { + try { + const payload = JSON.parse(line) as { + id?: number; + result?: unknown; + }; + + if (typeof payload.id === 'number') { + const resolver = timeoutPending.get(payload.id); + if (resolver) { + resolver(payload.result ?? payload); + timeoutPending.delete(payload.id); + } + } + } catch { + // Ignore non-JSON output + } + } + + newlineIndex = timeoutBuffer.indexOf('\n'); + } + }); + + timeoutServer.stderr?.on('data', () => {}); + timeoutServer.on('exit', (code, signal) => { + timeoutExitCode = code; + timeoutExitSignal = signal; + }); + + await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 300, + method: 'initialize', + params: { + protocolVersion: '2024-11-05', + capabilities: {}, + clientInfo: { name: 'test-review-timeout', version: '1' }, + }, + }); + + timeoutServer.stdin?.write( + `${JSON.stringify({ + jsonrpc: JSONRPC_VERSION, + method: 'notifications/initialized', + params: {}, + })}\n` + ); + + const timedOutResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 301, + method: 'tools/call', + params: { + name: 'review', + arguments: { + base: 'main', + prompt: 'slow-timeout', + }, + }, + })) as { + content: Array<{ text: string }>; + isError?: boolean; + }; + + expect(timedOutResponse.isError).toBe(true); + expect(timedOutResponse.content[0]?.text).toContain( + 'Tool call timed out after 100ms' + ); + + const pingStartedAt = Date.now(); + const pingResponse = (await sendTimeoutRequest({ + jsonrpc: JSONRPC_VERSION, + id: 302, + method: 'tools/call', + params: { + name: 'ping', + arguments: { message: 'after-review-timeout' }, + }, + })) as { + content: Array<{ text: string }>; + }; + + expect(Date.now() - pingStartedAt).toBeLessThan(1000); + expect(pingResponse.content[0]?.text).toBe('after-review-timeout'); + expect({ timeoutExitCode, timeoutExitSignal }).toEqual({ + timeoutExitCode: null, + timeoutExitSignal: null, + }); + } finally { + if ( + timeoutServer && + timeoutExitCode === null && + timeoutExitSignal === null + ) { + timeoutServer.kill(); + await new Promise((resolve) => timeoutServer?.once('exit', resolve)); + } + rmSync(timeoutStubDir, { recursive: true, force: true }); + } + }); +}); diff --git a/src/__tests__/model-selection.test.ts b/src/__tests__/model-selection.test.ts index a261883..2211627 100644 --- a/src/__tests__/model-selection.test.ts +++ b/src/__tests__/model-selection.test.ts @@ -1,18 +1,17 @@ -import { CodexToolHandler } from '../tools/handlers.js'; +import type { CodexToolHandler as CodexToolHandlerType } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; -import { executeCommand } from '../utils/command.js'; // Mock the command execution jest.mock('../utils/command.js', () => ({ executeCommand: jest.fn(), })); -const mockedExecuteCommand = executeCommand as jest.MockedFunction< - typeof executeCommand ->; - describe('Model Selection and Reasoning Effort', () => { - let handler: CodexToolHandler; + let CodexToolHandler: typeof import('../tools/handlers.js').CodexToolHandler; + let handler: CodexToolHandlerType; + let mockedExecuteCommand: jest.MockedFunction< + typeof import('../utils/command.js').executeCommand + >; let sessionStorage: InMemorySessionStorage; let originalStructuredContent: string | undefined; @@ -28,7 +27,14 @@ describe('Model Selection and Reasoning Effort', () => { } }); - beforeEach(() => { + beforeEach(async () => { + jest.resetModules(); + process.env.STRUCTURED_CONTENT_ENABLED = '1'; + ({ CodexToolHandler } = await import('../tools/handlers.js')); + const commandModule = await import('../utils/command.js'); + mockedExecuteCommand = commandModule.executeCommand as jest.MockedFunction< + typeof commandModule.executeCommand + >; sessionStorage = new InMemorySessionStorage(); handler = new CodexToolHandler(sessionStorage); mockedExecuteCommand.mockClear(); @@ -36,7 +42,6 @@ describe('Model Selection and Reasoning Effort', () => { stdout: 'Test response', stderr: '', }); - process.env.STRUCTURED_CONTENT_ENABLED = '1'; }); test('should pass model parameter to codex CLI', async () => { @@ -63,7 +68,7 @@ describe('Model Selection and Reasoning Effort', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '-c', 'model_reasoning_effort="high"', '--skip-git-repo-check', @@ -141,7 +146,7 @@ describe('Model Selection and Reasoning Effort', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '-c', 'model_reasoning_effort="minimal"', '--skip-git-repo-check', @@ -162,7 +167,7 @@ describe('Model Selection and Reasoning Effort', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '-c', 'model_reasoning_effort="none"', '--skip-git-repo-check', @@ -183,7 +188,7 @@ describe('Model Selection and Reasoning Effort', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '-c', 'model_reasoning_effort="xhigh"', '--skip-git-repo-check', diff --git a/src/__tests__/resume-functionality.test.ts b/src/__tests__/resume-functionality.test.ts index e76dd63..b991b59 100644 --- a/src/__tests__/resume-functionality.test.ts +++ b/src/__tests__/resume-functionality.test.ts @@ -1,18 +1,17 @@ -import { CodexToolHandler } from '../tools/handlers.js'; +import type { CodexToolHandler as CodexToolHandlerType } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; -import { executeCommand } from '../utils/command.js'; // Mock the command execution jest.mock('../utils/command.js', () => ({ executeCommand: jest.fn(), })); -const mockedExecuteCommand = executeCommand as jest.MockedFunction< - typeof executeCommand ->; - describe('Codex Resume Functionality', () => { - let handler: CodexToolHandler; + let CodexToolHandler: typeof import('../tools/handlers.js').CodexToolHandler; + let handler: CodexToolHandlerType; + let mockedExecuteCommand: jest.MockedFunction< + typeof import('../utils/command.js').executeCommand + >; let sessionStorage: InMemorySessionStorage; let originalStructuredContent: string | undefined; @@ -28,7 +27,14 @@ describe('Codex Resume Functionality', () => { } }); - beforeEach(() => { + beforeEach(async () => { + jest.resetModules(); + process.env.STRUCTURED_CONTENT_ENABLED = '1'; + ({ CodexToolHandler } = await import('../tools/handlers.js')); + const commandModule = await import('../utils/command.js'); + mockedExecuteCommand = commandModule.executeCommand as jest.MockedFunction< + typeof commandModule.executeCommand + >; sessionStorage = new InMemorySessionStorage(); handler = new CodexToolHandler(sessionStorage); mockedExecuteCommand.mockClear(); @@ -50,13 +56,7 @@ describe('Codex Resume Functionality', () => { expect(mockedExecuteCommand).toHaveBeenCalledWith( 'codex', - [ - 'exec', - '--model', - 'gpt-5.3-codex', - '--skip-git-repo-check', - 'First message', - ], + ['exec', '--model', 'gpt-5.4', '--skip-git-repo-check', 'First message'], expect.any(Object) ); }); @@ -166,7 +166,7 @@ describe('Codex Resume Functionality', () => { 'exec', '--skip-git-repo-check', '-c', - 'model="gpt-5.3-codex"', + 'model="gpt-5.4"', 'resume', 'existing-codex-session-id', 'Continue the task', @@ -196,7 +196,7 @@ describe('Codex Resume Functionality', () => { [ 'exec', '--model', - 'gpt-5.3-codex', + 'gpt-5.4', '--skip-git-repo-check', 'Reset and start new', ], @@ -229,7 +229,7 @@ describe('Codex Resume Functionality', () => { // Should build enhanced prompt since no codex session ID const call = mockedExecuteCommand.mock.calls[0]; - const sentPrompt = call?.[1]?.[4]; // After exec, --model, gpt-5.3-codex, --skip-git-repo-check, prompt + const sentPrompt = call?.[1]?.[4]; // After exec, --model, gpt-5.4, --skip-git-repo-check, prompt expect(sentPrompt).toContain('Context:'); expect(sentPrompt).toContain('Task: Follow up question'); }); diff --git a/src/__tests__/runtime-config.test.ts b/src/__tests__/runtime-config.test.ts new file mode 100644 index 0000000..c80bb0a --- /dev/null +++ b/src/__tests__/runtime-config.test.ts @@ -0,0 +1,81 @@ +import path from 'path'; +import { readFileSync } from 'fs'; + +import { CodexToolSchema } from '../types.js'; +import { + COMMAND_LOG_ENV_VAR, + SERVER_CONFIG, + STARTUP_LOG_ENV_VAR, + getServerVersion, + isCommandLoggingEnabled, + isStartupLoggingEnabled, +} from '../runtime-config.js'; + +describe('runtime config', () => { + test('uses package version for server config', () => { + const packageJsonPath = path.join(process.cwd(), 'package.json'); + const packageJson = JSON.parse(readFileSync(packageJsonPath, 'utf8')) as { + version: string; + }; + + expect(SERVER_CONFIG.name).toBe('codex-mcp-server'); + // Test getServerVersion with an explicit path to avoid cwd/argv sensitivity + expect(getServerVersion(path.join(process.cwd(), 'dist/index.js'))).toBe( + packageJson.version + ); + }); + + test('startup logging is disabled by default', () => { + expect(isStartupLoggingEnabled({})).toBe(false); + }); + + test('startup logging can be enabled via env var', () => { + expect(isStartupLoggingEnabled({ [STARTUP_LOG_ENV_VAR]: '1' })).toBe(true); + expect(isStartupLoggingEnabled({ [STARTUP_LOG_ENV_VAR]: 'true' })).toBe( + true + ); + }); + + test('command logging is disabled by default', () => { + expect(isCommandLoggingEnabled({})).toBe(false); + }); + + test('command logging can be enabled via env var', () => { + expect(isCommandLoggingEnabled({ [COMMAND_LOG_ENV_VAR]: '1' })).toBe(true); + expect(isCommandLoggingEnabled({ [COMMAND_LOG_ENV_VAR]: 'yes' })).toBe( + true + ); + }); + + test('codex schema accepts a positive timeout override', () => { + const parsedArgs = CodexToolSchema.parse({ + prompt: 'test prompt', + timeoutMs: 250, + }); + + expect(parsedArgs).toEqual(expect.objectContaining({ timeoutMs: 250 })); + }); + + test('codex schema rejects invalid timeout overrides', () => { + expect(() => + CodexToolSchema.parse({ + prompt: 'test prompt', + timeoutMs: 0, + }) + ).toThrow(); + + expect(() => + CodexToolSchema.parse({ + prompt: 'test prompt', + timeoutMs: -1, + }) + ).toThrow(); + + expect(() => + CodexToolSchema.parse({ + prompt: 'test prompt', + timeoutMs: 12.5, + }) + ).toThrow(); + }); +}); diff --git a/src/__tests__/startup-logging.test.ts b/src/__tests__/startup-logging.test.ts new file mode 100644 index 0000000..c4a1fda --- /dev/null +++ b/src/__tests__/startup-logging.test.ts @@ -0,0 +1,58 @@ +// Mock chalk to avoid ESM issues in Jest +jest.mock('chalk', () => ({ + __esModule: true, + default: { + blue: (text: string) => text, + yellow: (text: string) => text, + green: (text: string) => text, + red: (text: string) => text, + }, +})); + +import { Server } from '@modelcontextprotocol/sdk/server/index.js'; + +import { CodexMcpServer } from '../server.js'; +import { STARTUP_LOG_ENV_VAR } from '../runtime-config.js'; + +describe('startup logging', () => { + beforeEach(() => { + Reflect.deleteProperty(process.env, STARTUP_LOG_ENV_VAR); + jest.restoreAllMocks(); + }); + + test('does not log startup to stderr by default', async () => { + const connectSpy = jest + .spyOn(Server.prototype, 'connect') + .mockResolvedValue(undefined as never); + const errorSpy = jest.spyOn(console, 'error').mockImplementation(() => {}); + + const server = new CodexMcpServer({ + name: 'test-server', + version: '1.0.0', + }); + + await server.start(); + + expect(connectSpy).toHaveBeenCalled(); + expect(errorSpy).not.toHaveBeenCalled(); + }); + + test('logs startup when explicitly enabled', async () => { + process.env[STARTUP_LOG_ENV_VAR] = '1'; + + const connectSpy = jest + .spyOn(Server.prototype, 'connect') + .mockResolvedValue(undefined as never); + const errorSpy = jest.spyOn(console, 'error').mockImplementation(() => {}); + + const server = new CodexMcpServer({ + name: 'test-server', + version: '1.0.0', + }); + + await server.start(); + + expect(connectSpy).toHaveBeenCalled(); + expect(errorSpy).toHaveBeenCalledWith('test-server started successfully'); + }); +}); diff --git a/src/__tests__/working-directory.test.ts b/src/__tests__/working-directory.test.ts index b3bb799..896f14e 100644 --- a/src/__tests__/working-directory.test.ts +++ b/src/__tests__/working-directory.test.ts @@ -1,4 +1,9 @@ -import { CodexToolHandler, ReviewToolHandler } from '../tools/handlers.js'; +import { + CodexToolHandler, + HelpToolHandler, + ReviewToolHandler, + WebSearchToolHandler, +} from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; import { executeCommand, executeCommandStreaming } from '../utils/command.js'; @@ -18,13 +23,17 @@ const mockedExecuteCommandStreaming = describe('Working Directory (cwd) Support', () => { let codexHandler: CodexToolHandler; + let helpHandler: HelpToolHandler; let reviewHandler: ReviewToolHandler; let sessionStorage: InMemorySessionStorage; + let webSearchHandler: WebSearchToolHandler; beforeEach(() => { sessionStorage = new InMemorySessionStorage(); codexHandler = new CodexToolHandler(sessionStorage); + helpHandler = new HelpToolHandler(); reviewHandler = new ReviewToolHandler(); + webSearchHandler = new WebSearchToolHandler(); mockedExecuteCommand.mockClear(); mockedExecuteCommandStreaming.mockClear(); mockedExecuteCommand.mockResolvedValue({ @@ -70,12 +79,15 @@ describe('Working Directory (cwd) Support', () => { }); test('should pass cwd to streaming execution', async () => { + const controller = new AbortController(); + await reviewHandler.execute( { uncommitted: true, workingDirectory: '/path/to/worktree', }, { + abortSignal: controller.signal, sendProgress: async () => {}, progressToken: 'test-token', } @@ -84,7 +96,10 @@ describe('Working Directory (cwd) Support', () => { expect(mockedExecuteCommandStreaming).toHaveBeenCalledWith( 'codex', expect.arrayContaining(['-C', '/path/to/worktree']), - expect.objectContaining({ cwd: '/path/to/worktree' }) + expect.objectContaining({ + cwd: '/path/to/worktree', + signal: controller.signal, + }) ); }); }); @@ -115,6 +130,19 @@ describe('Working Directory (cwd) Support', () => { ); }); + test('should pass bypassApprovals flag when starting a new execution', async () => { + await codexHandler.execute({ + prompt: 'Test prompt', + bypassApprovals: true, + }); + + expect(mockedExecuteCommand).toHaveBeenCalledWith( + 'codex', + expect.arrayContaining(['--dangerously-bypass-approvals-and-sandbox']), + expect.any(Object) + ); + }); + test('should not apply cwd when resuming a session', async () => { const sessionId = sessionStorage.createSession(); sessionStorage.setCodexConversationId(sessionId, 'conv-123'); @@ -135,5 +163,89 @@ describe('Working Directory (cwd) Support', () => { const args = mockedExecuteCommand.mock.calls[0][1]; expect(args).not.toContain('-C'); }); + + test('should pass bypassApprovals flag when resuming a session', async () => { + const sessionId = sessionStorage.createSession(); + sessionStorage.setCodexConversationId(sessionId, 'conv-123'); + + await codexHandler.execute({ + prompt: 'Continue task', + sessionId, + bypassApprovals: true, + }); + + expect(mockedExecuteCommand).toHaveBeenCalledWith( + 'codex', + expect.arrayContaining([ + '--dangerously-bypass-approvals-and-sandbox', + 'resume', + 'conv-123', + 'Continue task', + ]), + expect.any(Object) + ); + }); + + test('should pass fullAuto flag when resuming a session', async () => { + const sessionId = sessionStorage.createSession(); + sessionStorage.setCodexConversationId(sessionId, 'conv-123'); + + await codexHandler.execute({ + prompt: 'Continue task', + sessionId, + fullAuto: true, + }); + + expect(mockedExecuteCommand).toHaveBeenCalledWith( + 'codex', + expect.arrayContaining([ + '--full-auto', + 'resume', + 'conv-123', + 'Continue task', + ]), + expect.any(Object) + ); + }); + }); + + describe('HelpToolHandler', () => { + test('should pass abortSignal to command execution', async () => { + const controller = new AbortController(); + + await helpHandler.execute( + {}, + { + abortSignal: controller.signal, + sendProgress: async () => {}, + } + ); + + expect(mockedExecuteCommand).toHaveBeenCalledWith( + 'codex', + ['--help'], + expect.objectContaining({ signal: controller.signal }) + ); + }); + }); + + describe('WebSearchToolHandler', () => { + test('should pass abortSignal to command execution', async () => { + const controller = new AbortController(); + + await webSearchHandler.execute( + { query: 'timeout regression' }, + { + abortSignal: controller.signal, + sendProgress: async () => {}, + } + ); + + expect(mockedExecuteCommand).toHaveBeenCalledWith( + 'codex', + expect.arrayContaining(['--search', 'exec']), + expect.objectContaining({ signal: controller.signal }) + ); + }); }); }); diff --git a/src/index.ts b/src/index.ts index 595de7e..c3dab8e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,11 +2,7 @@ import chalk from 'chalk'; import { CodexMcpServer } from './server.js'; - -const SERVER_CONFIG = { - name: 'codex-mcp-server', - version: '0.0.6', -} as const; +import { SERVER_CONFIG } from './runtime-config.js'; async function main(): Promise<void> { try { diff --git a/src/runtime-config.ts b/src/runtime-config.ts new file mode 100644 index 0000000..b7e276b --- /dev/null +++ b/src/runtime-config.ts @@ -0,0 +1,59 @@ +import { existsSync, readFileSync } from 'fs'; +import path from 'path'; + +export const STARTUP_LOG_ENV_VAR = 'CODEX_MCP_DEBUG_STARTUP' as const; +export const COMMAND_LOG_ENV_VAR = 'CODEX_MCP_DEBUG_COMMANDS' as const; + +function getPackageJsonPath(entryScriptPath = process.argv[1]): string { + if (entryScriptPath) { + const candidatePaths = [ + path.resolve(path.dirname(entryScriptPath), '../package.json'), + path.resolve(path.dirname(entryScriptPath), 'package.json'), + ]; + + for (const candidatePath of candidatePaths) { + if (existsSync(candidatePath)) { + return candidatePath; + } + } + } + + return path.resolve(process.cwd(), 'package.json'); +} + +export function getServerVersion(entryScriptPath = process.argv[1]): string { + try { + const packageJson = JSON.parse( + readFileSync(getPackageJsonPath(entryScriptPath), 'utf8') + ) as { version?: string }; + + return packageJson.version ?? '0.0.0'; + } catch { + return '0.0.0'; + } +} + +export const SERVER_CONFIG = { + name: 'codex-mcp-server', + version: getServerVersion(), +} as const; + +function isTruthyDebugFlag(rawValue: string | undefined): boolean { + if (!rawValue) { + return false; + } + + return ['1', 'true', 'yes', 'on'].includes(rawValue.toLowerCase()); +} + +export function isStartupLoggingEnabled( + env: Record<string, string | undefined> = process.env +): boolean { + return isTruthyDebugFlag(env[STARTUP_LOG_ENV_VAR]); +} + +export function isCommandLoggingEnabled( + env: Record<string, string | undefined> = process.env +): boolean { + return isTruthyDebugFlag(env[COMMAND_LOG_ENV_VAR]); +} diff --git a/src/server.ts b/src/server.ts index 4c9a69e..3143970 100644 --- a/src/server.ts +++ b/src/server.ts @@ -9,6 +9,7 @@ import chalk from 'chalk'; import { type ServerConfig, type ToolName, + type ToolResult, type ToolHandlerContext, type ProgressToken, TOOLS, @@ -16,13 +17,17 @@ import { import { handleError } from './errors.js'; import { toolDefinitions } from './tools/definitions.js'; import { toolHandlers } from './tools/handlers.js'; +import { isStartupLoggingEnabled } from './runtime-config.js'; export class CodexMcpServer { private readonly server: Server; private readonly config: ServerConfig; + private callQueue: Promise<void> = Promise.resolve(); + private readonly toolTimeoutMs: number; constructor(config: ServerConfig) { this.config = config; + this.toolTimeoutMs = this.getToolTimeoutMs(); this.server = new Server( { name: config.name, @@ -50,10 +55,11 @@ export class CodexMcpServer { const progressToken = request.params._meta?.progressToken as ProgressToken | undefined; // Create progress sender that uses MCP notifications - const createProgressContext = (): ToolHandlerContext => { + const createProgressContext = (abortSignal?: AbortSignal): ToolHandlerContext => { let progressCount = 0; return { progressToken, + abortSignal, sendProgress: async (message: string, progress?: number, total?: number) => { if (!progressToken) return; @@ -77,13 +83,39 @@ export class CodexMcpServer { }; try { - if (!this.isValidToolName(name)) { - throw new Error(`Unknown tool: ${name}`); - } + return await new Promise<ToolResult>((resolve, reject) => { + this.callQueue = this.callQueue.then(async () => { + let operation: Promise<ToolResult> | undefined; - const handler = toolHandlers[name]; - const context = createProgressContext(); - return await handler.execute(args, context); + try { + if (!this.isValidToolName(name)) { + throw new Error(`Unknown tool: ${name}`); + } + + const handler = toolHandlers[name]; + const controller = new AbortController(); + const context = createProgressContext(controller.signal); + const timeoutMs = this.getRequestTimeoutMs(name, args); + operation = handler.execute(args, context); + + resolve( + await this.withTimeout( + operation, + timeoutMs, + controller + ) + ); + } catch (err) { + reject(err); + } finally { + // Keep the queue blocked until the child process actually exits. + await operation?.then( + () => undefined, + () => undefined + ); + } + }); + }); } catch (error) { return { content: [ @@ -102,9 +134,65 @@ export class CodexMcpServer { return Object.values(TOOLS).includes(name as ToolName); } + private getToolTimeoutMs(): number { + const rawTimeout = process.env.CODEX_TOOL_TIMEOUT_MS; + if (!rawTimeout) { + return 120_000; + } + + const parsedTimeout = Number.parseInt(rawTimeout, 10); + if (Number.isFinite(parsedTimeout) && parsedTimeout > 0) { + return parsedTimeout; + } + + return 120_000; + } + + private getRequestTimeoutMs(name: ToolName, args: unknown): number { + if (name !== TOOLS.CODEX || !args || typeof args !== 'object') { + return this.toolTimeoutMs; + } + + const timeoutMs = (args as { timeoutMs?: unknown }).timeoutMs; + if ( + typeof timeoutMs === 'number' && + Number.isInteger(timeoutMs) && + timeoutMs > 0 + ) { + return timeoutMs; + } + + return this.toolTimeoutMs; + } + + private async withTimeout<T>( + operation: Promise<T>, + timeoutMs: number, + controller?: AbortController + ): Promise<T> { + let timeoutHandle: ReturnType<typeof setTimeout> | undefined; + + try { + const timeoutPromise = new Promise<never>((_resolve, reject) => { + timeoutHandle = setTimeout(() => { + controller?.abort(); + reject(new Error(`Tool call timed out after ${timeoutMs}ms`)); + }, timeoutMs); + }); + + return await Promise.race([operation, timeoutPromise]); + } finally { + if (timeoutHandle) { + clearTimeout(timeoutHandle); + } + } + } + async start(): Promise<void> { const transport = new StdioServerTransport(); await this.server.connect(transport); - console.error(chalk.green(`${this.config.name} started successfully`)); + if (isStartupLoggingEnabled()) { + console.error(chalk.green(`${this.config.name} started successfully`)); + } } } diff --git a/src/tools/definitions.ts b/src/tools/definitions.ts index dd52a15..89da858 100644 --- a/src/tools/definitions.ts +++ b/src/tools/definitions.ts @@ -14,7 +14,7 @@ export const toolDefinitions: ToolDefinition[] = [ sessionId: { type: 'string', description: - 'Optional session ID for conversational context. Note: when resuming a session, sandbox/fullAuto/workingDirectory parameters are not applied (CLI limitation)', + 'Optional session ID for conversational context. Note: when resuming a session, sandbox and workingDirectory are not applied (CLI limitation)', }, resetSession: { type: 'boolean', @@ -42,6 +42,11 @@ export const toolDefinitions: ToolDefinition[] = [ description: 'Enable full-auto mode: sandboxed automatic execution without approval prompts (equivalent to -a on-request --sandbox workspace-write)', }, + bypassApprovals: { + type: 'boolean', + description: + 'Skip all approval prompts and disable sandboxing. Extremely dangerous; use only in externally sandboxed environments', + }, workingDirectory: { type: 'string', description: @@ -52,6 +57,12 @@ export const toolDefinitions: ToolDefinition[] = [ description: 'Static MCP callback URI to pass to Codex via environment (if provided)', }, + timeoutMs: { + type: 'integer', + description: + 'Optional per-call timeout in milliseconds. Overrides CODEX_TOOL_TIMEOUT_MS for this request only', + minimum: 1, + }, }, required: ['prompt'], }, diff --git a/src/tools/handlers.ts b/src/tools/handlers.ts index 9d7eabd..adfe200 100644 --- a/src/tools/handlers.ts +++ b/src/tools/handlers.ts @@ -2,12 +2,14 @@ import { TOOLS, DEFAULT_CODEX_MODEL, CODEX_DEFAULT_MODEL_ENV_VAR, + type ToolName, type ToolResult, type ToolHandlerContext, type CodexToolArgs, type ReviewToolArgs, type PingToolArgs, type WebSearchToolArgs, + type CommandResult, CodexToolSchema, ReviewToolSchema, PingToolSchema, @@ -21,29 +23,96 @@ import { type ConversationTurn, } from '../session/storage.js'; import { ToolExecutionError, ValidationError } from '../errors.js'; -import { executeCommand, executeCommandStreaming } from '../utils/command.js'; +import { + executeCommand, + executeCommandStreaming, + type CommandOptions, +} from '../utils/command.js'; import { ZodError } from 'zod'; import path from 'node:path'; -// Default no-op context for handlers that don't need progress +interface ToolHandler { + execute(args: unknown, context?: ToolHandlerContext): Promise<ToolResult>; +} + const defaultContext: ToolHandlerContext = { sendProgress: async () => {}, }; -const isStructuredContentEnabled = (): boolean => { - const raw = process.env.STRUCTURED_CONTENT_ENABLED; - if (!raw) return false; - return ['1', 'true', 'yes', 'on'].includes(raw.toLowerCase()); -}; +const STRUCTURED_CONTENT_ENABLED = ['1', 'true', 'yes', 'on'].includes( + (process.env.STRUCTURED_CONTENT_ENABLED ?? '').toLowerCase() +); -export class CodexToolHandler { - constructor(private sessionStorage: SessionStorage) {} +function getToolContext(context?: ToolHandlerContext): ToolHandlerContext { + return context ?? defaultContext; +} + +function getSelectedModel(model?: string): string { + return ( + model || process.env[CODEX_DEFAULT_MODEL_ENV_VAR] || DEFAULT_CODEX_MODEL + ); +} + +function getCommandResponse(result: CommandResult, fallback: string): string { + return result.stdout || result.stderr || fallback; +} + +class CommandBackedToolHandler { + protected readonly structuredContentEnabled = STRUCTURED_CONTENT_ENABLED; + + protected async executeCodexCommand( + cmdArgs: string[], + context: ToolHandlerContext, + options?: CommandOptions + ): Promise<CommandResult> { + const commandOptions = { + ...options, + signal: options?.signal ?? context.abortSignal, + }; + + if (context.progressToken) { + return executeCommandStreaming('codex', cmdArgs, { + ...commandOptions, + onProgress: (message) => { + void context.sendProgress(message); + }, + }); + } + + return executeCommand('codex', cmdArgs, commandOptions); + } + + protected createTextResult( + text: string, + metadata?: Record<string, unknown> + ): ToolResult { + const hasMetadata = !!metadata && Object.keys(metadata).length > 0; + + return { + content: [ + { + type: 'text', + text, + ...(hasMetadata ? { _meta: metadata } : {}), + }, + ], + structuredContent: + this.structuredContentEnabled && hasMetadata ? metadata : undefined, + }; + } +} + +export class CodexToolHandler extends CommandBackedToolHandler { + constructor(private sessionStorage: SessionStorage) { + super(); + } async execute( args: unknown, - context: ToolHandlerContext = defaultContext + context?: ToolHandlerContext ): Promise<ToolResult> { try { + const toolContext = getToolContext(context); const { prompt, sessionId, @@ -52,6 +121,7 @@ export class CodexToolHandler { reasoningEffort, sandbox, fullAuto, + bypassApprovals, workingDirectory, callbackUri, }: CodexToolArgs = CodexToolSchema.parse(args); @@ -61,7 +131,7 @@ export class CodexToolHandler { ? path.resolve(workingDirectory) : undefined; - let activeSessionId = sessionId; + const activeSessionId = sessionId; let enhancedPrompt = prompt; // Only work with sessions if explicitly requested @@ -92,10 +162,7 @@ export class CodexToolHandler { } // Build command arguments with v0.75.0+ features - const selectedModel = - model || - process.env[CODEX_DEFAULT_MODEL_ENV_VAR] || - DEFAULT_CODEX_MODEL; + const selectedModel = getSelectedModel(model); const effectiveCallbackUri = callbackUri || process.env.CODEX_MCP_CALLBACK_URI; @@ -115,6 +182,15 @@ export class CodexToolHandler { cmdArgs.push('-c', `model_reasoning_effort="${reasoningEffort}"`); } + if (fullAuto) { + cmdArgs.push('--full-auto'); + } + + // Bypass all approval prompts and sandboxing — use only in externally sandboxed envs + if (bypassApprovals) { + cmdArgs.push('--dangerously-bypass-approvals-and-sandbox'); + } + // Add resume subcommand with conversation ID and prompt cmdArgs.push('resume', codexConversationId, enhancedPrompt); } else { @@ -139,6 +215,11 @@ export class CodexToolHandler { cmdArgs.push('--full-auto'); } + // Bypass all approval prompts and sandboxing — use only in externally sandboxed envs + if (bypassApprovals) { + cmdArgs.push('--dangerously-bypass-approvals-and-sandbox'); + } + // Add working directory (v0.75.0+) if (resolvedWorkDir) { cmdArgs.push('-C', resolvedWorkDir); @@ -151,35 +232,29 @@ export class CodexToolHandler { } // Send initial progress notification - await context.sendProgress('Starting Codex execution...', 0); - - // Use streaming execution if progress is enabled - const useStreaming = !!context.progressToken; + await toolContext.sendProgress('Starting Codex execution...', 0); const envOverride = effectiveCallbackUri ? { CODEX_MCP_CALLBACK_URI: effectiveCallbackUri } : undefined; // Pass cwd to spawn so the child process starts in the correct directory. // This works around openai/codex#9084 where -C is ignored by some subcommands. - // Skip cwd during resume: sandbox, fullAuto, and workingDirectory are not - // applied in resume mode (Codex CLI limitation). + // Skip cwd during resume: sandbox and workingDirectory are not applied in + // resume mode (Codex CLI limitation). const cmdOptions = { cwd: useResume ? undefined : resolvedWorkDir, envOverride, + signal: toolContext.abortSignal, }; - const result = useStreaming - ? await executeCommandStreaming('codex', cmdArgs, { - ...cmdOptions, - onProgress: (message) => { - // Send progress notification for each chunk of output - context.sendProgress(message); - }, - }) - : await executeCommand('codex', cmdArgs, cmdOptions); + const result = await this.executeCodexCommand( + cmdArgs, + toolContext, + cmdOptions + ); // Codex CLI may output to stderr, so check both - const response = result.stdout || result.stderr || 'No output from Codex'; + const response = getCommandResponse(result, 'No output from Codex'); // Extract conversation/session ID from new conversations for future resume // Codex CLI outputs have varied between "session id" and "conversation id" @@ -216,25 +291,13 @@ ${result.stdout || ''}`.trim(); // - content[0]._meta: For Claude Code compatibility (avoids structuredContent bug) // - structuredContent: For other MCP clients that properly support it const metadata: Record<string, unknown> = { - ...(threadId && { threadId }), - ...(selectedModel && { model: selectedModel }), + model: selectedModel, ...(activeSessionId && { sessionId: activeSessionId }), ...(effectiveCallbackUri && { callbackUri: effectiveCallbackUri }), + ...(threadId && { threadId }), }; - return { - content: [ - { - type: 'text', - text: response, - _meta: metadata, - }, - ], - structuredContent: - isStructuredContentEnabled() && Object.keys(metadata).length > 0 - ? metadata - : undefined, - }; + return this.createTextResult(response, metadata); } catch (error) { if (error instanceof ValidationError) { throw error; @@ -277,10 +340,7 @@ ${result.stdout || ''}`.trim(); } export class PingToolHandler { - async execute( - args: unknown, - _context: ToolHandlerContext = defaultContext - ): Promise<ToolResult> { + async execute(args: unknown): Promise<ToolResult> { try { const { message = 'pong' }: PingToolArgs = PingToolSchema.parse(args); @@ -305,24 +365,20 @@ export class PingToolHandler { } } -export class HelpToolHandler { +export class HelpToolHandler extends CommandBackedToolHandler { async execute( args: unknown, - _context: ToolHandlerContext = defaultContext + context?: ToolHandlerContext ): Promise<ToolResult> { try { + const toolContext = getToolContext(context); HelpToolSchema.parse(args); - const result = await executeCommand('codex', ['--help']); + const result = await this.executeCodexCommand(['--help'], toolContext); - return { - content: [ - { - type: 'text', - text: result.stdout || 'No help information available', - }, - ], - }; + return this.createTextResult( + getCommandResponse(result, 'No help information available') + ); } catch (error) { if (error instanceof ZodError) { throw new ValidationError(TOOLS.HELP, error.message); @@ -339,10 +395,7 @@ export class HelpToolHandler { export class ListSessionsToolHandler { constructor(private sessionStorage: SessionStorage) {} - async execute( - args: unknown, - _context: ToolHandlerContext = defaultContext - ): Promise<ToolResult> { + async execute(args: unknown): Promise<ToolResult> { try { ListSessionsToolSchema.parse(args); @@ -378,12 +431,13 @@ export class ListSessionsToolHandler { } } -export class ReviewToolHandler { +export class ReviewToolHandler extends CommandBackedToolHandler { async execute( args: unknown, - context: ToolHandlerContext = defaultContext + context?: ToolHandlerContext ): Promise<ToolResult> { try { + const toolContext = getToolContext(context); const { prompt, uncommitted, @@ -414,10 +468,7 @@ export class ReviewToolHandler { } // Add model parameter via config - const selectedModel = - model || - process.env[CODEX_DEFAULT_MODEL_ENV_VAR] || - DEFAULT_CODEX_MODEL; + const selectedModel = getSelectedModel(model); cmdArgs.push('-c', `model="${selectedModel}"`); cmdArgs.push('review'); @@ -445,24 +496,21 @@ export class ReviewToolHandler { } // Send initial progress notification - await context.sendProgress('Starting code review...', 0); - - const useStreaming = !!context.progressToken; + await toolContext.sendProgress('Starting code review...', 0); // Pass cwd to spawn so the child process starts in the correct directory. // This works around openai/codex#9084 where -C is ignored by `review`. const cmdOptions = { cwd: resolvedWorkDir }; - const result = useStreaming - ? await executeCommandStreaming('codex', cmdArgs, { - ...cmdOptions, - onProgress: (message) => { - context.sendProgress(message); - }, - }) - : await executeCommand('codex', cmdArgs, cmdOptions); + const result = await this.executeCodexCommand( + cmdArgs, + toolContext, + cmdOptions + ); // Codex CLI outputs to stderr, so check both stdout and stderr - const response = - result.stdout || result.stderr || 'No review output from Codex'; + const response = getCommandResponse( + result, + 'No review output from Codex' + ); // Prepare metadata for dual approach: // - content[0]._meta: For Claude Code compatibility (avoids structuredContent bug) @@ -473,16 +521,7 @@ export class ReviewToolHandler { ...(commit && { commit }), }; - return { - content: [ - { - type: 'text', - text: response, - _meta: metadata, - }, - ], - structuredContent: isStructuredContentEnabled() ? metadata : undefined, - }; + return this.createTextResult(response, metadata); } catch (error) { if (error instanceof ZodError) { throw new ValidationError(TOOLS.REVIEW, error.message); @@ -503,12 +542,13 @@ export class ReviewToolHandler { * WebSearchToolHandler - Perform web search via Codex CLI with --search flag * Enables Codex's native web_search tool by using --search before exec subcommand */ -export class WebSearchToolHandler { +export class WebSearchToolHandler extends CommandBackedToolHandler { async execute( args: unknown, - context: ToolHandlerContext = defaultContext + context?: ToolHandlerContext ): Promise<ToolResult> { try { + const toolContext = getToolContext(context); const { query, numResults = 10, @@ -516,7 +556,7 @@ export class WebSearchToolHandler { }: WebSearchToolArgs = WebSearchToolSchema.parse(args); // Send initial progress notification - await context.sendProgress(`Searching for: ${query}...`, 0); + await toolContext.sendProgress(`Searching for: ${query}...`, 0); // Build direct search prompt that leverages the enabled web_search tool const searchPrompt = `Search for: ${query}. Provide ${numResults} key findings.${searchDepth === 'full' ? ' Include detailed analysis and context.' : ''}`; @@ -529,20 +569,13 @@ export class WebSearchToolHandler { searchPrompt, ]; - // Use streaming execution if progress is enabled - const useStreaming = !!context.progressToken; - - const result = useStreaming - ? await executeCommandStreaming('codex', cmdArgs, { - onProgress: (message) => { - context.sendProgress(message); - }, - }) - : await executeCommand('codex', cmdArgs); + const result = await this.executeCodexCommand(cmdArgs, toolContext); // Get response from stdout or stderr (Codex may output to either) - const response = - result.stdout || result.stderr || 'No search output from Codex'; + const response = getCommandResponse( + result, + 'No search output from Codex' + ); // Prepare metadata const metadata: Record<string, unknown> = { @@ -552,16 +585,7 @@ export class WebSearchToolHandler { timestamp: new Date().toISOString(), }; - return { - content: [ - { - type: 'text', - text: response, - _meta: metadata, - }, - ], - structuredContent: isStructuredContentEnabled() ? metadata : undefined, - }; + return this.createTextResult(response, metadata); } catch (error) { if (error instanceof ZodError) { throw new ValidationError(TOOLS.WEBSEARCH, error.message); @@ -578,11 +602,11 @@ export class WebSearchToolHandler { // Tool handler registry const sessionStorage = new InMemorySessionStorage(); -export const toolHandlers = { +export const toolHandlers: Record<ToolName, ToolHandler> = { [TOOLS.CODEX]: new CodexToolHandler(sessionStorage), [TOOLS.REVIEW]: new ReviewToolHandler(), [TOOLS.PING]: new PingToolHandler(), [TOOLS.HELP]: new HelpToolHandler(), [TOOLS.LIST_SESSIONS]: new ListSessionsToolHandler(sessionStorage), [TOOLS.WEBSEARCH]: new WebSearchToolHandler(), -} as const; +}; diff --git a/src/types.ts b/src/types.ts index a940f81..bac0ebe 100644 --- a/src/types.ts +++ b/src/types.ts @@ -13,11 +13,13 @@ export const TOOLS = { export type ToolName = typeof TOOLS[keyof typeof TOOLS]; // Codex model constants -export const DEFAULT_CODEX_MODEL = 'gpt-5.3-codex' as const; +export const DEFAULT_CODEX_MODEL = 'gpt-5.4' as const; export const CODEX_DEFAULT_MODEL_ENV_VAR = 'CODEX_DEFAULT_MODEL' as const; -// Available model options (for documentation/reference) -export const AVAILABLE_CODEX_MODELS = [ +// Common model examples shown in tool descriptions. Validation stays free-form +// so callers can pass any Codex CLI-supported model string. +const DOCUMENTED_CODEX_MODELS = [ + 'gpt-5.4', 'gpt-5.3-codex', 'gpt-5.2-codex', 'gpt-5.1-codex', @@ -31,7 +33,7 @@ export const AVAILABLE_CODEX_MODELS = [ // Helper function to generate model description export const getModelDescription = (toolType: 'codex' | 'review') => { - const modelList = AVAILABLE_CODEX_MODELS.join(', '); + const modelList = DOCUMENTED_CODEX_MODELS.join(', '); if (toolType === 'codex') { return `Specify which model to use (defaults to ${DEFAULT_CODEX_MODEL}). Options: ${modelList}`; } @@ -90,7 +92,7 @@ export const SandboxMode = z.enum([ ]); // Zod schemas for tool arguments -export const CodexToolSchema = z.object({ +const CodexToolBaseSchema = z.object({ prompt: z.string(), sessionId: z .string() @@ -104,8 +106,30 @@ export const CodexToolSchema = z.object({ reasoningEffort: z.enum(['none', 'minimal', 'low', 'medium', 'high', 'xhigh']).optional(), sandbox: SandboxMode.optional(), fullAuto: z.boolean().optional(), + bypassApprovals: z.boolean().optional(), workingDirectory: z.string().optional(), callbackUri: z.string().optional(), + timeoutMs: z.number().int().positive().optional(), +}); + +export const CodexToolSchema = CodexToolBaseSchema.superRefine((value, ctx) => { + if (value.bypassApprovals && value.sandbox) { + ctx.addIssue({ + code: 'custom', + path: ['bypassApprovals'], + message: + 'bypassApprovals cannot be combined with sandbox because it disables sandboxing entirely', + }); + } + + if (value.bypassApprovals && value.fullAuto) { + ctx.addIssue({ + code: 'custom', + path: ['bypassApprovals'], + message: + 'bypassApprovals cannot be combined with fullAuto because fullAuto requires sandboxed execution', + }); + } }); // Review tool schema @@ -152,5 +176,6 @@ export type ProgressToken = string | number; // Context passed to tool handlers for sending progress notifications export interface ToolHandlerContext { progressToken?: ProgressToken; + abortSignal?: AbortSignal; sendProgress: (message: string, progress?: number, total?: number) => Promise<void>; } diff --git a/src/utils/command.ts b/src/utils/command.ts index c5571a0..9454fe7 100644 --- a/src/utils/command.ts +++ b/src/utils/command.ts @@ -1,10 +1,12 @@ -type ProcessEnv = Record<string, string | undefined>; import { type SpawnOptionsWithoutStdio, spawn } from 'child_process'; import { Buffer } from 'node:buffer'; import path from 'node:path'; import chalk from 'chalk'; import { CommandExecutionError } from '../errors.js'; import { type CommandResult } from '../types.js'; +import { isCommandLoggingEnabled } from '../runtime-config.js'; + +type ProcessEnv = Record<string, string | undefined>; /** * Escape argument for Windows shell (cmd.exe) @@ -30,143 +32,152 @@ export type ProgressCallback = (message: string) => void; export interface CommandOptions { envOverride?: ProcessEnv; cwd?: string; + signal?: AbortSignal; + onProgress?: ProgressCallback; } -export interface StreamingCommandOptions extends CommandOptions { - onProgress?: ProgressCallback; +export type StreamingCommandOptions = CommandOptions; + +type StreamName = 'stdout' | 'stderr'; +type SpawnedChildProcess = ReturnType<typeof spawn>; + +const FORCE_KILL_TIMEOUT_MS = 2000; +const PROGRESS_DEBOUNCE_MS = 100; + +function getEscapedArgs(args: string[]): string[] { + return isWindows ? args.map(escapeArgForWindows) : args; } -export async function executeCommand( - file: string, - args: string[] = [], - options?: CommandOptions -): Promise<CommandResult> { - return new Promise((resolve, reject) => { - // Escape args for Windows shell - const escapedArgs = isWindows ? args.map(escapeArgForWindows) : args; +function getSpawnOptions( + options: CommandOptions = {} +): SpawnOptionsWithoutStdio { + const spawnOptions: SpawnOptionsWithoutStdio = { + shell: isWindows, + env: options.envOverride + ? { ...process.env, ...options.envOverride } + : process.env, + }; + + if (options.cwd) { + spawnOptions.cwd = path.resolve(options.cwd); + } - console.error(chalk.blue('Executing:'), file, escapedArgs.join(' ')); + if (options.signal && !isWindows) { + // Create an isolated process group so abort can terminate the full tree. + spawnOptions.detached = true; + } - const spawnOptions: SpawnOptionsWithoutStdio = { - shell: isWindows, - env: options?.envOverride - ? { ...process.env, ...options.envOverride } - : process.env, - }; + return spawnOptions; +} - if (options?.cwd) { - spawnOptions.cwd = path.resolve(options.cwd); +function sendKillSignal( + child: SpawnedChildProcess, + signal: 'SIGTERM' | 'SIGKILL', + isWindowsPlatform: boolean +): void { + child.kill(signal); + if (!isWindowsPlatform && child.pid) { + try { + process.kill(-child.pid, signal); + } catch { + // Ignore if process group no longer exists. } + } +} - const child = spawn(file, escapedArgs, spawnOptions); +function killChild( + child: SpawnedChildProcess, + isWindowsPlatform: boolean +): void { + sendKillSignal(child, 'SIGTERM', isWindowsPlatform); + const forceKillTimer = globalThis.setTimeout( + () => sendKillSignal(child, 'SIGKILL', isWindowsPlatform), + FORCE_KILL_TIMEOUT_MS + ); + child.once('exit', () => globalThis.clearTimeout(forceKillTimer)); +} - // Close stdin to prevent processes like codex exec from waiting forever for input - // When spawned with stdio pipe, codex waits for stdin EOF that never arrives - child.stdin?.end(); +function bindAbortSignal( + child: SpawnedChildProcess, + signal?: AbortSignal +): void { + if (!signal) { + return; + } - let stdout = ''; - let stderr = ''; - let stdoutTruncated = false; - let stderrTruncated = false; + if (signal.aborted) { + killChild(child, isWindows); + return; + } - child.stdout.on('data', (data: Buffer) => { - if (!stdoutTruncated) { - const chunk = data.toString(); - if (stdout.length + chunk.length > MAX_BUFFER_SIZE) { - stdout += chunk.slice(0, MAX_BUFFER_SIZE - stdout.length); - stdoutTruncated = true; - console.error(chalk.yellow('Warning: stdout truncated at 10MB')); - } else { - stdout += chunk; - } - } - }); + signal.addEventListener('abort', () => killChild(child, isWindows), { + once: true, + }); +} - child.stderr.on('data', (data: Buffer) => { - if (!stderrTruncated) { - const chunk = data.toString(); - if (stderr.length + chunk.length > MAX_BUFFER_SIZE) { - stderr += chunk.slice(0, MAX_BUFFER_SIZE - stderr.length); - stderrTruncated = true; - console.error(chalk.yellow('Warning: stderr truncated at 10MB')); - } else { - stderr += chunk; - } - } - }); +function appendOutput( + currentValue: string, + chunk: string, + streamName: StreamName +): { nextValue: string; truncated: boolean } { + if (currentValue.length + chunk.length > MAX_BUFFER_SIZE) { + console.error(chalk.yellow(`Warning: ${streamName} truncated at 10MB`)); + return { + nextValue: + currentValue + chunk.slice(0, MAX_BUFFER_SIZE - currentValue.length), + truncated: true, + }; + } - child.on('close', (code) => { - if (stderr) { - console.error(chalk.yellow('Command stderr:'), stderr); - } + return { + nextValue: currentValue + chunk, + truncated: false, + }; +} - // Accept exit code 0 or if we got stdout/stderr output - // Note: codex CLI writes most output to stderr, so we must check both - if (code === 0 || stdout || stderr) { - if (code !== 0 && (stdout || stderr)) { - console.error( - chalk.yellow('Command failed but produced output, using output') - ); - } - resolve({ stdout, stderr }); - } else { - reject( - new CommandExecutionError( - [file, ...args].join(' '), - `Command failed with exit code ${code}`, - new Error(stderr || 'Unknown error') - ) - ); - } - }); +function createProgressReporter( + onProgress?: ProgressCallback +): ProgressCallback | undefined { + if (!onProgress) { + return undefined; + } - child.on('error', (error) => { - reject( - new CommandExecutionError( - [file, ...args].join(' '), - 'Command execution failed', - error - ) - ); - }); - }); + let lastProgressTime = 0; + return (message: string) => { + const now = Date.now(); + if (now - lastProgressTime >= PROGRESS_DEBOUNCE_MS) { + onProgress(message); + lastProgressTime = now; + } + }; } -/** - * Execute a command with streaming output support. - * Calls onProgress callback with each chunk of output for real-time feedback. - * - * Note: Unlike executeCommand, this function treats stderr output as success - * because tools like codex write their primary output to stderr. This is - * intentional for streaming use cases where we want to capture all output. - */ -export async function executeCommandStreaming( +function createCommandError( + file: string, + args: string[], + message: string, + cause: unknown +): CommandExecutionError { + return new CommandExecutionError([file, ...args].join(' '), message, cause); +} + +export async function executeCommand( file: string, args: string[] = [], - options: StreamingCommandOptions = {} + options: CommandOptions = {} ): Promise<CommandResult> { return new Promise((resolve, reject) => { - // Escape args for Windows shell - const escapedArgs = isWindows ? args.map(escapeArgForWindows) : args; - - console.error( - chalk.blue('Executing (streaming):'), - file, - escapedArgs.join(' ') - ); - - const spawnOptions: SpawnOptionsWithoutStdio = { - shell: isWindows, // Use shell on Windows to inherit PATH correctly - env: options.envOverride - ? { ...process.env, ...options.envOverride } - : process.env, - }; + const escapedArgs = getEscapedArgs(args); + const isStreaming = !!options.onProgress; + const logLabel = isStreaming ? 'Executing (streaming):' : 'Executing:'; + const commandLoggingEnabled = isCommandLoggingEnabled(); - if (options.cwd) { - spawnOptions.cwd = path.resolve(options.cwd); + if (commandLoggingEnabled) { + console.error(chalk.blue(logLabel), file, escapedArgs.join(' ')); } - const child = spawn(file, escapedArgs, spawnOptions); + const child = spawn(file, escapedArgs, getSpawnOptions(options)); + bindAbortSignal(child, options.signal); // Close stdin to prevent processes like codex exec from waiting forever for input // When spawned with stdio pipe, codex waits for stdin EOF that never arrives @@ -176,54 +187,32 @@ export async function executeCommandStreaming( let stderr = ''; let stdoutTruncated = false; let stderrTruncated = false; - let lastProgressTime = 0; - const PROGRESS_DEBOUNCE_MS = 100; // Debounce progress updates - - const sendProgress = (message: string) => { - if (!options.onProgress) return; - - const now = Date.now(); - // Debounce to avoid flooding with progress updates - if (now - lastProgressTime >= PROGRESS_DEBOUNCE_MS) { - options.onProgress(message); - lastProgressTime = now; - } - }; + const reportProgress = createProgressReporter(options.onProgress); child.stdout?.on('data', (data: Buffer) => { const chunk = data.toString(); if (!stdoutTruncated) { - if (stdout.length + chunk.length > MAX_BUFFER_SIZE) { - stdout += chunk.slice(0, MAX_BUFFER_SIZE - stdout.length); - stdoutTruncated = true; - console.error(chalk.yellow('Warning: stdout truncated at 10MB')); - } else { - stdout += chunk; - } + const nextOutput = appendOutput(stdout, chunk, 'stdout'); + stdout = nextOutput.nextValue; + stdoutTruncated = nextOutput.truncated; } - sendProgress(chunk.trim()); + reportProgress?.(chunk.trim()); }); child.stderr?.on('data', (data: Buffer) => { const chunk = data.toString(); if (!stderrTruncated) { - if (stderr.length + chunk.length > MAX_BUFFER_SIZE) { - stderr += chunk.slice(0, MAX_BUFFER_SIZE - stderr.length); - stderrTruncated = true; - console.error(chalk.yellow('Warning: stderr truncated at 10MB')); - } else { - stderr += chunk; - } + const nextOutput = appendOutput(stderr, chunk, 'stderr'); + stderr = nextOutput.nextValue; + stderrTruncated = nextOutput.truncated; } - // Also send stderr as progress - codex outputs to stderr - sendProgress(chunk.trim()); + reportProgress?.(chunk.trim()); }); child.on('close', (code) => { - // Send final progress if there's any remaining output if (options.onProgress && (stdout || stderr)) { const finalOutput = stdout || stderr; - const lastChunk = finalOutput.slice(-500); // Last 500 chars + const lastChunk = finalOutput.slice(-500); if (lastChunk.trim()) { options.onProgress( `[Completed] ${lastChunk.trim().slice(0, 200)}...` @@ -231,33 +220,56 @@ export async function executeCommandStreaming( } } + if (!isStreaming && stderr && commandLoggingEnabled) { + console.error(chalk.yellow('Command stderr:'), stderr); + } + if (code === 0 || stdout || stderr) { - // Success or we have output (treat as success like the original) - if (code !== 0 && (stdout || stderr)) { + if (code !== 0 && (stdout || stderr) && commandLoggingEnabled) { console.error( chalk.yellow('Command failed but produced output, using output') ); } resolve({ stdout, stderr }); - } else { - reject( - new CommandExecutionError( - [file, ...args].join(' '), - `Command exited with code ${code}`, - new Error(`Exit code: ${code}`) - ) - ); + return; } - }); - child.on('error', (error) => { reject( - new CommandExecutionError( - [file, ...args].join(' '), - 'Command execution failed', - error - ) + isStreaming + ? createCommandError( + file, + args, + `Command exited with code ${code}`, + new Error(`Exit code: ${code}`) + ) + : createCommandError( + file, + args, + `Command failed with exit code ${code}`, + new Error(stderr || 'Unknown error') + ) ); }); + + child.on('error', (error) => { + reject(createCommandError(file, args, 'Command execution failed', error)); + }); }); } + +/** + * Execute a command with streaming output support. + * Calls onProgress callback with each chunk of output for real-time feedback + * by delegating to executeCommand with onProgress set. + * + * Note: Commands that produce stdout or stderr are treated as success even if + * exit code is non-zero. This preserves codex CLI behavior, which frequently + * writes primary output to stderr. + */ +export async function executeCommandStreaming( + file: string, + args: string[] = [], + options: StreamingCommandOptions = {} +): Promise<CommandResult> { + return executeCommand(file, args, options); +}