From 239590d3f7cf78dd56e060fa0a8167eb16f66bb3 Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sat, 18 Apr 2026 22:17:06 -0700 Subject: [PATCH 1/6] feat(computer-use): add macOS computer control via Codex binary Bridge to open-computer-use/SkyComputerUseClient binary over stdio JSON-RPC. Adds 10 MCP tools (cu_list_apps, cu_get_app_state, cu_click, etc.) with lazy init, per-tool timeouts, image passthrough, and macOS platform guard. 97 tests passing, lint clean. Co-authored-by: Claude Entire-Checkpoint: 8f81eabf57ca --- .gitignore | 1 + eslint.config.mjs | 6 + src/__tests__/index.test.ts | 3 +- src/computer-use/__tests__/discovery.test.ts | 42 +++ src/computer-use/__tests__/handlers.test.ts | 51 ++++ src/computer-use/__tests__/types.test.ts | 125 +++++++++ src/computer-use/bridge.ts | 92 +++++++ src/computer-use/client.ts | 183 +++++++++++++ src/computer-use/definitions.ts | 274 +++++++++++++++++++ src/computer-use/discovery.ts | 52 ++++ src/computer-use/handlers.ts | 93 +++++++ src/computer-use/types.ts | 140 ++++++++++ src/tools/definitions.ts | 2 + src/tools/handlers.ts | 21 +- src/types.ts | 14 +- 15 files changed, 1096 insertions(+), 3 deletions(-) create mode 100644 src/computer-use/__tests__/discovery.test.ts create mode 100644 src/computer-use/__tests__/handlers.test.ts create mode 100644 src/computer-use/__tests__/types.test.ts create mode 100644 src/computer-use/bridge.ts create mode 100644 src/computer-use/client.ts create mode 100644 src/computer-use/definitions.ts create mode 100644 src/computer-use/discovery.ts create mode 100644 src/computer-use/handlers.ts create mode 100644 src/computer-use/types.ts diff --git a/.gitignore b/.gitignore index 9a941be..97590ee 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ temp/ # Jest cache .jest/ +.claude/harness/ diff --git a/eslint.config.mjs b/eslint.config.mjs index dac91df..fc0a075 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -16,6 +16,12 @@ export default [ globals: { console: 
'readonly', process: 'readonly', + Buffer: 'readonly', + setTimeout: 'readonly', + clearTimeout: 'readonly', + setInterval: 'readonly', + clearInterval: 'readonly', + setImmediate: 'readonly', }, }, plugins: { diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index 1762942..b51ea47 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -37,7 +37,8 @@ import { CodexMcpServer } from '../server.js'; describe('Codex MCP Server', () => { describe('Tool Definitions', () => { test('should have all required tools defined', () => { - expect(toolDefinitions).toHaveLength(6); + // 6 core tools + 10 computer-use tools = 16 + expect(toolDefinitions).toHaveLength(16); const toolNames = toolDefinitions.map((tool) => tool.name); expect(toolNames).toContain(TOOLS.CODEX); diff --git a/src/computer-use/__tests__/discovery.test.ts b/src/computer-use/__tests__/discovery.test.ts new file mode 100644 index 0000000..c0153f6 --- /dev/null +++ b/src/computer-use/__tests__/discovery.test.ts @@ -0,0 +1,42 @@ +import { discoverBinary } from '../discovery.js'; +import { execSync } from 'node:child_process'; +import { existsSync } from 'node:fs'; + +const hasNpmBinary = (() => { + try { + return !!execSync('which open-computer-use 2>/dev/null', { encoding: 'utf-8' }).trim(); + } catch { + return false; + } +})(); + +const hasCodexBinary = existsSync( + '/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' +); + +const hasBinary = hasNpmBinary || hasCodexBinary; + +describe('Binary Discovery', () => { + (hasBinary ? 
describe : describe.skip)('with binary available', () => { + test('should discover a binary', () => { + const info = discoverBinary(); + expect(info.path).toBeDefined(); + expect(info.args).toEqual(['mcp']); + expect(['npm-package', 'codex-app', 'custom']).toContain(info.type); + }); + }); + + test('should throw when env var points to non-existent path', () => { + const original = process.env.CODEX_COMPUTER_USE_BINARY; + process.env.CODEX_COMPUTER_USE_BINARY = '/nonexistent/path/to/binary'; + try { + expect(() => discoverBinary()).toThrow('non-existent path'); + } finally { + if (original) { + process.env.CODEX_COMPUTER_USE_BINARY = original; + } else { + delete process.env.CODEX_COMPUTER_USE_BINARY; + } + } + }); +}); diff --git a/src/computer-use/__tests__/handlers.test.ts b/src/computer-use/__tests__/handlers.test.ts new file mode 100644 index 0000000..a1646f7 --- /dev/null +++ b/src/computer-use/__tests__/handlers.test.ts @@ -0,0 +1,51 @@ +import { ComputerUseToolHandler } from '../handlers.js'; +import { CU_TOOLS } from '../types.js'; + +describe('ComputerUseToolHandler', () => { + let handler: ComputerUseToolHandler; + + beforeEach(() => { + handler = new ComputerUseToolHandler(); + }); + + describe('cu_status', () => { + test('should return status JSON without connecting to binary', async () => { + const result = await handler.execute(CU_TOOLS.STATUS, {}); + // Status should always work — no binary needed. 
+ expect(result).toBeDefined(); + expect(result.content).toHaveLength(1); + expect(result.content[0].type).toBe('text'); + + const parsed = JSON.parse(result.content[0].text); + expect(parsed).toHaveProperty('connected'); + expect(parsed).toHaveProperty('binary'); + expect(parsed).toHaveProperty('error'); + }); + }); + + describe('validation', () => { + test('should reject unknown tool name', async () => { + await expect( + handler.execute('cu_nonexistent', {}) + ).rejects.toThrow('Unknown computer-use tool'); + }); + + test('should reject invalid click args (no app)', async () => { + await expect( + handler.execute(CU_TOOLS.CLICK, {}) + ).rejects.toThrow(); + }); + + test('should reject invalid get_app_state args (empty app)', async () => { + await expect( + handler.execute(CU_TOOLS.GET_APP_STATE, { app: '' }) + ).rejects.toThrow(); + }); + + test('should reject invalid drag args (missing coordinates)', async () => { + await expect( + handler.execute(CU_TOOLS.DRAG, { app: 'Safari' }) + ).rejects.toThrow(); + }); + }); +}); diff --git a/src/computer-use/__tests__/types.test.ts b/src/computer-use/__tests__/types.test.ts new file mode 100644 index 0000000..e1ef6da --- /dev/null +++ b/src/computer-use/__tests__/types.test.ts @@ -0,0 +1,125 @@ +import { + CU_TOOLS, + BINARY_TOOLS, + CU_TO_BINARY, + CU_SCHEMAS, + ListAppsSchema, + GetAppStateSchema, + ClickSchema, + DragSchema, + PerformSecondaryActionSchema, + SetValueSchema, + ScrollSchema, + PressKeySchema, + TypeTextSchema, + CuStatusSchema, +} from '../types.js'; + +describe('Computer Use Types', () => { + test('CU_TOOLS should have all 10 tools', () => { + const values = Object.values(CU_TOOLS); + expect(values).toHaveLength(10); + expect(values).toContain('cu_list_apps'); + expect(values).toContain('cu_get_app_state'); + expect(values).toContain('cu_click'); + expect(values).toContain('cu_perform_secondary_action'); + expect(values).toContain('cu_set_value'); + expect(values).toContain('cu_scroll'); + 
expect(values).toContain('cu_drag'); + expect(values).toContain('cu_press_key'); + expect(values).toContain('cu_type_text'); + expect(values).toContain('cu_status'); + }); + + test('BINARY_TOOLS should have 9 tools (no status)', () => { + expect(Object.keys(BINARY_TOOLS)).toHaveLength(9); + }); + + test('CU_TO_BINARY should map all CU_TOOLS except status', () => { + for (const [, binaryName] of Object.entries(CU_TO_BINARY)) { + expect(binaryName).toBeDefined(); + expect(Object.values(BINARY_TOOLS) as unknown[]).toContain(binaryName); + } + // status should NOT be in the map (it's handled locally). + expect(CU_TO_BINARY[CU_TOOLS.STATUS]).toBeUndefined(); + }); + + test('CU_SCHEMAS should have a schema for every CU tool', () => { + for (const key of Object.values(CU_TOOLS)) { + expect(CU_SCHEMAS[key]).toBeDefined(); + } + }); + + describe('Zod schemas', () => { + test('ListAppsSchema accepts empty object', () => { + expect(ListAppsSchema.parse({})).toEqual({}); + }); + + test('GetAppStateSchema requires app', () => { + expect(GetAppStateSchema.parse({ app: 'Safari' })).toEqual({ app: 'Safari' }); + expect(() => GetAppStateSchema.parse({})).toThrow(); + expect(() => GetAppStateSchema.parse({ app: '' })).toThrow(); + }); + + test('ClickSchema requires app, optional fields', () => { + expect(ClickSchema.parse({ app: 'Safari' })).toEqual({ app: 'Safari' }); + expect( + ClickSchema.parse({ app: 'Safari', element_index: '5', x: 100, y: 200 }) + ).toEqual({ app: 'Safari', element_index: '5', x: 100, y: 200 }); + }); + + test('DragSchema requires app and all coordinates', () => { + expect(DragSchema.parse({ app: 'Safari', from_x: 0, from_y: 0, to_x: 100, to_y: 100 })).toEqual({ + app: 'Safari', + from_x: 0, + from_y: 0, + to_x: 100, + to_y: 100, + }); + expect(() => DragSchema.parse({ app: 'Safari' })).toThrow(); + }); + + test('PerformSecondaryActionSchema requires app, element_index, action', () => { + expect( + PerformSecondaryActionSchema.parse({ app: 'Safari', 
element_index: '3', action: 'toggle' }) + ).toEqual({ app: 'Safari', element_index: '3', action: 'toggle' }); + expect(() => PerformSecondaryActionSchema.parse({ app: 'Safari' })).toThrow(); + }); + + test('SetValueSchema requires app, element_index, value', () => { + expect( + SetValueSchema.parse({ app: 'Safari', element_index: '1', value: 'hello' }) + ).toEqual({ app: 'Safari', element_index: '1', value: 'hello' }); + }); + + test('ScrollSchema requires app, element_index, direction', () => { + expect( + ScrollSchema.parse({ app: 'Safari', element_index: '0', direction: 'down' }) + ).toEqual({ app: 'Safari', element_index: '0', direction: 'down' }); + expect(ScrollSchema.parse({ app: 'Safari', element_index: '0', direction: 'down', pages: 3 })).toEqual({ + app: 'Safari', + element_index: '0', + direction: 'down', + pages: 3, + }); + }); + + test('PressKeySchema requires app and key', () => { + expect(PressKeySchema.parse({ app: 'Terminal', key: 'Return' })).toEqual({ + app: 'Terminal', + key: 'Return', + }); + }); + + test('TypeTextSchema requires app and text', () => { + expect(TypeTextSchema.parse({ app: 'TextEdit', text: 'hello world' })).toEqual({ + app: 'TextEdit', + text: 'hello world', + }); + }); + + test('CuStatusSchema accepts empty object', () => { + expect(CuStatusSchema.parse({})).toEqual({}); + }); + }); +}); diff --git a/src/computer-use/bridge.ts b/src/computer-use/bridge.ts new file mode 100644 index 0000000..303a54a --- /dev/null +++ b/src/computer-use/bridge.ts @@ -0,0 +1,92 @@ +import { type BinaryInfo } from './types.js'; +import { discoverBinary } from './discovery.js'; +import { ComputerUseClient } from './client.js'; + +export class ComputerUseBridge { + private static instance?: ComputerUseBridge; + private client?: ComputerUseClient; + private binaryInfo?: BinaryInfo; + private initialized = false; + private error?: Error; + private initPromise?: Promise; + + private constructor() {} + + static getInstance(): ComputerUseBridge { + if 
(!ComputerUseBridge.instance) { + ComputerUseBridge.instance = new ComputerUseBridge(); + } + return ComputerUseBridge.instance; + } + + async initialize(): Promise { + // Fast path: already connected. + if (this.initialized && this.client?.isConnected()) return; + + // Serialize concurrent callers — they coalesce onto the same promise. + if (this.initPromise) return this.initPromise; + + this.initPromise = this._doInitialize(); + try { + await this.initPromise; + } finally { + this.initPromise = undefined; + } + } + + private async _doInitialize(): Promise { + // Re-check after acquiring the lock. + if (this.initialized && this.client?.isConnected()) return; + + if (this.client) { + this.client.disconnect(); + this.client = undefined; + } + this.initialized = false; + this.error = undefined; + + try { + this.binaryInfo = discoverBinary(); + console.error( + `Computer Use: using binary at ${this.binaryInfo.path} (${this.binaryInfo.type})` + ); + + this.client = new ComputerUseClient(this.binaryInfo); + await this.client.connect(); + + this.initialized = true; + console.error('Computer Use: bridge connected'); + } catch (err) { + this.error = err instanceof Error ? 
err : new Error(String(err)); + console.error(`Computer Use: ${this.error.message}`); + this.client?.disconnect(); + this.client = undefined; + throw this.error; + } + } + + getClient(): ComputerUseClient { + if (!this.client || !this.initialized) { + throw new Error('Computer Use bridge not initialized'); + } + return this.client; + } + + isReady(): boolean { + return this.initialized && this.client?.isConnected() === true; + } + + getBinaryInfo(): BinaryInfo | undefined { + return this.binaryInfo; + } + + getError(): Error | undefined { + return this.error; + } + + async shutdown(): Promise { + this.client?.disconnect(); + this.client = undefined; + this.initialized = false; + } +} diff --git a/src/computer-use/client.ts b/src/computer-use/client.ts new file mode 100644 index 0000000..4a5be1a --- /dev/null +++ b/src/computer-use/client.ts @@ -0,0 +1,183 @@ +import { spawn, type ChildProcess } from 'child_process'; +import { type BinaryInfo, type JsonRpcResponse } from './types.js'; +import { type ToolResult } from '../types.js'; + +const DEFAULT_TIMEOUT_MS = 30_000; +const SCREENSHOT_TIMEOUT_MS = 60_000; +const INIT_PROTOCOL_VERSION = '2024-11-05'; + +export class ComputerUseClient { + private process: ChildProcess | null = null; + private requestId = 0; + private pending = new Map< + number, + { + resolve: (resp: JsonRpcResponse) => void; + reject: (err: Error) => void; + timer: ReturnType; + } + >(); + private buffer = ''; + private connected = false; + + constructor(private binary: BinaryInfo) {} + + async connect(): Promise { + if (this.connected) return; + + this.process = spawn(this.binary.path, this.binary.args, { + stdio: ['pipe', 'pipe', 'pipe'], + env: { ...process.env }, + }); + + this.process.on('error', (err) => { + this.rejectAll(new Error(`Binary spawn failed: ${err.message}`)); + }); + + this.process.on('close', () => { + this.connected = false; + this.process = null; + this.rejectAll(new Error('Binary process exited unexpectedly')); + }); + + 
this.process.stdout?.on('data', (data: Buffer) => { + this.buffer += data.toString(); + this.drainBuffer(); + }); + + this.process.stderr?.on('data', (data: Buffer) => { + // Binary log messages go to stderr — forward at debug level. + if (process.env.NODE_DEBUG?.includes('computer-use')) { + process.stderr.write(data); + } + }); + + // MCP initialize handshake. + const initResult = await this.sendRequest('initialize', { + protocolVersion: INIT_PROTOCOL_VERSION, + capabilities: {}, + clientInfo: { name: 'codex-mcp-server', version: '0.1.0' }, + }); + + if (!initResult.result) { + throw new Error(`Binary initialize failed: ${JSON.stringify(initResult.error)}`); + } + + // Send initialized notification (no id = notification). + this.sendNotification('notifications/initialized', {}); + this.connected = true; + } + + async callTool( + name: string, + args: Record, + timeoutMs = name === 'get_app_state' ? SCREENSHOT_TIMEOUT_MS : DEFAULT_TIMEOUT_MS + ): Promise { + if (!this.connected) throw new Error('Not connected'); + + const resp = await this.sendRequest('tools/call', { name, arguments: args }, timeoutMs); + + if (resp.error) { + return { + content: [{ type: 'text', text: `Binary error: ${resp.error.message}` }], + isError: true, + }; + } + + // The binary returns MCP CallToolResult shape. + const result = resp.result as { + content?: Array<{ + type: string; + text?: string; + data?: string; + mimeType?: string; + }>; + isError?: boolean; + }; + + // Pass through content as-is, preserving image data from get_app_state screenshots. + const content = (result?.content || []).map((item) => ({ + type: item.type as 'text' | 'image', + text: item.text || '', + data: item.data, + mimeType: item.mimeType, + })); + + return { + content: content.length > 0 ? 
content : [{ type: 'text', text: JSON.stringify(result) }], + isError: result?.isError, + }; + } + + disconnect(): void { + this.process?.kill(); + this.process = null; + this.connected = false; + this.rejectAll(new Error('Client disconnected')); + } + + isConnected(): boolean { + return this.connected; + } + + private sendRequest( + method: string, + params: Record, + timeoutMs = DEFAULT_TIMEOUT_MS + ): Promise { + return new Promise((resolve, reject) => { + const id = ++this.requestId; + const timer = setTimeout(() => { + this.pending.delete(id); + reject(new Error(`Request ${method} timed out after ${timeoutMs}ms`)); + }, timeoutMs); + + this.pending.set(id, { resolve, reject, timer }); + + const msg = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; + this.process?.stdin?.write(msg); + }); + } + + private sendNotification( + method: string, + params: Record + ): void { + const msg = JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n'; + this.process?.stdin?.write(msg); + } + + private drainBuffer(): void { + // Binary uses bare newline-delimited JSON-RPC (no Content-Length framing). + const lines = this.buffer.split('\n'); + // Last element may be incomplete — keep it in the buffer. + this.buffer = lines.pop() || ''; + + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { + const msg = JSON.parse(trimmed) as JsonRpcResponse; + // Only handle responses (have an id). Notifications are id-less. + if (msg.id !== undefined) { + const pending = this.pending.get(msg.id); + if (pending) { + clearTimeout(pending.timer); + this.pending.delete(msg.id); + pending.resolve(msg); + } + } + } catch { + // Ignore non-JSON lines (log messages, etc.) 
+ } + } + } + + private rejectAll(err: Error): void { + for (const [, entry] of this.pending) { + clearTimeout(entry.timer); + entry.reject(err); + } + this.pending.clear(); + } +} diff --git a/src/computer-use/definitions.ts b/src/computer-use/definitions.ts new file mode 100644 index 0000000..b8e4035 --- /dev/null +++ b/src/computer-use/definitions.ts @@ -0,0 +1,274 @@ +import { type ToolDefinition } from '../types.js'; +import { CU_TOOLS } from './types.js'; + +export const computerUseToolDefinitions: ToolDefinition[] = [ + { + name: CU_TOOLS.LIST_APPS, + description: + 'List running and recently used macOS apps. Call this first to discover available apps.', + inputSchema: { + type: 'object', + properties: {}, + required: [], + }, + annotations: { + title: 'List Apps', + readOnlyHint: true, + idempotentHint: true, + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.GET_APP_STATE, + description: + 'Get screenshot and accessibility tree for an app. Must be called once per turn before interacting with the app.', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier (e.g., "Safari", "com.apple.Safari")', + }, + }, + required: ['app'], + }, + annotations: { + title: 'Get App State', + readOnlyHint: true, + idempotentHint: true, + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.CLICK, + description: + 'Click an element by index or pixel coordinates. 
Prefer element_index when available from accessibility tree.', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + element_index: { + type: 'string', + description: 'Element index from accessibility tree', + }, + x: { + type: 'number', + description: 'X coordinate in screenshot pixel coordinates', + }, + y: { + type: 'number', + description: 'Y coordinate in screenshot pixel coordinates', + }, + mouse_button: { + type: 'string', + enum: ['left', 'right', 'middle'], + description: 'Mouse button to click (default: left)', + }, + click_count: { + type: 'integer', + description: 'Number of clicks (default: 1)', + }, + }, + required: ['app'], + }, + annotations: { + title: 'Click', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.PERFORM_SECONDARY_ACTION, + description: + 'Invoke a secondary accessibility action exposed by an element (e.g., toggle, expand, pick)', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + element_index: { + type: 'string', + description: 'Element identifier', + }, + action: { + type: 'string', + description: 'Secondary accessibility action name', + }, + }, + required: ['app', 'element_index', 'action'], + }, + annotations: { + title: 'Secondary Action', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.SET_VALUE, + description: + 'Set the value of a settable accessibility element (e.g., slider, text field)', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + element_index: { + type: 'string', + description: 'Element identifier', + }, + value: { + type: 'string', + description: 'Value to assign', + }, + }, + required: ['app', 'element_index', 'value'], + }, + annotations: { + title: 'Set Value', + destructiveHint: false, + openWorldHint: 
false, + }, + }, + { + name: CU_TOOLS.SCROLL, + description: + 'Scroll an element in a direction by a number of pages', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + element_index: { + type: 'string', + description: 'Element identifier', + }, + direction: { + type: 'string', + description: 'Scroll direction: up, down, left, or right', + }, + pages: { + type: 'integer', + description: 'Number of page scroll actions (default: 1)', + }, + }, + required: ['app', 'element_index', 'direction'], + }, + annotations: { + title: 'Scroll', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.DRAG, + description: + 'Drag from one point to another using pixel coordinates', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + from_x: { + type: 'number', + description: 'Start X coordinate', + }, + from_y: { + type: 'number', + description: 'Start Y coordinate', + }, + to_x: { + type: 'number', + description: 'End X coordinate', + }, + to_y: { + type: 'number', + description: 'End Y coordinate', + }, + }, + required: ['app', 'from_x', 'from_y', 'to_x', 'to_y'], + }, + annotations: { + title: 'Drag', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.PRESS_KEY, + description: + 'Press a key or key-combination. 
Supports xdotool key syntax: "a", "Return", "Tab", "super+c", "Up"', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + key: { + type: 'string', + description: 'Key or key combination to press', + }, + }, + required: ['app', 'key'], + }, + annotations: { + title: 'Press Key', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.TYPE_TEXT, + description: + 'Type literal text using keyboard input', + inputSchema: { + type: 'object', + properties: { + app: { + type: 'string', + description: 'App name or bundle identifier', + }, + text: { + type: 'string', + description: 'Literal text to type', + }, + }, + required: ['app', 'text'], + }, + annotations: { + title: 'Type Text', + destructiveHint: false, + openWorldHint: false, + }, + }, + { + name: CU_TOOLS.STATUS, + description: + 'Check Computer Use service status, binary path, and connection health', + inputSchema: { + type: 'object', + properties: {}, + required: [], + }, + annotations: { + title: 'Computer Use Status', + readOnlyHint: true, + idempotentHint: true, + destructiveHint: false, + openWorldHint: false, + }, + }, +]; diff --git a/src/computer-use/discovery.ts b/src/computer-use/discovery.ts new file mode 100644 index 0000000..83575cf --- /dev/null +++ b/src/computer-use/discovery.ts @@ -0,0 +1,52 @@ +import { existsSync } from 'node:fs'; +import { execSync } from 'node:child_process'; +import { resolve, join } from 'node:path'; +import { type BinaryInfo } from './types.js'; + +const CODEX_APP_PATH = + '/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use'; +const CODEX_BINARY = join( + CODEX_APP_PATH, + 'Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' +); + +function findInPath(name: string): string | null { + try { + return execSync(`which ${name} 2>/dev/null`, { encoding: 'utf-8' }).trim(); + } catch { + 
return null; + } +} + +export function discoverBinary(): BinaryInfo { + // 1. Explicit env var override. + const envPath = process.env.CODEX_COMPUTER_USE_BINARY; + if (envPath) { + const resolved = resolve(envPath); + if (!existsSync(resolved)) { + throw new Error( + `CODEX_COMPUTER_USE_BINARY points to non-existent path: ${resolved}` + ); + } + return { path: resolved, type: 'custom', args: ['mcp'] }; + } + + // 2. open-computer-use in PATH (npm global install). + const npmPath = findInPath('open-computer-use'); + if (npmPath && existsSync(npmPath)) { + return { path: npmPath, type: 'npm-package', args: ['mcp'] }; + } + + // 3. Codex.app bundled binary. + if (existsSync(CODEX_BINARY)) { + return { path: CODEX_BINARY, type: 'codex-app', args: ['mcp'] }; + } + + throw new Error( + 'Computer Use binary not found.\n' + + 'Install one of:\n' + + ' npm install -g open-codex-computer-use-mcp\n' + + ' (then run: open-computer-use doctor)\n' + + 'Or set CODEX_COMPUTER_USE_BINARY env var to a custom path.' + ); +} diff --git a/src/computer-use/handlers.ts b/src/computer-use/handlers.ts new file mode 100644 index 0000000..c8b161d --- /dev/null +++ b/src/computer-use/handlers.ts @@ -0,0 +1,93 @@ +import { type ToolResult, type ToolHandlerContext } from '../types.js'; +import { ToolExecutionError, ValidationError } from '../errors.js'; +import { ZodError } from 'zod'; +import { + CU_TOOLS, + CU_TO_BINARY, + CU_SCHEMAS, +} from './types.js'; +import { ComputerUseBridge } from './bridge.js'; + +// Default no-op context for handlers that don't need progress. +const defaultContext: ToolHandlerContext = { + sendProgress: async () => {}, +}; + +export class ComputerUseToolHandler { + async execute( + toolName: string, + args: unknown, + context: ToolHandlerContext = defaultContext + ): Promise { + // Status tool works on any platform (reports binary not found). + if (toolName === CU_TOOLS.STATUS) { + return this.handleStatus(); + } + + // Guard: computer-use tools require macOS. 
+ if (process.platform !== 'darwin') { + throw new ToolExecutionError( + toolName, + 'Computer Use tools are only available on macOS' + ); + } + + // Validate tool name. + const binaryToolName = CU_TO_BINARY[toolName]; + if (!binaryToolName) { + throw new Error(`Unknown computer-use tool: ${toolName}`); + } + + try { + // Validate input with Zod. + const schema = CU_SCHEMAS[toolName]; + if (!schema) { + throw new Error(`No schema for tool: ${toolName}`); + } + const parsed = schema.parse(args); + + // Ensure bridge is initialized (lazy). + const bridge = ComputerUseBridge.getInstance(); + if (!bridge.isReady()) { + await context.sendProgress('Connecting to Computer Use binary...', 0); + await bridge.initialize(); + } + + const client = bridge.getClient(); + const result = await client.callTool(binaryToolName, parsed as Record); + return result; + } catch (error) { + if (error instanceof ValidationError) throw error; + if (error instanceof ZodError) { + throw new ValidationError(toolName, error.message); + } + throw new ToolExecutionError(toolName, 'Computer Use tool failed', error); + } + } + + private handleStatus(): ToolResult { + const bridge = ComputerUseBridge.getInstance(); + const binaryInfo = bridge.getBinaryInfo(); + const error = bridge.getError(); + + const status = { + connected: bridge.isReady(), + binary: binaryInfo + ? { + path: binaryInfo.path, + type: binaryInfo.type, + } + : null, + error: error?.message || null, + }; + + return { + content: [ + { + type: 'text', + text: JSON.stringify(status, null, 2), + }, + ], + }; + } +} diff --git a/src/computer-use/types.ts b/src/computer-use/types.ts new file mode 100644 index 0000000..7ac0623 --- /dev/null +++ b/src/computer-use/types.ts @@ -0,0 +1,140 @@ +import { z } from 'zod'; + +// Tool names exposed by the Codex computer-use binary. +// Mapped to our prefixed names to avoid collision with Claude Code's +// native computer-use module. 
+export const CU_TOOLS = {
+  LIST_APPS: 'cu_list_apps',
+  GET_APP_STATE: 'cu_get_app_state',
+  CLICK: 'cu_click',
+  PERFORM_SECONDARY_ACTION: 'cu_perform_secondary_action',
+  SET_VALUE: 'cu_set_value',
+  SCROLL: 'cu_scroll',
+  DRAG: 'cu_drag',
+  PRESS_KEY: 'cu_press_key',
+  TYPE_TEXT: 'cu_type_text',
+  // The only entry without a counterpart in BINARY_TOOLS / CU_TO_BINARY below.
+  STATUS: 'cu_status',
+} as const;
+
+/**
+ * Un-prefixed tool names, used as the `name` when a call is proxied to the
+ * subprocess. Keys mirror CU_TOOLS, minus STATUS.
+ */
+export const BINARY_TOOLS = {
+  LIST_APPS: 'list_apps',
+  GET_APP_STATE: 'get_app_state',
+  CLICK: 'click',
+  PERFORM_SECONDARY_ACTION: 'perform_secondary_action',
+  SET_VALUE: 'set_value',
+  SCROLL: 'scroll',
+  DRAG: 'drag',
+  PRESS_KEY: 'press_key',
+  TYPE_TEXT: 'type_text',
+} as const;
+
+/**
+ * Lookup from a prefixed `cu_*` name to the binary's tool name.
+ *
+ * Typed with plain `string` keys so it can be indexed with an arbitrary
+ * incoming tool name; a name that is not listed here (including
+ * `cu_status`) evaluates to `undefined` at runtime, so callers must check
+ * the result before using it.
+ */
+export const CU_TO_BINARY: Record<string, string> = {
+  [CU_TOOLS.LIST_APPS]: BINARY_TOOLS.LIST_APPS,
+  [CU_TOOLS.GET_APP_STATE]: BINARY_TOOLS.GET_APP_STATE,
+  [CU_TOOLS.CLICK]: BINARY_TOOLS.CLICK,
+  [CU_TOOLS.PERFORM_SECONDARY_ACTION]: BINARY_TOOLS.PERFORM_SECONDARY_ACTION,
+  [CU_TOOLS.SET_VALUE]: BINARY_TOOLS.SET_VALUE,
+  [CU_TOOLS.SCROLL]: BINARY_TOOLS.SCROLL,
+  [CU_TOOLS.DRAG]: BINARY_TOOLS.DRAG,
+  [CU_TOOLS.PRESS_KEY]: BINARY_TOOLS.PRESS_KEY,
+  [CU_TOOLS.TYPE_TEXT]: BINARY_TOOLS.TYPE_TEXT,
+};
+
+// Zod schemas matching the binary's exact input schemas (from tools/list).
+// Mandatory string field that must also be non-empty; `message` is the issue
+// text reported when an empty string is supplied.
+const nonEmpty = (message: string) => z.string().min(1, message);
+
+/** Arguments for `cu_list_apps`: none. */
+export const ListAppsSchema = z.object({});
+
+/** Arguments for `cu_get_app_state`: the target app, non-empty. */
+export const GetAppStateSchema = z.object({
+  app: nonEmpty('App name is required'),
+});
+
+/**
+ * Arguments for `cu_click`.
+ *
+ * Only `app` is mandatory. `element_index` and the `x`/`y` pair are each
+ * optional, so this schema does not by itself guarantee that a click target
+ * was supplied.
+ */
+export const ClickSchema = z.object({
+  app: nonEmpty('App name is required'),
+  element_index: z.string().optional(),
+  x: z.number().optional(),
+  y: z.number().optional(),
+  mouse_button: z.enum(['left', 'right', 'middle']).optional(),
+  click_count: z.int().optional(),
+});
+
+/** Arguments for `cu_perform_secondary_action`; all three fields are mandatory and non-empty. */
+export const PerformSecondaryActionSchema = z.object({
+  app: nonEmpty('App name is required'),
+  element_index: nonEmpty('Element index is required'),
+  action: nonEmpty('Action name is required'),
+});
+
+/**
+ * Arguments for `cu_set_value`.
+ *
+ * `value` must be present and a string, but may be empty: an empty string is
+ * presumably how a caller clears a field, though the binary's handling of it
+ * is not visible here. Because the key is not optional, omitting it is still
+ * rejected.
+ */
+export const SetValueSchema = z.object({
+  app: nonEmpty('App name is required'),
+  element_index: nonEmpty('Element index is required'),
+  value: z.string(),
+});
+
+/**
+ * Arguments for `cu_scroll`.
+ *
+ * `direction` is validated only as a non-empty string; the set of accepted
+ * directions is not enforced here. `pages` is an optional integer.
+ */
+export const ScrollSchema = z.object({
+  app: nonEmpty('App name is required'),
+  element_index: nonEmpty('Element index is required'),
+  direction: nonEmpty('Scroll direction is required'),
+  pages: z.int().optional(),
+});
+
+/** Arguments for `cu_drag`: the target app plus all four coordinates, each a mandatory number. */
+export const DragSchema = z.object({
+  app: nonEmpty('App name is required'),
+  from_x: z.number(),
+  from_y: z.number(),
+  to_x: z.number(),
+  to_y: z.number(),
+});
+
+/** Arguments for `cu_press_key`: target app and a non-empty key string. */
+export const PressKeySchema = z.object({
+  app: nonEmpty('App name is required'),
+  key: nonEmpty('Key is required'),
+});
+
+/** Arguments for `cu_type_text`: target app and the non-empty text to type. */
+export const TypeTextSchema = z.object({
+  app: nonEmpty('App name is required'),
+  text: nonEmpty('Text is required'),
+});
+
+/** Arguments for `cu_status`: none. */
+export const CuStatusSchema = z.object({});
+
+// Map tool name → schema for runtime validation.
+export const CU_SCHEMAS: Record = { + [CU_TOOLS.LIST_APPS]: ListAppsSchema, + [CU_TOOLS.GET_APP_STATE]: GetAppStateSchema, + [CU_TOOLS.CLICK]: ClickSchema, + [CU_TOOLS.PERFORM_SECONDARY_ACTION]: PerformSecondaryActionSchema, + [CU_TOOLS.SET_VALUE]: SetValueSchema, + [CU_TOOLS.SCROLL]: ScrollSchema, + [CU_TOOLS.DRAG]: DragSchema, + [CU_TOOLS.PRESS_KEY]: PressKeySchema, + [CU_TOOLS.TYPE_TEXT]: TypeTextSchema, + [CU_TOOLS.STATUS]: CuStatusSchema, +}; + +// Binary discovery info. +export interface BinaryInfo { + path: string; + type: 'npm-package' | 'codex-app' | 'custom'; + args: string[]; +} + +// JSON-RPC message types for direct stdio communication. +export interface JsonRpcRequest { + jsonrpc: '2.0'; + id: number; + method: string; + params?: Record; +} + +export interface JsonRpcNotification { + jsonrpc: '2.0'; + method: string; + params?: Record; +} + +export interface JsonRpcResponse { + jsonrpc: '2.0'; + id?: number; + result?: unknown; + error?: { code: number; message: string; data?: unknown }; +} diff --git a/src/tools/definitions.ts b/src/tools/definitions.ts index dd52a15..576ef62 100644 --- a/src/tools/definitions.ts +++ b/src/tools/definitions.ts @@ -1,4 +1,5 @@ import { TOOLS, getModelDescription, type ToolDefinition } from '../types.js'; +import { computerUseToolDefinitions } from '../computer-use/definitions.js'; export const toolDefinitions: ToolDefinition[] = [ { @@ -205,4 +206,5 @@ export const toolDefinitions: ToolDefinition[] = [ openWorldHint: true, }, }, + ...computerUseToolDefinitions, ]; diff --git a/src/tools/handlers.ts b/src/tools/handlers.ts index 9d7eabd..0ab68b4 100644 --- a/src/tools/handlers.ts +++ b/src/tools/handlers.ts @@ -24,6 +24,7 @@ import { ToolExecutionError, ValidationError } from '../errors.js'; import { executeCommand, executeCommandStreaming } from '../utils/command.js'; import { ZodError } from 'zod'; import path from 'node:path'; +import { ComputerUseToolHandler } from '../computer-use/handlers.js'; // Default no-op 
context for handlers that don't need progress const defaultContext: ToolHandlerContext = { @@ -577,6 +578,14 @@ export class WebSearchToolHandler { // Tool handler registry const sessionStorage = new InMemorySessionStorage(); +const computerUseHandler = new ComputerUseToolHandler(); + +// Wrap ComputerUseToolHandler to match the execute(args, context) signature +// expected by server.ts. Each wrapper injects the tool name. +const cuHandler = (toolName: string) => ({ + execute: (args: unknown, context: ToolHandlerContext) => + computerUseHandler.execute(toolName, args, context), +}); export const toolHandlers = { [TOOLS.CODEX]: new CodexToolHandler(sessionStorage), @@ -585,4 +594,14 @@ export const toolHandlers = { [TOOLS.HELP]: new HelpToolHandler(), [TOOLS.LIST_SESSIONS]: new ListSessionsToolHandler(sessionStorage), [TOOLS.WEBSEARCH]: new WebSearchToolHandler(), -} as const; + [TOOLS.CU_LIST_APPS]: cuHandler(TOOLS.CU_LIST_APPS), + [TOOLS.CU_GET_APP_STATE]: cuHandler(TOOLS.CU_GET_APP_STATE), + [TOOLS.CU_CLICK]: cuHandler(TOOLS.CU_CLICK), + [TOOLS.CU_PERFORM_SECONDARY_ACTION]: cuHandler(TOOLS.CU_PERFORM_SECONDARY_ACTION), + [TOOLS.CU_SET_VALUE]: cuHandler(TOOLS.CU_SET_VALUE), + [TOOLS.CU_SCROLL]: cuHandler(TOOLS.CU_SCROLL), + [TOOLS.CU_DRAG]: cuHandler(TOOLS.CU_DRAG), + [TOOLS.CU_PRESS_KEY]: cuHandler(TOOLS.CU_PRESS_KEY), + [TOOLS.CU_TYPE_TEXT]: cuHandler(TOOLS.CU_TYPE_TEXT), + [TOOLS.CU_STATUS]: cuHandler(TOOLS.CU_STATUS), +}; diff --git a/src/types.ts b/src/types.ts index a940f81..b177c1c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -8,6 +8,16 @@ export const TOOLS = { HELP: 'help', LIST_SESSIONS: 'listSessions', WEBSEARCH: 'websearch', + CU_LIST_APPS: 'cu_list_apps', + CU_GET_APP_STATE: 'cu_get_app_state', + CU_CLICK: 'cu_click', + CU_PERFORM_SECONDARY_ACTION: 'cu_perform_secondary_action', + CU_SET_VALUE: 'cu_set_value', + CU_SCROLL: 'cu_scroll', + CU_DRAG: 'cu_drag', + CU_PRESS_KEY: 'cu_press_key', + CU_TYPE_TEXT: 'cu_type_text', + CU_STATUS: 'cu_status', } 
as const; export type ToolName = typeof TOOLS[keyof typeof TOOLS]; @@ -67,8 +77,10 @@ export interface ToolDefinition { // Tool result interface matching MCP SDK expectations export interface ToolResult { content: Array<{ - type: 'text'; + type: 'text' | 'image'; text: string; + data?: string; + mimeType?: string; _meta?: Record; }>; structuredContent?: Record; From ba7d0282c5576cf9981c23277085f22668c0058c Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sat, 18 Apr 2026 22:32:53 -0700 Subject: [PATCH 2/6] docs: add computer use documentation Signed-off-by: Tommy Nguyen Entire-Checkpoint: de373f85600c --- .claude/commands/codex-setup.md | 57 ++++++++++ README.md | 17 ++- docs/api-reference.md | 101 ++++++++++++++++- docs/computer-use.md | 186 ++++++++++++++++++++++++++++++++ 4 files changed, 357 insertions(+), 4 deletions(-) create mode 100644 .claude/commands/codex-setup.md create mode 100644 docs/computer-use.md diff --git a/.claude/commands/codex-setup.md b/.claude/commands/codex-setup.md new file mode 100644 index 0000000..4e7c9b9 --- /dev/null +++ b/.claude/commands/codex-setup.md @@ -0,0 +1,57 @@ +# /codex-setup — Re-sign Codex Computer Use Binary + +Re-sign the Codex computer-use binary so Claude Code can launch it. This is needed when the binary has the hardened runtime flag, which prevents exec from a non-signed parent process (like node). + +## Steps + +1. 
**Find the binary** — check these locations in order: + - `$CODEX_COMPUTER_USE_BINARY` env var (if set) + - `open-computer-use` in PATH (`which open-computer-use`) + - Codex.app bundled binary at: + ``` + /Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient + ``` + + Run: + ```bash + if [ -n "$CODEX_COMPUTER_USE_BINARY" ]; then + echo "$CODEX_COMPUTER_USE_BINARY" + elif command -v open-computer-use &>/dev/null; then + which open-computer-use + elif [ -f "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" ]; then + echo "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" + else + echo "NOT_FOUND" + fi + ``` + +2. **Check current signature** — if the output includes `restricted` or `runtime`, re-signing is needed: + ```bash + codesign -dvvv 2>&1 | grep -E 'flags|runtime|Signature' + ``` + +3. **Re-sign with ad-hoc signature** — this removes the hardened runtime restriction: + ```bash + sudo codesign --force --deep --sign - "" + ``` + +4. **Verify** — confirm the new signature: + ```bash + codesign --verify --deep --strict 2>&1 + ``` + + Expected: `valid on disk` (no errors). + +5. **Test the binary** — confirm it starts and speaks JSON-RPC: + ```bash + echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"0.1.0"}}}' | timeout 5 "" mcp 2>/dev/null | head -1 + ``` + + Expected: A JSON-RPC response with `"result"` containing `"serverInfo"`. 
+ +## Notes + +- Requires `sudo` for re-signing (the binary lives in `/Applications`) +- The ad-hoc signature (`-` flag) is machine-local — it won't work if you move the binary to another Mac +- If Codex.app auto-updates, you may need to re-run this command +- The `open-codex-computer-use-mcp` npm package (v0.1.11+) includes a pre-signed binary that may not need this step diff --git a/README.md b/README.md index d297ef1..c84aa66 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,22 @@ [![npm downloads](https://img.shields.io/npm/dm/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) [![license](https://img.shields.io/npm/l/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) -Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. +Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. Also includes optional macOS computer control. 
```mermaid graph LR A[Claude Code] --> B[Codex MCP Server] B --> C[Codex CLI] C --> D[OpenAI API] + B --> E[Computer Use] + E --> F[macOS Apps] style A fill:#FF6B35 style B fill:#4A90E2 style C fill:#00D4AA style D fill:#FFA500 + style E fill:#9B59B6 + style F fill:#E74C3C ``` ## Quick Start @@ -54,6 +58,7 @@ Use review to check my uncommitted changes | `codex` | AI coding assistant with session support, model selection, and structured output metadata | | `review` | AI-powered code review for uncommitted changes, branches, or commits | | `websearch` | Web search using Codex CLI with customizable result count and search depth | +| `cu_*` | 10 macOS computer-use tools — control apps via screenshots and accessibility ([setup](docs/computer-use.md)) | | `listSessions` | View active conversation sessions | | `ping` | Test server connection | | `help` | Get Codex CLI help | @@ -90,8 +95,13 @@ Use codex to return structuredContent with threadId metadata when available **Web search:** ``` Use websearch with query "TypeScript 5.8 new features" -Use websearch with query "Rust vs Go performance 2025" and numResults 15 -Use websearch with query "React Server Components" and searchDepth "full" +``` + +**Computer Use (macOS):** +``` +Use cu_status to check connection +Use cu_get_app_state with app "Finder" to see the desktop +Use cu_click with app "Safari" and element_index "42" to click a link ``` ## Requirements @@ -105,6 +115,7 @@ Use websearch with query "React Server Components" and searchDepth "full" ## Documentation - **[API Reference](docs/api-reference.md)** — Full tool parameters and response formats +- **[Computer Use](docs/computer-use.md)** — macOS setup, tools, and troubleshooting - **[Session Management](docs/session-management.md)** — How conversations work - **[Codex CLI Integration](docs/codex-cli-integration.md)** — Version compatibility and CLI details diff --git a/docs/api-reference.md b/docs/api-reference.md index 666a298..34dcd29 100644 --- 
a/docs/api-reference.md +++ b/docs/api-reference.md @@ -48,9 +48,20 @@ All tools include annotations that provide hints to MCP clients about tool behav |------|---------|---------------|-------------------|------------------|-----------------| | `codex` | Execute Codex CLI | `false` | `true` | `false` | `true` | | `review` | Code Review | `true` | `false` | `true` | `true` | +| `websearch` | Web Search | `true` | `false` | `true` | `true` | | `ping` | Ping Server | `true` | `false` | `true` | `false` | | `help` | Get Help | `true` | `false` | `true` | `false` | | `listSessions` | List Sessions | `true` | `false` | `true` | `false` | +| `cu_list_apps` | List Apps | `true` | `false` | `true` | `false` | +| `cu_get_app_state` | Get App State | `true` | `false` | `true` | `false` | +| `cu_status` | Computer Use Status | `true` | `false` | `true` | `false` | +| `cu_click` | Click | — | `false` | — | `false` | +| `cu_type_text` | Type Text | — | `false` | — | `false` | +| `cu_press_key` | Press Key | — | `false` | — | `false` | +| `cu_scroll` | Scroll | — | `false` | — | `false` | +| `cu_drag` | Drag | — | `false` | — | `false` | +| `cu_set_value` | Set Value | — | `false` | — | `false` | +| `cu_perform_secondary_action` | Secondary Action | — | `false` | — | `false` | ### Progress Notifications For long-running operations, the server sends `notifications/progress` messages when the client includes a `progressToken` in the request `_meta`. 
@@ -439,4 +450,92 @@ Optional: ### Optional Configuration - **CODEX_HOME**: Custom directory for Codex CLI configuration - **Session Limits**: Configurable in server implementation (default: 100) -- **TTL Settings**: Configurable session expiration (default: 24 hours) \ No newline at end of file +- **TTL Settings**: Configurable session expiration (default: 24 hours) +- **CODEX_COMPUTER_USE_BINARY**: Path to computer-use binary (macOS only) +- **NODE_DEBUG**: Include `computer-use` to forward binary stderr + +## Computer Use Tools + +macOS-only tools for controlling apps via accessibility APIs. See [Computer Use](computer-use.md) for setup instructions. + +### `cu_status` — Health Check + +**Annotations:** `readOnlyHint: true`, `idempotentHint: true` + +No parameters. Returns binary path, type, connection status, and any error. + +### `cu_list_apps` — List Running Apps + +**Annotations:** `readOnlyHint: true`, `idempotentHint: true` + +No parameters. Returns list of running and recently used macOS apps. + +### `cu_get_app_state` — Screenshot & Accessibility Tree + +**Annotations:** `readOnlyHint: true`, `idempotentHint: true` + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | + +Returns screenshot (base64 image) and accessibility tree with element indices. 
+ +### `cu_click` — Click Element + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `element_index` | string | no | Element index from accessibility tree | +| `x` | number | no | X coordinate (pixels) | +| `y` | number | no | Y coordinate (pixels) | +| `mouse_button` | enum | no | `left`, `right`, or `middle` | +| `click_count` | integer | no | Number of clicks (default: 1) | + +### `cu_type_text` — Type Text + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `text` | string | yes | Literal text to type | + +### `cu_press_key` — Key Press + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `key` | string | yes | Key or combo (`a`, `Return`, `cmd+s`, `Up`) | + +### `cu_scroll` — Scroll + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `element_index` | string | yes | Element identifier | +| `direction` | string | yes | `up`, `down`, `left`, or `right` | +| `pages` | integer | no | Page scroll count (default: 1) | + +### `cu_drag` — Drag + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `from_x` | number | yes | Start X coordinate | +| `from_y` | number | yes | Start Y coordinate | +| `to_x` | number | yes | End X coordinate | +| `to_y` | number | yes | End Y coordinate | + +### `cu_set_value` — Set Value + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `element_index` | string | yes | Element identifier | +| `value` | string | yes | Value 
to assign | + +### `cu_perform_secondary_action` — Secondary Action + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `app` | string | yes | App name or bundle identifier | +| `element_index` | string | yes | Element identifier | +| `action` | string | yes | Action name (`toggle`, `expand`, `pick`, etc.) | \ No newline at end of file diff --git a/docs/computer-use.md b/docs/computer-use.md new file mode 100644 index 0000000..65fd08b --- /dev/null +++ b/docs/computer-use.md @@ -0,0 +1,186 @@ +# Computer Use + +OS-level macOS computer control via Codex's accessibility binary. Control running apps through screenshots, accessibility trees, clicks, typing, and keyboard shortcuts. + +## Architecture + +``` +Claude Code + → codex-mcp-server + → ComputerUseBridge (singleton, lazy init) + → open-computer-use / SkyComputerUseClient (subprocess) + → stdio JSON-RPC (newline-delimited) + → macOS Accessibility APIs +``` + +The bridge spawns the binary once and maintains a persistent connection. Binary discovery happens on the first computer-use tool call — no impact on codex/review startup. + +## Binary Setup + +### Option 1: npm package (recommended) + +```bash +npm install -g open-codex-computer-use-mcp +open-computer-use doctor +``` + +The `doctor` command verifies the binary runs and can communicate. + +### Option 2: Codex.app bundled binary + +If Codex.app is installed at `/Applications/Codex.app`, the `SkyComputerUseClient` binary is detected automatically. 
+ +If it fails to launch (hardened runtime restriction), re-sign it: + +```bash +sudo codesign --force --deep --sign - \ + "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +``` + +### Custom binary path + +Set `CODEX_COMPUTER_USE_BINARY` to any binary that speaks the same protocol: + +```json +{ + "mcpServers": { + "codex-cli": { + "command": "npx", + "args": ["-y", "codex-mcp-server"], + "env": { + "CODEX_COMPUTER_USE_BINARY": "/usr/local/bin/open-computer-use" + } + } + } +} +``` + +## Discovery Priority + +1. `CODEX_COMPUTER_USE_BINARY` env var (explicit override) +2. `open-computer-use` in PATH (npm global install) +3. Codex.app bundled `SkyComputerUseClient` + +## Protocol + +The binary speaks bare **newline-delimited JSON-RPC** (not Content-Length framed). The bridge handles: + +1. MCP `initialize` handshake (`protocolVersion: "2024-11-05"`) +2. `notifications/initialized` confirmation +3. `tools/call` requests with request/response correlation via incrementing `id` + +**Timeouts**: 30s default, 60s for `get_app_state` (screenshots take longer). + +## Tools + +### `cu_status` — Health Check + +Works on any platform (no binary needed). Returns binary path, discovery type, and connection status. + +```json +{ "connected": true, "binary": { "path": "/usr/local/bin/open-computer-use", "type": "npm-package" }, "error": null } +``` + +### `cu_list_apps` — List Apps + +Returns running and recently used macOS apps. Call this first to discover what's available. + +```json +{ "app": "Finder" } +``` + +### `cu_get_app_state` — Screenshot + Accessibility Tree + +Returns a screenshot (as base64 image data) and the accessibility tree with element indices. Must be called once per turn before interacting with an app. 
+ +```json +{ "app": "Safari" } +``` + +Response includes both `text` (accessibility tree) and `image` content (screenshot). + +### `cu_click` — Click Element + +Click by accessibility index (preferred) or pixel coordinates. + +```json +{ "app": "Safari", "element_index": "42" } +{ "app": "Finder", "x": 100, "y": 200, "mouse_button": "right", "click_count": 2 } +``` + +### `cu_type_text` — Type Text + +Type literal text into a focused input field. + +```json +{ "app": "Terminal", "text": "npm run build" } +``` + +### `cu_press_key` — Key Press + +Press a key or key-combination using xdotool-style syntax. + +```json +{ "app": "VSCode", "key": "cmd+s" } +{ "app": "Finder", "key": "cmd+shift+n" } +{ "app": "Terminal", "key": "Return" } +``` + +### `cu_scroll` — Scroll Element + +Scroll in a direction by a number of pages. + +```json +{ "app": "Safari", "element_index": "5", "direction": "down", "pages": 3 } +``` + +### `cu_drag` — Drag + +Drag from one pixel coordinate to another. + +```json +{ "app": "Finder", "from_x": 100, "from_y": 100, "to_x": 300, "to_y": 300 } +``` + +### `cu_set_value` — Set Value + +Set the value of a slider, text field, or other settable accessibility element. + +```json +{ "app": "System Preferences", "element_index": "12", "value": "80" } +``` + +### `cu_perform_secondary_action` — Secondary Action + +Invoke a secondary accessibility action (toggle, expand, pick, etc.). + +```json +{ "app": "Finder", "element_index": "8", "action": "toggle" } +``` + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `CODEX_COMPUTER_USE_BINARY` | Path to binary (overrides auto-discovery) | +| `NODE_DEBUG` | Include `computer-use` to forward binary stderr | + +## Troubleshooting + +**"Computer Use tools are only available on macOS"** +These tools require macOS accessibility APIs. They won't work on Linux or Windows. 
+ +**"Computer Use binary not found"** +Install via npm (`npm install -g open-codex-computer-use-mcp`), install Codex.app, or set `CODEX_COMPUTER_USE_BINARY`. + +**"Binary spawn failed" or "initialize failed"** +The binary likely needs re-signing. Run `/codex-setup` in Claude Code, or manually: +```bash +sudo codesign --force --deep --sign - +``` + +**"Request timed out"** +`get_app_state` has a 60s timeout; other tools have 30s. The app may be unresponsive. + +**No screenshot in `get_app_state` response** +Some apps don't expose their window content to accessibility. This is an OS limitation. From 75847355d2aa91d3266de032c7e6d24b6fb39452 Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sat, 18 Apr 2026 23:06:05 -0700 Subject: [PATCH 3/6] docs(computer-use): update Codex.app binary setup docs Signed-off-by: Tommy Nguyen Entire-Checkpoint: d5b3202aaa87 --- .claude/commands/codex-setup.md | 136 ++++++++++++------- docs/computer-use.md | 48 +++---- src/computer-use/__tests__/discovery.test.ts | 19 +-- src/computer-use/discovery.ts | 22 +-- src/computer-use/types.ts | 2 +- 5 files changed, 107 insertions(+), 120 deletions(-) diff --git a/.claude/commands/codex-setup.md b/.claude/commands/codex-setup.md index 4e7c9b9..9930247 100644 --- a/.claude/commands/codex-setup.md +++ b/.claude/commands/codex-setup.md @@ -1,57 +1,89 @@ # /codex-setup — Re-sign Codex Computer Use Binary -Re-sign the Codex computer-use binary so Claude Code can launch it. This is needed when the binary has the hardened runtime flag, which prevents exec from a non-signed parent process (like node). +Re-sign the Codex.app SkyComputerUseClient binary so Claude Code can launch it. Required when the binary has the hardened runtime flag (Apple event error -10000). + +## Why This Is Needed + +Codex.app ships with a hardened runtime that prevents other processes (like Node.js) from sending it Apple Events. Ad-hoc re-signing removes this restriction. 
This only changes the copy on your local machine, but it does weaken that binary's protections: the vendor signature is replaced and the hardened runtime is turned off, so only do it on a machine you trust. + +**You must re-run this after every Codex.app update.** Auto-updates replace the binary, which resets the signature. If `cu_*` tools suddenly fail with `Apple event error -10000`, re-sign. + +## Binary Path + +``` +/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient +``` ## Steps -1. **Find the binary** — check these locations in order: - - `$CODEX_COMPUTER_USE_BINARY` env var (if set) - - `open-computer-use` in PATH (`which open-computer-use`) - - Codex.app bundled binary at: - ``` - /Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient - ``` - - Run: - ```bash - if [ -n "$CODEX_COMPUTER_USE_BINARY" ]; then - echo "$CODEX_COMPUTER_USE_BINARY" - elif command -v open-computer-use &>/dev/null; then - which open-computer-use - elif [ -f "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" ]; then - echo "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" - else - echo "NOT_FOUND" - fi - ``` - -2. **Check current signature** — if the output includes `restricted` or `runtime`, re-signing is needed: - ```bash - codesign -dvvv 2>&1 | grep -E 'flags|runtime|Signature' - ``` - -3. **Re-sign with ad-hoc signature** — this removes the hardened runtime restriction: - ```bash - sudo codesign --force --deep --sign - "" - ``` - -4. 
**Verify** — confirm the new signature: - ```bash - codesign --verify --deep --strict 2>&1 - ``` - - Expected: `valid on disk` (no errors). - -5. **Test the binary** — confirm it starts and speaks JSON-RPC: - ```bash - echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"0.1.0"}}}' | timeout 5 "" mcp 2>/dev/null | head -1 - ``` - - Expected: A JSON-RPC response with `"result"` containing `"serverInfo"`. - -## Notes - -- Requires `sudo` for re-signing (the binary lives in `/Applications`) -- The ad-hoc signature (`-` flag) is machine-local — it won't work if you move the binary to another Mac -- If Codex.app auto-updates, you may need to re-run this command -- The `open-codex-computer-use-mcp` npm package (v0.1.11+) includes a pre-signed binary that may not need this step +Follow these steps in order. Run each command and check the output before proceeding. + +### 1. Check if Codex.app is installed + +```bash +ls "/Applications/Codex.app" &>/dev/null && echo "Codex.app found" || echo "Codex.app NOT found — install from https://codex.ai" +``` + +If not found, stop and tell the user to install Codex.app first. + +### 2. Check the binary exists + +```bash +BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +ls "$BINARY" &>/dev/null && echo "Binary found" || echo "Binary NOT found — Codex.app may need reinstall" +``` + +### 3. 
Check current signature status + +```bash +BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +codesign -dvvv "$BINARY" 2>&1 | grep -E 'flags=|Signature|runtime' +``` + +Look for `flags=0x10000(runtime)` (or any flags value whose description includes `runtime`) — that means the hardened runtime is enabled and re-signing is needed. If it shows `Signature=adhoc`, it has probably already been re-signed. + +### 4. Re-sign with ad-hoc signature + +Tell the user this requires `sudo` and their password, then run: + +```bash +sudo codesign --force --deep --sign - "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +``` + +If this fails, the binary may be protected by SIP or Codex.app may not be installed properly. + +### 5. Verify the signature + +```bash +BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +codesign --verify --deep --strict "$BINARY" 2>&1 +``` + +Expected: no output and exit status 0 (add `-v` to have it print `valid on disk`). Any error message means re-signing failed. + +### 6. 
Test the binary responds to JSON-RPC + +```bash +BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" +echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"0.1.0"}}}' | timeout 5 "$BINARY" mcp 2>/dev/null | head -1 +``` + +Expected: a JSON-RPC response containing `"serverInfo"`. If no output or error, the binary can't start. Note that `timeout` comes from GNU coreutils and is not part of stock macOS; if it is missing, install coreutils via Homebrew (`gtimeout`) or drop `timeout 5` from the command. + +### 7. 
Confirm success + +Tell the user: +- The binary is signed and working +- `cu_*` tools should now work in Claude Code +- **Reminder:** Re-run `/codex-setup` after any Codex.app update (auto-update or manual) +- Consider adding a calendar reminder or note to re-sign after updates + +## Troubleshooting + +| Symptom | Cause | Fix | +|---------|-------|-----| +| `Apple event error -10000` | Hardened runtime not removed | Re-run step 4 | +| `Binary NOT found` | Codex.app updated and changed path | Reinstall Codex.app | +| `operation not permitted` | SIP or permission issue | Run from an admin terminal | +| No JSON-RPC response in step 6 | Binary crashed on start | Check Console.app or `~/Library/Logs/DiagnosticReports` for crash reports | +| Works in Terminal but not Claude Code | Claude Code spawns a different process | Restart Claude Code after re-signing | diff --git a/docs/computer-use.md b/docs/computer-use.md index 65fd08b..4a016c5 100644 --- a/docs/computer-use.md +++ b/docs/computer-use.md @@ -8,38 +8,31 @@ OS-level macOS computer control via Codex's accessibility binary. Control runnin Claude Code → codex-mcp-server → ComputerUseBridge (singleton, lazy init) - → open-computer-use / SkyComputerUseClient (subprocess) + → SkyComputerUseClient (Codex.app subprocess) → stdio JSON-RPC (newline-delimited) → macOS Accessibility APIs ``` The bridge spawns the binary once and maintains a persistent connection. Binary discovery happens on the first computer-use tool call — no impact on codex/review startup. -## Binary Setup +## Setup -### Option 1: npm package (recommended) +Requires [Codex.app](https://codex.ai) installed. The `SkyComputerUseClient` binary is detected automatically from `/Applications/Codex.app`. -```bash -npm install -g open-codex-computer-use-mcp -open-computer-use doctor -``` - -The `doctor` command verifies the binary runs and can communicate. 
- -### Option 2: Codex.app bundled binary - -If Codex.app is installed at `/Applications/Codex.app`, the `SkyComputerUseClient` binary is detected automatically. - -If it fails to launch (hardened runtime restriction), re-sign it: +If you get `Apple event error -10000`, the binary needs re-signing: ```bash sudo codesign --force --deep --sign - \ "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" ``` +Or run `/codex-setup` in Claude Code to do this automatically. + +**Note:** Re-signing is machine-local. Re-run after Codex.app auto-updates. + ### Custom binary path -Set `CODEX_COMPUTER_USE_BINARY` to any binary that speaks the same protocol: +Set `CODEX_COMPUTER_USE_BINARY` to override auto-discovery: ```json { @@ -48,7 +41,7 @@ Set `CODEX_COMPUTER_USE_BINARY` to any binary that speaks the same protocol: "command": "npx", "args": ["-y", "codex-mcp-server"], "env": { - "CODEX_COMPUTER_USE_BINARY": "/usr/local/bin/open-computer-use" + "CODEX_COMPUTER_USE_BINARY": "/path/to/SkyComputerUseClient" } } } @@ -58,8 +51,7 @@ Set `CODEX_COMPUTER_USE_BINARY` to any binary that speaks the same protocol: ## Discovery Priority 1. `CODEX_COMPUTER_USE_BINARY` env var (explicit override) -2. `open-computer-use` in PATH (npm global install) -3. Codex.app bundled `SkyComputerUseClient` +2. Codex.app bundled `SkyComputerUseClient` ## Protocol @@ -78,20 +70,16 @@ The binary speaks bare **newline-delimited JSON-RPC** (not Content-Length framed Works on any platform (no binary needed). Returns binary path, discovery type, and connection status. 
```json -{ "connected": true, "binary": { "path": "/usr/local/bin/open-computer-use", "type": "npm-package" }, "error": null } +{ "connected": true, "binary": { "path": "/Applications/.../SkyComputerUseClient", "type": "codex-app" }, "error": null } ``` ### `cu_list_apps` — List Apps Returns running and recently used macOS apps. Call this first to discover what's available. -```json -{ "app": "Finder" } -``` - ### `cu_get_app_state` — Screenshot + Accessibility Tree -Returns a screenshot (as base64 image data) and the accessibility tree with element indices. Must be called once per turn before interacting with an app. +Returns a screenshot (base64 image data) and the accessibility tree with element indices. Must be called once per turn before interacting with an app. ```json { "app": "Safari" } @@ -122,7 +110,6 @@ Press a key or key-combination using xdotool-style syntax. ```json { "app": "VSCode", "key": "cmd+s" } -{ "app": "Finder", "key": "cmd+shift+n" } { "app": "Terminal", "key": "Return" } ``` @@ -171,13 +158,10 @@ Invoke a secondary accessibility action (toggle, expand, pick, etc.). These tools require macOS accessibility APIs. They won't work on Linux or Windows. **"Computer Use binary not found"** -Install via npm (`npm install -g open-codex-computer-use-mcp`), install Codex.app, or set `CODEX_COMPUTER_USE_BINARY`. +Install Codex.app from https://codex.ai, or set `CODEX_COMPUTER_USE_BINARY` to a custom path. -**"Binary spawn failed" or "initialize failed"** -The binary likely needs re-signing. Run `/codex-setup` in Claude Code, or manually: -```bash -sudo codesign --force --deep --sign - -``` +**"Apple event error -10000: Sender process is not authenticated"** +The binary needs re-signing. Run `/codex-setup` or the codesign command above. **"Request timed out"** `get_app_state` has a 60s timeout; other tools have 30s. The app may be unresponsive. 
diff --git a/src/computer-use/__tests__/discovery.test.ts b/src/computer-use/__tests__/discovery.test.ts index c0153f6..3c9cc23 100644 --- a/src/computer-use/__tests__/discovery.test.ts +++ b/src/computer-use/__tests__/discovery.test.ts @@ -1,28 +1,17 @@ import { discoverBinary } from '../discovery.js'; -import { execSync } from 'node:child_process'; import { existsSync } from 'node:fs'; -const hasNpmBinary = (() => { - try { - return !!execSync('which open-computer-use 2>/dev/null', { encoding: 'utf-8' }).trim(); - } catch { - return false; - } -})(); - const hasCodexBinary = existsSync( '/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' ); -const hasBinary = hasNpmBinary || hasCodexBinary; - describe('Binary Discovery', () => { - (hasBinary ? describe : describe.skip)('with binary available', () => { - test('should discover a binary', () => { + (hasCodexBinary ? 
describe : describe.skip)('with Codex.app installed', () => { + test('should discover the Codex.app binary', () => { const info = discoverBinary(); - expect(info.path).toBeDefined(); + expect(info.path).toContain('SkyComputerUseClient'); expect(info.args).toEqual(['mcp']); - expect(['npm-package', 'codex-app', 'custom']).toContain(info.type); + expect(['codex-app', 'custom']).toContain(info.type); }); }); diff --git a/src/computer-use/discovery.ts b/src/computer-use/discovery.ts index 83575cf..76d3759 100644 --- a/src/computer-use/discovery.ts +++ b/src/computer-use/discovery.ts @@ -1,5 +1,4 @@ import { existsSync } from 'node:fs'; -import { execSync } from 'node:child_process'; import { resolve, join } from 'node:path'; import { type BinaryInfo } from './types.js'; @@ -10,14 +9,6 @@ const CODEX_BINARY = join( 'Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' ); -function findInPath(name: string): string | null { - try { - return execSync(`which ${name} 2>/dev/null`, { encoding: 'utf-8' }).trim(); - } catch { - return null; - } -} - export function discoverBinary(): BinaryInfo { // 1. Explicit env var override. const envPath = process.env.CODEX_COMPUTER_USE_BINARY; @@ -31,22 +22,13 @@ export function discoverBinary(): BinaryInfo { return { path: resolved, type: 'custom', args: ['mcp'] }; } - // 2. open-computer-use in PATH (npm global install). - const npmPath = findInPath('open-computer-use'); - if (npmPath && existsSync(npmPath)) { - return { path: npmPath, type: 'npm-package', args: ['mcp'] }; - } - - // 3. Codex.app bundled binary. + // 2. Codex.app bundled binary (SkyComputerUseClient). 
if (existsSync(CODEX_BINARY)) { return { path: CODEX_BINARY, type: 'codex-app', args: ['mcp'] }; } throw new Error( 'Computer Use binary not found.\n' + - 'Install one of:\n' + - ' npm install -g open-codex-computer-use-mcp\n' + - ' (then run: open-computer-use doctor)\n' + - 'Or set CODEX_COMPUTER_USE_BINARY env var to a custom path.' + 'Install Codex.app from https://codex.ai or set CODEX_COMPUTER_USE_BINARY env var.' ); } diff --git a/src/computer-use/types.ts b/src/computer-use/types.ts index 7ac0623..a63d753 100644 --- a/src/computer-use/types.ts +++ b/src/computer-use/types.ts @@ -114,7 +114,7 @@ export const CU_SCHEMAS: Record = { // Binary discovery info. export interface BinaryInfo { path: string; - type: 'npm-package' | 'codex-app' | 'custom'; + type: 'codex-app' | 'custom'; args: string[]; } From 7255d86664b00d71b9de0d58b3304156e7d9f85f Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sun, 19 Apr 2026 11:16:53 -0700 Subject: [PATCH 4/6] feat(browser-use): replace macOS computer-use with Playwright browser automation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove dead src/computer-use/ module (macOS binary locked behind team ID check) and add cross-platform browser automation via Playwright. New module src/browser-use/ with 10 MCP tools: browser_launch, browser_screenshot, browser_click, browser_type, browser_scroll, browser_drag, browser_key, browser_navigate, browser_close, browser_status. - Singleton BrowserUseBridge manages concurrent sessions - Playwright is a peer dependency (lazy init, graceful errors) - SIGINT/SIGTERM handlers clean up browser sessions on shutdown - Key combinations auto-normalized (Cmd→Meta, Ctrl→Control, etc.) 
- 86 tests pass, lint clean Co-authored-by: Claude --- .claude/commands/codex-setup.md | 89 ------ README.md | 19 +- docs/api-reference.md | 125 +++++---- docs/browser-use.md | 127 +++++++++ docs/computer-use.md | 170 ------------ jest.config.mjs | 1 + src/__tests__/index.test.ts | 169 +++++++++++- src/browser-use/bridge.ts | 106 +++++++ src/browser-use/client.ts | 99 +++++++ src/browser-use/definitions.ts | 162 +++++++++++ src/browser-use/handlers.ts | 131 +++++++++ src/browser-use/playwright.d.ts | 34 +++ src/browser-use/types.ts | 118 ++++++++ src/computer-use/__tests__/discovery.test.ts | 31 --- src/computer-use/__tests__/handlers.test.ts | 51 ---- src/computer-use/__tests__/types.test.ts | 125 --------- src/computer-use/bridge.ts | 92 ------- src/computer-use/client.ts | 183 ------------- src/computer-use/definitions.ts | 274 ------------------- src/computer-use/discovery.ts | 34 --- src/computer-use/handlers.ts | 93 ------- src/computer-use/types.ts | 140 ---------- src/index.ts | 13 + src/tools/definitions.ts | 4 +- src/tools/handlers.ts | 37 +-- src/types.ts | 20 +- 26 files changed, 1065 insertions(+), 1382 deletions(-) delete mode 100644 .claude/commands/codex-setup.md create mode 100644 docs/browser-use.md delete mode 100644 docs/computer-use.md create mode 100644 src/browser-use/bridge.ts create mode 100644 src/browser-use/client.ts create mode 100644 src/browser-use/definitions.ts create mode 100644 src/browser-use/handlers.ts create mode 100644 src/browser-use/playwright.d.ts create mode 100644 src/browser-use/types.ts delete mode 100644 src/computer-use/__tests__/discovery.test.ts delete mode 100644 src/computer-use/__tests__/handlers.test.ts delete mode 100644 src/computer-use/__tests__/types.test.ts delete mode 100644 src/computer-use/bridge.ts delete mode 100644 src/computer-use/client.ts delete mode 100644 src/computer-use/definitions.ts delete mode 100644 src/computer-use/discovery.ts delete mode 100644 src/computer-use/handlers.ts delete mode 
100644 src/computer-use/types.ts diff --git a/.claude/commands/codex-setup.md b/.claude/commands/codex-setup.md deleted file mode 100644 index 9930247..0000000 --- a/.claude/commands/codex-setup.md +++ /dev/null @@ -1,89 +0,0 @@ -# /codex-setup — Re-sign Codex Computer Use Binary - -Re-sign the Codex.app SkyComputerUseClient binary so Claude Code can launch it. Required when the binary has the hardened runtime flag (Apple event error -10000). - -## Why This Is Needed - -Codex.app ships with a hardened runtime that prevents other processes (like Node.js) from sending it Apple Events. Ad-hoc re-signing removes this restriction. This is **not** a security risk — it only affects your local machine. - -**You must re-run this after every Codex.app update.** Auto-updates replace the binary, which resets the signature. If `cu_*` tools suddenly fail with `Apple event error -10000`, re-sign. - -## Binary Path - -``` -/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient -``` - -## Steps - -Follow these steps in order. Run each command and check the output before proceeding. - -### 1. Check if Codex.app is installed - -```bash -ls "/Applications/Codex.app" &>/dev/null && echo "Codex.app found" || echo "Codex.app NOT found — install from https://codex.ai" -``` - -If not found, stop and tell the user to install Codex.app first. - -### 2. Check the binary exists - -```bash -BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -ls "$BINARY" &>/dev/null && echo "Binary found" || echo "Binary NOT found — Codex.app may need reinstall" -``` - -### 3. 
Check current signature status - -```bash -BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -codesign -dvvv "$BINARY" 2>&1 | grep -E 'flags=|Signature|runtime' -``` - -Look for `flags=0x20002` (or similar with `runtime` bit set) — that means re-signing is needed. If it shows `Signature=adhoc`, it may already be signed. - -### 4. Re-sign with ad-hoc signature - -Tell the user this requires `sudo` and their password, then run: - -```bash -sudo codesign --force --deep --sign - "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -``` - -If this fails, the binary may be protected by SIP or Codex.app may not be installed properly. - -### 5. Verify the signature - -```bash -BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -codesign --verify --deep --strict "$BINARY" 2>&1 -``` - -Expected output: `BINARY: valid on disk` (no errors). Any error means re-signing failed. - -### 6. Test the binary responds to JSON-RPC - -```bash -BINARY="/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -echo '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"0.1.0"}}}' | timeout 5 "$BINARY" mcp 2>/dev/null | head -1 -``` - -Expected: a JSON-RPC response containing `"serverInfo"`. If no output or error, the binary can't start. - -### 7. 
Confirm success - -Tell the user: -- The binary is signed and working -- `cu_*` tools should now work in Claude Code -- **Reminder:** Re-run `/codex-setup` after any Codex.app update (auto-update or manual) -- Consider adding a calendar reminder or note to re-sign after updates - -## Troubleshooting - -| Symptom | Cause | Fix | -|---------|-------|-----| -| `Apple event error -10000` | Hardened runtime not removed | Re-run step 4 | -| `Binary NOT found` | Codex.app updated and changed path | Reinstall Codex.app | -| `operation not permitted` | SIP or permission issue | Run from an admin terminal | -| No JSON-RPC response in step 6 | Binary crashed on start | Check `/var/log/system.log` for crash reports | -| Works in Terminal but not Claude Code | Claude Code spawns a different process | Restart Claude Code after re-signing | diff --git a/README.md b/README.md index c84aa66..f72e254 100644 --- a/README.md +++ b/README.md @@ -4,15 +4,15 @@ [![npm downloads](https://img.shields.io/npm/dm/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) [![license](https://img.shields.io/npm/l/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) -Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. Also includes optional macOS computer control. +Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. Also includes optional browser automation via Playwright. 
```mermaid graph LR A[Claude Code] --> B[Codex MCP Server] B --> C[Codex CLI] C --> D[OpenAI API] - B --> E[Computer Use] - E --> F[macOS Apps] + B --> E[Browser Use] + E --> F[Chromium] style A fill:#FF6B35 style B fill:#4A90E2 @@ -58,7 +58,7 @@ Use review to check my uncommitted changes | `codex` | AI coding assistant with session support, model selection, and structured output metadata | | `review` | AI-powered code review for uncommitted changes, branches, or commits | | `websearch` | Web search using Codex CLI with customizable result count and search depth | -| `cu_*` | 10 macOS computer-use tools — control apps via screenshots and accessibility ([setup](docs/computer-use.md)) | +| `browser_*` | 10 browser automation tools — launch, screenshot, click, type, scroll, drag, navigate ([setup](docs/browser-use.md)) | | `listSessions` | View active conversation sessions | | `ping` | Test server connection | | `help` | Get Codex CLI help | @@ -97,11 +97,12 @@ Use codex to return structuredContent with threadId metadata when available Use websearch with query "TypeScript 5.8 new features" ``` -**Computer Use (macOS):** +**Browser Automation:** ``` -Use cu_status to check connection -Use cu_get_app_state with app "Finder" to see the desktop -Use cu_click with app "Safari" and element_index "42" to click a link +Use browser_status to check if Playwright is available +Use browser_launch with sessionId "web1" and url "https://example.com" +Use browser_screenshot with sessionId "web1" to see the page +Use browser_click with sessionId "web1", x 100, and y 200 ``` ## Requirements @@ -115,7 +116,7 @@ Use cu_click with app "Safari" and element_index "42" to click a link ## Documentation - **[API Reference](docs/api-reference.md)** — Full tool parameters and response formats -- **[Computer Use](docs/computer-use.md)** — macOS setup, tools, and troubleshooting +- **[Browser Use](docs/browser-use.md)** — Playwright setup, tools, and troubleshooting - **[Session 
Management](docs/session-management.md)** — How conversations work - **[Codex CLI Integration](docs/codex-cli-integration.md)** — Version compatibility and CLI details diff --git a/docs/api-reference.md b/docs/api-reference.md index 34dcd29..e6bdd23 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -52,16 +52,16 @@ All tools include annotations that provide hints to MCP clients about tool behav | `ping` | Ping Server | `true` | `false` | `true` | `false` | | `help` | Get Help | `true` | `false` | `true` | `false` | | `listSessions` | List Sessions | `true` | `false` | `true` | `false` | -| `cu_list_apps` | List Apps | `true` | `false` | `true` | `false` | -| `cu_get_app_state` | Get App State | `true` | `false` | `true` | `false` | -| `cu_status` | Computer Use Status | `true` | `false` | `true` | `false` | -| `cu_click` | Click | — | `false` | — | `false` | -| `cu_type_text` | Type Text | — | `false` | — | `false` | -| `cu_press_key` | Press Key | — | `false` | — | `false` | -| `cu_scroll` | Scroll | — | `false` | — | `false` | -| `cu_drag` | Drag | — | `false` | — | `false` | -| `cu_set_value` | Set Value | — | `false` | — | `false` | -| `cu_perform_secondary_action` | Secondary Action | — | `false` | — | `false` | +| `browser_launch` | Launch Browser | `false` | `false` | `false` | `true` | +| `browser_screenshot` | Browser Screenshot | `true` | `false` | `true` | `false` | +| `browser_click` | Browser Click | `false` | `false` | `false` | `false` | +| `browser_type` | Browser Type | `false` | `false` | `false` | `false` | +| `browser_scroll` | Browser Scroll | `false` | `false` | `false` | `false` | +| `browser_drag` | Browser Drag | `false` | `false` | `false` | `false` | +| `browser_key` | Browser Key Press | `false` | `false` | `false` | `false` | +| `browser_navigate` | Browser Navigate | `false` | `false` | `false` | `true` | +| `browser_close` | Close Browser | `false` | `true` | `true` | `false` | +| `browser_status` | Browser Status | 
`true` | `false` | `true` | `false` | ### Progress Notifications For long-running operations, the server sends `notifications/progress` messages when the client includes a `progressToken` in the request `_meta`. @@ -451,91 +451,96 @@ Optional: - **CODEX_HOME**: Custom directory for Codex CLI configuration - **Session Limits**: Configurable in server implementation (default: 100) - **TTL Settings**: Configurable session expiration (default: 24 hours) -- **CODEX_COMPUTER_USE_BINARY**: Path to computer-use binary (macOS only) -- **NODE_DEBUG**: Include `computer-use` to forward binary stderr -## Computer Use Tools +## Browser Use Tools -macOS-only tools for controlling apps via accessibility APIs. See [Computer Use](computer-use.md) for setup instructions. +Cross-platform browser automation via Playwright. See [Browser Use](browser-use.md) for setup instructions. -### `cu_status` — Health Check +### `browser_status` — Health Check -**Annotations:** `readOnlyHint: true`, `idempotentHint: true` +**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true` -No parameters. Returns binary path, type, connection status, and any error. +No parameters. Returns Playwright availability, active sessions, and any error. -### `cu_list_apps` — List Running Apps +### `browser_launch` — Launch Browser -**Annotations:** `readOnlyHint: true`, `idempotentHint: true` +**Annotations:** `readOnlyHint: false`, `destructiveHint: false`, `idempotentHint: false`, `openWorldHint: true` -No parameters. Returns list of running and recently used macOS apps. 
+| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `sessionId` | string | yes | - | Unique session identifier | +| `url` | string | no | - | URL to navigate to on launch | +| `headless` | boolean | no | `true` | Run without visible window | +| `viewportWidth` | integer | no | `1440` | Viewport width in pixels | +| `viewportHeight` | integer | no | `900` | Viewport height in pixels | -### `cu_get_app_state` — Screenshot & Accessibility Tree +### `browser_screenshot` — Take Screenshot -**Annotations:** `readOnlyHint: true`, `idempotentHint: true` +**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true` | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | +| `sessionId` | string | yes | Browser session ID | -Returns screenshot (base64 image) and accessibility tree with element indices. +Returns base64 PNG image, page URL, and page title. 
-### `cu_click` — Click Element +### `browser_click` — Click -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `element_index` | string | no | Element index from accessibility tree | -| `x` | number | no | X coordinate (pixels) | -| `y` | number | no | Y coordinate (pixels) | -| `mouse_button` | enum | no | `left`, `right`, or `middle` | -| `click_count` | integer | no | Number of clicks (default: 1) | +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `sessionId` | string | yes | - | Browser session ID | +| `x` | number | yes | - | X coordinate (viewport-relative) | +| `y` | number | yes | - | Y coordinate (viewport-relative) | +| `button` | enum | no | `left` | `left`, `right`, or `middle` | +| `clickCount` | integer | no | `1` | Number of clicks | -### `cu_type_text` — Type Text +### `browser_type` — Type Text | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `text` | string | yes | Literal text to type | +| `sessionId` | string | yes | Browser session ID | +| `text` | string | yes | Text to type into focused element | -### `cu_press_key` — Key Press +### `browser_scroll` — Scroll -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `key` | string | yes | Key or combo (`a`, `Return`, `cmd+s`, `Up`) | +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `sessionId` | string | yes | - | Browser session ID | +| `direction` | enum | yes | - | `up`, `down`, `left`, or `right` | +| `amount` | integer | no | `300` | Scroll amount in pixels | -### `cu_scroll` — Scroll +### `browser_drag` — Drag | Parameter | Type | Required | Description 
| |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `element_index` | string | yes | Element identifier | -| `direction` | string | yes | `up`, `down`, `left`, or `right` | -| `pages` | integer | no | Page scroll count (default: 1) | +| `sessionId` | string | yes | Browser session ID | +| `fromX` | number | yes | Start X coordinate | +| `fromY` | number | yes | Start Y coordinate | +| `toX` | number | yes | End X coordinate | +| `toY` | number | yes | End Y coordinate | -### `cu_drag` — Drag +### `browser_key` — Key Press | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `from_x` | number | yes | Start X coordinate | -| `from_y` | number | yes | Start Y coordinate | -| `to_x` | number | yes | End X coordinate | -| `to_y` | number | yes | End Y coordinate | +| `sessionId` | string | yes | Browser session ID | +| `key` | string | yes | Key or combo (`Enter`, `Control+a`, `Meta+s`) | -### `cu_set_value` — Set Value +Modifier keys are auto-normalized: `Cmd`/`Command` → `Meta`, `Ctrl` → `Control`, `Opt`/`Option` → `Alt`. 
+ +### `browser_navigate` — Navigate + +**Annotations:** `readOnlyHint: false`, `destructiveHint: false`, `idempotentHint: false`, `openWorldHint: true` | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `element_index` | string | yes | Element identifier | -| `value` | string | yes | Value to assign | +| `sessionId` | string | yes | Browser session ID | +| `url` | string | yes | URL to navigate to | + +### `browser_close` — Close Session -### `cu_perform_secondary_action` — Secondary Action +**Annotations:** `readOnlyHint: false`, `destructiveHint: true`, `idempotentHint: true` | Parameter | Type | Required | Description | |-----------|------|----------|-------------| -| `app` | string | yes | App name or bundle identifier | -| `element_index` | string | yes | Element identifier | -| `action` | string | yes | Action name (`toggle`, `expand`, `pick`, etc.) | \ No newline at end of file +| `sessionId` | string | yes | Browser session ID to close | \ No newline at end of file diff --git a/docs/browser-use.md b/docs/browser-use.md new file mode 100644 index 0000000..1b45e87 --- /dev/null +++ b/docs/browser-use.md @@ -0,0 +1,127 @@ +# Browser Use + +Playwright-based browser automation. Launch a real Chromium browser, take screenshots, click, type, scroll, drag, and navigate — all via MCP tools. + +## Architecture + +``` +Claude Code + → codex-mcp-server + → BrowserUseBridge (singleton, lazy init) + → Playwright (peer dependency) + → Chromium instances +``` + +The bridge manages multiple concurrent browser sessions. Playwright is only imported on first use — no impact on codex/review startup when browser tools aren't used. + +## Setup + +```bash +npm install playwright +npx playwright install chromium +``` + +Playwright is a **peer dependency** — install it separately. 
The server works fine without it; browser tools will return a helpful error if Playwright isn't available. + +## Tools + +### `browser_status` — Health Check + +Works even without Playwright installed. Returns availability, active sessions, and any error. + +```json +{} +``` + +### `browser_launch` — Launch Browser + +Creates a new browser session. Supports multiple concurrent sessions with unique IDs. + +```json +{ + "sessionId": "my-session", + "url": "https://example.com", + "headless": true, + "viewportWidth": 1440, + "viewportHeight": 900 +} +``` + +### `browser_screenshot` — Take Screenshot + +Returns base64 PNG image data along with the current page URL and title. + +```json +{ "sessionId": "my-session" } +``` + +### `browser_click` — Click + +Click at viewport-relative pixel coordinates. + +```json +{ "sessionId": "my-session", "x": 100, "y": 200, "button": "left", "clickCount": 1 } +``` + +### `browser_type` — Type Text + +Type literal text into the currently focused element. Click on an input field first. + +```json +{ "sessionId": "my-session", "text": "hello world" } +``` + +### `browser_scroll` — Scroll + +Scroll the page in a direction by a pixel amount. + +```json +{ "sessionId": "my-session", "direction": "down", "amount": 300 } +``` + +### `browser_drag` — Drag + +Drag from one coordinate to another (viewport-relative). + +```json +{ "sessionId": "my-session", "fromX": 100, "fromY": 100, "toX": 300, "toY": 300 } +``` + +### `browser_key` — Key Press + +Press a key or key combination. Supports Playwright key names. Modifier keys are auto-normalized: `Cmd` → `Meta`, `Ctrl` → `Control`, `Opt` → `Alt`, etc. + +```json +{ "sessionId": "my-session", "key": "Control+a" } +{ "sessionId": "my-session", "key": "Cmd+s" } +``` + +### `browser_navigate` — Navigate + +Go to a URL in the current page. 
+ +```json +{ "sessionId": "my-session", "url": "https://example.com" } +``` + +### `browser_close` — Close Session + +Close a browser session and clean up resources. + +```json +{ "sessionId": "my-session" } +``` + +## Troubleshooting + +**"Playwright is not installed"** +Install it: `npm install playwright && npx playwright install chromium` + +**"Session already exists"** +Use a different sessionId or close the existing session first with `browser_close`. + +**"No active browser session"** +You must call `browser_launch` before using other browser tools. + +**Screenshots not loading** +Check that the client supports `image` content type in MCP tool results. Claude Code supports this natively. diff --git a/docs/computer-use.md b/docs/computer-use.md deleted file mode 100644 index 4a016c5..0000000 --- a/docs/computer-use.md +++ /dev/null @@ -1,170 +0,0 @@ -# Computer Use - -OS-level macOS computer control via Codex's accessibility binary. Control running apps through screenshots, accessibility trees, clicks, typing, and keyboard shortcuts. - -## Architecture - -``` -Claude Code - → codex-mcp-server - → ComputerUseBridge (singleton, lazy init) - → SkyComputerUseClient (Codex.app subprocess) - → stdio JSON-RPC (newline-delimited) - → macOS Accessibility APIs -``` - -The bridge spawns the binary once and maintains a persistent connection. Binary discovery happens on the first computer-use tool call — no impact on codex/review startup. - -## Setup - -Requires [Codex.app](https://codex.ai) installed. The `SkyComputerUseClient` binary is detected automatically from `/Applications/Codex.app`. 
- -If you get `Apple event error -10000`, the binary needs re-signing: - -```bash -sudo codesign --force --deep --sign - \ - "/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient" -``` - -Or run `/codex-setup` in Claude Code to do this automatically. - -**Note:** Re-signing is machine-local. Re-run after Codex.app auto-updates. - -### Custom binary path - -Set `CODEX_COMPUTER_USE_BINARY` to override auto-discovery: - -```json -{ - "mcpServers": { - "codex-cli": { - "command": "npx", - "args": ["-y", "codex-mcp-server"], - "env": { - "CODEX_COMPUTER_USE_BINARY": "/path/to/SkyComputerUseClient" - } - } - } -} -``` - -## Discovery Priority - -1. `CODEX_COMPUTER_USE_BINARY` env var (explicit override) -2. Codex.app bundled `SkyComputerUseClient` - -## Protocol - -The binary speaks bare **newline-delimited JSON-RPC** (not Content-Length framed). The bridge handles: - -1. MCP `initialize` handshake (`protocolVersion: "2024-11-05"`) -2. `notifications/initialized` confirmation -3. `tools/call` requests with request/response correlation via incrementing `id` - -**Timeouts**: 30s default, 60s for `get_app_state` (screenshots take longer). - -## Tools - -### `cu_status` — Health Check - -Works on any platform (no binary needed). Returns binary path, discovery type, and connection status. - -```json -{ "connected": true, "binary": { "path": "/Applications/.../SkyComputerUseClient", "type": "codex-app" }, "error": null } -``` - -### `cu_list_apps` — List Apps - -Returns running and recently used macOS apps. Call this first to discover what's available. - -### `cu_get_app_state` — Screenshot + Accessibility Tree - -Returns a screenshot (base64 image data) and the accessibility tree with element indices. Must be called once per turn before interacting with an app. 
- -```json -{ "app": "Safari" } -``` - -Response includes both `text` (accessibility tree) and `image` content (screenshot). - -### `cu_click` — Click Element - -Click by accessibility index (preferred) or pixel coordinates. - -```json -{ "app": "Safari", "element_index": "42" } -{ "app": "Finder", "x": 100, "y": 200, "mouse_button": "right", "click_count": 2 } -``` - -### `cu_type_text` — Type Text - -Type literal text into a focused input field. - -```json -{ "app": "Terminal", "text": "npm run build" } -``` - -### `cu_press_key` — Key Press - -Press a key or key-combination using xdotool-style syntax. - -```json -{ "app": "VSCode", "key": "cmd+s" } -{ "app": "Terminal", "key": "Return" } -``` - -### `cu_scroll` — Scroll Element - -Scroll in a direction by a number of pages. - -```json -{ "app": "Safari", "element_index": "5", "direction": "down", "pages": 3 } -``` - -### `cu_drag` — Drag - -Drag from one pixel coordinate to another. - -```json -{ "app": "Finder", "from_x": 100, "from_y": 100, "to_x": 300, "to_y": 300 } -``` - -### `cu_set_value` — Set Value - -Set the value of a slider, text field, or other settable accessibility element. - -```json -{ "app": "System Preferences", "element_index": "12", "value": "80" } -``` - -### `cu_perform_secondary_action` — Secondary Action - -Invoke a secondary accessibility action (toggle, expand, pick, etc.). - -```json -{ "app": "Finder", "element_index": "8", "action": "toggle" } -``` - -## Environment Variables - -| Variable | Description | -|----------|-------------| -| `CODEX_COMPUTER_USE_BINARY` | Path to binary (overrides auto-discovery) | -| `NODE_DEBUG` | Include `computer-use` to forward binary stderr | - -## Troubleshooting - -**"Computer Use tools are only available on macOS"** -These tools require macOS accessibility APIs. They won't work on Linux or Windows. - -**"Computer Use binary not found"** -Install Codex.app from https://codex.ai, or set `CODEX_COMPUTER_USE_BINARY` to a custom path. 
- -**"Apple event error -10000: Sender process is not authenticated"** -The binary needs re-signing. Run `/codex-setup` or the codesign command above. - -**"Request timed out"** -`get_app_state` has a 60s timeout; other tools have 30s. The app may be unresponsive. - -**No screenshot in `get_app_state` response** -Some apps don't expose their window content to accessibility. This is an OS limitation. diff --git a/jest.config.mjs b/jest.config.mjs index f8677ca..2793df4 100644 --- a/jest.config.mjs +++ b/jest.config.mjs @@ -28,4 +28,5 @@ export default { '**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts', ], + testPathIgnorePatterns: ['/node_modules/', '/trash/'], }; \ No newline at end of file diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index b51ea47..e63bb09 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -16,6 +16,36 @@ jest.mock('../utils/command.js', () => ({ }), })); +// Mock playwright to avoid requiring it at test time +jest.mock('playwright', () => ({ + chromium: { + launch: jest.fn().mockResolvedValue({ + newContext: jest.fn().mockResolvedValue({ + newPage: jest.fn().mockResolvedValue({ + goto: jest.fn(), + screenshot: jest.fn().mockResolvedValue(Buffer.from('fake-png')), + title: jest.fn().mockResolvedValue('Test Page'), + url: jest.fn().mockReturnValue('https://example.com'), + mouse: { + click: jest.fn(), + move: jest.fn(), + down: jest.fn(), + up: jest.fn(), + wheel: jest.fn(), + }, + keyboard: { + press: jest.fn(), + type: jest.fn(), + }, + waitForLoadState: jest.fn(), + }), + close: jest.fn(), + }), + close: jest.fn(), + }), + }, +}), { virtual: true }); + import { TOOLS } from '../types.js'; import { toolDefinitions } from '../tools/definitions.js'; import { @@ -33,11 +63,12 @@ import { } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; import { CodexMcpServer } from '../server.js'; +import { bridge } from '../browser-use/bridge.js'; 
describe('Codex MCP Server', () => { describe('Tool Definitions', () => { test('should have all required tools defined', () => { - // 6 core tools + 10 computer-use tools = 16 + // 6 core tools + 10 browser tools = 16 expect(toolDefinitions).toHaveLength(16); const toolNames = toolDefinitions.map((tool) => tool.name); @@ -49,6 +80,20 @@ describe('Codex MCP Server', () => { expect(toolNames).toContain(TOOLS.LIST_SESSIONS); }); + test('should have all browser tools defined', () => { + const toolNames = toolDefinitions.map((tool) => tool.name); + expect(toolNames).toContain(TOOLS.BROWSER_LAUNCH); + expect(toolNames).toContain(TOOLS.BROWSER_SCREENSHOT); + expect(toolNames).toContain(TOOLS.BROWSER_CLICK); + expect(toolNames).toContain(TOOLS.BROWSER_TYPE); + expect(toolNames).toContain(TOOLS.BROWSER_SCROLL); + expect(toolNames).toContain(TOOLS.BROWSER_DRAG); + expect(toolNames).toContain(TOOLS.BROWSER_KEY); + expect(toolNames).toContain(TOOLS.BROWSER_NAVIGATE); + expect(toolNames).toContain(TOOLS.BROWSER_CLOSE); + expect(toolNames).toContain(TOOLS.BROWSER_STATUS); + }); + test('codex tool should define output schema', () => { const codexTool = toolDefinitions.find( (tool) => tool.name === TOOLS.CODEX @@ -79,10 +124,27 @@ describe('Codex MCP Server', () => { expect(helpTool?.inputSchema.required).toEqual([]); expect(helpTool?.description).toContain('Get Codex CLI help'); }); + + test('browser tools should have sessionId required', () => { + const browserTools = toolDefinitions.filter((t) => + t.name.startsWith('browser_') && t.name !== 'browser_status' + ); + for (const tool of browserTools) { + expect(tool.inputSchema.required).toContain('sessionId'); + } + }); + + test('browser_status should have no required parameters', () => { + const statusTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.BROWSER_STATUS + ); + expect(statusTool).toBeDefined(); + expect(statusTool?.inputSchema.required).toEqual([]); + }); }); describe('Tool Handlers', () => { - test('should 
have handlers for all tools', () => { + test('should have handlers for all core tools', () => { expect(toolHandlers[TOOLS.CODEX]).toBeInstanceOf(CodexToolHandler); expect(toolHandlers[TOOLS.REVIEW]).toBeInstanceOf(ReviewToolHandler); expect(toolHandlers[TOOLS.WEBSEARCH]).toBeInstanceOf( @@ -95,6 +157,19 @@ describe('Codex MCP Server', () => { ); }); + test('should have handlers for all browser tools', () => { + expect(toolHandlers[TOOLS.BROWSER_LAUNCH]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_SCREENSHOT]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_CLICK]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_TYPE]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_SCROLL]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_DRAG]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_KEY]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_NAVIGATE]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_CLOSE]).toBeDefined(); + expect(toolHandlers[TOOLS.BROWSER_STATUS]).toBeDefined(); + }); + test('ping handler should return message', async () => { const handler = new PingToolHandler(); const result = await handler.execute({ message: 'test' }); @@ -140,6 +215,96 @@ describe('Codex MCP Server', () => { }); }); + describe('Browser Tool Handlers', () => { + afterEach(async () => { + await bridge.shutdown(); + }); + + test('browser_status should return status JSON', async () => { + const handler = toolHandlers[TOOLS.BROWSER_STATUS]; + const result = await handler.execute({}); + + expect(result.content[0].type).toBe('text'); + const status = JSON.parse(result.content[0].text); + expect(status).toHaveProperty('available'); + expect(status).toHaveProperty('activeSessions'); + expect(status).toHaveProperty('sessionIds'); + }); + + test('browser_launch should create a session', async () => { + const handler = toolHandlers[TOOLS.BROWSER_LAUNCH]; + const result = await handler.execute({ sessionId: 'test-session-1' }); + + expect(result.content[0].type).toBe('text'); 
+ expect(result.content[0].text).toContain('test-session-1'); + expect(result.content[0].text).toContain('launched successfully'); + + // Clean up + const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; + await closeHandler.execute({ sessionId: 'test-session-1' }); + }); + + test('browser_close should close a session', async () => { + // First launch + const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; + await launchHandler.execute({ sessionId: 'test-session-close' }); + + // Then close + const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; + const result = await closeHandler.execute({ sessionId: 'test-session-close' }); + + expect(result.content[0].text).toContain('test-session-close'); + expect(result.content[0].text).toContain('closed'); + }); + + test('browser_screenshot should return image data', async () => { + const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; + await launchHandler.execute({ sessionId: 'test-screenshot' }); + + const screenshotHandler = toolHandlers[TOOLS.BROWSER_SCREENSHOT]; + const result = await screenshotHandler.execute({ sessionId: 'test-screenshot' }); + + expect(result.content).toHaveLength(2); + expect(result.content[0].type).toBe('image'); + expect(result.content[0].data).toBeDefined(); + expect(result.content[0].mimeType).toBe('image/png'); + expect(result.content[1].type).toBe('text'); + + // Clean up + const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; + await closeHandler.execute({ sessionId: 'test-screenshot' }); + }); + + test('browser_click should return confirmation', async () => { + const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; + await launchHandler.execute({ sessionId: 'test-click' }); + + const clickHandler = toolHandlers[TOOLS.BROWSER_CLICK]; + const result = await clickHandler.execute({ sessionId: 'test-click', x: 100, y: 200 }); + + expect(result.content[0].text).toContain('100'); + expect(result.content[0].text).toContain('200'); + + // Clean up + const closeHandler = 
toolHandlers[TOOLS.BROWSER_CLOSE]; + await closeHandler.execute({ sessionId: 'test-click' }); + }); + + test('browser_key should normalize key names', async () => { + const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; + await launchHandler.execute({ sessionId: 'test-key' }); + + const keyHandler = toolHandlers[TOOLS.BROWSER_KEY]; + const result = await keyHandler.execute({ sessionId: 'test-key', key: 'Cmd+s' }); + + expect(result.content[0].text).toContain('Cmd+s'); + + // Clean up + const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; + await closeHandler.execute({ sessionId: 'test-key' }); + }); + }); + describe('Server Initialization', () => { test('should initialize server with config', () => { const config = { name: 'test-server', version: '1.0.0' }; diff --git a/src/browser-use/bridge.ts b/src/browser-use/bridge.ts new file mode 100644 index 0000000..ec3c9b3 --- /dev/null +++ b/src/browser-use/bridge.ts @@ -0,0 +1,106 @@ +import type { BrowserSession, BrowserStatus } from './types.js'; +import { + createBrowserSession, + takeScreenshot, + clickAt, + typeText, + pressKey, + scrollPage, + dragFromTo, + navigateTo, + closeSession, +} from './client.js'; + +class BrowserUseBridge { + private sessions: Map = new Map(); + private available: boolean | null = null; + private checkError: string | null = null; + + async checkAvailability(): Promise { + if (this.available !== null) return this.available; + try { + await import('playwright'); + this.available = true; + this.checkError = null; + } catch { + this.available = false; + this.checkError = 'Playwright is not installed. Install with: npm install playwright && npx playwright install chromium'; + } + return this.available; + } + + async launch(sessionId: string, opts?: { url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number }): Promise { + const canUse = await this.checkAvailability(); + if (!canUse) { + throw new Error(this.checkError ?? 
'Playwright is not installed'); + } + if (this.sessions.has(sessionId)) { + throw new Error(`Session "${sessionId}" already exists. Close it first or use a different session ID.`); + } + const session = await createBrowserSession(sessionId, opts); + this.sessions.set(sessionId, session); + return session; + } + + getSession(sessionId: string): BrowserSession { + const session = this.sessions.get(sessionId); + if (!session) { + throw new Error(`No active browser session with ID "${sessionId}". Use browser_launch to create one.`); + } + return session; + } + + async screenshot(sessionId: string): Promise<{ image: Buffer; url: string; title: string }> { + return takeScreenshot(this.getSession(sessionId)); + } + + async click(sessionId: string, x: number, y: number, opts?: { button?: string; clickCount?: number }): Promise { + return clickAt(this.getSession(sessionId), x, y, opts); + } + + async type(sessionId: string, text: string): Promise { + return typeText(this.getSession(sessionId), text); + } + + async key(sessionId: string, key: string): Promise { + return pressKey(this.getSession(sessionId), key); + } + + async scroll(sessionId: string, direction: string, amount: number): Promise { + return scrollPage(this.getSession(sessionId), direction, amount); + } + + async drag(sessionId: string, fromX: number, fromY: number, toX: number, toY: number): Promise { + return dragFromTo(this.getSession(sessionId), fromX, fromY, toX, toY); + } + + async navigate(sessionId: string, url: string): Promise { + return navigateTo(this.getSession(sessionId), url); + } + + async close(sessionId: string): Promise { + const session = this.sessions.get(sessionId); + if (session) { + await closeSession(session); + this.sessions.delete(sessionId); + } + } + + async shutdown(): Promise { + const closings = Array.from(this.sessions.values()).map((s) => closeSession(s).catch(() => {})); + await Promise.all(closings); + this.sessions.clear(); + } + + getStatus(): BrowserStatus { + return { + 
available: this.available ?? false, + error: this.checkError, + activeSessions: this.sessions.size, + sessionIds: Array.from(this.sessions.keys()), + }; + } +} + +// Singleton +export const bridge = new BrowserUseBridge(); diff --git a/src/browser-use/client.ts b/src/browser-use/client.ts new file mode 100644 index 0000000..92ab4f1 --- /dev/null +++ b/src/browser-use/client.ts @@ -0,0 +1,99 @@ +import type { Browser, BrowserContext, Page } from 'playwright'; +import type { BrowserSession } from './types.js'; + +let playwrightModule: { chromium: { launch(opts?: Record): Promise } } | null = null; + +async function getPlaywright() { + if (!playwrightModule) { + try { + playwrightModule = await import('playwright'); + } catch { + throw new Error( + 'Playwright is not installed. Install it with: npm install playwright && npx playwright install chromium' + ); + } + } + return playwrightModule; +} + +export async function createBrowserSession( + sessionId: string, + opts: { url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number } = {} +): Promise { + const pw = await getPlaywright(); + const browser = await pw.chromium.launch({ + headless: opts.headless ?? true, + }); + + const context = await (browser as unknown as { newContext(opts?: Record): Promise }).newContext({ + viewport: { + width: opts.viewportWidth ?? 1440, + height: opts.viewportHeight ?? 
900, + }, + }); + + const page = await context.newPage(); + + if (opts.url) { + await page.goto(opts.url, { waitUntil: 'domcontentloaded', timeout: 30000 }); + } + + return { sessionId, browser, page, createdAt: new Date() }; +} + +export async function takeScreenshot(session: BrowserSession): Promise<{ image: Buffer; url: string; title: string }> { + const page = session.page as Page; + const [image, title] = await Promise.all([ + page.screenshot({ type: 'png', fullPage: false }), + page.title(), + ]); + return { image, url: page.url(), title }; +} + +export async function clickAt(session: BrowserSession, x: number, y: number, opts?: { button?: string; clickCount?: number }): Promise { + const page = session.page as Page; + await page.mouse.click(x, y, { + button: opts?.button ?? 'left', + clickCount: opts?.clickCount ?? 1, + }); +} + +export async function typeText(session: BrowserSession, text: string): Promise { + const page = session.page as Page; + await page.keyboard.type(text, { delay: 20 }); +} + +export async function pressKey(session: BrowserSession, key: string): Promise { + const page = session.page as Page; + await page.keyboard.press(key); +} + +export async function scrollPage(session: BrowserSession, direction: string, amount: number): Promise { + const page = session.page as Page; + const delta = direction === 'up' || direction === 'left' ? -amount : amount; + const x = direction === 'left' || direction === 'right' ? delta : 0; + const y = direction === 'up' || direction === 'down' ? 
delta : 0; + await page.mouse.wheel(x, y); +} + +export async function dragFromTo(session: BrowserSession, fromX: number, fromY: number, toX: number, toY: number): Promise { + const page = session.page as Page; + await page.mouse.move(fromX, fromY); + await page.mouse.down(); + await page.mouse.move(toX, toY, { steps: 10 }); + await page.mouse.up(); +} + +export async function navigateTo(session: BrowserSession, url: string): Promise { + const page = session.page as Page; + await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 }); +} + +export async function closeSession(session: BrowserSession): Promise { + const browser = session.browser as Browser; + try { + await browser.close(); + } catch { + // Browser may already be closed + } +} diff --git a/src/browser-use/definitions.ts b/src/browser-use/definitions.ts new file mode 100644 index 0000000..bacc411 --- /dev/null +++ b/src/browser-use/definitions.ts @@ -0,0 +1,162 @@ +import { TOOLS, type ToolDefinition } from '../types.js'; +import { BROWSER_TOOLS } from './types.js'; + +const browserLaunch: ToolDefinition = { + name: TOOLS.BROWSER_LAUNCH, + description: 'Launch a new browser session. Returns a sessionId for use with other browser tools. 
Supports multiple concurrent sessions.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Unique identifier for this browser session' }, + url: { type: 'string', description: 'Optional URL to navigate to on launch' }, + headless: { type: 'boolean', description: 'Run in headless mode (default: true)' }, + viewportWidth: { type: 'integer', description: 'Viewport width in pixels (default: 1440)' }, + viewportHeight: { type: 'integer', description: 'Viewport height in pixels (default: 900)' }, + }, + required: ['sessionId'], + }, + annotations: { title: 'Launch Browser', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true }, +}; + +const browserScreenshot: ToolDefinition = { + name: TOOLS.BROWSER_SCREENSHOT, + description: 'Take a screenshot of the current browser page. Returns base64 PNG image data along with the page URL and title.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + }, + required: ['sessionId'], + }, + annotations: { title: 'Browser Screenshot', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, +}; + +const browserClick: ToolDefinition = { + name: TOOLS.BROWSER_CLICK, + description: 'Click at a specific pixel coordinate on the browser page. 
Coordinates are relative to the viewport.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + x: { type: 'number', description: 'X coordinate in pixels (viewport-relative)' }, + y: { type: 'number', description: 'Y coordinate in pixels (viewport-relative)' }, + button: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Mouse button (default: left)' }, + clickCount: { type: 'integer', description: 'Number of clicks (default: 1, use 2 for double-click)' }, + }, + required: ['sessionId', 'x', 'y'], + }, + annotations: { title: 'Browser Click', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, +}; + +const browserType: ToolDefinition = { + name: TOOLS.BROWSER_TYPE, + description: 'Type text into the currently focused element in the browser. Make sure to click on an input field first.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + text: { type: 'string', description: 'Text to type' }, + }, + required: ['sessionId', 'text'], + }, + annotations: { title: 'Browser Type', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, +}; + +const browserScroll: ToolDefinition = { + name: TOOLS.BROWSER_SCROLL, + description: 'Scroll the browser page in a given direction by a specified pixel amount.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Scroll direction' }, + amount: { type: 'integer', description: 'Scroll amount in pixels (default: 300)' }, + }, + required: ['sessionId', 'direction'], + }, + annotations: { title: 'Browser Scroll', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, +}; + +const browserDrag: ToolDefinition = { + name: 
TOOLS.BROWSER_DRAG, + description: 'Drag from one pixel coordinate to another on the browser page. Coordinates are viewport-relative.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + fromX: { type: 'number', description: 'Start X coordinate' }, + fromY: { type: 'number', description: 'Start Y coordinate' }, + toX: { type: 'number', description: 'End X coordinate' }, + toY: { type: 'number', description: 'End Y coordinate' }, + }, + required: ['sessionId', 'fromX', 'fromY', 'toX', 'toY'], + }, + annotations: { title: 'Browser Drag', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, +}; + +const browserKey: ToolDefinition = { + name: TOOLS.BROWSER_KEY, + description: 'Press a key or key combination in the browser. Supports Playwright key names (e.g., "Enter", "Control+a", "Meta+s").', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + key: { type: 'string', description: 'Key or key combination to press' }, + }, + required: ['sessionId', 'key'], + }, + annotations: { title: 'Browser Key Press', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, +}; + +const browserNavigate: ToolDefinition = { + name: TOOLS.BROWSER_NAVIGATE, + description: 'Navigate the browser to a URL.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', description: 'Browser session ID' }, + url: { type: 'string', description: 'URL to navigate to' }, + }, + required: ['sessionId', 'url'], + }, + annotations: { title: 'Browser Navigate', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true }, +}; + +const browserClose: ToolDefinition = { + name: TOOLS.BROWSER_CLOSE, + description: 'Close a browser session and clean up resources.', + inputSchema: { + type: 'object', + properties: { + sessionId: { type: 'string', 
description: 'Browser session ID to close' }, + }, + required: ['sessionId'], + }, + annotations: { title: 'Close Browser', readOnlyHint: false, destructiveHint: true, idempotentHint: true, openWorldHint: false }, +}; + +const browserStatus: ToolDefinition = { + name: TOOLS.BROWSER_STATUS, + description: 'Check browser automation status. Returns whether Playwright is available and lists active sessions. Works even without Playwright installed.', + inputSchema: { + type: 'object', + properties: {}, + required: [], + }, + annotations: { title: 'Browser Status', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, +}; + +export const browserUseToolDefinitions: ToolDefinition[] = [ + browserLaunch, + browserScreenshot, + browserClick, + browserType, + browserScroll, + browserDrag, + browserKey, + browserNavigate, + browserClose, + browserStatus, +]; + +export { BROWSER_TOOLS }; diff --git a/src/browser-use/handlers.ts b/src/browser-use/handlers.ts new file mode 100644 index 0000000..aa71ee2 --- /dev/null +++ b/src/browser-use/handlers.ts @@ -0,0 +1,131 @@ +import { TOOLS, type ToolResult, type ToolHandlerContext } from '../types.js'; +import { ToolExecutionError, ValidationError } from '../errors.js'; +import { ZodError } from 'zod'; +import { bridge } from './bridge.js'; +import { BROWSER_SCHEMAS, normalizeKey } from './types.js'; + +const noopContext: ToolHandlerContext = { sendProgress: async () => {} }; + +export class BrowserUseToolHandler { + async execute( + toolName: string, + args: unknown, + _context: ToolHandlerContext = noopContext + ): Promise { + try { + const schema = BROWSER_SCHEMAS[toolName]; + if (!schema) { + throw new ValidationError(toolName, `Unknown browser tool: ${toolName}`); + } + const p = schema.parse(args) as Record; + + switch (toolName) { + case TOOLS.BROWSER_STATUS: { + await bridge.checkAvailability(); + const status = bridge.getStatus(); + return { + content: [ + { type: 'text', text: 
JSON.stringify(status, null, 2) }, + ], + }; + } + case TOOLS.BROWSER_LAUNCH: { + const session = await bridge.launch( + p.sessionId as string, + { + url: p.url as string | undefined, + headless: p.headless as boolean | undefined, + viewportWidth: p.viewportWidth as number | undefined, + viewportHeight: p.viewportHeight as number | undefined, + } + ); + return { + content: [ + { + type: 'text', + text: `Browser session "${session.sessionId}" launched successfully at ${session.createdAt.toISOString()}`, + }, + ], + }; + } + case TOOLS.BROWSER_SCREENSHOT: { + const { image, url, title } = await bridge.screenshot(p.sessionId as string); + return { + content: [ + { type: 'image', text: '', data: image.toString('base64'), mimeType: 'image/png' }, + { type: 'text', text: `URL: ${url}\nTitle: ${title}` }, + ], + }; + } + case TOOLS.BROWSER_CLICK: { + const x = p.x as number; + const y = p.y as number; + await bridge.click(p.sessionId as string, x, y, { + button: p.button as string | undefined, + clickCount: p.clickCount as number | undefined, + }); + return { + content: [{ type: 'text', text: `Clicked at (${x}, ${y}) with ${p.button ?? 
'left'} button` }], + }; + } + case TOOLS.BROWSER_TYPE: { + const text = p.text as string; + await bridge.type(p.sessionId as string, text); + return { + content: [{ type: 'text', text: `Typed "${text}"` }], + }; + } + case TOOLS.BROWSER_SCROLL: { + const direction = p.direction as string; + const amount = p.amount as number; + await bridge.scroll(p.sessionId as string, direction, amount); + return { + content: [{ type: 'text', text: `Scrolled ${direction} by ${amount}px` }], + }; + } + case TOOLS.BROWSER_DRAG: { + const fromX = p.fromX as number; + const fromY = p.fromY as number; + const toX = p.toX as number; + const toY = p.toY as number; + await bridge.drag(p.sessionId as string, fromX, fromY, toX, toY); + return { + content: [{ type: 'text', text: `Dragged from (${fromX}, ${fromY}) to (${toX}, ${toY})` }], + }; + } + case TOOLS.BROWSER_KEY: { + const key = p.key as string; + const normalizedKey = normalizeKey(key); + await bridge.key(p.sessionId as string, normalizedKey); + return { + content: [{ type: 'text', text: `Pressed key: ${key}` }], + }; + } + case TOOLS.BROWSER_NAVIGATE: { + const url = p.url as string; + await bridge.navigate(p.sessionId as string, url); + return { + content: [{ type: 'text', text: `Navigated to ${url}` }], + }; + } + case TOOLS.BROWSER_CLOSE: { + const sessionId = p.sessionId as string; + await bridge.close(sessionId); + return { + content: [{ type: 'text', text: `Session "${sessionId}" closed` }], + }; + } + default: + throw new ValidationError(toolName, `Unknown browser tool: ${toolName}`); + } + } catch (error) { + if (error instanceof ValidationError) throw error; + if (error instanceof ZodError) { + throw new ValidationError(toolName, error.message); + } + throw new ToolExecutionError(toolName, 'Browser operation failed', error); + } + } +} + +export const browserUseHandler = new BrowserUseToolHandler(); diff --git a/src/browser-use/playwright.d.ts b/src/browser-use/playwright.d.ts new file mode 100644 index 0000000..8d00abf 
--- /dev/null +++ b/src/browser-use/playwright.d.ts @@ -0,0 +1,34 @@ +declare module 'playwright' { + export interface Browser { + newContext(opts?: Record): Promise; + close(): Promise; + } + + export interface BrowserContext { + newPage(): Promise; + close(): Promise; + } + + export interface Page { + goto(url: string, opts?: Record): Promise; + screenshot(opts?: Record): Promise; + mouse: { + click(x: number, y: number, opts?: Record): Promise; + move(x: number, y: number, opts?: Record): Promise; + down(): Promise; + up(): Promise; + wheel(deltaX: number, deltaY: number): Promise; + }; + keyboard: { + press(key: string): Promise; + type(text: string, opts?: Record): Promise; + }; + title(): Promise; + url(): string; + waitForLoadState(state?: string): Promise; + } + + export const chromium: { + launch(opts?: Record): Promise; + }; +} diff --git a/src/browser-use/types.ts b/src/browser-use/types.ts new file mode 100644 index 0000000..8591f0f --- /dev/null +++ b/src/browser-use/types.ts @@ -0,0 +1,118 @@ +import { z } from 'zod'; + +export const BROWSER_TOOLS = { + LAUNCH: 'browser_launch', + SCREENSHOT: 'browser_screenshot', + CLICK: 'browser_click', + TYPE: 'browser_type', + SCROLL: 'browser_scroll', + DRAG: 'browser_drag', + KEY: 'browser_key', + NAVIGATE: 'browser_navigate', + CLOSE: 'browser_close', + STATUS: 'browser_status', +} as const; + +export const LaunchSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + url: z.string().optional(), + headless: z.boolean().optional().default(true), + viewportWidth: z.int().optional().default(1440), + viewportHeight: z.int().optional().default(900), +}); + +export const ScreenshotSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), +}); + +export const ClickSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + x: z.number(), + y: z.number(), + button: z.enum(['left', 'right', 'middle']).optional().default('left'), + clickCount: 
z.int().optional().default(1), +}); + +export const TypeSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + text: z.string().min(1, 'Text to type is required'), +}); + +export const ScrollSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + direction: z.enum(['up', 'down', 'left', 'right']), + amount: z.int().optional().default(300), +}); + +export const DragSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + fromX: z.number(), + fromY: z.number(), + toX: z.number(), + toY: z.number(), +}); + +export const KeySchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + key: z.string().min(1, 'Key is required'), +}); + +export const NavigateSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), + url: z.string().min(1, 'URL is required'), +}); + +export const CloseSchema = z.object({ + sessionId: z.string().min(1, 'Session ID is required'), +}); + +export const StatusSchema = z.object({}); + +export const BROWSER_SCHEMAS: Record = { + [BROWSER_TOOLS.LAUNCH]: LaunchSchema, + [BROWSER_TOOLS.SCREENSHOT]: ScreenshotSchema, + [BROWSER_TOOLS.CLICK]: ClickSchema, + [BROWSER_TOOLS.TYPE]: TypeSchema, + [BROWSER_TOOLS.SCROLL]: ScrollSchema, + [BROWSER_TOOLS.DRAG]: DragSchema, + [BROWSER_TOOLS.KEY]: KeySchema, + [BROWSER_TOOLS.NAVIGATE]: NavigateSchema, + [BROWSER_TOOLS.CLOSE]: CloseSchema, + [BROWSER_TOOLS.STATUS]: StatusSchema, +}; + +export interface BrowserSession { + sessionId: string; + browser: unknown; + page: unknown; + createdAt: Date; +} + +export interface BrowserStatus { + available: boolean; + error: string | null; + activeSessions: number; + sessionIds: string[]; +} + +// Map modifier keys from user-friendly names to Playwright key names +export function normalizeKey(key: string): string { + const map: Record = { + Ctrl: 'Control', + Cmd: 'Meta', + Command: 'Meta', + Opt: 'Alt', + Option: 'Alt', + Del: 'Delete', + Backspace: 'Backspace', + 
Enter: 'Enter', + Tab: 'Tab', + Esc: 'Escape', + Escape: 'Escape', + Return: 'Enter', + }; + if (key.includes('+')) { + return key.split('+').map((part) => map[part.trim()] ?? part.trim()).join('+'); + } + return map[key] ?? key; +} diff --git a/src/computer-use/__tests__/discovery.test.ts b/src/computer-use/__tests__/discovery.test.ts deleted file mode 100644 index 3c9cc23..0000000 --- a/src/computer-use/__tests__/discovery.test.ts +++ /dev/null @@ -1,31 +0,0 @@ -import { discoverBinary } from '../discovery.js'; -import { existsSync } from 'node:fs'; - -const hasCodexBinary = existsSync( - '/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use/Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' -); - -describe('Binary Discovery', () => { - (hasCodexBinary ? describe : describe.skip)('with Codex.app installed', () => { - test('should discover the Codex.app binary', () => { - const info = discoverBinary(); - expect(info.path).toContain('SkyComputerUseClient'); - expect(info.args).toEqual(['mcp']); - expect(['codex-app', 'custom']).toContain(info.type); - }); - }); - - test('should throw when env var points to non-existent path', () => { - const original = process.env.CODEX_COMPUTER_USE_BINARY; - process.env.CODEX_COMPUTER_USE_BINARY = '/nonexistent/path/to/binary'; - try { - expect(() => discoverBinary()).toThrow('non-existent path'); - } finally { - if (original) { - process.env.CODEX_COMPUTER_USE_BINARY = original; - } else { - delete process.env.CODEX_COMPUTER_USE_BINARY; - } - } - }); -}); diff --git a/src/computer-use/__tests__/handlers.test.ts b/src/computer-use/__tests__/handlers.test.ts deleted file mode 100644 index a1646f7..0000000 --- a/src/computer-use/__tests__/handlers.test.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { ComputerUseToolHandler } from '../handlers.js'; -import { CU_TOOLS } from '../types.js'; - -describe('ComputerUseToolHandler', () => { - let handler: 
ComputerUseToolHandler; - - beforeEach(() => { - handler = new ComputerUseToolHandler(); - }); - - describe('cu_status', () => { - test('should return status JSON without connecting to binary', async () => { - const result = await handler.execute(CU_TOOLS.STATUS, {}); - // Status should always work — no binary needed. - expect(result).toBeDefined(); - expect(result.content).toHaveLength(1); - expect(result.content[0].type).toBe('text'); - - const parsed = JSON.parse(result.content[0].text); - expect(parsed).toHaveProperty('connected'); - expect(parsed).toHaveProperty('binary'); - expect(parsed).toHaveProperty('error'); - }); - }); - - describe('validation', () => { - test('should reject unknown tool name', async () => { - await expect( - handler.execute('cu_nonexistent', {}) - ).rejects.toThrow('Unknown computer-use tool'); - }); - - test('should reject invalid click args (no app)', async () => { - await expect( - handler.execute(CU_TOOLS.CLICK, {}) - ).rejects.toThrow(); - }); - - test('should reject invalid get_app_state args (empty app)', async () => { - await expect( - handler.execute(CU_TOOLS.GET_APP_STATE, { app: '' }) - ).rejects.toThrow(); - }); - - test('should reject invalid drag args (missing coordinates)', async () => { - await expect( - handler.execute(CU_TOOLS.DRAG, { app: 'Safari' }) - ).rejects.toThrow(); - }); - }); -}); diff --git a/src/computer-use/__tests__/types.test.ts b/src/computer-use/__tests__/types.test.ts deleted file mode 100644 index e1ef6da..0000000 --- a/src/computer-use/__tests__/types.test.ts +++ /dev/null @@ -1,125 +0,0 @@ -import { - CU_TOOLS, - BINARY_TOOLS, - CU_TO_BINARY, - CU_SCHEMAS, - ListAppsSchema, - GetAppStateSchema, - ClickSchema, - DragSchema, - PerformSecondaryActionSchema, - SetValueSchema, - ScrollSchema, - PressKeySchema, - TypeTextSchema, - CuStatusSchema, -} from '../types.js'; - -describe('Computer Use Types', () => { - test('CU_TOOLS should have all 10 tools', () => { - const values = Object.values(CU_TOOLS); 
- expect(values).toHaveLength(10); - expect(values).toContain('cu_list_apps'); - expect(values).toContain('cu_get_app_state'); - expect(values).toContain('cu_click'); - expect(values).toContain('cu_perform_secondary_action'); - expect(values).toContain('cu_set_value'); - expect(values).toContain('cu_scroll'); - expect(values).toContain('cu_drag'); - expect(values).toContain('cu_press_key'); - expect(values).toContain('cu_type_text'); - expect(values).toContain('cu_status'); - }); - - test('BINARY_TOOLS should have 9 tools (no status)', () => { - expect(Object.keys(BINARY_TOOLS)).toHaveLength(9); - }); - - test('CU_TO_BINARY should map all CU_TOOLS except status', () => { - for (const [, binaryName] of Object.entries(CU_TO_BINARY)) { - expect(binaryName).toBeDefined(); - expect(Object.values(BINARY_TOOLS) as unknown[]).toContain(binaryName); - } - // status should NOT be in the map (it's handled locally). - expect(CU_TO_BINARY[CU_TOOLS.STATUS]).toBeUndefined(); - }); - - test('CU_SCHEMAS should have a schema for every CU tool', () => { - for (const key of Object.values(CU_TOOLS)) { - expect(CU_SCHEMAS[key]).toBeDefined(); - } - }); - - describe('Zod schemas', () => { - test('ListAppsSchema accepts empty object', () => { - expect(ListAppsSchema.parse({})).toEqual({}); - }); - - test('GetAppStateSchema requires app', () => { - expect(GetAppStateSchema.parse({ app: 'Safari' })).toEqual({ app: 'Safari' }); - expect(() => GetAppStateSchema.parse({})).toThrow(); - expect(() => GetAppStateSchema.parse({ app: '' })).toThrow(); - }); - - test('ClickSchema requires app, optional fields', () => { - expect(ClickSchema.parse({ app: 'Safari' })).toEqual({ app: 'Safari' }); - expect( - ClickSchema.parse({ app: 'Safari', element_index: '5', x: 100, y: 200 }) - ).toEqual({ app: 'Safari', element_index: '5', x: 100, y: 200 }); - }); - - test('DragSchema requires app and all coordinates', () => { - expect(DragSchema.parse({ app: 'Safari', from_x: 0, from_y: 0, to_x: 100, to_y: 100 
})).toEqual({ - app: 'Safari', - from_x: 0, - from_y: 0, - to_x: 100, - to_y: 100, - }); - expect(() => DragSchema.parse({ app: 'Safari' })).toThrow(); - }); - - test('PerformSecondaryActionSchema requires app, element_index, action', () => { - expect( - PerformSecondaryActionSchema.parse({ app: 'Safari', element_index: '3', action: 'toggle' }) - ).toEqual({ app: 'Safari', element_index: '3', action: 'toggle' }); - expect(() => PerformSecondaryActionSchema.parse({ app: 'Safari' })).toThrow(); - }); - - test('SetValueSchema requires app, element_index, value', () => { - expect( - SetValueSchema.parse({ app: 'Safari', element_index: '1', value: 'hello' }) - ).toEqual({ app: 'Safari', element_index: '1', value: 'hello' }); - }); - - test('ScrollSchema requires app, element_index, direction', () => { - expect( - ScrollSchema.parse({ app: 'Safari', element_index: '0', direction: 'down' }) - ).toEqual({ app: 'Safari', element_index: '0', direction: 'down' }); - expect(ScrollSchema.parse({ app: 'Safari', element_index: '0', direction: 'down', pages: 3 })).toEqual({ - app: 'Safari', - element_index: '0', - direction: 'down', - pages: 3, - }); - }); - - test('PressKeySchema requires app and key', () => { - expect(PressKeySchema.parse({ app: 'Terminal', key: 'Return' })).toEqual({ - app: 'Terminal', - key: 'Return', - }); - }); - - test('TypeTextSchema requires app and text', () => { - expect(TypeTextSchema.parse({ app: 'TextEdit', text: 'hello world' })).toEqual({ - app: 'TextEdit', - text: 'hello world', - }); - }); - - test('CuStatusSchema accepts empty object', () => { - expect(CuStatusSchema.parse({})).toEqual({}); - }); - }); -}); diff --git a/src/computer-use/bridge.ts b/src/computer-use/bridge.ts deleted file mode 100644 index 303a54a..0000000 --- a/src/computer-use/bridge.ts +++ /dev/null @@ -1,92 +0,0 @@ -import { type BinaryInfo } from './types.js'; -import { discoverBinary } from './discovery.js'; -import { ComputerUseClient } from './client.js'; - -export class 
ComputerUseBridge { - private static instance?: ComputerUseBridge; - private client?: ComputerUseClient; - private binaryInfo?: BinaryInfo; - private initialized = false; - private error?: Error; - private initPromise?: Promise; - - private constructor() {} - - static getInstance(): ComputerUseBridge { - if (!ComputerUseBridge.instance) { - ComputerUseBridge.instance = new ComputerUseBridge(); - } - return ComputerUseBridge.instance; - } - - async initialize(): Promise { - // Fast path: already connected. - if (this.initialized && this.client?.isConnected()) return; - - // Serialize concurrent callers — they coalesce onto the same promise. - if (this.initPromise) return this.initPromise; - - this.initPromise = this._doInitialize(); - try { - await this.initPromise; - } finally { - this.initPromise = undefined; - } - } - - private async _doInitialize(): Promise { - // Re-check after acquiring the lock. - if (this.initialized && this.client?.isConnected()) return; - - if (this.client) { - this.client.disconnect(); - this.client = undefined; - } - this.initialized = false; - this.error = undefined; - - try { - this.binaryInfo = discoverBinary(); - console.error( - `Computer Use: using binary at ${this.binaryInfo.path} (${this.binaryInfo.type})` - ); - - this.client = new ComputerUseClient(this.binaryInfo); - await this.client.connect(); - - this.initialized = true; - console.error('Computer Use: bridge connected'); - } catch (err) { - this.error = err instanceof Error ? 
err : new Error(String(err)); - console.error(`Computer Use: ${this.error.message}`); - this.client?.disconnect(); - this.client = undefined; - throw this.error; - } - } - - getClient(): ComputerUseClient { - if (!this.client || !this.initialized) { - throw new Error('Computer Use bridge not initialized'); - } - return this.client; - } - - isReady(): boolean { - return this.initialized && this.client?.isConnected() === true; - } - - getBinaryInfo(): BinaryInfo | undefined { - return this.binaryInfo; - } - - getError(): Error | undefined { - return this.error; - } - - async shutdown(): Promise { - this.client?.disconnect(); - this.client = undefined; - this.initialized = false; - } -} diff --git a/src/computer-use/client.ts b/src/computer-use/client.ts deleted file mode 100644 index 4a5be1a..0000000 --- a/src/computer-use/client.ts +++ /dev/null @@ -1,183 +0,0 @@ -import { spawn, type ChildProcess } from 'child_process'; -import { type BinaryInfo, type JsonRpcResponse } from './types.js'; -import { type ToolResult } from '../types.js'; - -const DEFAULT_TIMEOUT_MS = 30_000; -const SCREENSHOT_TIMEOUT_MS = 60_000; -const INIT_PROTOCOL_VERSION = '2024-11-05'; - -export class ComputerUseClient { - private process: ChildProcess | null = null; - private requestId = 0; - private pending = new Map< - number, - { - resolve: (resp: JsonRpcResponse) => void; - reject: (err: Error) => void; - timer: ReturnType; - } - >(); - private buffer = ''; - private connected = false; - - constructor(private binary: BinaryInfo) {} - - async connect(): Promise { - if (this.connected) return; - - this.process = spawn(this.binary.path, this.binary.args, { - stdio: ['pipe', 'pipe', 'pipe'], - env: { ...process.env }, - }); - - this.process.on('error', (err) => { - this.rejectAll(new Error(`Binary spawn failed: ${err.message}`)); - }); - - this.process.on('close', () => { - this.connected = false; - this.process = null; - this.rejectAll(new Error('Binary process exited unexpectedly')); - }); - - 
this.process.stdout?.on('data', (data: Buffer) => { - this.buffer += data.toString(); - this.drainBuffer(); - }); - - this.process.stderr?.on('data', (data: Buffer) => { - // Binary log messages go to stderr — forward at debug level. - if (process.env.NODE_DEBUG?.includes('computer-use')) { - process.stderr.write(data); - } - }); - - // MCP initialize handshake. - const initResult = await this.sendRequest('initialize', { - protocolVersion: INIT_PROTOCOL_VERSION, - capabilities: {}, - clientInfo: { name: 'codex-mcp-server', version: '0.1.0' }, - }); - - if (!initResult.result) { - throw new Error(`Binary initialize failed: ${JSON.stringify(initResult.error)}`); - } - - // Send initialized notification (no id = notification). - this.sendNotification('notifications/initialized', {}); - this.connected = true; - } - - async callTool( - name: string, - args: Record, - timeoutMs = name === 'get_app_state' ? SCREENSHOT_TIMEOUT_MS : DEFAULT_TIMEOUT_MS - ): Promise { - if (!this.connected) throw new Error('Not connected'); - - const resp = await this.sendRequest('tools/call', { name, arguments: args }, timeoutMs); - - if (resp.error) { - return { - content: [{ type: 'text', text: `Binary error: ${resp.error.message}` }], - isError: true, - }; - } - - // The binary returns MCP CallToolResult shape. - const result = resp.result as { - content?: Array<{ - type: string; - text?: string; - data?: string; - mimeType?: string; - }>; - isError?: boolean; - }; - - // Pass through content as-is, preserving image data from get_app_state screenshots. - const content = (result?.content || []).map((item) => ({ - type: item.type as 'text' | 'image', - text: item.text || '', - data: item.data, - mimeType: item.mimeType, - })); - - return { - content: content.length > 0 ? 
content : [{ type: 'text', text: JSON.stringify(result) }], - isError: result?.isError, - }; - } - - disconnect(): void { - this.process?.kill(); - this.process = null; - this.connected = false; - this.rejectAll(new Error('Client disconnected')); - } - - isConnected(): boolean { - return this.connected; - } - - private sendRequest( - method: string, - params: Record, - timeoutMs = DEFAULT_TIMEOUT_MS - ): Promise { - return new Promise((resolve, reject) => { - const id = ++this.requestId; - const timer = setTimeout(() => { - this.pending.delete(id); - reject(new Error(`Request ${method} timed out after ${timeoutMs}ms`)); - }, timeoutMs); - - this.pending.set(id, { resolve, reject, timer }); - - const msg = JSON.stringify({ jsonrpc: '2.0', id, method, params }) + '\n'; - this.process?.stdin?.write(msg); - }); - } - - private sendNotification( - method: string, - params: Record - ): void { - const msg = JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n'; - this.process?.stdin?.write(msg); - } - - private drainBuffer(): void { - // Binary uses bare newline-delimited JSON-RPC (no Content-Length framing). - const lines = this.buffer.split('\n'); - // Last element may be incomplete — keep it in the buffer. - this.buffer = lines.pop() || ''; - - for (const line of lines) { - const trimmed = line.trim(); - if (!trimmed) continue; - try { - const msg = JSON.parse(trimmed) as JsonRpcResponse; - // Only handle responses (have an id). Notifications are id-less. - if (msg.id !== undefined) { - const pending = this.pending.get(msg.id); - if (pending) { - clearTimeout(pending.timer); - this.pending.delete(msg.id); - pending.resolve(msg); - } - } - } catch { - // Ignore non-JSON lines (log messages, etc.) 
- } - } - } - - private rejectAll(err: Error): void { - for (const [, entry] of this.pending) { - clearTimeout(entry.timer); - entry.reject(err); - } - this.pending.clear(); - } -} diff --git a/src/computer-use/definitions.ts b/src/computer-use/definitions.ts deleted file mode 100644 index b8e4035..0000000 --- a/src/computer-use/definitions.ts +++ /dev/null @@ -1,274 +0,0 @@ -import { type ToolDefinition } from '../types.js'; -import { CU_TOOLS } from './types.js'; - -export const computerUseToolDefinitions: ToolDefinition[] = [ - { - name: CU_TOOLS.LIST_APPS, - description: - 'List running and recently used macOS apps. Call this first to discover available apps.', - inputSchema: { - type: 'object', - properties: {}, - required: [], - }, - annotations: { - title: 'List Apps', - readOnlyHint: true, - idempotentHint: true, - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.GET_APP_STATE, - description: - 'Get screenshot and accessibility tree for an app. Must be called once per turn before interacting with the app.', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier (e.g., "Safari", "com.apple.Safari")', - }, - }, - required: ['app'], - }, - annotations: { - title: 'Get App State', - readOnlyHint: true, - idempotentHint: true, - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.CLICK, - description: - 'Click an element by index or pixel coordinates. 
Prefer element_index when available from accessibility tree.', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - element_index: { - type: 'string', - description: 'Element index from accessibility tree', - }, - x: { - type: 'number', - description: 'X coordinate in screenshot pixel coordinates', - }, - y: { - type: 'number', - description: 'Y coordinate in screenshot pixel coordinates', - }, - mouse_button: { - type: 'string', - enum: ['left', 'right', 'middle'], - description: 'Mouse button to click (default: left)', - }, - click_count: { - type: 'integer', - description: 'Number of clicks (default: 1)', - }, - }, - required: ['app'], - }, - annotations: { - title: 'Click', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.PERFORM_SECONDARY_ACTION, - description: - 'Invoke a secondary accessibility action exposed by an element (e.g., toggle, expand, pick)', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - element_index: { - type: 'string', - description: 'Element identifier', - }, - action: { - type: 'string', - description: 'Secondary accessibility action name', - }, - }, - required: ['app', 'element_index', 'action'], - }, - annotations: { - title: 'Secondary Action', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.SET_VALUE, - description: - 'Set the value of a settable accessibility element (e.g., slider, text field)', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - element_index: { - type: 'string', - description: 'Element identifier', - }, - value: { - type: 'string', - description: 'Value to assign', - }, - }, - required: ['app', 'element_index', 'value'], - }, - annotations: { - title: 'Set Value', - destructiveHint: false, - openWorldHint: 
false, - }, - }, - { - name: CU_TOOLS.SCROLL, - description: - 'Scroll an element in a direction by a number of pages', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - element_index: { - type: 'string', - description: 'Element identifier', - }, - direction: { - type: 'string', - description: 'Scroll direction: up, down, left, or right', - }, - pages: { - type: 'integer', - description: 'Number of page scroll actions (default: 1)', - }, - }, - required: ['app', 'element_index', 'direction'], - }, - annotations: { - title: 'Scroll', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.DRAG, - description: - 'Drag from one point to another using pixel coordinates', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - from_x: { - type: 'number', - description: 'Start X coordinate', - }, - from_y: { - type: 'number', - description: 'Start Y coordinate', - }, - to_x: { - type: 'number', - description: 'End X coordinate', - }, - to_y: { - type: 'number', - description: 'End Y coordinate', - }, - }, - required: ['app', 'from_x', 'from_y', 'to_x', 'to_y'], - }, - annotations: { - title: 'Drag', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.PRESS_KEY, - description: - 'Press a key or key-combination. 
Supports xdotool key syntax: "a", "Return", "Tab", "super+c", "Up"', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - key: { - type: 'string', - description: 'Key or key combination to press', - }, - }, - required: ['app', 'key'], - }, - annotations: { - title: 'Press Key', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.TYPE_TEXT, - description: - 'Type literal text using keyboard input', - inputSchema: { - type: 'object', - properties: { - app: { - type: 'string', - description: 'App name or bundle identifier', - }, - text: { - type: 'string', - description: 'Literal text to type', - }, - }, - required: ['app', 'text'], - }, - annotations: { - title: 'Type Text', - destructiveHint: false, - openWorldHint: false, - }, - }, - { - name: CU_TOOLS.STATUS, - description: - 'Check Computer Use service status, binary path, and connection health', - inputSchema: { - type: 'object', - properties: {}, - required: [], - }, - annotations: { - title: 'Computer Use Status', - readOnlyHint: true, - idempotentHint: true, - destructiveHint: false, - openWorldHint: false, - }, - }, -]; diff --git a/src/computer-use/discovery.ts b/src/computer-use/discovery.ts deleted file mode 100644 index 76d3759..0000000 --- a/src/computer-use/discovery.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { existsSync } from 'node:fs'; -import { resolve, join } from 'node:path'; -import { type BinaryInfo } from './types.js'; - -const CODEX_APP_PATH = - '/Applications/Codex.app/Contents/Resources/plugins/openai-bundled/plugins/computer-use'; -const CODEX_BINARY = join( - CODEX_APP_PATH, - 'Codex Computer Use.app/Contents/SharedSupport/SkyComputerUseClient.app/Contents/MacOS/SkyComputerUseClient' -); - -export function discoverBinary(): BinaryInfo { - // 1. Explicit env var override. 
- const envPath = process.env.CODEX_COMPUTER_USE_BINARY; - if (envPath) { - const resolved = resolve(envPath); - if (!existsSync(resolved)) { - throw new Error( - `CODEX_COMPUTER_USE_BINARY points to non-existent path: ${resolved}` - ); - } - return { path: resolved, type: 'custom', args: ['mcp'] }; - } - - // 2. Codex.app bundled binary (SkyComputerUseClient). - if (existsSync(CODEX_BINARY)) { - return { path: CODEX_BINARY, type: 'codex-app', args: ['mcp'] }; - } - - throw new Error( - 'Computer Use binary not found.\n' + - 'Install Codex.app from https://codex.ai or set CODEX_COMPUTER_USE_BINARY env var.' - ); -} diff --git a/src/computer-use/handlers.ts b/src/computer-use/handlers.ts deleted file mode 100644 index c8b161d..0000000 --- a/src/computer-use/handlers.ts +++ /dev/null @@ -1,93 +0,0 @@ -import { type ToolResult, type ToolHandlerContext } from '../types.js'; -import { ToolExecutionError, ValidationError } from '../errors.js'; -import { ZodError } from 'zod'; -import { - CU_TOOLS, - CU_TO_BINARY, - CU_SCHEMAS, -} from './types.js'; -import { ComputerUseBridge } from './bridge.js'; - -// Default no-op context for handlers that don't need progress. -const defaultContext: ToolHandlerContext = { - sendProgress: async () => {}, -}; - -export class ComputerUseToolHandler { - async execute( - toolName: string, - args: unknown, - context: ToolHandlerContext = defaultContext - ): Promise { - // Status tool works on any platform (reports binary not found). - if (toolName === CU_TOOLS.STATUS) { - return this.handleStatus(); - } - - // Guard: computer-use tools require macOS. - if (process.platform !== 'darwin') { - throw new ToolExecutionError( - toolName, - 'Computer Use tools are only available on macOS' - ); - } - - // Validate tool name. - const binaryToolName = CU_TO_BINARY[toolName]; - if (!binaryToolName) { - throw new Error(`Unknown computer-use tool: ${toolName}`); - } - - try { - // Validate input with Zod. 
- const schema = CU_SCHEMAS[toolName]; - if (!schema) { - throw new Error(`No schema for tool: ${toolName}`); - } - const parsed = schema.parse(args); - - // Ensure bridge is initialized (lazy). - const bridge = ComputerUseBridge.getInstance(); - if (!bridge.isReady()) { - await context.sendProgress('Connecting to Computer Use binary...', 0); - await bridge.initialize(); - } - - const client = bridge.getClient(); - const result = await client.callTool(binaryToolName, parsed as Record); - return result; - } catch (error) { - if (error instanceof ValidationError) throw error; - if (error instanceof ZodError) { - throw new ValidationError(toolName, error.message); - } - throw new ToolExecutionError(toolName, 'Computer Use tool failed', error); - } - } - - private handleStatus(): ToolResult { - const bridge = ComputerUseBridge.getInstance(); - const binaryInfo = bridge.getBinaryInfo(); - const error = bridge.getError(); - - const status = { - connected: bridge.isReady(), - binary: binaryInfo - ? { - path: binaryInfo.path, - type: binaryInfo.type, - } - : null, - error: error?.message || null, - }; - - return { - content: [ - { - type: 'text', - text: JSON.stringify(status, null, 2), - }, - ], - }; - } -} diff --git a/src/computer-use/types.ts b/src/computer-use/types.ts deleted file mode 100644 index a63d753..0000000 --- a/src/computer-use/types.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { z } from 'zod'; - -// Tool names exposed by the Codex computer-use binary. -// Mapped to our prefixed names to avoid collision with Claude Code's -// native computer-use module. 
-export const CU_TOOLS = { - LIST_APPS: 'cu_list_apps', - GET_APP_STATE: 'cu_get_app_state', - CLICK: 'cu_click', - PERFORM_SECONDARY_ACTION: 'cu_perform_secondary_action', - SET_VALUE: 'cu_set_value', - SCROLL: 'cu_scroll', - DRAG: 'cu_drag', - PRESS_KEY: 'cu_press_key', - TYPE_TEXT: 'cu_type_text', - STATUS: 'cu_status', -} as const; - -// Internal binary tool names (no prefix) for proxying to the subprocess. -export const BINARY_TOOLS = { - LIST_APPS: 'list_apps', - GET_APP_STATE: 'get_app_state', - CLICK: 'click', - PERFORM_SECONDARY_ACTION: 'perform_secondary_action', - SET_VALUE: 'set_value', - SCROLL: 'scroll', - DRAG: 'drag', - PRESS_KEY: 'press_key', - TYPE_TEXT: 'type_text', -} as const; - -// Map from our prefixed name → binary tool name. -export const CU_TO_BINARY: Record = { - [CU_TOOLS.LIST_APPS]: BINARY_TOOLS.LIST_APPS, - [CU_TOOLS.GET_APP_STATE]: BINARY_TOOLS.GET_APP_STATE, - [CU_TOOLS.CLICK]: BINARY_TOOLS.CLICK, - [CU_TOOLS.PERFORM_SECONDARY_ACTION]: BINARY_TOOLS.PERFORM_SECONDARY_ACTION, - [CU_TOOLS.SET_VALUE]: BINARY_TOOLS.SET_VALUE, - [CU_TOOLS.SCROLL]: BINARY_TOOLS.SCROLL, - [CU_TOOLS.DRAG]: BINARY_TOOLS.DRAG, - [CU_TOOLS.PRESS_KEY]: BINARY_TOOLS.PRESS_KEY, - [CU_TOOLS.TYPE_TEXT]: BINARY_TOOLS.TYPE_TEXT, -}; - -// Zod schemas matching the binary's exact input schemas (from tools/list). 
-export const ListAppsSchema = z.object({}); - -export const GetAppStateSchema = z.object({ - app: z.string().min(1, 'App name is required'), -}); - -export const ClickSchema = z.object({ - app: z.string().min(1, 'App name is required'), - element_index: z.string().optional(), - x: z.number().optional(), - y: z.number().optional(), - mouse_button: z.enum(['left', 'right', 'middle']).optional(), - click_count: z.int().optional(), -}); - -export const PerformSecondaryActionSchema = z.object({ - app: z.string().min(1, 'App name is required'), - element_index: z.string().min(1, 'Element index is required'), - action: z.string().min(1, 'Action name is required'), -}); - -export const SetValueSchema = z.object({ - app: z.string().min(1, 'App name is required'), - element_index: z.string().min(1, 'Element index is required'), - value: z.string().min(1, 'Value is required'), -}); - -export const ScrollSchema = z.object({ - app: z.string().min(1, 'App name is required'), - element_index: z.string().min(1, 'Element index is required'), - direction: z.string().min(1, 'Scroll direction is required'), - pages: z.int().optional(), -}); - -export const DragSchema = z.object({ - app: z.string().min(1, 'App name is required'), - from_x: z.number(), - from_y: z.number(), - to_x: z.number(), - to_y: z.number(), -}); - -export const PressKeySchema = z.object({ - app: z.string().min(1, 'App name is required'), - key: z.string().min(1, 'Key is required'), -}); - -export const TypeTextSchema = z.object({ - app: z.string().min(1, 'App name is required'), - text: z.string().min(1, 'Text is required'), -}); - -export const CuStatusSchema = z.object({}); - -// Map tool name → schema for runtime validation. 
-export const CU_SCHEMAS: Record = { - [CU_TOOLS.LIST_APPS]: ListAppsSchema, - [CU_TOOLS.GET_APP_STATE]: GetAppStateSchema, - [CU_TOOLS.CLICK]: ClickSchema, - [CU_TOOLS.PERFORM_SECONDARY_ACTION]: PerformSecondaryActionSchema, - [CU_TOOLS.SET_VALUE]: SetValueSchema, - [CU_TOOLS.SCROLL]: ScrollSchema, - [CU_TOOLS.DRAG]: DragSchema, - [CU_TOOLS.PRESS_KEY]: PressKeySchema, - [CU_TOOLS.TYPE_TEXT]: TypeTextSchema, - [CU_TOOLS.STATUS]: CuStatusSchema, -}; - -// Binary discovery info. -export interface BinaryInfo { - path: string; - type: 'codex-app' | 'custom'; - args: string[]; -} - -// JSON-RPC message types for direct stdio communication. -export interface JsonRpcRequest { - jsonrpc: '2.0'; - id: number; - method: string; - params?: Record; -} - -export interface JsonRpcNotification { - jsonrpc: '2.0'; - method: string; - params?: Record; -} - -export interface JsonRpcResponse { - jsonrpc: '2.0'; - id?: number; - result?: unknown; - error?: { code: number; message: string; data?: unknown }; -} diff --git a/src/index.ts b/src/index.ts index 595de7e..7ebca8a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,6 +2,7 @@ import chalk from 'chalk'; import { CodexMcpServer } from './server.js'; +import { shutdownBrowserSessions } from './tools/handlers.js'; const SERVER_CONFIG = { name: 'codex-mcp-server', @@ -12,6 +13,18 @@ async function main(): Promise { try { const server = new CodexMcpServer(SERVER_CONFIG); await server.start(); + + // Clean up browser sessions on shutdown + const shutdown = async (signal: string) => { + try { + await shutdownBrowserSessions(); + } catch { + // Best-effort cleanup + } + process.exit(signal === 'SIGINT' ? 
130 : 0); + }; + process.on('SIGINT', () => shutdown('SIGINT')); + process.on('SIGTERM', () => shutdown('SIGTERM')); } catch (error) { console.error(chalk.red('Failed to start server:'), error); process.exit(1); diff --git a/src/tools/definitions.ts b/src/tools/definitions.ts index 576ef62..50dd4c5 100644 --- a/src/tools/definitions.ts +++ b/src/tools/definitions.ts @@ -1,5 +1,5 @@ import { TOOLS, getModelDescription, type ToolDefinition } from '../types.js'; -import { computerUseToolDefinitions } from '../computer-use/definitions.js'; +import { browserUseToolDefinitions } from '../browser-use/definitions.js'; export const toolDefinitions: ToolDefinition[] = [ { @@ -206,5 +206,5 @@ export const toolDefinitions: ToolDefinition[] = [ openWorldHint: true, }, }, - ...computerUseToolDefinitions, + ...browserUseToolDefinitions, ]; diff --git a/src/tools/handlers.ts b/src/tools/handlers.ts index 0ab68b4..4185e00 100644 --- a/src/tools/handlers.ts +++ b/src/tools/handlers.ts @@ -24,7 +24,8 @@ import { ToolExecutionError, ValidationError } from '../errors.js'; import { executeCommand, executeCommandStreaming } from '../utils/command.js'; import { ZodError } from 'zod'; import path from 'node:path'; -import { ComputerUseToolHandler } from '../computer-use/handlers.js'; +import { browserUseHandler } from '../browser-use/handlers.js'; +import { bridge } from '../browser-use/bridge.js'; // Default no-op context for handlers that don't need progress const defaultContext: ToolHandlerContext = { @@ -578,13 +579,10 @@ export class WebSearchToolHandler { // Tool handler registry const sessionStorage = new InMemorySessionStorage(); -const computerUseHandler = new ComputerUseToolHandler(); -// Wrap ComputerUseToolHandler to match the execute(args, context) signature -// expected by server.ts. Each wrapper injects the tool name. 
-const cuHandler = (toolName: string) => ({ - execute: (args: unknown, context: ToolHandlerContext) => - computerUseHandler.execute(toolName, args, context), +const browserHandler = (toolName: string) => ({ + execute: (args: unknown, context?: ToolHandlerContext) => + browserUseHandler.execute(toolName, args, context), }); export const toolHandlers = { @@ -594,14 +592,19 @@ export const toolHandlers = { [TOOLS.HELP]: new HelpToolHandler(), [TOOLS.LIST_SESSIONS]: new ListSessionsToolHandler(sessionStorage), [TOOLS.WEBSEARCH]: new WebSearchToolHandler(), - [TOOLS.CU_LIST_APPS]: cuHandler(TOOLS.CU_LIST_APPS), - [TOOLS.CU_GET_APP_STATE]: cuHandler(TOOLS.CU_GET_APP_STATE), - [TOOLS.CU_CLICK]: cuHandler(TOOLS.CU_CLICK), - [TOOLS.CU_PERFORM_SECONDARY_ACTION]: cuHandler(TOOLS.CU_PERFORM_SECONDARY_ACTION), - [TOOLS.CU_SET_VALUE]: cuHandler(TOOLS.CU_SET_VALUE), - [TOOLS.CU_SCROLL]: cuHandler(TOOLS.CU_SCROLL), - [TOOLS.CU_DRAG]: cuHandler(TOOLS.CU_DRAG), - [TOOLS.CU_PRESS_KEY]: cuHandler(TOOLS.CU_PRESS_KEY), - [TOOLS.CU_TYPE_TEXT]: cuHandler(TOOLS.CU_TYPE_TEXT), - [TOOLS.CU_STATUS]: cuHandler(TOOLS.CU_STATUS), + [TOOLS.BROWSER_LAUNCH]: browserHandler(TOOLS.BROWSER_LAUNCH), + [TOOLS.BROWSER_SCREENSHOT]: browserHandler(TOOLS.BROWSER_SCREENSHOT), + [TOOLS.BROWSER_CLICK]: browserHandler(TOOLS.BROWSER_CLICK), + [TOOLS.BROWSER_TYPE]: browserHandler(TOOLS.BROWSER_TYPE), + [TOOLS.BROWSER_SCROLL]: browserHandler(TOOLS.BROWSER_SCROLL), + [TOOLS.BROWSER_DRAG]: browserHandler(TOOLS.BROWSER_DRAG), + [TOOLS.BROWSER_KEY]: browserHandler(TOOLS.BROWSER_KEY), + [TOOLS.BROWSER_NAVIGATE]: browserHandler(TOOLS.BROWSER_NAVIGATE), + [TOOLS.BROWSER_CLOSE]: browserHandler(TOOLS.BROWSER_CLOSE), + [TOOLS.BROWSER_STATUS]: browserHandler(TOOLS.BROWSER_STATUS), }; + +// Export shutdown function for browser cleanup +export async function shutdownBrowserSessions(): Promise { + await bridge.shutdown(); +} diff --git a/src/types.ts b/src/types.ts index b177c1c..323acaa 100644 --- a/src/types.ts +++ 
b/src/types.ts @@ -8,16 +8,16 @@ export const TOOLS = { HELP: 'help', LIST_SESSIONS: 'listSessions', WEBSEARCH: 'websearch', - CU_LIST_APPS: 'cu_list_apps', - CU_GET_APP_STATE: 'cu_get_app_state', - CU_CLICK: 'cu_click', - CU_PERFORM_SECONDARY_ACTION: 'cu_perform_secondary_action', - CU_SET_VALUE: 'cu_set_value', - CU_SCROLL: 'cu_scroll', - CU_DRAG: 'cu_drag', - CU_PRESS_KEY: 'cu_press_key', - CU_TYPE_TEXT: 'cu_type_text', - CU_STATUS: 'cu_status', + BROWSER_LAUNCH: 'browser_launch', + BROWSER_SCREENSHOT: 'browser_screenshot', + BROWSER_CLICK: 'browser_click', + BROWSER_TYPE: 'browser_type', + BROWSER_SCROLL: 'browser_scroll', + BROWSER_DRAG: 'browser_drag', + BROWSER_KEY: 'browser_key', + BROWSER_NAVIGATE: 'browser_navigate', + BROWSER_CLOSE: 'browser_close', + BROWSER_STATUS: 'browser_status', } as const; export type ToolName = typeof TOOLS[keyof typeof TOOLS]; From 0a30c4089dd4f4f095e8863e11bf495bddd11ec5 Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sun, 19 Apr 2026 12:41:55 -0700 Subject: [PATCH 5/6] feat(browser): consolidate to single MCP tool with type-safe handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 3 browser MCP tools (browser, browser_screenshot, browser_interact) with a single `browser` tool dispatching 10 actions via Zod discriminated union. Per-action schema validation gives clean error messages. All handler methods have typed parameter signatures — zero unsafe casts. 
Co-authored-by: Claude --- .gitignore | 8 +- CLAUDE.md | 11 ++ README.md | 10 +- docs/api-reference.md | 125 +++++-------------- docs/browser-use.md | 110 ++++------------- package-lock.json | 50 ++++++++ package.json | 3 + src/__tests__/index.test.ts | 175 +++++++++++++-------------- src/browser-use/bridge.ts | 2 +- src/browser-use/definitions.ts | 213 ++++++++------------------------- src/browser-use/handlers.ts | 190 +++++++++++++---------------- src/browser-use/types.ts | 103 +++++++++------- src/tools/handlers.ts | 16 +-- src/types.ts | 11 +- 14 files changed, 411 insertions(+), 616 deletions(-) diff --git a/.gitignore b/.gitignore index 97590ee..7339791 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,10 @@ temp/ # Jest cache .jest/ -.claude/harness/ +.claude/ + +# Local artifacts +trash/ +.entire/ +test-mcp.mjs +tmux/ diff --git a/CLAUDE.md b/CLAUDE.md index 11ae7fa..188dd4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,6 +41,11 @@ MCP Client (Claude Code) → ToolHandlers (handlers.ts) → executeCommand (command.ts) → Codex CLI + + → BrowserUseToolHandler (handlers.ts) + → BrowserUseBridge (bridge.ts) + → Playwright (peer dependency) + → Chromium instances ``` ### Key Files @@ -55,6 +60,11 @@ MCP Client (Claude Code) | `src/session/storage.ts` | In-memory session storage for conversation context | | `src/utils/command.ts` | Command spawning with streaming support and Windows compatibility | | `src/errors.ts` | Custom error classes (ValidationError, ToolExecutionError) | +| `src/browser-use/bridge.ts` | Singleton browser session manager (lazy Playwright init) | +| `src/browser-use/client.ts` | Playwright session operations (launch, screenshot, click, etc.) 
| +| `src/browser-use/handlers.ts` | Browser tool handler — dispatches on `action` parameter | +| `src/browser-use/definitions.ts` | Browser tool MCP schema and annotations | +| `src/browser-use/types.ts` | Zod schemas for browser actions, key normalization | ### MCP Tools @@ -64,6 +74,7 @@ MCP Client (Claude Code) - **listSessions**: View active conversation sessions - **ping**: Test server connection - **help**: Get Codex CLI help +- **browser**: Playwright-based browser automation via a single `action` parameter (`open`, `screenshot`, `navigate`, `click`, `type`, `key`, `scroll`, `drag`, `close`, `status`). Playwright is a peer dependency — lazy-loaded on first use. ### Session Management diff --git a/README.md b/README.md index f72e254..6dff126 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Use review to check my uncommitted changes | `codex` | AI coding assistant with session support, model selection, and structured output metadata | | `review` | AI-powered code review for uncommitted changes, branches, or commits | | `websearch` | Web search using Codex CLI with customizable result count and search depth | -| `browser_*` | 10 browser automation tools — launch, screenshot, click, type, scroll, drag, navigate ([setup](docs/browser-use.md)) | +| `browser` | Browser automation via Playwright — open, screenshot, click, type, scroll, drag, navigate ([setup](docs/browser-use.md)) | | `listSessions` | View active conversation sessions | | `ping` | Test server connection | | `help` | Get Codex CLI help | @@ -99,10 +99,10 @@ Use websearch with query "TypeScript 5.8 new features" **Browser Automation:** ``` -Use browser_status to check if Playwright is available -Use browser_launch with sessionId "web1" and url "https://example.com" -Use browser_screenshot with sessionId "web1" to see the page -Use browser_click with sessionId "web1", x 100, and y 200 +Use browser with action "status" to check if Playwright is available +Use browser with action "open", 
sessionId "web1", and url "https://example.com" +Use browser with action "screenshot" and sessionId "web1" to see the page +Use browser with action "click", sessionId "web1", x 100, and y 200 ``` ## Requirements diff --git a/docs/api-reference.md b/docs/api-reference.md index e6bdd23..6f0ecc8 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -52,16 +52,7 @@ All tools include annotations that provide hints to MCP clients about tool behav | `ping` | Ping Server | `true` | `false` | `true` | `false` | | `help` | Get Help | `true` | `false` | `true` | `false` | | `listSessions` | List Sessions | `true` | `false` | `true` | `false` | -| `browser_launch` | Launch Browser | `false` | `false` | `false` | `true` | -| `browser_screenshot` | Browser Screenshot | `true` | `false` | `true` | `false` | -| `browser_click` | Browser Click | `false` | `false` | `false` | `false` | -| `browser_type` | Browser Type | `false` | `false` | `false` | `false` | -| `browser_scroll` | Browser Scroll | `false` | `false` | `false` | `false` | -| `browser_drag` | Browser Drag | `false` | `false` | `false` | `false` | -| `browser_key` | Browser Key Press | `false` | `false` | `false` | `false` | -| `browser_navigate` | Browser Navigate | `false` | `false` | `false` | `true` | -| `browser_close` | Close Browser | `false` | `true` | `true` | `false` | -| `browser_status` | Browser Status | `true` | `false` | `true` | `false` | +| `browser` | Browser Control | `false` | `true` | `false` | `true` | ### Progress Notifications For long-running operations, the server sends `notifications/progress` messages when the client includes a `progressToken` in the request `_meta`. @@ -452,95 +443,35 @@ Optional: - **Session Limits**: Configurable in server implementation (default: 100) - **TTL Settings**: Configurable session expiration (default: 24 hours) -## Browser Use Tools +## Browser Use Cross-platform browser automation via Playwright. 
See [Browser Use](browser-use.md) for setup instructions. -### `browser_status` — Health Check +### `browser` — Browser Control -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true` - -No parameters. Returns Playwright availability, active sessions, and any error. - -### `browser_launch` — Launch Browser - -**Annotations:** `readOnlyHint: false`, `destructiveHint: false`, `idempotentHint: false`, `openWorldHint: true` - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `sessionId` | string | yes | - | Unique session identifier | -| `url` | string | no | - | URL to navigate to on launch | -| `headless` | boolean | no | `true` | Run without visible window | -| `viewportWidth` | integer | no | `1440` | Viewport width in pixels | -| `viewportHeight` | integer | no | `900` | Viewport height in pixels | - -### `browser_screenshot` — Take Screenshot - -**Annotations:** `readOnlyHint: true`, `destructiveHint: false`, `idempotentHint: true` - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID | - -Returns base64 PNG image, page URL, and page title. 
- -### `browser_click` — Click - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `sessionId` | string | yes | - | Browser session ID | -| `x` | number | yes | - | X coordinate (viewport-relative) | -| `y` | number | yes | - | Y coordinate (viewport-relative) | -| `button` | enum | no | `left` | `left`, `right`, or `middle` | -| `clickCount` | integer | no | `1` | Number of clicks | - -### `browser_type` — Type Text - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID | -| `text` | string | yes | Text to type into focused element | - -### `browser_scroll` — Scroll - -| Parameter | Type | Required | Default | Description | -|-----------|------|----------|---------|-------------| -| `sessionId` | string | yes | - | Browser session ID | -| `direction` | enum | yes | - | `up`, `down`, `left`, or `right` | -| `amount` | integer | no | `300` | Scroll amount in pixels | - -### `browser_drag` — Drag - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID | -| `fromX` | number | yes | Start X coordinate | -| `fromY` | number | yes | Start Y coordinate | -| `toX` | number | yes | End X coordinate | -| `toY` | number | yes | End Y coordinate | - -### `browser_key` — Key Press - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID | -| `key` | string | yes | Key or combo (`Enter`, `Control+a`, `Meta+s`) | - -Modifier keys are auto-normalized: `Cmd`/`Command` → `Meta`, `Ctrl` → `Control`, `Opt`/`Option` → `Alt`. 
- -### `browser_navigate` — Navigate - -**Annotations:** `readOnlyHint: false`, `destructiveHint: false`, `idempotentHint: false`, `openWorldHint: true` - -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID | -| `url` | string | yes | URL to navigate to | - -### `browser_close` — Close Session - -**Annotations:** `readOnlyHint: false`, `destructiveHint: true`, `idempotentHint: true` +**Annotations:** `readOnlyHint: false`, `destructiveHint: true`, `idempotentHint: false`, `openWorldHint: true` -| Parameter | Type | Required | Description | -|-----------|------|----------|-------------| -| `sessionId` | string | yes | Browser session ID to close | \ No newline at end of file +A single tool for all browser operations, selected via the `action` parameter. + +| Parameter | Type | Required | Action | Default | Description | +|-----------|------|----------|--------|---------|-------------| +| `action` | enum | yes | all | — | `open`, `screenshot`, `navigate`, `click`, `type`, `key`, `scroll`, `drag`, `close`, `status` | +| `sessionId` | string | see below | all except `status` | — | Browser session ID | +| `url` | string | no | `open`, `navigate` | — | URL to open or navigate to | +| `headless` | boolean | no | `open` | `true` | Run without visible window | +| `viewportWidth` | integer | no | `open` | `1440` | Viewport width in pixels | +| `viewportHeight` | integer | no | `open` | `900` | Viewport height in pixels | +| `x` | number | no | `click` | — | X coordinate (viewport-relative) | +| `y` | number | no | `click` | — | Y coordinate (viewport-relative) | +| `button` | enum | no | `click` | `left` | `left`, `right`, or `middle` | +| `clickCount` | integer | no | `click` | `1` | Number of clicks | +| `text` | string | no | `type` | — | Text to type into focused element | +| `key` | string | no | `key` | — | Key or combo (`Enter`, `Control+a`, `Meta+s`) | +| `direction` | enum | 
no | `scroll` | — | `up`, `down`, `left`, or `right` | +| `amount` | integer | no | `scroll` | `300` | Scroll amount in pixels | +| `fromX` | number | no | `drag` | — | Start X coordinate | +| `fromY` | number | no | `drag` | — | Start Y coordinate | +| `toX` | number | no | `drag` | — | End X coordinate | +| `toY` | number | no | `drag` | — | End Y coordinate | + +Modifier keys are auto-normalized: `Cmd`/`Command` → `Meta`, `Ctrl` → `Control`, `Opt`/`Option` → `Alt`. \ No newline at end of file diff --git a/docs/browser-use.md b/docs/browser-use.md index 1b45e87..789f36d 100644 --- a/docs/browser-use.md +++ b/docs/browser-use.md @@ -1,6 +1,6 @@ # Browser Use -Playwright-based browser automation. Launch a real Chromium browser, take screenshots, click, type, scroll, drag, and navigate — all via MCP tools. +Playwright-based browser automation. Launch a real Chromium browser, take screenshots, click, type, scroll, drag, and navigate — all via a single MCP tool with an `action` parameter. ## Architecture @@ -21,96 +21,36 @@ npm install playwright npx playwright install chromium ``` -Playwright is a **peer dependency** — install it separately. The server works fine without it; browser tools will return a helpful error if Playwright isn't available. +Playwright is a **peer dependency** — install it separately. The server works fine without it; the browser tool will return a helpful error if Playwright isn't available. -## Tools +## Tool: `browser` -### `browser_status` — Health Check +A single tool for all browser operations. Every call requires an `action` parameter. -Works even without Playwright installed. Returns availability, active sessions, and any error. +### Session Lifecycle -```json -{ "sessionId": "my-session" } -``` - -### `browser_launch` — Launch Browser +| Action | Example | Description | +|--------|---------|-------------| +| `open` | `{ "action": "open", "sessionId": "s1", "url": "https://example.com" }` | Launch a browser session. 
Optional: `url`, `headless` (default true), `viewportWidth` (default 1440), `viewportHeight` (default 900) | +| `status` | `{ "action": "status" }` | Check if Playwright is available and list active sessions | +| `close` | `{ "action": "close", "sessionId": "s1" }` | Close a session and clean up resources | -Creates a new browser session. Supports multiple concurrent sessions with unique IDs. +### Navigation & Capture -```json -{ - "sessionId": "my-session", - "url": "https://example.com", - "headless": true, - "viewportWidth": 1440, - "viewportHeight": 900 -} -``` +| Action | Example | Description | +|--------|---------|-------------| +| `navigate` | `{ "action": "navigate", "sessionId": "s1", "url": "https://example.com/page2" }` | Navigate to a URL | +| `screenshot` | `{ "action": "screenshot", "sessionId": "s1" }` | Take a screenshot. Returns base64 PNG image data + page URL + title | -### `browser_screenshot` — Take Screenshot +### Input Actions -Returns base64 PNG image data along with the current page URL and title. - -```json -{ "sessionId": "my-session" } -``` - -### `browser_click` — Click - -Click at viewport-relative pixel coordinates. - -```json -{ "sessionId": "my-session", "x": 100, "y": 200, "button": "left", "clickCount": 1 } -``` - -### `browser_type` — Type Text - -Type literal text into the currently focused element. Click on an input field first. - -```json -{ "sessionId": "my-session", "text": "hello world" } -``` - -### `browser_scroll` — Scroll - -Scroll the page in a direction by a pixel amount. - -```json -{ "sessionId": "my-session", "direction": "down", "amount": 300 } -``` - -### `browser_drag` — Drag - -Drag from one coordinate to another (viewport-relative). - -```json -{ "sessionId": "my-session", "fromX": 100, "fromY": 100, "toX": 300, "toY": 300 } -``` - -### `browser_key` — Key Press - -Press a key or key combination. Supports Playwright key names. 
Modifier keys are auto-normalized: `Cmd` → `Meta`, `Ctrl` → `Control`, `Opt` → `Alt`, etc. - -```json -{ "sessionId": "my-session", "key": "Control+a" } -{ "sessionId": "my-session", "key": "Cmd+s" } -``` - -### `browser_navigate` — Navigate - -Go to a URL in the current page. - -```json -{ "sessionId": "my-session", "url": "https://example.com" } -``` - -### `browser_close` — Close Session - -Close a browser session and clean up resources. - -```json -{ "sessionId": "my-session" } -``` +| Action | Example | Description | +|--------|---------|-------------| +| `click` | `{ "action": "click", "sessionId": "s1", "x": 100, "y": 200 }` | Click at viewport coordinates. Optional: `button` (left/right/middle), `clickCount` | +| `type` | `{ "action": "type", "sessionId": "s1", "text": "hello world" }` | Type text into the focused element | +| `key` | `{ "action": "key", "sessionId": "s1", "key": "Control+a" }` | Press a key or combo. Modifier keys auto-normalized (`Cmd`→`Meta`, `Ctrl`→`Control`) | +| `scroll` | `{ "action": "scroll", "sessionId": "s1", "direction": "down", "amount": 300 }` | Scroll page: `up`/`down`/`left`/`right` by pixel amount | +| `drag` | `{ "action": "drag", "sessionId": "s1", "fromX": 0, "fromY": 0, "toX": 100, "toY": 100 }` | Drag from one coordinate to another (viewport-relative) | ## Troubleshooting @@ -118,10 +58,10 @@ Close a browser session and clean up resources. Install it: `npm install playwright && npx playwright install chromium` **"Session already exists"** -Use a different sessionId or close the existing session first with `browser_close`. +Use a different sessionId or close the existing session first. **"No active browser session"** -You must call `browser_launch` before using other browser tools. +You must open a session first, e.g. `{ "action": "open", "sessionId": "s1" }`, before any other action on it (only `status` works without a session). **Screenshots not loading** Check that the client supports `image` content type in MCP tool results. Claude Code supports this natively. 
diff --git a/package-lock.json b/package-lock.json index c7ab951..7415e85 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,9 @@ "ts-jest": "^29.4.1", "tsx": "^4.20.4", "typescript": "^5.9.2" + }, + "peerDependencies": { + "playwright": "^1.59.1" } }, "node_modules/@ampproject/remapping": { @@ -6725,6 +6728,53 @@ "node": ">=8" } }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "license": "Apache-2.0", + "peer": true, + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "license": "Apache-2.0", + "peer": true, + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/playwright/node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "peer": true, + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", diff --git a/package.json b/package.json index 7b342e3..1dccb1c 100644 --- a/package.json +++ b/package.json @@ -60,5 +60,8 @@ "ts-jest": "^29.4.1", "tsx": "^4.20.4", "typescript": "^5.9.2" + }, + 
"peerDependencies": { + "playwright": "^1.59.1" } } diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index e63bb09..1a8eab6 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -68,8 +68,8 @@ import { bridge } from '../browser-use/bridge.js'; describe('Codex MCP Server', () => { describe('Tool Definitions', () => { test('should have all required tools defined', () => { - // 6 core tools + 10 browser tools = 16 - expect(toolDefinitions).toHaveLength(16); + // 6 core tools + 1 browser tool = 7 + expect(toolDefinitions).toHaveLength(7); const toolNames = toolDefinitions.map((tool) => tool.name); expect(toolNames).toContain(TOOLS.CODEX); @@ -80,18 +80,24 @@ describe('Codex MCP Server', () => { expect(toolNames).toContain(TOOLS.LIST_SESSIONS); }); - test('should have all browser tools defined', () => { + test('should have browser tool defined', () => { const toolNames = toolDefinitions.map((tool) => tool.name); - expect(toolNames).toContain(TOOLS.BROWSER_LAUNCH); - expect(toolNames).toContain(TOOLS.BROWSER_SCREENSHOT); - expect(toolNames).toContain(TOOLS.BROWSER_CLICK); - expect(toolNames).toContain(TOOLS.BROWSER_TYPE); - expect(toolNames).toContain(TOOLS.BROWSER_SCROLL); - expect(toolNames).toContain(TOOLS.BROWSER_DRAG); - expect(toolNames).toContain(TOOLS.BROWSER_KEY); - expect(toolNames).toContain(TOOLS.BROWSER_NAVIGATE); - expect(toolNames).toContain(TOOLS.BROWSER_CLOSE); - expect(toolNames).toContain(TOOLS.BROWSER_STATUS); + expect(toolNames).toContain(TOOLS.BROWSER); + }); + + test('browser tool should require action parameter', () => { + const browserTool = toolDefinitions.find((tool) => tool.name === TOOLS.BROWSER); + expect(browserTool).toBeDefined(); + expect(browserTool?.inputSchema.required).toContain('action'); + expect(browserTool?.inputSchema.required).not.toContain('sessionId'); + }); + + test('browser tool should include all action types in enum', () => { + const browserTool = toolDefinitions.find((tool) => 
tool.name === TOOLS.BROWSER); + const actionEnum = browserTool?.inputSchema.properties.action as { enum: string[] }; + expect(actionEnum.enum).toEqual( + expect.arrayContaining(['open', 'screenshot', 'navigate', 'click', 'type', 'key', 'scroll', 'drag', 'close', 'status']) + ); }); test('codex tool should define output schema', () => { @@ -124,23 +130,6 @@ describe('Codex MCP Server', () => { expect(helpTool?.inputSchema.required).toEqual([]); expect(helpTool?.description).toContain('Get Codex CLI help'); }); - - test('browser tools should have sessionId required', () => { - const browserTools = toolDefinitions.filter((t) => - t.name.startsWith('browser_') && t.name !== 'browser_status' - ); - for (const tool of browserTools) { - expect(tool.inputSchema.required).toContain('sessionId'); - } - }); - - test('browser_status should have no required parameters', () => { - const statusTool = toolDefinitions.find( - (tool) => tool.name === TOOLS.BROWSER_STATUS - ); - expect(statusTool).toBeDefined(); - expect(statusTool?.inputSchema.required).toEqual([]); - }); }); describe('Tool Handlers', () => { @@ -157,17 +146,8 @@ describe('Codex MCP Server', () => { ); }); - test('should have handlers for all browser tools', () => { - expect(toolHandlers[TOOLS.BROWSER_LAUNCH]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_SCREENSHOT]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_CLICK]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_TYPE]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_SCROLL]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_DRAG]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_KEY]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_NAVIGATE]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_CLOSE]).toBeDefined(); - expect(toolHandlers[TOOLS.BROWSER_STATUS]).toBeDefined(); + test('should have handler for browser tool', () => { + expect(toolHandlers[TOOLS.BROWSER]).toBeDefined(); }); test('ping handler should return message', async () => 
{ @@ -215,15 +195,15 @@ describe('Codex MCP Server', () => { }); }); - describe('Browser Tool Handlers', () => { + describe('Browser Tool Handler', () => { afterEach(async () => { await bridge.shutdown(); }); - test('browser_status should return status JSON', async () => { - const handler = toolHandlers[TOOLS.BROWSER_STATUS]; - const result = await handler.execute({}); + const handler = () => toolHandlers[TOOLS.BROWSER]; + test('action=status should return status JSON', async () => { + const result = await handler().execute({ action: 'status' }); expect(result.content[0].type).toBe('text'); const status = JSON.parse(result.content[0].text); expect(status).toHaveProperty('available'); @@ -231,77 +211,84 @@ describe('Codex MCP Server', () => { expect(status).toHaveProperty('sessionIds'); }); - test('browser_launch should create a session', async () => { - const handler = toolHandlers[TOOLS.BROWSER_LAUNCH]; - const result = await handler.execute({ sessionId: 'test-session-1' }); - - expect(result.content[0].type).toBe('text'); - expect(result.content[0].text).toContain('test-session-1'); - expect(result.content[0].text).toContain('launched successfully'); - - // Clean up - const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; - await closeHandler.execute({ sessionId: 'test-session-1' }); + test('action=open should create a session', async () => { + const result = await handler().execute({ action: 'open', sessionId: 'test-1' }); + expect(result.content[0].text).toContain('test-1'); + expect(result.content[0].text).toContain('opened'); + await handler().execute({ action: 'close', sessionId: 'test-1' }); }); - test('browser_close should close a session', async () => { - // First launch - const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; - await launchHandler.execute({ sessionId: 'test-session-close' }); - - // Then close - const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; - const result = await closeHandler.execute({ sessionId: 'test-session-close' }); + 
test('action=open with url should navigate on launch', async () => { + const result = await handler().execute({ action: 'open', sessionId: 'test-url', url: 'https://example.com' }); + expect(result.content[0].text).toContain('test-url'); + await handler().execute({ action: 'close', sessionId: 'test-url' }); + }); - expect(result.content[0].text).toContain('test-session-close'); + test('action=close should close a session', async () => { + await handler().execute({ action: 'open', sessionId: 'test-close' }); + const result = await handler().execute({ action: 'close', sessionId: 'test-close' }); + expect(result.content[0].text).toContain('test-close'); expect(result.content[0].text).toContain('closed'); }); - test('browser_screenshot should return image data', async () => { - const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; - await launchHandler.execute({ sessionId: 'test-screenshot' }); - - const screenshotHandler = toolHandlers[TOOLS.BROWSER_SCREENSHOT]; - const result = await screenshotHandler.execute({ sessionId: 'test-screenshot' }); + test('action=navigate should go to URL', async () => { + await handler().execute({ action: 'open', sessionId: 'test-nav' }); + const result = await handler().execute({ action: 'navigate', sessionId: 'test-nav', url: 'https://example.com' }); + expect(result.content[0].text).toContain('Navigated'); + await handler().execute({ action: 'close', sessionId: 'test-nav' }); + }); + test('action=screenshot should return image data', async () => { + await handler().execute({ action: 'open', sessionId: 'test-ss' }); + const result = await handler().execute({ action: 'screenshot', sessionId: 'test-ss' }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('image'); expect(result.content[0].data).toBeDefined(); expect(result.content[0].mimeType).toBe('image/png'); expect(result.content[1].type).toBe('text'); - - // Clean up - const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; - await 
closeHandler.execute({ sessionId: 'test-screenshot' }); + await handler().execute({ action: 'close', sessionId: 'test-ss' }); }); - test('browser_click should return confirmation', async () => { - const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; - await launchHandler.execute({ sessionId: 'test-click' }); - - const clickHandler = toolHandlers[TOOLS.BROWSER_CLICK]; - const result = await clickHandler.execute({ sessionId: 'test-click', x: 100, y: 200 }); - + test('action=click should return confirmation', async () => { + await handler().execute({ action: 'open', sessionId: 'test-click' }); + const result = await handler().execute({ action: 'click', sessionId: 'test-click', x: 100, y: 200 }); expect(result.content[0].text).toContain('100'); expect(result.content[0].text).toContain('200'); + await handler().execute({ action: 'close', sessionId: 'test-click' }); + }); - // Clean up - const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; - await closeHandler.execute({ sessionId: 'test-click' }); + test('action=type should type text', async () => { + await handler().execute({ action: 'open', sessionId: 'test-type' }); + const result = await handler().execute({ action: 'type', sessionId: 'test-type', text: 'hello' }); + expect(result.content[0].text).toContain('Typed'); + expect(result.content[0].text).toContain('hello'); + await handler().execute({ action: 'close', sessionId: 'test-type' }); }); - test('browser_key should normalize key names', async () => { - const launchHandler = toolHandlers[TOOLS.BROWSER_LAUNCH]; - await launchHandler.execute({ sessionId: 'test-key' }); + test('action=key should normalize key names', async () => { + await handler().execute({ action: 'open', sessionId: 'test-key' }); + const result = await handler().execute({ action: 'key', sessionId: 'test-key', key: 'Cmd+s' }); + expect(result.content[0].text).toContain('Cmd+s'); + await handler().execute({ action: 'close', sessionId: 'test-key' }); + }); - const keyHandler = 
toolHandlers[TOOLS.BROWSER_KEY]; - const result = await keyHandler.execute({ sessionId: 'test-key', key: 'Cmd+s' }); + test('action=scroll should scroll page', async () => { + await handler().execute({ action: 'open', sessionId: 'test-scroll' }); + const result = await handler().execute({ action: 'scroll', sessionId: 'test-scroll', direction: 'down', amount: 500 }); + expect(result.content[0].text).toContain('down'); + expect(result.content[0].text).toContain('500'); + await handler().execute({ action: 'close', sessionId: 'test-scroll' }); + }); - expect(result.content[0].text).toContain('Cmd+s'); + test('action=drag should drag between coordinates', async () => { + await handler().execute({ action: 'open', sessionId: 'test-drag' }); + const result = await handler().execute({ action: 'drag', sessionId: 'test-drag', fromX: 0, fromY: 0, toX: 100, toY: 100 }); + expect(result.content[0].text).toContain('Dragged'); + await handler().execute({ action: 'close', sessionId: 'test-drag' }); + }); - // Clean up - const closeHandler = toolHandlers[TOOLS.BROWSER_CLOSE]; - await closeHandler.execute({ sessionId: 'test-key' }); + test('should reject invalid action', async () => { + await expect(handler().execute({ action: 'invalid' })).rejects.toThrow('Validation failed'); }); }); diff --git a/src/browser-use/bridge.ts b/src/browser-use/bridge.ts index ec3c9b3..4028c83 100644 --- a/src/browser-use/bridge.ts +++ b/src/browser-use/bridge.ts @@ -45,7 +45,7 @@ class BrowserUseBridge { getSession(sessionId: string): BrowserSession { const session = this.sessions.get(sessionId); if (!session) { - throw new Error(`No active browser session with ID "${sessionId}". Use browser_launch to create one.`); + throw new Error(`No active browser session with ID "${sessionId}". 
Use browser with action "open" to create one.`); } return session; } diff --git a/src/browser-use/definitions.ts b/src/browser-use/definitions.ts index bacc411..f1812a5 100644 --- a/src/browser-use/definitions.ts +++ b/src/browser-use/definitions.ts @@ -1,162 +1,55 @@ import { TOOLS, type ToolDefinition } from '../types.js'; -import { BROWSER_TOOLS } from './types.js'; -const browserLaunch: ToolDefinition = { - name: TOOLS.BROWSER_LAUNCH, - description: 'Launch a new browser session. Returns a sessionId for use with other browser tools. Supports multiple concurrent sessions.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Unique identifier for this browser session' }, - url: { type: 'string', description: 'Optional URL to navigate to on launch' }, - headless: { type: 'boolean', description: 'Run in headless mode (default: true)' }, - viewportWidth: { type: 'integer', description: 'Viewport width in pixels (default: 1440)' }, - viewportHeight: { type: 'integer', description: 'Viewport height in pixels (default: 900)' }, - }, - required: ['sessionId'], - }, - annotations: { title: 'Launch Browser', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true }, -}; - -const browserScreenshot: ToolDefinition = { - name: TOOLS.BROWSER_SCREENSHOT, - description: 'Take a screenshot of the current browser page. Returns base64 PNG image data along with the page URL and title.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - }, - required: ['sessionId'], - }, - annotations: { title: 'Browser Screenshot', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, -}; - -const browserClick: ToolDefinition = { - name: TOOLS.BROWSER_CLICK, - description: 'Click at a specific pixel coordinate on the browser page. 
Coordinates are relative to the viewport.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - x: { type: 'number', description: 'X coordinate in pixels (viewport-relative)' }, - y: { type: 'number', description: 'Y coordinate in pixels (viewport-relative)' }, - button: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Mouse button (default: left)' }, - clickCount: { type: 'integer', description: 'Number of clicks (default: 1, use 2 for double-click)' }, - }, - required: ['sessionId', 'x', 'y'], - }, - annotations: { title: 'Browser Click', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, -}; - -const browserType: ToolDefinition = { - name: TOOLS.BROWSER_TYPE, - description: 'Type text into the currently focused element in the browser. Make sure to click on an input field first.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - text: { type: 'string', description: 'Text to type' }, - }, - required: ['sessionId', 'text'], - }, - annotations: { title: 'Browser Type', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, -}; - -const browserScroll: ToolDefinition = { - name: TOOLS.BROWSER_SCROLL, - description: 'Scroll the browser page in a given direction by a specified pixel amount.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Scroll direction' }, - amount: { type: 'integer', description: 'Scroll amount in pixels (default: 300)' }, - }, - required: ['sessionId', 'direction'], - }, - annotations: { title: 'Browser Scroll', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, -}; - -const browserDrag: ToolDefinition = { - name: 
TOOLS.BROWSER_DRAG, - description: 'Drag from one pixel coordinate to another on the browser page. Coordinates are viewport-relative.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - fromX: { type: 'number', description: 'Start X coordinate' }, - fromY: { type: 'number', description: 'Start Y coordinate' }, - toX: { type: 'number', description: 'End X coordinate' }, - toY: { type: 'number', description: 'End Y coordinate' }, - }, - required: ['sessionId', 'fromX', 'fromY', 'toX', 'toY'], - }, - annotations: { title: 'Browser Drag', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, -}; - -const browserKey: ToolDefinition = { - name: TOOLS.BROWSER_KEY, - description: 'Press a key or key combination in the browser. Supports Playwright key names (e.g., "Enter", "Control+a", "Meta+s").', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - key: { type: 'string', description: 'Key or key combination to press' }, - }, - required: ['sessionId', 'key'], - }, - annotations: { title: 'Browser Key Press', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: false }, -}; - -const browserNavigate: ToolDefinition = { - name: TOOLS.BROWSER_NAVIGATE, - description: 'Navigate the browser to a URL.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', description: 'Browser session ID' }, - url: { type: 'string', description: 'URL to navigate to' }, - }, - required: ['sessionId', 'url'], - }, - annotations: { title: 'Browser Navigate', readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true }, -}; - -const browserClose: ToolDefinition = { - name: TOOLS.BROWSER_CLOSE, - description: 'Close a browser session and clean up resources.', - inputSchema: { - type: 'object', - properties: { - sessionId: { type: 'string', 
description: 'Browser session ID to close' }, - }, - required: ['sessionId'], - }, - annotations: { title: 'Close Browser', readOnlyHint: false, destructiveHint: true, idempotentHint: true, openWorldHint: false }, -}; - -const browserStatus: ToolDefinition = { - name: TOOLS.BROWSER_STATUS, - description: 'Check browser automation status. Returns whether Playwright is available and lists active sessions. Works even without Playwright installed.', - inputSchema: { - type: 'object', - properties: {}, - required: [], - }, - annotations: { title: 'Browser Status', readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, -}; - -export const browserUseToolDefinitions: ToolDefinition[] = [ - browserLaunch, - browserScreenshot, - browserClick, - browserType, - browserScroll, - browserDrag, - browserKey, - browserNavigate, - browserClose, - browserStatus, -]; - -export { BROWSER_TOOLS }; +const browserDef: ToolDefinition = { + name: TOOLS.BROWSER, + description: + 'Control a Chromium browser via Playwright. 
Actions:\n' + + '• "open" — launch a browser session (required: sessionId; optional: url, headless, viewportWidth, viewportHeight)\n' + + '• "screenshot" — capture the page as a PNG image, returns base64 data + URL + title (required: sessionId)\n' + + '• "navigate" — go to a URL in an existing session (required: sessionId, url)\n' + + '• "click" — click at viewport coordinates (required: sessionId, x, y; optional: button, clickCount)\n' + + '• "type" — type text into the focused element (required: sessionId, text)\n' + + '• "key" — press a key or combo like "Enter", "Control+a", "Meta+s" (required: sessionId, key; modifiers auto-normalized)\n' + + '• "scroll" — scroll the page (required: sessionId, direction; optional: amount in pixels)\n' + + '• "drag" — drag from one coordinate to another (required: sessionId, fromX, fromY, toX, toY)\n' + + '• "close" — close a session and free resources (required: sessionId)\n' + + '• "status" — check if Playwright is available and list active sessions (no params)\n' + + 'You must call "open" before any other action. 
Use "screenshot" to see the page after navigation or interaction.', + inputSchema: { + type: 'object', + properties: { + action: { + type: 'string', + enum: ['open', 'screenshot', 'navigate', 'click', 'type', 'key', 'scroll', 'drag', 'close', 'status'], + description: 'The action to perform', + }, + sessionId: { type: 'string', description: 'Browser session ID (required for all actions except "status")' }, + url: { type: 'string', description: 'URL to open or navigate to (for "open" and "navigate")' }, + headless: { type: 'boolean', description: 'Run without visible window (default: true, for "open")' }, + viewportWidth: { type: 'integer', description: 'Viewport width in pixels (default: 1440, for "open")' }, + viewportHeight: { type: 'integer', description: 'Viewport height in pixels (default: 900, for "open")' }, + x: { type: 'number', description: 'X coordinate in pixels, viewport-relative (for "click")' }, + y: { type: 'number', description: 'Y coordinate in pixels, viewport-relative (for "click")' }, + button: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Mouse button (default: left, for "click")' }, + clickCount: { type: 'integer', description: 'Number of clicks (default: 1, for "click")' }, + text: { type: 'string', description: 'Text to type into focused element (for "type")' }, + key: { type: 'string', description: 'Key or combo, e.g. 
"Enter", "Control+a", "Meta+s" (for "key")' }, + direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Scroll direction (for "scroll")' }, + amount: { type: 'integer', description: 'Scroll amount in pixels (default: 300, for "scroll")' }, + fromX: { type: 'number', description: 'Start X coordinate (for "drag")' }, + fromY: { type: 'number', description: 'Start Y coordinate (for "drag")' }, + toX: { type: 'number', description: 'End X coordinate (for "drag")' }, + toY: { type: 'number', description: 'End Y coordinate (for "drag")' }, + }, + required: ['action'], + }, + annotations: { + title: 'Browser Control', + readOnlyHint: false, + destructiveHint: true, + idempotentHint: false, + openWorldHint: true, + }, +}; + +export const browserUseToolDefinitions: ToolDefinition[] = [browserDef]; diff --git a/src/browser-use/handlers.ts b/src/browser-use/handlers.ts index aa71ee2..65561c0 100644 --- a/src/browser-use/handlers.ts +++ b/src/browser-use/handlers.ts @@ -2,130 +2,106 @@ import { TOOLS, type ToolResult, type ToolHandlerContext } from '../types.js'; import { ToolExecutionError, ValidationError } from '../errors.js'; import { ZodError } from 'zod'; import { bridge } from './bridge.js'; -import { BROWSER_SCHEMAS, normalizeKey } from './types.js'; +import { parseBrowserAction, normalizeKey } from './types.js'; const noopContext: ToolHandlerContext = { sendProgress: async () => {} }; export class BrowserUseToolHandler { async execute( - toolName: string, args: unknown, _context: ToolHandlerContext = noopContext ): Promise { try { - const schema = BROWSER_SCHEMAS[toolName]; - if (!schema) { - throw new ValidationError(toolName, `Unknown browser tool: ${toolName}`); - } - const p = schema.parse(args) as Record; + const parsed = parseBrowserAction(args); - switch (toolName) { - case TOOLS.BROWSER_STATUS: { - await bridge.checkAvailability(); - const status = bridge.getStatus(); - return { - content: [ - { type: 'text', text: 
JSON.stringify(status, null, 2) }, - ], - }; - } - case TOOLS.BROWSER_LAUNCH: { - const session = await bridge.launch( - p.sessionId as string, - { - url: p.url as string | undefined, - headless: p.headless as boolean | undefined, - viewportWidth: p.viewportWidth as number | undefined, - viewportHeight: p.viewportHeight as number | undefined, - } - ); - return { - content: [ - { - type: 'text', - text: `Browser session "${session.sessionId}" launched successfully at ${session.createdAt.toISOString()}`, - }, - ], - }; - } - case TOOLS.BROWSER_SCREENSHOT: { - const { image, url, title } = await bridge.screenshot(p.sessionId as string); - return { - content: [ - { type: 'image', text: '', data: image.toString('base64'), mimeType: 'image/png' }, - { type: 'text', text: `URL: ${url}\nTitle: ${title}` }, - ], - }; - } - case TOOLS.BROWSER_CLICK: { - const x = p.x as number; - const y = p.y as number; - await bridge.click(p.sessionId as string, x, y, { - button: p.button as string | undefined, - clickCount: p.clickCount as number | undefined, - }); - return { - content: [{ type: 'text', text: `Clicked at (${x}, ${y}) with ${p.button ?? 
'left'} button` }], - }; - } - case TOOLS.BROWSER_TYPE: { - const text = p.text as string; - await bridge.type(p.sessionId as string, text); - return { - content: [{ type: 'text', text: `Typed "${text}"` }], - }; - } - case TOOLS.BROWSER_SCROLL: { - const direction = p.direction as string; - const amount = p.amount as number; - await bridge.scroll(p.sessionId as string, direction, amount); - return { - content: [{ type: 'text', text: `Scrolled ${direction} by ${amount}px` }], - }; - } - case TOOLS.BROWSER_DRAG: { - const fromX = p.fromX as number; - const fromY = p.fromY as number; - const toX = p.toX as number; - const toY = p.toY as number; - await bridge.drag(p.sessionId as string, fromX, fromY, toX, toY); - return { - content: [{ type: 'text', text: `Dragged from (${fromX}, ${fromY}) to (${toX}, ${toY})` }], - }; - } - case TOOLS.BROWSER_KEY: { - const key = p.key as string; - const normalizedKey = normalizeKey(key); - await bridge.key(p.sessionId as string, normalizedKey); - return { - content: [{ type: 'text', text: `Pressed key: ${key}` }], - }; - } - case TOOLS.BROWSER_NAVIGATE: { - const url = p.url as string; - await bridge.navigate(p.sessionId as string, url); - return { - content: [{ type: 'text', text: `Navigated to ${url}` }], - }; - } - case TOOLS.BROWSER_CLOSE: { - const sessionId = p.sessionId as string; - await bridge.close(sessionId); - return { - content: [{ type: 'text', text: `Session "${sessionId}" closed` }], - }; - } - default: - throw new ValidationError(toolName, `Unknown browser tool: ${toolName}`); + switch (parsed.action) { + case 'open': return this.handleOpen(parsed); + case 'screenshot': return this.handleScreenshot(parsed); + case 'navigate': return this.handleNavigate(parsed); + case 'click': return this.handleClick(parsed); + case 'type': return this.handleType(parsed); + case 'key': return this.handleKey(parsed); + case 'scroll': return this.handleScroll(parsed); + case 'drag': return this.handleDrag(parsed); + case 'close': 
return this.handleClose(parsed); + case 'status': return this.handleStatus(); } } catch (error) { if (error instanceof ValidationError) throw error; if (error instanceof ZodError) { - throw new ValidationError(toolName, error.message); + throw new ValidationError(TOOLS.BROWSER, error.message); } - throw new ToolExecutionError(toolName, 'Browser operation failed', error); + throw new ToolExecutionError(TOOLS.BROWSER, 'Browser operation failed', error); } } + + private async handleOpen(args: { action: 'open'; sessionId: string; url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number }): Promise { + const session = await bridge.launch(args.sessionId, { + url: args.url, + headless: args.headless, + viewportWidth: args.viewportWidth, + viewportHeight: args.viewportHeight, + }); + return { + content: [ + { type: 'text', text: `Session "${session.sessionId}" opened at ${session.createdAt.toISOString()}` }, + ], + }; + } + + private async handleScreenshot(args: { action: 'screenshot'; sessionId: string }): Promise { + const { image, url, title } = await bridge.screenshot(args.sessionId); + return { + content: [ + { type: 'image', text: '', data: image.toString('base64'), mimeType: 'image/png' }, + { type: 'text', text: `URL: ${url}\nTitle: ${title}` }, + ], + }; + } + + private async handleNavigate(args: { action: 'navigate'; sessionId: string; url: string }): Promise { + await bridge.navigate(args.sessionId, args.url); + return { content: [{ type: 'text', text: `Navigated to ${args.url}` }] }; + } + + private async handleClick(args: { action: 'click'; sessionId: string; x: number; y: number; button?: string; clickCount?: number }): Promise { + await bridge.click(args.sessionId, args.x, args.y, { + button: args.button, + clickCount: args.clickCount, + }); + return { content: [{ type: 'text', text: `Clicked at (${args.x}, ${args.y})` }] }; + } + + private async handleType(args: { action: 'type'; sessionId: string; text: string }): Promise { + await 
bridge.type(args.sessionId, args.text); + return { content: [{ type: 'text', text: `Typed "${args.text}"` }] }; + } + + private async handleKey(args: { action: 'key'; sessionId: string; key: string }): Promise { + await bridge.key(args.sessionId, normalizeKey(args.key)); + return { content: [{ type: 'text', text: `Pressed key: ${args.key}` }] }; + } + + private async handleScroll(args: { action: 'scroll'; sessionId: string; direction: string; amount: number }): Promise { + await bridge.scroll(args.sessionId, args.direction, args.amount); + return { content: [{ type: 'text', text: `Scrolled ${args.direction} by ${args.amount}px` }] }; + } + + private async handleDrag(args: { action: 'drag'; sessionId: string; fromX: number; fromY: number; toX: number; toY: number }): Promise { + await bridge.drag(args.sessionId, args.fromX, args.fromY, args.toX, args.toY); + return { content: [{ type: 'text', text: `Dragged from (${args.fromX}, ${args.fromY}) to (${args.toX}, ${args.toY})` }] }; + } + + private async handleClose(args: { action: 'close'; sessionId: string }): Promise { + await bridge.close(args.sessionId); + return { content: [{ type: 'text', text: `Session "${args.sessionId}" closed` }] }; + } + + private async handleStatus(): Promise { + await bridge.checkAvailability(); + const status = bridge.getStatus(); + return { content: [{ type: 'text', text: JSON.stringify(status, null, 2) }] }; + } } export const browserUseHandler = new BrowserUseToolHandler(); diff --git a/src/browser-use/types.ts b/src/browser-use/types.ts index 8591f0f..af0e6db 100644 --- a/src/browser-use/types.ts +++ b/src/browser-use/types.ts @@ -1,19 +1,8 @@ import { z } from 'zod'; -export const BROWSER_TOOLS = { - LAUNCH: 'browser_launch', - SCREENSHOT: 'browser_screenshot', - CLICK: 'browser_click', - TYPE: 'browser_type', - SCROLL: 'browser_scroll', - DRAG: 'browser_drag', - KEY: 'browser_key', - NAVIGATE: 'browser_navigate', - CLOSE: 'browser_close', - STATUS: 'browser_status', -} as const; - 
-export const LaunchSchema = z.object({ +// Per-action schemas (separate for validation reuse and clean errors) +const openSchema = z.object({ + action: z.literal('open'), sessionId: z.string().min(1, 'Session ID is required'), url: z.string().optional(), headless: z.boolean().optional().default(true), @@ -21,11 +10,19 @@ export const LaunchSchema = z.object({ viewportHeight: z.int().optional().default(900), }); -export const ScreenshotSchema = z.object({ +const navigateSchema = z.object({ + action: z.literal('navigate'), sessionId: z.string().min(1, 'Session ID is required'), + url: z.string().min(1, 'URL is required'), }); -export const ClickSchema = z.object({ +const screenshotSchema = z.object({ + action: z.literal('screenshot'), + sessionId: z.string().min(1, 'Session ID is required'), +}); + +const clickSchema = z.object({ + action: z.literal('click'), sessionId: z.string().min(1, 'Session ID is required'), x: z.number(), y: z.number(), @@ -33,18 +30,27 @@ export const ClickSchema = z.object({ clickCount: z.int().optional().default(1), }); -export const TypeSchema = z.object({ +const typeSchema = z.object({ + action: z.literal('type'), sessionId: z.string().min(1, 'Session ID is required'), text: z.string().min(1, 'Text to type is required'), }); -export const ScrollSchema = z.object({ +const keySchema = z.object({ + action: z.literal('key'), + sessionId: z.string().min(1, 'Session ID is required'), + key: z.string().min(1, 'Key is required'), +}); + +const scrollSchema = z.object({ + action: z.literal('scroll'), sessionId: z.string().min(1, 'Session ID is required'), direction: z.enum(['up', 'down', 'left', 'right']), amount: z.int().optional().default(300), }); -export const DragSchema = z.object({ +const dragSchema = z.object({ + action: z.literal('drag'), sessionId: z.string().min(1, 'Session ID is required'), fromX: z.number(), fromY: z.number(), @@ -52,35 +58,51 @@ export const DragSchema = z.object({ toY: z.number(), }); -export const KeySchema = 
z.object({ +const closeSchema = z.object({ + action: z.literal('close'), sessionId: z.string().min(1, 'Session ID is required'), - key: z.string().min(1, 'Key is required'), }); -export const NavigateSchema = z.object({ - sessionId: z.string().min(1, 'Session ID is required'), - url: z.string().min(1, 'URL is required'), +const statusSchema = z.object({ + action: z.literal('status'), }); -export const CloseSchema = z.object({ - sessionId: z.string().min(1, 'Session ID is required'), -}); +// Discriminated union for single-parse validation +export const BrowserActionSchema = z.discriminatedUnion('action', [ + openSchema, navigateSchema, screenshotSchema, clickSchema, + typeSchema, keySchema, scrollSchema, dragSchema, + closeSchema, statusSchema, +]); -export const StatusSchema = z.object({}); - -export const BROWSER_SCHEMAS: Record = { - [BROWSER_TOOLS.LAUNCH]: LaunchSchema, - [BROWSER_TOOLS.SCREENSHOT]: ScreenshotSchema, - [BROWSER_TOOLS.CLICK]: ClickSchema, - [BROWSER_TOOLS.TYPE]: TypeSchema, - [BROWSER_TOOLS.SCROLL]: ScrollSchema, - [BROWSER_TOOLS.DRAG]: DragSchema, - [BROWSER_TOOLS.KEY]: KeySchema, - [BROWSER_TOOLS.NAVIGATE]: NavigateSchema, - [BROWSER_TOOLS.CLOSE]: CloseSchema, - [BROWSER_TOOLS.STATUS]: StatusSchema, +// Map action to its schema for per-action validation (fixes #3: clean error messages) +const ACTION_SCHEMAS: Record = { + open: openSchema, + navigate: navigateSchema, + screenshot: screenshotSchema, + click: clickSchema, + type: typeSchema, + key: keySchema, + scroll: scrollSchema, + drag: dragSchema, + close: closeSchema, + status: statusSchema, }; +export type BrowserAction = z.infer; + +export function parseBrowserAction(args: unknown): BrowserAction { + const raw = args as Record; + const action = typeof raw.action === 'string' ? 
raw.action : undefined; + + // Per-action validation for clean error messages + if (action && ACTION_SCHEMAS[action]) { + return ACTION_SCHEMAS[action].parse(args) as BrowserAction; + } + + // Fallback: full discriminated union parse (handles missing/invalid action) + return BrowserActionSchema.parse(args) as BrowserAction; +} + export interface BrowserSession { sessionId: string; browser: unknown; @@ -95,7 +117,6 @@ export interface BrowserStatus { sessionIds: string[]; } -// Map modifier keys from user-friendly names to Playwright key names export function normalizeKey(key: string): string { const map: Record = { Ctrl: 'Control', diff --git a/src/tools/handlers.ts b/src/tools/handlers.ts index 4185e00..07d5ef6 100644 --- a/src/tools/handlers.ts +++ b/src/tools/handlers.ts @@ -580,11 +580,6 @@ export class WebSearchToolHandler { // Tool handler registry const sessionStorage = new InMemorySessionStorage(); -const browserHandler = (toolName: string) => ({ - execute: (args: unknown, context?: ToolHandlerContext) => - browserUseHandler.execute(toolName, args, context), -}); - export const toolHandlers = { [TOOLS.CODEX]: new CodexToolHandler(sessionStorage), [TOOLS.REVIEW]: new ReviewToolHandler(), @@ -592,16 +587,7 @@ export const toolHandlers = { [TOOLS.HELP]: new HelpToolHandler(), [TOOLS.LIST_SESSIONS]: new ListSessionsToolHandler(sessionStorage), [TOOLS.WEBSEARCH]: new WebSearchToolHandler(), - [TOOLS.BROWSER_LAUNCH]: browserHandler(TOOLS.BROWSER_LAUNCH), - [TOOLS.BROWSER_SCREENSHOT]: browserHandler(TOOLS.BROWSER_SCREENSHOT), - [TOOLS.BROWSER_CLICK]: browserHandler(TOOLS.BROWSER_CLICK), - [TOOLS.BROWSER_TYPE]: browserHandler(TOOLS.BROWSER_TYPE), - [TOOLS.BROWSER_SCROLL]: browserHandler(TOOLS.BROWSER_SCROLL), - [TOOLS.BROWSER_DRAG]: browserHandler(TOOLS.BROWSER_DRAG), - [TOOLS.BROWSER_KEY]: browserHandler(TOOLS.BROWSER_KEY), - [TOOLS.BROWSER_NAVIGATE]: browserHandler(TOOLS.BROWSER_NAVIGATE), - [TOOLS.BROWSER_CLOSE]: browserHandler(TOOLS.BROWSER_CLOSE), - 
[TOOLS.BROWSER_STATUS]: browserHandler(TOOLS.BROWSER_STATUS), + [TOOLS.BROWSER]: browserUseHandler, }; // Export shutdown function for browser cleanup diff --git a/src/types.ts b/src/types.ts index 323acaa..3d8f58c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -8,16 +8,7 @@ export const TOOLS = { HELP: 'help', LIST_SESSIONS: 'listSessions', WEBSEARCH: 'websearch', - BROWSER_LAUNCH: 'browser_launch', - BROWSER_SCREENSHOT: 'browser_screenshot', - BROWSER_CLICK: 'browser_click', - BROWSER_TYPE: 'browser_type', - BROWSER_SCROLL: 'browser_scroll', - BROWSER_DRAG: 'browser_drag', - BROWSER_KEY: 'browser_key', - BROWSER_NAVIGATE: 'browser_navigate', - BROWSER_CLOSE: 'browser_close', - BROWSER_STATUS: 'browser_status', + BROWSER: 'browser', } as const; export type ToolName = typeof TOOLS[keyof typeof TOOLS]; From 3057cde01e7c6dcf2d4adb7cc240cc33201e4174 Mon Sep 17 00:00:00 2001 From: Tommy Nguyen Date: Sun, 19 Apr 2026 13:01:38 -0700 Subject: [PATCH 6/6] fix(browser-use): address CodeRabbit PR review comments - client.ts: cache in-flight promise instead of resolved module; use ToolExecutionError instead of bare Error; prevent browser leak on session creation failure; validate URLs are http(s) only; log close errors instead of silently swallowing - types.ts: remove redundant .optional() before .default(); add http(s) URL validation; case-insensitive normalizeKey with lowercase lookup - bridge.ts: only cache positive availability (retry import on failure); add TOCTOU guard on sessionId with pending set - handlers.ts: forward progress context; emit progress notifications for open/screenshot/navigate actions - docs/browser-use.md: add language hint to architecture diagram --- docs/browser-use.md | 2 +- src/__tests__/index.test.ts | 145 ++++++++++++++++++-------- src/browser-use/bridge.ts | 69 ++++++++++--- src/browser-use/client.ts | 123 +++++++++++++++++------ src/browser-use/definitions.ts | 82 ++++++++++++--- src/browser-use/handlers.ts | 173 
++++++++++++++++++++++++++------ src/browser-use/playwright.d.ts | 6 +- src/browser-use/types.ts | 67 +++++++------ 8 files changed, 503 insertions(+), 164 deletions(-) diff --git a/docs/browser-use.md b/docs/browser-use.md index 789f36d..2449617 100644 --- a/docs/browser-use.md +++ b/docs/browser-use.md @@ -4,7 +4,7 @@ Playwright-based browser automation. Launch a real Chromium browser, take screen ## Architecture -``` +```text Claude Code → codex-mcp-server → BrowserUseBridge (singleton, lazy init) diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index 1a8eab6..9f91e62 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -17,34 +17,38 @@ jest.mock('../utils/command.js', () => ({ })); // Mock playwright to avoid requiring it at test time -jest.mock('playwright', () => ({ - chromium: { - launch: jest.fn().mockResolvedValue({ - newContext: jest.fn().mockResolvedValue({ - newPage: jest.fn().mockResolvedValue({ - goto: jest.fn(), - screenshot: jest.fn().mockResolvedValue(Buffer.from('fake-png')), - title: jest.fn().mockResolvedValue('Test Page'), - url: jest.fn().mockReturnValue('https://example.com'), - mouse: { - click: jest.fn(), - move: jest.fn(), - down: jest.fn(), - up: jest.fn(), - wheel: jest.fn(), - }, - keyboard: { - press: jest.fn(), - type: jest.fn(), - }, - waitForLoadState: jest.fn(), +jest.mock( + 'playwright', + () => ({ + chromium: { + launch: jest.fn().mockResolvedValue({ + newContext: jest.fn().mockResolvedValue({ + newPage: jest.fn().mockResolvedValue({ + goto: jest.fn(), + screenshot: jest.fn().mockResolvedValue(Buffer.from('fake-png')), + title: jest.fn().mockResolvedValue('Test Page'), + url: jest.fn().mockReturnValue('https://example.com'), + mouse: { + click: jest.fn(), + move: jest.fn(), + down: jest.fn(), + up: jest.fn(), + wheel: jest.fn(), + }, + keyboard: { + press: jest.fn(), + type: jest.fn(), + }, + waitForLoadState: jest.fn(), + }), + close: jest.fn(), }), close: jest.fn(), }), - close: 
jest.fn(), - }), - }, -}), { virtual: true }); + }, + }), + { virtual: true } +); import { TOOLS } from '../types.js'; import { toolDefinitions } from '../tools/definitions.js'; @@ -86,17 +90,34 @@ describe('Codex MCP Server', () => { }); test('browser tool should require action parameter', () => { - const browserTool = toolDefinitions.find((tool) => tool.name === TOOLS.BROWSER); + const browserTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.BROWSER + ); expect(browserTool).toBeDefined(); expect(browserTool?.inputSchema.required).toContain('action'); expect(browserTool?.inputSchema.required).not.toContain('sessionId'); }); test('browser tool should include all action types in enum', () => { - const browserTool = toolDefinitions.find((tool) => tool.name === TOOLS.BROWSER); - const actionEnum = browserTool?.inputSchema.properties.action as { enum: string[] }; + const browserTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.BROWSER + ); + const actionEnum = browserTool?.inputSchema.properties.action as { + enum: string[]; + }; expect(actionEnum.enum).toEqual( - expect.arrayContaining(['open', 'screenshot', 'navigate', 'click', 'type', 'key', 'scroll', 'drag', 'close', 'status']) + expect.arrayContaining([ + 'open', + 'screenshot', + 'navigate', + 'click', + 'type', + 'key', + 'scroll', + 'drag', + 'close', + 'status', + ]) ); }); @@ -212,35 +233,52 @@ describe('Codex MCP Server', () => { }); test('action=open should create a session', async () => { - const result = await handler().execute({ action: 'open', sessionId: 'test-1' }); + const result = await handler().execute({ + action: 'open', + sessionId: 'test-1', + }); expect(result.content[0].text).toContain('test-1'); expect(result.content[0].text).toContain('opened'); await handler().execute({ action: 'close', sessionId: 'test-1' }); }); test('action=open with url should navigate on launch', async () => { - const result = await handler().execute({ action: 'open', sessionId: 'test-url', url: 
'https://example.com' }); + const result = await handler().execute({ + action: 'open', + sessionId: 'test-url', + url: 'https://example.com', + }); expect(result.content[0].text).toContain('test-url'); await handler().execute({ action: 'close', sessionId: 'test-url' }); }); test('action=close should close a session', async () => { await handler().execute({ action: 'open', sessionId: 'test-close' }); - const result = await handler().execute({ action: 'close', sessionId: 'test-close' }); + const result = await handler().execute({ + action: 'close', + sessionId: 'test-close', + }); expect(result.content[0].text).toContain('test-close'); expect(result.content[0].text).toContain('closed'); }); test('action=navigate should go to URL', async () => { await handler().execute({ action: 'open', sessionId: 'test-nav' }); - const result = await handler().execute({ action: 'navigate', sessionId: 'test-nav', url: 'https://example.com' }); + const result = await handler().execute({ + action: 'navigate', + sessionId: 'test-nav', + url: 'https://example.com', + }); expect(result.content[0].text).toContain('Navigated'); await handler().execute({ action: 'close', sessionId: 'test-nav' }); }); test('action=screenshot should return image data', async () => { await handler().execute({ action: 'open', sessionId: 'test-ss' }); - const result = await handler().execute({ action: 'screenshot', sessionId: 'test-ss' }); + const result = await handler().execute({ + action: 'screenshot', + sessionId: 'test-ss', + }); expect(result.content).toHaveLength(2); expect(result.content[0].type).toBe('image'); expect(result.content[0].data).toBeDefined(); @@ -251,7 +289,12 @@ describe('Codex MCP Server', () => { test('action=click should return confirmation', async () => { await handler().execute({ action: 'open', sessionId: 'test-click' }); - const result = await handler().execute({ action: 'click', sessionId: 'test-click', x: 100, y: 200 }); + const result = await handler().execute({ + action: 'click', 
+ sessionId: 'test-click', + x: 100, + y: 200, + }); expect(result.content[0].text).toContain('100'); expect(result.content[0].text).toContain('200'); await handler().execute({ action: 'close', sessionId: 'test-click' }); @@ -259,7 +302,11 @@ describe('Codex MCP Server', () => { test('action=type should type text', async () => { await handler().execute({ action: 'open', sessionId: 'test-type' }); - const result = await handler().execute({ action: 'type', sessionId: 'test-type', text: 'hello' }); + const result = await handler().execute({ + action: 'type', + sessionId: 'test-type', + text: 'hello', + }); expect(result.content[0].text).toContain('Typed'); expect(result.content[0].text).toContain('hello'); await handler().execute({ action: 'close', sessionId: 'test-type' }); @@ -267,14 +314,23 @@ describe('Codex MCP Server', () => { test('action=key should normalize key names', async () => { await handler().execute({ action: 'open', sessionId: 'test-key' }); - const result = await handler().execute({ action: 'key', sessionId: 'test-key', key: 'Cmd+s' }); + const result = await handler().execute({ + action: 'key', + sessionId: 'test-key', + key: 'Cmd+s', + }); expect(result.content[0].text).toContain('Cmd+s'); await handler().execute({ action: 'close', sessionId: 'test-key' }); }); test('action=scroll should scroll page', async () => { await handler().execute({ action: 'open', sessionId: 'test-scroll' }); - const result = await handler().execute({ action: 'scroll', sessionId: 'test-scroll', direction: 'down', amount: 500 }); + const result = await handler().execute({ + action: 'scroll', + sessionId: 'test-scroll', + direction: 'down', + amount: 500, + }); expect(result.content[0].text).toContain('down'); expect(result.content[0].text).toContain('500'); await handler().execute({ action: 'close', sessionId: 'test-scroll' }); @@ -282,13 +338,22 @@ describe('Codex MCP Server', () => { test('action=drag should drag between coordinates', async () => { await 
handler().execute({ action: 'open', sessionId: 'test-drag' }); - const result = await handler().execute({ action: 'drag', sessionId: 'test-drag', fromX: 0, fromY: 0, toX: 100, toY: 100 }); + const result = await handler().execute({ + action: 'drag', + sessionId: 'test-drag', + fromX: 0, + fromY: 0, + toX: 100, + toY: 100, + }); expect(result.content[0].text).toContain('Dragged'); await handler().execute({ action: 'close', sessionId: 'test-drag' }); }); test('should reject invalid action', async () => { - await expect(handler().execute({ action: 'invalid' })).rejects.toThrow('Validation failed'); + await expect(handler().execute({ action: 'invalid' })).rejects.toThrow( + 'Validation failed' + ); }); }); diff --git a/src/browser-use/bridge.ts b/src/browser-use/bridge.ts index 4028c83..5f44cee 100644 --- a/src/browser-use/bridge.ts +++ b/src/browser-use/bridge.ts @@ -15,46 +15,71 @@ class BrowserUseBridge { private sessions: Map = new Map(); private available: boolean | null = null; private checkError: string | null = null; + private pending = new Set(); async checkAvailability(): Promise { - if (this.available !== null) return this.available; + if (this.available === true) return true; try { await import('playwright'); this.available = true; this.checkError = null; } catch { - this.available = false; - this.checkError = 'Playwright is not installed. Install with: npm install playwright && npx playwright install chromium'; + this.checkError = + 'Playwright is not installed. Install with: npm install playwright && npx playwright install chromium'; } - return this.available; + return this.available ?? 
false; } - async launch(sessionId: string, opts?: { url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number }): Promise { + async launch( + sessionId: string, + opts?: { + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + } + ): Promise { const canUse = await this.checkAvailability(); if (!canUse) { throw new Error(this.checkError ?? 'Playwright is not installed'); } - if (this.sessions.has(sessionId)) { - throw new Error(`Session "${sessionId}" already exists. Close it first or use a different session ID.`); + if (this.sessions.has(sessionId) || this.pending.has(sessionId)) { + throw new Error( + `Session "${sessionId}" already exists. Close it first or use a different session ID.` + ); + } + this.pending.add(sessionId); + try { + const session = await createBrowserSession(sessionId, opts); + this.sessions.set(sessionId, session); + return session; + } finally { + this.pending.delete(sessionId); } - const session = await createBrowserSession(sessionId, opts); - this.sessions.set(sessionId, session); - return session; } getSession(sessionId: string): BrowserSession { const session = this.sessions.get(sessionId); if (!session) { - throw new Error(`No active browser session with ID "${sessionId}". Use browser with action "open" to create one.`); + throw new Error( + `No active browser session with ID "${sessionId}". 
Use browser with action "open" to create one.` + ); } return session; } - async screenshot(sessionId: string): Promise<{ image: Buffer; url: string; title: string }> { + async screenshot( + sessionId: string + ): Promise<{ image: Buffer; url: string; title: string }> { return takeScreenshot(this.getSession(sessionId)); } - async click(sessionId: string, x: number, y: number, opts?: { button?: string; clickCount?: number }): Promise { + async click( + sessionId: string, + x: number, + y: number, + opts?: { button?: string; clickCount?: number } + ): Promise { return clickAt(this.getSession(sessionId), x, y, opts); } @@ -66,11 +91,21 @@ class BrowserUseBridge { return pressKey(this.getSession(sessionId), key); } - async scroll(sessionId: string, direction: string, amount: number): Promise { + async scroll( + sessionId: string, + direction: string, + amount: number + ): Promise { return scrollPage(this.getSession(sessionId), direction, amount); } - async drag(sessionId: string, fromX: number, fromY: number, toX: number, toY: number): Promise { + async drag( + sessionId: string, + fromX: number, + fromY: number, + toX: number, + toY: number + ): Promise { return dragFromTo(this.getSession(sessionId), fromX, fromY, toX, toY); } @@ -87,7 +122,9 @@ class BrowserUseBridge { } async shutdown(): Promise { - const closings = Array.from(this.sessions.values()).map((s) => closeSession(s).catch(() => {})); + const closings = Array.from(this.sessions.values()).map((s) => + closeSession(s).catch(() => {}) + ); await Promise.all(closings); this.sessions.clear(); } diff --git a/src/browser-use/client.ts b/src/browser-use/client.ts index 92ab4f1..d3b6b45 100644 --- a/src/browser-use/client.ts +++ b/src/browser-use/client.ts @@ -1,47 +1,77 @@ import type { Browser, BrowserContext, Page } from 'playwright'; import type { BrowserSession } from './types.js'; +import { ToolExecutionError } from '../errors.js'; -let playwrightModule: { chromium: { launch(opts?: Record): Promise } } | null 
= null; +let playwrightPromise: Promise | null = null; -async function getPlaywright() { - if (!playwrightModule) { - try { - playwrightModule = await import('playwright'); - } catch { - throw new Error( - 'Playwright is not installed. Install it with: npm install playwright && npx playwright install chromium' +function getPlaywright(): Promise { + if (!playwrightPromise) { + playwrightPromise = import('playwright').catch((err) => { + playwrightPromise = null; + throw new ToolExecutionError( + 'Playwright is not installed. Install it with: npm install playwright && npx playwright install chromium', + err ); - } + }); } - return playwrightModule; + return playwrightPromise; } export async function createBrowserSession( sessionId: string, - opts: { url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number } = {} + opts: { + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + } = {} ): Promise { + if (opts.url) { + const parsed = new URL(opts.url); + if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') { + throw new ToolExecutionError( + 'browser', + `Unsupported URL protocol "${parsed.protocol}". Only http(s) is allowed.` + ); + } + } + const pw = await getPlaywright(); const browser = await pw.chromium.launch({ headless: opts.headless ?? true, }); - const context = await (browser as unknown as { newContext(opts?: Record): Promise }).newContext({ - viewport: { - width: opts.viewportWidth ?? 1440, - height: opts.viewportHeight ?? 900, - }, - }); + try { + const context = await ( + browser as unknown as { + newContext(opts?: Record): Promise; + } + ).newContext({ + viewport: { + width: opts.viewportWidth ?? 1440, + height: opts.viewportHeight ?? 
900, + }, + }); - const page = await context.newPage(); + const page = await context.newPage(); - if (opts.url) { - await page.goto(opts.url, { waitUntil: 'domcontentloaded', timeout: 30000 }); - } + if (opts.url) { + await page.goto(opts.url, { + waitUntil: 'domcontentloaded', + timeout: 30000, + }); + } - return { sessionId, browser, page, createdAt: new Date() }; + return { sessionId, browser, page, createdAt: new Date() }; + } catch (err) { + await browser.close().catch(() => {}); + throw err; + } } -export async function takeScreenshot(session: BrowserSession): Promise<{ image: Buffer; url: string; title: string }> { +export async function takeScreenshot( + session: BrowserSession +): Promise<{ image: Buffer; url: string; title: string }> { const page = session.page as Page; const [image, title] = await Promise.all([ page.screenshot({ type: 'png', fullPage: false }), @@ -50,7 +80,12 @@ export async function takeScreenshot(session: BrowserSession): Promise<{ image: return { image, url: page.url(), title }; } -export async function clickAt(session: BrowserSession, x: number, y: number, opts?: { button?: string; clickCount?: number }): Promise { +export async function clickAt( + session: BrowserSession, + x: number, + y: number, + opts?: { button?: string; clickCount?: number } +): Promise { const page = session.page as Page; await page.mouse.click(x, y, { button: opts?.button ?? 
'left', @@ -58,17 +93,27 @@ export async function clickAt(session: BrowserSession, x: number, y: number, opt }); } -export async function typeText(session: BrowserSession, text: string): Promise { +export async function typeText( + session: BrowserSession, + text: string +): Promise { const page = session.page as Page; await page.keyboard.type(text, { delay: 20 }); } -export async function pressKey(session: BrowserSession, key: string): Promise { +export async function pressKey( + session: BrowserSession, + key: string +): Promise { const page = session.page as Page; await page.keyboard.press(key); } -export async function scrollPage(session: BrowserSession, direction: string, amount: number): Promise { +export async function scrollPage( + session: BrowserSession, + direction: string, + amount: number +): Promise { const page = session.page as Page; const delta = direction === 'up' || direction === 'left' ? -amount : amount; const x = direction === 'left' || direction === 'right' ? delta : 0; @@ -76,7 +121,13 @@ export async function scrollPage(session: BrowserSession, direction: string, amo await page.mouse.wheel(x, y); } -export async function dragFromTo(session: BrowserSession, fromX: number, fromY: number, toX: number, toY: number): Promise { +export async function dragFromTo( + session: BrowserSession, + fromX: number, + fromY: number, + toX: number, + toY: number +): Promise { const page = session.page as Page; await page.mouse.move(fromX, fromY); await page.mouse.down(); @@ -84,7 +135,10 @@ export async function dragFromTo(session: BrowserSession, fromX: number, fromY: await page.mouse.up(); } -export async function navigateTo(session: BrowserSession, url: string): Promise { +export async function navigateTo( + session: BrowserSession, + url: string +): Promise { const page = session.page as Page; await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 }); } @@ -93,7 +147,14 @@ export async function closeSession(session: BrowserSession): Promise 
{ const browser = session.browser as Browser; try { await browser.close(); - } catch { - // Browser may already be closed + } catch (err) { + // Browser may already be closed; log unexpected failures + if ( + err instanceof Error && + !err.message.includes('Target closed') && + !err.message.includes('Connection closed') + ) { + console.debug(`closeSession error:`, err); + } } } diff --git a/src/browser-use/definitions.ts b/src/browser-use/definitions.ts index f1812a5..e05b93d 100644 --- a/src/browser-use/definitions.ts +++ b/src/browser-use/definitions.ts @@ -20,22 +20,76 @@ const browserDef: ToolDefinition = { properties: { action: { type: 'string', - enum: ['open', 'screenshot', 'navigate', 'click', 'type', 'key', 'scroll', 'drag', 'close', 'status'], + enum: [ + 'open', + 'screenshot', + 'navigate', + 'click', + 'type', + 'key', + 'scroll', + 'drag', + 'close', + 'status', + ], description: 'The action to perform', }, - sessionId: { type: 'string', description: 'Browser session ID (required for all actions except "status")' }, - url: { type: 'string', description: 'URL to open or navigate to (for "open" and "navigate")' }, - headless: { type: 'boolean', description: 'Run without visible window (default: true, for "open")' }, - viewportWidth: { type: 'integer', description: 'Viewport width in pixels (default: 1440, for "open")' }, - viewportHeight: { type: 'integer', description: 'Viewport height in pixels (default: 900, for "open")' }, - x: { type: 'number', description: 'X coordinate in pixels, viewport-relative (for "click")' }, - y: { type: 'number', description: 'Y coordinate in pixels, viewport-relative (for "click")' }, - button: { type: 'string', enum: ['left', 'right', 'middle'], description: 'Mouse button (default: left, for "click")' }, - clickCount: { type: 'integer', description: 'Number of clicks (default: 1, for "click")' }, - text: { type: 'string', description: 'Text to type into focused element (for "type")' }, - key: { type: 'string', 
description: 'Key or combo, e.g. "Enter", "Control+a", "Meta+s" (for "key")' }, - direction: { type: 'string', enum: ['up', 'down', 'left', 'right'], description: 'Scroll direction (for "scroll")' }, - amount: { type: 'integer', description: 'Scroll amount in pixels (default: 300, for "scroll")' }, + sessionId: { + type: 'string', + description: + 'Browser session ID (required for all actions except "status")', + }, + url: { + type: 'string', + description: 'URL to open or navigate to (for "open" and "navigate")', + }, + headless: { + type: 'boolean', + description: 'Run without visible window (default: true, for "open")', + }, + viewportWidth: { + type: 'integer', + description: 'Viewport width in pixels (default: 1440, for "open")', + }, + viewportHeight: { + type: 'integer', + description: 'Viewport height in pixels (default: 900, for "open")', + }, + x: { + type: 'number', + description: 'X coordinate in pixels, viewport-relative (for "click")', + }, + y: { + type: 'number', + description: 'Y coordinate in pixels, viewport-relative (for "click")', + }, + button: { + type: 'string', + enum: ['left', 'right', 'middle'], + description: 'Mouse button (default: left, for "click")', + }, + clickCount: { + type: 'integer', + description: 'Number of clicks (default: 1, for "click")', + }, + text: { + type: 'string', + description: 'Text to type into focused element (for "type")', + }, + key: { + type: 'string', + description: + 'Key or combo, e.g. 
"Enter", "Control+a", "Meta+s" (for "key")', + }, + direction: { + type: 'string', + enum: ['up', 'down', 'left', 'right'], + description: 'Scroll direction (for "scroll")', + }, + amount: { + type: 'integer', + description: 'Scroll amount in pixels (default: 300, for "scroll")', + }, fromX: { type: 'number', description: 'Start X coordinate (for "drag")' }, fromY: { type: 'number', description: 'Start Y coordinate (for "drag")' }, toX: { type: 'number', description: 'End X coordinate (for "drag")' }, diff --git a/src/browser-use/handlers.ts b/src/browser-use/handlers.ts index 65561c0..28fb47c 100644 --- a/src/browser-use/handlers.ts +++ b/src/browser-use/handlers.ts @@ -2,40 +2,75 @@ import { TOOLS, type ToolResult, type ToolHandlerContext } from '../types.js'; import { ToolExecutionError, ValidationError } from '../errors.js'; import { ZodError } from 'zod'; import { bridge } from './bridge.js'; -import { parseBrowserAction, normalizeKey } from './types.js'; +import { + parseBrowserAction, + normalizeKey, + type BrowserAction, +} from './types.js'; const noopContext: ToolHandlerContext = { sendProgress: async () => {} }; export class BrowserUseToolHandler { async execute( args: unknown, - _context: ToolHandlerContext = noopContext + context: ToolHandlerContext = noopContext ): Promise { try { const parsed = parseBrowserAction(args); - - switch (parsed.action) { - case 'open': return this.handleOpen(parsed); - case 'screenshot': return this.handleScreenshot(parsed); - case 'navigate': return this.handleNavigate(parsed); - case 'click': return this.handleClick(parsed); - case 'type': return this.handleType(parsed); - case 'key': return this.handleKey(parsed); - case 'scroll': return this.handleScroll(parsed); - case 'drag': return this.handleDrag(parsed); - case 'close': return this.handleClose(parsed); - case 'status': return this.handleStatus(); - } + return this.dispatch(parsed, context); } catch (error) { if (error instanceof ValidationError) throw error; if 
(error instanceof ZodError) { throw new ValidationError(TOOLS.BROWSER, error.message); } - throw new ToolExecutionError(TOOLS.BROWSER, 'Browser operation failed', error); + throw new ToolExecutionError( + TOOLS.BROWSER, + 'Browser operation failed', + error + ); + } + } + + private async dispatch( + parsed: BrowserAction, + ctx: ToolHandlerContext + ): Promise { + switch (parsed.action) { + case 'open': + return this.handleOpen(parsed, ctx); + case 'screenshot': + return this.handleScreenshot(parsed, ctx); + case 'navigate': + return this.handleNavigate(parsed, ctx); + case 'click': + return this.handleClick(parsed); + case 'type': + return this.handleType(parsed); + case 'key': + return this.handleKey(parsed); + case 'scroll': + return this.handleScroll(parsed); + case 'drag': + return this.handleDrag(parsed); + case 'close': + return this.handleClose(parsed); + case 'status': + return this.handleStatus(); } } - private async handleOpen(args: { action: 'open'; sessionId: string; url?: string; headless?: boolean; viewportWidth?: number; viewportHeight?: number }): Promise { + private async handleOpen( + args: { + action: 'open'; + sessionId: string; + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + }, + context: ToolHandlerContext + ): Promise { + await context.sendProgress('Launching browser...', 0); const session = await bridge.launch(args.sessionId, { url: args.url, headless: args.headless, @@ -44,63 +79,135 @@ export class BrowserUseToolHandler { }); return { content: [ - { type: 'text', text: `Session "${session.sessionId}" opened at ${session.createdAt.toISOString()}` }, + { + type: 'text', + text: `Session "${session.sessionId}" opened at ${session.createdAt.toISOString()}`, + }, ], }; } - private async handleScreenshot(args: { action: 'screenshot'; sessionId: string }): Promise { + private async handleScreenshot( + args: { action: 'screenshot'; sessionId: string }, + context: ToolHandlerContext + ): Promise { + 
await context.sendProgress('Taking screenshot...', 0); const { image, url, title } = await bridge.screenshot(args.sessionId); return { content: [ - { type: 'image', text: '', data: image.toString('base64'), mimeType: 'image/png' }, + { + type: 'image', + text: '', + data: image.toString('base64'), + mimeType: 'image/png', + }, { type: 'text', text: `URL: ${url}\nTitle: ${title}` }, ], }; } - private async handleNavigate(args: { action: 'navigate'; sessionId: string; url: string }): Promise { + private async handleNavigate( + args: { action: 'navigate'; sessionId: string; url: string }, + context: ToolHandlerContext + ): Promise { + await context.sendProgress(`Navigating to ${args.url}...`, 0); await bridge.navigate(args.sessionId, args.url); return { content: [{ type: 'text', text: `Navigated to ${args.url}` }] }; } - private async handleClick(args: { action: 'click'; sessionId: string; x: number; y: number; button?: string; clickCount?: number }): Promise { + private async handleClick(args: { + action: 'click'; + sessionId: string; + x: number; + y: number; + button?: string; + clickCount?: number; + }): Promise { await bridge.click(args.sessionId, args.x, args.y, { button: args.button, clickCount: args.clickCount, }); - return { content: [{ type: 'text', text: `Clicked at (${args.x}, ${args.y})` }] }; + return { + content: [{ type: 'text', text: `Clicked at (${args.x}, ${args.y})` }], + }; } - private async handleType(args: { action: 'type'; sessionId: string; text: string }): Promise { + private async handleType(args: { + action: 'type'; + sessionId: string; + text: string; + }): Promise { await bridge.type(args.sessionId, args.text); return { content: [{ type: 'text', text: `Typed "${args.text}"` }] }; } - private async handleKey(args: { action: 'key'; sessionId: string; key: string }): Promise { + private async handleKey(args: { + action: 'key'; + sessionId: string; + key: string; + }): Promise { await bridge.key(args.sessionId, normalizeKey(args.key)); return 
{ content: [{ type: 'text', text: `Pressed key: ${args.key}` }] }; } - private async handleScroll(args: { action: 'scroll'; sessionId: string; direction: string; amount: number }): Promise { + private async handleScroll(args: { + action: 'scroll'; + sessionId: string; + direction: string; + amount: number; + }): Promise { await bridge.scroll(args.sessionId, args.direction, args.amount); - return { content: [{ type: 'text', text: `Scrolled ${args.direction} by ${args.amount}px` }] }; + return { + content: [ + { + type: 'text', + text: `Scrolled ${args.direction} by ${args.amount}px`, + }, + ], + }; } - private async handleDrag(args: { action: 'drag'; sessionId: string; fromX: number; fromY: number; toX: number; toY: number }): Promise { - await bridge.drag(args.sessionId, args.fromX, args.fromY, args.toX, args.toY); - return { content: [{ type: 'text', text: `Dragged from (${args.fromX}, ${args.fromY}) to (${args.toX}, ${args.toY})` }] }; + private async handleDrag(args: { + action: 'drag'; + sessionId: string; + fromX: number; + fromY: number; + toX: number; + toY: number; + }): Promise { + await bridge.drag( + args.sessionId, + args.fromX, + args.fromY, + args.toX, + args.toY + ); + return { + content: [ + { + type: 'text', + text: `Dragged from (${args.fromX}, ${args.fromY}) to (${args.toX}, ${args.toY})`, + }, + ], + }; } - private async handleClose(args: { action: 'close'; sessionId: string }): Promise { + private async handleClose(args: { + action: 'close'; + sessionId: string; + }): Promise { await bridge.close(args.sessionId); - return { content: [{ type: 'text', text: `Session "${args.sessionId}" closed` }] }; + return { + content: [{ type: 'text', text: `Session "${args.sessionId}" closed` }], + }; } private async handleStatus(): Promise { await bridge.checkAvailability(); const status = bridge.getStatus(); - return { content: [{ type: 'text', text: JSON.stringify(status, null, 2) }] }; + return { + content: [{ type: 'text', text: JSON.stringify(status, 
null, 2) }], + }; } } diff --git a/src/browser-use/playwright.d.ts b/src/browser-use/playwright.d.ts index 8d00abf..87b5b7a 100644 --- a/src/browser-use/playwright.d.ts +++ b/src/browser-use/playwright.d.ts @@ -13,7 +13,11 @@ declare module 'playwright' { goto(url: string, opts?: Record): Promise; screenshot(opts?: Record): Promise; mouse: { - click(x: number, y: number, opts?: Record): Promise; + click( + x: number, + y: number, + opts?: Record + ): Promise; move(x: number, y: number, opts?: Record): Promise; down(): Promise; up(): Promise; diff --git a/src/browser-use/types.ts b/src/browser-use/types.ts index af0e6db..9b84aa8 100644 --- a/src/browser-use/types.ts +++ b/src/browser-use/types.ts @@ -1,19 +1,23 @@ import { z } from 'zod'; +const httpUrl = z + .url() + .refine((u: string) => /^https?:\/\//i.test(u), 'Only http(s) URLs allowed'); + // Per-action schemas (separate for validation reuse and clean errors) const openSchema = z.object({ action: z.literal('open'), sessionId: z.string().min(1, 'Session ID is required'), - url: z.string().optional(), - headless: z.boolean().optional().default(true), - viewportWidth: z.int().optional().default(1440), - viewportHeight: z.int().optional().default(900), + url: httpUrl.optional(), + headless: z.boolean().default(true), + viewportWidth: z.int().default(1440), + viewportHeight: z.int().default(900), }); const navigateSchema = z.object({ action: z.literal('navigate'), sessionId: z.string().min(1, 'Session ID is required'), - url: z.string().min(1, 'URL is required'), + url: httpUrl.min(1, 'URL is required'), }); const screenshotSchema = z.object({ @@ -26,8 +30,8 @@ const clickSchema = z.object({ sessionId: z.string().min(1, 'Session ID is required'), x: z.number(), y: z.number(), - button: z.enum(['left', 'right', 'middle']).optional().default('left'), - clickCount: z.int().optional().default(1), + button: z.enum(['left', 'right', 'middle']).default('left'), + clickCount: z.int().default(1), }); const typeSchema = 
z.object({ @@ -46,7 +50,7 @@ const scrollSchema = z.object({ action: z.literal('scroll'), sessionId: z.string().min(1, 'Session ID is required'), direction: z.enum(['up', 'down', 'left', 'right']), - amount: z.int().optional().default(300), + amount: z.int().default(300), }); const dragSchema = z.object({ @@ -69,12 +73,19 @@ const statusSchema = z.object({ // Discriminated union for single-parse validation export const BrowserActionSchema = z.discriminatedUnion('action', [ - openSchema, navigateSchema, screenshotSchema, clickSchema, - typeSchema, keySchema, scrollSchema, dragSchema, - closeSchema, statusSchema, + openSchema, + navigateSchema, + screenshotSchema, + clickSchema, + typeSchema, + keySchema, + scrollSchema, + dragSchema, + closeSchema, + statusSchema, ]); -// Map action to its schema for per-action validation (fixes #3: clean error messages) +// Map action to its schema for per-action validation const ACTION_SCHEMAS: Record = { open: openSchema, navigate: navigateSchema, @@ -94,12 +105,10 @@ export function parseBrowserAction(args: unknown): BrowserAction { const raw = args as Record; const action = typeof raw.action === 'string' ? 
raw.action : undefined; - // Per-action validation for clean error messages if (action && ACTION_SCHEMAS[action]) { return ACTION_SCHEMAS[action].parse(args) as BrowserAction; } - // Fallback: full discriminated union parse (handles missing/invalid action) return BrowserActionSchema.parse(args) as BrowserAction; } @@ -119,21 +128,23 @@ export interface BrowserStatus { export function normalizeKey(key: string): string { const map: Record = { - Ctrl: 'Control', - Cmd: 'Meta', - Command: 'Meta', - Opt: 'Alt', - Option: 'Alt', - Del: 'Delete', - Backspace: 'Backspace', - Enter: 'Enter', - Tab: 'Tab', - Esc: 'Escape', - Escape: 'Escape', - Return: 'Enter', + ctrl: 'Control', + cmd: 'Meta', + command: 'Meta', + opt: 'Alt', + option: 'Alt', + del: 'Delete', + backspace: 'Delete', + esc: 'Escape', + escape: 'Escape', + return: 'Enter', + }; + const lookup = (k: string): string => { + const lower = k.trim().toLowerCase(); + return map[lower] ?? k.trim(); }; if (key.includes('+')) { - return key.split('+').map((part) => map[part.trim()] ?? part.trim()).join('+'); + return key.split('+').map(lookup).join('+'); } - return map[key] ?? key; + return lookup(key); }