diff --git a/.gitignore b/.gitignore index 9a941be..7339791 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,10 @@ temp/ # Jest cache .jest/ +.claude/ + +# Local artifacts +trash/ +.entire/ +test-mcp.mjs +tmux/ diff --git a/CLAUDE.md b/CLAUDE.md index 11ae7fa..188dd4b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -41,6 +41,11 @@ MCP Client (Claude Code) → ToolHandlers (handlers.ts) → executeCommand (command.ts) → Codex CLI + + → BrowserUseToolHandler (handlers.ts) + → BrowserUseBridge (bridge.ts) + → Playwright (peer dependency) + → Chromium instances ``` ### Key Files @@ -55,6 +60,11 @@ MCP Client (Claude Code) | `src/session/storage.ts` | In-memory session storage for conversation context | | `src/utils/command.ts` | Command spawning with streaming support and Windows compatibility | | `src/errors.ts` | Custom error classes (ValidationError, ToolExecutionError) | +| `src/browser-use/bridge.ts` | Singleton browser session manager (lazy Playwright init) | +| `src/browser-use/client.ts` | Playwright session operations (launch, screenshot, click, etc.) | +| `src/browser-use/handlers.ts` | Browser tool handler — dispatches on `action` parameter | +| `src/browser-use/definitions.ts` | Browser tool MCP schema and annotations | +| `src/browser-use/types.ts` | Zod schemas for browser actions, key normalization | ### MCP Tools @@ -64,6 +74,7 @@ MCP Client (Claude Code) - **listSessions**: View active conversation sessions - **ping**: Test server connection - **help**: Get Codex CLI help +- **browser**: Playwright-based browser automation via a single `action` parameter (`open`, `screenshot`, `navigate`, `click`, `type`, `key`, `scroll`, `drag`, `close`, `status`). Playwright is a peer dependency — lazy-loaded on first use. 
### Session Management diff --git a/README.md b/README.md index d297ef1..6dff126 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,22 @@ [![npm downloads](https://img.shields.io/npm/dm/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) [![license](https://img.shields.io/npm/l/codex-mcp-server.svg)](https://www.npmjs.com/package/codex-mcp-server) -Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. +Bridge between Claude and OpenAI's Codex CLI — get AI-powered code analysis, generation, and review right in your editor. Also includes optional browser automation via Playwright. ```mermaid graph LR A[Claude Code] --> B[Codex MCP Server] B --> C[Codex CLI] C --> D[OpenAI API] + B --> E[Browser Use] + E --> F[Chromium] style A fill:#FF6B35 style B fill:#4A90E2 style C fill:#00D4AA style D fill:#FFA500 + style E fill:#9B59B6 + style F fill:#E74C3C ``` ## Quick Start @@ -54,6 +58,7 @@ Use review to check my uncommitted changes | `codex` | AI coding assistant with session support, model selection, and structured output metadata | | `review` | AI-powered code review for uncommitted changes, branches, or commits | | `websearch` | Web search using Codex CLI with customizable result count and search depth | +| `browser` | Browser automation via Playwright — open, screenshot, click, type, scroll, drag, navigate ([setup](docs/browser-use.md)) | | `listSessions` | View active conversation sessions | | `ping` | Test server connection | | `help` | Get Codex CLI help | @@ -90,8 +95,14 @@ Use codex to return structuredContent with threadId metadata when available **Web search:** ``` Use websearch with query "TypeScript 5.8 new features" -Use websearch with query "Rust vs Go performance 2025" and numResults 15 -Use websearch with query "React Server Components" and searchDepth "full" +``` + +**Browser Automation:** +``` +Use browser with action "status" to check if Playwright is available +Use 
browser with action "open", sessionId "web1", and url "https://example.com" +Use browser with action "screenshot" and sessionId "web1" to see the page +Use browser with action "click", sessionId "web1", x 100, and y 200 ``` ## Requirements @@ -105,6 +116,7 @@ Use websearch with query "React Server Components" and searchDepth "full" ## Documentation - **[API Reference](docs/api-reference.md)** — Full tool parameters and response formats +- **[Browser Use](docs/browser-use.md)** — Playwright setup, tools, and troubleshooting - **[Session Management](docs/session-management.md)** — How conversations work - **[Codex CLI Integration](docs/codex-cli-integration.md)** — Version compatibility and CLI details diff --git a/docs/api-reference.md b/docs/api-reference.md index 666a298..6f0ecc8 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -48,9 +48,11 @@ All tools include annotations that provide hints to MCP clients about tool behav |------|---------|---------------|-------------------|------------------|-----------------| | `codex` | Execute Codex CLI | `false` | `true` | `false` | `true` | | `review` | Code Review | `true` | `false` | `true` | `true` | +| `websearch` | Web Search | `true` | `false` | `true` | `true` | | `ping` | Ping Server | `true` | `false` | `true` | `false` | | `help` | Get Help | `true` | `false` | `true` | `false` | | `listSessions` | List Sessions | `true` | `false` | `true` | `false` | +| `browser` | Browser Control | `false` | `true` | `false` | `true` | ### Progress Notifications For long-running operations, the server sends `notifications/progress` messages when the client includes a `progressToken` in the request `_meta`. 
@@ -439,4 +441,37 @@ Optional: ### Optional Configuration - **CODEX_HOME**: Custom directory for Codex CLI configuration - **Session Limits**: Configurable in server implementation (default: 100) -- **TTL Settings**: Configurable session expiration (default: 24 hours) \ No newline at end of file +- **TTL Settings**: Configurable session expiration (default: 24 hours) + +## Browser Use + +Cross-platform browser automation via Playwright. See [Browser Use](browser-use.md) for setup instructions. + +### `browser` — Browser Control + +**Annotations:** `readOnlyHint: false`, `destructiveHint: true`, `idempotentHint: false`, `openWorldHint: true` + +A single tool for all browser operations, selected via the `action` parameter. + +| Parameter | Type | Required | Action | Default | Description | +|-----------|------|----------|--------|---------|-------------| +| `action` | enum | yes | all | — | `open`, `screenshot`, `navigate`, `click`, `type`, `key`, `scroll`, `drag`, `close`, `status` | +| `sessionId` | string | yes | all except `status` | — | Browser session ID | +| `url` | string | `navigate` only | `open`, `navigate` | — | URL to open or navigate to | +| `headless` | boolean | no | `open` | `true` | Run without visible window | +| `viewportWidth` | integer | no | `open` | `1440` | Viewport width in pixels | +| `viewportHeight` | integer | no | `open` | `900` | Viewport height in pixels | +| `x` | number | yes | `click` | — | X coordinate (viewport-relative) | +| `y` | number | yes | `click` | — | Y coordinate (viewport-relative) | +| `button` | enum | no | `click` | `left` | `left`, `right`, or `middle` | +| `clickCount` | integer | no | `click` | `1` | Number of clicks | +| `text` | string | yes | `type` | — | Text to type into focused element | +| `key` | string | yes | `key` | — | Key or combo (`Enter`, `Control+a`, `Meta+s`) | +| `direction` | enum | yes | `scroll` | — | `up`, `down`, `left`, or `right` | +| `amount` | integer | no | `scroll` | `300` | Scroll amount in
pixels | +| `fromX` | number | yes | `drag` | — | Start X coordinate | +| `fromY` | number | yes | `drag` | — | Start Y coordinate | +| `toX` | number | yes | `drag` | — | End X coordinate | +| `toY` | number | yes | `drag` | — | End Y coordinate | + +Modifier keys are auto-normalized: `Cmd`/`Command` → `Meta`, `Ctrl` → `Control`, `Opt`/`Option` → `Alt`. \ No newline at end of file diff --git a/docs/browser-use.md b/docs/browser-use.md new file mode 100644 index 0000000..2449617 --- /dev/null +++ b/docs/browser-use.md @@ -0,0 +1,67 @@ +# Browser Use + +Playwright-based browser automation. Launch a real Chromium browser, take screenshots, click, type, scroll, drag, and navigate — all via a single MCP tool with an `action` parameter. + +## Architecture + +```text +Claude Code + → codex-mcp-server + → BrowserUseBridge (singleton, lazy init) + → Playwright (peer dependency) + → Chromium instances +``` + +The bridge manages multiple concurrent browser sessions. Playwright is only imported on first use — no impact on codex/review startup when browser tools aren't used. + +## Setup + +```bash +npm install playwright +npx playwright install chromium +``` + +Playwright is a **peer dependency** — install it separately. The server works fine without it; the browser tool will return a helpful error if Playwright isn't available. + +## Tool: `browser` + +A single tool for all browser operations. Every call requires an `action` parameter. + +### Session Lifecycle + +| Action | Example | Description | +|--------|---------|-------------| +| `open` | `{ "action": "open", "sessionId": "s1", "url": "https://example.com" }` | Launch a browser session.
Optional: `url`, `headless` (default true), `viewportWidth` (default 1440), `viewportHeight` (default 900) | +| `status` | `{ "action": "status" }` | Check if Playwright is available and list active sessions | +| `close` | `{ "action": "close", "sessionId": "s1" }` | Close a session and clean up resources | + +### Navigation & Capture + +| Action | Example | Description | +|--------|---------|-------------| +| `navigate` | `{ "action": "navigate", "sessionId": "s1", "url": "https://example.com/page2" }` | Navigate to a URL | +| `screenshot` | `{ "action": "screenshot", "sessionId": "s1" }` | Take a screenshot. Returns base64 PNG image data + page URL + title | + +### Input Actions + +| Action | Example | Description | +|--------|---------|-------------| +| `click` | `{ "action": "click", "sessionId": "s1", "x": 100, "y": 200 }` | Click at viewport coordinates. Optional: `button` (left/right/middle), `clickCount` | +| `type` | `{ "action": "type", "sessionId": "s1", "text": "hello world" }` | Type text into the focused element | +| `key` | `{ "action": "key", "sessionId": "s1", "key": "Control+a" }` | Press a key or combo. Modifier keys auto-normalized (`Cmd`→`Meta`, `Ctrl`→`Control`) | +| `scroll` | `{ "action": "scroll", "sessionId": "s1", "direction": "down", "amount": 300 }` | Scroll page: `up`/`down`/`left`/`right` by pixel amount | +| `drag` | `{ "action": "drag", "sessionId": "s1", "fromX": 0, "fromY": 0, "toX": 100, "toY": 100 }` | Drag from one coordinate to another (viewport-relative) | + +## Troubleshooting + +**"Playwright is not installed"** +Install it: `npm install playwright && npx playwright install chromium` + +**"Session already exists"** +Use a different sessionId or close the existing session first. + +**"No active browser session"** +You must call `{ "action": "open" }` before any other action. + +**Screenshots not loading** +Check that the client supports `image` content type in MCP tool results. Claude Code supports this natively. 
diff --git a/eslint.config.mjs b/eslint.config.mjs index dac91df..fc0a075 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -16,6 +16,12 @@ export default [ globals: { console: 'readonly', process: 'readonly', + Buffer: 'readonly', + setTimeout: 'readonly', + clearTimeout: 'readonly', + setInterval: 'readonly', + clearInterval: 'readonly', + setImmediate: 'readonly', }, }, plugins: { diff --git a/jest.config.mjs b/jest.config.mjs index f8677ca..2793df4 100644 --- a/jest.config.mjs +++ b/jest.config.mjs @@ -28,4 +28,5 @@ export default { '**/__tests__/**/*.test.ts', '**/?(*.)+(spec|test).ts', ], + testPathIgnorePatterns: ['/node_modules/', '/trash/'], }; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index c7ab951..7415e85 100644 --- a/package-lock.json +++ b/package-lock.json @@ -27,6 +27,9 @@ "ts-jest": "^29.4.1", "tsx": "^4.20.4", "typescript": "^5.9.2" + }, + "peerDependencies": { + "playwright": "^1.59.1" } }, "node_modules/@ampproject/remapping": { @@ -6725,6 +6728,53 @@ "node": ">=8" } }, + "node_modules/playwright": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.59.1.tgz", + "integrity": "sha512-C8oWjPR3F81yljW9o5OxcWzfh6avkVwDD2VYdwIGqTkl+OGFISgypqzfu7dOe4QNLL2aqcWBmI3PMtLIK233lw==", + "license": "Apache-2.0", + "peer": true, + "dependencies": { + "playwright-core": "1.59.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.59.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.59.1.tgz", + "integrity": "sha512-HBV/RJg81z5BiiZ9yPzIiClYV/QMsDCKUyogwH9p3MCP6IYjUFu/MActgYAvK0oWyV9NlwM3GLBjADyWgydVyg==", + "license": "Apache-2.0", + "peer": true, + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/playwright/node_modules/fsevents": { + "version": "2.3.2", 
+ "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "peer": true, + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", diff --git a/package.json b/package.json index 7b342e3..1dccb1c 100644 --- a/package.json +++ b/package.json @@ -60,5 +60,8 @@ "ts-jest": "^29.4.1", "tsx": "^4.20.4", "typescript": "^5.9.2" + }, + "peerDependencies": { + "playwright": "^1.59.1" } } diff --git a/src/__tests__/index.test.ts b/src/__tests__/index.test.ts index 1762942..9f91e62 100644 --- a/src/__tests__/index.test.ts +++ b/src/__tests__/index.test.ts @@ -16,6 +16,40 @@ jest.mock('../utils/command.js', () => ({ }), })); +// Mock playwright to avoid requiring it at test time +jest.mock( + 'playwright', + () => ({ + chromium: { + launch: jest.fn().mockResolvedValue({ + newContext: jest.fn().mockResolvedValue({ + newPage: jest.fn().mockResolvedValue({ + goto: jest.fn(), + screenshot: jest.fn().mockResolvedValue(Buffer.from('fake-png')), + title: jest.fn().mockResolvedValue('Test Page'), + url: jest.fn().mockReturnValue('https://example.com'), + mouse: { + click: jest.fn(), + move: jest.fn(), + down: jest.fn(), + up: jest.fn(), + wheel: jest.fn(), + }, + keyboard: { + press: jest.fn(), + type: jest.fn(), + }, + waitForLoadState: jest.fn(), + }), + close: jest.fn(), + }), + close: jest.fn(), + }), + }, + }), + { virtual: true } +); + import { TOOLS } from '../types.js'; import { toolDefinitions } from '../tools/definitions.js'; import { @@ -33,11 +67,13 @@ import { } from '../tools/handlers.js'; import { InMemorySessionStorage } from '../session/storage.js'; import { CodexMcpServer } from '../server.js'; +import { bridge } 
from '../browser-use/bridge.js'; describe('Codex MCP Server', () => { describe('Tool Definitions', () => { test('should have all required tools defined', () => { - expect(toolDefinitions).toHaveLength(6); + // 6 core tools + 1 browser tool = 7 + expect(toolDefinitions).toHaveLength(7); const toolNames = toolDefinitions.map((tool) => tool.name); expect(toolNames).toContain(TOOLS.CODEX); @@ -48,6 +84,43 @@ describe('Codex MCP Server', () => { expect(toolNames).toContain(TOOLS.LIST_SESSIONS); }); + test('should have browser tool defined', () => { + const toolNames = toolDefinitions.map((tool) => tool.name); + expect(toolNames).toContain(TOOLS.BROWSER); + }); + + test('browser tool should require action parameter', () => { + const browserTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.BROWSER + ); + expect(browserTool).toBeDefined(); + expect(browserTool?.inputSchema.required).toContain('action'); + expect(browserTool?.inputSchema.required).not.toContain('sessionId'); + }); + + test('browser tool should include all action types in enum', () => { + const browserTool = toolDefinitions.find( + (tool) => tool.name === TOOLS.BROWSER + ); + const actionEnum = browserTool?.inputSchema.properties.action as { + enum: string[]; + }; + expect(actionEnum.enum).toEqual( + expect.arrayContaining([ + 'open', + 'screenshot', + 'navigate', + 'click', + 'type', + 'key', + 'scroll', + 'drag', + 'close', + 'status', + ]) + ); + }); + test('codex tool should define output schema', () => { const codexTool = toolDefinitions.find( (tool) => tool.name === TOOLS.CODEX @@ -81,7 +154,7 @@ describe('Codex MCP Server', () => { }); describe('Tool Handlers', () => { - test('should have handlers for all tools', () => { + test('should have handlers for all core tools', () => { expect(toolHandlers[TOOLS.CODEX]).toBeInstanceOf(CodexToolHandler); expect(toolHandlers[TOOLS.REVIEW]).toBeInstanceOf(ReviewToolHandler); expect(toolHandlers[TOOLS.WEBSEARCH]).toBeInstanceOf( @@ -94,6 +167,10 @@ 
describe('Codex MCP Server', () => { ); }); + test('should have handler for browser tool', () => { + expect(toolHandlers[TOOLS.BROWSER]).toBeDefined(); + }); + test('ping handler should return message', async () => { const handler = new PingToolHandler(); const result = await handler.execute({ message: 'test' }); @@ -139,6 +216,147 @@ describe('Codex MCP Server', () => { }); }); + describe('Browser Tool Handler', () => { + afterEach(async () => { + await bridge.shutdown(); + }); + + const handler = () => toolHandlers[TOOLS.BROWSER]; + + test('action=status should return status JSON', async () => { + const result = await handler().execute({ action: 'status' }); + expect(result.content[0].type).toBe('text'); + const status = JSON.parse(result.content[0].text); + expect(status).toHaveProperty('available'); + expect(status).toHaveProperty('activeSessions'); + expect(status).toHaveProperty('sessionIds'); + }); + + test('action=open should create a session', async () => { + const result = await handler().execute({ + action: 'open', + sessionId: 'test-1', + }); + expect(result.content[0].text).toContain('test-1'); + expect(result.content[0].text).toContain('opened'); + await handler().execute({ action: 'close', sessionId: 'test-1' }); + }); + + test('action=open with url should navigate on launch', async () => { + const result = await handler().execute({ + action: 'open', + sessionId: 'test-url', + url: 'https://example.com', + }); + expect(result.content[0].text).toContain('test-url'); + await handler().execute({ action: 'close', sessionId: 'test-url' }); + }); + + test('action=close should close a session', async () => { + await handler().execute({ action: 'open', sessionId: 'test-close' }); + const result = await handler().execute({ + action: 'close', + sessionId: 'test-close', + }); + expect(result.content[0].text).toContain('test-close'); + expect(result.content[0].text).toContain('closed'); + }); + + test('action=navigate should go to URL', async () => { + await 
handler().execute({ action: 'open', sessionId: 'test-nav' }); + const result = await handler().execute({ + action: 'navigate', + sessionId: 'test-nav', + url: 'https://example.com', + }); + expect(result.content[0].text).toContain('Navigated'); + await handler().execute({ action: 'close', sessionId: 'test-nav' }); + }); + + test('action=screenshot should return image data', async () => { + await handler().execute({ action: 'open', sessionId: 'test-ss' }); + const result = await handler().execute({ + action: 'screenshot', + sessionId: 'test-ss', + }); + expect(result.content).toHaveLength(2); + expect(result.content[0].type).toBe('image'); + expect(result.content[0].data).toBeDefined(); + expect(result.content[0].mimeType).toBe('image/png'); + expect(result.content[1].type).toBe('text'); + await handler().execute({ action: 'close', sessionId: 'test-ss' }); + }); + + test('action=click should return confirmation', async () => { + await handler().execute({ action: 'open', sessionId: 'test-click' }); + const result = await handler().execute({ + action: 'click', + sessionId: 'test-click', + x: 100, + y: 200, + }); + expect(result.content[0].text).toContain('100'); + expect(result.content[0].text).toContain('200'); + await handler().execute({ action: 'close', sessionId: 'test-click' }); + }); + + test('action=type should type text', async () => { + await handler().execute({ action: 'open', sessionId: 'test-type' }); + const result = await handler().execute({ + action: 'type', + sessionId: 'test-type', + text: 'hello', + }); + expect(result.content[0].text).toContain('Typed'); + expect(result.content[0].text).toContain('hello'); + await handler().execute({ action: 'close', sessionId: 'test-type' }); + }); + + test('action=key should normalize key names', async () => { + await handler().execute({ action: 'open', sessionId: 'test-key' }); + const result = await handler().execute({ + action: 'key', + sessionId: 'test-key', + key: 'Cmd+s', + }); + 
expect(result.content[0].text).toContain('Cmd+s'); + await handler().execute({ action: 'close', sessionId: 'test-key' }); + }); + + test('action=scroll should scroll page', async () => { + await handler().execute({ action: 'open', sessionId: 'test-scroll' }); + const result = await handler().execute({ + action: 'scroll', + sessionId: 'test-scroll', + direction: 'down', + amount: 500, + }); + expect(result.content[0].text).toContain('down'); + expect(result.content[0].text).toContain('500'); + await handler().execute({ action: 'close', sessionId: 'test-scroll' }); + }); + + test('action=drag should drag between coordinates', async () => { + await handler().execute({ action: 'open', sessionId: 'test-drag' }); + const result = await handler().execute({ + action: 'drag', + sessionId: 'test-drag', + fromX: 0, + fromY: 0, + toX: 100, + toY: 100, + }); + expect(result.content[0].text).toContain('Dragged'); + await handler().execute({ action: 'close', sessionId: 'test-drag' }); + }); + + test('should reject invalid action', async () => { + await expect(handler().execute({ action: 'invalid' })).rejects.toThrow( + 'Validation failed' + ); + }); + }); + describe('Server Initialization', () => { test('should initialize server with config', () => { const config = { name: 'test-server', version: '1.0.0' }; diff --git a/src/browser-use/bridge.ts b/src/browser-use/bridge.ts new file mode 100644 index 0000000..5f44cee --- /dev/null +++ b/src/browser-use/bridge.ts @@ -0,0 +1,143 @@ +import type { BrowserSession, BrowserStatus } from './types.js'; +import { + createBrowserSession, + takeScreenshot, + clickAt, + typeText, + pressKey, + scrollPage, + dragFromTo, + navigateTo, + closeSession, +} from './client.js'; + +class BrowserUseBridge { + private sessions: Map = new Map(); + private available: boolean | null = null; + private checkError: string | null = null; + private pending = new Set(); + + async checkAvailability(): Promise { + if (this.available === true) return true; + try { 
+ await import('playwright'); + this.available = true; + this.checkError = null; + } catch { + this.checkError = + 'Playwright is not installed. Install with: npm install playwright && npx playwright install chromium'; + } + return this.available ?? false; + } + + async launch( + sessionId: string, + opts?: { + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + } + ): Promise { + const canUse = await this.checkAvailability(); + if (!canUse) { + throw new Error(this.checkError ?? 'Playwright is not installed'); + } + if (this.sessions.has(sessionId) || this.pending.has(sessionId)) { + throw new Error( + `Session "${sessionId}" already exists. Close it first or use a different session ID.` + ); + } + this.pending.add(sessionId); + try { + const session = await createBrowserSession(sessionId, opts); + this.sessions.set(sessionId, session); + return session; + } finally { + this.pending.delete(sessionId); + } + } + + getSession(sessionId: string): BrowserSession { + const session = this.sessions.get(sessionId); + if (!session) { + throw new Error( + `No active browser session with ID "${sessionId}". 
Use browser with action "open" to create one.` + ); + } + return session; + } + + async screenshot( + sessionId: string + ): Promise<{ image: Buffer; url: string; title: string }> { + return takeScreenshot(this.getSession(sessionId)); + } + + async click( + sessionId: string, + x: number, + y: number, + opts?: { button?: string; clickCount?: number } + ): Promise { + return clickAt(this.getSession(sessionId), x, y, opts); + } + + async type(sessionId: string, text: string): Promise { + return typeText(this.getSession(sessionId), text); + } + + async key(sessionId: string, key: string): Promise { + return pressKey(this.getSession(sessionId), key); + } + + async scroll( + sessionId: string, + direction: string, + amount: number + ): Promise { + return scrollPage(this.getSession(sessionId), direction, amount); + } + + async drag( + sessionId: string, + fromX: number, + fromY: number, + toX: number, + toY: number + ): Promise { + return dragFromTo(this.getSession(sessionId), fromX, fromY, toX, toY); + } + + async navigate(sessionId: string, url: string): Promise { + return navigateTo(this.getSession(sessionId), url); + } + + async close(sessionId: string): Promise { + const session = this.sessions.get(sessionId); + if (session) { + await closeSession(session); + this.sessions.delete(sessionId); + } + } + + async shutdown(): Promise { + const closings = Array.from(this.sessions.values()).map((s) => + closeSession(s).catch(() => {}) + ); + await Promise.all(closings); + this.sessions.clear(); + } + + getStatus(): BrowserStatus { + return { + available: this.available ?? 
false, + error: this.checkError, + activeSessions: this.sessions.size, + sessionIds: Array.from(this.sessions.keys()), + }; + } +} + +// Singleton +export const bridge = new BrowserUseBridge(); diff --git a/src/browser-use/client.ts b/src/browser-use/client.ts new file mode 100644 index 0000000..d3b6b45 --- /dev/null +++ b/src/browser-use/client.ts @@ -0,0 +1,162 @@ +import type { Browser, BrowserContext, Page } from 'playwright'; +import type { BrowserSession } from './types.js'; +import { ToolExecutionError } from '../errors.js'; + +let playwrightPromise: Promise | null = null; + +/** Imports Playwright on first call and caches the promise. A failed import clears the cache so a later call can retry, and rejects with a ToolExecutionError whose first argument is the tool name ('browser'), matching the other call sites in this module. */ +function getPlaywright(): Promise { + if (!playwrightPromise) { + playwrightPromise = import('playwright').catch((err) => { + playwrightPromise = null; + throw new ToolExecutionError( + 'browser', + 'Playwright is not installed. Install it with: npm install playwright && npx playwright install chromium', + err + ); + }); + } + return playwrightPromise; +} + +export async function createBrowserSession( + sessionId: string, + opts: { + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + } = {} +): Promise { + if (opts.url) { + const parsed = new URL(opts.url); + if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') { + throw new ToolExecutionError( + 'browser', + `Unsupported URL protocol "${parsed.protocol}". Only http(s) is allowed.` + ); + } + } + + const pw = await getPlaywright(); + const browser = await pw.chromium.launch({ + headless: opts.headless ?? true, + }); + + try { + const context = await ( + browser as unknown as { + newContext(opts?: Record): Promise; + } + ).newContext({ + viewport: { + width: opts.viewportWidth ?? 1440, + height: opts.viewportHeight ??
900, + }, + }); + + const page = await context.newPage(); + + if (opts.url) { + await page.goto(opts.url, { + waitUntil: 'domcontentloaded', + timeout: 30000, + }); + } + + return { sessionId, browser, page, createdAt: new Date() }; + } catch (err) { + await browser.close().catch(() => {}); + throw err; + } +} + +export async function takeScreenshot( + session: BrowserSession +): Promise<{ image: Buffer; url: string; title: string }> { + const page = session.page as Page; + const [image, title] = await Promise.all([ + page.screenshot({ type: 'png', fullPage: false }), + page.title(), + ]); + return { image, url: page.url(), title }; +} + +export async function clickAt( + session: BrowserSession, + x: number, + y: number, + opts?: { button?: string; clickCount?: number } +): Promise { + const page = session.page as Page; + await page.mouse.click(x, y, { + button: opts?.button ?? 'left', + clickCount: opts?.clickCount ?? 1, + }); +} + +export async function typeText( + session: BrowserSession, + text: string +): Promise { + const page = session.page as Page; + await page.keyboard.type(text, { delay: 20 }); +} + +export async function pressKey( + session: BrowserSession, + key: string +): Promise { + const page = session.page as Page; + await page.keyboard.press(key); +} + +export async function scrollPage( + session: BrowserSession, + direction: string, + amount: number +): Promise { + const page = session.page as Page; + const delta = direction === 'up' || direction === 'left' ? -amount : amount; + const x = direction === 'left' || direction === 'right' ? delta : 0; + const y = direction === 'up' || direction === 'down' ? 
delta : 0; + await page.mouse.wheel(x, y); +} + +export async function dragFromTo( + session: BrowserSession, + fromX: number, + fromY: number, + toX: number, + toY: number +): Promise { + const page = session.page as Page; + await page.mouse.move(fromX, fromY); + await page.mouse.down(); + await page.mouse.move(toX, toY, { steps: 10 }); + await page.mouse.up(); +} + +export async function navigateTo( + session: BrowserSession, + url: string +): Promise { + const page = session.page as Page; + await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 }); +} + +export async function closeSession(session: BrowserSession): Promise { + const browser = session.browser as Browser; + try { + await browser.close(); + } catch (err) { + // Browser may already be closed; log unexpected failures + if ( + err instanceof Error && + !err.message.includes('Target closed') && + !err.message.includes('Connection closed') + ) { + console.debug(`closeSession error:`, err); + } + } +} diff --git a/src/browser-use/definitions.ts b/src/browser-use/definitions.ts new file mode 100644 index 0000000..e05b93d --- /dev/null +++ b/src/browser-use/definitions.ts @@ -0,0 +1,109 @@ +import { TOOLS, type ToolDefinition } from '../types.js'; + +const browserDef: ToolDefinition = { + name: TOOLS.BROWSER, + description: + 'Control a Chromium browser via Playwright. 
Actions:\n' + + '• "open" — launch a browser session (required: sessionId; optional: url, headless, viewportWidth, viewportHeight)\n' + + '• "screenshot" — capture the page as a PNG image, returns base64 data + URL + title (required: sessionId)\n' + + '• "navigate" — go to a URL in an existing session (required: sessionId, url)\n' + + '• "click" — click at viewport coordinates (required: sessionId, x, y; optional: button, clickCount)\n' + + '• "type" — type text into the focused element (required: sessionId, text)\n' + + '• "key" — press a key or combo like "Enter", "Control+a", "Meta+s" (required: sessionId, key; modifiers auto-normalized)\n' + + '• "scroll" — scroll the page (required: sessionId, direction; optional: amount in pixels)\n' + + '• "drag" — drag from one coordinate to another (required: sessionId, fromX, fromY, toX, toY)\n' + + '• "close" — close a session and free resources (required: sessionId)\n' + + '• "status" — check if Playwright is available and list active sessions (no params)\n' + + 'You must call "open" before any other action. 
Use "screenshot" to see the page after navigation or interaction.', + inputSchema: { + type: 'object', + properties: { + action: { + type: 'string', + enum: [ + 'open', + 'screenshot', + 'navigate', + 'click', + 'type', + 'key', + 'scroll', + 'drag', + 'close', + 'status', + ], + description: 'The action to perform', + }, + sessionId: { + type: 'string', + description: + 'Browser session ID (required for all actions except "status")', + }, + url: { + type: 'string', + description: 'URL to open or navigate to (for "open" and "navigate")', + }, + headless: { + type: 'boolean', + description: 'Run without visible window (default: true, for "open")', + }, + viewportWidth: { + type: 'integer', + description: 'Viewport width in pixels (default: 1440, for "open")', + }, + viewportHeight: { + type: 'integer', + description: 'Viewport height in pixels (default: 900, for "open")', + }, + x: { + type: 'number', + description: 'X coordinate in pixels, viewport-relative (for "click")', + }, + y: { + type: 'number', + description: 'Y coordinate in pixels, viewport-relative (for "click")', + }, + button: { + type: 'string', + enum: ['left', 'right', 'middle'], + description: 'Mouse button (default: left, for "click")', + }, + clickCount: { + type: 'integer', + description: 'Number of clicks (default: 1, for "click")', + }, + text: { + type: 'string', + description: 'Text to type into focused element (for "type")', + }, + key: { + type: 'string', + description: + 'Key or combo, e.g.
"Enter", "Control+a", "Meta+s" (for "key")', + }, + direction: { + type: 'string', + enum: ['up', 'down', 'left', 'right'], + description: 'Scroll direction (for "scroll")', + }, + amount: { + type: 'integer', + description: 'Scroll amount in pixels (default: 300, for "scroll")', + }, + fromX: { type: 'number', description: 'Start X coordinate (for "drag")' }, + fromY: { type: 'number', description: 'Start Y coordinate (for "drag")' }, + toX: { type: 'number', description: 'End X coordinate (for "drag")' }, + toY: { type: 'number', description: 'End Y coordinate (for "drag")' }, + }, + required: ['action'], + }, + annotations: { + title: 'Browser Control', + readOnlyHint: false, + destructiveHint: true, + idempotentHint: false, + openWorldHint: true, + }, +}; + +export const browserUseToolDefinitions: ToolDefinition[] = [browserDef]; diff --git a/src/browser-use/handlers.ts b/src/browser-use/handlers.ts new file mode 100644 index 0000000..28fb47c --- /dev/null +++ b/src/browser-use/handlers.ts @@ -0,0 +1,222 @@ +import { TOOLS, type ToolResult, type ToolHandlerContext } from '../types.js'; +import { ToolExecutionError, ValidationError } from '../errors.js'; +import { ZodError } from 'zod'; +import { bridge } from './bridge.js'; +import { + parseBrowserAction, + normalizeKey, + type BrowserAction, +} from './types.js'; + +const noopContext: ToolHandlerContext = { sendProgress: async () => {} }; + +export class BrowserUseToolHandler { + /** Parses `args` with parseBrowserAction and runs the matching action. Rethrows ValidationError and ToolExecutionError unchanged, converts ZodError to ValidationError, and wraps any other failure in a ToolExecutionError that keeps the original message and cause. */ + async execute( + args: unknown, + context: ToolHandlerContext = noopContext + ): Promise { + try { + const parsed = parseBrowserAction(args); + /* Await inside the try so rejections from dispatch reach the catch below instead of escaping unwrapped. */ + return await this.dispatch(parsed, context); + } catch (error) { + /* Already-typed errors pass through so their specific messages survive. */ + if ( + error instanceof ValidationError || + error instanceof ToolExecutionError + ) { + throw error; + } + if (error instanceof ZodError) { + throw new ValidationError(TOOLS.BROWSER, error.message); + } + throw new ToolExecutionError( + TOOLS.BROWSER, + error instanceof Error ? error.message : 'Browser operation failed', + error + ); + } + } + + private async dispatch( + parsed: BrowserAction, + ctx: ToolHandlerContext + ):
Promise { + switch (parsed.action) { + case 'open': + return this.handleOpen(parsed, ctx); + case 'screenshot': + return this.handleScreenshot(parsed, ctx); + case 'navigate': + return this.handleNavigate(parsed, ctx); + case 'click': + return this.handleClick(parsed); + case 'type': + return this.handleType(parsed); + case 'key': + return this.handleKey(parsed); + case 'scroll': + return this.handleScroll(parsed); + case 'drag': + return this.handleDrag(parsed); + case 'close': + return this.handleClose(parsed); + case 'status': + return this.handleStatus(); + } + } + + private async handleOpen( + args: { + action: 'open'; + sessionId: string; + url?: string; + headless?: boolean; + viewportWidth?: number; + viewportHeight?: number; + }, + context: ToolHandlerContext + ): Promise { + await context.sendProgress('Launching browser...', 0); + const session = await bridge.launch(args.sessionId, { + url: args.url, + headless: args.headless, + viewportWidth: args.viewportWidth, + viewportHeight: args.viewportHeight, + }); + return { + content: [ + { + type: 'text', + text: `Session "${session.sessionId}" opened at ${session.createdAt.toISOString()}`, + }, + ], + }; + } + + private async handleScreenshot( + args: { action: 'screenshot'; sessionId: string }, + context: ToolHandlerContext + ): Promise { + await context.sendProgress('Taking screenshot...', 0); + const { image, url, title } = await bridge.screenshot(args.sessionId); + return { + content: [ + { + type: 'image', + text: '', + data: image.toString('base64'), + mimeType: 'image/png', + }, + { type: 'text', text: `URL: ${url}\nTitle: ${title}` }, + ], + }; + } + + private async handleNavigate( + args: { action: 'navigate'; sessionId: string; url: string }, + context: ToolHandlerContext + ): Promise { + await context.sendProgress(`Navigating to ${args.url}...`, 0); + await bridge.navigate(args.sessionId, args.url); + return { content: [{ type: 'text', text: `Navigated to ${args.url}` }] }; + } + + private async 
handleClick(args: { + action: 'click'; + sessionId: string; + x: number; + y: number; + button?: string; + clickCount?: number; + }): Promise { + await bridge.click(args.sessionId, args.x, args.y, { + button: args.button, + clickCount: args.clickCount, + }); + return { + content: [{ type: 'text', text: `Clicked at (${args.x}, ${args.y})` }], + }; + } + + private async handleType(args: { + action: 'type'; + sessionId: string; + text: string; + }): Promise { + await bridge.type(args.sessionId, args.text); + return { content: [{ type: 'text', text: `Typed "${args.text}"` }] }; + } + + private async handleKey(args: { + action: 'key'; + sessionId: string; + key: string; + }): Promise { + await bridge.key(args.sessionId, normalizeKey(args.key)); + return { content: [{ type: 'text', text: `Pressed key: ${args.key}` }] }; + } + + private async handleScroll(args: { + action: 'scroll'; + sessionId: string; + direction: string; + amount: number; + }): Promise { + await bridge.scroll(args.sessionId, args.direction, args.amount); + return { + content: [ + { + type: 'text', + text: `Scrolled ${args.direction} by ${args.amount}px`, + }, + ], + }; + } + + private async handleDrag(args: { + action: 'drag'; + sessionId: string; + fromX: number; + fromY: number; + toX: number; + toY: number; + }): Promise { + await bridge.drag( + args.sessionId, + args.fromX, + args.fromY, + args.toX, + args.toY + ); + return { + content: [ + { + type: 'text', + text: `Dragged from (${args.fromX}, ${args.fromY}) to (${args.toX}, ${args.toY})`, + }, + ], + }; + } + + private async handleClose(args: { + action: 'close'; + sessionId: string; + }): Promise { + await bridge.close(args.sessionId); + return { + content: [{ type: 'text', text: `Session "${args.sessionId}" closed` }], + }; + } + + private async handleStatus(): Promise { + await bridge.checkAvailability(); + const status = bridge.getStatus(); + return { + content: [{ type: 'text', text: JSON.stringify(status, null, 2) }], + }; + } +} + +export 
const browserUseHandler = new BrowserUseToolHandler(); diff --git a/src/browser-use/playwright.d.ts b/src/browser-use/playwright.d.ts new file mode 100644 index 0000000..87b5b7a --- /dev/null +++ b/src/browser-use/playwright.d.ts @@ -0,0 +1,38 @@ +declare module 'playwright' { + export interface Browser { + newContext(opts?: Record): Promise; + close(): Promise; + } + + export interface BrowserContext { + newPage(): Promise; + close(): Promise; + } + + export interface Page { + goto(url: string, opts?: Record): Promise; + screenshot(opts?: Record): Promise; + mouse: { + click( + x: number, + y: number, + opts?: Record + ): Promise; + move(x: number, y: number, opts?: Record): Promise; + down(): Promise; + up(): Promise; + wheel(deltaX: number, deltaY: number): Promise; + }; + keyboard: { + press(key: string): Promise; + type(text: string, opts?: Record): Promise; + }; + title(): Promise; + url(): string; + waitForLoadState(state?: string): Promise; + } + + export const chromium: { + launch(opts?: Record): Promise; + }; +} diff --git a/src/browser-use/types.ts b/src/browser-use/types.ts new file mode 100644 index 0000000..9b84aa8 --- /dev/null +++ b/src/browser-use/types.ts @@ -0,0 +1,150 @@ +import { z } from 'zod'; + +const httpUrl = z + .url() + .refine((u: string) => /^https?:\/\//i.test(u), 'Only http(s) URLs allowed'); + +// Per-action schemas (separate for validation reuse and clean errors) +const openSchema = z.object({ + action: z.literal('open'), + sessionId: z.string().min(1, 'Session ID is required'), + url: httpUrl.optional(), + headless: z.boolean().default(true), + viewportWidth: z.int().default(1440), + viewportHeight: z.int().default(900), +}); + +const navigateSchema = z.object({ + action: z.literal('navigate'), + sessionId: z.string().min(1, 'Session ID is required'), + url: httpUrl.min(1, 'URL is required'), +}); + +const screenshotSchema = z.object({ + action: z.literal('screenshot'), + sessionId: z.string().min(1, 'Session ID is required'), 
+}); + +const clickSchema = z.object({ + action: z.literal('click'), + sessionId: z.string().min(1, 'Session ID is required'), + x: z.number(), + y: z.number(), + button: z.enum(['left', 'right', 'middle']).default('left'), + clickCount: z.int().default(1), +}); + +const typeSchema = z.object({ + action: z.literal('type'), + sessionId: z.string().min(1, 'Session ID is required'), + text: z.string().min(1, 'Text to type is required'), +}); + +const keySchema = z.object({ + action: z.literal('key'), + sessionId: z.string().min(1, 'Session ID is required'), + key: z.string().min(1, 'Key is required'), +}); + +const scrollSchema = z.object({ + action: z.literal('scroll'), + sessionId: z.string().min(1, 'Session ID is required'), + direction: z.enum(['up', 'down', 'left', 'right']), + amount: z.int().default(300), +}); + +const dragSchema = z.object({ + action: z.literal('drag'), + sessionId: z.string().min(1, 'Session ID is required'), + fromX: z.number(), + fromY: z.number(), + toX: z.number(), + toY: z.number(), +}); + +const closeSchema = z.object({ + action: z.literal('close'), + sessionId: z.string().min(1, 'Session ID is required'), +}); + +const statusSchema = z.object({ + action: z.literal('status'), +}); + +// Discriminated union for single-parse validation +export const BrowserActionSchema = z.discriminatedUnion('action', [ + openSchema, + navigateSchema, + screenshotSchema, + clickSchema, + typeSchema, + keySchema, + scrollSchema, + dragSchema, + closeSchema, + statusSchema, +]); + +// Map action to its schema for per-action validation +const ACTION_SCHEMAS: Record = { + open: openSchema, + navigate: navigateSchema, + screenshot: screenshotSchema, + click: clickSchema, + type: typeSchema, + key: keySchema, + scroll: scrollSchema, + drag: dragSchema, + close: closeSchema, + status: statusSchema, +}; + +export type BrowserAction = z.infer; + +export function parseBrowserAction(args: unknown): BrowserAction { + const raw = args as Record; + const action = 
typeof raw.action === 'string' ? raw.action : undefined; + + if (action && ACTION_SCHEMAS[action]) { + return ACTION_SCHEMAS[action].parse(args) as BrowserAction; + } + + return BrowserActionSchema.parse(args) as BrowserAction; +} + +export interface BrowserSession { + sessionId: string; + browser: unknown; + page: unknown; + createdAt: Date; +} + +export interface BrowserStatus { + available: boolean; + error: string | null; + activeSessions: number; + sessionIds: string[]; +} + +export function normalizeKey(key: string): string { + const map: Record = { + ctrl: 'Control', + cmd: 'Meta', + command: 'Meta', + opt: 'Alt', + option: 'Alt', + del: 'Delete', + backspace: 'Delete', + esc: 'Escape', + escape: 'Escape', + return: 'Enter', + }; + const lookup = (k: string): string => { + const lower = k.trim().toLowerCase(); + return map[lower] ?? k.trim(); + }; + if (key.includes('+')) { + return key.split('+').map(lookup).join('+'); + } + return lookup(key); +} diff --git a/src/index.ts b/src/index.ts index 595de7e..7ebca8a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,6 +2,7 @@ import chalk from 'chalk'; import { CodexMcpServer } from './server.js'; +import { shutdownBrowserSessions } from './tools/handlers.js'; const SERVER_CONFIG = { name: 'codex-mcp-server', @@ -12,6 +13,18 @@ async function main(): Promise { try { const server = new CodexMcpServer(SERVER_CONFIG); await server.start(); + + // Clean up browser sessions on shutdown + const shutdown = async (signal: string) => { + try { + await shutdownBrowserSessions(); + } catch { + // Best-effort cleanup + } + process.exit(signal === 'SIGINT' ? 
130 : 0); + }; + process.on('SIGINT', () => shutdown('SIGINT')); + process.on('SIGTERM', () => shutdown('SIGTERM')); } catch (error) { console.error(chalk.red('Failed to start server:'), error); process.exit(1); diff --git a/src/tools/definitions.ts b/src/tools/definitions.ts index dd52a15..50dd4c5 100644 --- a/src/tools/definitions.ts +++ b/src/tools/definitions.ts @@ -1,4 +1,5 @@ import { TOOLS, getModelDescription, type ToolDefinition } from '../types.js'; +import { browserUseToolDefinitions } from '../browser-use/definitions.js'; export const toolDefinitions: ToolDefinition[] = [ { @@ -205,4 +206,5 @@ export const toolDefinitions: ToolDefinition[] = [ openWorldHint: true, }, }, + ...browserUseToolDefinitions, ]; diff --git a/src/tools/handlers.ts b/src/tools/handlers.ts index 9d7eabd..07d5ef6 100644 --- a/src/tools/handlers.ts +++ b/src/tools/handlers.ts @@ -24,6 +24,8 @@ import { ToolExecutionError, ValidationError } from '../errors.js'; import { executeCommand, executeCommandStreaming } from '../utils/command.js'; import { ZodError } from 'zod'; import path from 'node:path'; +import { browserUseHandler } from '../browser-use/handlers.js'; +import { bridge } from '../browser-use/bridge.js'; // Default no-op context for handlers that don't need progress const defaultContext: ToolHandlerContext = { @@ -585,4 +587,10 @@ export const toolHandlers = { [TOOLS.HELP]: new HelpToolHandler(), [TOOLS.LIST_SESSIONS]: new ListSessionsToolHandler(sessionStorage), [TOOLS.WEBSEARCH]: new WebSearchToolHandler(), -} as const; + [TOOLS.BROWSER]: browserUseHandler, +}; + +// Export shutdown function for browser cleanup +export async function shutdownBrowserSessions(): Promise { + await bridge.shutdown(); +} diff --git a/src/types.ts b/src/types.ts index a940f81..3d8f58c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -8,6 +8,7 @@ export const TOOLS = { HELP: 'help', LIST_SESSIONS: 'listSessions', WEBSEARCH: 'websearch', + BROWSER: 'browser', } as const; export type ToolName = 
typeof TOOLS[keyof typeof TOOLS]; @@ -67,8 +68,10 @@ export interface ToolDefinition { // Tool result interface matching MCP SDK expectations export interface ToolResult { content: Array<{ - type: 'text'; + type: 'text' | 'image'; text: string; + data?: string; + mimeType?: string; _meta?: Record; }>; structuredContent?: Record;