/**
 * Type guard: true when `candidate` is a string containing at least one
 * non-whitespace character.
 */
function isNonBlankString(candidate: unknown): candidate is string {
  return typeof candidate === 'string' && candidate.trim() !== '';
}

/**
 * Check whether an action needs AI reasoning for the provided args.
 *
 * An action needs AI when at least one of its locate fields carries a prompt
 * that still has to be resolved. A prompt is recognised in three shapes:
 *   - the field value itself is a string:      `{ locate: 'the login button' }`
 *   - an object with a string `prompt`:        `{ locate: { prompt: '...' } }`
 *   - an object whose `prompt` is itself an
 *     object with a string `prompt`:           `{ locate: { prompt: { prompt: '...' } } }`
 *
 * In every shape the prompt must be a string with non-whitespace content;
 * missing, empty, whitespace-only and non-string prompts are all treated as
 * "no prompt", so the three shapes are handled by one consistent rule.
 *
 * @param locateFields - names of the locate fields declared by the action's param schema
 * @param args - raw arguments supplied by the tool caller
 * @returns true if any locate field contains a prompt, false otherwise
 *          (including when the action declares no locate fields)
 */
function actionNeedsAI(
  locateFields: string[],
  args: Record<string, unknown>,
): boolean {
  // `some` on an empty list is false, which covers "no locate fields, no AI needed".
  return locateFields.some((field) => {
    const value = args[field];

    // Shape 1: the value itself is the locate prompt.
    if (isNonBlankString(value)) {
      return true;
    }

    // Anything that is not a non-null object cannot carry a prompt.
    if (typeof value !== 'object' || value === null) {
      return false;
    }

    // Shape 2: `{ prompt: '...' }`.
    const prompt = (value as { prompt?: unknown }).prompt;
    if (isNonBlankString(prompt)) {
      return true;
    }

    // Shape 3: `{ prompt: { prompt: '...' } }`.
    return (
      typeof prompt === 'object' &&
      prompt !== null &&
      isNonBlankString((prompt as { prompt?: unknown }).prompt)
    );
  });
}
getErrorMessage(error); + console.error( + `Error executing action "${action.name}":`, + errorMessage, + ); + return createErrorResult( + `Failed to execute action "${action.name}": ${errorMessage}`, + ); + } + } else if (agent.aiAction) { + // Fallback to AI action if callActionInActionSpace is not available + const instruction = buildActionInstruction(action.name, args); + try { + await agent.aiAction(instruction); + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + console.error( + `Error executing action "${action.name}":`, + errorMessage, + ); + return createErrorResult( + `Failed to execute action "${action.name}": ${errorMessage}`, + ); + } + } else { + return createErrorResult( + `Cannot execute action "${action.name}": No execution method available`, + ); + } + + // Wait for network idle after action to ensure page stability + // This is especially important for actions that may trigger navigation (e.g., clicking links) + if (agent.waitForNetworkIdle) { + try { + await agent.waitForNetworkIdle(); + } catch (error: unknown) { + // Network idle timeout is not critical, continue to take screenshot + console.warn( + `[midscene:warning] waitForNetworkIdle timed out after action "${action.name}", continuing execution`, + ); + } } return await captureScreenshotResult(agent, action.name); @@ -294,8 +410,8 @@ export function generateToolsFromActionSpace( } /** - * Generate common tools (screenshot, waitFor) - * SIMPLIFIED: Only keep essential helper tools, removed assert + * Generate common tools (screenshot, navigation, page info) + * These are direct browser commands that don't need AI reasoning */ export function generateCommonTools( getAgent: () => Promise, @@ -326,5 +442,141 @@ export function generateCommonTools( }, autoDestroy: true, }, + { + name: 'navigate', + description: + 'Navigate the browser to a specified URL. 
Opens the URL in the current tab.', + schema: { + url: z.string().describe('The URL to navigate to'), + }, + handler: async (args): Promise => { + try { + const { url } = args as { url: string }; + const agent = await getAgent(); + if (!agent.page?.navigate) { + return createErrorResult( + 'Navigate operation is not supported on this page type', + ); + } + await agent.page.navigate(url); + return await captureScreenshotResult(agent, 'navigate'); + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult(`Failed to navigate: ${errorMessage}`); + } + }, + autoDestroy: true, + }, + { + name: 'reload', + description: 'Reload the current page', + schema: {}, + handler: async (): Promise => { + try { + const agent = await getAgent(); + if (!agent.page?.reload) { + return createErrorResult( + 'Reload operation is not supported on this page type', + ); + } + await agent.page.reload(); + return await captureScreenshotResult(agent, 'reload'); + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult(`Failed to reload: ${errorMessage}`); + } + }, + autoDestroy: true, + }, + { + name: 'go_back', + description: 'Navigate back in browser history', + schema: {}, + handler: async (): Promise => { + try { + const agent = await getAgent(); + if (!agent.page?.goBack) { + return createErrorResult( + 'GoBack operation is not supported on this page type', + ); + } + await agent.page.goBack(); + return await captureScreenshotResult(agent, 'go_back'); + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult(`Failed to go back: ${errorMessage}`); + } + }, + autoDestroy: true, + }, + { + name: 'go_forward', + description: 'Navigate forward in browser history', + schema: {}, + handler: async (): Promise => { + try { + const agent = await getAgent(); + if (!agent.page?.goForward) { + return createErrorResult( + 'GoForward operation is not supported on this 
page type', + ); + } + await agent.page.goForward(); + return await captureScreenshotResult(agent, 'go_forward'); + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult(`Failed to go forward: ${errorMessage}`); + } + }, + autoDestroy: true, + }, + { + name: 'get_url', + description: 'Get the current URL of the page', + schema: {}, + handler: async (): Promise => { + try { + const agent = await getAgent(); + if (!agent.page?.getCurrentUrl) { + return createErrorResult( + 'GetCurrentUrl operation is not supported on this page type', + ); + } + const url = agent.page.getCurrentUrl(); + return { + content: [{ type: 'text', text: `Current URL: ${url}` }], + }; + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult( + `Failed to get current URL: ${errorMessage}`, + ); + } + }, + autoDestroy: true, + }, + { + name: 'get_title', + description: 'Get the title of the current page', + schema: {}, + handler: async (): Promise => { + try { + const agent = await getAgent(); + if (!agent.page?.getPageTitle) { + return createErrorResult( + 'GetPageTitle operation is not supported on this page type', + ); + } + const title = await agent.page.getPageTitle(); + return { + content: [{ type: 'text', text: `Page title: ${title}` }], + }; + } catch (error: unknown) { + const errorMessage = getErrorMessage(error); + return createErrorResult(`Failed to get page title: ${errorMessage}`); + } + }, + autoDestroy: true, + }, ]; } diff --git a/packages/shared/src/mcp/types.ts b/packages/shared/src/mcp/types.ts index 005921c6e9..0774cd0e55 100644 --- a/packages/shared/src/mcp/types.ts +++ b/packages/shared/src/mcp/types.ts @@ -79,6 +79,13 @@ export interface BaseAgent { destroy?(): Promise; page?: { screenshotBase64(): Promise; + // Navigation methods - available on web pages + navigate?(url: string): Promise; + reload?(): Promise; + goBack?(): Promise; + goForward?(): Promise; + getCurrentUrl?(): 
string; + getPageTitle?(): Promise; }; aiAction?: ( description: string, @@ -88,6 +95,20 @@ export interface BaseAgent { assertion: string, options: Record, ) => Promise; + /** + * Call an action directly without AI reasoning. + * This is useful for actions that don't need AI to locate elements, + * e.g., KeyboardPress with no locate parameter. + */ + callActionInActionSpace?: ( + actionName: string, + params: Record, + ) => Promise; + /** + * Wait for network idle after actions (e.g., after clicking a link that triggers navigation) + * This helps ensure consistent screenshots after navigation-triggering actions + */ + waitForNetworkIdle?: (timeout?: number) => Promise; } /** diff --git a/packages/web-bridge-mcp/src/web-tools.ts b/packages/web-bridge-mcp/src/web-tools.ts index c666890c21..78a448913e 100644 --- a/packages/web-bridge-mcp/src/web-tools.ts +++ b/packages/web-bridge-mcp/src/web-tools.ts @@ -89,6 +89,25 @@ export class WebMidsceneTools extends BaseMidsceneTools { }, autoDestroy: false, }, + { + name: 'web_close', + description: + 'Close the browser connection and end the automation session.', + schema: {}, + handler: async () => { + await this.closeBrowser(); + + return { + content: [ + { + type: 'text', + text: 'Browser connection closed successfully.', + }, + ], + }; + }, + autoDestroy: false, + }, ]; } } diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts index 1e000fc4ca..941037e275 100644 --- a/packages/web-integration/src/chrome-extension/page.ts +++ b/packages/web-integration/src/chrome-extension/page.ts @@ -475,6 +475,29 @@ export default class ChromeExtensionProxyPage implements AbstractInterface { await this.waitUntilNetworkIdle(); } + async goForward(): Promise { + const tabId = await this.getTabIdOrConnectToCurrentTab(); + await chrome.tabs.goForward(tabId); + // Wait for navigation to complete + await this.waitUntilNetworkIdle(); + } + + getCurrentUrl(): string { + // 
Note: This is a sync method but we may not have the URL cached. + // For chrome extension, use getPageInfo for async URL retrieval. + // This returns empty string as chrome.tabs.get is async. + // Consider using evaluate to get window.location.href if needed. + throw new Error( + 'getCurrentUrl is not supported in chrome extension mode. Use evaluate to get window.location.href instead.', + ); + } + + async getPageTitle(): Promise { + const tabId = await this.getTabIdOrConnectToCurrentTab(); + const tab = await chrome.tabs.get(tabId); + return tab.title || ''; + } + async scrollUntilTop(startingPoint?: Point) { if (startingPoint) { await this.mouse.move(startingPoint.left, startingPoint.top); diff --git a/packages/web-integration/src/playwright/index.ts b/packages/web-integration/src/playwright/index.ts index 144c62a2a6..305f5ca53e 100644 --- a/packages/web-integration/src/playwright/index.ts +++ b/packages/web-integration/src/playwright/index.ts @@ -55,7 +55,7 @@ export class PlaywrightAgent extends PageAgent { } } - async waitForNetworkIdle(timeout = 1000) { + async waitForNetworkIdle(timeout?: number) { await this.page.underlyingPage.waitForLoadState('networkidle', { timeout }); } } diff --git a/packages/web-integration/src/puppeteer/base-page.ts b/packages/web-integration/src/puppeteer/base-page.ts index 187eb5ced2..682b63b3f0 100644 --- a/packages/web-integration/src/puppeteer/base-page.ts +++ b/packages/web-integration/src/puppeteer/base-page.ts @@ -592,6 +592,25 @@ export class Page< } } + async goForward(): Promise { + debugPage('go forward'); + if (this.interfaceType === 'puppeteer') { + await (this.underlyingPage as PuppeteerPage).goForward(); + } else if (this.interfaceType === 'playwright') { + await (this.underlyingPage as PlaywrightPage).goForward(); + } else { + throw new Error('Unsupported page type for go forward'); + } + } + + getCurrentUrl(): string { + return this.underlyingPage.url(); + } + + async getPageTitle(): Promise { + return await 
this.underlyingPage.title(); + } + async beforeInvokeAction(name: string, param: any): Promise { if (this.onBeforeInvokeAction) { await this.onBeforeInvokeAction(name, param); diff --git a/packages/web-integration/src/web-page.ts b/packages/web-integration/src/web-page.ts index b9f33f7546..9654ec7d1c 100644 --- a/packages/web-integration/src/web-page.ts +++ b/packages/web-integration/src/web-page.ts @@ -376,6 +376,9 @@ export abstract class AbstractWebPage extends AbstractInterface { navigate?(url: string): Promise; reload?(): Promise; goBack?(): Promise; + goForward?(): Promise; + getCurrentUrl?(): string; + getPageTitle?(): Promise; get mouse(): MouseAction { return { @@ -625,43 +628,4 @@ export const commonWebActionsForWebPage = ( assert(element, 'Element not found, cannot clear input'); await page.clearInput(element as unknown as ElementInfo); }), - - defineAction({ - name: 'Navigate', - description: - 'Navigate the browser to a specified URL. Opens the URL in the current tab.', - paramSchema: z.object({ - url: z.string().describe('The URL to navigate to'), - }), - call: async (param) => { - if (!page.navigate) { - throw new Error( - 'Navigate operation is not supported on this page type', - ); - } - await page.navigate(param.url); - }, - }), - - defineAction({ - name: 'Reload', - description: 'Reload the current page', - call: async () => { - if (!page.reload) { - throw new Error('Reload operation is not supported on this page type'); - } - await page.reload(); - }, - }), - - defineAction({ - name: 'GoBack', - description: 'Navigate back in browser history', - call: async () => { - if (!page.goBack) { - throw new Error('GoBack operation is not supported on this page type'); - } - await page.goBack(); - }, - }), ]; diff --git a/packages/web-integration/tests/unit-test/browser.test.ts b/packages/web-integration/tests/unit-test/browser.test.ts new file mode 100644 index 0000000000..e79526f18b --- /dev/null +++ 
import { chromium } from 'playwright';
import type { Browser, BrowserServer } from 'playwright';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';

// A self-contained page served from a data: URL, so the suite needs neither
// network access nor a third-party site whose title or redirects may change.
const TEST_PAGE_TITLE = 'MidsceneTestPage';
const TEST_PAGE_URL = `data:text/html,${encodeURIComponent(
  `<title>${TEST_PAGE_TITLE}</title><h1>hello</h1>`,
)}`;

/**
 * Verifies that a Playwright browser server can be launched and that clients
 * can connect to it through its WebSocket endpoint.
 */
describe('Playwright Browser Server', {
  timeout: 30000,
}, () => {
  let browserServer: BrowserServer;
  let wsEndpoint: string;

  beforeAll(async () => {
    // Headless so the suite also runs on machines without a display (CI).
    browserServer = await chromium.launchServer({
      headless: true,
    });
    wsEndpoint = browserServer.wsEndpoint();
  });

  afterAll(async () => {
    if (browserServer) {
      await browserServer.close();
    }
  });

  it('should launch server and retrieve WebSocket endpoint', () => {
    expect(wsEndpoint).toBeDefined();
    expect(wsEndpoint).toMatch(/^wss?:\/\//);
  });

  it('should connect to browser server and open a page', async () => {
    let browser: Browser | null = null;

    try {
      // Connect to the browser server using WebSocket endpoint
      browser = await chromium.connect(wsEndpoint);
      expect(browser).toBeDefined();
      expect(browser.isConnected()).toBe(true);

      // Create a new page and navigate to the local test page
      const context = await browser.newContext();
      const page = await context.newPage();

      await page.goto(TEST_PAGE_URL, {
        waitUntil: 'domcontentloaded',
        timeout: 10000,
      });

      // Verify the page loaded successfully
      expect(await page.title()).toBe(TEST_PAGE_TITLE);

      // Verify the page URL
      expect(page.url()).toMatch(/^data:text\/html/);

      // Clean up
      await page.close();
      await context.close();
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  });

  it('should support multiple browser connections', async () => {
    const browser1 = await chromium.connect(wsEndpoint);
    // Declared outside the try so the finally block can close whichever
    // connections were actually established.
    let browser2: Browser | undefined;

    try {
      browser2 = await chromium.connect(wsEndpoint);

      expect(browser1.isConnected()).toBe(true);
      expect(browser2.isConnected()).toBe(true);

      // Verify both connections work independently
      const context1 = await browser1.newContext();
      const context2 = await browser2.newContext();

      const page1 = await context1.newPage();
      const page2 = await context2.newPage();

      await Promise.all([
        page1.goto(TEST_PAGE_URL, { waitUntil: 'domcontentloaded' }),
        page2.goto(TEST_PAGE_URL, { waitUntil: 'domcontentloaded' }),
      ]);

      expect(await page1.title()).toBe(TEST_PAGE_TITLE);
      expect(await page2.title()).toBe(TEST_PAGE_TITLE);

      await page1.close();
      await page2.close();
      await context1.close();
      await context2.close();
    } finally {
      // allSettled: a failure closing one connection must not skip the other.
      await Promise.allSettled([browser1.close(), browser2?.close()]);
    }
  });
});
+ +## Features + +- **Direct Browser Control**: Launches and controls Chromium browser via Playwright +- **Full Automation**: Supports navigation, clicking, typing, scrolling, and more +- **AI-Powered Actions**: Use natural language to describe actions +- **Screenshot Capture**: Returns screenshots after each action + +## Installation + +```bash +npm install @midscene/web-playwright-mcp +# or +pnpm add @midscene/web-playwright-mcp +``` + +## Usage + +### As CLI (stdio mode) + +```bash +npx @midscene/web-playwright-mcp +``` + +### As HTTP Server + +```bash +npx @midscene/web-playwright-mcp --mode http --port 3000 +``` + +### In MCP Configuration + +```json +{ + "mcpServers": { + "midscene-web-playwright": { + "command": "npx", + "args": ["@midscene/web-playwright-mcp"] + } + } +} +``` + +## Available Tools + +- `web_connect`: Launch browser and navigate to a URL +- Action tools generated from Midscene's web action space (click, type, scroll, etc.) + +## Documentation + +Full documentation: https://midscenejs.com/mcp.html + +## License + +MIT diff --git a/packages/web-playwright-mcp/package.json b/packages/web-playwright-mcp/package.json new file mode 100644 index 0000000000..33988d073d --- /dev/null +++ b/packages/web-playwright-mcp/package.json @@ -0,0 +1,50 @@ +{ + "name": "@midscene/web-playwright-mcp", + "version": "1.0.2", + "description": "Midscene MCP Server for Web automation (Playwright mode)", + "bin": "dist/index.js", + "files": ["dist"], + "main": "./dist/server.js", + "types": "./dist/server.d.ts", + "exports": { + ".": { + "types": "./dist/server.d.ts", + "default": "./dist/server.js" + }, + "./server": { + "types": "./dist/server.d.ts", + "default": "./dist/server.js" + } + }, + "scripts": { + "build": "rslib build", + "dev": "npm run build:watch", + "build:watch": "rslib build --watch --no-clean", + "mcp-playground": "npx @modelcontextprotocol/inspector node ./dist/index.js", + "run": "node ./dist/index.js --mode http --port 3000", + "test": "vitest 
import { injectReportHtmlFromCore } from '@midscene/shared/mcp';
import { defineConfig } from '@rslib/core';
import { rspack } from '@rspack/core';
import { version } from './package.json';

/**
 * Rslib build configuration for the Playwright MCP package.
 *
 * Builds two CommonJS entries (`index` and `server`) into `dist`, injects the
 * package version as `__VERSION__`, and prepends a node shebang to `index.js`.
 */
export default defineConfig({
  source: {
    define: {
      __VERSION__: `'${version}'`,
    },
    entry: {
      index: './src/index.ts',
      server: './src/server.ts',
    },
  },
  output: {
    externals: [
      // Requests for `bufferutil` / `utf-8-validate` issued from inside
      // `ws/lib` are left external (resolved at runtime, not bundled).
      (data, cb) => {
        if (
          data.context?.includes('/node_modules/ws/lib') &&
          ['bufferutil', 'utf-8-validate'].includes(data.request as string)
        ) {
          cb(undefined, data.request);
          // Stop here: falling through would invoke the callback a second
          // time for a request that has already been resolved as external.
          return;
        }
        cb();
      },
      '@silvia-odwyer/photon',
      '@silvia-odwyer/photon-node',
      '@modelcontextprotocol/sdk',
      'playwright',
      'playwright-core',
    ],
  },
  plugins: [injectReportHtmlFromCore(__dirname)],
  tools: {
    rspack: {
      plugins: [
        // Only the CLI entry (`index.js`) gets the shebang banner.
        new rspack.BannerPlugin({
          banner: '#!/usr/bin/env node',
          raw: true,
          test: /^index\.js$/,
        }),
      ],
      optimization: {
        minimize: false,
      },
    },
  },
  lib: [
    {
      format: 'cjs',
      syntax: 'es2021',
      output: {
        distPath: {
          root: 'dist',
        },
      },
    },
  ],
});
import { z } from '@midscene/core';
import { getDebug } from '@midscene/shared/logger';
import { BaseMidsceneTools, type ToolDefinition } from '@midscene/shared/mcp';
import type { PlaywrightAgent } from '@midscene/web/playwright';
import type { Browser, BrowserContext } from 'playwright';

const debug = getDebug('mcp:playwright-tools');

// Use 'any' to avoid version conflicts between different playwright-core versions
// The runtime types are compatible, but TypeScript sees different versions as incompatible
type PlaywrightPage = any;

/**
 * Playwright-specific tools manager.
 *
 * Extends BaseMidsceneTools to provide Playwright browser automation tools.
 * It owns the launched browser and its context, and releases both in
 * `closeBrowser`.
 */
// NOTE(review): type arguments appear stripped from the reviewed text; the
// `<PlaywrightAgent>` argument is restored on the assumption that
// BaseMidsceneTools is generic over its agent type — confirm against the base class.
export class PlaywrightMidsceneTools extends BaseMidsceneTools<PlaywrightAgent> {
  private browser?: Browser;
  private context?: BrowserContext;

  /**
   * Create a page object that is not backed by a running browser.
   */
  protected createTemporaryDevice() {
    // Use require to avoid type incompatibility with DeviceAction vs ActionSpaceItem
    // StaticPage.actionSpace() returns DeviceAction[] which is compatible at runtime
    const { StaticPage } = require('@midscene/web/static');
    return new StaticPage();
  }

  /**
   * Return the current agent, launching a browser first when needed.
   *
   * @param url - when given, any existing session is closed and a fresh
   *              browser is launched on this URL
   * @returns the agent bound to the active page
   * @throws Error when there is no active agent and no URL was provided
   */
  protected async ensureAgent(url?: string): Promise<PlaywrightAgent> {
    // Re-init if URL provided (navigate to new page)
    if (this.agent && url) {
      try {
        await this.closeBrowser();
      } catch (error) {
        debug('Failed to close browser during re-init:', error);
      }
    }

    if (this.agent) {
      return this.agent;
    }

    // Playwright mode requires a URL to connect
    if (!url) {
      throw new Error(
        'Playwright mode requires a URL. Use web_connect tool to connect to a page first.',
      );
    }

    debug('Launching browser and navigating to:', url);
    this.agent = await this.launchAndConnect(url);
    return this.agent;
  }

  /**
   * Launch a Playwright browser, open `url`, and wrap the page in an agent.
   *
   * If any step after the launch fails, the launched browser is closed before
   * the error is rethrown, so a failed connect does not leave a browser
   * running that this instance no longer tracks.
   *
   * @param url - URL to open in the new page
   * @returns a PlaywrightAgent bound to the opened page
   */
  private async launchAndConnect(url: string): Promise<PlaywrightAgent> {
    const { chromium } = await import('playwright');
    const { PlaywrightAgent } = await import('@midscene/web/playwright');

    // Launch browser in headed mode for visibility
    this.browser = await chromium.launch({
      headless: false,
    });

    try {
      // Create browser context
      this.context = await this.browser.newContext();

      // Create page and navigate
      const page: PlaywrightPage = await this.context.newPage();
      await page.goto(url, { waitUntil: 'domcontentloaded' });

      // Create PlaywrightAgent
      return new PlaywrightAgent(page);
    } catch (error) {
      debug('Failed to connect, closing the launched browser:', error);
      // closeBrowser logs its own failures instead of throwing, so the
      // original error below is the one the caller sees.
      await this.closeBrowser();
      throw error;
    }
  }

  /**
   * Close browser and cleanup resources.
   * Override base class method to also close browser and context.
   * Each step is attempted independently; failures are logged, not thrown.
   */
  public override async closeBrowser(): Promise<void> {
    try {
      await this.agent?.destroy();
    } catch (error) {
      debug('Failed to destroy agent:', error);
    }
    this.agent = undefined;

    try {
      await this.context?.close();
    } catch (error) {
      debug('Failed to close context:', error);
    }
    this.context = undefined;

    try {
      await this.browser?.close();
    } catch (error) {
      debug('Failed to close browser:', error);
    }
    this.browser = undefined;
  }

  /**
   * Provide Playwright-specific platform tools:
   * `web_connect` (launch and navigate) and `web_close` (tear down).
   */
  protected preparePlatformTools(): ToolDefinition[] {
    return [
      {
        name: 'web_connect',
        description:
          'Launch browser and connect to web page by URL. This will open a new Chromium browser window.',
        schema: {
          url: z.string().url().describe('URL to navigate to'),
        },
        handler: async (args) => {
          const { url } = args as { url: string };
          const agent = await this.ensureAgent(url);
          const screenshot = await agent.page.screenshotBase64();

          // Without a screenshot, report the connection as text only.
          if (!screenshot) {
            return {
              content: [
                {
                  type: 'text',
                  text: `Connected to: ${url}`,
                },
              ],
            };
          }

          return {
            content: [
              {
                type: 'text',
                text: `Connected to: ${url}`,
              },
              ...this.buildScreenshotContent(screenshot),
            ],
          };
        },
        autoDestroy: false, // Keep browser alive for subsequent operations
      },
      {
        name: 'web_close',
        description: 'Close the browser and end the automation session.',
        schema: {},
        handler: async () => {
          await this.closeBrowser();

          return {
            content: [
              {
                type: 'text',
                text: 'Browser closed successfully.',
              },
            ],
          };
        },
        autoDestroy: false,
      },
    ];
  }
}
PlaywrightMidsceneTools { + return new PlaywrightMidsceneTools(); + } +} diff --git a/packages/web-playwright-mcp/tsconfig.json b/packages/web-playwright-mcp/tsconfig.json new file mode 100644 index 0000000000..813445ceac --- /dev/null +++ b/packages/web-playwright-mcp/tsconfig.json @@ -0,0 +1,16 @@ +{ + "extends": "../shared/tsconfig.base.json", + "compilerOptions": { + "lib": ["ES2021"], + "noEmit": true, + "useDefineForClassFields": true, + "allowImportingTsExtensions": true, + "resolveJsonModule": true + }, + "include": ["src"], + "references": [ + { "path": "../web-integration" }, + { "path": "../core" }, + { "path": "../shared" } + ] +} diff --git a/packages/web-playwright-mcp/vitest.config.ts b/packages/web-playwright-mcp/vitest.config.ts new file mode 100644 index 0000000000..7c747ac048 --- /dev/null +++ b/packages/web-playwright-mcp/vitest.config.ts @@ -0,0 +1,15 @@ +import { defineConfig } from 'vitest/config'; +import { version } from './package.json'; + +export default defineConfig({ + test: { + globals: true, + environment: 'node', + }, + define: { + __VERSION__: JSON.stringify(version), + }, + ssr: { + external: ['@silvia-odwyer/photon'], + }, +}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5b4e94b76c..d606173412 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1359,6 +1359,64 @@ importers: specifier: 3.0.5 version: 3.0.5(@types/debug@4.1.12)(@types/node@18.19.62)(jsdom@26.1.0(bufferutil@4.0.9)(utf-8-validate@6.0.5))(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.44.1) + packages/web-playwright-mcp: + dependencies: + '@silvia-odwyer/photon': + specifier: 0.3.3 + version: 0.3.3 + '@silvia-odwyer/photon-node': + specifier: 0.3.3 + version: 0.3.3 + bufferutil: + specifier: 4.0.9 + version: 4.0.9 + playwright: + specifier: ^1.52.0 + version: 1.57.0 + sharp: + specifier: ^0.34.3 + version: 0.34.3 + utf-8-validate: + specifier: 6.0.5 + version: 6.0.5 + devDependencies: + '@midscene/core': + specifier: workspace:* + version: 
link:../core + '@midscene/report': + specifier: workspace:* + version: link:../../apps/report + '@midscene/shared': + specifier: workspace:* + version: link:../shared + '@midscene/web': + specifier: workspace:* + version: link:../web-integration + '@modelcontextprotocol/inspector': + specifier: ^0.16.3 + version: 0.16.3(@types/node@18.19.118)(@types/react-dom@19.1.5(@types/react@19.1.5))(@types/react@19.1.5)(bufferutil@4.0.9)(typescript@5.8.3)(utf-8-validate@6.0.5) + '@modelcontextprotocol/sdk': + specifier: 1.10.2 + version: 1.10.2 + '@rslib/core': + specifier: ^0.18.3 + version: 0.18.3(@microsoft/api-extractor@7.52.10(@types/node@18.19.118))(typescript@5.8.3) + '@rspack/core': + specifier: 1.6.6 + version: 1.6.6(@swc/helpers@0.5.17) + '@types/node': + specifier: ^18.0.0 + version: 18.19.118 + dotenv: + specifier: ^16.4.5 + version: 16.4.7 + typescript: + specifier: ^5.8.3 + version: 5.8.3 + vitest: + specifier: 3.0.5 + version: 3.0.5(@types/debug@4.1.12)(@types/node@18.19.118)(jsdom@26.1.0(bufferutil@4.0.9)(utf-8-validate@6.0.5))(less@4.3.0)(lightningcss@1.30.1)(sass-embedded@1.86.3)(terser@5.44.1) + packages/webdriver: dependencies: '@midscene/shared': @@ -4301,9 +4359,6 @@ packages: '@types/node@18.19.62': resolution: {integrity: sha512-UOGhw+yZV/icyM0qohQVh3ktpY40Sp7tdTW7HxG3pTd7AiMrlFlAJNUrGK9t5mdW0+ViQcFV74zCSIx9ZJpncA==} - '@types/node@24.10.1': - resolution: {integrity: sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ==} - '@types/node@24.10.2': resolution: {integrity: sha512-WOhQTZ4G8xZ1tjJTvKOpyEVSGgOTvJAfDK3FNFgELyaTpzhdgHVHeqW8V+UJvzF5BT+/B54T/1S2K6gd9c7bbA==} @@ -8523,11 +8578,21 @@ packages: engines: {node: '>=16'} hasBin: true + playwright-core@1.57.0: + resolution: {integrity: sha512-agTcKlMw/mjBWOnD6kFZttAAGHgi/Nw0CZ2o6JqWSbMlI219lAFLZZCyqByTsvVAJq5XA5H8cA6PrvBRpBWEuQ==} + engines: {node: '>=18'} + hasBin: true + playwright@1.44.1: resolution: {integrity: 
sha512-qr/0UJ5CFAtloI3avF95Y0L1xQo6r3LQArLIg/z/PoGJ6xa+EwzrwO5lpNr/09STxdHuUoP2mvuELJS+hLdtgg==} engines: {node: '>=16'} hasBin: true + playwright@1.57.0: + resolution: {integrity: sha512-ilYQj1s8sr2ppEJ2YVadYBN0Mb3mdo9J0wQ+UuDhzYqURwSoW4n1Xs5vs7ORwgDGmyEh33tRMeS8KhdkMoLXQw==} + engines: {node: '>=18'} + hasBin: true + plist@3.1.0: resolution: {integrity: sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ==} engines: {node: '>=10.4.0'} @@ -14407,7 +14472,7 @@ snapshots: '@types/conventional-commits-parser@5.0.2': dependencies: - '@types/node': 24.10.1 + '@types/node': 18.19.118 '@types/cookie@0.4.1': {} @@ -14533,10 +14598,6 @@ snapshots: dependencies: undici-types: 5.26.5 - '@types/node@24.10.1': - dependencies: - undici-types: 7.16.0 - '@types/node@24.10.2': dependencies: undici-types: 7.16.0 @@ -14604,7 +14665,7 @@ snapshots: '@types/yauzl@2.10.3': dependencies: - '@types/node': 24.10.2 + '@types/node': 18.19.118 '@ui-tars/action-parser@1.2.3': dependencies: @@ -15372,7 +15433,7 @@ snapshots: dependencies: bytes: 3.1.2 content-type: 1.0.5 - debug: 4.4.3 + debug: 4.4.0 http-errors: 2.0.0 iconv-lite: 0.6.3 on-finished: 2.4.1 @@ -15675,7 +15736,7 @@ snapshots: chrome-launcher@1.2.0: dependencies: - '@types/node': 24.10.2 + '@types/node': 18.19.118 escape-string-regexp: 4.0.0 is-wsl: 2.2.0 lighthouse-logger: 2.0.2 @@ -16881,7 +16942,7 @@ snapshots: content-type: 1.0.5 cookie: 0.7.2 cookie-signature: 1.2.2 - debug: 4.4.3 + debug: 4.4.0 encodeurl: 2.0.0 escape-html: 1.0.3 etag: 1.8.1 @@ -17023,7 +17084,7 @@ snapshots: finalhandler@2.1.0: dependencies: - debug: 4.4.3 + debug: 4.4.0 encodeurl: 2.0.0 escape-html: 1.0.3 on-finished: 2.4.1 @@ -19670,12 +19731,20 @@ snapshots: playwright-core@1.44.1: {} + playwright-core@1.57.0: {} + playwright@1.44.1: dependencies: playwright-core: 1.44.1 optionalDependencies: fsevents: 2.3.2 + playwright@1.57.0: + dependencies: + playwright-core: 1.57.0 + optionalDependencies: + fsevents: 2.3.2 + 
plist@3.1.0: dependencies: '@xmldom/xmldom': 0.8.10 @@ -20617,7 +20686,7 @@ snapshots: router@2.2.0: dependencies: - debug: 4.4.3 + debug: 4.4.0 depd: 2.0.0 is-promise: 4.0.0 parseurl: 1.3.3 @@ -20881,7 +20950,7 @@ snapshots: send@1.2.0: dependencies: - debug: 4.4.3 + debug: 4.4.0 encodeurl: 2.0.0 escape-html: 1.0.3 etag: 1.8.1