Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions packages/core/src/device/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,7 @@ const inputLocateDescription =
'the position of the placeholder or text content in the target input field. If there is no content, locate the center of the input field.';
export const actionInputParamSchema = z.object({
value: z
.union([z.string(), z.number()])
.transform((val) => String(val))
.string()
.describe(
'The text to input. Provide the final content for replace/append modes, or an empty string when using clear mode to remove existing text.',
),
Expand Down
258 changes: 255 additions & 3 deletions packages/shared/src/mcp/tool-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,70 @@ function createErrorResult(message: string): ToolResult {
};
}

/**
 * Collect the names of the top-level fields of an action's paramSchema
 * that are locate fields.
 *
 * A field counts when, after unwrapOptional has been applied to it, the
 * inner value is a Zod object that isLocateField accepts.
 *
 * @param paramSchema - The action's parameter schema, if it has one.
 * @returns The matching field names in schema order; an empty array when
 *   the schema is missing or is not a Zod object.
 */
function getLocateFieldNames(paramSchema: z.ZodTypeAny | undefined): string[] {
  if (!paramSchema || !isZodObject(paramSchema)) {
    return [];
  }

  return Object.entries(paramSchema.shape)
    .filter(([, fieldSchema]) => {
      const { innerValue } = unwrapOptional(fieldSchema as z.ZodTypeAny);
      return isZodObject(innerValue) && isLocateField(innerValue);
    })
    .map(([fieldName]) => fieldName);
}

/**
 * Decide whether an action call has to go through AI reasoning.
 *
 * An action needs AI when at least one of its locate fields was supplied
 * with a prompt that still has to be resolved. Three shapes are recognised
 * for a locate argument:
 *   - a plain string prompt,
 *   - an object whose `prompt` is a string,
 *   - an object whose `prompt` is itself an object carrying a `prompt`.
 *
 * Blank (empty or whitespace-only) prompt strings are treated as "no prompt"
 * in all three shapes, so the same input text gives the same answer no
 * matter how it is wrapped.
 *
 * @param locateFields - Names of the action's locate fields.
 * @param args - Arguments passed to the tool call.
 * @returns `true` if any listed field carries a usable prompt, else `false`.
 */
function actionNeedsAI(
  locateFields: string[],
  args: Record<string, unknown>,
): boolean {
  // `some` on an empty list is false, which covers "no locate fields".
  return locateFields.some((field) => hasLocatePrompt(args[field]));
}

/**
 * Report whether a single locate argument carries a non-blank prompt.
 */
function hasLocatePrompt(value: unknown): boolean {
  if (typeof value === 'string') {
    return value.trim() !== '';
  }
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const prompt = (value as { prompt?: unknown }).prompt;
  if (typeof prompt === 'string') {
    return prompt.trim() !== '';
  }
  if (typeof prompt === 'object' && prompt !== null) {
    const nested = (prompt as { prompt?: unknown }).prompt;
    // Non-string nested values keep the original truthiness rule; only
    // blank strings are additionally rejected.
    return typeof nested === 'string' ? nested.trim() !== '' : Boolean(nested);
  }
  return false;
}

/**
* Converts DeviceAction from actionSpace into MCP ToolDefinition
* This is the core logic that removes need for hardcoded tool definitions
Expand All @@ -254,6 +318,7 @@ export function generateToolsFromActionSpace(
): ToolDefinition[] {
return actionSpace.map((action) => {
const schema = extractActionSchema(action.paramSchema as z.ZodTypeAny);
const locateFields = getLocateFieldNames(action.paramSchema as z.ZodTypeAny);

return {
name: action.name,
Expand All @@ -263,7 +328,11 @@ export function generateToolsFromActionSpace(
try {
const agent = await getAgent();

if (agent.aiAction) {
// Check if this action needs AI reasoning
const needsAI = actionNeedsAI(locateFields, args);

if (needsAI && agent.aiAction) {
// Use AI-based action execution for actions that need element location
const instruction = buildActionInstruction(action.name, args);
try {
await agent.aiAction(instruction);
Expand All @@ -277,6 +346,53 @@ export function generateToolsFromActionSpace(
`Failed to execute action "${action.name}": ${errorMessage}`,
);
}
} else if (agent.callActionInActionSpace) {
// Use direct action execution for actions that don't need AI
// This is more efficient and doesn't require AI model service
try {
await agent.callActionInActionSpace(action.name, args);
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
console.error(
`Error executing action "${action.name}":`,
errorMessage,
);
return createErrorResult(
`Failed to execute action "${action.name}": ${errorMessage}`,
);
}
} else if (agent.aiAction) {
// Fallback to AI action if callActionInActionSpace is not available
const instruction = buildActionInstruction(action.name, args);
try {
await agent.aiAction(instruction);
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
console.error(
`Error executing action "${action.name}":`,
errorMessage,
);
return createErrorResult(
`Failed to execute action "${action.name}": ${errorMessage}`,
);
}
} else {
return createErrorResult(
`Cannot execute action "${action.name}": No execution method available`,
);
}

// Wait for network idle after action to ensure page stability
// This is especially important for actions that may trigger navigation (e.g., clicking links)
if (agent.waitForNetworkIdle) {
try {
await agent.waitForNetworkIdle();
} catch (error: unknown) {
// Network idle timeout is not critical, continue to take screenshot
console.warn(
`[midscene:warning] waitForNetworkIdle timed out after action "${action.name}", continuing execution`,
);
}
}

return await captureScreenshotResult(agent, action.name);
Expand All @@ -294,8 +410,8 @@ export function generateToolsFromActionSpace(
}

/**
* Generate common tools (screenshot, waitFor)
* SIMPLIFIED: Only keep essential helper tools, removed assert
* Generate common tools (screenshot, navigation, page info)
* These are direct browser commands that don't need AI reasoning
*/
export function generateCommonTools(
getAgent: () => Promise<BaseAgent>,
Expand Down Expand Up @@ -326,5 +442,141 @@ export function generateCommonTools(
},
autoDestroy: true,
},
{
name: 'navigate',
description:
'Navigate the browser to a specified URL. Opens the URL in the current tab.',
schema: {
url: z.string().describe('The URL to navigate to'),
},
handler: async (args): Promise<ToolResult> => {
try {
const { url } = args as { url: string };
const agent = await getAgent();
if (!agent.page?.navigate) {
return createErrorResult(
'Navigate operation is not supported on this page type',
);
}
await agent.page.navigate(url);
return await captureScreenshotResult(agent, 'navigate');
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(`Failed to navigate: ${errorMessage}`);
}
},
autoDestroy: true,
},
{
name: 'reload',
description: 'Reload the current page',
schema: {},
handler: async (): Promise<ToolResult> => {
try {
const agent = await getAgent();
if (!agent.page?.reload) {
return createErrorResult(
'Reload operation is not supported on this page type',
);
}
await agent.page.reload();
return await captureScreenshotResult(agent, 'reload');
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(`Failed to reload: ${errorMessage}`);
}
},
autoDestroy: true,
},
{
name: 'go_back',
description: 'Navigate back in browser history',
schema: {},
handler: async (): Promise<ToolResult> => {
try {
const agent = await getAgent();
if (!agent.page?.goBack) {
return createErrorResult(
'GoBack operation is not supported on this page type',
);
}
await agent.page.goBack();
return await captureScreenshotResult(agent, 'go_back');
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(`Failed to go back: ${errorMessage}`);
}
},
autoDestroy: true,
},
{
name: 'go_forward',
description: 'Navigate forward in browser history',
schema: {},
handler: async (): Promise<ToolResult> => {
try {
const agent = await getAgent();
if (!agent.page?.goForward) {
return createErrorResult(
'GoForward operation is not supported on this page type',
);
}
await agent.page.goForward();
return await captureScreenshotResult(agent, 'go_forward');
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(`Failed to go forward: ${errorMessage}`);
}
},
autoDestroy: true,
},
{
name: 'get_url',
description: 'Get the current URL of the page',
schema: {},
handler: async (): Promise<ToolResult> => {
try {
const agent = await getAgent();
if (!agent.page?.getCurrentUrl) {
return createErrorResult(
'GetCurrentUrl operation is not supported on this page type',
);
}
const url = agent.page.getCurrentUrl();
return {
content: [{ type: 'text', text: `Current URL: ${url}` }],
};
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(
`Failed to get current URL: ${errorMessage}`,
);
}
},
autoDestroy: true,
},
{
name: 'get_title',
description: 'Get the title of the current page',
schema: {},
handler: async (): Promise<ToolResult> => {
try {
const agent = await getAgent();
if (!agent.page?.getPageTitle) {
return createErrorResult(
'GetPageTitle operation is not supported on this page type',
);
}
const title = await agent.page.getPageTitle();
return {
content: [{ type: 'text', text: `Page title: ${title}` }],
};
} catch (error: unknown) {
const errorMessage = getErrorMessage(error);
return createErrorResult(`Failed to get page title: ${errorMessage}`);
}
},
autoDestroy: true,
},
];
}
21 changes: 21 additions & 0 deletions packages/shared/src/mcp/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,13 @@ export interface BaseAgent {
destroy?(): Promise<void>;
page?: {
screenshotBase64(): Promise<string>;
// Navigation methods - available on web pages
navigate?(url: string): Promise<void>;
reload?(): Promise<void>;
goBack?(): Promise<void>;
goForward?(): Promise<void>;
getCurrentUrl?(): string;
getPageTitle?(): Promise<string>;
};
aiAction?: (
description: string,
Expand All @@ -88,6 +95,20 @@ export interface BaseAgent {
assertion: string,
options: Record<string, unknown>,
) => Promise<unknown>;
/**
* Call an action directly without AI reasoning.
* This is useful for actions that don't need AI to locate elements,
* e.g., KeyboardPress with no locate parameter.
*/
callActionInActionSpace?: (
actionName: string,
params: Record<string, unknown>,
) => Promise<unknown>;
/**
* Wait for network idle after actions (e.g., after clicking a link that triggers navigation)
* This helps ensure consistent screenshots after navigation-triggering actions
*/
waitForNetworkIdle?: (timeout?: number) => Promise<void>;
}

/**
Expand Down
19 changes: 19 additions & 0 deletions packages/web-bridge-mcp/src/web-tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,25 @@ export class WebMidsceneTools extends BaseMidsceneTools<AgentOverChromeBridge> {
},
autoDestroy: false,
},
{
name: 'web_close',
description:
'Close the browser connection and end the automation session.',
schema: {},
handler: async () => {
await this.closeBrowser();

return {
content: [
{
type: 'text',
text: 'Browser connection closed successfully.',
},
],
};
},
autoDestroy: false,
},
];
}
}
23 changes: 23 additions & 0 deletions packages/web-integration/src/chrome-extension/page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,29 @@ export default class ChromeExtensionProxyPage implements AbstractInterface {
await this.waitUntilNetworkIdle();
}

/**
 * Step the tab returned by getTabIdOrConnectToCurrentTab one entry forward
 * in its history, then wait until the network is idle before resolving.
 */
async goForward(): Promise<void> {
  const targetTabId = await this.getTabIdOrConnectToCurrentTab();
  await chrome.tabs.goForward(targetTabId);
  // Do not resolve until the navigation triggered above has settled.
  await this.waitUntilNetworkIdle();
}

/**
 * Synchronous URL accessor. Not supported by this page implementation.
 *
 * @throws Error always; the message tells callers to use evaluate to read
 *   window.location.href instead.
 */
getCurrentUrl(): string {
// This method is declared synchronous, but chrome.tabs.get is async, so the
// URL cannot be fetched here. It throws unconditionally rather than
// returning an empty or stale value.
// NOTE(review): an earlier comment suggested getPageInfo for async URL
// retrieval — confirm that helper exists before pointing callers at it.
throw new Error(
'getCurrentUrl is not supported in chrome extension mode. Use evaluate to get window.location.href instead.',
);
}

/**
 * Read the title of the tab returned by getTabIdOrConnectToCurrentTab.
 *
 * @returns The tab's title, or an empty string when the tab has no title.
 */
async getPageTitle(): Promise<string> {
  const targetTabId = await this.getTabIdOrConnectToCurrentTab();
  const { title } = await chrome.tabs.get(targetTabId);
  return title ? title : '';
}

async scrollUntilTop(startingPoint?: Point) {
if (startingPoint) {
await this.mouse.move(startingPoint.left, startingPoint.top);
Expand Down
Loading