web-infra-dev · lilac · Dec 21, 2025 · Dec 21, 2025 · Dec 21, 2025 · Dec 21, 2025
diff --git a/packages/core/src/device/index.ts b/packages/core/src/device/index.ts
@@ -168,8 +168,7 @@ const inputLocateDescription =
   'the position of the placeholder or text content in the target input field. If there is no content, locate the center of the input field.';
 export const actionInputParamSchema = z.object({
   value: z
-    .union([z.string(), z.number()])
-    .transform((val) => String(val))
+    .string()
     .describe(
       'The text to input. Provide the final content for replace/append modes, or an empty string when using clear mode to remove existing text.',
     ),

diff --git a/packages/shared/src/mcp/tool-generator.ts b/packages/shared/src/mcp/tool-generator.ts
@@ -244,6 +244,70 @@ function createErrorResult(message: string): ToolResult {
   };
 }
 
+/**
+ * Collect the keys of an action's paramSchema whose value, once unwrapped, is a locate object.
+ */
+function getLocateFieldNames(paramSchema: z.ZodTypeAny | undefined): string[] {
+  // Actions without a schema, or with a non-object schema, expose no locate fields.
+  if (!paramSchema || !isZodObject(paramSchema)) {
+    return [];
+  }
+  const names: string[] = [];
+  const entries = Object.entries(paramSchema.shape);
+  for (const [fieldName, fieldSchema] of entries) {
+    // Inspect the value reported by unwrapOptional rather than the raw field schema.
+    const unwrapped = unwrapOptional(fieldSchema as z.ZodTypeAny).innerValue;
+    if (!isZodObject(unwrapped)) {
+      continue;
+    }
+    if (isLocateField(unwrapped)) {
+      names.push(fieldName);
+    }
+  }
+  return names;
+}
+
+/**
+ * Decide whether running an action with the given args requires AI element location.
+ * True when any locate field holds a prompt: a non-empty string, or an object with a usable `prompt`.
+ */
+function actionNeedsAI(
+  locateFields: string[],
+  args: Record<string, unknown>,
+): boolean {
+  // With no locate fields `some` never runs its callback, so the result is false.
+  return locateFields.some((fieldName) => {
+    const candidate = args[fieldName];
+    // Field omitted or otherwise falsy: there is no prompt to resolve.
+    if (!candidate) {
+      return false;
+    }
+
+    // A bare, non-empty string is itself the locate prompt.
+    if (typeof candidate === 'string') {
+      return true;
+    }
+
+    // Any other truthy non-object value carries no prompt.
+    if (typeof candidate !== 'object') {
+      return false;
+    }
+
+    const prompt = (candidate as { prompt?: unknown }).prompt;
+    if (!prompt) {
+      return false;
+    }
+
+    // Prompt text counts only when it is not blank.
+    if (typeof prompt === 'string') {
+      return prompt.trim() !== '';
+    }
+
+    // Nested form: the prompt is an object that carries its own truthy `prompt`.
+    return typeof prompt === 'object' && Boolean((prompt as { prompt?: unknown }).prompt);
+  });
+}
+
 /**
  * Converts DeviceAction from actionSpace into MCP ToolDefinition
  * This is the core logic that removes need for hardcoded tool definitions
@@ -254,6 +318,7 @@ export function generateToolsFromActionSpace(
 ): ToolDefinition[] {
   return actionSpace.map((action) => {
     const schema = extractActionSchema(action.paramSchema as z.ZodTypeAny);
+    const locateFields = getLocateFieldNames(action.paramSchema as z.ZodTypeAny);
 
     return {
       name: action.name,
@@ -263,7 +328,11 @@ export function generateToolsFromActionSpace(
         try {
           const agent = await getAgent();
 
-          if (agent.aiAction) {
+          // Check if this action needs AI reasoning
+          const needsAI = actionNeedsAI(locateFields, args);
+
+          if (needsAI && agent.aiAction) {
+            // Use AI-based action execution for actions that need element location
             const instruction = buildActionInstruction(action.name, args);
             try {
               await agent.aiAction(instruction);
@@ -277,6 +346,53 @@ export function generateToolsFromActionSpace(
                 `Failed to execute action "${action.name}": ${errorMessage}`,
               );
             }
+          } else if (agent.callActionInActionSpace) {
+            // Use direct action execution for actions that don't need AI
+            // This is more efficient and doesn't require AI model service
+            try {
+              await agent.callActionInActionSpace(action.name, args);
+            } catch (error: unknown) {
+              const errorMessage = getErrorMessage(error);
+              console.error(
+                `Error executing action "${action.name}":`,
+                errorMessage,
+              );
+              return createErrorResult(
+                `Failed to execute action "${action.name}": ${errorMessage}`,
+              );
+            }
+          } else if (agent.aiAction) {
+            // Fallback to AI action if callActionInActionSpace is not available
+            const instruction = buildActionInstruction(action.name, args);
+            try {
+              await agent.aiAction(instruction);
+            } catch (error: unknown) {
+              const errorMessage = getErrorMessage(error);
+              console.error(
+                `Error executing action "${action.name}":`,
+                errorMessage,
+              );
+              return createErrorResult(
+                `Failed to execute action "${action.name}": ${errorMessage}`,
+              );
+            }
+          } else {
+            return createErrorResult(
+              `Cannot execute action "${action.name}": No execution method available`,
+            );
+          }
+
+          // Wait for network idle after action to ensure page stability
+          // This is especially important for actions that may trigger navigation (e.g., clicking links)
+          if (agent.waitForNetworkIdle) {
+            try {
+              await agent.waitForNetworkIdle();
+            } catch (error: unknown) {
+              // Network idle timeout is not critical, continue to take screenshot
+              console.warn(
+                `[midscene:warning] waitForNetworkIdle timed out after action "${action.name}", continuing execution`,
+              );
+            }
           }
 
           return await captureScreenshotResult(agent, action.name);
@@ -294,8 +410,8 @@ export function generateToolsFromActionSpace(
 }
 
 /**
- * Generate common tools (screenshot, waitFor)
- * SIMPLIFIED: Only keep essential helper tools, removed assert
+ * Generate common tools (screenshot, navigation, page info)
+ * These are direct browser commands that don't need AI reasoning
  */
 export function generateCommonTools(
   getAgent: () => Promise<BaseAgent>,
@@ -326,5 +442,141 @@ export function generateCommonTools(
       },
       autoDestroy: true,
     },
+    {
+      name: 'navigate',
+      description:
+        'Navigate the browser to a specified URL. Opens the URL in the current tab.',
+      schema: {
+        url: z.string().describe('The URL to navigate to'),
+      },
+      // Navigates via the page's optional navigate method, then returns captureScreenshotResult.
+      handler: async (args): Promise<ToolResult> => {
+        try {
+          const { url } = args as { url: string };
+          const agent = await getAgent();
+          const page = agent.page;
+          if (page?.navigate) {
+            await page.navigate(url);
+            return await captureScreenshotResult(agent, 'navigate');
+          }
+          const unsupported = 'Navigate operation is not supported on this page type';
+          return createErrorResult(unsupported);
+        } catch (failure: unknown) {
+          return createErrorResult(`Failed to navigate: ${getErrorMessage(failure)}`);
+        }
+      },
+      autoDestroy: true,
+    },
+    {
+      name: 'reload',
+      description: 'Reload the current page',
+      schema: {},
+      // Reloads via the page's optional reload method, then returns captureScreenshotResult.
+      handler: async (): Promise<ToolResult> => {
+        try {
+          const agent = await getAgent();
+          const page = agent.page;
+          if (page?.reload) {
+            await page.reload();
+            return await captureScreenshotResult(agent, 'reload');
+          }
+          const unsupported = 'Reload operation is not supported on this page type';
+          return createErrorResult(unsupported);
+        } catch (failure: unknown) {
+          return createErrorResult(`Failed to reload: ${getErrorMessage(failure)}`);
+        }
+      },
+      autoDestroy: true,
+    },
+    {
+      name: 'go_back',
+      description: 'Navigate back in browser history',
+      schema: {},
+      // Steps back via the page's optional goBack method, then returns captureScreenshotResult.
+      handler: async (): Promise<ToolResult> => {
+        try {
+          const agent = await getAgent();
+          const page = agent.page;
+          if (page?.goBack) {
+            await page.goBack();
+            return await captureScreenshotResult(agent, 'go_back');
+          }
+          const unsupported = 'GoBack operation is not supported on this page type';
+          return createErrorResult(unsupported);
+        } catch (failure: unknown) {
+          return createErrorResult(`Failed to go back: ${getErrorMessage(failure)}`);
+        }
+      },
+      autoDestroy: true,
+    },
+    {
+      name: 'go_forward',
+      description: 'Navigate forward in browser history',
+      schema: {},
+      // Steps forward via the page's optional goForward method, then returns captureScreenshotResult.
+      handler: async (): Promise<ToolResult> => {
+        try {
+          const agent = await getAgent();
+          const page = agent.page;
+          if (page?.goForward) {
+            await page.goForward();
+            return await captureScreenshotResult(agent, 'go_forward');
+          }
+          const unsupported = 'GoForward operation is not supported on this page type';
+          return createErrorResult(unsupported);
+        } catch (failure: unknown) {
+          return createErrorResult(`Failed to go forward: ${getErrorMessage(failure)}`);
+        }
+      },
+      autoDestroy: true,
+    },
+    {
+      name: 'get_url',
+      description: 'Get the current URL of the page',
+      schema: {},
+      // Reports the URL as a text result; needs the page's optional getCurrentUrl.
+      handler: async (): Promise<ToolResult> => {
+        try {
+          const agent = await getAgent();
+          const page = agent.page;
+          if (!page?.getCurrentUrl) {
+            const unsupported =
+              'GetCurrentUrl operation is not supported on this page type';
+            return createErrorResult(unsupported);
+          }
+          // getCurrentUrl is typed as synchronous, so its result is used without awaiting.
+          const currentUrl = page.getCurrentUrl();
+          const text = `Current URL: ${currentUrl}`;
+          return { content: [{ type: 'text', text }] };
+        } catch (failure: unknown) {
+          const reason = getErrorMessage(failure);
+          return createErrorResult(`Failed to get current URL: ${reason}`);
+        }
+      },
+      autoDestroy: true,
+    },
+    {
+      name: 'get_title',
+      description: 'Get the title of the current page',
+      schema: {},
+      // Reports the page title as a text result; needs the page's optional getPageTitle.
+      handler: async (): Promise<ToolResult> => {
+        try {
+          const agent = await getAgent();
+          const page = agent.page;
+          if (!page?.getPageTitle) {
+            const unsupported =
+              'GetPageTitle operation is not supported on this page type';
+            return createErrorResult(unsupported);
+          }
+          const pageTitle = await page.getPageTitle();
+          const text = `Page title: ${pageTitle}`;
+          return { content: [{ type: 'text', text }] };
+        } catch (failure: unknown) {
+          return createErrorResult(`Failed to get page title: ${getErrorMessage(failure)}`);
+        }
+      },
+      autoDestroy: true,
+    },
   ];
 }
diff --git a/packages/shared/src/mcp/types.ts b/packages/shared/src/mcp/types.ts
@@ -79,6 +79,13 @@ export interface BaseAgent {
   destroy?(): Promise<void>;
   page?: {
     screenshotBase64(): Promise<string>;
+    // Navigation methods - available on web pages
+    navigate?(url: string): Promise<void>;
+    reload?(): Promise<void>;
+    goBack?(): Promise<void>;
+    goForward?(): Promise<void>;
+    getCurrentUrl?(): string;
+    getPageTitle?(): Promise<string>;
   };
   aiAction?: (
     description: string,
@@ -88,6 +95,20 @@ export interface BaseAgent {
     assertion: string,
     options: Record<string, unknown>,
   ) => Promise<unknown>;
+  /**
+   * Call an action directly without AI reasoning.
+   * This is useful for actions that don't need AI to locate elements,
+   * e.g., KeyboardPress with no locate parameter.
+   */
+  callActionInActionSpace?: (
+    actionName: string,
+    params: Record<string, unknown>,
+  ) => Promise<unknown>;
+  /**
+   * Wait for network idle after actions (e.g., after clicking a link that triggers navigation)
+   * This helps ensure consistent screenshots after navigation-triggering actions
+   */
+  waitForNetworkIdle?: (timeout?: number) => Promise<void>;
 }
 
 /**

diff --git a/packages/web-bridge-mcp/src/web-tools.ts b/packages/web-bridge-mcp/src/web-tools.ts
@@ -89,6 +89,25 @@ export class WebMidsceneTools extends BaseMidsceneTools<AgentOverChromeBridge> {
         },
         autoDestroy: false,
       },
+      {
+        name: 'web_close',
+        description:
+          'Close the browser connection and end the automation session.',
+        schema: {},
+        // Closes the browser through closeBrowser, then reports success as plain text.
+        handler: async () => {
+          // The confirmation is only produced when closeBrowser resolves without throwing.
+          await this.closeBrowser();
+
+          const confirmation = 'Browser connection closed successfully.';
+          return {
+            content: [
+              { type: 'text', text: confirmation },
+            ],
+          };
+        },
+        autoDestroy: false,
+      },
     ];
   }
 }
diff --git a/packages/web-integration/src/chrome-extension/page.ts b/packages/web-integration/src/chrome-extension/page.ts
@@ -475,6 +475,29 @@ export default class ChromeExtensionProxyPage implements AbstractInterface {
     await this.waitUntilNetworkIdle();
   }
 
+  async goForward(): Promise<void> {
+    const activeTabId = await this.getTabIdOrConnectToCurrentTab();
+    // Ask Chrome to move this tab one entry forward in its history.
+    await chrome.tabs.goForward(activeTabId);
+    await this.waitUntilNetworkIdle(); // wait for the navigation to settle before returning
+  }
+
+  getCurrentUrl(): string {
+    // Always throws: this method is declared synchronous (it returns a plain string),
+    // while tab details are fetched with the async chrome.tabs API in this class.
+    // No cached or empty URL is returned; the call fails explicitly instead.
+    // As the error message states, callers should evaluate window.location.href.
+    throw new Error(
+      'getCurrentUrl is not supported in chrome extension mode. Use evaluate to get window.location.href instead.',
+    );
+  }
+
+  async getPageTitle(): Promise<string> {
+    const activeTabId = await this.getTabIdOrConnectToCurrentTab();
+    const tabInfo = await chrome.tabs.get(activeTabId);
+    return tabInfo.title ? tabInfo.title : ''; // a missing or empty title becomes ''
+  }
+
   async scrollUntilTop(startingPoint?: Point) {
     if (startingPoint) {
       await this.mouse.move(startingPoint.left, startingPoint.top);