web-infra-dev · quanru · Jan 8, 2026 · Dec 31, 2025 · Dec 31, 2025 · Jan 4, 2026
diff --git a/apps/site/docs/en/api.mdx b/apps/site/docs/en/api.mdx
@@ -113,7 +113,7 @@ In Midscene, you can choose to use either auto planning or instant action.
 
 :::
 
-### `agent.aiAct()` or `.ai()`
+### `agent.aiAct()` or `agent.ai()`
 
 This method allows you to perform a series of UI actions described in natural language. Midscene automatically plans the steps and executes them.
 
@@ -131,6 +131,7 @@ function aiAct(
   options?: {
     cacheable?: boolean;
     deepThink?: 'unset' | true | false;
+    fileChooserAccept?: string | string[];
   },
 ): Promise<void>;
 function ai(prompt: string): Promise<void>; // shorthand form
@@ -142,6 +143,9 @@ function ai(prompt: string): Promise<void>; // shorthand form
   - `options?: Object` - Optional, a configuration object containing:
     - `cacheable?: boolean` - Whether cacheable when enabling [caching feature](./caching.mdx). True by default.
     - `deepThink?: 'unset' | true | false` - Whether to enable deep thinking during planning when the model supports it (depends on MIDSCENE_MODEL_FAMILY). Default is `'unset'` (same as omitting) and follows the model provider's default strategy. [Learn more about deepThink](./model-strategy#about-the-deepthink-option-in-aiact).
+    - `fileChooserAccept?: string | string[]` - File path(s) to supply when an action opens a file chooser. Can be a single file path or an array of paths. Only available on web pages (Playwright, Puppeteer).
+      - **Note**: If the file input does not support multiple files (no `multiple` attribute) but multiple files are provided, an error will be thrown.
+      - **Note**: If a file chooser is triggered but no `fileChooserAccept` parameter is provided, the file chooser will be ignored and the page can continue to operate normally.
 
 - Return Value:
 
@@ -193,6 +197,9 @@ function aiTap(locate: string | Object, options?: Object): Promise<void>;
     - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element, which can improve accuracy. False by default. With newer models (e.g. Qwen3 / Doubao 1.6 / Gemini 3), the gain is less obvious.
     - `xpath?: string` - The xpath of the element to operate. If provided, Midscene will first use this xpath to locate the element before using the cache and the AI model. Empty by default.
     - `cacheable?: boolean` - Whether cacheable when enabling [caching feature](./caching.mdx). True by default.
+    - `fileChooserAccept?: string | string[]` - File path(s) to supply when the tap opens a file chooser. Can be a single file path or an array of paths. Only available on web pages (Playwright, Puppeteer).
+      - **Note**: If the file input does not support multiple files (no `multiple` attribute) but multiple files are provided, an error will be thrown.
+      - **Note**: If a file chooser is triggered but no `fileChooserAccept` parameter is provided, the file chooser will be ignored and the page can continue to operate normally.
 
 - Return Value:
 
@@ -207,6 +214,10 @@ await agent.aiTap('The login button at the top of the page');
 await agent.aiTap('The login button at the top of the page', {
   deepThink: true,
 });
+
+// File upload: tap the upload button and select files
+await agent.aiTap('Choose file button', { fileChooserAccept: ['./document.pdf'] });
+await agent.aiTap('Upload images', { fileChooserAccept: ['./image1.jpg', './image2.png'] });
 ```
 
 ### `agent.aiHover()`

diff --git a/apps/site/docs/zh/api.mdx b/apps/site/docs/zh/api.mdx
@@ -115,7 +115,7 @@ const agent = new PuppeteerAgent(page, {
 
 :::
 
-### `agent.aiAct()` 或 `.ai()`
+### `agent.aiAct()` 或 `agent.ai()`
 
 这个方法允许你通过自然语言描述一系列 UI 操作步骤。Midscene 会自动规划这些步骤并执行。
 
@@ -133,6 +133,7 @@ function aiAct(
   options?: {
     cacheable?: boolean;
     deepThink?: 'unset' | true | false;
+    fileChooserAccept?: string | string[];
   },
 ): Promise<void>;
 function ai(prompt: string): Promise<void>; // 简写形式
@@ -144,6 +145,9 @@ function ai(prompt: string): Promise<void>; // 简写形式
   - `options?: Object` - 可选，一个配置对象，包含：
     - `cacheable?: boolean` - 当启用 [缓存功能](./caching.mdx) 时，是否允许缓存当前 API 调用结果。默认值为 true
     - `deepThink?: 'unset' | true | false` - 当模型支持时（取决于 MIDSCENE_MODEL_FAMILY），是否开启规划阶段的深度思考能力。默认值为 `'unset'`（等同于省略该参数），跟随模型服务商的默认策略。[详情参阅 deepThink 说明](./model-strategy#关于-aiact-方法的-deepthink-参数)。
+    - `fileChooserAccept?: string | string[]` - 当文件选择器弹出时，指定对应的文件路径。可以是单个文件路径或路径数组。仅在 web 页面（Playwright、Puppeteer）中可用。
+      - **注意**：如果文件输入框不支持多文件（没有 `multiple` 属性），但是传入了多个文件，会抛出错误。
+      - **注意**：如果点击触发了文件选择器但没有传入 `fileChooserAccept` 参数，文件选择器会被忽略，页面可以继续正常操作。
 
 - 返回值：
 
@@ -178,7 +182,7 @@ await agent.aiAct('发布一条微博，内容为 "Hello World"');
 
 ### `agent.aiTap()`
 
-点击某个元素。
+点击某个元素。
 
 - 类型
 
@@ -192,6 +196,9 @@ function aiTap(locate: string | Object, options?: Object): Promise<void>;
     - `deepThink?: boolean` - 是否开启深度思考。如果为 true，Midscene 会调用 AI 模型两次以精确定位元素，从而提升准确性。默认值为 false。对于新一代模型（如 Qwen3 / Doubao 1.6 / Gemini 3），带来的收益不明显。
     - `xpath?: string` - 目标元素的 xpath 路径，用于执行当前操作。如果提供了这个 xpath，Midscene 会优先使用该 xpath 来找到元素，然后依次使用缓存和 AI 模型。默认值为空
     - `cacheable?: boolean` - 当启用 [缓存功能](./caching.mdx) 时，是否允许缓存当前 API 调用结果。默认值为 true
+    - `fileChooserAccept?: string | string[]` - 当文件选择器弹出时，指定对应的文件路径。可以是单个文件路径或路径数组。仅在 web 页面（Playwright、Puppeteer）中可用。
+      - **注意**：如果文件输入框不支持多文件（没有 `multiple` 属性），但是传入了多个文件，会抛出错误。
+      - **注意**：如果点击触发了文件选择器但没有传入 `fileChooserAccept` 参数，文件选择器会被忽略，页面可以继续正常操作。
 - 返回值：
 
   - `Promise<void>`
@@ -203,6 +210,10 @@ await agent.aiTap('页面顶部的登录按钮');
 
 // 使用 deepThink 功能精确定位元素
 await agent.aiTap('页面顶部的登录按钮', { deepThink: true });
+
+// 文件上传：点击上传按钮并选择文件
+await agent.aiTap('选择文件按钮', { fileChooserAccept: ['./document.pdf'] });
+await agent.aiTap('上传图片', { fileChooserAccept: ['./image1.jpg', './image2.png'] });
 ```
 
 ### `agent.aiHover()`

diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts
@@ -7,6 +7,7 @@ import {
   type AgentWaitForOpt,
   type CacheConfig,
   type DeepThinkOption,
+  type DetailedLocateParam,
   type DeviceAction,
   type ExecutionDump,
   type ExecutionRecorderItem,
@@ -52,6 +53,8 @@ import {
   parseYamlScript,
 } from '../yaml/index';
 
+import { existsSync } from 'node:fs';
+import { resolve } from 'node:path';
 import type { AbstractInterface } from '@/device';
 import type { TaskRunner } from '@/task-runner';
 import {
@@ -65,9 +68,13 @@ import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img';
 import { getDebug } from '@midscene/shared/logger';
 import { assert } from '@midscene/shared/utils';
 import { defineActionAssert } from '../device';
-// import type { AndroidDeviceInputOpt } from '../device';
 import { TaskCache } from './task-cache';
-import { TaskExecutionError, TaskExecutor, locatePlanForLocate } from './tasks';
+import {
+  TaskExecutionError,
+  TaskExecutor,
+  locatePlanForLocate,
+  withFileChooser,
+} from './tasks';
 import { locateParamStr, paramStr, taskTitleStr, typeStr } from './ui-utils';
 import {
   commonContextParser,
@@ -139,6 +146,7 @@ const defaultVlmUiTarsReplanningCycleLimit = 40;
 
 export type AiActOptions = {
   cacheable?: boolean;
+  fileChooserAccept?: string | string[]; // file path(s) to accept when a file chooser pops up; a single path or an array
   deepThink?: DeepThinkOption;
 };
 
@@ -590,13 +598,22 @@ export class Agent<
     return output;
   }
 
-  async aiTap(locatePrompt: TUserPrompt, opt?: LocateOption) {
+  async aiTap( // tap the element matching locatePrompt; opt may also carry file path(s) for a file chooser
+    locatePrompt: TUserPrompt,
+    opt?: LocateOption & { fileChooserAccept?: string | string[] },
+  ) {
     assert(locatePrompt, 'missing locate prompt for tap');
 
     const detailedLocateParam = buildDetailedLocateParam(locatePrompt, opt);
 
-    return this.callActionInActionSpace('Tap', {
-      locate: detailedLocateParam,
+    const fileChooserAccept = opt?.fileChooserAccept // truthiness check: an omitted option or '' yields undefined
+      ? this.normalizeFileInput(opt.fileChooserAccept) // resolves to absolute paths and throws on a missing file, before the tap runs
+      : undefined;
+
+    return withFileChooser(this.interface, fileChooserAccept, async () => { // NOTE(review): withFileChooser comes from ./tasks; presumably it answers a chooser opened by this callback — confirm
+      return this.callActionInActionSpace('Tap', {
+        locate: detailedLocateParam,
+      });
     });
   }
 
@@ -848,81 +865,97 @@ export class Agent<
   }
 
   async aiAct(taskPrompt: string, opt?: AiActOptions) {
-    const modelConfigForPlanning =
-      this.modelConfigManager.getModelConfig('planning');
-    const defaultIntentModelConfig =
-      this.modelConfigManager.getModelConfig('default');
-
-    const includeBboxInPlanning =
-      modelConfigForPlanning.modelName === defaultIntentModelConfig.modelName &&
-      modelConfigForPlanning.openaiBaseURL ===
-        defaultIntentModelConfig.openaiBaseURL;
-    debug('setting includeBboxInPlanning to', includeBboxInPlanning);
+    const fileChooserAccept = opt?.fileChooserAccept // normalize the optional upload path(s) before any planning starts
+      ? this.normalizeFileInput(opt.fileChooserAccept) // throws if any listed file does not exist
+      : undefined;
 
-    const cacheable = opt?.cacheable;
-    const deepThink = opt?.deepThink === 'unset' ? undefined : opt?.deepThink;
-    const replanningCycleLimit = this.resolveReplanningCycleLimit(
-      modelConfigForPlanning,
-    );
-    // if vlm-ui-tars, plan cache is not used
-    const isVlmUiTars = modelConfigForPlanning.vlMode === 'vlm-ui-tars';
-    const matchedCache =
-      isVlmUiTars || cacheable === false
-        ? undefined
-        : this.taskCache?.matchPlanCache(taskPrompt);
-    if (
-      matchedCache &&
-      this.taskCache?.isCacheResultUsed &&
-      matchedCache.cacheContent?.yamlWorkflow?.trim()
-    ) {
-      // log into report file
-      await this.taskExecutor.loadYamlFlowAsPlanning(
-        taskPrompt,
-        matchedCache.cacheContent.yamlWorkflow,
+    const runAiAct = async () => { // the whole flow lives in this closure, which is invoked once on the last line of the method
+      const modelConfigForPlanning =
+        this.modelConfigManager.getModelConfig('planning');
+      const defaultIntentModelConfig =
+        this.modelConfigManager.getModelConfig('default');
+
+      const includeBboxInPlanning = // true only when the planning and default configs share model name and base URL
+        modelConfigForPlanning.modelName ===
+          defaultIntentModelConfig.modelName &&
+        modelConfigForPlanning.openaiBaseURL ===
+          defaultIntentModelConfig.openaiBaseURL;
+      debug('setting includeBboxInPlanning to', includeBboxInPlanning);
+
+      const cacheable = opt?.cacheable;
+      const deepThink = opt?.deepThink === 'unset' ? undefined : opt?.deepThink; // 'unset' is mapped to undefined, same as omitting the option
+      const replanningCycleLimit = this.resolveReplanningCycleLimit(
+        modelConfigForPlanning,
       );
+      // if vlm-ui-tars, plan cache is not used
+      const isVlmUiTars = modelConfigForPlanning.vlMode === 'vlm-ui-tars';
+      const matchedCache =
+        isVlmUiTars || cacheable === false
+          ? undefined
+          : this.taskCache?.matchPlanCache(taskPrompt);
+      if (
+        matchedCache &&
+        this.taskCache?.isCacheResultUsed &&
+        matchedCache.cacheContent?.yamlWorkflow?.trim()
+      ) {
+        // log into report file
+        await this.taskExecutor.loadYamlFlowAsPlanning(
+          taskPrompt,
+          matchedCache.cacheContent.yamlWorkflow,
+        );
 
-      debug('matched cache, will call .runYaml to run the action');
-      const yaml = matchedCache.cacheContent.yamlWorkflow;
-      return this.runYaml(yaml);
-    }
+        debug('matched cache, will call .runYaml to run the action');
+        const yaml = matchedCache.cacheContent.yamlWorkflow;
+        return this.runYaml(yaml); // NOTE(review): fileChooserAccept is not forwarded on this cached-plan path — confirm uploads still work on cache hits
+      }
 
-    // If cache matched but yamlWorkflow is empty, fall through to normal execution
+      // If cache matched but yamlWorkflow is empty, fall through to normal execution
 
-    const imagesIncludeCount: number | undefined = 2;
-    const { output } = await this.taskExecutor.action(
-      taskPrompt,
-      modelConfigForPlanning,
-      defaultIntentModelConfig,
-      includeBboxInPlanning,
-      this.aiActContext,
-      cacheable,
-      replanningCycleLimit,
-      imagesIncludeCount,
-      deepThink,
-    );
+      const useDeepThink = (this.opts as any)?._deepThink; // NOTE(review): untyped _deepThink flag read from this.opts, separate from opt.deepThink — confirm intent
+      if (useDeepThink) {
+        debug('using deep think planning settings');
+      }
+      const imagesIncludeCount: number | undefined = useDeepThink
+        ? undefined
+        : 2;
+      const { output } = await this.taskExecutor.action(
+        taskPrompt,
+        modelConfigForPlanning,
+        defaultIntentModelConfig,
+        includeBboxInPlanning,
+        this.aiActContext,
+        cacheable,
+        replanningCycleLimit,
+        imagesIncludeCount,
+        deepThink,
+        fileChooserAccept, // absolute, existence-checked paths, or undefined when the option was not given
+      );
 
-    // update cache
-    if (this.taskCache && output?.yamlFlow && cacheable !== false) {
-      const yamlContent: MidsceneYamlScript = {
-        tasks: [
+      // update cache
+      if (this.taskCache && output?.yamlFlow && cacheable !== false) {
+        const yamlContent: MidsceneYamlScript = {
+          tasks: [
+            {
+              name: taskPrompt,
+              flow: output.yamlFlow,
+            },
+          ],
+        };
+        const yamlFlowStr = yaml.dump(yamlContent);
+        this.taskCache.updateOrAppendCacheRecord(
           {
-            name: taskPrompt,
-            flow: output.yamlFlow,
+            type: 'plan',
+            prompt: taskPrompt,
+            yamlWorkflow: yamlFlowStr,
           },
-        ],
-      };
-      const yamlFlowStr = yaml.dump(yamlContent);
-      this.taskCache.updateOrAppendCacheRecord(
-        {
-          type: 'plan',
-          prompt: taskPrompt,
-          yamlWorkflow: yamlFlowStr,
-        },
-        matchedCache,
-      );
-    }
+          matchedCache,
+        );
+      }
 
-    return output;
+      return output;
+    };
+
+    return await runAiAct();
   }
 
   /**
@@ -1472,6 +1505,21 @@ export class Agent<
     return null;
   }
 
+  private normalizeFilePaths(files: string[]): string[] { // resolve every path to absolute form; throws on the first one that does not exist
+    return files.map((file) => {
+      const absolutePath = resolve(file); // node:path resolve — relative paths resolve against the current working directory
+      if (!existsSync(absolutePath)) {
+        throw new Error(`File not found: ${file}`); // message reports the caller's original input, not the resolved path
+      }
+      return absolutePath;
+    });
+  }
+
+  private normalizeFileInput(files: string | string[]): string[] { // accept a single path or an array and return validated absolute paths
+    const filesArray = Array.isArray(files) ? files : [files]; // wrap a lone string so both input shapes share one code path
+    return this.normalizeFilePaths(filesArray);
+  }
+
   /**
    * Manually flush cache to file
    * @param options - Optional configuration