diff --git a/apps/site/docs/en/api.mdx b/apps/site/docs/en/api.mdx index 1ad22c0d36..a4c2aeb66a 100644 --- a/apps/site/docs/en/api.mdx +++ b/apps/site/docs/en/api.mdx @@ -113,7 +113,7 @@ In Midscene, you can choose to use either auto planning or instant action. ::: -### `agent.aiAct()` or `.ai()` +### `agent.aiAct()` or `agent.ai()` This method allows you to perform a series of UI actions described in natural language. Midscene automatically plans the steps and executes them. @@ -131,6 +131,7 @@ function aiAct( options?: { cacheable?: boolean; deepThink?: 'unset' | true | false; + fileChooserAccept?: string | string[]; }, ): Promise; function ai(prompt: string): Promise; // shorthand form @@ -142,6 +143,9 @@ function ai(prompt: string): Promise; // shorthand form - `options?: Object` - Optional, a configuration object containing: - `cacheable?: boolean` - Whether cacheable when enabling [caching feature](./caching.mdx). True by default. - `deepThink?: 'unset' | true | false` - Whether to enable deep thinking during planning when the model supports it (depends on MIDSCENE_MODEL_FAMILY). Default is `'unset'` (same as omitting) and follows the model provider's default strategy. [Learn more about deepThink](./model-strategy#about-the-deepthink-option-in-aiact). + - `fileChooserAccept?: string | string[]` - When a file chooser pops up, specify the file path(s) to accept. Can be a single file path or an array of paths. Only available in web pages (Playwright, Puppeteer). + - **Note**: If the file input does not support multiple files (no `multiple` attribute) but multiple files are provided, an error will be thrown. + - **Note**: If a file chooser is triggered but no `fileChooserAccept` parameter is provided, the file chooser will be ignored and the page can continue to operate normally. 
- Return Value: @@ -193,6 +197,9 @@ function aiTap(locate: string | Object, options?: Object): Promise<void>; - `deepThink?: boolean` - If true, Midscene will call AI model twice to precisely locate the element, which can improve accuracy. False by default. With newer models (e.g. Qwen3 / Doubao 1.6 / Gemini 3), the gain is less obvious. - `xpath?: string` - The xpath of the element to operate. If provided, Midscene will first use this xpath to locate the element before using the cache and the AI model. Empty by default. - `cacheable?: boolean` - Whether cacheable when enabling [caching feature](./caching.mdx). True by default. + - `fileChooserAccept?: string | string[]` - The file path(s) to supply when the tap opens a file chooser. Can be a single file path or an array of paths; relative paths are resolved against the current working directory, and a path that does not exist throws a `File not found` error before the tap is performed. Only available in web pages (Playwright, Puppeteer). + - **Note**: If the file input does not support multiple files (no `multiple` attribute) but multiple files are provided, an error will be thrown. + - **Note**: If a file chooser is triggered but no `fileChooserAccept` parameter is provided, the file chooser will be ignored and the page can continue to operate normally. 
- Return Value: @@ -207,6 +214,10 @@ await agent.aiTap('The login button at the top of the page'); await agent.aiTap('The login button at the top of the page', { deepThink: true, }); + +// File upload: tap the upload button and select files +await agent.aiTap('Choose file button', { fileChooserAccept: ['./document.pdf'] }); +await agent.aiTap('Upload images', { fileChooserAccept: ['./image1.jpg', './image2.png'] }); ``` ### `agent.aiHover()` diff --git a/apps/site/docs/zh/api.mdx b/apps/site/docs/zh/api.mdx index 0d46567e54..20dc524bf8 100644 --- a/apps/site/docs/zh/api.mdx +++ b/apps/site/docs/zh/api.mdx @@ -115,7 +115,7 @@ const agent = new PuppeteerAgent(page, { ::: -### `agent.aiAct()` 或 `.ai()` +### `agent.aiAct()` 或 `agent.ai()` 这个方法允许你通过自然语言描述一系列 UI 操作步骤。Midscene 会自动规划这些步骤并执行。 @@ -133,6 +133,7 @@ function aiAct( options?: { cacheable?: boolean; deepThink?: 'unset' | true | false; + fileChooserAccept?: string | string[]; }, ): Promise; function ai(prompt: string): Promise; // 简写形式 @@ -144,6 +145,9 @@ function ai(prompt: string): Promise; // 简写形式 - `options?: Object` - 可选,一个配置对象,包含: - `cacheable?: boolean` - 当启用 [缓存功能](./caching.mdx) 时,是否允许缓存当前 API 调用结果。默认值为 true - `deepThink?: 'unset' | true | false` - 当模型支持时(取决于 MIDSCENE_MODEL_FAMILY),是否开启规划阶段的深度思考能力。默认值为 `'unset'`(等同于省略该参数),跟随模型服务商的默认策略。[详情参阅 deepThink 说明](./model-strategy#关于-aiact-方法的-deepthink-参数)。 + - `fileChooserAccept?: string | string[]` - 当文件选择器弹出时,指定对应的文件路径。可以是单个文件路径或路径数组。仅在 web 页面(Playwright、Puppeteer)中可用。 + - **注意**:如果文件输入框不支持多文件(没有 `multiple` 属性),但是传入了多个文件,会抛出错误。 + - **注意**:如果点击触发了文件选择器但没有传入 `fileChooserAccept` 参数,文件选择器会被忽略,页面可以继续正常操作。 - 返回值: @@ -178,7 +182,7 @@ await agent.aiAct('发布一条微博,内容为 "Hello World"'); ### `agent.aiTap()` -点击某个元素。 +点击某个元素 - 类型 @@ -192,6 +196,9 @@ function aiTap(locate: string | Object, options?: Object): Promise; - `deepThink?: boolean` - 是否开启深度思考。如果为 true,Midscene 会调用 AI 模型两次以精确定位元素,从而提升准确性。默认值为 false。对于新一代模型(如 Qwen3 / Doubao 1.6 / Gemini 3),带来的收益不明显。 - `xpath?: string` - 
目标元素的 xpath 路径,用于执行当前操作。如果提供了这个 xpath,Midscene 会优先使用该 xpath 来找到元素,然后依次使用缓存和 AI 模型。默认值为空 - `cacheable?: boolean` - 当启用 [缓存功能](./caching.mdx) 时,是否允许缓存当前 API 调用结果。默认值为 true + - `fileChooserAccept?: string | string[]` - 当文件选择器弹出时,指定对应的文件路径。可以是单个文件路径或路径数组。仅在 web 页面(Playwright、Puppeteer)中可用。 + - **注意**:如果文件输入框不支持多文件(没有 `multiple` 属性),但是传入了多个文件,会抛出错误。 + - **注意**:如果点击触发了文件选择器但没有传入 `fileChooserAccept` 参数,文件选择器会被忽略,页面可以继续正常操作。 - 返回值: - `Promise` @@ -203,6 +210,10 @@ await agent.aiTap('页面顶部的登录按钮'); // 使用 deepThink 功能精确定位元素 await agent.aiTap('页面顶部的登录按钮', { deepThink: true }); + +// 文件上传:点击上传按钮并选择文件 +await agent.aiTap('选择文件按钮', { fileChooserAccept: ['./document.pdf'] }); +await agent.aiTap('上传图片', { fileChooserAccept: ['./image1.jpg', './image2.png'] }); ``` ### `agent.aiHover()` diff --git a/packages/core/src/agent/agent.ts b/packages/core/src/agent/agent.ts index c548f0de16..bbcaf1f865 100644 --- a/packages/core/src/agent/agent.ts +++ b/packages/core/src/agent/agent.ts @@ -7,6 +7,7 @@ import { type AgentWaitForOpt, type CacheConfig, type DeepThinkOption, + type DetailedLocateParam, type DeviceAction, type ExecutionDump, type ExecutionRecorderItem, @@ -52,6 +53,8 @@ import { parseYamlScript, } from '../yaml/index'; +import { existsSync } from 'node:fs'; +import { resolve } from 'node:path'; import type { AbstractInterface } from '@/device'; import type { TaskRunner } from '@/task-runner'; import { @@ -65,9 +68,13 @@ import { imageInfoOfBase64, resizeImgBase64 } from '@midscene/shared/img'; import { getDebug } from '@midscene/shared/logger'; import { assert } from '@midscene/shared/utils'; import { defineActionAssert } from '../device'; -// import type { AndroidDeviceInputOpt } from '../device'; import { TaskCache } from './task-cache'; -import { TaskExecutionError, TaskExecutor, locatePlanForLocate } from './tasks'; +import { + TaskExecutionError, + TaskExecutor, + locatePlanForLocate, + withFileChooser, +} from './tasks'; import { locateParamStr, paramStr, taskTitleStr, 
typeStr } from './ui-utils'; import { commonContextParser, @@ -139,6 +146,7 @@ const defaultVlmUiTarsReplanningCycleLimit = 40; export type AiActOptions = { cacheable?: boolean; + fileChooserAccept?: string | string[]; deepThink?: DeepThinkOption; }; @@ -590,13 +598,22 @@ export class Agent< return output; } - async aiTap(locatePrompt: TUserPrompt, opt?: LocateOption) { + async aiTap( + locatePrompt: TUserPrompt, + opt?: LocateOption & { fileChooserAccept?: string | string[] }, + ) { assert(locatePrompt, 'missing locate prompt for tap'); const detailedLocateParam = buildDetailedLocateParam(locatePrompt, opt); - return this.callActionInActionSpace('Tap', { - locate: detailedLocateParam, + const fileChooserAccept = opt?.fileChooserAccept + ? this.normalizeFileInput(opt.fileChooserAccept) + : undefined; + + return withFileChooser(this.interface, fileChooserAccept, async () => { + return this.callActionInActionSpace('Tap', { + locate: detailedLocateParam, + }); }); } @@ -848,81 +865,97 @@ export class Agent< } async aiAct(taskPrompt: string, opt?: AiActOptions) { - const modelConfigForPlanning = - this.modelConfigManager.getModelConfig('planning'); - const defaultIntentModelConfig = - this.modelConfigManager.getModelConfig('default'); - - const includeBboxInPlanning = - modelConfigForPlanning.modelName === defaultIntentModelConfig.modelName && - modelConfigForPlanning.openaiBaseURL === - defaultIntentModelConfig.openaiBaseURL; - debug('setting includeBboxInPlanning to', includeBboxInPlanning); + const fileChooserAccept = opt?.fileChooserAccept + ? this.normalizeFileInput(opt.fileChooserAccept) + : undefined; - const cacheable = opt?.cacheable; - const deepThink = opt?.deepThink === 'unset' ? 
undefined : opt?.deepThink; - const replanningCycleLimit = this.resolveReplanningCycleLimit( - modelConfigForPlanning, - ); - // if vlm-ui-tars, plan cache is not used - const isVlmUiTars = modelConfigForPlanning.vlMode === 'vlm-ui-tars'; - const matchedCache = - isVlmUiTars || cacheable === false - ? undefined - : this.taskCache?.matchPlanCache(taskPrompt); - if ( - matchedCache && - this.taskCache?.isCacheResultUsed && - matchedCache.cacheContent?.yamlWorkflow?.trim() - ) { - // log into report file - await this.taskExecutor.loadYamlFlowAsPlanning( - taskPrompt, - matchedCache.cacheContent.yamlWorkflow, + const runAiAct = async () => { + const modelConfigForPlanning = + this.modelConfigManager.getModelConfig('planning'); + const defaultIntentModelConfig = + this.modelConfigManager.getModelConfig('default'); + + const includeBboxInPlanning = + modelConfigForPlanning.modelName === + defaultIntentModelConfig.modelName && + modelConfigForPlanning.openaiBaseURL === + defaultIntentModelConfig.openaiBaseURL; + debug('setting includeBboxInPlanning to', includeBboxInPlanning); + + const cacheable = opt?.cacheable; + const deepThink = opt?.deepThink === 'unset' ? undefined : opt?.deepThink; + const replanningCycleLimit = this.resolveReplanningCycleLimit( + modelConfigForPlanning, ); + // if vlm-ui-tars, plan cache is not used + const isVlmUiTars = modelConfigForPlanning.vlMode === 'vlm-ui-tars'; + const matchedCache = + isVlmUiTars || cacheable === false + ? 
undefined + : this.taskCache?.matchPlanCache(taskPrompt); + if ( + matchedCache && + this.taskCache?.isCacheResultUsed && + matchedCache.cacheContent?.yamlWorkflow?.trim() + ) { + // log into report file + await this.taskExecutor.loadYamlFlowAsPlanning( + taskPrompt, + matchedCache.cacheContent.yamlWorkflow, + ); - debug('matched cache, will call .runYaml to run the action'); - const yaml = matchedCache.cacheContent.yamlWorkflow; - return this.runYaml(yaml); - } + debug('matched cache, will call .runYaml to run the action'); + const yaml = matchedCache.cacheContent.yamlWorkflow; + return this.runYaml(yaml); + } - // If cache matched but yamlWorkflow is empty, fall through to normal execution + // If cache matched but yamlWorkflow is empty, fall through to normal execution - const imagesIncludeCount: number | undefined = 2; - const { output } = await this.taskExecutor.action( - taskPrompt, - modelConfigForPlanning, - defaultIntentModelConfig, - includeBboxInPlanning, - this.aiActContext, - cacheable, - replanningCycleLimit, - imagesIncludeCount, - deepThink, - ); + const useDeepThink = (this.opts as any)?._deepThink; + if (useDeepThink) { + debug('using deep think planning settings'); + } + const imagesIncludeCount: number | undefined = useDeepThink + ? 
undefined + : 2; + const { output } = await this.taskExecutor.action( + taskPrompt, + modelConfigForPlanning, + defaultIntentModelConfig, + includeBboxInPlanning, + this.aiActContext, + cacheable, + replanningCycleLimit, + imagesIncludeCount, + deepThink, + fileChooserAccept, + ); - // update cache - if (this.taskCache && output?.yamlFlow && cacheable !== false) { - const yamlContent: MidsceneYamlScript = { - tasks: [ + // update cache + if (this.taskCache && output?.yamlFlow && cacheable !== false) { + const yamlContent: MidsceneYamlScript = { + tasks: [ + { + name: taskPrompt, + flow: output.yamlFlow, + }, + ], + }; + const yamlFlowStr = yaml.dump(yamlContent); + this.taskCache.updateOrAppendCacheRecord( { - name: taskPrompt, - flow: output.yamlFlow, + type: 'plan', + prompt: taskPrompt, + yamlWorkflow: yamlFlowStr, }, - ], - }; - const yamlFlowStr = yaml.dump(yamlContent); - this.taskCache.updateOrAppendCacheRecord( - { - type: 'plan', - prompt: taskPrompt, - yamlWorkflow: yamlFlowStr, - }, - matchedCache, - ); - } + matchedCache, + ); + } - return output; + return output; + }; + + return await runAiAct(); } /** @@ -1472,6 +1505,21 @@ export class Agent< return null; } + private normalizeFilePaths(files: string[]): string[] { + return files.map((file) => { + const absolutePath = resolve(file); + if (!existsSync(absolutePath)) { + throw new Error(`File not found: ${file}`); + } + return absolutePath; + }); + } + + private normalizeFileInput(files: string | string[]): string[] { + const filesArray = Array.isArray(files) ? 
files : [files]; + return this.normalizeFilePaths(filesArray); + } + /** * Manually flush cache to file * @param options - Optional configuration diff --git a/packages/core/src/agent/tasks.ts b/packages/core/src/agent/tasks.ts index 332d2ce96e..61afb9bd07 100644 --- a/packages/core/src/agent/tasks.ts +++ b/packages/core/src/agent/tasks.ts @@ -1,6 +1,6 @@ import { ConversationHistory, plan, uiTarsPlanning } from '@/ai-model'; import type { TMultimodalPrompt, TUserPrompt } from '@/common'; -import type { AbstractInterface } from '@/device'; +import type { AbstractInterface, FileChooserHandler } from '@/device'; import type Service from '@/service'; import type { TaskRunner } from '@/task-runner'; import { TaskExecutionError } from '@/task-runner'; @@ -210,6 +210,40 @@ export class TaskExecutor { replanningCycleLimitOverride?: number, imagesIncludeCount?: number, deepThink?: DeepThinkOption, + fileChooserAccept?: string[], + ): Promise< + ExecutionResult< + | { + yamlFlow?: MidsceneYamlFlowItem[]; // for cache use + } + | undefined + > + > { + return withFileChooser(this.interface, fileChooserAccept, async () => { + return this.runAction( + userPrompt, + modelConfigForPlanning, + modelConfigForDefaultIntent, + includeBboxInPlanning, + aiActContext, + cacheable, + replanningCycleLimitOverride, + imagesIncludeCount, + deepThink, + ); + }); + } + + private async runAction( + userPrompt: string, + modelConfigForPlanning: IModelConfig, + modelConfigForDefaultIntent: IModelConfig, + includeBboxInPlanning: boolean, + aiActContext?: string, + cacheable?: boolean, + replanningCycleLimitOverride?: number, + imagesIncludeCount?: number, + deepThink?: DeepThinkOption, ): Promise< ExecutionResult< | { @@ -408,13 +442,12 @@ export class TaskExecutor { } } - const finalResult = { + return { output: { yamlFlow, }, runner, }; - return finalResult; } private createTypeQueryTask( @@ -663,3 +696,37 @@ export class TaskExecutor { return session.appendErrorPlan(`waitFor timeout: 
/**
 * Run `action` while a temporary file-chooser listener is installed on the
 * interface, answering any chooser that opens with `fileChooserAccept`.
 *
 * - With no files (undefined or empty array) the action runs directly and no
 *   listener is registered.
 * - The listener is always disposed, whether the action resolves or rejects.
 * - An error captured by the listener while the action was running is
 *   rethrown after the action has resolved.
 *
 * @param interfaceInstance - Device/page interface the action operates on.
 * @param fileChooserAccept - File paths handed to the chooser.
 * @param action - The work that may trigger a file chooser.
 * @returns The value `action` resolves to.
 * @throws Error when files are supplied but the interface does not implement
 *   `registerFileChooserListener`, or when the listener reported an error.
 */
export async function withFileChooser<T>(
  interfaceInstance: AbstractInterface,
  fileChooserAccept: string[] | undefined,
  action: () => Promise<T>,
): Promise<T> {
  if (!fileChooserAccept?.length) {
    return action();
  }

  if (!interfaceInstance.registerFileChooserListener) {
    throw new Error(
      `File upload is not supported on ${interfaceInstance.interfaceType}`,
    );
  }

  const handler = async (chooser: FileChooserHandler): Promise<void> => {
    await chooser.accept(fileChooserAccept);
  };

  const { dispose, getError } =
    await interfaceInstance.registerFileChooserListener(handler);
  try {
    const result = await action();
    // Check for errors that occurred during file chooser handling
    const error = getError();
    if (error) {
      throw error;
    }
    return result;
  } finally {
    dispose();
  }
}
/**
 * Playwright-backed page wrapper.
 *
 * Adds file-chooser support on top of the shared base page by listening to
 * Playwright's `filechooser` page event.
 */
export class WebPage extends BasePage<'playwright', PlaywrightPageType> {
  // The listener currently attached to the page, if any.
  private playwrightFileChooserHandler?: (
    chooser: FileChooser,
  ) => Promise<void>;

  constructor(page: PlaywrightPageType, opts?: WebPageOpt) {
    super(page, 'playwright', opts);
  }

  /**
   * Attach a `filechooser` listener that forwards each chooser to `handler`.
   *
   * @param handler - Receives an object whose `accept(files)` sets the files
   *   on the Playwright file chooser.
   * @returns `dispose` removes this registration's listener; `getError`
   *   returns the last error thrown while handling a chooser, if any.
   */
  async registerFileChooserListener(
    handler: (
      chooser: import('@midscene/core/device').FileChooserHandler,
    ) => Promise<void>,
  ): Promise<{ dispose: () => void; getError: () => Error | undefined }> {
    const page = this.underlyingPage as PlaywrightPageType;

    // Replace, rather than stack on top of, a listener left by a previous
    // registration; otherwise the old listener would stay attached forever.
    if (this.playwrightFileChooserHandler) {
      page.off('filechooser', this.playwrightFileChooserHandler);
    }

    let capturedError: Error | undefined;

    const listener = async (chooser: FileChooser): Promise<void> => {
      try {
        await handler({
          accept: async (files: string[]) => {
            await chooser.setFiles(files);
          },
        });
      } catch (error) {
        // Event listeners must not throw; keep the failure for getError().
        capturedError =
          error instanceof Error ? error : new Error(String(error));
      }
    };

    this.playwrightFileChooserHandler = listener;
    page.on('filechooser', listener);

    return {
      dispose: () => {
        // Remove exactly the listener created by this registration.
        page.off('filechooser', listener);
        if (this.playwrightFileChooserHandler === listener) {
          this.playwrightFileChooserHandler = undefined;
        }
      },
      getError: () => capturedError,
    };
  }
}
(button === 'left' && count === 1) { + await page.mouse.click(x, y); + } else { + await page.mouse.click(x, y, { button, count }); } + } else if (this.interfaceType === 'playwright') { + await (this.underlyingPage as PlaywrightPage).mouse.click(x, y, { + button, + clickCount: count, + }); } }, wheel: async (deltaX: number, deltaY: number) => { @@ -686,6 +686,89 @@ export class Page< await page.mouse.up({ button: 'left' }); } } + + private async ensurePuppeteerFileChooserSession( + page: PuppeteerPage, + ): Promise { + if (this.puppeteerFileChooserSession) { + return this.puppeteerFileChooserSession; + } + const session = await page.target().createCDPSession(); + await session.send('Page.enable'); + await session.send('DOM.enable'); + await session.send('Page.setInterceptFileChooserDialog', { enabled: true }); + this.puppeteerFileChooserSession = session; + return session; + } + + async registerFileChooserListener( + handler: ( + chooser: import('@midscene/core/device').FileChooserHandler, + ) => Promise, + ): Promise<{ dispose: () => void; getError: () => Error | undefined }> { + if (this.interfaceType !== 'puppeteer') { + throw new Error( + 'registerFileChooserListener is only supported in Puppeteer', + ); + } + + const page = this.underlyingPage as PuppeteerPage; + const session = await this.ensurePuppeteerFileChooserSession(page); + if (this.puppeteerFileChooserHandler) { + session.off('Page.fileChooserOpened', this.puppeteerFileChooserHandler); + } + + let capturedError: Error | undefined; + + this.puppeteerFileChooserHandler = async (event) => { + if (event.backendNodeId === undefined) { + debugPage('puppeteer file chooser opened without backendNodeId, skip'); + return; + } + try { + await handler({ + accept: async (files: string[]) => { + // Check if input supports multiple files + if (files.length > 1) { + const { node } = await session.send('DOM.describeNode', { + backendNodeId: event.backendNodeId, + }); + // attributes is a flat array: ['attr1', 'value1', 
'attr2', 'value2', ...] + const hasMultiple = node.attributes?.includes('multiple'); + if (!hasMultiple) { + throw new Error( + 'Non-multiple file input can only accept single file', + ); + } + } + await session.send('DOM.setFileInputFiles', { + files, + backendNodeId: event.backendNodeId, + }); + }, + }); + } catch (error) { + capturedError = error as Error; + } + }; + session.on('Page.fileChooserOpened', this.puppeteerFileChooserHandler); + return { + dispose: () => { + if (this.puppeteerFileChooserHandler) { + session.off( + 'Page.fileChooserOpened', + this.puppeteerFileChooserHandler, + ); + } + void session.detach(); + this.puppeteerFileChooserHandler = undefined; + if (this.puppeteerFileChooserSession === session) { + this.puppeteerFileChooserSession = undefined; + } + }, + getError: () => capturedError, + }; + } } export function forceClosePopup( diff --git a/packages/web-integration/src/web-page.ts b/packages/web-integration/src/web-page.ts index a4c69b530b..6e3c50c1a8 100644 --- a/packages/web-integration/src/web-page.ts +++ b/packages/web-integration/src/web-page.ts @@ -429,6 +429,8 @@ export const commonWebActionsForWebPage = ( defineActionTap(async (param) => { const element = param.locate; assert(element, 'Element not found, cannot tap'); + + // Pure tap action - file handling is done at Page layer via setFileChooserHandler await page.mouse.click(element.center[0], element.center[1], { button: 'left', }); diff --git a/packages/web-integration/tests/ai/fixtures/file-upload.html b/packages/web-integration/tests/ai/fixtures/file-upload.html new file mode 100644 index 0000000000..445b8125d7 --- /dev/null +++ b/packages/web-integration/tests/ai/fixtures/file-upload.html @@ -0,0 +1,57 @@ + + + + File Upload Test + + + +

File Upload Test Page

+ +
+ + +

Supports multiple file upload

+
+ +
+ + +

Supports single file upload only

+
+ +
+

Selected Files:

+
+
import { join } from 'node:path';
import { pathToFileURL } from 'node:url';
import { expect } from 'playwright/test';
import { test } from './fixture';

// Directory holding the HTML page and the files used for upload.
const fixturesDir = join(__dirname, '../../fixtures');

/** Absolute path of a file inside the fixtures directory. */
const fixturePath = (name: string): string => join(fixturesDir, name);

// pathToFileURL yields a valid file: URL on every platform (drive letters,
// spaces, '#', ...), unlike concatenating 'file://' with a joined path.
const uploadPageUrl = pathToFileURL(fixturePath('file-upload.html')).href;

test.describe('file upload functionality', () => {
  test('should upload single file', async ({ aiTap, aiAssert, page }) => {
    const testFile = fixturePath('test-file.txt');

    await page.goto(uploadPageUrl);

    // Upload single file
    await aiTap('Choose Single File', { fileChooserAccept: [testFile] });

    // Verify file is selected
    await aiAssert('page displays "test-file.txt"');
    await aiAssert('page displays "single"');
  });

  test('should upload multiple files', async ({ aiTap, aiAssert, page }) => {
    const testFile1 = fixturePath('test-file-1.txt');
    const testFile2 = fixturePath('test-file-2.txt');

    await page.goto(uploadPageUrl);

    // Upload multiple files
    await aiTap('Choose Files', {
      fileChooserAccept: [testFile1, testFile2],
    });

    // Verify files are selected
    await aiAssert('page displays "test-file-1.txt"');
    await aiAssert('page displays "test-file-2.txt"');
    await aiAssert('page displays "multiple"');
  });

  test('should handle relative paths', async ({ aiTap, aiAssert, page }) => {
    await page.goto(uploadPageUrl);

    // Upload file using relative path.
    // NOTE(review): the path is resolved against the process working
    // directory, so this presumably expects the tests to be started from the
    // package root - confirm against the test runner configuration.
    await aiTap('Choose Single File', {
      fileChooserAccept: ['./tests/ai/fixtures/relative-test.txt'],
    });

    // Verify file is selected
    await aiAssert('page displays "relative-test.txt"');
  });

  test('should throw error for non-existent file', async ({ aiTap, page }) => {
    await page.goto(uploadPageUrl);

    // Attempt to upload non-existent file
    await expect(
      aiTap('Choose Files', {
        fileChooserAccept: ['./non-existent-file.txt'],
      }),
    ).rejects.toThrow(/File not found/);
  });

  test('should throw error when uploading multiple files to single-file input', async ({
    aiTap,
    aiAssert,
    page,
  }) => {
    const testFile1 = fixturePath('test-file-1.txt');
    const testFile2 = fixturePath('test-file-2.txt');

    await page.goto(uploadPageUrl);

    // Attempt to upload multiple files to single-file input (no 'multiple' attribute)
    // This should throw an error because the input only accepts single file
    await expect(
      aiTap('Choose Single File', {
        fileChooserAccept: [testFile1, testFile2],
      }),
    ).rejects.toThrow(/Non-multiple file input/);

    // Verify that no files were uploaded after the error
    await aiAssert('page does not display "test-file-1.txt"');
    await aiAssert('page does not display "test-file-2.txt"');

    // Verify page is still interactive - can upload a single file successfully
    const testFile = fixturePath('test-file.txt');
    await aiTap('Choose Single File', { fileChooserAccept: [testFile] });
    await aiAssert('page displays "test-file.txt"');
  });

  test('should allow page interaction when file chooser is triggered but no files provided', async ({
    aiTap,
    aiAssert,
    page,
  }) => {
    await page.goto(uploadPageUrl);

    // Click the upload button without providing fileChooserAccept.
    // No chooser listener is registered in that case, so no file is selected.
    await aiTap('Choose Single File');

    // Verify page is still interactive - can perform other actions
    await aiAssert('page displays "File Upload Test Page"');

    // Can still upload files after the unanswered chooser
    const testFile = fixturePath('test-file.txt');
    await aiTap('Choose Single File', { fileChooserAccept: [testFile] });
    await aiAssert('page displays "test-file.txt"');
  });
});
'@/puppeteer'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import { launchPage } from './utils'; + +vi.setConfig({ + testTimeout: 60 * 1000, /* 60 seconds per test */ +}); + +describe('file upload functionality', () => { /* PuppeteerAgent tests for the fileChooserAccept option of aiTap and aiAct, run against the file-upload.html fixture. */ + let resetFn: () => Promise; /* reset callback returned by launchPage for the current test */ + let agent: PuppeteerAgent; + + afterEach(async () => { /* Teardown after every test: destroy the agent (a failure is only logged as a warning), then call the stored reset callback. */ + if (agent) { + try { + await agent.destroy(); + } catch (e) { + console.warn('agent destroy error', e); + } + } + if (resetFn) { + await resetFn(); + } + }); + + it('should upload single file', async () => { + const testFile = join(__dirname, '../../fixtures/test-file.txt'); + + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Upload a single file through the 'Choose Single File' control + await agent.aiTap('Choose Single File', { fileChooserAccept: [testFile] }); + + // Verify the file name and the 'single' marker are shown on the page + await agent.aiAssert('page displays "test-file.txt"'); + await agent.aiAssert('page displays "single"'); + }); + + it('should upload multiple files', async () => { + const testFile1 = join(__dirname, '../../fixtures/test-file-1.txt'); + const testFile2 = join(__dirname, '../../fixtures/test-file-2.txt'); + + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Upload two files at once through the 'Choose Files' control + await agent.aiTap('Choose Files', { + fileChooserAccept: [testFile1, testFile2], + }); + + // Verify both file names and the 'multiple' marker are shown on the page + await agent.aiAssert('page displays "test-file-1.txt"'); + await agent.aiAssert('page displays "test-file-2.txt"'); + await agent.aiAssert('page displays "multiple"'); + }); + + it('should upload files via aiAct', async () => { + const testFile1 = join(__dirname, '../../fixtures/test-file-1.txt'); + const testFile2 = join(__dirname, '../../fixtures/test-file-2.txt'); + + const { originPage, reset } = await launchPage( + `file://${join(__dirname,
'../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + await agent.aiAct( + 'click "Choose Files" button above the text "Supports multiple file upload"', + { + fileChooserAccept: [testFile1, testFile2], + }, + ); + + await agent.aiAssert('page displays "test-file-1.txt"'); + await agent.aiAssert('page displays "test-file-2.txt"'); + await agent.aiAssert('page displays "multiple"'); + }); + + it('should not time out when no file chooser is triggered in aiAct', async () => { + const testFile = join(__dirname, '../../fixtures/test-file.txt'); + + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + await agent.aiAct('click the page title', { /* per the test name, this click opens no file chooser; the call only has to resolve */ + fileChooserAccept: [testFile], + }); + }); + + it('should handle relative paths', async () => { + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Upload a file given as a relative path (presumably resolved against the process working directory - TODO confirm) + await agent.aiTap('Choose Single File', { + fileChooserAccept: ['./tests/ai/fixtures/relative-test.txt'], + }); + + // Verify the file name is shown on the page + await agent.aiAssert('page displays "relative-test.txt"'); + }); + + it('should throw error for non-existent file', async () => { + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Attempt to upload a file that does not exist; the tap is expected to reject with a 'File not found' error + await expect( + agent.aiTap('Choose Files', { + fileChooserAccept: ['./non-existent-file.txt'], + }), + ).rejects.toThrow(/File not found/); + }); + + it('should not time out when no file chooser is triggered', async () => { + const testFile = join(__dirname, '../../fixtures/test-file.txt'); + + const { originPage, reset } = await
launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + await agent.aiTap('the title "File Upload Test Page"', { /* per the test name, this tap opens no file chooser; the call only has to resolve */ + fileChooserAccept: [testFile], + }); + }); + + it('should throw error when uploading multiple files to single-file input', async () => { + const testFile1 = join(__dirname, '../../fixtures/test-file-1.txt'); + const testFile2 = join(__dirname, '../../fixtures/test-file-2.txt'); + + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Attempt to upload multiple files to a single-file input (no 'multiple' attribute) + // This should throw an error because the input only accepts a single file + await expect( + agent.aiTap('Choose Single File', { + fileChooserAccept: [testFile1, testFile2], + }), + ).rejects.toThrow(/Non-multiple file input can only accept single file/); + + // Verify that neither file name is shown on the page after the error + await agent.aiAssert('page does not display "test-file-1.txt"'); + await agent.aiAssert('page does not display "test-file-2.txt"'); + + // Verify the page is still interactive - a single file can still be uploaded successfully + const testFile = join(__dirname, '../../fixtures/test-file.txt'); + await agent.aiTap('Choose Single File', { fileChooserAccept: [testFile] }); + await agent.aiAssert('page displays "test-file.txt"'); + }); + + it('should allow page interaction when file chooser is triggered but no files provided', async () => { + const { originPage, reset } = await launchPage( + `file://${join(__dirname, '../../fixtures/file-upload.html')}`, + ); + resetFn = reset; + + agent = new PuppeteerAgent(originPage); + + // Click the upload button without providing fileChooserAccept + // The file chooser will be triggered but dismissed without selecting files + await agent.aiTap('Choose Single File'); + + // Verify page is
still interactive - can perform other actions + await agent.aiAssert('page displays "File Upload Test Page"'); + + // A later tap that does pass fileChooserAccept can still upload a file + const testFile = join(__dirname, '../../fixtures/test-file.txt'); + await agent.aiTap('Choose Single File', { + fileChooserAccept: [testFile], + }); + await agent.aiAssert('page displays "test-file.txt"'); + }); +});