@@ -26,7 +26,12 @@ import { TestCase } from "@/core/runner/test-case";
26
26
import { getConfig , initializeConfig } from "@/index" ;
27
27
import { getLogger , Log } from "@/log/index" ;
28
28
import { TestContext , BrowserToolConfig , ShortestConfig } from "@/types" ;
29
- import { ActionInput , ToolResult , BetaToolType } from "@/types/browser" ;
29
+ import {
30
+ ActionInput ,
31
+ ToolResult ,
32
+ BetaToolType ,
33
+ InternalActionEnum ,
34
+ } from "@/types/browser" ;
30
35
import { getErrorDetails , ToolError , TestError } from "@/utils/errors" ;
31
36
32
37
export class BrowserTool extends BaseBrowserTool {
@@ -201,24 +206,58 @@ export class BrowserTool extends BaseBrowserTool {
201
206
await this . page . click ( selector ) ;
202
207
}
203
208
204
- public async clickAtCoordinates ( x : number , y : number ) : Promise < void > {
205
- this . log . debug ( "Clicking at coordinates" , { x, y } ) ;
206
- await actions . click ( this . page , x , y ) ;
207
- }
208
-
209
209
async execute ( input : ActionInput ) : Promise < ToolResult > {
210
210
try {
211
211
this . log . setGroup ( `🛠️ ${ input . action } ` ) ;
212
212
let output = "" ;
213
213
let metadata = { } ;
214
214
215
215
switch ( input . action ) {
216
- case "left_click" :
217
- case "right_click" :
218
- case "middle_click" :
219
- case "double_click" : {
220
- const clickCoords = input . coordinates || this . lastMousePosition ;
221
- await this . clickAtCoordinates ( clickCoords [ 0 ] , clickCoords [ 1 ] ) ;
216
+ case InternalActionEnum . LEFT_CLICK :
217
+ case InternalActionEnum . RIGHT_CLICK :
218
+ case InternalActionEnum . MIDDLE_CLICK :
219
+ case InternalActionEnum . DOUBLE_CLICK :
220
+ case InternalActionEnum . TRIPLE_CLICK : {
221
+ const clickCoords =
222
+ input . coordinate || input . coordinates || this . lastMousePosition ;
223
+ const x = clickCoords [ 0 ] ;
224
+ const y = clickCoords [ 1 ] ;
225
+ const button = ( ) => {
226
+ switch ( input . action ) {
227
+ case InternalActionEnum . LEFT_CLICK :
228
+ case InternalActionEnum . DOUBLE_CLICK :
229
+ case InternalActionEnum . TRIPLE_CLICK :
230
+ return "left" ;
231
+ case InternalActionEnum . RIGHT_CLICK :
232
+ return "right" ;
233
+ case InternalActionEnum . MIDDLE_CLICK :
234
+ return "middle" ;
235
+ default :
236
+ throw new ToolError (
237
+ `Unsupported click action: ${ input . action } ` ,
238
+ ) ;
239
+ }
240
+ } ;
241
+ const clickCount = ( ) => {
242
+ switch ( input . action ) {
243
+ case InternalActionEnum . DOUBLE_CLICK :
244
+ return 2 ;
245
+ case InternalActionEnum . TRIPLE_CLICK :
246
+ return 3 ;
247
+ default :
248
+ return 1 ;
249
+ }
250
+ } ;
251
+ this . log . debug ( "Clicking at coordinates" , {
252
+ x,
253
+ y,
254
+ button : button ( ) ,
255
+ clickCount : clickCount ( ) ,
256
+ } ) ;
257
+ await actions . click ( this . page , x , y , {
258
+ button : button ( ) ,
259
+ clickCount : clickCount ( ) ,
260
+ } ) ;
222
261
output = `${ input . action } at (${ clickCoords [ 0 ] } , ${ clickCoords [ 1 ] } )` ;
223
262
224
263
// Get initial metadata before potential navigation
@@ -245,7 +284,7 @@ export class BrowserTool extends BaseBrowserTool {
245
284
break ;
246
285
}
247
286
248
- case "mouse_move" :
287
+ case InternalActionEnum . MOUSE_MOVE :
249
288
const coords = input . coordinates || ( input as any ) . coordinate ;
250
289
if ( ! coords ) {
251
290
throw new ToolError ( "Coordinates required for mouse_move" ) ;
@@ -255,7 +294,7 @@ export class BrowserTool extends BaseBrowserTool {
255
294
output = `Mouse moved to (${ coords [ 0 ] } , ${ coords [ 1 ] } )` ;
256
295
break ;
257
296
258
- case "left_click_drag" :
297
+ case InternalActionEnum . LEFT_CLICK_DRAG :
259
298
if ( ! input . coordinates ) {
260
299
throw new ToolError ( "Coordinates required for left_click_drag" ) ;
261
300
}
@@ -267,15 +306,25 @@ export class BrowserTool extends BaseBrowserTool {
267
306
output = `Dragged mouse to (${ input . coordinates [ 0 ] } , ${ input . coordinates [ 1 ] } )` ;
268
307
break ;
269
308
270
- case "cursor_position" :
309
+ case InternalActionEnum . LEFT_MOUSE_DOWN :
310
+ await this . page . mouse . down ( ) ;
311
+ output = "Pressed left mouse button" ;
312
+ break ;
313
+
314
+ case InternalActionEnum . LEFT_MOUSE_UP :
315
+ await this . page . mouse . up ( ) ;
316
+ output = "Released left mouse button" ;
317
+ break ;
318
+
319
+ case InternalActionEnum . CURSOR_POSITION :
271
320
const position = await actions . getCursorPosition ( this . page ) ;
272
321
output = `Cursor position: (${ position [ 0 ] } , ${ position [ 1 ] } )` ;
273
322
break ;
274
323
275
- case "screenshot" :
324
+ case InternalActionEnum . SCREENSHOT :
276
325
return await this . takeScreenshotWithMetadata ( ) ;
277
326
278
- case "type" :
327
+ case InternalActionEnum . TYPE :
279
328
if ( ! input . text ) {
280
329
throw new ToolError ( "Text required for type action" ) ;
281
330
}
@@ -285,7 +334,7 @@ export class BrowserTool extends BaseBrowserTool {
285
334
output = `Typed: ${ input . text } ` ;
286
335
break ;
287
336
288
- case "key" : {
337
+ case InternalActionEnum . KEY : {
289
338
if ( ! input . text ) {
290
339
throw new ToolError ( "Key required for key action" ) ;
291
340
}
@@ -313,7 +362,31 @@ export class BrowserTool extends BaseBrowserTool {
313
362
break ;
314
363
}
315
364
316
- case "github_login" : {
365
+ case InternalActionEnum . HOLD_KEY : {
366
+ if ( ! input . text ) {
367
+ throw new ToolError ( "Key required for hold_key action" ) ;
368
+ }
369
+
370
+ if ( ! input . duration ) {
371
+ throw new ToolError ( "Duration required for hold_key action" ) ;
372
+ }
373
+
374
+ const seconds = input . duration ;
375
+ const delay = seconds / 1000 ;
376
+
377
+ const keyText = input . text . toLowerCase ( ) ;
378
+ const keys = Array . isArray ( actions . keyboardShortcuts [ keyText ] )
379
+ ? actions . keyboardShortcuts [ keyText ]
380
+ : [ actions . keyboardShortcuts [ keyText ] || input . text ] ;
381
+
382
+ const parsedKeys = keys . join ( "+" ) ;
383
+ await this . page . keyboard . press ( parsedKeys , { delay } ) ;
384
+
385
+ output = `Held key: ${ parsedKeys } for ${ seconds } second${ seconds !== 1 ? "s" : "" } ` ;
386
+ break ;
387
+ }
388
+
389
+ case InternalActionEnum . GITHUB_LOGIN : {
317
390
if ( ! this . githubTool ) {
318
391
this . githubTool = new GitHubTool ( ) ;
319
392
}
@@ -328,7 +401,7 @@ export class BrowserTool extends BaseBrowserTool {
328
401
break ;
329
402
}
330
403
331
- case "clear_session" :
404
+ case InternalActionEnum . CLEAR_SESSION :
332
405
const newContext = await this . browserManager . recreateContext ( ) ;
333
406
this . page = newContext . pages ( ) [ 0 ] || ( await newContext . newPage ( ) ) ;
334
407
await this . page . evaluate ( ( ) => {
@@ -341,7 +414,7 @@ export class BrowserTool extends BaseBrowserTool {
341
414
metadata : { } ,
342
415
} ;
343
416
344
- case "run_callback" : {
417
+ case InternalActionEnum . RUN_CALLBACK : {
345
418
if ( ! this . testContext ?. currentTest ) {
346
419
throw new ToolError (
347
420
"No test context available for callback execution" ,
@@ -389,7 +462,7 @@ export class BrowserTool extends BaseBrowserTool {
389
462
}
390
463
}
391
464
392
- case "navigate" : {
465
+ case InternalActionEnum . NAVIGATE : {
393
466
if ( ! input . url ) {
394
467
throw new ToolError ( "URL required for navigation" ) ;
395
468
}
@@ -440,7 +513,37 @@ export class BrowserTool extends BaseBrowserTool {
440
513
}
441
514
}
442
515
443
- case "sleep" : {
516
+ case InternalActionEnum . WAIT :
517
+ if ( ! input . duration ) {
518
+ throw new ToolError ( "Duration required for wait action" ) ;
519
+ }
520
+ const seconds = input . duration ;
521
+ await this . page . waitForTimeout ( seconds * 1000 ) ;
522
+ output = `Waited for ${ seconds } second${ seconds !== 1 ? "s" : "" } ` ;
523
+ break ;
524
+
525
+ case InternalActionEnum . SCROLL :
526
+ if (
527
+ ! input . coordinate ||
528
+ ! input . scroll_amount ||
529
+ ! input . scroll_direction
530
+ ) {
531
+ throw new ToolError ( "Missing args for scroll action" ) ;
532
+ }
533
+ await this . page . mouse . move ( input . coordinate [ 0 ] , input . coordinate [ 1 ] ) ;
534
+ const deltaX =
535
+ ( input . scroll_direction === "up"
536
+ ? - input . scroll_amount
537
+ : input . scroll_amount ) || 0 ;
538
+ const deltaY =
539
+ ( input . scroll_direction === "left"
540
+ ? - input . scroll_amount
541
+ : input . scroll_amount ) || 0 ;
542
+ await this . page . mouse . wheel ( deltaX , deltaY ) ;
543
+ output = `Scrolled ${ input . scroll_amount } clicks ${ input . scroll_direction } ` ;
544
+ break ;
545
+
546
+ case InternalActionEnum . SLEEP : {
444
547
const defaultDuration = 1000 ;
445
548
const maxDuration = 60000 ;
446
549
let duration = input . duration ?? defaultDuration ;
@@ -461,7 +564,7 @@ export class BrowserTool extends BaseBrowserTool {
461
564
break ;
462
565
}
463
566
464
- case "check_email" : {
567
+ case InternalActionEnum . CHECK_EMAIL : {
465
568
if ( ! this . mailosaurTool ) {
466
569
const mailosaurAPIKey =
467
570
this . config . mailosaur ?. apiKey || process . env . MAILOSAUR_API_KEY ;
@@ -714,15 +817,17 @@ export class BrowserTool extends BaseBrowserTool {
714
817
715
818
writeFileSync ( filePath , buffer ) ;
716
819
const filePathWithoutCwd = filePath . replace ( process . cwd ( ) + "/" , "" ) ;
717
- this . log . debug ( "📺" , "Screenshot saved" , { filePath : filePathWithoutCwd } ) ;
718
820
719
- const metadata = {
821
+ const browserMetadata = await this . getMetadata ( ) ;
822
+ this . log . trace ( "Screenshot saved" , {
823
+ filePath : filePathWithoutCwd ,
824
+ ...browserMetadata [ "window_info" ] ,
825
+ } ) ;
826
+ return {
720
827
output : "Screenshot taken" ,
721
828
base64_image : buffer . toString ( "base64" ) ,
722
- metadata : await this . getMetadata ( ) ,
829
+ metadata : browserMetadata ,
723
830
} ;
724
- this . log . trace ( "Screenshot details" , metadata ) ;
725
- return metadata ;
726
831
}
727
832
728
833
toToolParameters ( ) {
0 commit comments