Skip to content

Commit 17fd881

Browse files
authored
feat: add claude-3-7-sonnet-20250219 (#369)
* Add 2 new model options for config.ai.model for `anthropic`:
  * `claude-3-7-sonnet-20250219`
  * `claude-3-7-sonnet-latest`
* Add 2 new computer tools: `computer_20250124`, `bash_20250124`
* Add support for new actions: `triple_click`, `hold_key`, `left_mouse_down`, `left_mouse_up`, `wait`, `scroll`
1 parent 97a464b commit 17fd881

File tree

14 files changed

+355
-64
lines changed

14 files changed

+355
-64
lines changed

packages/shortest/package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@
5858
"node": ">=18"
5959
},
6060
"peerDependencies": {
61-
"@ai-sdk/anthropic": "^1.1.9",
62-
"@ai-sdk/provider": "^1.0.8",
63-
"ai": "^4.1.45",
61+
"@ai-sdk/anthropic": "^1.1.15",
62+
"@ai-sdk/provider": "^1.0.10",
63+
"ai": "^4.1.53",
6464
"@babel/parser": "^7.26.9",
6565
"@babel/traverse": "^7.26.9",
6666
"@babel/types": "^7.26.9",
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { anthropic } from "@ai-sdk/anthropic";
2+
import { BashTool } from "@/browser/core/bash-tool";
3+
4+
/**
 * Builds the Anthropic `bash_20250124` tool for the AI SDK.
 *
 * Every invocation runs the requested command through a newly constructed
 * `BashTool`, and the command's result is handed back to the model as a
 * single text content part.
 *
 * @see https://sdk.vercel.ai/providers/ai-sdk-providers/anthropic#bash-tool
 */
export const createAnthropicBash20250124 = () => {
  return anthropic.tools.bash_20250124({
    async execute(args) {
      const shell = new BashTool();
      const commandOutput = await shell.execute(args.command);
      return commandOutput;
    },
    // Wrap the raw result in the one-element content array the SDK expects.
    experimental_toToolResultContent: (result) => [
      { type: "text", text: result },
    ],
  });
};

packages/shortest/src/ai/tools/anthropic/computer_20241022.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { anthropic } from "@ai-sdk/anthropic";
22
import { Tool } from "ai";
33
import { BrowserTool } from "@/browser/core/browser-tool";
4+
import { getLogger } from "@/log";
45
import { InternalActionEnum } from "@/types/browser";
56

67
/**
@@ -17,6 +18,11 @@ export const createAnthropicComputer20241022 = (
1718
const { action, ...restOfInput } = input;
1819
const internalAction = actionMap[action];
1920
if (!internalAction) {
21+
const log = getLogger();
22+
log.error(`Computer action not supported`, {
23+
tool: "anthropic.computer_20241022",
24+
action,
25+
});
2026
return { output: `Action '${action}' not supported` };
2127
}
2228
return browserTool.execute({ action: internalAction, ...restOfInput });
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import { anthropic } from "@ai-sdk/anthropic";
2+
import { Tool } from "ai";
3+
import { BrowserTool } from "@/browser/core/browser-tool";
4+
import { getLogger } from "@/log";
5+
import { InternalActionEnum } from "@/types/browser";
6+
7+
/**
 * Builds the Anthropic `computer_20250124` tool on top of a `BrowserTool`.
 *
 * Each incoming Anthropic action name is translated through `actionMap`.
 * A name with no mapping is logged as an error and reported back to the
 * model as unsupported instead of being executed.
 *
 * @param browserTool - Executes the mapped action and supplies the
 *   result-to-tool-content converter.
 * @returns The configured AI SDK tool.
 * @see https://sdk.vercel.ai/providers/ai-sdk-providers/anthropic#computer-tool
 */
export const createAnthropicComputer20250124 = (
  browserTool: BrowserTool,
): Tool => {
  const toolName = "anthropic.computer_20250124";

  return anthropic.tools.computer_20250124({
    displayWidthPx: 1920,
    displayHeightPx: 1080,
    displayNumber: 0,
    async execute(input) {
      const { action, ...actionArgs } = input;
      const mappedAction = actionMap[action];

      // Happy path first: a known action goes straight to the browser tool.
      if (mappedAction) {
        return browserTool.execute({ action: mappedAction, ...actionArgs });
      }

      getLogger().error(`Computer action not supported`, {
        tool: toolName,
        action,
      });
      return { output: `Action '${action}' not supported` };
    },
    experimental_toToolResultContent: browserTool.resultToToolResultContent,
  });
};
32+
33+
/**
 * Translation table from Anthropic `computer_20250124` action names to the
 * internal action enum passed to the browser tool.
 *
 * Entries are grouped by input device; lookups are by key only, so the
 * ordering carries no meaning.
 *
 * @see https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#computer-tool
 */
const actionMap: Record<string, InternalActionEnum> = {
  // Keyboard
  key: InternalActionEnum.KEY,
  hold_key: InternalActionEnum.HOLD_KEY,
  type: InternalActionEnum.TYPE,

  // Pointer position and raw button state
  cursor_position: InternalActionEnum.CURSOR_POSITION,
  mouse_move: InternalActionEnum.MOUSE_MOVE,
  left_mouse_down: InternalActionEnum.LEFT_MOUSE_DOWN,
  left_mouse_up: InternalActionEnum.LEFT_MOUSE_UP,

  // Clicks and drags
  left_click: InternalActionEnum.LEFT_CLICK,
  left_click_drag: InternalActionEnum.LEFT_CLICK_DRAG,
  right_click: InternalActionEnum.RIGHT_CLICK,
  middle_click: InternalActionEnum.MIDDLE_CLICK,
  double_click: InternalActionEnum.DOUBLE_CLICK,
  triple_click: InternalActionEnum.TRIPLE_CLICK,

  // Page-level actions
  scroll: InternalActionEnum.SCROLL,
  wait: InternalActionEnum.WAIT,
  screenshot: InternalActionEnum.SCREENSHOT,
};

packages/shortest/src/browser/actions/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ export const click = async (
6767
page: Page,
6868
x: number,
6969
y: number,
70+
options: { button?: "left" | "right" | "middle"; clickCount?: number } = {
71+
button: "left",
72+
clickCount: 1,
73+
},
7074
): Promise<void> => {
7175
const scaledX = Math.round(x * scaleRatio.x);
7276
const scaledY = Math.round(y * scaleRatio.y);
@@ -75,7 +79,7 @@ export const click = async (
7579
const animationPromise = showClickAnimation(page, "left");
7680

7781
await Promise.all([
78-
page.mouse.click(scaledX, scaledY, { delay: 200 }), // delay to match animation duration
82+
page.mouse.click(scaledX, scaledY, options),
7983
animationPromise,
8084
]);
8185
};

packages/shortest/src/browser/core/browser-tool.ts

Lines changed: 134 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@ import { TestCase } from "@/core/runner/test-case";
2626
import { getConfig, initializeConfig } from "@/index";
2727
import { getLogger, Log } from "@/log/index";
2828
import { TestContext, BrowserToolConfig, ShortestConfig } from "@/types";
29-
import { ActionInput, ToolResult, BetaToolType } from "@/types/browser";
29+
import {
30+
ActionInput,
31+
ToolResult,
32+
BetaToolType,
33+
InternalActionEnum,
34+
} from "@/types/browser";
3035
import { getErrorDetails, ToolError, TestError } from "@/utils/errors";
3136

3237
export class BrowserTool extends BaseBrowserTool {
@@ -201,24 +206,58 @@ export class BrowserTool extends BaseBrowserTool {
201206
await this.page.click(selector);
202207
}
203208

204-
public async clickAtCoordinates(x: number, y: number): Promise<void> {
205-
this.log.debug("Clicking at coordinates", { x, y });
206-
await actions.click(this.page, x, y);
207-
}
208-
209209
async execute(input: ActionInput): Promise<ToolResult> {
210210
try {
211211
this.log.setGroup(`🛠️ ${input.action}`);
212212
let output = "";
213213
let metadata = {};
214214

215215
switch (input.action) {
216-
case "left_click":
217-
case "right_click":
218-
case "middle_click":
219-
case "double_click": {
220-
const clickCoords = input.coordinates || this.lastMousePosition;
221-
await this.clickAtCoordinates(clickCoords[0], clickCoords[1]);
216+
case InternalActionEnum.LEFT_CLICK:
217+
case InternalActionEnum.RIGHT_CLICK:
218+
case InternalActionEnum.MIDDLE_CLICK:
219+
case InternalActionEnum.DOUBLE_CLICK:
220+
case InternalActionEnum.TRIPLE_CLICK: {
221+
const clickCoords =
222+
input.coordinate || input.coordinates || this.lastMousePosition;
223+
const x = clickCoords[0];
224+
const y = clickCoords[1];
225+
const button = () => {
226+
switch (input.action) {
227+
case InternalActionEnum.LEFT_CLICK:
228+
case InternalActionEnum.DOUBLE_CLICK:
229+
case InternalActionEnum.TRIPLE_CLICK:
230+
return "left";
231+
case InternalActionEnum.RIGHT_CLICK:
232+
return "right";
233+
case InternalActionEnum.MIDDLE_CLICK:
234+
return "middle";
235+
default:
236+
throw new ToolError(
237+
`Unsupported click action: ${input.action}`,
238+
);
239+
}
240+
};
241+
const clickCount = () => {
242+
switch (input.action) {
243+
case InternalActionEnum.DOUBLE_CLICK:
244+
return 2;
245+
case InternalActionEnum.TRIPLE_CLICK:
246+
return 3;
247+
default:
248+
return 1;
249+
}
250+
};
251+
this.log.debug("Clicking at coordinates", {
252+
x,
253+
y,
254+
button: button(),
255+
clickCount: clickCount(),
256+
});
257+
await actions.click(this.page, x, y, {
258+
button: button(),
259+
clickCount: clickCount(),
260+
});
222261
output = `${input.action} at (${clickCoords[0]}, ${clickCoords[1]})`;
223262

224263
// Get initial metadata before potential navigation
@@ -245,7 +284,7 @@ export class BrowserTool extends BaseBrowserTool {
245284
break;
246285
}
247286

248-
case "mouse_move":
287+
case InternalActionEnum.MOUSE_MOVE:
249288
const coords = input.coordinates || (input as any).coordinate;
250289
if (!coords) {
251290
throw new ToolError("Coordinates required for mouse_move");
@@ -255,7 +294,7 @@ export class BrowserTool extends BaseBrowserTool {
255294
output = `Mouse moved to (${coords[0]}, ${coords[1]})`;
256295
break;
257296

258-
case "left_click_drag":
297+
case InternalActionEnum.LEFT_CLICK_DRAG:
259298
if (!input.coordinates) {
260299
throw new ToolError("Coordinates required for left_click_drag");
261300
}
@@ -267,15 +306,25 @@ export class BrowserTool extends BaseBrowserTool {
267306
output = `Dragged mouse to (${input.coordinates[0]}, ${input.coordinates[1]})`;
268307
break;
269308

270-
case "cursor_position":
309+
case InternalActionEnum.LEFT_MOUSE_DOWN:
310+
await this.page.mouse.down();
311+
output = "Pressed left mouse button";
312+
break;
313+
314+
case InternalActionEnum.LEFT_MOUSE_UP:
315+
await this.page.mouse.up();
316+
output = "Released left mouse button";
317+
break;
318+
319+
case InternalActionEnum.CURSOR_POSITION:
271320
const position = await actions.getCursorPosition(this.page);
272321
output = `Cursor position: (${position[0]}, ${position[1]})`;
273322
break;
274323

275-
case "screenshot":
324+
case InternalActionEnum.SCREENSHOT:
276325
return await this.takeScreenshotWithMetadata();
277326

278-
case "type":
327+
case InternalActionEnum.TYPE:
279328
if (!input.text) {
280329
throw new ToolError("Text required for type action");
281330
}
@@ -285,7 +334,7 @@ export class BrowserTool extends BaseBrowserTool {
285334
output = `Typed: ${input.text}`;
286335
break;
287336

288-
case "key": {
337+
case InternalActionEnum.KEY: {
289338
if (!input.text) {
290339
throw new ToolError("Key required for key action");
291340
}
@@ -313,7 +362,31 @@ export class BrowserTool extends BaseBrowserTool {
313362
break;
314363
}
315364

316-
case "github_login": {
365+
case InternalActionEnum.HOLD_KEY: {
  // Presses a key (or shortcut combination) and keeps it down for
  // `input.duration` seconds before releasing it.
  if (!input.text) {
    throw new ToolError("Key required for hold_key action");
  }

  if (!input.duration) {
    throw new ToolError("Duration required for hold_key action");
  }

  // `duration` is expressed in seconds (see the output message below), while
  // Playwright's `keyboard.press` `delay` option is in milliseconds, so the
  // value must be multiplied, not divided, by 1000.
  const seconds = input.duration;
  const delay = seconds * 1000;

  // Resolve the key through the shared shortcut table; a table entry may be
  // a single key or a list of keys. Unknown names fall back to the raw text.
  const keyText = input.text.toLowerCase();
  const keys = Array.isArray(actions.keyboardShortcuts[keyText])
    ? actions.keyboardShortcuts[keyText]
    : [actions.keyboardShortcuts[keyText] || input.text];

  const parsedKeys = keys.join("+");
  await this.page.keyboard.press(parsedKeys, { delay });

  output = `Held key: ${parsedKeys} for ${seconds} second${seconds !== 1 ? "s" : ""}`;
  break;
}
388+
389+
case InternalActionEnum.GITHUB_LOGIN: {
317390
if (!this.githubTool) {
318391
this.githubTool = new GitHubTool();
319392
}
@@ -328,7 +401,7 @@ export class BrowserTool extends BaseBrowserTool {
328401
break;
329402
}
330403

331-
case "clear_session":
404+
case InternalActionEnum.CLEAR_SESSION:
332405
const newContext = await this.browserManager.recreateContext();
333406
this.page = newContext.pages()[0] || (await newContext.newPage());
334407
await this.page.evaluate(() => {
@@ -341,7 +414,7 @@ export class BrowserTool extends BaseBrowserTool {
341414
metadata: {},
342415
};
343416

344-
case "run_callback": {
417+
case InternalActionEnum.RUN_CALLBACK: {
345418
if (!this.testContext?.currentTest) {
346419
throw new ToolError(
347420
"No test context available for callback execution",
@@ -389,7 +462,7 @@ export class BrowserTool extends BaseBrowserTool {
389462
}
390463
}
391464

392-
case "navigate": {
465+
case InternalActionEnum.NAVIGATE: {
393466
if (!input.url) {
394467
throw new ToolError("URL required for navigation");
395468
}
@@ -440,7 +513,37 @@ export class BrowserTool extends BaseBrowserTool {
440513
}
441514
}
442515

443-
case "sleep": {
516+
case InternalActionEnum.WAIT:
517+
if (!input.duration) {
518+
throw new ToolError("Duration required for wait action");
519+
}
520+
const seconds = input.duration;
521+
await this.page.waitForTimeout(seconds * 1000);
522+
output = `Waited for ${seconds} second${seconds !== 1 ? "s" : ""}`;
523+
break;
524+
525+
case InternalActionEnum.SCROLL: {
  // Moves the pointer to `input.coordinate` and dispatches a wheel event in
  // the requested direction. The braces keep the declarations below scoped
  // to this case rather than the whole switch.
  if (
    !input.coordinate ||
    !input.scroll_amount ||
    !input.scroll_direction
  ) {
    throw new ToolError("Missing args for scroll action");
  }
  await this.page.mouse.move(input.coordinate[0], input.coordinate[1]);

  // Only one axis moves per scroll: up/down drive deltaY, left/right drive
  // deltaX. Negative values scroll up / left.
  // NOTE(review): `scroll_amount` is reported as "clicks" in the output but
  // is forwarded to `mouse.wheel` unchanged — confirm whether a conversion
  // to pixels is intended.
  let deltaX = 0;
  let deltaY = 0;
  switch (input.scroll_direction) {
    case "up":
      deltaY = -input.scroll_amount;
      break;
    case "down":
      deltaY = input.scroll_amount;
      break;
    case "left":
      deltaX = -input.scroll_amount;
      break;
    case "right":
      deltaX = input.scroll_amount;
      break;
    default:
      throw new ToolError(
        `Unsupported scroll direction: ${input.scroll_direction}`,
      );
  }

  await this.page.mouse.wheel(deltaX, deltaY);
  output = `Scrolled ${input.scroll_amount} clicks ${input.scroll_direction}`;
  break;
}
545+
546+
case InternalActionEnum.SLEEP: {
444547
const defaultDuration = 1000;
445548
const maxDuration = 60000;
446549
let duration = input.duration ?? defaultDuration;
@@ -461,7 +564,7 @@ export class BrowserTool extends BaseBrowserTool {
461564
break;
462565
}
463566

464-
case "check_email": {
567+
case InternalActionEnum.CHECK_EMAIL: {
465568
if (!this.mailosaurTool) {
466569
const mailosaurAPIKey =
467570
this.config.mailosaur?.apiKey || process.env.MAILOSAUR_API_KEY;
@@ -714,15 +817,17 @@ export class BrowserTool extends BaseBrowserTool {
714817

715818
writeFileSync(filePath, buffer);
716819
const filePathWithoutCwd = filePath.replace(process.cwd() + "/", "");
717-
this.log.debug("📺", "Screenshot saved", { filePath: filePathWithoutCwd });
718820

719-
const metadata = {
821+
const browserMetadata = await this.getMetadata();
822+
this.log.trace("Screenshot saved", {
823+
filePath: filePathWithoutCwd,
824+
...browserMetadata["window_info"],
825+
});
826+
return {
720827
output: "Screenshot taken",
721828
base64_image: buffer.toString("base64"),
722-
metadata: await this.getMetadata(),
829+
metadata: browserMetadata,
723830
};
724-
this.log.trace("Screenshot details", metadata);
725-
return metadata;
726831
}
727832

728833
toToolParameters() {

0 commit comments

Comments
 (0)