-
-
-
-
- Need more credits?
-
- Coming soon
-
-
-
- Additional credit packages will be available soon
-
-
+
+
+ Earn More Credits
+
diff --git a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx
index 78dacdf88..0aca1589e 100644
--- a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx
+++ b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx
@@ -1,7 +1,9 @@
import { AlertCircle, RefreshCw } from 'lucide-react'
import type { FC } from 'react'
import { useMemo } from 'react'
+import { ShareForCredits } from '@/components/referral/ShareForCredits'
import { Button } from '@/components/ui/button'
+import type { ProviderType } from '@/lib/llm-providers/types'
const SURVEY_DIRECTIONS = [
'competitor',
@@ -14,6 +16,45 @@ function pickRandomDirection(): string {
return SURVEY_DIRECTIONS[Math.floor(Math.random() * SURVEY_DIRECTIONS.length)]
}
+const PROVIDER_DISPLAY_NAMES: Record
= {
+ anthropic: 'Anthropic',
+ openai: 'OpenAI',
+ 'openai-compatible': 'OpenAI-compatible',
+ google: 'Google',
+ openrouter: 'OpenRouter',
+ azure: 'Azure OpenAI',
+ ollama: 'Ollama',
+ lmstudio: 'LM Studio',
+ bedrock: 'AWS Bedrock',
+ browseros: 'BrowserOS',
+ moonshot: 'Moonshot',
+ 'chatgpt-pro': 'ChatGPT Pro',
+ 'github-copilot': 'GitHub Copilot',
+ 'qwen-code': 'Qwen Code',
+ minimax: 'MiniMax',
+}
+
+const UPSTREAM_RATE_LIMIT_PATTERNS: Array = [
+ 'usage limit',
+ 'rate limit',
+ 'rate-limit',
+ 'quota',
+ /\b429\b/,
+ 'too many requests',
+ 'insufficient_quota',
+]
+
+function getProviderDisplayName(providerType?: string): string {
+ if (providerType && providerType in PROVIDER_DISPLAY_NAMES) {
+ return PROVIDER_DISPLAY_NAMES[providerType as ProviderType]
+ }
+ return 'your provider'
+}
+
+function stripRetryPrefix(message: string): string {
+ return message.replace(/^Failed after \d+ attempts?\.\s*Last error:\s*/i, '')
+}
+
interface ChatErrorProps {
error: Error
onRetry?: () => void
@@ -29,6 +70,8 @@ function parseErrorMessage(
isRateLimit?: boolean
isCreditsExhausted?: boolean
isConnectionError?: boolean
+ isUpstreamRateLimit?: boolean
+ providerName?: string
} {
const isBrowserosProvider = providerType === 'browseros'
@@ -69,6 +112,28 @@ function parseErrorMessage(
}
}
+ // Detect rate limits from non-BrowserOS upstream providers. Users were
+ // confused that a quota/429 from OpenAI/Anthropic/etc. looked like a
+ // BrowserOS-imposed limit.
+ if (!isBrowserosProvider && providerType) {
+ const lower = message.toLowerCase()
+ const matchesRateLimit = UPSTREAM_RATE_LIMIT_PATTERNS.some((p) =>
+ typeof p === 'string' ? lower.includes(p) : p.test(lower),
+ )
+ if (matchesRateLimit) {
+ let stripped = stripRetryPrefix(message).trim()
+ try {
+ const parsed = JSON.parse(stripped)
+ if (parsed?.error?.message) stripped = parsed.error.message
+ } catch {}
+ return {
+ text: stripped || message,
+ isUpstreamRateLimit: true,
+ providerName: getProviderDisplayName(providerType),
+ }
+ }
+ }
+
let text = message
try {
const parsed = JSON.parse(message)
@@ -90,8 +155,15 @@ export const ChatError: FC = ({
onRetry,
providerType,
}) => {
- const { text, url, isRateLimit, isCreditsExhausted, isConnectionError } =
- parseErrorMessage(error.message, providerType)
+ const {
+ text,
+ url,
+ isRateLimit,
+ isCreditsExhausted,
+ isConnectionError,
+ isUpstreamRateLimit,
+ providerName,
+ } = parseErrorMessage(error.message, providerType)
const surveyUrl = useMemo(
() =>
@@ -100,6 +172,11 @@ export const ChatError: FC = ({
)
const getTitle = () => {
+ if (isUpstreamRateLimit) {
+ return providerName && providerName !== 'your provider'
+ ? `${providerName} rate limit reached`
+ : 'Upstream rate limit reached'
+ }
if (isRateLimit) return 'Daily limit reached'
if (isConnectionError) return 'Connection failed'
return 'Something went wrong'
@@ -112,6 +189,14 @@ export const ChatError: FC = ({
{getTitle()}
{text}
+ {isUpstreamRateLimit && (
+
+ This is a limit from{' '}
+ {providerName}
+ {' — your configured model provider — not BrowserOS. Check your '}
+ provider's dashboard for quota, usage, or billing details.
+
+ )}
{isConnectionError && url && (
= ({
View troubleshooting guide
)}
- {isCreditsExhausted && url && (
-
- View Usage & Billing
-
+ {isCreditsExhausted && (
+ <>
+
+
+
+ {url && (
+
+ View Usage & Billing
+
+ )}
+ >
)}
{isRateLimit && !isCreditsExhausted && (
diff --git a/packages/browseros-agent/apps/agent/lib/credits/browseros-id.ts b/packages/browseros-agent/apps/agent/lib/credits/browseros-id.ts
new file mode 100644
index 000000000..9b3454bfb
--- /dev/null
+++ b/packages/browseros-agent/apps/agent/lib/credits/browseros-id.ts
@@ -0,0 +1,15 @@
+import { getBrowserOSAdapter } from '@/lib/browseros/adapter'
+import { BROWSEROS_PREFS } from '@/lib/browseros/prefs'
+
+// TODO(credits-identity): temporary shim — reuses the BrowserOS metrics
+// install_id as the credits/referral identifier. Replace with a dedicated
+// identity module once we have one.
+export async function getBrowserosId(): Promise {
+ const adapter = getBrowserOSAdapter()
+ const pref = await adapter.getPref(BROWSEROS_PREFS.INSTALL_ID)
+ const id = pref.value
+ if (typeof id !== 'string' || id.length === 0) {
+ throw new Error('browseros.metrics_install_id is not set')
+ }
+ return id
+}
diff --git a/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts b/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts
index e648881f5..b74a54f86 100644
--- a/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts
+++ b/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts
@@ -1,20 +1,25 @@
+import { EXTERNAL_URLS } from '@browseros/shared/constants/urls'
import { useQuery, useQueryClient } from '@tanstack/react-query'
-import { getAgentServerUrl } from '@/lib/browseros/helpers'
+import { getBrowserosId } from './browseros-id'
export interface CreditsInfo {
credits: number
dailyLimit: number
lastResetAt?: string
+ browserosId?: string
}
const CREDITS_QUERY_KEY = ['credits']
async function fetchCredits(): Promise {
- const baseUrl = await getAgentServerUrl()
- const response = await fetch(`${baseUrl}/credits`)
+ const browserosId = await getBrowserosId()
+ const response = await fetch(
+ `${EXTERNAL_URLS.CREDITS_GATEWAY}/credits/${browserosId}`,
+ )
if (!response.ok)
throw new Error(`Failed to fetch credits: ${response.status}`)
- return response.json()
+ const data = (await response.json()) as CreditsInfo
+ return { ...data, browserosId }
}
export function useCredits() {
diff --git a/packages/browseros-agent/apps/agent/lib/llm-providers/models-dev-data.json b/packages/browseros-agent/apps/agent/lib/llm-providers/models-dev-data.json
index df8a5083a..35df9a169 100644
--- a/packages/browseros-agent/apps/agent/lib/llm-providers/models-dev-data.json
+++ b/packages/browseros-agent/apps/agent/lib/llm-providers/models-dev-data.json
@@ -5402,5 +5402,89 @@
"outputCost": 0
}
]
+ },
+ "minimax": {
+ "name": "MiniMax",
+ "api": "https://api.minimaxi.com/v1",
+ "doc": "https://platform.minimax.io",
+ "models": [
+ {
+ "id": "MiniMax-M2.7",
+ "name": "MiniMax M2.7",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.3,
+ "outputCost": 1.2
+ },
+ {
+ "id": "MiniMax-M2.7-highspeed",
+ "name": "MiniMax M2.7 Highspeed",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.6,
+ "outputCost": 2.4
+ },
+ {
+ "id": "MiniMax-M2.5",
+ "name": "MiniMax M2.5",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.3,
+ "outputCost": 1.2
+ },
+ {
+ "id": "MiniMax-M2.5-highspeed",
+ "name": "MiniMax M2.5 Highspeed",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.6,
+ "outputCost": 2.4
+ },
+ {
+ "id": "MiniMax-M2.1",
+ "name": "MiniMax M2.1",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.3,
+ "outputCost": 1.2
+ },
+ {
+ "id": "MiniMax-M2.1-highspeed",
+ "name": "MiniMax M2.1 Highspeed",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": true,
+ "supportsToolCall": true,
+ "inputCost": 0.6,
+ "outputCost": 2.4
+ },
+ {
+ "id": "M2-her",
+ "name": "M2-her",
+ "contextWindow": 204800,
+ "maxOutput": 8192,
+ "supportsImages": false,
+ "supportsReasoning": false,
+ "supportsToolCall": true,
+ "inputCost": 0.3,
+ "outputCost": 1.2
+ }
+ ]
}
}
diff --git a/packages/browseros-agent/apps/agent/lib/llm-providers/providerIcons.tsx b/packages/browseros-agent/apps/agent/lib/llm-providers/providerIcons.tsx
index 8fd9f8892..8b3c8a179 100644
--- a/packages/browseros-agent/apps/agent/lib/llm-providers/providerIcons.tsx
+++ b/packages/browseros-agent/apps/agent/lib/llm-providers/providerIcons.tsx
@@ -5,6 +5,7 @@ import {
Gemini,
Kimi,
LmStudio,
+ Minimax,
Ollama,
OpenAI,
OpenRouter,
@@ -36,6 +37,7 @@ const providerIconMap: Record = {
'chatgpt-pro': OpenAI,
'github-copilot': Github,
'qwen-code': Qwen,
+ minimax: Minimax,
}
interface ProviderIconProps {
diff --git a/packages/browseros-agent/apps/agent/lib/llm-providers/providerTemplates.ts b/packages/browseros-agent/apps/agent/lib/llm-providers/providerTemplates.ts
index 4d8799b45..f3712b90e 100644
--- a/packages/browseros-agent/apps/agent/lib/llm-providers/providerTemplates.ts
+++ b/packages/browseros-agent/apps/agent/lib/llm-providers/providerTemplates.ts
@@ -140,8 +140,31 @@ export const providerTemplates: ProviderTemplate[] = [
setupGuideUrl:
'https://docs.aws.amazon.com/bedrock/latest/userguide/getting-started.html',
}),
+ enrichTemplate('minimax', {
+ defaultModelId: 'MiniMax-M2.7',
+ apiKeyUrl:
+ 'https://platform.minimax.io/user-center/basic-information/interface-key',
+ setupGuideUrl: 'https://platform.minimax.io/docs/guides/models-intro',
+ }),
]
+export const MINIMAX_REGIONS = {
+ chinese: {
+ api: 'https://api.minimaxi.com/v1',
+ apiKeyUrl:
+ 'https://platform.minimaxi.com/user-center/basic-information/interface-key',
+ setupGuideUrl: 'https://platform.minimaxi.com/document',
+ },
+ international: {
+ api: 'https://api.minimax.io/v1',
+ apiKeyUrl:
+ 'https://platform.minimax.io/user-center/basic-information/interface-key',
+ setupGuideUrl: 'https://platform.minimax.io/docs/guides/models-intro',
+ },
+} as const
+
+export type MinimaxRegion = keyof typeof MINIMAX_REGIONS
+
/**
* Provider type options for select dropdowns
* @public
@@ -161,6 +184,7 @@ export const providerTypeOptions: { value: ProviderType; label: string }[] = [
{ value: 'lmstudio', label: 'LM Studio' },
{ value: 'bedrock', label: 'AWS Bedrock' },
{ value: 'browseros', label: 'BrowserOS' },
+ { value: 'minimax', label: 'MiniMax' },
]
/**
@@ -192,6 +216,7 @@ export const DEFAULT_BASE_URLS: Record = {
lmstudio: 'http://localhost:1234/v1',
bedrock: '',
browseros: '',
+ minimax: MINIMAX_REGIONS.chinese.api,
}
/**
diff --git a/packages/browseros-agent/apps/agent/lib/llm-providers/types.ts b/packages/browseros-agent/apps/agent/lib/llm-providers/types.ts
index df537f2fb..595cc5c13 100644
--- a/packages/browseros-agent/apps/agent/lib/llm-providers/types.ts
+++ b/packages/browseros-agent/apps/agent/lib/llm-providers/types.ts
@@ -17,6 +17,7 @@ export type ProviderType =
| 'chatgpt-pro'
| 'github-copilot'
| 'qwen-code'
+ | 'minimax'
/**
* LLM Provider configuration
diff --git a/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts b/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts
new file mode 100644
index 000000000..5456935e0
--- /dev/null
+++ b/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts
@@ -0,0 +1,108 @@
+import { EXTERNAL_URLS } from '@browseros/shared/constants/urls'
+
+interface ReferralResult {
+ success: boolean
+ creditsAdded?: number
+ reason?: string
+}
+
+export async function submitReferral(
+ tweetUrl: string,
+ browserosId: string,
+): Promise {
+ const response = await fetch(
+ `${EXTERNAL_URLS.REFERRAL_SERVICE}/referral/submit`,
+ {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ tweetUrl, browserosId }),
+ },
+ )
+ if (!response.ok) {
+ return {
+ success: false,
+ reason: `Request failed with status ${response.status}`,
+ }
+ }
+ return response.json()
+}
+
+const TWEET_VARIATIONS = [
+ `ngl @browseros_ai is kinda wild
+
+just type what u want in plain english and it handles the annoying web shit
+
+forms, research, data pulls... all automated
+
+actually works`,
+
+ `been using @browseros_ai to chat with webpages lately
+
+summarize articles, pull data, translate stuff
+
+all happens in the same tab
+
+no copy/paste, no switching windows
+
+just ask and it does it`,
+
+ `wake up to @browseros_ai having already read ur emails and calendar while u were sleeping
+
+scheduled agents are lowkey magic`,
+
+ `ngl @browseros_ai is kinda crazy
+
+connects gmail, slack, linear, notion + 40 other apps into one ai assistant
+
+just talk to it in plain english and it handles cross-app workflows for u
+
+no more switching between tabs like a psycho`,
+
+ `i use @browseros_ai to automate research
+
+it handles the browser work and drops reports straight into local folders
+
+no switching between tools or manually saving files
+
+just one task instead of three`,
+
+ `been messing with @browseros_ai lately
+
+it comes with a prebuilt MCP server and I connect it claude code or codex and it just runs things for you
+
+set it up once, use it whenever
+
+way better than clicking through the same shit manually every time`,
+
+ `the ai actually remembers what we talked about yesterday
+
+no more "here's the context again" every single conversation
+
+@browseros_ai just picks up where we left off
+
+feels like talking to someone who actually pays attention`,
+
+ `i built a skill library for my ai agent
+
+now when i need it to do something specific, i just load the recipe i made earlier
+
+@browseros_ai MCP is very handy`,
+
+ `been running @browseros_ai with ollama locally
+
+everything stays on my machine, nothing gets sent out
+
+kinda nice not having to think about what data i'm sharing`,
+
+ `switched to @browseros_ai from chrome
+
+blocks 10x more ads and runs full ublock origin (not the lite version)
+
+check it out`,
+]
+
+export function getShareOnTwitterUrl(): string {
+ const text =
+ TWEET_VARIATIONS[Math.floor(Math.random() * TWEET_VARIATIONS.length)]
+ return `https://x.com/intent/tweet?text=${encodeURIComponent(text)}`
+}
diff --git a/packages/browseros-agent/apps/agent/package.json b/packages/browseros-agent/apps/agent/package.json
index aa79397b1..b957a4dfb 100644
--- a/packages/browseros-agent/apps/agent/package.json
+++ b/packages/browseros-agent/apps/agent/package.json
@@ -20,6 +20,7 @@
"dependencies": {
"@ai-sdk/react": "^3.0.96",
"@browseros/server": "workspace:*",
+ "@browseros/shared": "workspace:*",
"@hookform/resolvers": "^5.2.2",
"@lobehub/icons": "^2.44.0",
"@mdxeditor/editor": "^3.52.4",
diff --git a/packages/browseros-agent/apps/eval/DESIGN_DOC.md b/packages/browseros-agent/apps/eval/DESIGN_DOC.md
index 3a4771938..321aa89fb 100644
--- a/packages/browseros-agent/apps/eval/DESIGN_DOC.md
+++ b/packages/browseros-agent/apps/eval/DESIGN_DOC.md
@@ -520,7 +520,6 @@ import { randomUUID } from 'node:crypto'
import { GeminiAgent } from '@browseros/server/agent'
import { AgentExecutionError } from '@browseros/server/agent/errors'
import type { ResolvedAgentConfig } from '@browseros/server/agent/types'
-import { MCPServerConfig } from '@google/gemini-cli-core'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
import type { SingleAgentConfig, TaskMetadata } from '../../types'
import { resolveEnvValue } from '../../utils/env'
diff --git a/packages/browseros-agent/apps/eval/README.md b/packages/browseros-agent/apps/eval/README.md
index ee20bbe6c..cb10fdb44 100644
--- a/packages/browseros-agent/apps/eval/README.md
+++ b/packages/browseros-agent/apps/eval/README.md
@@ -141,8 +141,6 @@ The executor config stays the same across all orchestrator providers — it alwa
|------|-------------|----------------|
| `single` | Single LLM agent via Gemini CLI + MCP | `webvoyager-test.json` |
| `tool-loop` | AI SDK tool loop, connects via CDP | `tool-loop-test.json` |
-| `gemini-computer-use` | Google native computer use API | `gemini-computer-use.json` |
-| `yutori-navigator` | Yutori N1 visual model | `yutori-navigator.json` |
## Configuration Reference
diff --git a/packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json b/packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
new file mode 100644
index 000000000..29075edf0
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/configs/agisdk-real-smoke.json
@@ -0,0 +1,26 @@
+{
+ "agent": {
+ "type": "single",
+ "provider": "openai-compatible",
+ "model": "accounts/fireworks/models/kimi-k2p5",
+ "apiKey": "FIREWORKS_API_KEY",
+ "baseUrl": "https://api.fireworks.ai/inference/v1",
+ "supportsImages": true
+ },
+ "dataset": "../data/agisdk-real.jsonl",
+ "num_workers": 10,
+ "restart_server_per_task": true,
+ "browseros": {
+ "server_url": "http://127.0.0.1:9110",
+ "base_cdp_port": 9010,
+ "base_server_port": 9110,
+ "base_extension_port": 9310,
+ "load_extensions": false,
+ "headless": false
+ },
+ "captcha": {
+ "api_key_env": "NOPECHA_API_KEY"
+ },
+ "graders": ["agisdk_state_diff"],
+ "timeout_ms": 1800000
+}
diff --git a/packages/browseros-agent/apps/eval/configs/infinity-hard-50.json b/packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
new file mode 100644
index 000000000..c91cb5ee4
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/configs/infinity-hard-50.json
@@ -0,0 +1,26 @@
+{
+ "agent": {
+ "type": "single",
+ "provider": "openai-compatible",
+ "model": "accounts/fireworks/models/kimi-k2p5",
+ "apiKey": "FIREWORKS_API_KEY",
+ "baseUrl": "https://api.fireworks.ai/inference/v1",
+ "supportsImages": true
+ },
+ "dataset": "../data/webarena-infinity-hard-50.jsonl",
+ "num_workers": 10,
+ "restart_server_per_task": true,
+ "browseros": {
+ "server_url": "http://127.0.0.1:9110",
+ "base_cdp_port": 9010,
+ "base_server_port": 9110,
+ "base_extension_port": 9310,
+ "load_extensions": false,
+ "headless": false
+ },
+ "captcha": {
+ "api_key_env": "NOPECHA_API_KEY"
+ },
+ "graders": ["infinity_state"],
+ "timeout_ms": 1800000
+}
diff --git a/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json b/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json
deleted file mode 100644
index 6c5174333..000000000
--- a/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "agent": {
- "type": "gemini-computer-use",
- "apiKey": "GOOGLE_AI_API_KEY",
- "screenSize": {
- "width": 1440,
- "height": 900
- },
- "turnLimit": 100
- },
- "dataset": "../data/test-set.jsonl",
- "num_workers": 1,
- "restart_server_per_task": true,
- "browseros": {
- "server_url": "http://127.0.0.1:9110",
- "base_cdp_port": 9010,
- "base_server_port": 9110,
- "base_extension_port": 9310,
- "load_extensions": false,
- "headless": false
- },
- "captcha": {
- "api_key_env": "NOPECHA_API_KEY"
- },
- "graders": ["performance_grader"],
- "grader_api_key_env": "OPENROUTER_API_KEY",
- "grader_base_url": "https://openrouter.ai/api/v1",
- "grader_model": "openai/gpt-4.1",
- "timeout_ms": 1200000
-}
diff --git a/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json b/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json
deleted file mode 100644
index fd35fba36..000000000
--- a/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "agent": {
- "type": "yutori-navigator",
- "apiKey": "YUTORI_API_KEY",
- "screenSize": {
- "width": 1280,
- "height": 800
- },
- "turnLimit": 100
- },
- "dataset": "../data/test-set.jsonl",
- "num_workers": 1,
- "restart_server_per_task": true,
- "browseros": {
- "server_url": "http://127.0.0.1:9110",
- "base_cdp_port": 9010,
- "base_server_port": 9110,
- "base_extension_port": 9310,
- "load_extensions": false,
- "headless": false
- },
- "captcha": {
- "api_key_env": "NOPECHA_API_KEY"
- },
- "graders": ["performance_grader"],
- "grader_api_key_env": "OPENROUTER_API_KEY",
- "grader_base_url": "https://openrouter.ai/api/v1",
- "grader_model": "openai/gpt-4.1",
- "timeout_ms": 1200000
-}
diff --git a/packages/browseros-agent/apps/eval/data/agisdk-real.jsonl b/packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
new file mode 100644
index 000000000..708f60d77
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/data/agisdk-real.jsonl
@@ -0,0 +1,52 @@
+{"query_id": "agisdk-dashdish-10", "dataset": "agisdk-real", "query": "Place an order from \"Souvla\" for a \"Medium Classic Cheeseburger\" and a \"Small Bacon Double Cheeseburger\" with \"Standard Delivery\" as the method with the default charged options.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-10", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-fly-unified-5", "dataset": "agisdk-real", "query": "Find me the cheapest fare for a flight from Orlando to Milwaukee on December 5th, 2024 and book it.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 420 Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-5", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-udriver-10", "dataset": "agisdk-real", "query": "Order me a ride for 4pm, I'll be at the de Young muesum headed to the Waterbar, fanciest option possible please.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-10", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-udriver-9", "dataset": "agisdk-real", "query": "Book me a ride from the thai restaurant I last took a ride to for later today at 2pm, I'll be at 333 Apartments on Fremont", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-9", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-9", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-topwork-4", "dataset": "agisdk-real", "query": "Create a job post for a UI/UX Designer with expertise in Figma, Sketch, and Adobe Creative Suite, including project details, timeline, and required skills (Wireframing, Prototyping, Responsive Design).", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-4", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-4", "dataset": "agisdk-real", "query": "Change the \"Team Check-In\" event on July 18, 2024, name to \"Project Kickoff\" and update the location to \"Zoom\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-4", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-staynb-6", "dataset": "agisdk-real", "query": "Find and book the stay with the best value for money (cheapest stay with the best reviews) for 1 day. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-6", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-6", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-10", "dataset": "agisdk-real", "query": "Click on \"buy now\" on any product, increase its quantity to the maximum allowed, update the delivery date to the last available, and place the order.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-10", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-10", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-fly-unified-9", "dataset": "agisdk-real", "query": "Book me a flight from San Francisco to Chicago in Basic Economy on December 18th at 10:00. Ensure no seat selection is made.\nPassenger: David Lee\nDate of Birth: 07/22/1985\nSex: Male\nSeat Selection: No\nPayment: Credit Card (9999 8888 7777), Exp: 03/30, Address: 987 Cedar St, Chicago, IL, 60601, USA, Phone: 555-987-1234, Email: davidlee@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-9", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-networkin-9", "dataset": "agisdk-real", "query": "Find a professional who attended Stanford and send them a connection request and a message.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-9", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-9", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-udriver-11", "dataset": "agisdk-real", "query": "I need to go from Pacific Catch on Chestnut back home to 333 Fremont now. If the fancy version is within ten dollars of the regular one, book that.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-11", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-11", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-fly-unified-4", "dataset": "agisdk-real", "query": "Book me a round-trip flight from Providence (Rhode Island) to Indianapolis, departing on December 5th, 2024 at 08:00 and returning on December 9th at 14:00.\nPassenger: Jane Smith\nDate of Birth: 02/14/1995\nSex: Female\nSeat Selection: Yes (Window seat)\nPayment: Credit Card (378342143523967), Exp: 06/26, security code: 345 Address: 456 Elm St, Miami, FL, 33101, USA, Phone: 555-987-6543, Email: janesmith@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-4", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-networkin-5", "dataset": "agisdk-real", "query": "Send a connection request to John Smith.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-5", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-5", "challenge_type": "action", "difficulty": "easy", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-zilloft-6", "dataset": "agisdk-real", "query": "Select a property listed in San Francisco as \"Condos\" within a price range under $300,000 and request a tour for tomorrow at 4:00 PM. Use these contact details: Name: Sarah Brown, Email: sarahbrown@example.com, Phone: 555-987-6543.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-6", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "Zillow"}}}
+{"query_id": "agisdk-topwork-2", "dataset": "agisdk-real", "query": "Create a job posting for a Backend Developer specializing in Python, Django, and Flask to develop a high-performance web application. Include project details such as required skills (PostgreSQL, Docker, AWS, CI/CD), estimated project timeline, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-2", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-3", "dataset": "agisdk-real", "query": "Delete the event titled \"Breakfast Meeting with Client\" scheduled for July 19, 2024", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-3", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-topwork-3", "dataset": "agisdk-real", "query": "Create a job listing for a Full-Stack Developer with expertise in Java, Spring Boot, and Angular, outlining the project scope, estimated duration, and required skills (MySQL, Docker, Kubernetes, and Jenkins). The ideal candidate should have experience in enterprise-level applications and building scalable microservices. After creating the job post, please describe what you included in the job listing.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-3", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-3", "challenge_type": "retrieval", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-fly-unified-2", "dataset": "agisdk-real", "query": "Book me a one-way flight from Indiana to New York on December 2nd 2024 at 12:00.\nPassenger: John Doe\nDate of Birth: 01/01/1990\nSex: Male\nSeat Selection: No\nPayment: Credit Card (378342143523967), Exp: 12/25, Security Code: 245, Address: 123 Main St, San Francisco, CA, 94105, USA, Phone: 555-123-4567, Email: johndoe@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-2", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-dashdish-7", "dataset": "agisdk-real", "query": "Select \"Express Delivery\" for an order from \"DragonEats\" of \"Mushroom Swiss Burger\" and complete the checkout with the pre-loaded Visa card.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-7", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-7", "challenge_type": "action", "difficulty": "hard", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-networkin-3", "dataset": "agisdk-real", "query": "Write a post inviting users to a networking event, including details about the event's purpose, date, and target audience.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-3", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-3", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-gomail-7", "dataset": "agisdk-real", "query": "Delete the email with the subject \"New Leadership Articles You Can't Miss\" from the Inbox.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-7", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-7", "challenge_type": "retrieval-action", "difficulty": "hard", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-opendining-8", "dataset": "agisdk-real", "query": "Identify and book the restaurant with the lowest rating. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-8", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-omnizon-2", "dataset": "agisdk-real", "query": "Search for \"smartphones\" using the search bar, add the first two to your cart, view the details of the third product, click on \"Buy Now,\" and proceed through the checkout process.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-2", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-2", "challenge_type": "action", "difficulty": "medium", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-udriver-1", "dataset": "agisdk-real", "query": "Book a ride from Fitness Urbano to Pacific Cafe", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-1", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-1", "challenge_type": "action", "difficulty": "easy", "similar_to": "Uber"}}}
+{"query_id": "agisdk-staynb-2", "dataset": "agisdk-real", "query": "Click on one of the stays displayed on the homepage and book it for a family of 4 (2 adults and 2 children). For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-2", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-opendining-10", "dataset": "agisdk-real", "query": "Check the menus of all restaurants for vegetarian options and make a reservation at the one with the most vegetarian choices. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-10", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-10", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-opendining-4", "dataset": "agisdk-real", "query": "Use the search bar to search for a restaurant on September 2nd at 4:30 PM for 7 people, using \"Japanese\" as the search term, and book the first result. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-4", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-gomail-8", "dataset": "agisdk-real", "query": "Clear all emails from \"GitHub\" in the inbox to trash.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-8", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-8", "challenge_type": "action", "difficulty": "medium", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-dashdish-4", "dataset": "agisdk-real", "query": "Schedule a delivery order from \"Taco Bell\" adding a \"Classic Cheeseburger\" large size for later and add the note \"Leave at the front door\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-4", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-networkin-1", "dataset": "agisdk-real", "query": "Create a new text post for the feed with a professional update about AI trends in 2025, mentioning three key advancements and their impact on the job market.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-1", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-dashdish-5", "dataset": "agisdk-real", "query": "Add three \"Loaded Bacon Cheese Fries\" to the shopping cart from \"Man vs. Fries\". Proceed to checkout and select \"Pickup\" as the delivery method.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-5", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-opendining-5", "dataset": "agisdk-real", "query": "Scroll through the homepage carousel until \"Ocean Breeze\" is visible, select the second available time slot, and complete the reservation. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-5", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-topwork-1", "dataset": "agisdk-real", "query": "Create a new job post for a Frontend Developer with expertise in React and TypeScript, specifying project details such as estimated duration, required skills, and budget.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-topwork.vercel.app", "metadata": {"original_task_id": "topwork-1", "website": "TopWork", "category": "agisdk-real", "additional": {"agisdk_task_id": "topwork-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Upwork"}}}
+{"query_id": "agisdk-gocalendar-1", "dataset": "agisdk-real", "query": "Create a new event titled \"Team Meeting\" on July 19, 2024, from 2 PM to 2:30 PM, and include \"Conference Room A\" as the location", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-1", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-1", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-gomail-5", "dataset": "agisdk-real", "query": "Schedule an email to jane.doe@example.com with the subject \"Weekly Update\" to be sent next Monday at 9:00 AM.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-5", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-5", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-staynb-4", "dataset": "agisdk-real", "query": "Book a stay for 2 children with 1 adult. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-4", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-4", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-8", "dataset": "agisdk-real", "query": "Search for \"Automatic Espresso Machine,\" click on the cheapest one, change the quantity to 5, use \"buy now\" to purchase them and complete the checkout.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-8", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-8", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-networkin-6", "dataset": "agisdk-real", "query": "Choose a random person who you haven't connected with, connect with them, and send them a message saying, 'howdy, partner'.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-6", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-dashdish-2", "dataset": "agisdk-real", "query": "Add a \"Medium Pepperoni Pizza\" from the restaurant \"Papa Johns Pizza\" to the shopping cart and purchase it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-dashdish.vercel.app", "metadata": {"original_task_id": "dashdish-2", "website": "DashDish", "category": "agisdk-real", "additional": {"agisdk_task_id": "dashdish-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Doordash"}}}
+{"query_id": "agisdk-staynb-8", "dataset": "agisdk-real", "query": "Scroll through the homepage and book the last stay located in Paris.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-8", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-8", "challenge_type": "retrieval-action", "difficulty": "medium", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-omnizon-4", "dataset": "agisdk-real", "query": "Search for a \"Marshall Emberton II Portable Bluetooth Speaker\" and add it to your cart, then search for the \"Michael Kors Oversized Slim Runway Men's Watch,\" add it to the cart, and complete the checkout with both items.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-4", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-4", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-gomail-2", "dataset": "agisdk-real", "query": "Mark the first email in the Inbox as \"read\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-2", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-2", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-networkin-10", "dataset": "agisdk-real", "query": "Generate a polite follow-up message for a previous unanswered chat, starting with \"Following up on\".", "graders": ["agisdk_state_diff"], "start_url": "https://evals-networkin.vercel.app", "metadata": {"original_task_id": "networkin-10", "website": "Networkin", "category": "agisdk-real", "additional": {"agisdk_task_id": "networkin-10", "challenge_type": "action", "difficulty": "medium", "similar_to": "LinkedIn"}}}
+{"query_id": "agisdk-gomail-3", "dataset": "agisdk-real", "query": "Compose a new email to jonathan.smith@example.com with the subject \"Meeting Notes\" and body \"Please find the meeting notes attached.\"", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gomail.vercel.app", "metadata": {"original_task_id": "gomail-3", "website": "GoMail", "category": "agisdk-real", "additional": {"agisdk_task_id": "gomail-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "Gmail"}}}
+{"query_id": "agisdk-udriver-6", "dataset": "agisdk-real", "query": "Me and 4 friends need a ride from the Palace Hotel to dinner at Osha Thai leaving now", "graders": ["agisdk_state_diff"], "start_url": "https://evals-udriver.vercel.app", "metadata": {"original_task_id": "udriver-6", "website": "UDriver", "category": "agisdk-real", "additional": {"agisdk_task_id": "udriver-6", "challenge_type": "action", "difficulty": "hard", "similar_to": "Uber"}}}
+{"query_id": "agisdk-staynb-9", "dataset": "agisdk-real", "query": "Book a stay with the maximum number of guests supported. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-9", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Airbnb"}}}
+{"query_id": "agisdk-zilloft-3", "dataset": "agisdk-real", "query": "Find a home in San Diego priced under $150,000 with at least 2 bedrooms and request a tour. Use these details: Contact Name: John Doe, Email: johndoe@example.com, Phone: 555-123-4567, Tour Time: 2:00 PM, Tour Date: First available.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-zilloft.vercel.app", "metadata": {"original_task_id": "zilloft-3", "website": "Zilloft", "category": "agisdk-real", "additional": {"agisdk_task_id": "zilloft-3", "challenge_type": "retrieval-action", "difficulty": "easy", "similar_to": "Zillow"}}}
+{"query_id": "agisdk-fly-unified-6", "dataset": "agisdk-real", "query": "Reserve me a seat for the flight from Austin to Pittsburgh departing on December 11th, 2024 at 8:00 in Basic Economy.\nPassenger: Alice Brown\nDate of Birth: 05/20/1992\nSex: Female\nSeat Selection: Yes (Aisle seat)\nPayment: Credit Card (378342143523967), Exp: 09/27, security code: 332 Address: 789 Pine St, Los Angeles, CA, 90012, USA, Phone: 555-456-7890, Email: alicebrown@example.com.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-fly-unified.vercel.app", "metadata": {"original_task_id": "fly-unified-6", "website": "Fly Unified", "category": "agisdk-real", "additional": {"agisdk_task_id": "fly-unified-6", "challenge_type": "action", "difficulty": "medium", "similar_to": "United Airlines"}}}
+{"query_id": "agisdk-opendining-3", "dataset": "agisdk-real", "query": "Book a table at \"The Royal Dine\" for a party of 4 on July 20, 2024, at 7 PM. For fields you don't know the answer for, just fill them in with anything of your choice.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-opendining.vercel.app", "metadata": {"original_task_id": "opendining-3", "website": "OpenDining", "category": "agisdk-real", "additional": {"agisdk_task_id": "opendining-3", "challenge_type": "action", "difficulty": "easy", "similar_to": "OpenTable"}}}
+{"query_id": "agisdk-omnizon-9", "dataset": "agisdk-real", "query": "Search for \"PlayStation DualSense\", purchase it using the \"buy now\" button after opening the first result and change the default payment method to:\nname: Jack Fulton\ncard number: 9231 3432 8927 7764\nexp date: 1/2029\nsecurity code: 128\n before placing your order. ", "graders": ["agisdk_state_diff"], "start_url": "https://evals-omnizon.vercel.app", "metadata": {"original_task_id": "omnizon-9", "website": "Omnizon", "category": "agisdk-real", "additional": {"agisdk_task_id": "omnizon-9", "challenge_type": "action", "difficulty": "hard", "similar_to": "Amazon"}}}
+{"query_id": "agisdk-gocalendar-7", "dataset": "agisdk-real", "query": "Reschedule the \"Morning Coffee with sister\" event from July 18, 2024, at 9 AM to July 19, 2024, at 10AM using drag-and-drop functionality", "graders": ["agisdk_state_diff"], "start_url": "https://evals-gocalendar.vercel.app", "metadata": {"original_task_id": "gocalendar-7", "website": "GoCalendar", "category": "agisdk-real", "additional": {"agisdk_task_id": "gocalendar-7", "challenge_type": "action", "difficulty": "medium", "similar_to": "Google Calendar"}}}
+{"query_id": "agisdk-staynb-5", "dataset": "agisdk-real", "query": "Use the search bar to look for a stay. For the \"Where\" section, use the \"Search by region\" popover and select \"Europe\". Set the check-in date to October 13th and the check-out date to October 23rd. For the \"Who\" section, select 1 infant, 2 children, and 2 adults. Press the search button, select the first stay, and book it.", "graders": ["agisdk_state_diff"], "start_url": "https://evals-staynb.vercel.app", "metadata": {"original_task_id": "staynb-5", "website": "StayNB", "category": "agisdk-real", "additional": {"agisdk_task_id": "staynb-5", "challenge_type": "action", "difficulty": "medium", "similar_to": "Airbnb"}}}
diff --git a/packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl b/packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
new file mode 100644
index 000000000..c4f309799
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/data/webarena-infinity-hard-50.jsonl
@@ -0,0 +1,50 @@
+{"query_id": "infinity-elation-prescriptions-task_h69", "dataset": "webarena-infinity", "query": "Approve all pending refill requests except for any medication that is involved in a major drug-drug interaction with another of the patient's active medications. Deny those with the reason 'Drug interaction \u2014 needs provider review before renewal'.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h69", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h69.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-clinical-records-task_h52", "dataset": "webarena-infinity", "query": "Add the document tag 'Provider-Reviewed' to every visit note template that was created by the current logged-in provider. Do not modify templates created by other providers.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h52", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8000}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h44", "dataset": "webarena-infinity", "query": "Your sister's husband is one of your contacts. Find him, star his entry, and add the Friends label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h44", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h44.py", "app_base_port": 8070}}}
+{"query_id": "infinity-gmail-task_h2", "dataset": "webarena-infinity", "query": "Update the Datadog alerts filter to also archive matching emails and forward them to priya.sharma@cloudnine.dev instead of nate.patel@devops.tools.", "graders": ["infinity_state"], "start_url": "http://localhost:8060", "metadata": {"original_task_id": "gmail-task_h2", "website": "gmail", "category": "webarena-infinity", "additional": {"app_name": "gmail", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8060}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h58", "dataset": "webarena-infinity", "query": "The Performance Initiative epic has two child epics. For the child epic with more open issues, set the weight of every issue in it to 13. For the other child epic, close all its open issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h58", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h58.py", "app_base_port": 8050}}}
+{"query_id": "infinity-figma-slides-task_h46", "dataset": "webarena-infinity", "query": "There are two slides with tables in the deck. Lock the table that compares competitors, and change the font size to 16 on the table that tracks quarterly feature adoption.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h46", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h46.py", "app_base_port": 8030}}}
+{"query_id": "infinity-elation-prescriptions-task_h50", "dataset": "webarena-infinity", "query": "Deny the pending refill for the patient's cholesterol medication because his lipid panel is overdue. Then deny the Lisinopril refill as well \u2014 he needs a follow-up blood pressure check first.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h50", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-prescriptions-task_h19", "dataset": "webarena-infinity", "query": "Discontinue the Omeprazole and prescribe Famotidine 20mg tablet twice daily as a replacement for GERD \u2014 qty 60, 3 refills, send to CVS #4521.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h19", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8020}}}
+{"query_id": "infinity-paypal-my-wallet-task_h25", "dataset": "webarena-infinity", "query": "Convert all of my Australian dollars to euros.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h25", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h25.py", "app_base_port": 8100}}}
+{"query_id": "infinity-elation-clinical-records-task_h66", "dataset": "webarena-infinity", "query": "Create a new template called 'Anxiety Management' with HPI and Assessment sections, and billing code 99213 with description 'Office visit, established, low complexity'. Then create a visit note for Emily Nakamura using that new template and the Telehealth category, add a Psychological Status block to the note, and sign it.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h66", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h66.py", "app_base_port": 8000}}}
+{"query_id": "infinity-elation-clinical-records-task_h62", "dataset": "webarena-infinity", "query": "Look up which template is assigned to the COVID Vaccine appointment type. Remove all its existing document tags and replace them with the single tag 'COVID-Protocol'. Then also assign that same template to the Urgent Same-Day appointment type.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h62", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h62.py", "app_base_port": 8000}}}
+{"query_id": "infinity-elation-prescriptions-task_h32", "dataset": "webarena-infinity", "query": "The patient has a medication that's being dispensed as written (brand name only). Discontinue that prescription and replace it with a new one \u2014 same medication, same sig, same pharmacy \u2014 but allow generic substitution this time. Qty 30, 3 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h32", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h32.py", "app_base_port": 8020}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h48", "dataset": "webarena-infinity", "query": "Add the 'breaking-change' label to every open issue in the API v3 Migration epic and remove any existing workflow-scoped labels from those issues.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h48", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h48.py", "app_base_port": 8050}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h77", "dataset": "webarena-infinity", "query": "Rename the 'UX' label to 'user-experience', change its type to 'group', and then add it to every open issue in the Frontend Modernization epic that doesn't already have it.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h77", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h77.py", "app_base_port": 8050}}}
+{"query_id": "infinity-xero-invoicing-task_h15", "dataset": "webarena-infinity", "query": "Create a new invoice for Summit Health Group for an annual software license and 12 months of support with a 10% discount on support.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h15", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h15.py", "app_base_port": 8120}}}
+{"query_id": "infinity-elation-clinical-records-task_h55", "dataset": "webarena-infinity", "query": "Resolve every problem across all patients in the system that currently has a status of Controlled.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h55", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h55.py", "app_base_port": 8000}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h8", "dataset": "webarena-infinity", "query": "Create a confidential issue titled 'Emergency security patch' with priority::critical and the 'security' label, assigned to James O'Brien and Oliver Schmidt, with weight 2 in the Security Hardening milestone.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h8", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h8.py", "app_base_port": 8050}}}
+{"query_id": "infinity-paypal-my-wallet-task_h20", "dataset": "webarena-infinity", "query": "Make a $200 payment on PayPal Credit and change autopay to pay the full balance.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h20", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h20.py", "app_base_port": 8100}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h52", "dataset": "webarena-infinity", "query": "Create a new board called 'Performance Tracker' with lists for the priority::critical, priority::high, and priority::medium labels. Then add the 'priority::high' label to every open issue in the v4.1 milestone that has the 'performance' label.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h52", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h52.py", "app_base_port": 8050}}}
+{"query_id": "infinity-paypal-my-wallet-task_h80", "dataset": "webarena-infinity", "query": "Save all available Food & Drink offers, buy a $25 DoorDash gift card for yourself, and switch currency conversion to use my card issuer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h80", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h80.py", "app_base_port": 8100}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h50", "dataset": "webarena-infinity", "query": "Add the Emergency label to every contact who is currently listed as a delegate (active, pending, or expired). Then remove all delegates whose status is not 'active'.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h50", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h50.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-clinical-records-task_h14", "dataset": "webarena-infinity", "query": "Add the tag 'Flu-Season' to every patient whose primary provider is Dr. Sarah Chen.", "graders": ["infinity_state"], "start_url": "http://localhost:8000", "metadata": {"original_task_id": "elation-clinical-records-task_h14", "website": "elation-clinical-records", "category": "webarena-infinity", "additional": {"app_name": "elation-clinical-records", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8000}}}
+{"query_id": "infinity-figma-text-and-typography-task_h7", "dataset": "webarena-infinity", "query": "Remove all list formatting from every layer.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h7", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h7.py", "app_base_port": 8040}}}
+{"query_id": "infinity-paypal-my-wallet-task_h26", "dataset": "webarena-infinity", "query": "Send a $50 Amazon gift card to sarah.chen@email.com with 'Thank you!' as the message, and save the Amazon cashback offer.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h26", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h26.py", "app_base_port": 8100}}}
+{"query_id": "infinity-handshake-career-exploration-task_h97", "dataset": "webarena-infinity", "query": "Find the single most helpful answer across all Q&A questions and mark it helpful. Then find the most-viewed question and submit your own answer to it.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h97", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h97.py", "app_base_port": 8080}}}
+{"query_id": "infinity-figma-slides-task_h79", "dataset": "webarena-infinity", "query": "In the adoption table, find the feature with the highest Target Q4 percentage. In the competitive table, change DesignCraft's entry for that same feature to 'Market Leader'. Then update that feature's Target Q4 to '95%'.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h79", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8030}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h41", "dataset": "webarena-infinity", "query": "For every open issue in the v4.2 - Security Hardening milestone: if it is already confidential, set its health status to 'at risk'. If it is not confidential, make it confidential and set its health status to 'needs attention'.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h41", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h41.py", "app_base_port": 8050}}}
+{"query_id": "infinity-handshake-career-exploration-task_h90", "dataset": "webarena-infinity", "query": "A student in the feed mentioned attending the NSBE conference. That student also answered a Q&A question about diversity programs in tech. Submit your own answer to that same question sharing your experience, then bookmark that student's feed post.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h90", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h90.py", "app_base_port": 8080}}}
+{"query_id": "infinity-elation-prescriptions-task_h30", "dataset": "webarena-infinity", "query": "The patient has three temporary medications. Discontinue the corticosteroid taper and the penicillin antibiotic \u2014 the patient completed both courses. Move the remaining temporary medication to permanent Rx.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h30", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h30.py", "app_base_port": 8020}}}
+{"query_id": "infinity-linear-account-settings-task_h19", "dataset": "webarena-infinity", "query": "Turn off all desktop application settings: open in desktop app, notification badge, and spell check.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h19", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h19.py", "app_base_port": 8090}}}
+{"query_id": "infinity-elation-prescriptions-task_h39", "dataset": "webarena-infinity", "query": "Change the default pharmacy to Express Scripts Mail Pharmacy for mail-order prescriptions. Then document that the patient takes Magnesium Citrate 400mg tablet as an OTC supplement \u2014 once daily at bedtime, 30-day supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h39", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h39.py", "app_base_port": 8020}}}
+{"query_id": "infinity-handshake-career-exploration-task_h136", "dataset": "webarena-infinity", "query": "Your earliest completed appointment was a specific type. Schedule a follow-up appointment of the same category and type with the same staff member, for March 28, 2026 at 9:00 AM, in person.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h136", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h136.py", "app_base_port": 8080}}}
+{"query_id": "infinity-handshake-career-exploration-task_h105", "dataset": "webarena-infinity", "query": "Find the second-most-viewed question in Q&A. It has two answers \u2014 mark the one with fewer helpful votes as helpful.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h105", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h105.py", "app_base_port": 8080}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h22", "dataset": "webarena-infinity", "query": "The Engineering Manager at TechCorp is listed as one of your delegates. Remove her delegation and unstar her contact.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h22", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h22.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-patient-communication-task_h9", "dataset": "webarena-infinity", "query": "Acknowledge all unacknowledged reminders in the system.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h9", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h9.py", "app_base_port": 8010}}}
+{"query_id": "infinity-superhuman-general-task_h1", "dataset": "webarena-infinity", "query": "Label the FinancePlus partnership email and the QuantumLab prototype email as 'Clients'.", "graders": ["infinity_state"], "start_url": "http://localhost:8110", "metadata": {"original_task_id": "superhuman-general-task_h1", "website": "superhuman-general", "category": "webarena-infinity", "additional": {"app_name": "superhuman-general", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8110}}}
+{"query_id": "infinity-xero-invoicing-task_h79", "dataset": "webarena-infinity", "query": "Change the invoice prefix to 'AUS-' and the next number to 100, then create a new invoice for CloudNine Analytics for 8 hours of UI/UX design work.", "graders": ["infinity_state"], "start_url": "http://localhost:8120", "metadata": {"original_task_id": "xero-invoicing-task_h79", "website": "xero-invoicing", "category": "webarena-infinity", "additional": {"app_name": "xero-invoicing", "difficulty": "hard", "verifier_path": "real-tasks/task_h79.py", "app_base_port": 8120}}}
+{"query_id": "infinity-figma-slides-task_h16", "dataset": "webarena-infinity", "query": "Enable slide numbers on every slide using the 'with total' format and change the aspect ratio to 4:3.", "graders": ["infinity_state"], "start_url": "http://localhost:8030", "metadata": {"original_task_id": "figma-slides-task_h16", "website": "figma-slides", "category": "webarena-infinity", "additional": {"app_name": "figma-slides", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8030}}}
+{"query_id": "infinity-linear-account-settings-task_h16", "dataset": "webarena-infinity", "query": "Revoke all API keys that have an expiration date.", "graders": ["infinity_state"], "start_url": "http://localhost:8090", "metadata": {"original_task_id": "linear-account-settings-task_h16", "website": "linear-account-settings", "category": "webarena-infinity", "additional": {"app_name": "linear-account-settings", "difficulty": "hard", "verifier_path": "real-tasks/task_h16.py", "app_base_port": 8090}}}
+{"query_id": "infinity-elation-prescriptions-task_h2", "dataset": "webarena-infinity", "query": "Prescribe Buspirone 10mg for the patient's anxiety \u2014 once daily in the morning, qty 30, 5 refills. Send it to the same pharmacy that fills his Sertraline.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h2", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h2.py", "app_base_port": 8020}}}
+{"query_id": "infinity-handshake-career-exploration-task_h1", "dataset": "webarena-infinity", "query": "Follow all consulting firms on Handshake.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h1", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h1.py", "app_base_port": 8080}}}
+{"query_id": "infinity-handshake-career-exploration-task_h141", "dataset": "webarena-infinity", "query": "Some of your saved jobs are from employers you haven't followed yet. Find and follow each of those employers.", "graders": ["infinity_state"], "start_url": "http://localhost:8080", "metadata": {"original_task_id": "handshake-career-exploration-task_h141", "website": "handshake-career-exploration", "category": "webarena-infinity", "additional": {"app_name": "handshake-career-exploration", "difficulty": "hard", "verifier_path": "real-tasks/task_h141.py", "app_base_port": 8080}}}
+{"query_id": "infinity-figma-text-and-typography-task_h74", "dataset": "webarena-infinity", "query": "Set the spelling language to Japanese, the big nudge amount to 50, and the default horizontal alignment to right.", "graders": ["infinity_state"], "start_url": "http://localhost:8040", "metadata": {"original_task_id": "figma-text-and-typography-task_h74", "website": "figma-text-and-typography", "category": "webarena-infinity", "additional": {"app_name": "figma-text-and-typography", "difficulty": "hard", "verifier_path": "real-tasks/task_h74.py", "app_base_port": 8040}}}
+{"query_id": "infinity-elation-patient-communication-task_h63", "dataset": "webarena-infinity", "query": "Check the visit summaries to find the patient whose BNP level improved. Reply to their most recent message confirming they can resume light activity, then update their emergency contact's phone number to (650) 555-0001.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h63", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h63.py", "app_base_port": 8010}}}
+{"query_id": "infinity-elation-patient-communication-task_h14", "dataset": "webarena-infinity", "query": "Change Dr. Torres's notification timeframe to 'Do not notify me' and remove Dr. Torres from Dr. Chen's General Question routing.", "graders": ["infinity_state"], "start_url": "http://localhost:8010", "metadata": {"original_task_id": "elation-patient-communication-task_h14", "website": "elation-patient-communication", "category": "webarena-infinity", "additional": {"app_name": "elation-patient-communication", "difficulty": "hard", "verifier_path": "real-tasks/task_h14.py", "app_base_port": 8010}}}
+{"query_id": "infinity-gitlab-plan-and-track-task_h67", "dataset": "webarena-infinity", "query": "Delete all time entries from the GraphQL gateway issue, add a single new entry of 16 hours with summary 'Complete rewrite estimate', and set its time estimate to 40 hours.", "graders": ["infinity_state"], "start_url": "http://localhost:8050", "metadata": {"original_task_id": "gitlab-plan-and-track-task_h67", "website": "gitlab-plan-and-track", "category": "webarena-infinity", "additional": {"app_name": "gitlab-plan-and-track", "difficulty": "hard", "verifier_path": "real-tasks/task_h67.py", "app_base_port": 8050}}}
+{"query_id": "infinity-gmail-accounts-and-contacts-task_h73", "dataset": "webarena-infinity", "query": "Among the individual people in your other contacts (those with a first and last name), find the one who was saved most recently. Move them to your main contacts, set their company to 'Salesforce', job title to 'Account Executive', and add the Work label.", "graders": ["infinity_state"], "start_url": "http://localhost:8070", "metadata": {"original_task_id": "gmail-accounts-and-contacts-task_h73", "website": "gmail-accounts-and-contacts", "category": "webarena-infinity", "additional": {"app_name": "gmail-accounts-and-contacts", "difficulty": "hard", "verifier_path": "real-tasks/task_h73.py", "app_base_port": 8070}}}
+{"query_id": "infinity-elation-prescriptions-task_h4", "dataset": "webarena-infinity", "query": "Run a medication reconciliation and mark the Calcium+D3 supplement for discontinuation during the review.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h4", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h4.py", "app_base_port": 8020}}}
+{"query_id": "infinity-elation-prescriptions-task_h47", "dataset": "webarena-infinity", "query": "The patient's SSRI is currently dispensed at a different pharmacy than most of his other medications. Prescribe a refill of the same SSRI at the same dose and sig, but send it to CVS #4521 instead \u2014 qty 30, 5 refills, 30 days supply.", "graders": ["infinity_state"], "start_url": "http://localhost:8020", "metadata": {"original_task_id": "elation-prescriptions-task_h47", "website": "elation-prescriptions", "category": "webarena-infinity", "additional": {"app_name": "elation-prescriptions", "difficulty": "hard", "verifier_path": "real-tasks/task_h47.py", "app_base_port": 8020}}}
+{"query_id": "infinity-paypal-my-wallet-task_h89", "dataset": "webarena-infinity", "query": "If your USD PayPal balance is above $2,500, convert $500 to Japanese Yen. If it is $2,500 or below, first add $500 from your Chase bank account, then convert $500 to JPY. Either way, set the debit card cash back category to Fuel.", "graders": ["infinity_state"], "start_url": "http://localhost:8100", "metadata": {"original_task_id": "paypal-my-wallet-task_h89", "website": "paypal-my-wallet", "category": "webarena-infinity", "additional": {"app_name": "paypal-my-wallet", "difficulty": "hard", "verifier_path": "real-tasks/task_h89.py", "app_base_port": 8100}}}
diff --git a/packages/browseros-agent/apps/eval/package.json b/packages/browseros-agent/apps/eval/package.json
index b7abbfe34..8d6608f48 100644
--- a/packages/browseros-agent/apps/eval/package.json
+++ b/packages/browseros-agent/apps/eval/package.json
@@ -12,8 +12,6 @@
"@aws-sdk/client-s3": "^3.1014.0",
"@browseros/server": "workspace:*",
"@browseros/shared": "workspace:*",
- "@google/gemini-cli-core": "^0.16.0",
- "@google/genai": "1.30.0",
"@modelcontextprotocol/sdk": "^1.25.2",
"ai": "^6.0.94",
"hono": "^4.6.0",
diff --git a/packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py b/packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
new file mode 100644
index 000000000..a2c6e3653
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/scripts/agisdk-evaluate.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+AGI SDK evaluation helper for BrowserOS eval framework.
+
+Reads JSON from stdin with task_id and env_state, runs the agisdk
+evaluator, and outputs the result as JSON to stdout.
+
+Input format:
+    {"task_id": "dashdish-1", "env_state": {...}, "model_response": ""}
+
+Output format:
+    {"reward": 0.0, "pass": false, "message": "...", "per_criterion": [...]}
+
+Malformed input is reported in the same JSON shape and exits with status 1.
+A missing agisdk package or an evaluator error is reported in the JSON with
+exit status 0.
+"""
+
+import contextlib
+import json
+import sys
+
+
+def emit_failure(message: str) -> None:
+    """Print a zero-reward, failed result carrying ``message`` to stdout."""
+    print(
+        json.dumps(
+            {
+                "reward": 0,
+                "pass": False,
+                "message": message,
+                "per_criterion": [],
+            }
+        )
+    )
+
+
+def main():
+    """Evaluate the task described on stdin and print one JSON result."""
+    # Report unusable input as JSON rather than dying with a bare traceback
+    # and an empty stdout; the exit status stays non-zero.
+    try:
+        data = json.loads(sys.stdin.read())
+        task_id = data["task_id"]
+        env_state = data["env_state"]
+        model_response = data.get("model_response", "")
+    except (json.JSONDecodeError, KeyError, TypeError) as e:
+        emit_failure(f"Invalid input: {e!r}")
+        sys.exit(1)
+
+    try:
+        from agisdk.REAL.browsergym.webclones.evaluate import WebCloneEvaluator
+        from agisdk.REAL.browsergym.webclones.task_config import TaskConfig
+    except ImportError:
+        emit_failure("agisdk package not installed. Run: pip install agisdk")
+        sys.exit(0)
+
+    try:
+        # Redirect stdout to stderr during evaluation — agisdk's rich logger
+        # prints directly to stdout, which would corrupt our JSON output.
+        # The context manager restores stdout on both success and failure.
+        with contextlib.redirect_stdout(sys.stderr):
+            tc = TaskConfig(task_id)
+            evaluator = WebCloneEvaluator(tc)
+            reward_val, _done, message, info = evaluator.evaluate(
+                env_state=env_state, model_response=model_response
+            )
+
+        reward_val = float(reward_val) if reward_val is not None else 0.0
+        results = info.get("results", [])
+        per_criterion = [
+            {"passed": r[0], "detail": str(r[1]) if len(r) > 1 else ""}
+            for r in results
+        ]
+
+        print(
+            json.dumps(
+                {
+                    "reward": reward_val,
+                    "pass": reward_val == 1.0,
+                    "message": str(message),
+                    "per_criterion": per_criterion,
+                }
+            )
+        )
+    except Exception as e:
+        emit_failure(f"Evaluation error: {e}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py b/packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
new file mode 100644
index 000000000..bc0f1e8cc
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/scripts/build-agisdk-dataset.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Build JSONL dataset for AGI SDK / REAL Bench evaluation.
+
+Reads task definitions from the agisdk package, filters to feasible
+action-only tasks (excludes llm_boolean evaluators), and outputs JSONL
+to stdout in the BrowserOS eval framework format.
+
+Usage:
+    python scripts/build-agisdk-dataset.py > data/agisdk-real.jsonl
+"""
+
+import json
+import sys
+
+
+def has_llm_eval(task: dict) -> bool:
+    """Report whether any entry in the task's "evals" list has type llm_boolean."""
+    for evaluator in task.get("evals", []):
+        if evaluator.get("type") == "llm_boolean":
+            return True
+    return False
+
+
+def main():
+    """Print one JSONL entry per usable task, then a summary line on stderr."""
+    try:
+        from agisdk.REAL.tasks import all_tasks
+    except ImportError:
+        print(
+            "Error: agisdk package not installed. Run: pip install agisdk",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    emitted = 0
+    infeasible = 0
+    llm_graded = 0
+
+    for task in all_tasks:
+        # A task without a "possible" key is treated as feasible.
+        if not task.get("possible", True):
+            infeasible += 1
+            continue
+        if has_llm_eval(task):
+            llm_graded += 1
+            continue
+
+        task_id = task["id"]
+        website = task.get("website", {})
+        goal = task.get("goal", "")
+        start_url = website.get("url", "")
+
+        # Both a start URL and a goal are required; such skips are warned
+        # about but are not part of the summary counts.
+        if not (start_url and goal):
+            print(f"Warning: Skipping {task_id} — missing url or goal", file=sys.stderr)
+            continue
+
+        additional = {
+            "agisdk_task_id": task_id,
+            "challenge_type": task.get("challengeType", "action"),
+            "difficulty": task.get("difficulty", "unknown"),
+            "similar_to": website.get("similarTo", ""),
+        }
+        metadata = {
+            "original_task_id": task_id,
+            "website": website.get("name", ""),
+            "category": "agisdk-real",
+            "additional": additional,
+        }
+        record = {
+            "query_id": f"agisdk-{task_id}",
+            "dataset": "agisdk-real",
+            "query": goal,
+            "graders": ["agisdk_state_diff"],
+            "start_url": start_url,
+            "metadata": metadata,
+        }
+
+        print(json.dumps(record))
+        emitted += 1
+
+    print(
+        f"Generated {emitted} tasks (skipped {infeasible} infeasible, "
+        f"{llm_graded} llm_boolean)",
+        file=sys.stderr,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py b/packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
new file mode 100644
index 000000000..3d4130b25
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/scripts/build-infinity-dataset.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""
+Dataset generator for WebArena-Infinity benchmark.
+
+Reads real-tasks.json from each app directory and outputs JSONL
+in the eval framework's TaskSchema format.
+
+Usage:
+    python build-infinity-dataset.py --apps-dir /path/to/webarena-infinity/apps
+    python build-infinity-dataset.py --apps-dir /path/to/apps --apps gmail linear --difficulty medium
+"""
+
+import argparse
+import json
+import os
+import sys
+
+
+def load_tasks(app_dir: str, app_name: str) -> list[dict]:
+    """Return the parsed contents of ``real-tasks.json`` inside ``app_dir``.
+
+    Prints a warning to stderr and returns an empty list when the file does
+    not exist. ``app_name`` is currently unused.
+    """
+    tasks_file = os.path.join(app_dir, "real-tasks.json")
+    if not os.path.exists(tasks_file):
+        print(f"Warning: No real-tasks.json found in {app_dir}", file=sys.stderr)
+        return []
+    # Decode as UTF-8 explicitly; open() otherwise uses the locale's default
+    # encoding, which is not UTF-8 on every platform.
+    with open(tasks_file, encoding="utf-8") as f:
+        return json.load(f)
+
+
+def build_task_entry(
+    app_name: str,
+    task: dict,
+    base_port: int,
+) -> dict:
+    """Build the dataset entry for one task of ``app_name``.
+
+    Missing fields fall back in order. The id comes from the id key, then
+    task_id, then the literal unknown. The query comes from query, then
+    instruction, then task, then an empty string. The verifier path comes
+    from verify, then verifier_path, then ``real-tasks/<id>.py``.
+    ``base_port`` is the localhost port used for the start URL and reset URL.
+    """
+    task_id = task.get("id", task.get("task_id", "unknown"))
+    difficulty = task.get("difficulty", "unknown")
+    query = task.get("query", task.get("instruction", task.get("task", "")))
+    verifier_path = task.get(
+        "verify",
+        task.get("verifier_path", f"real-tasks/{task_id}.py"),
+    )
+
+    return {
+        "query_id": f"infinity-{app_name}-{task_id}",
+        "dataset": "webarena-infinity",
+        "query": query,
+        "graders": ["infinity_state"],
+        "start_url": f"http://localhost:{base_port}",
+        "setup_script": f"POST http://localhost:{base_port}/api/reset",
+        "metadata": {
+            "original_task_id": f"{app_name}-{task_id}",
+            "website": app_name,
+            "category": "webarena-infinity",
+            "additional": {
+                "app_name": app_name,
+                "difficulty": difficulty,
+                "verifier_path": verifier_path,
+                "app_port": base_port,
+            },
+        },
+    }
+
+
+def main():
+    """Parse CLI arguments and print one JSONL entry per selected task."""
+    parser = argparse.ArgumentParser(
+        description="Generate JSONL dataset from WebArena-Infinity apps"
+    )
+    parser.add_argument(
+        "--apps-dir",
+        required=True,
+        help="Path to webarena-infinity/apps/ directory",
+    )
+    parser.add_argument(
+        "--apps",
+        nargs="*",
+        default=None,
+        help="Filter to specific app names (default: all)",
+    )
+    parser.add_argument(
+        "--difficulty",
+        choices=["easy", "medium", "hard"],
+        default=None,
+        help="Filter by difficulty tier",
+    )
+    parser.add_argument(
+        "--base-port",
+        type=int,
+        default=8000,
+        help="Starting port number for apps (default: 8000)",
+    )
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.apps_dir):
+        print(f"Error: {args.apps_dir} is not a directory", file=sys.stderr)
+        sys.exit(1)
+
+    app_dirs = sorted(os.listdir(args.apps_dir))
+    if args.apps:
+        app_dirs = [d for d in app_dirs if d in args.apps]
+
+    # Ports are handed out sequentially, one per app directory, in sorted
+    # order of whatever remains after the --apps filter.
+    # NOTE(review): with --apps an app's port therefore differs from the
+    # unfiltered run — confirm the app servers are started the same way.
+    port = args.base_port
+    for app_name in app_dirs:
+        app_path = os.path.join(args.apps_dir, app_name)
+        if not os.path.isdir(app_path):
+            continue
+
+        tasks = load_tasks(app_path, app_name)
+        for task in tasks:
+            difficulty = task.get("difficulty", "unknown")
+            if args.difficulty and difficulty != args.difficulty:
+                continue
+
+            entry = build_task_entry(app_name, task, port)
+            print(json.dumps(entry))
+
+        port += 1
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py b/packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
new file mode 100644
index 000000000..a041ffdc9
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/scripts/infinity-evaluate.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Evaluation helper for WebArena-Infinity verifier scripts.
+
+Reads JSON from stdin with app_server_url, verifier_path, and task_id.
+Runs the verifier against the app server and outputs a JSON result.
+
+Verifiers have the signature: verify(server_url: str) -> tuple[bool, str]
+They fetch /api/state internally and return (passed, message).
+
+Usage:
+    echo '{"app_server_url": "http://localhost:8000", "verifier_path": "/path/to/verify.py"}' | python infinity-evaluate.py
+"""
+
+import contextlib
+import importlib.util
+import json
+import sys
+import traceback
+
+
+def load_verifier(verifier_path: str):
+    """Import the Python file at ``verifier_path`` and return the module.
+
+    Raises ImportError when no spec or loader can be created for the path;
+    errors raised while executing the module propagate to the caller.
+    """
+    spec = importlib.util.spec_from_file_location("verifier", verifier_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load verifier from {verifier_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def main():
+    """Run the verifier named on stdin and print one JSON result to stdout.
+
+    Exits with status 1 when the input is invalid or the verifier cannot be
+    loaded or raises; otherwise prints the pass/reward/message object.
+    """
+    try:
+        data = json.loads(sys.stdin.read())
+    except json.JSONDecodeError as e:
+        print(json.dumps({"pass": False, "reward": 0.0, "message": f"Invalid JSON input: {e}"}))
+        sys.exit(1)
+
+    server_url = data.get("app_server_url", "")
+    verifier_path = data.get("verifier_path", "")
+
+    if not server_url or not verifier_path:
+        print(json.dumps({
+            "pass": False,
+            "reward": 0.0,
+            "message": "Missing app_server_url or verifier_path",
+        }))
+        sys.exit(1)
+
+    try:
+        # Verifier modules are external scripts: anything they print would
+        # share stdout with the JSON result and corrupt it, so route their
+        # output to stderr while they are loaded and run.
+        with contextlib.redirect_stdout(sys.stderr):
+            verifier = load_verifier(verifier_path)
+            fn = getattr(verifier, "verify", None)
+            if not callable(fn):
+                raise AttributeError(
+                    f"Verifier has no verify() function. "
+                    f"Available: {[a for a in dir(verifier) if not a.startswith('_')]}"
+                )
+
+            # Verifiers take server_url and fetch state internally
+            result = fn(server_url)
+
+        # Return is tuple[bool, str]; coerce the flag so "pass" is always a
+        # JSON boolean and the final json.dumps cannot fail on it.
+        if isinstance(result, tuple) and len(result) >= 2:
+            passed, message = bool(result[0]), str(result[1])
+        else:
+            passed, message = bool(result), str(result)
+
+    except Exception as e:
+        print(json.dumps({
+            "pass": False,
+            "reward": 0.0,
+            "message": f"Verifier error: {e}\n{traceback.format_exc()}",
+        }))
+        sys.exit(1)
+
+    print(json.dumps({
+        "pass": passed,
+        "reward": 1.0 if passed else 0.0,
+        "message": message,
+    }))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts b/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts
deleted file mode 100644
index 43edb4ab3..000000000
--- a/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts
+++ /dev/null
@@ -1,200 +0,0 @@
-/**
- * Validation script for Gemini Computer Use integration
- * Run: bun apps/eval/scripts/validate-computer-use-tools.ts
- */
-
-import { Client } from '@modelcontextprotocol/sdk/client/index.js'
-import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
-
-const MCP_URL = process.env.MCP_URL || 'http://127.0.0.1:9105/mcp'
-
-interface McpToolResult {
- content: Array<{
- type: string
- text?: string
- data?: string
- mimeType?: string
- }>
- isError?: boolean
-}
-
-async function callMcpTool(
- serverUrl: string,
- name: string,
- args: Record = {},
-): Promise {
- const client = new Client({ name: 'validate-computer-use', version: '1.0.0' })
- const transport = new StreamableHTTPClientTransport(new URL(serverUrl), {
- requestInit: { headers: { 'X-BrowserOS-Source': 'validation' } },
- })
-
- try {
- await client.connect(transport)
- return (await client.callTool({ name, arguments: args })) as McpToolResult
- } finally {
- try {
- await transport.close()
- } catch {}
- }
-}
-
-async function validateTools() {
- console.log('🔍 Validating MCP tools for Gemini Computer Use integration\n')
- console.log(`MCP URL: ${MCP_URL}\n`)
-
- // Get active tab first
- console.log('1. Getting active tab...')
- const tabResult = await callMcpTool(MCP_URL, 'browser_get_active_tab', {})
- if (tabResult.isError) {
- console.error('❌ Failed to get active tab:', tabResult.content)
- process.exit(1)
- }
- const tabText = tabResult.content.find((c) => c.type === 'text')?.text ?? ''
- const tabIdMatch = tabText.match(/ID: (\d+)/)
- const tabId = tabIdMatch ? parseInt(tabIdMatch[1], 10) : 1
- console.log(` ✅ Active tab ID: ${tabId}\n`)
-
- // Validate each tool needed for Computer Use
- const toolTests = [
- {
- name: 'browser_get_screenshot',
- args: { tabId, size: 'medium' },
- description: 'Screenshot capture',
- validate: (r: McpToolResult) => r.content.some((c) => c.type === 'image'),
- },
- {
- name: 'browser_click_coordinates',
- args: { tabId, x: 100, y: 100 },
- description: 'Click at coordinates',
- validate: (r: McpToolResult) => !r.isError,
- },
- {
- name: 'browser_type_at_coordinates',
- args: { tabId, x: 100, y: 100, text: 'test' },
- description: 'Type at coordinates',
- validate: (r: McpToolResult) => !r.isError,
- },
- {
- name: 'browser_scroll_down',
- args: { tabId },
- description: 'Scroll down',
- validate: (r: McpToolResult) => !r.isError,
- },
- {
- name: 'browser_scroll_up',
- args: { tabId },
- description: 'Scroll up',
- validate: (r: McpToolResult) => !r.isError,
- },
- {
- name: 'browser_send_keys',
- args: { tabId, key: 'Enter' },
- description: 'Send keyboard key',
- validate: (r: McpToolResult) => !r.isError,
- },
- {
- name: 'browser_execute_javascript',
- args: { tabId, code: 'window.location.href' },
- description: 'Execute JavaScript (for go_back/forward workaround)',
- validate: (r: McpToolResult) => !r.isError,
- },
- ]
-
- let passed = 0
- let failed = 0
-
- for (const test of toolTests) {
- process.stdout.write(`2. Testing ${test.name} (${test.description})... `)
- try {
- const result = await callMcpTool(MCP_URL, test.name, test.args)
- if (test.validate(result)) {
- console.log('✅')
- passed++
- } else {
- console.log('❌ Validation failed')
- console.log(' Result:', JSON.stringify(result, null, 2))
- failed++
- }
- } catch (err) {
- console.log('❌ Error:', err instanceof Error ? err.message : err)
- failed++
- }
- }
-
- console.log(`\n${'='.repeat(50)}`)
- console.log(`Results: ${passed} passed, ${failed} failed`)
- console.log('='.repeat(50))
-
- if (failed === 0) {
- console.log(
- '\n✅ All tools validated! Gemini Computer Use integration should work.',
- )
- console.log('\nGaps to address with workarounds:')
- console.log(' - key_combination: Use browser_execute_javascript')
- console.log(
- ' - go_back/go_forward: Use browser_execute_javascript with history.back()/forward()',
- )
- console.log(
- ' - type_text_at press_enter: Chain browser_send_keys after typing',
- )
- } else {
- console.log('\n⚠️ Some tools failed. Check your server is running.')
- }
-}
-
-// Validate Gemini API access
-async function validateGeminiApi() {
- const apiKey = process.env.GOOGLE_AI_API_KEY || process.env.GEMINI_API_KEY
- if (!apiKey) {
- console.log('\n⚠️ GOOGLE_AI_API_KEY not set - skipping API validation')
- return
- }
-
- console.log('\n3. Validating Gemini Computer Use API access...')
-
- const MODEL = 'gemini-2.5-computer-use-preview-10-2025'
- const url = `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:generateContent`
-
- // Minimal test - just check if model is accessible
- const testPayload = {
- contents: [{ role: 'user', parts: [{ text: 'test' }] }],
- }
-
- try {
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- 'x-goog-api-key': apiKey,
- },
- body: JSON.stringify(testPayload),
- })
-
- if (response.ok) {
- console.log(' ✅ Gemini Computer Use API is accessible')
- } else {
- const error = await response.json()
- console.log(
- ' ❌ API error:',
- error.error?.message || response.statusText,
- )
- }
- } catch (err) {
- console.log(
- ' ❌ Network error:',
- err instanceof Error ? err.message : err,
- )
- }
-}
-
-async function main() {
- try {
- await validateTools()
- await validateGeminiApi()
- } catch (err) {
- console.error('Validation failed:', err)
- process.exit(1)
- }
-}
-
-main()
diff --git a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
index ec67ea9a4..af99b35cb 100644
--- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
+++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts
@@ -59,6 +59,8 @@ interface RunSummary {
}
const PASS_FAIL_GRADER_ORDER = [
+ 'agisdk_state_diff',
+ 'infinity_state',
'performance_grader',
'webvoyager_grader',
'fara_combined',
@@ -332,9 +334,7 @@ const html = `
? 'Orch-Exec'
: r.agentType === 'single'
? 'Tool Loop'
- : r.agentType === 'gemini-computer-use'
- ? 'Gemini CU'
- : r.agentType || '—'
+ : r.agentType || '—'
return `
${escHtml(r.date)}
${escHtml(r.runId)}
@@ -383,7 +383,6 @@ const html = `
var latest = runs[runs.length - 1];
var archLabel = latest.agentType === 'orchestrator-executor' ? 'Orchestrator-Executor'
: latest.agentType === 'single' ? 'Single Agent (Tool Loop)'
- : latest.agentType === 'gemini-computer-use' ? 'Gemini Computer Use'
: latest.agentType || 'Unknown';
var scoreColor = latest.avgScore >= 75 ? '#3fb950' : latest.avgScore >= 40 ? '#f0883e' : '#f85149';
el.innerHTML =
diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts
deleted file mode 100644
index f2200861d..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts
+++ /dev/null
@@ -1,643 +0,0 @@
-/**
- * Maps Gemini Computer Use actions to MCP tool calls
- *
- * Coordinate System:
- * - Screenshots captured with size='large' (1028px width, aspect ratio preserved)
- * - Gemini outputs normalized coordinates (0-999) relative to the screenshot
- * - We convert these to actual viewport coordinates by:
- * 1. Getting the real viewport dimensions via JavaScript
- * 2. Scaling normalized coords to actual viewport pixels
- */
-
-import { Client } from '@modelcontextprotocol/sdk/client/index.js'
-import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
-import type { ActionContext, ComputerUseAction, ScreenSize } from './types'
-import { DEFAULTS } from './types'
-
-interface McpToolResult {
- content: Array<{
- type: string
- text?: string
- data?: string
- mimeType?: string
- }>
- isError?: boolean
-}
-
-const MCP_TIMEOUT_MS = 30000
-
-export class ActionMapper {
- private ctx: ActionContext
- private cachedViewport: ScreenSize | null = null
-
- constructor(ctx: ActionContext) {
- this.ctx = ctx
- }
-
- // Store debug info about viewport detection for inclusion in responses
- private viewportDebugInfo: string = ''
-
- /**
- * Get the actual browser viewport size via JavaScript
- * Caches the result to avoid repeated calls
- * Also stores debug info for troubleshooting
- */
- async getViewportSize(): Promise {
- if (this.cachedViewport) {
- return this.cachedViewport
- }
-
- try {
- const result = await this.callMcp('browser_execute_javascript', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- code: '[window.innerWidth, window.innerHeight]',
- })
-
- const textContent =
- result.content.find((c) => c.type === 'text')?.text ?? ''
-
- // Check for error in result
- if (result.isError) {
- this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- return this.ctx.screenSize
- }
-
- // Response format can be multiline:
- // "Result: [1440, 900]" or "Result: [\n 1200,\n 712\n]"
- const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s)
- if (arrayMatch) {
- const width = parseInt(arrayMatch[1], 10)
- const height = parseInt(arrayMatch[2], 10)
- if (width > 0 && height > 0) {
- this.cachedViewport = { width, height }
- this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw response: "${textContent.substring(0, 100)}")`
- console.log(this.viewportDebugInfo)
- return this.cachedViewport
- } else {
- this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
- } else {
- this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse response: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
- } catch (error) {
- const errMsg = error instanceof Error ? error.message : String(error)
- this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
-
- // Fallback to configured screenSize
- return this.ctx.screenSize
- }
-
- /**
- * Get the current viewport debug info
- */
- getViewportDebugInfo(): string {
- return this.viewportDebugInfo
- }
-
- /**
- * Clear cached viewport (call when tab/window changes or before new task)
- */
- clearViewportCache(): void {
- this.cachedViewport = null
- }
-
- /**
- * Scale normalized coordinate (0-999) to actual viewport pixel value
- */
- private async scaleCoordinates(
- normalizedX: number,
- normalizedY: number,
- ): Promise<{ x: number; y: number }> {
- const viewport = await this.getViewportSize()
- return {
- x: Math.round((normalizedX / 1000) * viewport.width),
- y: Math.round((normalizedY / 1000) * viewport.height),
- }
- }
-
- /**
- * Call an MCP tool
- */
- private async callMcp(
- name: string,
- args: Record = {},
- ): Promise {
- const client = new Client({
- name: 'gemini-computer-use',
- version: '1.0.0',
- })
-
- const transport = new StreamableHTTPClientTransport(
- new URL(this.ctx.mcpUrl),
- {
- requestInit: {
- headers: { 'X-BrowserOS-Source': 'gemini-computer-use' },
- },
- },
- )
-
- try {
- await client.connect(transport)
-
- const toolCallPromise = client.callTool({ name, arguments: args })
- let timeoutId: ReturnType | null = null
- const timeoutPromise = new Promise((_, reject) => {
- timeoutId = setTimeout(
- () =>
- reject(
- new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`),
- ),
- MCP_TIMEOUT_MS,
- )
- })
-
- try {
- return (await Promise.race([
- toolCallPromise,
- timeoutPromise,
- ])) as McpToolResult
- } finally {
- if (timeoutId) clearTimeout(timeoutId)
- }
- } finally {
- try {
- await transport.close()
- } catch {
- // Ignore close errors
- }
- }
- }
-
- /**
- * Execute a Computer Use action by mapping to MCP tools
- */
- async execute(
- action: ComputerUseAction,
- ): Promise<{ success: boolean; message: string }> {
- const { tabId, windowId } = this.ctx
-
- try {
- switch (action.name) {
- case 'click_at': {
- const viewport = await this.getViewportSize()
- const { x, y } = await this.scaleCoordinates(
- action.args.x,
- action.args.y,
- )
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
- // Return original coordinates + debug info for troubleshooting
- // Debug info shows: model input → viewport coords, viewport size, and any errors
- const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
- return {
- success: true,
- message: `Clicked at (${action.args.x}, ${action.args.y}). ${debugInfo}`,
- }
- }
-
- case 'type_text_at': {
- const viewport = await this.getViewportSize()
- const { x, y } = await this.scaleCoordinates(
- action.args.x,
- action.args.y,
- )
- const { text, press_enter, clear_before_typing } = action.args
-
- // Clear field first if requested (select all + delete)
- if (clear_before_typing) {
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `document.execCommand('selectAll')`,
- })
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Delete',
- })
- }
-
- // Type the text
- await this.callMcp('browser_type_at_coordinates', {
- tabId,
- windowId,
- x,
- y,
- text,
- })
-
- // Press Enter if requested
- if (press_enter) {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Enter',
- })
- }
-
- // Return original coordinates + debug info
- const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
- return {
- success: true,
- message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}" at (${action.args.x}, ${action.args.y}). ${debugInfo}`,
- }
- }
-
- case 'navigate': {
- await this.callMcp('browser_navigate', {
- tabId,
- windowId,
- url: action.args.url,
- })
- return { success: true, message: `Navigated to ${action.args.url}` }
- }
-
- case 'scroll_document': {
- const { direction } = action.args
- if (direction === 'up') {
- await this.callMcp('browser_scroll_up', { tabId, windowId })
- } else if (direction === 'down') {
- await this.callMcp('browser_scroll_down', { tabId, windowId })
- } else {
- // Left/right scroll via JavaScript
- const scrollCode =
- direction === 'left'
- ? 'window.scrollBy(-window.innerWidth, 0)'
- : 'window.scrollBy(window.innerWidth, 0)'
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: scrollCode,
- })
- }
- return { success: true, message: `Scrolled ${direction}` }
- }
-
- case 'scroll_at': {
- const { x, y } = await this.scaleCoordinates(
- action.args.x,
- action.args.y,
- )
- const { direction, magnitude = 500 } = action.args
-
- // Click at position first to focus element
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
-
- // Scale magnitude from 0-999 to actual pixels
- const viewport = await this.getViewportSize()
- const scrollAmount = Math.round((magnitude / 1000) * viewport.height)
-
- // Use JavaScript scrollBy for precise control with magnitude
- const scrollCode =
- direction === 'up'
- ? `window.scrollBy(0, -${scrollAmount})`
- : direction === 'down'
- ? `window.scrollBy(0, ${scrollAmount})`
- : direction === 'left'
- ? `window.scrollBy(-${scrollAmount}, 0)`
- : `window.scrollBy(${scrollAmount}, 0)`
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: scrollCode,
- })
-
- // Return original coordinates to avoid confusing the model
- return {
- success: true,
- message: `Scrolled ${direction} at (${action.args.x}, ${action.args.y})`,
- }
- }
-
- case 'key_combination': {
- const { keys } = action.args
-
- // Map common key combinations to JavaScript or available keys
- const keyMap: Record Promise> = {
- 'Control+a': async () => {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `document.execCommand('selectAll')`,
- })
- },
- 'Control+c': async () => {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `document.execCommand('copy')`,
- })
- },
- 'Control+v': async () => {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `document.execCommand('paste')`,
- })
- },
- 'Control+z': async () => {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `document.execCommand('undo')`,
- })
- },
- Enter: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Enter',
- })
- },
- Escape: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Escape',
- })
- },
- Tab: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Tab',
- })
- },
- Backspace: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Backspace',
- })
- },
- Delete: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Delete',
- })
- },
- ArrowUp: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'ArrowUp',
- })
- },
- ArrowDown: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'ArrowDown',
- })
- },
- ArrowLeft: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'ArrowLeft',
- })
- },
- ArrowRight: async () => {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'ArrowRight',
- })
- },
- }
-
- // Normalize key string (case insensitive for modifiers)
- const normalizedKeys = keys
- .replace(/ctrl/i, 'Control')
- .replace(/cmd/i, 'Control')
- const handler = keyMap[normalizedKeys] || keyMap[keys]
-
- if (handler) {
- await handler()
- } else {
- const keyName = keys.split('+').pop() || ''
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const event = new KeyboardEvent('keydown', {
- key: ${JSON.stringify(keyName)},
- ctrlKey: ${keys.toLowerCase().includes('control')},
- shiftKey: ${keys.toLowerCase().includes('shift')},
- altKey: ${keys.toLowerCase().includes('alt')},
- metaKey: ${keys.toLowerCase().includes('meta')},
- bubbles: true
- });
- document.activeElement?.dispatchEvent(event);
- `,
- })
- }
-
- return { success: true, message: `Pressed ${keys}` }
- }
-
- case 'hover_at': {
- const { x, y } = await this.scaleCoordinates(
- action.args.x,
- action.args.y,
- )
-
- // Simulate hover via JavaScript mouseover event
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const elem = document.elementFromPoint(${x}, ${y});
- if (elem) {
- const event = new MouseEvent('mouseover', { bubbles: true, clientX: ${x}, clientY: ${y} });
- elem.dispatchEvent(event);
- }
- `,
- })
-
- // Return original coordinates to avoid confusing the model
- return {
- success: true,
- message: `Hovered at (${action.args.x}, ${action.args.y})`,
- }
- }
-
- case 'go_back': {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: 'history.back()',
- })
- return { success: true, message: 'Navigated back' }
- }
-
- case 'go_forward': {
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: 'history.forward()',
- })
- return { success: true, message: 'Navigated forward' }
- }
-
- case 'wait_5_seconds': {
- await new Promise((resolve) => setTimeout(resolve, 5000))
- return { success: true, message: 'Waited 5 seconds' }
- }
-
- case 'drag_and_drop': {
- const start = await this.scaleCoordinates(
- action.args.x,
- action.args.y,
- )
- const end = await this.scaleCoordinates(
- action.args.destination_x,
- action.args.destination_y,
- )
-
- // Simulate drag and drop via JavaScript
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const startElem = document.elementFromPoint(${start.x}, ${start.y});
- const endElem = document.elementFromPoint(${end.x}, ${end.y});
- if (startElem && endElem) {
- const dragStart = new DragEvent('dragstart', { bubbles: true, clientX: ${start.x}, clientY: ${start.y} });
- const drop = new DragEvent('drop', { bubbles: true, clientX: ${end.x}, clientY: ${end.y} });
- const dragEnd = new DragEvent('dragend', { bubbles: true });
- startElem.dispatchEvent(dragStart);
- endElem.dispatchEvent(drop);
- startElem.dispatchEvent(dragEnd);
- }
- `,
- })
-
- // Return original coordinates to avoid confusing the model
- return {
- success: true,
- message: `Dragged from (${action.args.x}, ${action.args.y}) to (${action.args.destination_x}, ${action.args.destination_y})`,
- }
- }
-
- default: {
- const _exhaustive: never = action
- return {
- success: false,
- message: `Unknown action: ${JSON.stringify(action)}`,
- }
- }
- }
- } catch (error) {
- const message = error instanceof Error ? error.message : String(error)
- return { success: false, message: `Action failed: ${message}` }
- }
- }
-
- /**
- * Capture a screenshot via MCP with retry logic
- *
- * Uses Gemini's recommended screenshot size (1440x900) for optimal model performance.
- * Now that viewport detection is working correctly, the coordinate mapping will be accurate.
- */
- async captureScreenshot(retries = 2): Promise {
- const { width, height } = DEFAULTS.screenshotSize
-
- for (let attempt = 0; attempt <= retries; attempt++) {
- try {
- const result = await this.callMcp('browser_get_screenshot', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- width,
- height,
- showHighlights: false,
- })
-
- if (result.isError) {
- const errorText =
- result.content?.find((c) => c.type === 'text')?.text ??
- 'Unknown error'
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`,
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- console.warn('Screenshot capture failed:', errorText)
- return null
- }
-
- const imageContent = result.content.find((c) => c.type === 'image')
- if (imageContent?.data) {
- return imageContent.data
- }
-
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1}: No image data, retrying...`,
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- return null
- } catch (error) {
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1} error:`,
- error,
- 'retrying...',
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- console.warn('Screenshot capture error:', error)
- return null
- }
- }
- return null
- }
-
- /**
- * Get current page URL via MCP
- */
- async getCurrentUrl(): Promise {
- try {
- const result = await this.callMcp('browser_execute_javascript', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- code: 'window.location.href',
- })
-
- const textContent =
- result.content.find((c) => c.type === 'text')?.text ?? ''
- // Extract URL from result text
- const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/)
- return urlMatch?.[1] ?? 'unknown'
- } catch {
- return 'unknown'
- }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts
deleted file mode 100644
index 04dee1fb7..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts
+++ /dev/null
@@ -1,327 +0,0 @@
-/**
- * Gemini Computer Use Agent
- * Implements the agent loop that calls Gemini API and executes actions
- * Uses UIMessageStreamEvent format for logging compatibility
- */
-
-import { randomUUID } from 'node:crypto'
-import { ActionMapper } from './action-mapper'
-import {
- type ComputerUseAction,
- DEFAULTS,
- type GeminiComputerUseAgentConfig,
- type GeminiContent,
- type GeminiPart,
- type GeminiResponse,
-} from './types'
-
-const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta'
-
-interface StreamWriter {
- write: (data: string) => Promise
-}
-
-type ActionHook = (
- action: ComputerUseAction,
- result: { success: boolean; message: string },
-) => Promise
-
-/**
- * Emit SSE-formatted UIMessageStreamEvent
- */
-function emitEvent(
- writer: StreamWriter,
- event: Record,
-): Promise {
- return writer.write(`data: ${JSON.stringify(event)}\n\n`)
-}
-
-export class GeminiComputerUseAgent {
- private config: GeminiComputerUseAgentConfig
- private actionMapper: ActionMapper
- private actionHook?: ActionHook
- private contents: GeminiContent[] = []
-
- constructor(config: GeminiComputerUseAgentConfig) {
- this.config = config
- this.actionMapper = new ActionMapper({
- mcpUrl: config.mcpUrl,
- tabId: config.tabId,
- windowId: config.windowId,
- screenSize: config.screenSize,
- })
- }
-
- /**
- * Set a hook to be called after each action execution
- */
- setActionHook(hook: ActionHook): void {
- this.actionHook = hook
- }
-
- /**
- * Call the Gemini Computer Use API
- */
- private async callGeminiApi(): Promise {
- const url = `${GEMINI_API_BASE}/models/${DEFAULTS.model}:generateContent`
-
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- 'x-goog-api-key': this.config.apiKey,
- },
- body: JSON.stringify({
- contents: this.contents,
- tools: [
- {
- computer_use: {
- environment: 'ENVIRONMENT_BROWSER',
- },
- },
- ],
- }),
- })
-
- if (!response.ok) {
- const errorBody = await response.text()
- throw new Error(
- `Gemini API error: ${response.status} ${response.statusText} - ${errorBody}`,
- )
- }
-
- return response.json()
- }
-
- /**
- * Extract function calls from a Gemini response
- */
- private extractFunctionCalls(response: GeminiResponse): ComputerUseAction[] {
- const candidate = response.candidates?.[0]
- if (!candidate?.content?.parts) {
- return []
- }
-
- const actions: ComputerUseAction[] = []
- for (const part of candidate.content.parts) {
- if (part.functionCall) {
- const { name, args } = part.functionCall
- // Construct action object
- actions.push({ name, args: args ?? {} } as ComputerUseAction)
- }
- }
-
- return actions
- }
-
- /**
- * Extract text response from Gemini response
- */
- private extractTextResponse(response: GeminiResponse): string | null {
- const candidate = response.candidates?.[0]
- if (!candidate?.content?.parts) {
- return null
- }
-
- const textParts = candidate.content.parts
- .map((p) => p.text)
- .filter((text): text is string => text !== undefined)
- return textParts.length > 0 ? textParts.join('\n') : null
- }
-
- /**
- * Build function response parts for the next turn
- */
- private buildFunctionResponses(
- actions: ComputerUseAction[],
- currentUrl: string,
- screenshotBase64: string | null,
- ): GeminiPart[] {
- const parts: GeminiPart[] = []
-
- for (const action of actions) {
- parts.push({
- functionResponse: {
- name: action.name,
- response: { url: currentUrl },
- },
- })
- }
-
- // Add screenshot as inline data
- if (screenshotBase64) {
- parts.push({
- inlineData: {
- mimeType: 'image/png',
- data: screenshotBase64,
- },
- })
- }
-
- return parts
- }
-
- /**
- * Execute the agent loop
- */
- async execute(
- query: string,
- streamWriter: StreamWriter,
- signal: AbortSignal,
- ): Promise<{ finalText: string | null; totalActions: number }> {
- let totalActions = 0
- let finalText: string | null = null
-
- // Wait for page to stabilize before first screenshot
- await new Promise((resolve) => setTimeout(resolve, 2000))
-
- // Capture initial screenshot with retries
- let initialScreenshot: string | null = null
- for (let attempt = 1; attempt <= 3; attempt++) {
- initialScreenshot = await this.actionMapper.captureScreenshot()
- if (initialScreenshot) break
- console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`)
- await new Promise((resolve) => setTimeout(resolve, 1000))
- }
-
- if (!initialScreenshot) {
- throw new Error('Failed to capture initial screenshot after 3 attempts')
- }
-
- // Build initial content
- const initialParts: GeminiPart[] = [
- { text: query },
- { inlineData: { mimeType: 'image/png', data: initialScreenshot } },
- ]
- this.contents.push({ role: 'user', parts: initialParts })
-
- const messageId = randomUUID()
- await emitEvent(streamWriter, { type: 'start', messageId })
-
- let finished = false
- for (let turn = 0; turn < this.config.turnLimit; turn++) {
- if (signal.aborted) {
- await emitEvent(streamWriter, { type: 'abort' })
- break
- }
-
- // Start step (turn)
- await emitEvent(streamWriter, { type: 'start-step' })
-
- // Call Gemini API
- let response: GeminiResponse
- try {
- response = await this.callGeminiApi()
- } catch (error) {
- const errorMsg = error instanceof Error ? error.message : String(error)
- await emitEvent(streamWriter, {
- type: 'error',
- errorText: `API error: ${errorMsg}`,
- })
- throw error
- }
-
- // Check for API errors
- if (response.error) {
- await emitEvent(streamWriter, {
- type: 'error',
- errorText: response.error.message,
- })
- throw new Error(`Gemini API error: ${response.error.message}`)
- }
-
- // Extract text response
- const textResponse = this.extractTextResponse(response)
- if (textResponse) {
- finalText = textResponse
- const textId = randomUUID()
- await emitEvent(streamWriter, { type: 'text-start', id: textId })
- await emitEvent(streamWriter, {
- type: 'text-delta',
- id: textId,
- delta: textResponse,
- })
- await emitEvent(streamWriter, { type: 'text-end', id: textId })
- }
-
- // Extract function calls
- const actions = this.extractFunctionCalls(response)
-
- // If no actions, task is complete
- if (actions.length === 0) {
- await emitEvent(streamWriter, { type: 'finish-step' })
- await emitEvent(streamWriter, {
- type: 'finish',
- finishReason: 'completed',
- })
- finished = true
- break
- }
-
- // Add model response to conversation
- const candidate = response.candidates?.[0]
- if (candidate?.content) {
- this.contents.push(candidate.content)
- }
-
- // Execute each action
- for (const action of actions) {
- if (signal.aborted) break
-
- const toolCallId = randomUUID()
-
- // Tool input events
- await emitEvent(streamWriter, {
- type: 'tool-input-start',
- toolCallId,
- toolName: action.name,
- })
- await emitEvent(streamWriter, {
- type: 'tool-input-available',
- toolCallId,
- toolName: action.name,
- input: action.args,
- })
-
- const result = await this.actionMapper.execute(action)
- totalActions++
-
- // Tool output event
- await emitEvent(streamWriter, {
- type: 'tool-output-available',
- toolCallId,
- output: result,
- })
-
- // Call action hook (for screenshot capture)
- if (this.actionHook) {
- await this.actionHook(action, result)
- }
- }
-
- // Capture new screenshot and URL
- const newScreenshot = await this.actionMapper.captureScreenshot()
- const currentUrl = await this.actionMapper.getCurrentUrl()
-
- // Build function responses and add to conversation
- const functionResponseParts = this.buildFunctionResponses(
- actions,
- currentUrl,
- newScreenshot,
- )
- this.contents.push({ role: 'user', parts: functionResponseParts })
-
- // Finish step (turn)
- await emitEvent(streamWriter, { type: 'finish-step' })
- }
-
- if (!finished && !signal.aborted) {
- await emitEvent(streamWriter, {
- type: 'finish',
- finishReason: 'max_turns',
- })
- }
-
- return { finalText, totalActions }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts
deleted file mode 100644
index f012d0564..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts
+++ /dev/null
@@ -1,97 +0,0 @@
-/**
- * Gemini Computer Use Evaluator
- * Implements AgentEvaluator interface for the eval framework
- */
-
-import { DEFAULT_TIMEOUT_MS } from '../../constants'
-import type { GeminiComputerUseConfig, TaskMetadata } from '../../types'
-import { resolveEnvValue } from '../../utils/resolve-env'
-import { withEvalTimeout } from '../../utils/with-eval-timeout'
-import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
-import { GeminiComputerUseAgent } from './agent'
-import { DEFAULTS } from './types'
-
-export class GeminiComputerUseEvaluator implements AgentEvaluator {
- constructor(private ctx: AgentContext) {}
-
- async execute(): Promise {
- const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx
- const agentConfig = config.agent as GeminiComputerUseConfig
-
- const startTime = Date.now()
- const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
-
- await capture.messageLogger.logUser(task.query)
-
- const apiKey = resolveEnvValue(agentConfig.apiKey)
- if (!apiKey) {
- throw new Error(
- `API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`,
- )
- }
-
- const agent = new GeminiComputerUseAgent({
- apiKey,
- turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
- screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
- tabId,
- windowId,
- mcpUrl: `${config.browseros.server_url}/mcp`,
- })
-
- agent.setActionHook(async (_action, _result) => {
- try {
- await capture.screenshot.capture(capture.getActivePageId())
- } catch (err) {
- console.warn('Screenshot capture failed in hook:', err)
- }
- })
-
- const streamWriter = capture.createStreamWriter()
-
- let finalText: string | null = null
- let totalActions = 0
-
- const { terminationReason } = await withEvalTimeout(
- timeoutMs,
- capture,
- async (signal) => {
- const result = await agent.execute(task.query, streamWriter, signal)
- finalText = result.finalText
- totalActions = result.totalActions
- return result
- },
- )
-
- const endTime = Date.now()
-
- const metadata: TaskMetadata = {
- query_id: task.query_id,
- dataset: task.dataset,
- query: task.query,
- started_at: new Date(startTime).toISOString(),
- completed_at: new Date(endTime).toISOString(),
- total_duration_ms: endTime - startTime,
- total_steps: totalActions,
- termination_reason: terminationReason,
- final_answer: finalText ?? capture.getLastAssistantText(),
- errors: capture.getErrors(),
- warnings: capture.getWarnings(),
- agent_config: {
- type: 'gemini-computer-use',
- model: DEFAULTS.model,
- turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
- screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
- },
- grader_results: {},
- }
-
- await capture.trajectorySaver.saveMetadata(metadata)
-
- return {
- metadata,
- messages: capture.getMessages(),
- finalAnswer: finalText ?? capture.getLastAssistantText(),
- }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts
deleted file mode 100644
index ef3af48b2..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts
+++ /dev/null
@@ -1,156 +0,0 @@
-/**
- * Types for Gemini Computer Use agent
- */
-
-import { z } from 'zod'
-
-// Gemini Computer Use predefined actions (from API docs)
-export const ComputerUseActionSchema = z.discriminatedUnion('name', [
- z.object({
- name: z.literal('click_at'),
- args: z.object({
- x: z.number().min(0).max(999),
- y: z.number().min(0).max(999),
- }),
- }),
- z.object({
- name: z.literal('type_text_at'),
- args: z.object({
- x: z.number().min(0).max(999),
- y: z.number().min(0).max(999),
- text: z.string(),
- press_enter: z.boolean().optional(),
- clear_before_typing: z.boolean().optional(),
- }),
- }),
- z.object({
- name: z.literal('navigate'),
- args: z.object({
- url: z.string(),
- }),
- }),
- z.object({
- name: z.literal('scroll_document'),
- args: z.object({
- direction: z.enum(['up', 'down', 'left', 'right']),
- }),
- }),
- z.object({
- name: z.literal('scroll_at'),
- args: z.object({
- x: z.number().min(0).max(999),
- y: z.number().min(0).max(999),
- direction: z.enum(['up', 'down', 'left', 'right']),
- magnitude: z.number().optional(),
- }),
- }),
- z.object({
- name: z.literal('key_combination'),
- args: z.object({
- keys: z.string(),
- }),
- }),
- z.object({
- name: z.literal('hover_at'),
- args: z.object({
- x: z.number().min(0).max(999),
- y: z.number().min(0).max(999),
- }),
- }),
- z.object({
- name: z.literal('go_back'),
- args: z.object({}).optional(),
- }),
- z.object({
- name: z.literal('go_forward'),
- args: z.object({}).optional(),
- }),
- z.object({
- name: z.literal('wait_5_seconds'),
- args: z.object({}).optional(),
- }),
- z.object({
- name: z.literal('drag_and_drop'),
- args: z.object({
- x: z.number().min(0).max(999),
- y: z.number().min(0).max(999),
- destination_x: z.number().min(0).max(999),
- destination_y: z.number().min(0).max(999),
- }),
- }),
-])
-
-export type ComputerUseAction = z.infer
-
-// Screen size configuration
-export interface ScreenSize {
- width: number
- height: number
-}
-
-// Context for action execution
-export interface ActionContext {
- mcpUrl: string
- tabId: number
- windowId: number
- screenSize: ScreenSize
-}
-
-// Gemini API types
-export interface GeminiContent {
- role: 'user' | 'model'
- parts: GeminiPart[]
-}
-
-export interface GeminiPart {
- text?: string
- inlineData?: {
- mimeType: string
- data: string
- }
- functionCall?: {
- name: string
- args?: Record
- }
- functionResponse?: {
- name: string
- response: Record
- }
-}
-
-export interface GeminiResponse {
- candidates?: Array<{
- content: GeminiContent
- finishReason?: string
- }>
- error?: {
- message: string
- code: number
- }
-}
-
-// Safety decision from Computer Use
-export interface SafetyDecision {
- decision: 'allow' | 'require_confirmation' | 'block'
- explanation?: string
-}
-
-// Agent configuration
-export interface GeminiComputerUseAgentConfig {
- apiKey: string
- turnLimit: number
- screenSize: ScreenSize
- tabId: number
- windowId: number
- mcpUrl: string
-}
-
-// Defaults
-export const DEFAULTS = {
- // Gemini's recommended screenshot size for optimal model accuracy
- screenshotSize: { width: 1440, height: 900 },
- // Fallback viewport size (used when actual viewport can't be determined)
- screenSize: { width: 1440, height: 900 },
- turnLimit: 30,
- model: 'gemini-2.5-computer-use-preview-10-2025',
-} as const
diff --git a/packages/browseros-agent/apps/eval/src/agents/index.ts b/packages/browseros-agent/apps/eval/src/agents/index.ts
index 1ace1f82a..12bc40272 100644
--- a/packages/browseros-agent/apps/eval/src/agents/index.ts
+++ b/packages/browseros-agent/apps/eval/src/agents/index.ts
@@ -1,8 +1,6 @@
-import { GeminiComputerUseEvaluator } from './gemini-computer-use'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
import { registerAgent } from './registry'
import { SingleAgentEvaluator } from './single-agent'
-import { YutoriNavigatorEvaluator } from './yutori-navigator'
// Register built-in agent types
registerAgent('single', (ctx) => new SingleAgentEvaluator(ctx))
@@ -10,11 +8,6 @@ registerAgent(
'orchestrator-executor',
(ctx) => new OrchestratorExecutorEvaluator(ctx),
)
-registerAgent(
- 'gemini-computer-use',
- (ctx) => new GeminiComputerUseEvaluator(ctx),
-)
-registerAgent('yutori-navigator', (ctx) => new YutoriNavigatorEvaluator(ctx))
// Re-exports
export {
diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts
deleted file mode 100644
index e8bee5e17..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts
+++ /dev/null
@@ -1,677 +0,0 @@
-/**
- * Maps Yutori n1 actions to MCP tool calls
- *
- * Coordinate System:
- * - n1 outputs normalized coordinates in 1000x1000 grid
- * - Screenshots captured with size='large' (1028px width, aspect ratio preserved)
- * - We scale normalized coords to actual viewport pixels
- *
- * Action Mapping (prioritize MCP tools over execute_javascript):
- * - click → browser_click_coordinates ✅
- * - type → browser_type_at_coordinates (uses last clicked coords) ✅
- * - scroll up/down → browser_scroll_up/down ✅
- * - scroll left/right → browser_execute_javascript (no horizontal scroll tool)
- * - key_press → browser_send_keys (for supported keys) ✅
- * - hover → browser_execute_javascript (no dedicated MCP tool)
- * - drag → browser_execute_javascript (no dedicated MCP tool)
- * - wait → setTimeout
- * - refresh → browser_execute_javascript (no dedicated MCP tool)
- * - go_back → browser_execute_javascript (no dedicated MCP tool)
- * - goto_url → browser_navigate ✅
- * - stop → returns answer (no MCP call)
- * - read_texts_and_links → browser_get_page_content ✅
- */
-
-import { Client } from '@modelcontextprotocol/sdk/client/index.js'
-import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js'
-import sharp from 'sharp'
-import type { ActionContext, N1Action, ScreenSize } from './types'
-import { DEFAULTS } from './types'
-
-/**
- * Convert PNG base64 to WebP base64 for smaller payload size.
- * Yutori n1 recommends WebP format for better compression.
- */
-async function convertToWebP(pngBase64: string): Promise {
- const pngBuffer = Buffer.from(pngBase64, 'base64')
- const webpBuffer = await sharp(pngBuffer)
- .webp({ quality: 80 }) // Good balance of quality and size
- .toBuffer()
- return webpBuffer.toString('base64')
-}
-
-interface McpToolResult {
- content: Array<{
- type: string
- text?: string
- data?: string
- mimeType?: string
- }>
- isError?: boolean
-}
-
-const MCP_TIMEOUT_MS = 30000
-
-// Scroll amount per unit (n1 recommends treating each amount as 10-15% of screen)
-const SCROLL_PERCENT_PER_UNIT = 0.12 // 12% of viewport per scroll unit
-
-export class ActionMapper {
- private ctx: ActionContext
- private cachedViewport: ScreenSize | null = null
- // Track last clicked coordinates for type action (n1 type has no coords)
- private lastClickCoordinates: { x: number; y: number } | null = null
-
- constructor(ctx: ActionContext) {
- this.ctx = ctx
- }
-
- // Store debug info about viewport detection for inclusion in responses
- private viewportDebugInfo: string = ''
-
- /**
- * Get the actual browser viewport size via JavaScript
- * This is critical for correct coordinate mapping:
- * - Screenshot is scaled to 1028px width (aspect ratio preserved)
- * - Clicks must be at actual viewport coordinates
- * - We scale: (normalized/1000) * viewport
- * Caches the result to avoid repeated calls
- * Also stores debug info for troubleshooting
- */
- async getViewportSize(): Promise {
- if (this.cachedViewport) {
- return this.cachedViewport
- }
-
- try {
- const result = await this.callMcp('browser_execute_javascript', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- code: '[window.innerWidth, window.innerHeight]',
- })
-
- const textContent =
- result.content.find((c) => c.type === 'text')?.text ?? ''
-
- // Check for error in result
- if (result.isError) {
- this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- return this.ctx.screenSize
- }
-
- // Parse array format - can be multiline: [1440, 900] or "Result: [\n 1200,\n 712\n]"
- const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s)
- if (arrayMatch) {
- const width = parseInt(arrayMatch[1], 10)
- const height = parseInt(arrayMatch[2], 10)
- if (width > 0 && height > 0) {
- this.cachedViewport = { width, height }
- this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw: "${textContent.substring(0, 100)}")`
- console.log(this.viewportDebugInfo)
- return this.cachedViewport
- } else {
- this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
- } else {
- this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
- } catch (error) {
- const errMsg = error instanceof Error ? error.message : String(error)
- this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}`
- console.warn(this.viewportDebugInfo)
- }
-
- // Fallback to config screenSize
- return this.ctx.screenSize
- }
-
- /**
- * Clear cached viewport (call when tab/window changes or before new task)
- */
- clearViewportCache(): void {
- this.cachedViewport = null
- }
-
- /**
- * Reset all tracked state (call before starting a new task)
- */
- reset(): void {
- this.cachedViewport = null
- this.lastClickCoordinates = null
- }
-
- /**
- * Scale normalized coordinate (0-1000) to actual viewport pixel value
- *
- * How it works:
- * - Screenshot is captured at 1028px width with preserved aspect ratio
- * - n1 predicts normalized coords (0-1000) for that screenshot
- * - Since aspect ratio is preserved, we can scale directly to viewport
- * - Formula: actualX = (normalizedX / 1000) * viewport.innerWidth
- */
- private async scaleCoordinates(
- normalizedX: number,
- normalizedY: number,
- ): Promise<{ x: number; y: number }> {
- const viewport = await this.getViewportSize()
- return {
- x: Math.round((normalizedX / DEFAULTS.normalizedMax) * viewport.width),
- y: Math.round((normalizedY / DEFAULTS.normalizedMax) * viewport.height),
- }
- }
-
- /**
- * Call an MCP tool
- */
- private async callMcp(
- name: string,
- args: Record = {},
- ): Promise {
- const client = new Client({
- name: 'yutori-navigator',
- version: '1.0.0',
- })
-
- const transport = new StreamableHTTPClientTransport(
- new URL(this.ctx.mcpUrl),
- {
- requestInit: {
- headers: { 'X-BrowserOS-Source': 'yutori-navigator' },
- },
- },
- )
-
- try {
- await client.connect(transport)
-
- const toolCallPromise = client.callTool({ name, arguments: args })
- let timeoutId: ReturnType | null = null
- const timeoutPromise = new Promise((_, reject) => {
- timeoutId = setTimeout(
- () =>
- reject(
- new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`),
- ),
- MCP_TIMEOUT_MS,
- )
- })
-
- try {
- return (await Promise.race([
- toolCallPromise,
- timeoutPromise,
- ])) as McpToolResult
- } finally {
- if (timeoutId) clearTimeout(timeoutId)
- }
- } finally {
- try {
- await transport.close()
- } catch {
- // Ignore close errors
- }
- }
- }
-
- /**
- * Execute an n1 action by mapping to MCP tools
- * Prioritizes native MCP tools over browser_execute_javascript for reliability
- * Returns the result message and optionally the stop answer
- */
- async execute(
- action: N1Action,
- ): Promise<{ success: boolean; message: string; stopAnswer?: string }> {
- const { tabId, windowId } = this.ctx
-
- try {
- switch (action.action_type) {
- case 'click': {
- const [normX, normY] = action.center_coordinates
- const viewport = await this.getViewportSize()
- const { x, y } = await this.scaleCoordinates(normX, normY)
-
- // Track coordinates for subsequent type action (n1 type has no coords)
- this.lastClickCoordinates = { x, y }
-
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
- // Return original coordinates + debug info
- const debugInfo = `[DEBUG: input=(${normX},${normY}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}`
- return {
- success: true,
- message: `Clicked at (${normX}, ${normY}). ${debugInfo}`,
- }
- }
-
- case 'type': {
- const { text, press_enter_after, clear_before_typing } = action
-
- // n1 type action has no coordinates - it expects element to be focused
- // Use last clicked coordinates with browser_type_at_coordinates
- if (!this.lastClickCoordinates) {
- // Fallback: click center of screen if no prior click
- const viewport = await this.getViewportSize()
- this.lastClickCoordinates = {
- x: Math.round(viewport.width / 2),
- y: Math.round(viewport.height / 2),
- }
- }
-
- const { x, y } = this.lastClickCoordinates
-
- // Clear field first if requested using native MCP tools
- if (clear_before_typing) {
- // Triple-click to select all text in the field
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
- // Use Delete key to clear
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Delete',
- })
- }
-
- // Use browser_type_at_coordinates - the proper MCP tool for typing
- await this.callMcp('browser_type_at_coordinates', {
- tabId,
- windowId,
- x,
- y,
- text,
- })
-
- // Press Enter if requested using native MCP tool
- if (press_enter_after) {
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: 'Enter',
- })
- }
-
- // n1 type action has no coordinates - don't include viewport coords in response
- return {
- success: true,
- message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}"`,
- }
- }
-
- case 'scroll': {
- const { direction, center_coordinates, amount } = action
- const [normX, normY] = center_coordinates
- const { x, y } = await this.scaleCoordinates(normX, normY)
-
- // Track coordinates
- this.lastClickCoordinates = { x, y }
-
- // Click at position first to focus element (for scrollable containers)
- await this.callMcp('browser_click_coordinates', {
- tabId,
- windowId,
- x,
- y,
- })
-
- // For vertical scroll (up/down): use native MCP scroll tools
- // For horizontal scroll (left/right): use JS (no MCP tool available)
- if (direction === 'up' || direction === 'down') {
- const scrollTool =
- direction === 'up' ? 'browser_scroll_up' : 'browser_scroll_down'
-
- // Calculate how many scroll calls based on amount
- // n1 amount 1-2 = ~20% viewport, our tool = 100% viewport
- // So we scroll once for small amounts, more for larger
- const scrollCount = Math.max(1, Math.round(amount / 5))
-
- for (let i = 0; i < scrollCount; i++) {
- await this.callMcp(scrollTool, { tabId, windowId })
- // Small delay between scrolls for stability
- if (i < scrollCount - 1) {
- await new Promise((r) => setTimeout(r, 100))
- }
- }
-
- // Return original normalized coordinates
- return {
- success: true,
- message: `Scrolled ${direction} at (${normX}, ${normY})`,
- }
- } else {
- // Horizontal scroll - no MCP tool, use JS
- const viewport = await this.getViewportSize()
- const scrollPixels = Math.round(
- amount * SCROLL_PERCENT_PER_UNIT * viewport.width,
- )
- const scrollCode =
- direction === 'left'
- ? `window.scrollBy(-${scrollPixels}, 0)`
- : `window.scrollBy(${scrollPixels}, 0)`
-
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: scrollCode,
- })
-
- // Return original normalized coordinates
- return {
- success: true,
- message: `Scrolled ${direction} at (${normX}, ${normY})`,
- }
- }
- }
-
- case 'key_press': {
- const { key_comb } = action
-
- // Map keys to browser_send_keys supported keys
- // browser_send_keys supports: Enter, Delete, Backspace, Tab, Escape,
- // ArrowUp, ArrowDown, ArrowLeft, ArrowRight, Home, End, PageUp, PageDown
- const keyMap: Record = {
- Enter: 'Enter',
- Escape: 'Escape',
- Tab: 'Tab',
- Backspace: 'Backspace',
- Delete: 'Delete',
- ArrowUp: 'ArrowUp',
- ArrowDown: 'ArrowDown',
- ArrowLeft: 'ArrowLeft',
- ArrowRight: 'ArrowRight',
- Home: 'Home',
- End: 'End',
- PageUp: 'PageUp',
- PageDown: 'PageDown',
- // Alternative names n1 might use
- Return: 'Enter',
- Esc: 'Escape',
- Up: 'ArrowUp',
- Down: 'ArrowDown',
- Left: 'ArrowLeft',
- Right: 'ArrowRight',
- }
-
- const mappedKey = keyMap[key_comb]
- if (mappedKey) {
- // Use native MCP tool
- await this.callMcp('browser_send_keys', {
- tabId,
- windowId,
- key: mappedKey,
- })
- } else {
- // For complex key combinations (Ctrl+A, etc.), use JavaScript
- const parts = key_comb.split('+')
- const mainKey = parts.pop() || ''
- const modifiers = parts.map((p) => p.toLowerCase())
-
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const event = new KeyboardEvent('keydown', {
- key: '${mainKey}',
- code: 'Key${mainKey.toUpperCase()}',
- ctrlKey: ${modifiers.includes('control') || modifiers.includes('ctrl')},
- shiftKey: ${modifiers.includes('shift')},
- altKey: ${modifiers.includes('alt')},
- metaKey: ${modifiers.includes('meta') || modifiers.includes('cmd')},
- bubbles: true
- });
- document.activeElement?.dispatchEvent(event);
- `,
- })
- }
-
- return { success: true, message: `Pressed ${key_comb}` }
- }
-
- case 'hover': {
- // No dedicated MCP hover tool - use JS
- const [normX, normY] = action.center_coordinates
- const { x, y } = await this.scaleCoordinates(normX, normY)
-
- // Track coordinates
- this.lastClickCoordinates = { x, y }
-
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const elem = document.elementFromPoint(${x}, ${y});
- if (elem) {
- const event = new MouseEvent('mouseover', {
- bubbles: true,
- clientX: ${x},
- clientY: ${y}
- });
- elem.dispatchEvent(event);
- }
- `,
- })
-
- // Return original normalized coordinates
- return { success: true, message: `Hovered at (${normX}, ${normY})` }
- }
-
- case 'drag': {
- // No dedicated MCP drag tool - use JS
- const [startNormX, startNormY] = action.start_coordinates
- const [endNormX, endNormY] = action.center_coordinates
- const start = await this.scaleCoordinates(startNormX, startNormY)
- const end = await this.scaleCoordinates(endNormX, endNormY)
-
- // Track end coordinates
- this.lastClickCoordinates = end
-
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: `
- const startElem = document.elementFromPoint(${start.x}, ${start.y});
- const endElem = document.elementFromPoint(${end.x}, ${end.y});
- if (startElem && endElem) {
- const dragStart = new DragEvent('dragstart', {
- bubbles: true,
- clientX: ${start.x},
- clientY: ${start.y}
- });
- const drop = new DragEvent('drop', {
- bubbles: true,
- clientX: ${end.x},
- clientY: ${end.y}
- });
- const dragEnd = new DragEvent('dragend', { bubbles: true });
- startElem.dispatchEvent(dragStart);
- endElem.dispatchEvent(drop);
- startElem.dispatchEvent(dragEnd);
- }
- `,
- })
-
- // Return original normalized coordinates
- return {
- success: true,
- message: `Dragged from (${startNormX}, ${startNormY}) to (${endNormX}, ${endNormY})`,
- }
- }
-
- case 'wait': {
- // n1 uses this for page loads
- await new Promise((resolve) => setTimeout(resolve, 2000))
- return { success: true, message: 'Waited 2 seconds' }
- }
-
- case 'refresh': {
- // No dedicated MCP refresh tool - use JS
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: 'location.reload()',
- })
- // Wait for page to start reloading
- await new Promise((resolve) => setTimeout(resolve, 1000))
- return { success: true, message: 'Refreshed page' }
- }
-
- case 'go_back': {
- // No dedicated MCP go_back tool - use JS
- await this.callMcp('browser_execute_javascript', {
- tabId,
- windowId,
- code: 'history.back()',
- })
- return { success: true, message: 'Navigated back' }
- }
-
- case 'goto_url': {
- // Use native MCP navigate tool
- await this.callMcp('browser_navigate', {
- tabId,
- windowId,
- url: action.url,
- })
- return { success: true, message: `Navigated to ${action.url}` }
- }
-
- case 'read_texts_and_links': {
- // Use native MCP tool
- const result = await this.callMcp('browser_get_page_content', {
- tabId,
- windowId,
- type: 'text-with-links',
- })
- const content =
- result.content.find((c) => c.type === 'text')?.text ?? ''
- return {
- success: true,
- message: `Read page content (${content.length} chars)`,
- }
- }
-
- case 'stop': {
- // Stop action - task is complete, return the answer
- return {
- success: true,
- message: 'Task completed',
- stopAnswer: action.answer,
- }
- }
-
- default: {
- const _exhaustive: never = action
- return {
- success: false,
- message: `Unknown action: ${JSON.stringify(action)}`,
- }
- }
- }
- } catch (error) {
- const message = error instanceof Error ? error.message : String(error)
- return { success: false, message: `Action failed: ${message}` }
- }
- }
-
- /**
- * Capture a screenshot via MCP with retry logic
- *
- * Uses Yutori's recommended screenshot size (1280x800) for optimal model performance.
- * Now that viewport detection is working correctly, the coordinate mapping will be accurate.
- *
- * Returns WebP base64 string
- */
- async captureScreenshot(retries = 2): Promise {
- const { width, height } = DEFAULTS.screenshotSize
-
- for (let attempt = 0; attempt <= retries; attempt++) {
- try {
- const result = await this.callMcp('browser_get_screenshot', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- width,
- height,
- showHighlights: false,
- })
-
- if (result.isError) {
- const errorText =
- result.content?.find((c) => c.type === 'text')?.text ??
- 'Unknown error'
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`,
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- console.warn('Screenshot capture failed:', errorText)
- return null
- }
-
- const imageContent = result.content.find((c) => c.type === 'image')
- if (imageContent?.data) {
- // Convert PNG to WebP for smaller payload (n1 recommends WebP)
- try {
- const webpBase64 = await convertToWebP(imageContent.data)
- return webpBase64
- } catch (conversionError) {
- console.warn('WebP conversion failed, using PNG:', conversionError)
- return imageContent.data
- }
- }
-
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1}: No image data, retrying...`,
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- return null
- } catch (error) {
- if (attempt < retries) {
- console.warn(
- `Screenshot attempt ${attempt + 1} error:`,
- error,
- 'retrying...',
- )
- await new Promise((r) => setTimeout(r, 500))
- continue
- }
- console.warn('Screenshot capture error:', error)
- return null
- }
- }
- return null
- }
-
- /**
- * Get current page URL via MCP
- */
- async getCurrentUrl(): Promise {
- try {
- const result = await this.callMcp('browser_execute_javascript', {
- tabId: this.ctx.tabId,
- windowId: this.ctx.windowId,
- code: 'window.location.href',
- })
-
- const textContent =
- result.content.find((c) => c.type === 'text')?.text ?? ''
- const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/)
- return urlMatch?.[1] ?? 'unknown'
- } catch {
- return 'unknown'
- }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts
deleted file mode 100644
index 408327caa..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts
+++ /dev/null
@@ -1,353 +0,0 @@
-/**
- * Yutori Navigator n1 Agent
- *
- * Implements the agent loop that calls Yutori n1 API and executes actions.
- * Uses UIMessageStreamEvent format for logging compatibility.
- *
- * n1 API follows OpenAI Chat Completions interface with special 'observation' role
- * for screenshots. Full conversation history must be maintained.
- */
-
-import { randomUUID } from 'node:crypto'
-import { ActionMapper } from './action-mapper'
-import {
- DEFAULTS,
- type N1Action,
- type N1ChatCompletionResponse,
- type N1Message,
- N1ResponseSchema,
- YUTORI_API_BASE,
- type YutoriNavigatorAgentConfig,
-} from './types'
-
-interface StreamWriter {
- write: (data: string) => Promise
-}
-
-type ActionHook = (
- action: N1Action,
- result: { success: boolean; message: string },
-) => Promise
-
-/**
- * Emit SSE-formatted UIMessageStreamEvent
- */
-function emitEvent(
- writer: StreamWriter,
- event: Record,
-): Promise {
- return writer.write(`data: ${JSON.stringify(event)}\n\n`)
-}
-
-export class YutoriNavigatorAgent {
- private config: YutoriNavigatorAgentConfig
- private actionMapper: ActionMapper
- private actionHook?: ActionHook
- private messages: N1Message[] = []
-
- constructor(config: YutoriNavigatorAgentConfig) {
- this.config = config
- this.actionMapper = new ActionMapper({
- mcpUrl: config.mcpUrl,
- tabId: config.tabId,
- windowId: config.windowId,
- screenSize: config.screenSize,
- })
- }
-
- /**
- * Set a hook to be called after each action execution
- */
- setActionHook(hook: ActionHook): void {
- this.actionHook = hook
- }
-
- /**
- * Build observation message with screenshot and optional URL
- */
- private buildObservationMessage(
- screenshotBase64: string,
- currentUrl?: string,
- ): N1Message {
- const content: N1Message['content'] = []
-
- // Include URL if available (recommended by Yutori for better attribution)
- if (currentUrl) {
- content.push({
- type: 'text',
- text: `Current URL: ${currentUrl}`,
- })
- }
-
- // Add screenshot as base64 data URL (WebP for smaller payload)
- content.push({
- type: 'image_url',
- image_url: {
- url: `data:image/webp;base64,${screenshotBase64}`,
- },
- })
-
- return {
- role: 'observation',
- content,
- }
- }
-
- /**
- * Call the Yutori n1 API
- */
- private async callN1Api(): Promise {
- const url = `${YUTORI_API_BASE}/chat/completions`
-
- const response = await fetch(url, {
- method: 'POST',
- headers: {
- 'Content-Type': 'application/json',
- Authorization: `Bearer ${this.config.apiKey}`,
- },
- body: JSON.stringify({
- model: DEFAULTS.model,
- messages: this.messages,
- temperature: DEFAULTS.temperature,
- }),
- })
-
- if (!response.ok) {
- const errorBody = await response.text()
- throw new Error(
- `Yutori n1 API error: ${response.status} ${response.statusText} - ${errorBody}`,
- )
- }
-
- return response.json()
- }
-
- /**
- * Parse n1 response content to extract thoughts and actions
- */
- private parseN1Response(
- content: string,
- ): { thoughts: string; actions: N1Action[] } | null {
- try {
- const parsed = JSON.parse(content)
- const validated = N1ResponseSchema.safeParse(parsed)
-
- if (validated.success) {
- return validated.data
- }
-
- console.warn('n1 response validation failed:', validated.error.message)
- // Try to extract what we can
- return {
- thoughts: parsed.thoughts ?? '',
- actions: Array.isArray(parsed.actions) ? parsed.actions : [],
- }
- } catch (error) {
- console.warn('Failed to parse n1 response:', error)
- return null
- }
- }
-
- /**
- * Execute the agent loop
- */
- async execute(
- query: string,
- streamWriter: StreamWriter,
- signal: AbortSignal,
- ): Promise<{ finalText: string | null; totalActions: number }> {
- let totalActions = 0
- let finalText: string | null = null
-
- // Wait for page to stabilize before first screenshot
- await new Promise((resolve) => setTimeout(resolve, 2000))
-
- // Capture initial screenshot with retries
- let initialScreenshot: string | null = null
- for (let attempt = 1; attempt <= 3; attempt++) {
- initialScreenshot = await this.actionMapper.captureScreenshot()
- if (initialScreenshot) break
- console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`)
- await new Promise((resolve) => setTimeout(resolve, 1000))
- }
-
- if (!initialScreenshot) {
- throw new Error('Failed to capture initial screenshot after 3 attempts')
- }
-
- // Get initial URL
- const initialUrl = await this.actionMapper.getCurrentUrl()
-
- // Build initial messages
- // 1. User message with task
- this.messages.push({
- role: 'user',
- content: [{ type: 'text', text: query }],
- })
-
- // 2. Initial observation with screenshot
- this.messages.push(
- this.buildObservationMessage(initialScreenshot, initialUrl),
- )
-
- // Emit start event
- const messageId = randomUUID()
- await emitEvent(streamWriter, { type: 'start', messageId })
-
- let finished = false
- for (let turn = 0; turn < this.config.turnLimit; turn++) {
- if (signal.aborted) {
- await emitEvent(streamWriter, { type: 'abort' })
- break
- }
-
- // Start step (turn)
- await emitEvent(streamWriter, { type: 'start-step' })
-
- // Call n1 API
- let response: N1ChatCompletionResponse
- try {
- response = await this.callN1Api()
- } catch (error) {
- const errorMsg = error instanceof Error ? error.message : String(error)
- await emitEvent(streamWriter, {
- type: 'error',
- errorText: `API error: ${errorMsg}`,
- })
- throw error
- }
-
- // Extract response content
- const choice = response.choices?.[0]
- if (!choice?.message?.content) {
- await emitEvent(streamWriter, {
- type: 'error',
- errorText: 'Empty response from n1 API',
- })
- throw new Error('Empty response from n1 API')
- }
-
- const assistantContent = choice.message.content
-
- // Parse the JSON response
- const parsed = this.parseN1Response(assistantContent)
- if (!parsed) {
- await emitEvent(streamWriter, {
- type: 'error',
- errorText: 'Failed to parse n1 response',
- })
- throw new Error('Failed to parse n1 response')
- }
-
- const { thoughts, actions } = parsed
-
- // Emit thoughts as text
- if (thoughts) {
- finalText = thoughts
- const textId = randomUUID()
- await emitEvent(streamWriter, { type: 'text-start', id: textId })
- await emitEvent(streamWriter, {
- type: 'text-delta',
- id: textId,
- delta: thoughts,
- })
- await emitEvent(streamWriter, { type: 'text-end', id: textId })
- }
-
- // Check for stop action or no actions
- const stopAction = actions.find((a) => a.action_type === 'stop')
- if (stopAction && stopAction.action_type === 'stop') {
- finalText = stopAction.answer
- await emitEvent(streamWriter, { type: 'finish-step' })
- await emitEvent(streamWriter, {
- type: 'finish',
- finishReason: 'completed',
- })
- finished = true
- break
- }
-
- if (actions.length === 0) {
- await emitEvent(streamWriter, { type: 'finish-step' })
- await emitEvent(streamWriter, {
- type: 'finish',
- finishReason: 'completed',
- })
- finished = true
- break
- }
-
- // Add assistant response to conversation history
- this.messages.push({
- role: 'assistant',
- content: assistantContent,
- })
-
- // Execute each action
- for (const action of actions) {
- if (signal.aborted) break
-
- // Skip stop actions (handled above)
- if (action.action_type === 'stop') continue
-
- const toolCallId = randomUUID()
-
- // Tool input events
- await emitEvent(streamWriter, {
- type: 'tool-input-start',
- toolCallId,
- toolName: action.action_type,
- })
- await emitEvent(streamWriter, {
- type: 'tool-input-available',
- toolCallId,
- toolName: action.action_type,
- input: action,
- })
-
- const result = await this.actionMapper.execute(action)
- totalActions++
-
- // Check if this was a stop action that returned an answer
- if (result.stopAnswer) {
- finalText = result.stopAnswer
- }
-
- // Tool output event
- await emitEvent(streamWriter, {
- type: 'tool-output-available',
- toolCallId,
- output: result,
- })
-
- // Call action hook (for screenshot capture)
- if (this.actionHook) {
- await this.actionHook(action, result)
- }
- }
-
- // Capture new screenshot and URL for next turn
- const newScreenshot = await this.actionMapper.captureScreenshot()
- const currentUrl = await this.actionMapper.getCurrentUrl()
-
- // Add observation for next turn (n1 requires full history)
- if (newScreenshot) {
- this.messages.push(
- this.buildObservationMessage(newScreenshot, currentUrl),
- )
- }
-
- // Finish step (turn)
- await emitEvent(streamWriter, { type: 'finish-step' })
- }
-
- if (!finished && !signal.aborted) {
- await emitEvent(streamWriter, {
- type: 'finish',
- finishReason: 'max_turns',
- })
- }
-
- return { finalText, totalActions }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts
deleted file mode 100644
index ec41783a5..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts
+++ /dev/null
@@ -1,97 +0,0 @@
-/**
- * Yutori Navigator Evaluator
- * Implements AgentEvaluator interface for the eval framework
- */
-
-import { DEFAULT_TIMEOUT_MS } from '../../constants'
-import type { TaskMetadata, YutoriNavigatorConfig } from '../../types'
-import { resolveEnvValue } from '../../utils/resolve-env'
-import { withEvalTimeout } from '../../utils/with-eval-timeout'
-import type { AgentContext, AgentEvaluator, AgentResult } from '../types'
-import { YutoriNavigatorAgent } from './agent'
-import { DEFAULTS } from './types'
-
-export class YutoriNavigatorEvaluator implements AgentEvaluator {
- constructor(private ctx: AgentContext) {}
-
- async execute(): Promise {
- const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx
- const agentConfig = config.agent as YutoriNavigatorConfig
-
- const startTime = Date.now()
- const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS
-
- await capture.messageLogger.logUser(task.query)
-
- const apiKey = resolveEnvValue(agentConfig.apiKey)
- if (!apiKey) {
- throw new Error(
- `API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`,
- )
- }
-
- const agent = new YutoriNavigatorAgent({
- apiKey,
- turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
- screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
- tabId,
- windowId,
- mcpUrl: `${config.browseros.server_url}/mcp`,
- })
-
- agent.setActionHook(async (_action, _result) => {
- try {
- await capture.screenshot.capture(capture.getActivePageId())
- } catch (err) {
- console.warn('Screenshot capture failed in hook:', err)
- }
- })
-
- const streamWriter = capture.createStreamWriter()
-
- let finalText: string | null = null
- let totalActions = 0
-
- const { terminationReason } = await withEvalTimeout(
- timeoutMs,
- capture,
- async (signal) => {
- const result = await agent.execute(task.query, streamWriter, signal)
- finalText = result.finalText
- totalActions = result.totalActions
- return result
- },
- )
-
- const endTime = Date.now()
-
- const metadata: TaskMetadata = {
- query_id: task.query_id,
- dataset: task.dataset,
- query: task.query,
- started_at: new Date(startTime).toISOString(),
- completed_at: new Date(endTime).toISOString(),
- total_duration_ms: endTime - startTime,
- total_steps: totalActions,
- termination_reason: terminationReason,
- final_answer: finalText ?? capture.getLastAssistantText(),
- errors: capture.getErrors(),
- warnings: capture.getWarnings(),
- agent_config: {
- type: 'yutori-navigator',
- model: DEFAULTS.model,
- turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit,
- screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize,
- },
- grader_results: {},
- }
-
- await capture.trajectorySaver.saveMetadata(metadata)
-
- return {
- metadata,
- messages: capture.getMessages(),
- finalAnswer: finalText ?? capture.getLastAssistantText(),
- }
- }
-}
diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts
deleted file mode 100644
index a1e3903b1..000000000
--- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Types for Yutori Navigator n1 agent
- *
- * n1 is a pixels-to-actions LLM that follows OpenAI Chat Completions interface.
- * Coordinates are normalized to 1000x1000 grid.
- * Recommended screenshot size: 1280x800 (WXGA 16:10)
- */
-
-import { z } from 'zod'
-
-// n1 action schemas based on API documentation
-export const N1ActionSchema = z.discriminatedUnion('action_type', [
- z.object({
- action_type: z.literal('click'),
- center_coordinates: z.tuple([z.number(), z.number()]),
- }),
- z.object({
- action_type: z.literal('scroll'),
- direction: z.enum(['up', 'down', 'left', 'right']),
- center_coordinates: z.tuple([z.number(), z.number()]),
- amount: z.number().int().min(1).max(10),
- }),
- z.object({
- action_type: z.literal('type'),
- text: z.string(),
- press_enter_after: z.boolean().optional(),
- clear_before_typing: z.boolean().optional(),
- }),
- z.object({
- action_type: z.literal('key_press'),
- key_comb: z.string(), // Playwright keyboard press format
- }),
- z.object({
- action_type: z.literal('hover'),
- center_coordinates: z.tuple([z.number(), z.number()]),
- }),
- z.object({
- action_type: z.literal('drag'),
- start_coordinates: z.tuple([z.number(), z.number()]),
- center_coordinates: z.tuple([z.number(), z.number()]), // destination
- }),
- z.object({
- action_type: z.literal('wait'),
- }),
- z.object({
- action_type: z.literal('refresh'),
- }),
- z.object({
- action_type: z.literal('go_back'),
- }),
- z.object({
- action_type: z.literal('goto_url'),
- url: z.string(),
- }),
- z.object({
- action_type: z.literal('read_texts_and_links'),
- }),
- z.object({
- action_type: z.literal('stop'),
- answer: z.string(),
- }),
-])
-
-export type N1Action = z.infer
-
-// n1 API response format
-export const N1ResponseSchema = z.object({
- thoughts: z.string(),
- actions: z.array(N1ActionSchema),
-})
-
-export type N1Response = z.infer
-
-// Screen size configuration
-export interface ScreenSize {
- width: number
- height: number
-}
-
-// Context for action execution
-export interface ActionContext {
- mcpUrl: string
- tabId: number
- windowId: number
- screenSize: ScreenSize
-}
-
-// OpenAI-compatible message types for n1 API
-export type N1MessageRole = 'user' | 'assistant' | 'observation'
-
-export interface N1TextContent {
- type: 'text'
- text: string
-}
-
-export interface N1ImageContent {
- type: 'image_url'
- image_url: {
- url: string // Can be URL or data:image/webp;base64,...
- }
-}
-
-export type N1ContentPart = N1TextContent | N1ImageContent
-
-export interface N1Message {
- role: N1MessageRole
- content: string | N1ContentPart[]
-}
-
-export interface N1ChatCompletionRequest {
- model: string
- messages: N1Message[]
- temperature?: number
-}
-
-export interface N1ChatCompletionResponse {
- id: string
- object: string
- created: number
- model: string
- choices: Array<{
- index: number
- message: {
- role: 'assistant'
- content: string // JSON string containing N1Response
- }
- finish_reason: string
- }>
- usage?: {
- prompt_tokens: number
- completion_tokens: number
- total_tokens: number
- }
-}
-
-// Agent configuration
-export interface YutoriNavigatorAgentConfig {
- apiKey: string
- turnLimit: number
- screenSize: ScreenSize
- tabId: number
- windowId: number
- mcpUrl: string
-}
-
-// Defaults based on Yutori documentation
-export const DEFAULTS = {
- // WXGA 16:10 - Yutori's recommended screenshot size
- screenshotSize: { width: 1280, height: 800 },
- screenSize: { width: 1280, height: 800 },
- turnLimit: 30,
- model: 'n1-preview-2025-11',
- temperature: 0.3,
- // n1 uses 1000x1000 normalized coordinate system
- normalizedMax: 1000,
-} as const
-
-export const YUTORI_API_BASE = 'https://api.yutori.com/v1'
diff --git a/packages/browseros-agent/apps/eval/src/dashboard/index.html b/packages/browseros-agent/apps/eval/src/dashboard/index.html
index 4b120f5e8..b65ba5d9f 100644
--- a/packages/browseros-agent/apps/eval/src/dashboard/index.html
+++ b/packages/browseros-agent/apps/eval/src/dashboard/index.html
@@ -153,6 +153,13 @@
.btn-stop:hover { background: #f85149; }
.btn-stop:disabled { background: #6e2b2b; cursor: default; }
+ /* Test Run button (highlighted secondary) */
+ .btn-test { background: #1f6feb; color: #fff; border: none; padding: 6px 14px; border-radius: 4px; font-size: 13px; font-weight: 600; cursor: pointer; }
+ .btn-test:hover { background: #388bfd; }
+ .btn-test:disabled { background: #21262d; color: #484f58; cursor: default; }
+ .loaded-config-label { font-size: 11px; color: #8b949e; margin-left: 8px; font-style: italic; }
+ .config-success { color: #3fb950; font-size: 12px; flex: 1; text-align: center; }
+
/* Empty states */
.empty-state { display: flex; align-items: center; justify-content: center; height: 100%; color: #484f58; font-size: 13px; }
@@ -190,8 +197,6 @@ Agent
Single Agent
Orchestrator-Executor
- Gemini Computer Use
- Yutori Navigator
@@ -280,50 +285,6 @@ Executor
-
-
-
- API Key *
-
-
-
-
- Turn Limit
-
-
-
-
-
-
-
- API Key *
-
-
-
-
- Turn Limit
-
-
-
-
@@ -447,10 +408,13 @@ Grader
-- select --
+
Load File
Save Config
+ Save to Reference
+ Test Run
Run Eval
@@ -558,6 +522,7 @@ Grader
renderTaskList();
updateProgress();
+ updateLoadedConfigUI();
connectSSE();
}
@@ -716,16 +681,70 @@ Grader
}
async function loadSavedConfig(name) {
- if (!name) return;
+ if (!name) {
+ loadedConfigName = null;
+ updateLoadedConfigUI();
+ return;
+ }
try {
const res = await fetch(`/api/config/${encodeURIComponent(name)}`);
if (!res.ok) return;
const cfg = await res.json();
loadedConfigName = name;
fillForm(cfg);
+ updateLoadedConfigUI();
} catch {}
}
+function updateLoadedConfigUI() {
+ const btn = document.getElementById('btn-save-ref');
+ const lbl = document.getElementById('loaded-config-label');
+ if (loadedConfigName) {
+ btn.style.display = '';
+ lbl.textContent = `Loaded: ${loadedConfigName}`;
+ } else {
+ btn.style.display = 'none';
+ lbl.textContent = '';
+ }
+}
+
+async function saveToReferenceConfig() {
+ if (!loadedConfigName) return;
+ const errEl = document.getElementById('config-error');
+ errEl.style.color = '';
+ errEl.textContent = '';
+ // build current form state and PUT it back to the loaded config file
+ const config = buildConfigFromForm();
+ const btn = document.getElementById('btn-save-ref');
+ btn.disabled = true;
+ btn.textContent = 'Saving...';
+ try {
+ const res = await fetch(`/api/config/${encodeURIComponent(loadedConfigName)}`, {
+ method: 'PUT',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(config),
+ });
+ const result = await res.json();
+ if (!res.ok) {
+ const details = result.details ? `\n${result.details.join('\n')}` : '';
+ errEl.textContent = (result.error || 'Failed to save config') + details;
+ return;
+ }
+ // success — show inline confirmation that auto-clears
+ errEl.style.color = '#3fb950';
+ errEl.textContent = `Saved to ${loadedConfigName}`;
+ setTimeout(() => {
+ errEl.style.color = '';
+ errEl.textContent = '';
+ }, 2500);
+ } catch (e) {
+ errEl.textContent = `Network error: ${e.message}`;
+ } finally {
+ btn.disabled = false;
+ btn.textContent = 'Save to Reference';
+ }
+}
+
// Fill form fields from a config object
function fillForm(cfg) {
const agent = cfg.agent || {};
@@ -751,20 +770,6 @@ Grader
setVal('cfg-exec-model', exec.model);
setVal('cfg-exec-apikey', exec.apiKey);
setVal('cfg-exec-baseurl', exec.baseUrl);
- } else if (type === 'gemini-computer-use') {
- setVal('cfg-gemini-apikey', agent.apiKey);
- if (agent.screenSize) {
- setVal('cfg-gemini-width', agent.screenSize.width);
- setVal('cfg-gemini-height', agent.screenSize.height);
- }
- setVal('cfg-gemini-turns', agent.turnLimit);
- } else if (type === 'yutori-navigator') {
- setVal('cfg-yutori-apikey', agent.apiKey);
- if (agent.screenSize) {
- setVal('cfg-yutori-width', agent.screenSize.width);
- setVal('cfg-yutori-height', agent.screenSize.height);
- }
- setVal('cfg-yutori-turns', agent.turnLimit);
}
// Infrastructure
@@ -848,26 +853,6 @@ Grader
baseUrl: getVal('cfg-exec-baseurl') || undefined,
},
};
- } else if (type === 'gemini-computer-use') {
- agent = {
- type: 'gemini-computer-use',
- apiKey: getVal('cfg-gemini-apikey'),
- screenSize: {
- width: parseInt(getVal('cfg-gemini-width'), 10) || 1440,
- height: parseInt(getVal('cfg-gemini-height'), 10) || 900,
- },
- turnLimit: parseInt(getVal('cfg-gemini-turns'), 10) || 30,
- };
- } else if (type === 'yutori-navigator') {
- agent = {
- type: 'yutori-navigator',
- apiKey: getVal('cfg-yutori-apikey'),
- screenSize: {
- width: parseInt(getVal('cfg-yutori-width'), 10) || 1280,
- height: parseInt(getVal('cfg-yutori-height'), 10) || 800,
- },
- turnLimit: parseInt(getVal('cfg-yutori-turns'), 10) || 30,
- };
}
// Dataset: use dropdown value unless custom is selected
@@ -908,8 +893,10 @@ Grader
return config;
}
-async function submitConfig() {
+async function submitConfig(opts = {}) {
+ const isTestRun = opts.testRun === true;
const errEl = document.getElementById('config-error');
+ errEl.style.color = '';
errEl.textContent = '';
const config = buildConfigFromForm();
@@ -920,29 +907,38 @@ Grader
return;
}
- const btn = document.getElementById('btn-run');
- btn.disabled = true;
- btn.textContent = 'Starting...';
+ // disable both buttons during submit; show appropriate loading text
+ const runBtn = document.getElementById('btn-run');
+ const testBtn = document.getElementById('btn-test-run');
+ runBtn.disabled = true;
+ testBtn.disabled = true;
+ if (isTestRun) testBtn.textContent = 'Starting...';
+ else runBtn.textContent = 'Starting...';
try {
const res = await fetch('/api/run', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
- body: JSON.stringify({ config, configName: loadedConfigName }),
+ body: JSON.stringify({ config, configName: loadedConfigName, testRun: isTestRun }),
});
const result = await res.json();
if (!res.ok) {
const details = result.details ? `\n${result.details.join('\n')}` : '';
errEl.textContent = (result.error || 'Failed to start eval') + details;
- btn.disabled = false;
- btn.textContent = 'Run Eval';
+ runBtn.disabled = false;
+ runBtn.textContent = 'Run Eval';
+ testBtn.disabled = false;
+ testBtn.textContent = 'Test Run';
return;
}
// Eval started — collapse panel, refresh state
setConfigPanelOpen(false);
- updateConfigSummary(loadedConfigName || 'dashboard', config.agent.type);
+ const summaryLabel = isTestRun
+ ? `${loadedConfigName || 'dashboard'} (test run)`
+ : (loadedConfigName || 'dashboard');
+ updateConfigSummary(summaryLabel, config.agent.type);
// Re-fetch state to get the loaded tasks
const stateRes = await fetch('/api/state');
@@ -957,8 +953,10 @@ Grader
setEvalRunningUI(true);
} catch (e) {
errEl.textContent = `Network error: ${e.message}`;
- btn.disabled = false;
- btn.textContent = 'Run Eval';
+ runBtn.disabled = false;
+ runBtn.textContent = 'Run Eval';
+ testBtn.disabled = false;
+ testBtn.textContent = 'Test Run';
}
}
@@ -981,16 +979,20 @@ Grader
function setEvalRunningUI(running) {
const stopBtn = document.getElementById('btn-stop');
const runBtn = document.getElementById('btn-run');
+ const testBtn = document.getElementById('btn-test-run');
if (running) {
stopBtn.style.display = '';
stopBtn.disabled = false;
stopBtn.textContent = 'Stop';
runBtn.disabled = true;
runBtn.textContent = 'Running...';
+ testBtn.disabled = true;
} else {
stopBtn.style.display = 'none';
runBtn.disabled = false;
runBtn.textContent = 'Run Eval';
+ testBtn.disabled = false;
+ testBtn.textContent = 'Test Run';
}
}
@@ -1524,6 +1526,11 @@ Grader
// Parse and populate form fields
const cfg = JSON.parse(e.target.result);
fillForm(cfg);
+ // file-picker loads a config from arbitrary path — not a "reference" config,
+ // so disable Save-to-Reference until user picks a saved config from the dropdown
+ loadedConfigName = null;
+ document.getElementById('cfg-load-select').value = '';
+ updateLoadedConfigUI();
document.getElementById('config-error').textContent = '';
} catch (err) {
document.getElementById('config-error').textContent = `Invalid JSON file: ${err.message}`;
diff --git a/packages/browseros-agent/apps/eval/src/dashboard/server.ts b/packages/browseros-agent/apps/eval/src/dashboard/server.ts
index 6f1ff4c14..9c98ec695 100644
--- a/packages/browseros-agent/apps/eval/src/dashboard/server.ts
+++ b/packages/browseros-agent/apps/eval/src/dashboard/server.ts
@@ -361,11 +361,53 @@ app.get('/api/config/:name', async (c) => {
}
})
+// "Save to Reference" endpoint: replaces the contents of a config file that
+// already exists in configsDir. It never creates a new file.
+app.put('/api/config/:name', async (c) => {
+  if (evalRunning) {
+    return c.json({ error: 'Cannot save config while eval is running' }, 409)
+  }
+
+  // Accept only a bare "*.json" file name: no separators, no parent references
+  const name = c.req.param('name')
+  const looksLikeConfigFile =
+    Boolean(name) &&
+    !name.includes('/') &&
+    !name.includes('..') &&
+    name.endsWith('.json')
+  if (!looksLikeConfigFile) {
+    return c.json({ error: 'Invalid config name' }, 400)
+  }
+
+  // Second line of defence: the resolved path must still sit under configsDir
+  const filepath = join(configsDir, name)
+  const staysInsideConfigsDir = resolve(filepath).startsWith(
+    resolve(configsDir),
+  )
+  if (!staysInsideConfigsDir) {
+    return c.json({ error: 'Invalid config name' }, 400)
+  }
+
+  // Overwrite-only: refuse unless the target already exists as a file
+  const fileInfo = await stat(filepath).catch(() => null)
+  if (!fileInfo?.isFile()) {
+    return c.json({ error: 'Config not found' }, 404)
+  }
+
+  // Read the body, then validate it against the same schema as /api/run
+  let payload: unknown
+  try {
+    payload = await c.req.json()
+  } catch {
+    return c.json({ error: 'Invalid JSON body' }, 400)
+  }
+
+  const validation = EvalConfigSchema.safeParse(payload)
+  if (!validation.success) {
+    const details = validation.error.errors.map(
+      (issue) => `${issue.path.join('.')}: ${issue.message}`,
+    )
+    return c.json({ error: 'Config validation failed', details }, 400)
+  }
+
+  // Persist the parsed (schema-normalised) config, pretty-printed with a
+  // trailing newline
+  const serialized = `${JSON.stringify(validation.data, null, 2)}\n`
+  await Bun.write(filepath, serialized)
+  return c.json({ status: 'saved', name })
+})
+
// Start an eval run from the dashboard
app.post('/api/run', async (c) => {
if (evalRunning) return c.json({ error: 'Eval already running' }, 409)
- let body: { config: unknown; configName?: string }
+ let body: { config: unknown; configName?: string; testRun?: boolean }
try {
body = await c.req.json()
} catch {
@@ -382,6 +424,7 @@ app.post('/api/run', async (c) => {
}
const config = parseResult.data
+ const isTestRun = body.testRun === true
// Resolve relative paths from configs/ dir (dataset dropdown values are relative to it)
const baseDir = configsDir
@@ -425,17 +468,26 @@ app.post('/api/run', async (c) => {
return c.json({ error: `Failed to load tasks: ${msg}` }, 400)
}
+ // Test run mode: 1 worker + first task only, to verify setup before a full run
+ if (isTestRun) {
+ if (tasks.length === 0) {
+ return c.json({ error: 'Dataset is empty — nothing to test' }, 400)
+ }
+ tasks = tasks.slice(0, 1)
+ }
+
await mkdir(outputDir, { recursive: true })
// Re-init dashboard state with loaded tasks
const configLabel = body.configName || 'dashboard'
- dashboardState.init(tasks, configLabel, config.agent.type, outputDir)
+ const stateLabel = isTestRun ? `${configLabel} (test run)` : configLabel
+ dashboardState.init(tasks, stateLabel, config.agent.type, outputDir)
const graderOptions = resolveGraderOptions(config)
// Run eval in background — don't await
const executor = new ParallelExecutor({
- numWorkers: config.num_workers || 1,
+ numWorkers: isTestRun ? 1 : config.num_workers || 1,
config,
outputDir,
graderOptions,
@@ -467,7 +519,12 @@ app.post('/api/run', async (c) => {
console.log('\nEval run complete.')
})
- return c.json({ status: 'started', taskCount: tasks.length, outputDir })
+ return c.json({
+ status: 'started',
+ taskCount: tasks.length,
+ outputDir,
+ testRun: isTestRun,
+ })
})
// Stop a running eval
diff --git a/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts b/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
new file mode 100644
index 000000000..8089cc628
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/src/graders/benchmark/agisdk-state-diff.ts
@@ -0,0 +1,202 @@
+import { spawn } from 'node:child_process'
+import { join } from 'node:path'
+import type { GraderResult } from '../../types'
+import { callMcpTool } from '../../utils/mcp-client'
+import type { Grader, GraderInput } from '../types'
+
+/** Path to `scripts/agisdk-evaluate.py`, located three directories above this file. */
+const EVAL_SCRIPT = join(
+  import.meta.dirname,
+  '../../../scripts/agisdk-evaluate.py',
+)
+
+/**
+ * Grader that scores a task from the final state exposed by the clone site's
+ * `/finish` page, delegating the actual scoring to `agisdk-evaluate.py`.
+ *
+ * `grade()` does not throw: every failure is reported as a result with
+ * `score: 0` and `pass: false`.
+ */
+export class AgisdkStateDiffGrader implements Grader {
+  name = 'agisdk_state_diff'
+
+  /**
+   * Grade one task.
+   *
+   * @param input - Supplies the task (`task.query_id`), the message log, the
+   *   agent's final answer and, optionally, the MCP URL to drive the browser.
+   * @returns The evaluator's reward/pass verdict, or a zero-score result whose
+   *   `reasoning` names the stage that failed.
+   */
+  async grade(input: GraderInput): Promise<GraderResult> {
+    const taskId = this.extractTaskId(input.task.query_id)
+    const startUrl = this.extractStartUrl(input)
+    // Prefer the caller-supplied MCP URL, then the environment, then localhost
+    const mcpEndpoint =
+      input.mcpUrl ||
+      `${process.env.BROWSEROS_SERVER_URL || 'http://127.0.0.1:9110'}/mcp`
+
+    if (!startUrl) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: 'Could not determine clone site URL from task',
+      }
+    }
+
+    // new URL() throws on a malformed URL (e.g. a query_id with characters
+    // that are not valid in a host name); report that as a failed grade
+    // instead of letting the exception escape.
+    let origin: string
+    try {
+      origin = new URL(startUrl).origin
+    } catch {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Invalid clone site URL derived from task: ${startUrl}`,
+        details: { error: true },
+      }
+    }
+
+    let envState: Record<string, unknown>
+    try {
+      envState = await this.fetchFinishState(origin, mcpEndpoint)
+    } catch (error) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Failed to fetch /finish endpoint: ${error instanceof Error ? error.message : String(error)}`,
+        details: { origin, error: true },
+      }
+    }
+
+    try {
+      const result = await this.runPythonEvaluator(
+        taskId,
+        envState,
+        input.finalAnswer || '',
+      )
+      return {
+        score: result.reward,
+        pass: result.pass,
+        reasoning:
+          result.message ||
+          (result.pass ? 'All criteria passed' : 'Some criteria failed'),
+        details: {
+          reward: result.reward,
+          per_criterion: result.per_criterion,
+          origin,
+          agisdk_task_id: taskId,
+        },
+      }
+    } catch (error) {
+      return {
+        score: 0,
+        pass: false,
+        reasoning: `Python evaluator error: ${error instanceof Error ? error.message : String(error)}`,
+        details: { error: true },
+      }
+    }
+  }
+
+  /** Strip the leading `agisdk-` prefix from a query id, if present. */
+  private extractTaskId(queryId: string): string {
+    return queryId.replace(/^agisdk-/, '')
+  }
+
+  /**
+   * Work out the clone site's base URL for a task.
+   *
+   * @returns The derived or discovered URL, or `null` when neither the task id
+   *   nor the messages yield one.
+   */
+  private extractStartUrl(input: GraderInput): string | null {
+    // Derive from task_id: "dashdish-10" → "https://evals-dashdish.vercel.app"
+    // Task IDs are "{site}-{number}" where site may contain hyphens (e.g. "fly-unified-5")
+    const taskId = this.extractTaskId(input.task.query_id)
+    const siteId = taskId.replace(/-\d+$/, '')
+    // siteId is empty only when the id is empty or is nothing but a
+    // "-<digits>" suffix, so the message scan below is a rarely-used fallback.
+    if (siteId) return `https://evals-${siteId}.vercel.app`
+
+    // Fallback: search messages for vercel.app URLs
+    for (const msg of input.messages) {
+      const text =
+        msg.type === 'user'
+          ? msg.content
+          : msg.type === 'tool-input-available'
+            ? JSON.stringify(msg.input)
+            : ''
+      const urlMatch = text.match(/https?:\/\/[^\s"']+\.vercel\.app/)
+      if (urlMatch) return urlMatch[0]
+    }
+
+    return null
+  }
+
+  /**
+   * Load `${origin}/finish` in the browser and return the JSON it renders.
+   *
+   * @throws If an MCP call fails, no text content comes back, or the text is
+   *   not valid JSON.
+   */
+  private async fetchFinishState(
+    origin: string,
+    mcpEndpoint: string,
+  ): Promise<Record<string, unknown>> {
+    const finishUrl = `${origin}/finish`
+
+    // Navigate browser to /finish page (state diff is rendered client-side)
+    await callMcpTool(mcpEndpoint, 'navigate_page', {
+      url: finishUrl,
+      page: 1,
+    })
+
+    // Poll inside the page for a <pre> whose text starts with "{": one check
+    // every 500 ms, rejecting after 20 retries (about 10 s).
+    const result = await callMcpTool(mcpEndpoint, 'evaluate_script', {
+      page: 1,
+      expression: `
+        new Promise((resolve, reject) => {
+          let attempts = 0;
+          const check = () => {
+            const pre = document.querySelector('pre');
+            if (pre && pre.textContent.trim().startsWith('{')) {
+              resolve(pre.textContent);
+            } else if (++attempts > 20) {
+              reject(new Error('Timed out waiting for JSON on /finish'));
+            } else {
+              setTimeout(check, 500);
+            }
+          };
+          check();
+        })
+      `,
+    })
+
+    const textContent = result.content?.find(
+      (c: { type: string }) => c.type === 'text',
+    )
+    if (!textContent?.text) {
+      throw new Error('No text content returned from /finish page')
+    }
+
+    return JSON.parse(textContent.text) as Record<string, unknown>
+  }
+
+  /**
+   * Run `python3 agisdk-evaluate.py`, feeding it the task id, environment
+   * state and model response as one JSON document on stdin, and parse the JSON
+   * it prints on stdout.
+   *
+   * Rejects when the process cannot be spawned, exits with a non-zero code
+   * (stderr is included in the message), or prints output that is not JSON.
+   */
+  private runPythonEvaluator(
+    taskId: string,
+    envState: Record<string, unknown>,
+    modelResponse: string,
+  ): Promise<{
+    reward: number
+    pass: boolean
+    message: string
+    per_criterion: unknown[]
+  }> {
+    return new Promise((resolve, reject) => {
+      const proc = spawn('python3', [EVAL_SCRIPT], {
+        stdio: ['pipe', 'pipe', 'pipe'],
+      })
+
+      const inputData = JSON.stringify({
+        task_id: taskId,
+        env_state: envState,
+        model_response: modelResponse,
+      })
+
+      let stdout = ''
+      let stderr = ''
+
+      // Decode at the stream level so a multi-byte UTF-8 character split
+      // across two chunks is reassembled instead of being corrupted.
+      proc.stdout.setEncoding('utf8')
+      proc.stderr.setEncoding('utf8')
+
+      proc.stdout.on('data', (chunk: string) => {
+        stdout += chunk
+      })
+
+      proc.stderr.on('data', (chunk: string) => {
+        stderr += chunk
+      })
+
+      proc.on('close', (code) => {
+        if (code !== 0) {
+          reject(
+            new Error(`Python evaluator exited with code ${code}: ${stderr}`),
+          )
+          return
+        }
+
+        try {
+          const result = JSON.parse(stdout.trim())
+          resolve(result)
+        } catch {
+          reject(new Error(`Failed to parse evaluator output: ${stdout}`))
+        }
+      })
+
+      proc.on('error', (err) => {
+        reject(new Error(`Failed to spawn Python evaluator: ${err.message}`))
+      })
+
+      // If python exits before reading its input, the write fails (EPIPE) and
+      // an unhandled stream 'error' would take the whole process down. The
+      // 'close' handler above already reports the failure with stderr
+      // attached, so the stdin error itself carries nothing new.
+      proc.stdin.on('error', () => {})
+
+      proc.stdin.write(inputData)
+      proc.stdin.end()
+    })
+  }
+}
diff --git a/packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts b/packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
new file mode 100644
index 000000000..63afcaf59
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/src/graders/benchmark/infinity-state.ts
@@ -0,0 +1,146 @@
+import { join, resolve } from 'node:path'
+import type { GraderResult } from '../../types'
+import type { Grader, GraderInput } from '../types'
+
+/** JSON payload written to the evaluator script's stdin. */
+interface InfinityEvalInput {
+  app_server_url: string
+  verifier_path: string
+  task_id: string
+}
+
+/** JSON object the evaluator script is expected to print on stdout. */
+interface InfinityEvalOutput {
+  pass: boolean
+  reward: number
+  message: string
+}
+
+/** Evaluator script path, resolved relative to this module's directory. */
+const EVAL_SCRIPT = resolve(
+  import.meta.dir,
+  '../../../scripts/infinity-evaluate.py',
+)
+
+/**
+ * Grades WebArena-Infinity tasks by running the task's Python verifier
+ * (through the evaluator script) against the app server.
+ *
+ * An unparseable query_id, a missing app URL, a missing
+ * WEBARENA_INFINITY_DIR and an evaluator failure are all reported as a
+ * failing GraderResult instead of being thrown.
+ */
+export class InfinityStateGrader implements Grader {
+  name = 'infinity_state'
+
+  /** Scores the task 1 when the verifier reports a pass, otherwise 0. */
+  async grade(input: GraderInput): Promise<GraderResult> {
+    const queryId = input.task.query_id
+    const parsed = this.parseQueryId(queryId)
+    if (!parsed) {
+      return this.fail(
+        `Cannot parse query_id "${queryId}" — expected format: infinity-{app}-{task_id}`,
+      )
+    }
+
+    const appServerUrl = this.resolveAppServerUrl(input)
+    if (!appServerUrl) return this.fail('Cannot determine app server URL')
+
+    const infinityDir = process.env.WEBARENA_INFINITY_DIR
+    if (!infinityDir) {
+      return this.fail(
+        'WEBARENA_INFINITY_DIR env var not set. Point it to the webarena-infinity repo root.',
+      )
+    }
+
+    const evalInput: InfinityEvalInput = {
+      app_server_url: appServerUrl,
+      verifier_path: join(
+        infinityDir,
+        'apps',
+        parsed.appName,
+        'real-tasks',
+        `${parsed.taskId}.py`,
+      ),
+      task_id: queryId,
+    }
+
+    try {
+      const result = await this.runPythonEvaluator(evalInput)
+      return {
+        score: result.pass ? 1 : 0,
+        pass: result.pass,
+        reasoning: result.message,
+        details: {
+          reward: result.reward,
+          app_name: parsed.appName,
+          app_server_url: appServerUrl,
+        },
+      }
+    } catch (error) {
+      const detail = error instanceof Error ? error.message : String(error)
+      return this.fail(`Evaluator process error: ${detail}`)
+    }
+  }
+
+  /** Builds the zero-score result shared by every failure path. */
+  private fail(reasoning: string): GraderResult {
+    return { score: 0, pass: false, reasoning }
+  }
+
+  /**
+   * Splits "infinity-{app}-{task_id}" into its parts. Task IDs start with
+   * "task_" and app names may contain hyphens,
+   * e.g. "infinity-elation-prescriptions-task_h69".
+   */
+  private parseQueryId(
+    queryId: string,
+  ): { appName: string; taskId: string } | null {
+    const match = queryId.match(/^infinity-(.+)-(task_.+)$/)
+    if (!match) return null
+    return { appName: match[1], taskId: match[2] }
+  }
+
+  /** Picks the app URL: the one on the input first, then INFINITY_APP_URL. */
+  private resolveAppServerUrl(input: GraderInput): string | null {
+    // Passed directly from task executor (started by InfinityAppManager)
+    if (input.infinityAppUrl) return input.infinityAppUrl
+    // Fallback: env var for manual testing
+    return process.env.INFINITY_APP_URL || null
+  }
+
+  /**
+   * Pipes evalInput as JSON to the evaluator script and parses its stdout.
+   * Throws on a non-zero exit code or when stdout is not valid JSON.
+   */
+  private async runPythonEvaluator(
+    evalInput: InfinityEvalInput,
+  ): Promise<InfinityEvalOutput> {
+    const proc = Bun.spawn(['python3', EVAL_SCRIPT], {
+      stdin: 'pipe',
+      stdout: 'pipe',
+      stderr: 'pipe',
+    })
+    proc.stdin.write(JSON.stringify(evalInput))
+    proc.stdin.end()
+
+    // Drain both pipes concurrently: reading stdout to EOF before touching
+    // stderr deadlocks once the child fills the stderr pipe buffer.
+    const [stdout, stderr, exitCode] = await Promise.all([
+      new Response(proc.stdout).text(),
+      new Response(proc.stderr).text(),
+      proc.exited,
+    ])
+    if (exitCode !== 0) {
+      throw new Error(
+        `Python evaluator exited with code ${exitCode}: ${stderr || stdout}`,
+      )
+    }
+
+    try {
+      return JSON.parse(stdout.trim()) as InfinityEvalOutput
+    } catch {
+      throw new Error(`Failed to parse evaluator output: ${stdout}`)
+    }
+  }
+}
diff --git a/packages/browseros-agent/apps/eval/src/graders/registry.ts b/packages/browseros-agent/apps/eval/src/graders/registry.ts
index 678fff915..62bd2aed0 100644
--- a/packages/browseros-agent/apps/eval/src/graders/registry.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/registry.ts
@@ -1,4 +1,6 @@
import type { GraderResult } from '../types'
+import { AgisdkStateDiffGrader } from './benchmark/agisdk-state-diff'
+import { InfinityStateGrader } from './benchmark/infinity-state'
import { Mind2WebJudgeGrader } from './benchmark/mind2web'
import { WebVoyagerGrader } from './benchmark/webvoyager'
import { FaraAlignmentGrader } from './fara/alignment'
@@ -19,7 +21,13 @@ export function createGrader(
options: GraderOptions | null,
): Grader | null {
switch (name) {
- // Benchmark graders
+ // Deterministic benchmark graders (no LLM judge)
+ case 'agisdk_state_diff':
+ return new AgisdkStateDiffGrader()
+ case 'infinity_state':
+ return new InfinityStateGrader()
+
+ // LLM-based benchmark graders
case 'webvoyager_grader':
if (!options?.apiKey) return null
return new WebVoyagerGrader(
@@ -107,10 +115,12 @@ export async function runGraders(
// Export grader classes for direct use
export {
+ AgisdkStateDiffGrader,
FaraAlignmentGrader,
FaraCombinedGrader,
FaraMultimodalGrader,
FaraRubricGrader,
+ InfinityStateGrader,
Mind2WebJudgeGrader,
PerformanceGrader,
WebVoyagerGrader,
diff --git a/packages/browseros-agent/apps/eval/src/graders/types.ts b/packages/browseros-agent/apps/eval/src/graders/types.ts
index 3197a602e..f1451b1d6 100644
--- a/packages/browseros-agent/apps/eval/src/graders/types.ts
+++ b/packages/browseros-agent/apps/eval/src/graders/types.ts
@@ -11,6 +11,8 @@ export interface GraderInput {
finalAnswer: string | null
expectedAnswer?: string | null
outputDir: string
+ mcpUrl?: string
+ infinityAppUrl?: string
}
export interface Grader {
diff --git a/packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts b/packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
new file mode 100644
index 000000000..f3f8df4f4
--- /dev/null
+++ b/packages/browseros-agent/apps/eval/src/runner/infinity-app-manager.ts
@@ -0,0 +1,144 @@
+/**
+ * Manages WebArena-Infinity app server lifecycle per task.
+ *
+ * Each worker gets a unique port: base_port + worker_index.
+ * Server is started fresh before each task and killed after,
+ * guaranteeing clean state.
+ */
+
+import { type ChildProcess, spawn } from 'node:child_process'
+import { join } from 'node:path'
+
+export class InfinityAppManager {
+  private proc: ChildProcess | null = null
+  private port: number
+  private infinityDir: string
+
+  /**
+   * @param workerIndex Offset added to basePort to pick this worker's port.
+   * @param basePort First port of the range (defaults to 8000).
+   */
+  constructor(
+    private workerIndex: number,
+    private basePort: number = 8000,
+  ) {
+    this.port = basePort + workerIndex
+    this.infinityDir = process.env.WEBARENA_INFINITY_DIR || ''
+  }
+
+  /**
+   * Stops any previous server, spawns apps/<appName>/server.py on this
+   * manager's port and resolves with the server URL once it answers HTTP.
+   *
+   * @throws If WEBARENA_INFINITY_DIR is unset, or if the server exits or
+   *   never becomes ready; the child is stopped before the error propagates.
+   */
+  async startApp(appName: string): Promise<string> {
+    await this.stop()
+
+    if (!this.infinityDir) {
+      throw new Error('WEBARENA_INFINITY_DIR env var not set')
+    }
+
+    const appDir = join(this.infinityDir, 'apps', appName)
+    const child = spawn(
+      'python3',
+      [join(appDir, 'server.py'), '--port', String(this.port)],
+      { stdio: ['ignore', 'pipe', 'pipe'], cwd: appDir },
+    )
+    // A failed spawn (e.g. python3 missing) emits 'error'; with no listener
+    // Node throws it as an uncaught exception. waitForReady reports it instead.
+    child.on('error', () => {})
+    // Nobody reads the server's output, so keep both pipes flowing; otherwise
+    // the server blocks once the OS pipe buffer fills up.
+    child.stdout?.resume()
+    child.stderr?.resume()
+    this.proc = child
+
+    const url = this.getUrl()
+    try {
+      await this.waitForReady(url)
+    } catch (error) {
+      // Do not leave a half-started server running when startup fails.
+      await this.stop()
+      throw error
+    }
+    return url
+  }
+
+  /**
+   * Stops the server if one is running: SIGTERM first, then SIGKILL if it
+   * has not exited within 3 seconds. Safe to call when nothing is running.
+   */
+  async stop(): Promise<void> {
+    const child = this.proc
+    this.proc = null
+    // Detach first so overlapping calls cannot signal the same child twice;
+    // a child that already exited would never emit 'exit' again.
+    if (!child || !InfinityAppManager.isAlive(child)) return
+
+    await new Promise<void>((resolve) => {
+      const timeout = setTimeout(() => {
+        child.kill('SIGKILL')
+        resolve()
+      }, 3000)
+      child.once('exit', () => {
+        clearTimeout(timeout)
+        resolve()
+      })
+      child.kill('SIGTERM')
+    })
+  }
+
+  /** Returns the port this manager's server listens on. */
+  getPort(): number {
+    return this.port
+  }
+
+  /** Returns the base URL (http://localhost:<port>) of this manager's server. */
+  getUrl(): string {
+    return `http://localhost:${this.port}`
+  }
+
+  /** True while the child was spawned successfully and has not exited. */
+  private static isAlive(child: ChildProcess): boolean {
+    return (
+      child.pid !== undefined &&
+      child.exitCode === null &&
+      child.signalCode === null
+    )
+  }
+
+  /**
+   * Polls url until it returns a 2xx response.
+   *
+   * @throws If the child is not running, or after maxAttempts polls spaced
+   *   intervalMs apart (each request is aborted after 2 seconds).
+   */
+  private async waitForReady(
+    url: string,
+    maxAttempts = 30,
+    intervalMs = 500,
+  ): Promise<void> {
+    for (let i = 0; i < maxAttempts; i++) {
+      // Fail fast instead of polling a server that can no longer come up.
+      if (!this.proc || !InfinityAppManager.isAlive(this.proc)) {
+        throw new Error(
+          `Infinity app server failed to start or exited early on port ${this.port}`,
+        )
+      }
+      try {
+        const resp = await fetch(url, {
+          signal: AbortSignal.timeout(2000),
+        })
+        if (resp.ok) return
+      } catch {
+        // Server not ready yet
+      }
+      await new Promise((r) => setTimeout(r, intervalMs))
+    }
+    throw new Error(
+      `Infinity app server not ready after ${maxAttempts * intervalMs}ms on port ${this.port}`,
+    )
+  }
+}
diff --git a/packages/browseros-agent/apps/eval/src/runner/task-executor.ts b/packages/browseros-agent/apps/eval/src/runner/task-executor.ts
index aa779d06e..b99fc49e9 100644
--- a/packages/browseros-agent/apps/eval/src/runner/task-executor.ts
+++ b/packages/browseros-agent/apps/eval/src/runner/task-executor.ts
@@ -9,6 +9,7 @@ import {
import { runGraders } from '../graders/registry'
import type { ErrorSource, EvalConfig, GraderResult, Task } from '../types'
import { callMcpTool } from '../utils/mcp-client'
+import { InfinityAppManager } from './infinity-app-manager'
import type { GraderOptions, TaskResult } from './types'
// ============================================================================
@@ -101,6 +102,36 @@ export class TaskExecutor {
// Resolve page ID once — fresh browser has exactly one page
const pageId = await this.resolveInitialPageId(mcpUrl)
+ // For Infinity tasks, start a fresh app server per task
+ let infinityManager: InfinityAppManager | null = null
+ let actualStartUrl = task.start_url
+
+ if (task.dataset === 'webarena-infinity') {
+ const appName = (task.metadata?.additional as Record)
+ ?.app_name as string
+ const appBasePort =
+ ((task.metadata?.additional as Record)
+ ?.app_base_port as number) || 8000
+ const workerIndex = this.config.browseros.base_server_port - 9110 // derive from port offset
+
+ if (appName && process.env.WEBARENA_INFINITY_DIR) {
+ infinityManager = new InfinityAppManager(workerIndex, appBasePort)
+ try {
+ actualStartUrl = await infinityManager.startApp(appName)
+ console.log(
+ ` Infinity app "${appName}" started on port ${infinityManager.getPort()}`,
+ )
+ } catch (error) {
+ throw new TaskExecutionError(
+ `Failed to start Infinity app: ${error instanceof Error ? error.message : String(error)}`,
+ task,
+ 'navigation',
+ error instanceof Error ? error : undefined,
+ )
+ }
+ }
+ }
+
try {
// Phase 1: Set viewport + navigate to start URL
try {
@@ -114,10 +145,10 @@ export class TaskExecutor {
)
}
- if (task.start_url && task.start_url !== 'about:blank') {
+ if (actualStartUrl && actualStartUrl !== 'about:blank') {
try {
await callMcpTool(mcpUrl, 'navigate_page', {
- url: task.start_url,
+ url: actualStartUrl,
page: pageId,
})
} catch (error) {
@@ -134,7 +165,11 @@ export class TaskExecutor {
const agentResult = await this.executeAgent(task, pageId)
// Phase 3: Run graders
- const graderResults = await this.runGraders(task, agentResult)
+ const graderResults = await this.runGraders(
+ task,
+ agentResult,
+ infinityManager?.getUrl(),
+ )
const status =
agentResult.metadata.termination_reason === 'timeout'
@@ -169,6 +204,11 @@ export class TaskExecutor {
} catch {
// Ignore cleanup errors
}
+
+ // Stop Infinity app server if running
+ if (infinityManager) {
+ await infinityManager.stop().catch(() => {})
+ }
}
}
@@ -209,6 +249,7 @@ export class TaskExecutor {
private async runGraders(
task: Task,
agentResult: AgentResult,
+ infinityAppUrl?: string,
): Promise> {
const configGraders = this.config.graders ?? []
const taskGraders = task.graders ?? []
@@ -234,6 +275,8 @@ export class TaskExecutor {
expectedAnswer: (task.metadata?.additional as Record)
?.answer as string | undefined,
outputDir: join(this.outputDir, task.query_id),
+ mcpUrl: `${this.config.browseros.server_url}/mcp`,
+ infinityAppUrl,
},
this.deps.graderOptions,
)
diff --git a/packages/browseros-agent/apps/eval/src/runner/types.ts b/packages/browseros-agent/apps/eval/src/runner/types.ts
index 48059bd9f..ee92b3209 100644
--- a/packages/browseros-agent/apps/eval/src/runner/types.ts
+++ b/packages/browseros-agent/apps/eval/src/runner/types.ts
@@ -100,6 +100,8 @@ export interface TaskResultSummary {
// ============================================================================
export const PASS_FAIL_GRADER_ORDER = [
+ 'agisdk_state_diff',
+ 'infinity_state',
'performance_grader',
'webvoyager_grader',
'fara_combined',
diff --git a/packages/browseros-agent/apps/eval/src/types/config.ts b/packages/browseros-agent/apps/eval/src/types/config.ts
index bf181535b..b2e9ff387 100644
--- a/packages/browseros-agent/apps/eval/src/types/config.ts
+++ b/packages/browseros-agent/apps/eval/src/types/config.ts
@@ -19,37 +19,9 @@ export const OrchestratorExecutorConfigSchema = z.object({
}),
})
-export const GeminiComputerUseConfigSchema = z.object({
- type: z.literal('gemini-computer-use'),
- apiKey: z
- .string()
- .describe('API key or env var name (e.g., GOOGLE_AI_API_KEY)'),
- screenSize: z
- .object({
- width: z.number().int().min(800).max(2560).default(1440),
- height: z.number().int().min(600).max(1440).default(900),
- })
- .optional(),
- turnLimit: z.number().int().min(1).max(100).default(30),
-})
-
-export const YutoriNavigatorConfigSchema = z.object({
- type: z.literal('yutori-navigator'),
- apiKey: z.string().describe('API key or env var name (e.g., YUTORI_API_KEY)'),
- screenSize: z
- .object({
- width: z.number().int().min(800).max(2560).default(1280),
- height: z.number().int().min(600).max(1440).default(800),
- })
- .optional(),
- turnLimit: z.number().int().min(1).max(100).default(30),
-})
-
export const AgentConfigSchema = z.discriminatedUnion('type', [
SingleAgentConfigSchema,
OrchestratorExecutorConfigSchema,
- GeminiComputerUseConfigSchema,
- YutoriNavigatorConfigSchema,
])
export const EvalConfigSchema = z.object({
@@ -84,9 +56,5 @@ export type SingleAgentConfig = z.infer
export type OrchestratorExecutorConfig = z.infer<
typeof OrchestratorExecutorConfigSchema
>
-export type GeminiComputerUseConfig = z.infer<
- typeof GeminiComputerUseConfigSchema
->
-export type YutoriNavigatorConfig = z.infer
export type AgentConfig = z.infer
export type EvalConfig = z.infer
diff --git a/packages/browseros-agent/apps/eval/src/types/index.ts b/packages/browseros-agent/apps/eval/src/types/index.ts
index e55f73a71..84c23a3c4 100644
--- a/packages/browseros-agent/apps/eval/src/types/index.ts
+++ b/packages/browseros-agent/apps/eval/src/types/index.ts
@@ -4,14 +4,10 @@ export {
AgentConfigSchema,
type EvalConfig,
EvalConfigSchema,
- type GeminiComputerUseConfig,
- GeminiComputerUseConfigSchema,
type OrchestratorExecutorConfig,
OrchestratorExecutorConfigSchema,
type SingleAgentConfig,
SingleAgentConfigSchema,
- type YutoriNavigatorConfig,
- YutoriNavigatorConfigSchema,
} from './config'
// Error types
export {
diff --git a/packages/browseros-agent/apps/eval/src/types/result.ts b/packages/browseros-agent/apps/eval/src/types/result.ts
index 29fa3f5c9..5678f8a3e 100644
--- a/packages/browseros-agent/apps/eval/src/types/result.ts
+++ b/packages/browseros-agent/apps/eval/src/types/result.ts
@@ -13,12 +13,7 @@ export const GraderResultSchema = z.object({
// Agent config in metadata
const AgentConfigMetaSchema = z
.object({
- type: z.enum([
- 'single',
- 'orchestrator-executor',
- 'gemini-computer-use',
- 'yutori-navigator',
- ]),
+ type: z.enum(['single', 'orchestrator-executor']),
model: z.string().optional(),
})
.passthrough()
diff --git a/packages/browseros-agent/apps/eval/src/utils/config-validator.ts b/packages/browseros-agent/apps/eval/src/utils/config-validator.ts
index 8b68ea843..d41a62288 100644
--- a/packages/browseros-agent/apps/eval/src/utils/config-validator.ts
+++ b/packages/browseros-agent/apps/eval/src/utils/config-validator.ts
@@ -70,13 +70,6 @@ export async function validateConfig(
envVarsToCheck.push(config.agent.executor.apiKey)
}
}
- } else if (config.agent.type === 'gemini-computer-use') {
- // Gemini Computer Use agent
- if (config.agent.apiKey) {
- if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.apiKey)) {
- envVarsToCheck.push(config.agent.apiKey)
- }
- }
}
// Grader API key is checked at runtime - just warn if not set
diff --git a/packages/browseros-agent/apps/server/src/agent/session-store.ts b/packages/browseros-agent/apps/server/src/agent/session-store.ts
index 12b0848d9..9000cbf81 100644
--- a/packages/browseros-agent/apps/server/src/agent/session-store.ts
+++ b/packages/browseros-agent/apps/server/src/agent/session-store.ts
@@ -11,6 +11,8 @@ export interface AgentSession {
mcpServerKey?: string
/** Workspace directory when the session was created, for change detection. */
workingDir?: string
+ /** LLM config used when the session was created, for provider/model changes. */
+ llmConfigKey?: string
}
export class SessionStore {
diff --git a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts
index e919c72a6..5afc5618a 100644
--- a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts
+++ b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts
@@ -65,6 +65,7 @@ export class ChatService {
declinedApps: request.declinedApps,
browserosId: this.deps.browserosId,
}
+ const llmConfigKey = this.buildLlmConfigKey(agentConfig)
let session = sessionStore.get(request.conversationId)
let isNewSession = false
@@ -144,6 +145,24 @@ export class ChatService {
}
}
+ // Detect provider/model/auth change mid-conversation -> rebuild session.
+ // The AI SDK agent captures the language model at construction time, so a
+ // reused session would keep calling the previous provider.
+ if (session && session.llmConfigKey !== llmConfigKey) {
+ logger.info('LLM config changed mid-conversation, rebuilding session', {
+ conversationId: request.conversationId,
+ provider: agentConfig.provider,
+ model: agentConfig.model,
+ })
+ session = await this.rebuildSession(
+ session,
+ request,
+ agentConfig,
+ mcpServerKey,
+ llmConfigKey,
+ )
+ }
+
if (!session) {
isNewSession = true
let hiddenPageId: number | undefined
@@ -209,6 +228,7 @@ export class ChatService {
browserContext,
mcpServerKey,
workingDir: request.userWorkingDir,
+ llmConfigKey,
}
sessionStore.set(request.conversationId, session)
}
@@ -341,6 +361,7 @@ export class ChatService {
request: ChatRequest,
agentConfig: ResolvedAgentConfig,
mcpServerKey: string,
+ llmConfigKey = this.buildLlmConfigKey(agentConfig),
): Promise {
const previousMessages = session.agent.messages
await session.agent.dispose()
@@ -365,6 +386,7 @@ export class ChatService {
browserContext,
mcpServerKey,
workingDir: request.userWorkingDir,
+ llmConfigKey,
}
newSession.agent.messages = sanitizeMessagesForToolset(
previousMessages,
@@ -374,6 +396,26 @@ export class ChatService {
return newSession
}
+  /** Serializes the LLM settings whose change forces a session rebuild. */
+  private buildLlmConfigKey(config: ResolvedAgentConfig): string {
+    // Listed in serialization order; unset fields are dropped by stringify.
+    const keys = [
+      'provider',
+      'model',
+      'apiKey',
+      'baseUrl',
+      'upstreamProvider',
+      'resourceName',
+      'region',
+      'accessKeyId', 'secretAccessKey', 'sessionToken',
+      'accountId',
+      'reasoningEffort', 'reasoningSummary',
+      'contextWindowSize',
+      'supportsImages',
+    ] as const
+    return JSON.stringify(Object.fromEntries(keys.map((k) => [k, config[k]])))
+  }
+
private buildMcpServerKey(browserContext?: BrowserContext): string {
const managed = browserContext?.enabledMcpServers?.slice().sort() ?? []
const custom =
diff --git a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts
index 106752b8d..36f05649b 100644
--- a/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts
+++ b/packages/browseros-agent/apps/server/src/lib/clients/llm/provider.ts
@@ -148,6 +148,16 @@ function createMoonshotModel(config: ResolvedLLMConfig): LanguageModel {
})(config.model)
}
+/**
+ * Builds the MiniMax model via the OpenAI-compatible client.
+ */
+function createMinimaxModel(config: ResolvedLLMConfig): LanguageModel {
+  const { baseUrl: baseURL, apiKey, model } = config
+  if (!baseURL) throw new Error('Minimax provider requires baseUrl')
+  if (!apiKey) throw new Error('Minimax provider requires apiKey')
+  return createOpenAICompatible({ name: 'minimax', baseURL, apiKey })(model)
+}
+
function createQwenCodeModel(config: ResolvedLLMConfig): LanguageModel {
if (!config.apiKey) throw new Error('Qwen Code requires OAuth authentication')
return createOpenAICompatible({
@@ -192,6 +202,7 @@ const PROVIDER_FACTORIES: Record = {
[LLM_PROVIDERS.CHATGPT_PRO]: createChatGPTProModel,
[LLM_PROVIDERS.GITHUB_COPILOT]: createGitHubCopilotModel,
[LLM_PROVIDERS.QWEN_CODE]: createQwenCodeModel,
+ [LLM_PROVIDERS.MINIMAX]: createMinimaxModel,
}
export function createLLMProvider(config: ResolvedLLMConfig): LanguageModel {
diff --git a/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts b/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts
index b4bd0e6c8..83b1ea319 100644
--- a/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts
+++ b/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts
@@ -44,11 +44,19 @@ const createAgentUIStreamResponseSpy = mock(
},
)
-const resolveLLMConfigSpy = mock(async () => ({
- provider: 'openai',
- model: 'gpt-5',
- apiKey: 'test-key',
-}))
+const resolveLLMConfigSpy = mock(
+ async (config: {
+ provider?: string
+ model?: string
+ apiKey?: string
+ baseUrl?: string
+ }) => ({
+ provider: config.provider ?? 'openai',
+ model: config.model ?? 'gpt-5',
+ apiKey: config.apiKey ?? 'test-key',
+ baseUrl: config.baseUrl,
+ }),
+)
mock.module('ai', () => ({
createAgentUIStreamResponse: createAgentUIStreamResponseSpy,
@@ -288,4 +296,65 @@ describe('ChatService scheduled task hidden page lifecycle', () => {
})
expect(browser.closePage).toHaveBeenCalledWith(88)
})
+
+ it('rebuilds an existing session when the LLM provider changes', async () => {
+ const firstAgent = createFakeAgent()
+ agentToReturn = firstAgent
+ streamResponseHandler = async ({ onFinish }) => {
+ await onFinish({ messages: agentToReturn?.messages ?? [] })
+ return new Response('ok')
+ }
+
+ const browser = {
+ resolveTabIds: mock(async () => new Map()),
+ }
+ const sessionStore = createSessionStore()
+ const service = new ChatService({
+ sessionStore: sessionStore as never,
+ klavisClient: {} as never,
+ browser: browser as never,
+ registry: {} as never,
+ })
+ const conversationId = crypto.randomUUID()
+ const createCallsBefore = createAgentSpy.mock.calls.length
+
+ await service.processMessage(
+ {
+ conversationId,
+ message: 'First message',
+ provider: 'browseros',
+ model: 'browseros-auto',
+ mode: 'agent',
+ origin: 'sidepanel',
+ } as never,
+ new AbortController().signal,
+ )
+
+ const secondAgent = createFakeAgent()
+ agentToReturn = secondAgent
+
+ await service.processMessage(
+ {
+ conversationId,
+ message: 'Second message',
+ provider: 'chatgpt-pro',
+ model: 'gpt-5.3-codex',
+ mode: 'agent',
+ origin: 'sidepanel',
+ } as never,
+ new AbortController().signal,
+ )
+
+ expect(createAgentSpy.mock.calls.length).toBe(createCallsBefore + 2)
+ expect(firstAgent.dispose).toHaveBeenCalledTimes(1)
+ expect(sessionStore.get(conversationId)?.agent).toBe(secondAgent)
+
+ const latestCreateArgs = createAgentSpy.mock.calls.at(-1)?.[0] as {
+ resolvedConfig: { provider: string; model: string }
+ }
+ expect(latestCreateArgs.resolvedConfig).toMatchObject({
+ provider: 'chatgpt-pro',
+ model: 'gpt-5.3-codex',
+ })
+ })
})
diff --git a/packages/browseros-agent/bun.lock b/packages/browseros-agent/bun.lock
index a29ff8de4..c8b268c9b 100644
--- a/packages/browseros-agent/bun.lock
+++ b/packages/browseros-agent/bun.lock
@@ -27,6 +27,7 @@
"dependencies": {
"@ai-sdk/react": "^3.0.96",
"@browseros/server": "workspace:*",
+ "@browseros/shared": "workspace:*",
"@hookform/resolvers": "^5.2.2",
"@lobehub/icons": "^2.44.0",
"@mdxeditor/editor": "^3.52.4",
@@ -134,8 +135,6 @@
"@aws-sdk/client-s3": "^3.1014.0",
"@browseros/server": "workspace:*",
"@browseros/shared": "workspace:*",
- "@google/gemini-cli-core": "^0.16.0",
- "@google/genai": "1.30.0",
"@modelcontextprotocol/sdk": "^1.25.2",
"ai": "^6.0.94",
"hono": "^4.6.0",
@@ -2210,7 +2209,7 @@
"chrome-devtools-frontend": ["chrome-devtools-frontend@1.0.1577886", "", {}, "sha512-B9hY3o/0RuVCDWNYh9YnkEbRrPUMCY+NaOgBxvZRzGvqbGSMNckkVSdO67SwWR8bm4fo/qplXbUj0cSr229V6w=="],
- "chrome-devtools-mcp": ["chrome-devtools-mcp@0.20.3", "", { "bin": { "chrome-devtools-mcp": "build/src/bin/chrome-devtools-mcp.js", "chrome-devtools": "build/src/bin/chrome-devtools.js" } }, "sha512-6MlNKlKa+J1FX9w4SUnFERF4MRGWLlrnZvIJGhhsuuMPM7qUG0F4SwheRyjwl0+tsTemxMCBHiib8mXkg5j6og=="],
+ "chrome-devtools-mcp": ["chrome-devtools-mcp@0.21.0", "", { "bin": { "chrome-devtools-mcp": "build/src/bin/chrome-devtools-mcp.js", "chrome-devtools": "build/src/bin/chrome-devtools.js" } }, "sha512-d+iqrRmcwpRFV3Q4DRCF2LCoq+WCRU3GhISKQ9v8g+1C2Uh8upj3urkjxNO4QIjhBMIYei/VQ1OQLFceby80Og=="],
"chrome-launcher": ["chrome-launcher@1.2.0", "", { "dependencies": { "@types/node": "*", "escape-string-regexp": "^4.0.0", "is-wsl": "^2.2.0", "lighthouse-logger": "^2.0.1" }, "bin": { "print-chrome-path": "bin/print-chrome-path.cjs" } }, "sha512-JbuGuBNss258bvGil7FT4HKdC3SC2K7UAEUqiPy3ACS3Yxo3hAW6bvFpCu2HsIJLgTqxgEX6BkujvzZfLpUD0Q=="],
diff --git a/packages/browseros-agent/packages/shared/src/constants/limits.ts b/packages/browseros-agent/packages/shared/src/constants/limits.ts
index 707cccc61..4626be278 100644
--- a/packages/browseros-agent/packages/shared/src/constants/limits.ts
+++ b/packages/browseros-agent/packages/shared/src/constants/limits.ts
@@ -80,3 +80,8 @@ export const CONTENT_LIMITS = {
CONSOLE_DEFAULT_LIMIT: 50,
CONSOLE_MAX_LIMIT: 200,
} as const
+
+export const REFERRAL_LIMITS = {
+ MAX_DAILY_CREDITS: 500,
+ CREDITS_PER_REFERRAL: 200,
+} as const
diff --git a/packages/browseros-agent/packages/shared/src/constants/urls.ts b/packages/browseros-agent/packages/shared/src/constants/urls.ts
index e762e1949..8e25e6f23 100644
--- a/packages/browseros-agent/packages/shared/src/constants/urls.ts
+++ b/packages/browseros-agent/packages/shared/src/constants/urls.ts
@@ -19,4 +19,6 @@ export const EXTERNAL_URLS = {
QWEN_DEVICE_CODE: 'https://chat.qwen.ai/api/v1/oauth2/device/code',
QWEN_OAUTH_TOKEN: 'https://chat.qwen.ai/api/v1/oauth2/token',
QWEN_CODE_API: 'https://portal.qwen.ai/v1',
+ REFERRAL_SERVICE: 'https://browseros-referral.fly.dev',
+ CREDITS_GATEWAY: 'https://llm.browseros.com',
} as const
diff --git a/packages/browseros-agent/packages/shared/src/schemas/llm.ts b/packages/browseros-agent/packages/shared/src/schemas/llm.ts
index 45e7cc029..b93d582d2 100644
--- a/packages/browseros-agent/packages/shared/src/schemas/llm.ts
+++ b/packages/browseros-agent/packages/shared/src/schemas/llm.ts
@@ -27,6 +27,7 @@ export const LLM_PROVIDERS = {
CHATGPT_PRO: 'chatgpt-pro',
GITHUB_COPILOT: 'github-copilot',
QWEN_CODE: 'qwen-code',
+ MINIMAX: 'minimax',
} as const
/**
@@ -48,6 +49,7 @@ export const LLMProviderSchema: z.ZodEnum<
'chatgpt-pro',
'github-copilot',
'qwen-code',
+ 'minimax',
]
> = z.enum([
LLM_PROVIDERS.ANTHROPIC,
@@ -64,6 +66,7 @@ export const LLMProviderSchema: z.ZodEnum<
LLM_PROVIDERS.CHATGPT_PRO,
LLM_PROVIDERS.GITHUB_COPILOT,
LLM_PROVIDERS.QWEN_CODE,
+ LLM_PROVIDERS.MINIMAX,
])
export type LLMProvider = z.infer