From b6d6d4eb1dafb75577d916b210b15d6e1b1d4688 Mon Sep 17 00:00:00 2001 From: Felarof Date: Thu, 16 Apr 2026 15:25:04 -0700 Subject: [PATCH 01/11] feat: Twitter share referral UI for credit rewards (#729) * feat: add Twitter share referral UI and expose browserosId When credits are exhausted, users now see a "Share on Twitter" CTA with a pre-filled tweet URL and an input to paste their tweet link. Reusable ShareForCredits component used in both ChatError and UsagePage. Server's GET /credits now includes browserosId for the extension to pass to the referral service. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: rebuild chat session on provider change * fix: address Greptile review comments - Move referral service URL to EXTERNAL_URLS - Guard submitReferral on !response.ok - Remove stale TODO comment Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .../components/referral/ShareForCredits.tsx | 117 ++++++++++++++++++ .../agent/entrypoints/app/usage/UsagePage.tsx | 20 +-- .../entrypoints/sidepanel/index/ChatError.tsx | 26 ++-- .../apps/agent/lib/credits/useCredits.ts | 1 + .../agent/lib/referral/submit-referral.ts | 33 +++++ .../apps/server/src/agent/session-store.ts | 2 + .../apps/server/src/api/routes/credits.ts | 2 +- .../server/src/api/services/chat-service.ts | 42 +++++++ .../tests/api/services/chat-service.test.ts | 79 +++++++++++- .../packages/shared/src/constants/urls.ts | 1 + 10 files changed, 294 insertions(+), 29 deletions(-) create mode 100644 packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx create mode 100644 packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts diff --git a/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx new file mode 100644 index 000000000..fcd1ac4ff --- /dev/null +++ b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx @@ -0,0 +1,117 @@ +import { ExternalLink, Loader2, Send } from 'lucide-react' +import type { FC } from 'react' +import { useState } from 'react' +import { Button } from '@/components/ui/button' +import { Input } from '@/components/ui/input' +import { useCredits, useInvalidateCredits } from '@/lib/credits/useCredits' +import { + getShareOnTwitterUrl, + submitReferral, +} from '@/lib/referral/submit-referral' + +interface ShareForCreditsProps { + compact?: boolean +} + +export const ShareForCredits: FC = ({ compact }) => { + const [tweetUrl, setTweetUrl] = useState('') + const [isSubmitting, setIsSubmitting] = useState(false) + const [result, setResult] = useState<{ + success: boolean + message: string + } | null>(null) + + const { data } = useCredits() + const invalidateCredits = useInvalidateCredits() + + const handleSubmit = async () => { + if (!tweetUrl.trim() || !data?.browserosId) return + + setIsSubmitting(true) + setResult(null) + + try { + const res = await submitReferral(tweetUrl.trim(), data.browserosId) + if (res.success) { + setResult({ + success: true, + message: `${res.creditsAdded ?? 200} credits added!`, + }) + setTweetUrl('') + invalidateCredits() + } else { + setResult({ + success: false, + message: res.reason ?? 'Submission failed. Please try again.', + }) + } + } catch { + setResult({ + success: false, + message: 'Network error. Please try again.', + }) + } finally { + setIsSubmitting(false) + } + } + + return ( +
+

+ Share BrowserOS on Twitter to earn 200 bonus credits! +

+ + + +

+ Already shared? Paste your tweet link: +

+ +
+ setTweetUrl(e.target.value)} + className="h-8 text-xs" + disabled={isSubmitting} + /> + +
+ + {result && ( +

+ {result.message} +

+ )} +
+ ) +} diff --git a/packages/browseros-agent/apps/agent/entrypoints/app/usage/UsagePage.tsx b/packages/browseros-agent/apps/agent/entrypoints/app/usage/UsagePage.tsx index e3ab9e3b2..9dc4642f1 100644 --- a/packages/browseros-agent/apps/agent/entrypoints/app/usage/UsagePage.tsx +++ b/packages/browseros-agent/apps/agent/entrypoints/app/usage/UsagePage.tsx @@ -1,5 +1,6 @@ -import { AlertCircle, Clock, Coins, CreditCard, Zap } from 'lucide-react' +import { AlertCircle, Clock, Coins, Gift, Zap } from 'lucide-react' import type { FC } from 'react' +import { ShareForCredits } from '@/components/referral/ShareForCredits' import { Button } from '@/components/ui/button' import { getCreditBarColor, @@ -105,20 +106,11 @@ export const UsagePage: FC = () => {
-
- -
-

- Need more credits? - - Coming soon - -

-

- Additional credit packages will be available soon -

-
+
+ + Earn More Credits
+
diff --git a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx index 78dacdf88..511e692ff 100644 --- a/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx +++ b/packages/browseros-agent/apps/agent/entrypoints/sidepanel/index/ChatError.tsx @@ -1,6 +1,7 @@ import { AlertCircle, RefreshCw } from 'lucide-react' import type { FC } from 'react' import { useMemo } from 'react' +import { ShareForCredits } from '@/components/referral/ShareForCredits' import { Button } from '@/components/ui/button' const SURVEY_DIRECTIONS = [ @@ -122,15 +123,22 @@ export const ChatError: FC = ({ View troubleshooting guide )} - {isCreditsExhausted && url && ( - - View Usage & Billing - + {isCreditsExhausted && ( + <> +
+ +
+ {url && ( + + View Usage & Billing + + )} + )} {isRateLimit && !isCreditsExhausted && (

diff --git a/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts b/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts index e648881f5..674d3819c 100644 --- a/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts +++ b/packages/browseros-agent/apps/agent/lib/credits/useCredits.ts @@ -5,6 +5,7 @@ export interface CreditsInfo { credits: number dailyLimit: number lastResetAt?: string + browserosId?: string } const CREDITS_QUERY_KEY = ['credits'] diff --git a/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts b/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts new file mode 100644 index 000000000..b33905055 --- /dev/null +++ b/packages/browseros-agent/apps/agent/lib/referral/submit-referral.ts @@ -0,0 +1,33 @@ +import { EXTERNAL_URLS } from '@browseros/shared/constants/urls' + +interface ReferralResult { + success: boolean + creditsAdded?: number + reason?: string +} + +export async function submitReferral( + tweetUrl: string, + browserosId: string, +): Promise { + const response = await fetch( + `${EXTERNAL_URLS.REFERRAL_SERVICE}/referral/submit`, + { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ tweetUrl, browserosId }), + }, + ) + if (!response.ok) { + return { + success: false, + reason: `Request failed with status ${response.status}`, + } + } + return response.json() +} + +export function getShareOnTwitterUrl(): string { + const text = 'I use @browseros_ai to browse the web with AI. Check it out!' + return `https://x.com/intent/tweet?text=${encodeURIComponent(text)}` +} diff --git a/packages/browseros-agent/apps/server/src/agent/session-store.ts b/packages/browseros-agent/apps/server/src/agent/session-store.ts index 12b0848d9..9000cbf81 100644 --- a/packages/browseros-agent/apps/server/src/agent/session-store.ts +++ b/packages/browseros-agent/apps/server/src/agent/session-store.ts @@ -11,6 +11,8 @@ export interface AgentSession { mcpServerKey?: string /** Workspace directory when the session was created, for change detection. */ workingDir?: string + /** LLM config used when the session was created, for provider/model changes. */ + llmConfigKey?: string } export class SessionStore { diff --git a/packages/browseros-agent/apps/server/src/api/routes/credits.ts b/packages/browseros-agent/apps/server/src/api/routes/credits.ts index ccd433b5c..631be2173 100644 --- a/packages/browseros-agent/apps/server/src/api/routes/credits.ts +++ b/packages/browseros-agent/apps/server/src/api/routes/credits.ts @@ -25,7 +25,7 @@ export function createCreditsRoutes(deps: CreditsDeps) { return new Hono().get('/', async (c) => { try { const credits = await fetchCredits(gatewayBaseUrl, browserosId) - return c.json(credits) + return c.json({ ...credits, browserosId }) } catch (error) { logger.error('Failed to fetch credits', { error: error instanceof Error ? error.message : String(error), diff --git a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts index e919c72a6..5afc5618a 100644 --- a/packages/browseros-agent/apps/server/src/api/services/chat-service.ts +++ b/packages/browseros-agent/apps/server/src/api/services/chat-service.ts @@ -65,6 +65,7 @@ export class ChatService { declinedApps: request.declinedApps, browserosId: this.deps.browserosId, } + const llmConfigKey = this.buildLlmConfigKey(agentConfig) let session = sessionStore.get(request.conversationId) let isNewSession = false @@ -144,6 +145,24 @@ export class ChatService { } } + // Detect provider/model/auth change mid-conversation -> rebuild session. + // The AI SDK agent captures the language model at construction time, so a + // reused session would keep calling the previous provider. + if (session && session.llmConfigKey !== llmConfigKey) { + logger.info('LLM config changed mid-conversation, rebuilding session', { + conversationId: request.conversationId, + provider: agentConfig.provider, + model: agentConfig.model, + }) + session = await this.rebuildSession( + session, + request, + agentConfig, + mcpServerKey, + llmConfigKey, + ) + } + if (!session) { isNewSession = true let hiddenPageId: number | undefined @@ -209,6 +228,7 @@ export class ChatService { browserContext, mcpServerKey, workingDir: request.userWorkingDir, + llmConfigKey, } sessionStore.set(request.conversationId, session) } @@ -341,6 +361,7 @@ export class ChatService { request: ChatRequest, agentConfig: ResolvedAgentConfig, mcpServerKey: string, + llmConfigKey = this.buildLlmConfigKey(agentConfig), ): Promise { const previousMessages = session.agent.messages await session.agent.dispose() @@ -365,6 +386,7 @@ export class ChatService { browserContext, mcpServerKey, workingDir: request.userWorkingDir, + llmConfigKey, } newSession.agent.messages = sanitizeMessagesForToolset( previousMessages, @@ -374,6 +396,26 @@ export class ChatService { return newSession } + private buildLlmConfigKey(config: ResolvedAgentConfig): string { + return JSON.stringify({ + provider: config.provider, + model: config.model, + apiKey: config.apiKey, + baseUrl: config.baseUrl, + upstreamProvider: config.upstreamProvider, + resourceName: config.resourceName, + region: config.region, + accessKeyId: config.accessKeyId, + secretAccessKey: config.secretAccessKey, + sessionToken: config.sessionToken, + accountId: config.accountId, + reasoningEffort: config.reasoningEffort, + reasoningSummary: config.reasoningSummary, + contextWindowSize: config.contextWindowSize, + supportsImages: config.supportsImages, + }) + } + private buildMcpServerKey(browserContext?: BrowserContext): string { const managed = browserContext?.enabledMcpServers?.slice().sort() ?? [] const custom = diff --git a/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts b/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts index b4bd0e6c8..83b1ea319 100644 --- a/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts +++ b/packages/browseros-agent/apps/server/tests/api/services/chat-service.test.ts @@ -44,11 +44,19 @@ const createAgentUIStreamResponseSpy = mock( }, ) -const resolveLLMConfigSpy = mock(async () => ({ - provider: 'openai', - model: 'gpt-5', - apiKey: 'test-key', -})) +const resolveLLMConfigSpy = mock( + async (config: { + provider?: string + model?: string + apiKey?: string + baseUrl?: string + }) => ({ + provider: config.provider ?? 'openai', + model: config.model ?? 'gpt-5', + apiKey: config.apiKey ?? 'test-key', + baseUrl: config.baseUrl, + }), +) mock.module('ai', () => ({ createAgentUIStreamResponse: createAgentUIStreamResponseSpy, @@ -288,4 +296,65 @@ describe('ChatService scheduled task hidden page lifecycle', () => { }) expect(browser.closePage).toHaveBeenCalledWith(88) }) + + it('rebuilds an existing session when the LLM provider changes', async () => { + const firstAgent = createFakeAgent() + agentToReturn = firstAgent + streamResponseHandler = async ({ onFinish }) => { + await onFinish({ messages: agentToReturn?.messages ?? [] }) + return new Response('ok') + } + + const browser = { + resolveTabIds: mock(async () => new Map()), + } + const sessionStore = createSessionStore() + const service = new ChatService({ + sessionStore: sessionStore as never, + klavisClient: {} as never, + browser: browser as never, + registry: {} as never, + }) + const conversationId = crypto.randomUUID() + const createCallsBefore = createAgentSpy.mock.calls.length + + await service.processMessage( + { + conversationId, + message: 'First message', + provider: 'browseros', + model: 'browseros-auto', + mode: 'agent', + origin: 'sidepanel', + } as never, + new AbortController().signal, + ) + + const secondAgent = createFakeAgent() + agentToReturn = secondAgent + + await service.processMessage( + { + conversationId, + message: 'Second message', + provider: 'chatgpt-pro', + model: 'gpt-5.3-codex', + mode: 'agent', + origin: 'sidepanel', + } as never, + new AbortController().signal, + ) + + expect(createAgentSpy.mock.calls.length).toBe(createCallsBefore + 2) + expect(firstAgent.dispose).toHaveBeenCalledTimes(1) + expect(sessionStore.get(conversationId)?.agent).toBe(secondAgent) + + const latestCreateArgs = createAgentSpy.mock.calls.at(-1)?.[0] as { + resolvedConfig: { provider: string; model: string } + } + expect(latestCreateArgs.resolvedConfig).toMatchObject({ + provider: 'chatgpt-pro', + model: 'gpt-5.3-codex', + }) + }) }) diff --git a/packages/browseros-agent/packages/shared/src/constants/urls.ts b/packages/browseros-agent/packages/shared/src/constants/urls.ts index e762e1949..56810bffe 100644 --- a/packages/browseros-agent/packages/shared/src/constants/urls.ts +++ b/packages/browseros-agent/packages/shared/src/constants/urls.ts @@ -19,4 +19,5 @@ export const EXTERNAL_URLS = { QWEN_DEVICE_CODE: 'https://chat.qwen.ai/api/v1/oauth2/device/code', QWEN_OAUTH_TOKEN: 'https://chat.qwen.ai/api/v1/oauth2/token', QWEN_CODE_API: 'https://portal.qwen.ai/v1', + REFERRAL_SERVICE: 'https://browseros-referral.fly.dev', } as const From fc00ed23bfe2f8289a43c35a5d3ea41b0f33968d Mon Sep 17 00:00:00 2001 From: Felarof Date: Thu, 16 Apr 2026 15:34:33 -0700 Subject: [PATCH 02/11] feat(referral): show tweet share rules and lower default daily limit fallback (#731) * feat(referral): show share rules and lower default daily limit fallback Surface the three referral validation rules (must mention @browserOS_ai, posted within last 30 minutes, single-use) directly in the ShareForCredits UI so users understand submission requirements before pasting a tweet link. Also align the UsagePage daily-limit fallback (used while credits load) with the gateway default of 50. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(usage): handle credit balance exceeding daily limit The "Credits used today" stat was computed as `dailyLimit - credits`, which goes negative once a referral bonus pushes the balance above the daily cap (e.g. balance 294 with cap 100 showed "-194 of 100"). Clamp the math to zero and surface a separate "Bonus credits" stat when the balance exceeds the daily allowance. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .../components/referral/ShareForCredits.tsx | 8 +++++++ .../agent/entrypoints/app/usage/UsagePage.tsx | 23 +++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx index fcd1ac4ff..c5841c381 100644 --- a/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx +++ b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx @@ -61,6 +61,14 @@ export const ShareForCredits: FC = ({ compact }) => { Share BrowserOS on Twitter to earn 200 bonus credits!

+
    +
  • + Tweet must mention @browserOS_ai +
  • +
  • Tweet must be posted within the last 30 minutes
  • +
  • Each tweet can only be submitted once
  • +
+

{text}

+ {isUpstreamRateLimit && ( +

+ This is a limit from{' '} + {providerName} + {' — your configured model provider — not BrowserOS. Check your '} + provider's dashboard for quota, usage, or billing details. +

+ )} {isConnectionError && url && (
Date: Thu, 16 Apr 2026 17:09:28 -0700 Subject: [PATCH 04/11] fix: randomized tweet variations + referral fixes (#737) * fix(agent): declare @browseros/shared as workspace dependency The agent app imports @browseros/shared/constants/urls in lib/referral/submit-referral.ts but never declared the package in its dependencies, so vite failed to resolve the import during dev. Co-Authored-By: Claude Opus 4.7 (1M context) * feat(referral): cap daily referral earnings at 500 credits Block tweet submissions client-side once the user's balance reaches 500 to prevent unlimited credit farming via repeated shares. Co-Authored-By: Claude Opus 4.7 (1M context) * feat(referral): randomize tweet variations for Twitter share Replace the single hardcoded share text with 10 feature-specific variations (agent mode, chat, scheduled tasks, connect apps, cowork, workflows, memory, skills, local models, ad blocking) and pick one at random each time the share button is clicked. Co-Authored-By: Claude Opus 4.7 (1M context) * fix(referral): regenerate share URL on click Previously getShareOnTwitterUrl() was evaluated once at render time as a static href, so every click produced the same tweet variation. Move the call into onClick so a new random variation is picked each time. Addresses Greptile P1 review on PR #737. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .../components/referral/ShareForCredits.tsx | 27 ++++++- .../agent/lib/referral/submit-referral.ts | 77 ++++++++++++++++++- .../browseros-agent/apps/agent/package.json | 1 + packages/browseros-agent/bun.lock | 3 +- .../packages/shared/src/constants/limits.ts | 5 ++ 5 files changed, 109 insertions(+), 4 deletions(-) diff --git a/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx index c5841c381..946d76659 100644 --- a/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx +++ b/packages/browseros-agent/apps/agent/components/referral/ShareForCredits.tsx @@ -1,3 +1,4 @@ +import { REFERRAL_LIMITS } from '@browseros/shared/constants/limits' import { ExternalLink, Loader2, Send } from 'lucide-react' import type { FC } from 'react' import { useState } from 'react' @@ -24,8 +25,11 @@ export const ShareForCredits: FC = ({ compact }) => { const { data } = useCredits() const invalidateCredits = useInvalidateCredits() + const credits = data?.credits ?? 0 + const atDailyMax = credits >= REFERRAL_LIMITS.MAX_DAILY_CREDITS + const handleSubmit = async () => { - if (!tweetUrl.trim() || !data?.browserosId) return + if (!tweetUrl.trim() || !data?.browserosId || atDailyMax) return setIsSubmitting(true) setResult(null) @@ -55,10 +59,22 @@ export const ShareForCredits: FC = ({ compact }) => { } } + if (atDailyMax) { + return ( +
+

+ You've reached the daily cap of {REFERRAL_LIMITS.MAX_DAILY_CREDITS}{' '} + credits. Come back tomorrow to earn more! +

+
+ ) + } + return (

- Share BrowserOS on Twitter to earn 200 bonus credits! + Share BrowserOS on Twitter to earn{' '} + {REFERRAL_LIMITS.CREDITS_PER_REFERRAL} bonus credits!

    @@ -67,6 +83,10 @@ export const ShareForCredits: FC = ({ compact }) => {
  • Tweet must be posted within the last 30 minutes
  • Each tweet can only be submitted once
  • +
  • + Daily cap of {REFERRAL_LIMITS.MAX_DAILY_CREDITS} credits — resets at + midnight UTC +
+ +
@@ -558,6 +568,7 @@

Grader

renderTaskList(); updateProgress(); + updateLoadedConfigUI(); connectSSE(); } @@ -716,16 +727,70 @@

Grader

} async function loadSavedConfig(name) { - if (!name) return; + if (!name) { + loadedConfigName = null; + updateLoadedConfigUI(); + return; + } try { const res = await fetch(`/api/config/${encodeURIComponent(name)}`); if (!res.ok) return; const cfg = await res.json(); loadedConfigName = name; fillForm(cfg); + updateLoadedConfigUI(); } catch {} } +function updateLoadedConfigUI() { + const btn = document.getElementById('btn-save-ref'); + const lbl = document.getElementById('loaded-config-label'); + if (loadedConfigName) { + btn.style.display = ''; + lbl.textContent = `Loaded: ${loadedConfigName}`; + } else { + btn.style.display = 'none'; + lbl.textContent = ''; + } +} + +async function saveToReferenceConfig() { + if (!loadedConfigName) return; + const errEl = document.getElementById('config-error'); + errEl.style.color = ''; + errEl.textContent = ''; + // build current form state and PUT it back to the loaded config file + const config = buildConfigFromForm(); + const btn = document.getElementById('btn-save-ref'); + btn.disabled = true; + btn.textContent = 'Saving...'; + try { + const res = await fetch(`/api/config/${encodeURIComponent(loadedConfigName)}`, { + method: 'PUT', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(config), + }); + const result = await res.json(); + if (!res.ok) { + const details = result.details ? `\n${result.details.join('\n')}` : ''; + errEl.textContent = (result.error || 'Failed to save config') + details; + return; + } + // success — show inline confirmation that auto-clears + errEl.style.color = '#3fb950'; + errEl.textContent = `Saved to ${loadedConfigName}`; + setTimeout(() => { + errEl.style.color = ''; + errEl.textContent = ''; + }, 2500); + } catch (e) { + errEl.textContent = `Network error: ${e.message}`; + } finally { + btn.disabled = false; + btn.textContent = 'Save to Reference'; + } +} + // Fill form fields from a config object function fillForm(cfg) { const agent = cfg.agent || {}; @@ -908,8 +973,10 @@

Grader

return config; } -async function submitConfig() { +async function submitConfig(opts = {}) { + const isTestRun = opts.testRun === true; const errEl = document.getElementById('config-error'); + errEl.style.color = ''; errEl.textContent = ''; const config = buildConfigFromForm(); @@ -920,29 +987,38 @@

Grader

return; } - const btn = document.getElementById('btn-run'); - btn.disabled = true; - btn.textContent = 'Starting...'; + // disable both buttons during submit; show appropriate loading text + const runBtn = document.getElementById('btn-run'); + const testBtn = document.getElementById('btn-test-run'); + runBtn.disabled = true; + testBtn.disabled = true; + if (isTestRun) testBtn.textContent = 'Starting...'; + else runBtn.textContent = 'Starting...'; try { const res = await fetch('/api/run', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ config, configName: loadedConfigName }), + body: JSON.stringify({ config, configName: loadedConfigName, testRun: isTestRun }), }); const result = await res.json(); if (!res.ok) { const details = result.details ? `\n${result.details.join('\n')}` : ''; errEl.textContent = (result.error || 'Failed to start eval') + details; - btn.disabled = false; - btn.textContent = 'Run Eval'; + runBtn.disabled = false; + runBtn.textContent = 'Run Eval'; + testBtn.disabled = false; + testBtn.textContent = 'Test Run'; return; } // Eval started — collapse panel, refresh state setConfigPanelOpen(false); - updateConfigSummary(loadedConfigName || 'dashboard', config.agent.type); + const summaryLabel = isTestRun + ? `${loadedConfigName || 'dashboard'} (test run)` + : (loadedConfigName || 'dashboard'); + updateConfigSummary(summaryLabel, config.agent.type); // Re-fetch state to get the loaded tasks const stateRes = await fetch('/api/state'); @@ -957,8 +1033,10 @@

Grader

setEvalRunningUI(true); } catch (e) { errEl.textContent = `Network error: ${e.message}`; - btn.disabled = false; - btn.textContent = 'Run Eval'; + runBtn.disabled = false; + runBtn.textContent = 'Run Eval'; + testBtn.disabled = false; + testBtn.textContent = 'Test Run'; } } @@ -981,16 +1059,20 @@

Grader

function setEvalRunningUI(running) { const stopBtn = document.getElementById('btn-stop'); const runBtn = document.getElementById('btn-run'); + const testBtn = document.getElementById('btn-test-run'); if (running) { stopBtn.style.display = ''; stopBtn.disabled = false; stopBtn.textContent = 'Stop'; runBtn.disabled = true; runBtn.textContent = 'Running...'; + testBtn.disabled = true; } else { stopBtn.style.display = 'none'; runBtn.disabled = false; runBtn.textContent = 'Run Eval'; + testBtn.disabled = false; + testBtn.textContent = 'Test Run'; } } @@ -1524,6 +1606,11 @@

Grader

// Parse and populate form fields const cfg = JSON.parse(e.target.result); fillForm(cfg); + // file-picker loads a config from arbitrary path — not a "reference" config, + // so disable Save-to-Reference until user picks a saved config from the dropdown + loadedConfigName = null; + document.getElementById('cfg-load-select').value = ''; + updateLoadedConfigUI(); document.getElementById('config-error').textContent = ''; } catch (err) { document.getElementById('config-error').textContent = `Invalid JSON file: ${err.message}`; diff --git a/packages/browseros-agent/apps/eval/src/dashboard/server.ts b/packages/browseros-agent/apps/eval/src/dashboard/server.ts index 6f1ff4c14..9c98ec695 100644 --- a/packages/browseros-agent/apps/eval/src/dashboard/server.ts +++ b/packages/browseros-agent/apps/eval/src/dashboard/server.ts @@ -361,11 +361,53 @@ app.get('/api/config/:name', async (c) => { } }) +// Overwrite an existing config file (used by "Save to Reference" in the UI) +app.put('/api/config/:name', async (c) => { + if (evalRunning) { + return c.json({ error: 'Cannot save config while eval is running' }, 409) + } + // validate filename + const name = c.req.param('name') + if ( + !name || + name.includes('/') || + name.includes('..') || + !name.endsWith('.json') + ) { + return c.json({ error: 'Invalid config name' }, 400) + } + const filepath = join(configsDir, name) + if (!resolve(filepath).startsWith(resolve(configsDir))) { + return c.json({ error: 'Invalid config name' }, 400) + } + // overwrite-only — file must already exist + const existing = await stat(filepath).catch(() => null) + if (!existing?.isFile()) { + return c.json({ error: 'Config not found' }, 404) + } + // parse + validate body against the same schema as /api/run + let body: unknown + try { + body = await c.req.json() + } catch { + return c.json({ error: 'Invalid JSON body' }, 400) + } + const parseResult = EvalConfigSchema.safeParse(body) + if (!parseResult.success) { + const errors = parseResult.error.errors.map( + (e) => `${e.path.join('.')}: ${e.message}`, + ) + return c.json({ error: 'Config validation failed', details: errors }, 400) + } + await Bun.write(filepath, `${JSON.stringify(parseResult.data, null, 2)}\n`) + return c.json({ status: 'saved', name }) +}) + // Start an eval run from the dashboard app.post('/api/run', async (c) => { if (evalRunning) return c.json({ error: 'Eval already running' }, 409) - let body: { config: unknown; configName?: string } + let body: { config: unknown; configName?: string; testRun?: boolean } try { body = await c.req.json() } catch { @@ -382,6 +424,7 @@ app.post('/api/run', async (c) => { } const config = parseResult.data + const isTestRun = body.testRun === true // Resolve relative paths from configs/ dir (dataset dropdown values are relative to it) const baseDir = configsDir @@ -425,17 +468,26 @@ app.post('/api/run', async (c) => { return c.json({ error: `Failed to load tasks: ${msg}` }, 400) } + // Test run mode: 1 worker + first task only, to verify setup before a full run + if (isTestRun) { + if (tasks.length === 0) { + return c.json({ error: 'Dataset is empty — nothing to test' }, 400) + } + tasks = tasks.slice(0, 1) + } + await mkdir(outputDir, { recursive: true }) // Re-init dashboard state with loaded tasks const configLabel = body.configName || 'dashboard' - dashboardState.init(tasks, configLabel, config.agent.type, outputDir) + const stateLabel = isTestRun ? `${configLabel} (test run)` : configLabel + dashboardState.init(tasks, stateLabel, config.agent.type, outputDir) const graderOptions = resolveGraderOptions(config) // Run eval in background — don't await const executor = new ParallelExecutor({ - numWorkers: config.num_workers || 1, + numWorkers: isTestRun ? 1 : config.num_workers || 1, config, outputDir, graderOptions, @@ -467,7 +519,12 @@ app.post('/api/run', async (c) => { console.log('\nEval run complete.') }) - return c.json({ status: 'started', taskCount: tasks.length, outputDir }) + return c.json({ + status: 'started', + taskCount: tasks.length, + outputDir, + testRun: isTestRun, + }) }) // Stop a running eval From ed717f0b25aa8a8aefbe3bc63691512a68162080 Mon Sep 17 00:00:00 2001 From: Felarof Date: Tue, 28 Apr 2026 10:33:39 -0700 Subject: [PATCH 11/11] chore(eval): remove Gemini and Yutori agents Remove the Gemini Computer Use and Yutori Navigator eval integrations, including their agent folders, dashboard config branches, sample configs, docs, and eval workspace dependencies. Co-Authored-By: Claude Opus 4.6 --- .../browseros-agent/apps/eval/DESIGN_DOC.md | 1 - packages/browseros-agent/apps/eval/README.md | 2 - .../configs/test_gemini-computer-use.json | 30 - .../eval/configs/test_yutori-navigator.json | 30 - .../browseros-agent/apps/eval/package.json | 2 - .../scripts/validate-computer-use-tools.ts | 200 ------ .../apps/eval/scripts/weekly-report.ts | 5 +- .../gemini-computer-use/action-mapper.ts | 643 ----------------- .../src/agents/gemini-computer-use/agent.ts | 327 --------- .../src/agents/gemini-computer-use/index.ts | 97 --- .../src/agents/gemini-computer-use/types.ts | 156 ---- .../apps/eval/src/agents/index.ts | 7 - .../agents/yutori-navigator/action-mapper.ts | 677 ------------------ .../eval/src/agents/yutori-navigator/agent.ts | 353 --------- .../eval/src/agents/yutori-navigator/index.ts | 97 --- .../eval/src/agents/yutori-navigator/types.ts | 158 ---- .../apps/eval/src/dashboard/index.html | 80 --- .../apps/eval/src/types/config.ts | 32 - .../apps/eval/src/types/index.ts | 4 - .../apps/eval/src/types/result.ts | 7 +- .../apps/eval/src/utils/config-validator.ts | 7 - packages/browseros-agent/bun.lock | 2 - 22 files changed, 2 insertions(+), 2915 deletions(-) delete mode 100644 packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json delete mode 100644 packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json delete mode 100644 packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts delete mode 100644 packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts diff --git a/packages/browseros-agent/apps/eval/DESIGN_DOC.md b/packages/browseros-agent/apps/eval/DESIGN_DOC.md index 3a4771938..321aa89fb 100644 --- a/packages/browseros-agent/apps/eval/DESIGN_DOC.md +++ b/packages/browseros-agent/apps/eval/DESIGN_DOC.md @@ -520,7 +520,6 @@ import { randomUUID } from 'node:crypto' import { GeminiAgent } from '@browseros/server/agent' import { AgentExecutionError } from '@browseros/server/agent/errors' import type { ResolvedAgentConfig } from '@browseros/server/agent/types' -import { MCPServerConfig } from '@google/gemini-cli-core' import type { AgentContext, AgentEvaluator, AgentResult } from '../types' import type { SingleAgentConfig, TaskMetadata } from '../../types' import { resolveEnvValue } from '../../utils/env' diff --git a/packages/browseros-agent/apps/eval/README.md b/packages/browseros-agent/apps/eval/README.md index ee20bbe6c..cb10fdb44 100644 --- a/packages/browseros-agent/apps/eval/README.md +++ b/packages/browseros-agent/apps/eval/README.md @@ -141,8 +141,6 @@ The executor config stays the same across all orchestrator providers — it alwa |------|-------------|----------------| | `single` | Single LLM agent via Gemini CLI + MCP | `webvoyager-test.json` | | `tool-loop` | AI SDK tool loop, connects via CDP | `tool-loop-test.json` | -| `gemini-computer-use` | Google native computer use API | `gemini-computer-use.json` | -| `yutori-navigator` | Yutori N1 visual model | `yutori-navigator.json` | ## Configuration Reference diff --git a/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json b/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json deleted file mode 100644 index 6c5174333..000000000 --- a/packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "agent": { - "type": "gemini-computer-use", - "apiKey": "GOOGLE_AI_API_KEY", - "screenSize": { - "width": 1440, - "height": 900 - }, - "turnLimit": 100 - }, - "dataset": "../data/test-set.jsonl", - "num_workers": 1, - "restart_server_per_task": true, - "browseros": { - "server_url": "http://127.0.0.1:9110", - "base_cdp_port": 9010, - "base_server_port": 9110, - "base_extension_port": 9310, - "load_extensions": false, - "headless": false - }, - "captcha": { - "api_key_env": "NOPECHA_API_KEY" - }, - "graders": ["performance_grader"], - "grader_api_key_env": "OPENROUTER_API_KEY", - "grader_base_url": "https://openrouter.ai/api/v1", - "grader_model": "openai/gpt-4.1", - "timeout_ms": 1200000 -} diff --git a/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json b/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json deleted file mode 100644 index fd35fba36..000000000 --- a/packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "agent": { - "type": "yutori-navigator", - "apiKey": "YUTORI_API_KEY", - "screenSize": { - "width": 1280, - "height": 800 - }, - "turnLimit": 100 - }, - "dataset": "../data/test-set.jsonl", - "num_workers": 1, - "restart_server_per_task": true, - "browseros": { - "server_url": "http://127.0.0.1:9110", - "base_cdp_port": 9010, - "base_server_port": 9110, - "base_extension_port": 9310, - "load_extensions": false, - "headless": false - }, - "captcha": { - "api_key_env": "NOPECHA_API_KEY" - }, - "graders": ["performance_grader"], - "grader_api_key_env": "OPENROUTER_API_KEY", - "grader_base_url": "https://openrouter.ai/api/v1", - "grader_model": "openai/gpt-4.1", - "timeout_ms": 1200000 -} diff --git a/packages/browseros-agent/apps/eval/package.json b/packages/browseros-agent/apps/eval/package.json index b7abbfe34..8d6608f48 100644 --- a/packages/browseros-agent/apps/eval/package.json +++ b/packages/browseros-agent/apps/eval/package.json @@ -12,8 +12,6 @@ "@aws-sdk/client-s3": "^3.1014.0", "@browseros/server": "workspace:*", "@browseros/shared": "workspace:*", - "@google/gemini-cli-core": "^0.16.0", - "@google/genai": "1.30.0", "@modelcontextprotocol/sdk": "^1.25.2", "ai": "^6.0.94", "hono": "^4.6.0", diff --git a/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts b/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts deleted file mode 100644 index 43edb4ab3..000000000 --- a/packages/browseros-agent/apps/eval/scripts/validate-computer-use-tools.ts +++ /dev/null @@ -1,200 +0,0 @@ -/** - * Validation script for Gemini Computer Use integration - * Run: bun apps/eval/scripts/validate-computer-use-tools.ts - */ - -import { Client } from '@modelcontextprotocol/sdk/client/index.js' -import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js' - -const MCP_URL = process.env.MCP_URL || 'http://127.0.0.1:9105/mcp' - -interface McpToolResult { - content: Array<{ - type: string - text?: string - data?: string - mimeType?: string - }> - isError?: boolean -} - -async function callMcpTool( - serverUrl: string, - name: string, - args: Record = {}, -): Promise { - const client = new Client({ name: 'validate-computer-use', version: '1.0.0' }) - const transport = new StreamableHTTPClientTransport(new URL(serverUrl), { - requestInit: { headers: { 'X-BrowserOS-Source': 'validation' } }, - }) - - try { - await client.connect(transport) - return (await client.callTool({ name, arguments: args })) as McpToolResult - } finally { - try { - await transport.close() - } catch {} - } -} - -async function validateTools() { - console.log('🔍 Validating MCP tools for Gemini Computer Use integration\n') - console.log(`MCP URL: ${MCP_URL}\n`) - - // Get active tab first - console.log('1. Getting active tab...') - const tabResult = await callMcpTool(MCP_URL, 'browser_get_active_tab', {}) - if (tabResult.isError) { - console.error('❌ Failed to get active tab:', tabResult.content) - process.exit(1) - } - const tabText = tabResult.content.find((c) => c.type === 'text')?.text ?? '' - const tabIdMatch = tabText.match(/ID: (\d+)/) - const tabId = tabIdMatch ? parseInt(tabIdMatch[1], 10) : 1 - console.log(` ✅ Active tab ID: ${tabId}\n`) - - // Validate each tool needed for Computer Use - const toolTests = [ - { - name: 'browser_get_screenshot', - args: { tabId, size: 'medium' }, - description: 'Screenshot capture', - validate: (r: McpToolResult) => r.content.some((c) => c.type === 'image'), - }, - { - name: 'browser_click_coordinates', - args: { tabId, x: 100, y: 100 }, - description: 'Click at coordinates', - validate: (r: McpToolResult) => !r.isError, - }, - { - name: 'browser_type_at_coordinates', - args: { tabId, x: 100, y: 100, text: 'test' }, - description: 'Type at coordinates', - validate: (r: McpToolResult) => !r.isError, - }, - { - name: 'browser_scroll_down', - args: { tabId }, - description: 'Scroll down', - validate: (r: McpToolResult) => !r.isError, - }, - { - name: 'browser_scroll_up', - args: { tabId }, - description: 'Scroll up', - validate: (r: McpToolResult) => !r.isError, - }, - { - name: 'browser_send_keys', - args: { tabId, key: 'Enter' }, - description: 'Send keyboard key', - validate: (r: McpToolResult) => !r.isError, - }, - { - name: 'browser_execute_javascript', - args: { tabId, code: 'window.location.href' }, - description: 'Execute JavaScript (for go_back/forward workaround)', - validate: (r: McpToolResult) => !r.isError, - }, - ] - - let passed = 0 - let failed = 0 - - for (const test of toolTests) { - process.stdout.write(`2. Testing ${test.name} (${test.description})... `) - try { - const result = await callMcpTool(MCP_URL, test.name, test.args) - if (test.validate(result)) { - console.log('✅') - passed++ - } else { - console.log('❌ Validation failed') - console.log(' Result:', JSON.stringify(result, null, 2)) - failed++ - } - } catch (err) { - console.log('❌ Error:', err instanceof Error ? err.message : err) - failed++ - } - } - - console.log(`\n${'='.repeat(50)}`) - console.log(`Results: ${passed} passed, ${failed} failed`) - console.log('='.repeat(50)) - - if (failed === 0) { - console.log( - '\n✅ All tools validated! Gemini Computer Use integration should work.', - ) - console.log('\nGaps to address with workarounds:') - console.log(' - key_combination: Use browser_execute_javascript') - console.log( - ' - go_back/go_forward: Use browser_execute_javascript with history.back()/forward()', - ) - console.log( - ' - type_text_at press_enter: Chain browser_send_keys after typing', - ) - } else { - console.log('\n⚠️ Some tools failed. Check your server is running.') - } -} - -// Validate Gemini API access -async function validateGeminiApi() { - const apiKey = process.env.GOOGLE_AI_API_KEY || process.env.GEMINI_API_KEY - if (!apiKey) { - console.log('\n⚠️ GOOGLE_AI_API_KEY not set - skipping API validation') - return - } - - console.log('\n3. Validating Gemini Computer Use API access...') - - const MODEL = 'gemini-2.5-computer-use-preview-10-2025' - const url = `https://generativelanguage.googleapis.com/v1beta/models/${MODEL}:generateContent` - - // Minimal test - just check if model is accessible - const testPayload = { - contents: [{ role: 'user', parts: [{ text: 'test' }] }], - } - - try { - const response = await fetch(url, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-goog-api-key': apiKey, - }, - body: JSON.stringify(testPayload), - }) - - if (response.ok) { - console.log(' ✅ Gemini Computer Use API is accessible') - } else { - const error = await response.json() - console.log( - ' ❌ API error:', - error.error?.message || response.statusText, - ) - } - } catch (err) { - console.log( - ' ❌ Network error:', - err instanceof Error ? err.message : err, - ) - } -} - -async function main() { - try { - await validateTools() - await validateGeminiApi() - } catch (err) { - console.error('Validation failed:', err) - process.exit(1) - } -} - -main() diff --git a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts index 029539fe8..af99b35cb 100644 --- a/packages/browseros-agent/apps/eval/scripts/weekly-report.ts +++ b/packages/browseros-agent/apps/eval/scripts/weekly-report.ts @@ -334,9 +334,7 @@ const html = ` ? 'Orch-Exec' : r.agentType === 'single' ? 'Tool Loop' - : r.agentType === 'gemini-computer-use' - ? 'Gemini CU' - : r.agentType || '—' + : r.agentType || '—' return ` ${escHtml(r.date)} ${escHtml(r.runId)} @@ -385,7 +383,6 @@ const html = ` var latest = runs[runs.length - 1]; var archLabel = latest.agentType === 'orchestrator-executor' ? 'Orchestrator-Executor' : latest.agentType === 'single' ? 'Single Agent (Tool Loop)' - : latest.agentType === 'gemini-computer-use' ? 'Gemini Computer Use' : latest.agentType || 'Unknown'; var scoreColor = latest.avgScore >= 75 ? '#3fb950' : latest.avgScore >= 40 ? '#f0883e' : '#f85149'; el.innerHTML = diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts deleted file mode 100644 index f2200861d..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/action-mapper.ts +++ /dev/null @@ -1,643 +0,0 @@ -/** - * Maps Gemini Computer Use actions to MCP tool calls - * - * Coordinate System: - * - Screenshots captured with size='large' (1028px width, aspect ratio preserved) - * - Gemini outputs normalized coordinates (0-999) relative to the screenshot - * - We convert these to actual viewport coordinates by: - * 1. Getting the real viewport dimensions via JavaScript - * 2. Scaling normalized coords to actual viewport pixels - */ - -import { Client } from '@modelcontextprotocol/sdk/client/index.js' -import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js' -import type { ActionContext, ComputerUseAction, ScreenSize } from './types' -import { DEFAULTS } from './types' - -interface McpToolResult { - content: Array<{ - type: string - text?: string - data?: string - mimeType?: string - }> - isError?: boolean -} - -const MCP_TIMEOUT_MS = 30000 - -export class ActionMapper { - private ctx: ActionContext - private cachedViewport: ScreenSize | null = null - - constructor(ctx: ActionContext) { - this.ctx = ctx - } - - // Store debug info about viewport detection for inclusion in responses - private viewportDebugInfo: string = '' - - /** - * Get the actual browser viewport size via JavaScript - * Caches the result to avoid repeated calls - * Also stores debug info for troubleshooting - */ - async getViewportSize(): Promise { - if (this.cachedViewport) { - return this.cachedViewport - } - - try { - const result = await this.callMcp('browser_execute_javascript', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - code: '[window.innerWidth, window.innerHeight]', - }) - - const textContent = - result.content.find((c) => c.type === 'text')?.text ?? '' - - // Check for error in result - if (result.isError) { - this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - return this.ctx.screenSize - } - - // Response format can be multiline: - // "Result: [1440, 900]" or "Result: [\n 1200,\n 712\n]" - const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s) - if (arrayMatch) { - const width = parseInt(arrayMatch[1], 10) - const height = parseInt(arrayMatch[2], 10) - if (width > 0 && height > 0) { - this.cachedViewport = { width, height } - this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw response: "${textContent.substring(0, 100)}")` - console.log(this.viewportDebugInfo) - return this.cachedViewport - } else { - this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - } else { - this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse response: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - } catch (error) { - const errMsg = error instanceof Error ? error.message : String(error) - this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - - // Fallback to configured screenSize - return this.ctx.screenSize - } - - /** - * Get the current viewport debug info - */ - getViewportDebugInfo(): string { - return this.viewportDebugInfo - } - - /** - * Clear cached viewport (call when tab/window changes or before new task) - */ - clearViewportCache(): void { - this.cachedViewport = null - } - - /** - * Scale normalized coordinate (0-999) to actual viewport pixel value - */ - private async scaleCoordinates( - normalizedX: number, - normalizedY: number, - ): Promise<{ x: number; y: number }> { - const viewport = await this.getViewportSize() - return { - x: Math.round((normalizedX / 1000) * viewport.width), - y: Math.round((normalizedY / 1000) * viewport.height), - } - } - - /** - * Call an MCP tool - */ - private async callMcp( - name: string, - args: Record = {}, - ): Promise { - const client = new Client({ - name: 'gemini-computer-use', - version: '1.0.0', - }) - - const transport = new StreamableHTTPClientTransport( - new URL(this.ctx.mcpUrl), - { - requestInit: { - headers: { 'X-BrowserOS-Source': 'gemini-computer-use' }, - }, - }, - ) - - try { - await client.connect(transport) - - const toolCallPromise = client.callTool({ name, arguments: args }) - let timeoutId: ReturnType | null = null - const timeoutPromise = new Promise((_, reject) => { - timeoutId = setTimeout( - () => - reject( - new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`), - ), - MCP_TIMEOUT_MS, - ) - }) - - try { - return (await Promise.race([ - toolCallPromise, - timeoutPromise, - ])) as McpToolResult - } finally { - if (timeoutId) clearTimeout(timeoutId) - } - } finally { - try { - await transport.close() - } catch { - // Ignore close errors - } - } - } - - /** - * Execute a Computer Use action by mapping to MCP tools - */ - async execute( - action: ComputerUseAction, - ): Promise<{ success: boolean; message: string }> { - const { tabId, windowId } = this.ctx - - try { - switch (action.name) { - case 'click_at': { - const viewport = await this.getViewportSize() - const { x, y } = await this.scaleCoordinates( - action.args.x, - action.args.y, - ) - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - // Return original coordinates + debug info for troubleshooting - // Debug info shows: model input → viewport coords, viewport size, and any errors - const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}` - return { - success: true, - message: `Clicked at (${action.args.x}, ${action.args.y}). ${debugInfo}`, - } - } - - case 'type_text_at': { - const viewport = await this.getViewportSize() - const { x, y } = await this.scaleCoordinates( - action.args.x, - action.args.y, - ) - const { text, press_enter, clear_before_typing } = action.args - - // Clear field first if requested (select all + delete) - if (clear_before_typing) { - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: `document.execCommand('selectAll')`, - }) - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Delete', - }) - } - - // Type the text - await this.callMcp('browser_type_at_coordinates', { - tabId, - windowId, - x, - y, - text, - }) - - // Press Enter if requested - if (press_enter) { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Enter', - }) - } - - // Return original coordinates + debug info - const debugInfo = `[DEBUG: input=(${action.args.x},${action.args.y}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}` - return { - success: true, - message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}" at (${action.args.x}, ${action.args.y}). ${debugInfo}`, - } - } - - case 'navigate': { - await this.callMcp('browser_navigate', { - tabId, - windowId, - url: action.args.url, - }) - return { success: true, message: `Navigated to ${action.args.url}` } - } - - case 'scroll_document': { - const { direction } = action.args - if (direction === 'up') { - await this.callMcp('browser_scroll_up', { tabId, windowId }) - } else if (direction === 'down') { - await this.callMcp('browser_scroll_down', { tabId, windowId }) - } else { - // Left/right scroll via JavaScript - const scrollCode = - direction === 'left' - ? 'window.scrollBy(-window.innerWidth, 0)' - : 'window.scrollBy(window.innerWidth, 0)' - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: scrollCode, - }) - } - return { success: true, message: `Scrolled ${direction}` } - } - - case 'scroll_at': { - const { x, y } = await this.scaleCoordinates( - action.args.x, - action.args.y, - ) - const { direction, magnitude = 500 } = action.args - - // Click at position first to focus element - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - - // Scale magnitude from 0-999 to actual pixels - const viewport = await this.getViewportSize() - const scrollAmount = Math.round((magnitude / 1000) * viewport.height) - - // Use JavaScript scrollBy for precise control with magnitude - const scrollCode = - direction === 'up' - ? `window.scrollBy(0, -${scrollAmount})` - : direction === 'down' - ? `window.scrollBy(0, ${scrollAmount})` - : direction === 'left' - ? `window.scrollBy(-${scrollAmount}, 0)` - : `window.scrollBy(${scrollAmount}, 0)` - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: scrollCode, - }) - - // Return original coordinates to avoid confusing the model - return { - success: true, - message: `Scrolled ${direction} at (${action.args.x}, ${action.args.y})`, - } - } - - case 'key_combination': { - const { keys } = action.args - - // Map common key combinations to JavaScript or available keys - const keyMap: Record Promise> = { - 'Control+a': async () => { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: `document.execCommand('selectAll')`, - }) - }, - 'Control+c': async () => { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: `document.execCommand('copy')`, - }) - }, - 'Control+v': async () => { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: `document.execCommand('paste')`, - }) - }, - 'Control+z': async () => { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: `document.execCommand('undo')`, - }) - }, - Enter: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Enter', - }) - }, - Escape: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Escape', - }) - }, - Tab: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Tab', - }) - }, - Backspace: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Backspace', - }) - }, - Delete: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Delete', - }) - }, - ArrowUp: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'ArrowUp', - }) - }, - ArrowDown: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'ArrowDown', - }) - }, - ArrowLeft: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'ArrowLeft', - }) - }, - ArrowRight: async () => { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'ArrowRight', - }) - }, - } - - // Normalize key string (case insensitive for modifiers) - const normalizedKeys = keys - .replace(/ctrl/i, 'Control') - .replace(/cmd/i, 'Control') - const handler = keyMap[normalizedKeys] || keyMap[keys] - - if (handler) { - await handler() - } else { - const keyName = keys.split('+').pop() || '' - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const event = new KeyboardEvent('keydown', { - key: ${JSON.stringify(keyName)}, - ctrlKey: ${keys.toLowerCase().includes('control')}, - shiftKey: ${keys.toLowerCase().includes('shift')}, - altKey: ${keys.toLowerCase().includes('alt')}, - metaKey: ${keys.toLowerCase().includes('meta')}, - bubbles: true - }); - document.activeElement?.dispatchEvent(event); - `, - }) - } - - return { success: true, message: `Pressed ${keys}` } - } - - case 'hover_at': { - const { x, y } = await this.scaleCoordinates( - action.args.x, - action.args.y, - ) - - // Simulate hover via JavaScript mouseover event - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const elem = document.elementFromPoint(${x}, ${y}); - if (elem) { - const event = new MouseEvent('mouseover', { bubbles: true, clientX: ${x}, clientY: ${y} }); - elem.dispatchEvent(event); - } - `, - }) - - // Return original coordinates to avoid confusing the model - return { - success: true, - message: `Hovered at (${action.args.x}, ${action.args.y})`, - } - } - - case 'go_back': { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: 'history.back()', - }) - return { success: true, message: 'Navigated back' } - } - - case 'go_forward': { - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: 'history.forward()', - }) - return { success: true, message: 'Navigated forward' } - } - - case 'wait_5_seconds': { - await new Promise((resolve) => setTimeout(resolve, 5000)) - return { success: true, message: 'Waited 5 seconds' } - } - - case 'drag_and_drop': { - const start = await this.scaleCoordinates( - action.args.x, - action.args.y, - ) - const end = await this.scaleCoordinates( - action.args.destination_x, - action.args.destination_y, - ) - - // Simulate drag and drop via JavaScript - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const startElem = document.elementFromPoint(${start.x}, ${start.y}); - const endElem = document.elementFromPoint(${end.x}, ${end.y}); - if (startElem && endElem) { - const dragStart = new DragEvent('dragstart', { bubbles: true, clientX: ${start.x}, clientY: ${start.y} }); - const drop = new DragEvent('drop', { bubbles: true, clientX: ${end.x}, clientY: ${end.y} }); - const dragEnd = new DragEvent('dragend', { bubbles: true }); - startElem.dispatchEvent(dragStart); - endElem.dispatchEvent(drop); - startElem.dispatchEvent(dragEnd); - } - `, - }) - - // Return original coordinates to avoid confusing the model - return { - success: true, - message: `Dragged from (${action.args.x}, ${action.args.y}) to (${action.args.destination_x}, ${action.args.destination_y})`, - } - } - - default: { - const _exhaustive: never = action - return { - success: false, - message: `Unknown action: ${JSON.stringify(action)}`, - } - } - } - } catch (error) { - const message = error instanceof Error ? error.message : String(error) - return { success: false, message: `Action failed: ${message}` } - } - } - - /** - * Capture a screenshot via MCP with retry logic - * - * Uses Gemini's recommended screenshot size (1440x900) for optimal model performance. - * Now that viewport detection is working correctly, the coordinate mapping will be accurate. - */ - async captureScreenshot(retries = 2): Promise { - const { width, height } = DEFAULTS.screenshotSize - - for (let attempt = 0; attempt <= retries; attempt++) { - try { - const result = await this.callMcp('browser_get_screenshot', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - width, - height, - showHighlights: false, - }) - - if (result.isError) { - const errorText = - result.content?.find((c) => c.type === 'text')?.text ?? - 'Unknown error' - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`, - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - console.warn('Screenshot capture failed:', errorText) - return null - } - - const imageContent = result.content.find((c) => c.type === 'image') - if (imageContent?.data) { - return imageContent.data - } - - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1}: No image data, retrying...`, - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - return null - } catch (error) { - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1} error:`, - error, - 'retrying...', - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - console.warn('Screenshot capture error:', error) - return null - } - } - return null - } - - /** - * Get current page URL via MCP - */ - async getCurrentUrl(): Promise { - try { - const result = await this.callMcp('browser_execute_javascript', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - code: 'window.location.href', - }) - - const textContent = - result.content.find((c) => c.type === 'text')?.text ?? '' - // Extract URL from result text - const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/) - return urlMatch?.[1] ?? 'unknown' - } catch { - return 'unknown' - } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts deleted file mode 100644 index 04dee1fb7..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/agent.ts +++ /dev/null @@ -1,327 +0,0 @@ -/** - * Gemini Computer Use Agent - * Implements the agent loop that calls Gemini API and executes actions - * Uses UIMessageStreamEvent format for logging compatibility - */ - -import { randomUUID } from 'node:crypto' -import { ActionMapper } from './action-mapper' -import { - type ComputerUseAction, - DEFAULTS, - type GeminiComputerUseAgentConfig, - type GeminiContent, - type GeminiPart, - type GeminiResponse, -} from './types' - -const GEMINI_API_BASE = 'https://generativelanguage.googleapis.com/v1beta' - -interface StreamWriter { - write: (data: string) => Promise -} - -type ActionHook = ( - action: ComputerUseAction, - result: { success: boolean; message: string }, -) => Promise - -/** - * Emit SSE-formatted UIMessageStreamEvent - */ -function emitEvent( - writer: StreamWriter, - event: Record, -): Promise { - return writer.write(`data: ${JSON.stringify(event)}\n\n`) -} - -export class GeminiComputerUseAgent { - private config: GeminiComputerUseAgentConfig - private actionMapper: ActionMapper - private actionHook?: ActionHook - private contents: GeminiContent[] = [] - - constructor(config: GeminiComputerUseAgentConfig) { - this.config = config - this.actionMapper = new ActionMapper({ - mcpUrl: config.mcpUrl, - tabId: config.tabId, - windowId: config.windowId, - screenSize: config.screenSize, - }) - } - - /** - * Set a hook to be called after each action execution - */ - setActionHook(hook: ActionHook): void { - this.actionHook = hook - } - - /** - * Call the Gemini Computer Use API - */ - private async callGeminiApi(): Promise { - const url = `${GEMINI_API_BASE}/models/${DEFAULTS.model}:generateContent` - - const response = await fetch(url, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'x-goog-api-key': this.config.apiKey, - }, - body: JSON.stringify({ - contents: this.contents, - tools: [ - { - computer_use: { - environment: 'ENVIRONMENT_BROWSER', - }, - }, - ], - }), - }) - - if (!response.ok) { - const errorBody = await response.text() - throw new Error( - `Gemini API error: ${response.status} ${response.statusText} - ${errorBody}`, - ) - } - - return response.json() - } - - /** - * Extract function calls from a Gemini response - */ - private extractFunctionCalls(response: GeminiResponse): ComputerUseAction[] { - const candidate = response.candidates?.[0] - if (!candidate?.content?.parts) { - return [] - } - - const actions: ComputerUseAction[] = [] - for (const part of candidate.content.parts) { - if (part.functionCall) { - const { name, args } = part.functionCall - // Construct action object - actions.push({ name, args: args ?? {} } as ComputerUseAction) - } - } - - return actions - } - - /** - * Extract text response from Gemini response - */ - private extractTextResponse(response: GeminiResponse): string | null { - const candidate = response.candidates?.[0] - if (!candidate?.content?.parts) { - return null - } - - const textParts = candidate.content.parts - .map((p) => p.text) - .filter((text): text is string => text !== undefined) - return textParts.length > 0 ? textParts.join('\n') : null - } - - /** - * Build function response parts for the next turn - */ - private buildFunctionResponses( - actions: ComputerUseAction[], - currentUrl: string, - screenshotBase64: string | null, - ): GeminiPart[] { - const parts: GeminiPart[] = [] - - for (const action of actions) { - parts.push({ - functionResponse: { - name: action.name, - response: { url: currentUrl }, - }, - }) - } - - // Add screenshot as inline data - if (screenshotBase64) { - parts.push({ - inlineData: { - mimeType: 'image/png', - data: screenshotBase64, - }, - }) - } - - return parts - } - - /** - * Execute the agent loop - */ - async execute( - query: string, - streamWriter: StreamWriter, - signal: AbortSignal, - ): Promise<{ finalText: string | null; totalActions: number }> { - let totalActions = 0 - let finalText: string | null = null - - // Wait for page to stabilize before first screenshot - await new Promise((resolve) => setTimeout(resolve, 2000)) - - // Capture initial screenshot with retries - let initialScreenshot: string | null = null - for (let attempt = 1; attempt <= 3; attempt++) { - initialScreenshot = await this.actionMapper.captureScreenshot() - if (initialScreenshot) break - console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`) - await new Promise((resolve) => setTimeout(resolve, 1000)) - } - - if (!initialScreenshot) { - throw new Error('Failed to capture initial screenshot after 3 attempts') - } - - // Build initial content - const initialParts: GeminiPart[] = [ - { text: query }, - { inlineData: { mimeType: 'image/png', data: initialScreenshot } }, - ] - this.contents.push({ role: 'user', parts: initialParts }) - - const messageId = randomUUID() - await emitEvent(streamWriter, { type: 'start', messageId }) - - let finished = false - for (let turn = 0; turn < this.config.turnLimit; turn++) { - if (signal.aborted) { - await emitEvent(streamWriter, { type: 'abort' }) - break - } - - // Start step (turn) - await emitEvent(streamWriter, { type: 'start-step' }) - - // Call Gemini API - let response: GeminiResponse - try { - response = await this.callGeminiApi() - } catch (error) { - const errorMsg = error instanceof Error ? error.message : String(error) - await emitEvent(streamWriter, { - type: 'error', - errorText: `API error: ${errorMsg}`, - }) - throw error - } - - // Check for API errors - if (response.error) { - await emitEvent(streamWriter, { - type: 'error', - errorText: response.error.message, - }) - throw new Error(`Gemini API error: ${response.error.message}`) - } - - // Extract text response - const textResponse = this.extractTextResponse(response) - if (textResponse) { - finalText = textResponse - const textId = randomUUID() - await emitEvent(streamWriter, { type: 'text-start', id: textId }) - await emitEvent(streamWriter, { - type: 'text-delta', - id: textId, - delta: textResponse, - }) - await emitEvent(streamWriter, { type: 'text-end', id: textId }) - } - - // Extract function calls - const actions = this.extractFunctionCalls(response) - - // If no actions, task is complete - if (actions.length === 0) { - await emitEvent(streamWriter, { type: 'finish-step' }) - await emitEvent(streamWriter, { - type: 'finish', - finishReason: 'completed', - }) - finished = true - break - } - - // Add model response to conversation - const candidate = response.candidates?.[0] - if (candidate?.content) { - this.contents.push(candidate.content) - } - - // Execute each action - for (const action of actions) { - if (signal.aborted) break - - const toolCallId = randomUUID() - - // Tool input events - await emitEvent(streamWriter, { - type: 'tool-input-start', - toolCallId, - toolName: action.name, - }) - await emitEvent(streamWriter, { - type: 'tool-input-available', - toolCallId, - toolName: action.name, - input: action.args, - }) - - const result = await this.actionMapper.execute(action) - totalActions++ - - // Tool output event - await emitEvent(streamWriter, { - type: 'tool-output-available', - toolCallId, - output: result, - }) - - // Call action hook (for screenshot capture) - if (this.actionHook) { - await this.actionHook(action, result) - } - } - - // Capture new screenshot and URL - const newScreenshot = await this.actionMapper.captureScreenshot() - const currentUrl = await this.actionMapper.getCurrentUrl() - - // Build function responses and add to conversation - const functionResponseParts = this.buildFunctionResponses( - actions, - currentUrl, - newScreenshot, - ) - this.contents.push({ role: 'user', parts: functionResponseParts }) - - // Finish step (turn) - await emitEvent(streamWriter, { type: 'finish-step' }) - } - - if (!finished && !signal.aborted) { - await emitEvent(streamWriter, { - type: 'finish', - finishReason: 'max_turns', - }) - } - - return { finalText, totalActions } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts deleted file mode 100644 index f012d0564..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/index.ts +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Gemini Computer Use Evaluator - * Implements AgentEvaluator interface for the eval framework - */ - -import { DEFAULT_TIMEOUT_MS } from '../../constants' -import type { GeminiComputerUseConfig, TaskMetadata } from '../../types' -import { resolveEnvValue } from '../../utils/resolve-env' -import { withEvalTimeout } from '../../utils/with-eval-timeout' -import type { AgentContext, AgentEvaluator, AgentResult } from '../types' -import { GeminiComputerUseAgent } from './agent' -import { DEFAULTS } from './types' - -export class GeminiComputerUseEvaluator implements AgentEvaluator { - constructor(private ctx: AgentContext) {} - - async execute(): Promise { - const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx - const agentConfig = config.agent as GeminiComputerUseConfig - - const startTime = Date.now() - const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS - - await capture.messageLogger.logUser(task.query) - - const apiKey = resolveEnvValue(agentConfig.apiKey) - if (!apiKey) { - throw new Error( - `API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`, - ) - } - - const agent = new GeminiComputerUseAgent({ - apiKey, - turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit, - screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize, - tabId, - windowId, - mcpUrl: `${config.browseros.server_url}/mcp`, - }) - - agent.setActionHook(async (_action, _result) => { - try { - await capture.screenshot.capture(capture.getActivePageId()) - } catch (err) { - console.warn('Screenshot capture failed in hook:', err) - } - }) - - const streamWriter = capture.createStreamWriter() - - let finalText: string | null = null - let totalActions = 0 - - const { terminationReason } = await withEvalTimeout( - timeoutMs, - capture, - async (signal) => { - const result = await agent.execute(task.query, streamWriter, signal) - finalText = result.finalText - totalActions = result.totalActions - return result - }, - ) - - const endTime = Date.now() - - const metadata: TaskMetadata = { - query_id: task.query_id, - dataset: task.dataset, - query: task.query, - started_at: new Date(startTime).toISOString(), - completed_at: new Date(endTime).toISOString(), - total_duration_ms: endTime - startTime, - total_steps: totalActions, - termination_reason: terminationReason, - final_answer: finalText ?? capture.getLastAssistantText(), - errors: capture.getErrors(), - warnings: capture.getWarnings(), - agent_config: { - type: 'gemini-computer-use', - model: DEFAULTS.model, - turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit, - screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize, - }, - grader_results: {}, - } - - await capture.trajectorySaver.saveMetadata(metadata) - - return { - metadata, - messages: capture.getMessages(), - finalAnswer: finalText ?? capture.getLastAssistantText(), - } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts b/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts deleted file mode 100644 index ef3af48b2..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/gemini-computer-use/types.ts +++ /dev/null @@ -1,156 +0,0 @@ -/** - * Types for Gemini Computer Use agent - */ - -import { z } from 'zod' - -// Gemini Computer Use predefined actions (from API docs) -export const ComputerUseActionSchema = z.discriminatedUnion('name', [ - z.object({ - name: z.literal('click_at'), - args: z.object({ - x: z.number().min(0).max(999), - y: z.number().min(0).max(999), - }), - }), - z.object({ - name: z.literal('type_text_at'), - args: z.object({ - x: z.number().min(0).max(999), - y: z.number().min(0).max(999), - text: z.string(), - press_enter: z.boolean().optional(), - clear_before_typing: z.boolean().optional(), - }), - }), - z.object({ - name: z.literal('navigate'), - args: z.object({ - url: z.string(), - }), - }), - z.object({ - name: z.literal('scroll_document'), - args: z.object({ - direction: z.enum(['up', 'down', 'left', 'right']), - }), - }), - z.object({ - name: z.literal('scroll_at'), - args: z.object({ - x: z.number().min(0).max(999), - y: z.number().min(0).max(999), - direction: z.enum(['up', 'down', 'left', 'right']), - magnitude: z.number().optional(), - }), - }), - z.object({ - name: z.literal('key_combination'), - args: z.object({ - keys: z.string(), - }), - }), - z.object({ - name: z.literal('hover_at'), - args: z.object({ - x: z.number().min(0).max(999), - y: z.number().min(0).max(999), - }), - }), - z.object({ - name: z.literal('go_back'), - args: z.object({}).optional(), - }), - z.object({ - name: z.literal('go_forward'), - args: z.object({}).optional(), - }), - z.object({ - name: z.literal('wait_5_seconds'), - args: z.object({}).optional(), - }), - z.object({ - name: z.literal('drag_and_drop'), - args: z.object({ - x: z.number().min(0).max(999), - y: z.number().min(0).max(999), - destination_x: z.number().min(0).max(999), - destination_y: z.number().min(0).max(999), - }), - }), -]) - -export type ComputerUseAction = z.infer - -// Screen size configuration -export interface ScreenSize { - width: number - height: number -} - -// Context for action execution -export interface ActionContext { - mcpUrl: string - tabId: number - windowId: number - screenSize: ScreenSize -} - -// Gemini API types -export interface GeminiContent { - role: 'user' | 'model' - parts: GeminiPart[] -} - -export interface GeminiPart { - text?: string - inlineData?: { - mimeType: string - data: string - } - functionCall?: { - name: string - args?: Record - } - functionResponse?: { - name: string - response: Record - } -} - -export interface GeminiResponse { - candidates?: Array<{ - content: GeminiContent - finishReason?: string - }> - error?: { - message: string - code: number - } -} - -// Safety decision from Computer Use -export interface SafetyDecision { - decision: 'allow' | 'require_confirmation' | 'block' - explanation?: string -} - -// Agent configuration -export interface GeminiComputerUseAgentConfig { - apiKey: string - turnLimit: number - screenSize: ScreenSize - tabId: number - windowId: number - mcpUrl: string -} - -// Defaults -export const DEFAULTS = { - // Gemini's recommended screenshot size for optimal model accuracy - screenshotSize: { width: 1440, height: 900 }, - // Fallback viewport size (used when actual viewport can't be determined) - screenSize: { width: 1440, height: 900 }, - turnLimit: 30, - model: 'gemini-2.5-computer-use-preview-10-2025', -} as const diff --git a/packages/browseros-agent/apps/eval/src/agents/index.ts b/packages/browseros-agent/apps/eval/src/agents/index.ts index 1ace1f82a..12bc40272 100644 --- a/packages/browseros-agent/apps/eval/src/agents/index.ts +++ b/packages/browseros-agent/apps/eval/src/agents/index.ts @@ -1,8 +1,6 @@ -import { GeminiComputerUseEvaluator } from './gemini-computer-use' import { OrchestratorExecutorEvaluator } from './orchestrator-executor' import { registerAgent } from './registry' import { SingleAgentEvaluator } from './single-agent' -import { YutoriNavigatorEvaluator } from './yutori-navigator' // Register built-in agent types registerAgent('single', (ctx) => new SingleAgentEvaluator(ctx)) @@ -10,11 +8,6 @@ registerAgent( 'orchestrator-executor', (ctx) => new OrchestratorExecutorEvaluator(ctx), ) -registerAgent( - 'gemini-computer-use', - (ctx) => new GeminiComputerUseEvaluator(ctx), -) -registerAgent('yutori-navigator', (ctx) => new YutoriNavigatorEvaluator(ctx)) // Re-exports export { diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts deleted file mode 100644 index e8bee5e17..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/action-mapper.ts +++ /dev/null @@ -1,677 +0,0 @@ -/** - * Maps Yutori n1 actions to MCP tool calls - * - * Coordinate System: - * - n1 outputs normalized coordinates in 1000x1000 grid - * - Screenshots captured with size='large' (1028px width, aspect ratio preserved) - * - We scale normalized coords to actual viewport pixels - * - * Action Mapping (prioritize MCP tools over execute_javascript): - * - click → browser_click_coordinates ✅ - * - type → browser_type_at_coordinates (uses last clicked coords) ✅ - * - scroll up/down → browser_scroll_up/down ✅ - * - scroll left/right → browser_execute_javascript (no horizontal scroll tool) - * - key_press → browser_send_keys (for supported keys) ✅ - * - hover → browser_execute_javascript (no dedicated MCP tool) - * - drag → browser_execute_javascript (no dedicated MCP tool) - * - wait → setTimeout - * - refresh → browser_execute_javascript (no dedicated MCP tool) - * - go_back → browser_execute_javascript (no dedicated MCP tool) - * - goto_url → browser_navigate ✅ - * - stop → returns answer (no MCP call) - * - read_texts_and_links → browser_get_page_content ✅ - */ - -import { Client } from '@modelcontextprotocol/sdk/client/index.js' -import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js' -import sharp from 'sharp' -import type { ActionContext, N1Action, ScreenSize } from './types' -import { DEFAULTS } from './types' - -/** - * Convert PNG base64 to WebP base64 for smaller payload size. - * Yutori n1 recommends WebP format for better compression. - */ -async function convertToWebP(pngBase64: string): Promise { - const pngBuffer = Buffer.from(pngBase64, 'base64') - const webpBuffer = await sharp(pngBuffer) - .webp({ quality: 80 }) // Good balance of quality and size - .toBuffer() - return webpBuffer.toString('base64') -} - -interface McpToolResult { - content: Array<{ - type: string - text?: string - data?: string - mimeType?: string - }> - isError?: boolean -} - -const MCP_TIMEOUT_MS = 30000 - -// Scroll amount per unit (n1 recommends treating each amount as 10-15% of screen) -const SCROLL_PERCENT_PER_UNIT = 0.12 // 12% of viewport per scroll unit - -export class ActionMapper { - private ctx: ActionContext - private cachedViewport: ScreenSize | null = null - // Track last clicked coordinates for type action (n1 type has no coords) - private lastClickCoordinates: { x: number; y: number } | null = null - - constructor(ctx: ActionContext) { - this.ctx = ctx - } - - // Store debug info about viewport detection for inclusion in responses - private viewportDebugInfo: string = '' - - /** - * Get the actual browser viewport size via JavaScript - * This is critical for correct coordinate mapping: - * - Screenshot is scaled to 1028px width (aspect ratio preserved) - * - Clicks must be at actual viewport coordinates - * - We scale: (normalized/1000) * viewport - * Caches the result to avoid repeated calls - * Also stores debug info for troubleshooting - */ - async getViewportSize(): Promise { - if (this.cachedViewport) { - return this.cachedViewport - } - - try { - const result = await this.callMcp('browser_execute_javascript', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - code: '[window.innerWidth, window.innerHeight]', - }) - - const textContent = - result.content.find((c) => c.type === 'text')?.text ?? '' - - // Check for error in result - if (result.isError) { - this.viewportDebugInfo = `[VIEWPORT ERROR] JS execution failed: ${textContent}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - return this.ctx.screenSize - } - - // Parse array format - can be multiline: [1440, 900] or "Result: [\n 1200,\n 712\n]" - const arrayMatch = textContent.match(/\[\s*(\d+)\s*,\s*(\d+)\s*\]/s) - if (arrayMatch) { - const width = parseInt(arrayMatch[1], 10) - const height = parseInt(arrayMatch[2], 10) - if (width > 0 && height > 0) { - this.cachedViewport = { width, height } - this.viewportDebugInfo = `[VIEWPORT OK] Detected: ${width}x${height} (raw: "${textContent.substring(0, 100)}")` - console.log(this.viewportDebugInfo) - return this.cachedViewport - } else { - this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Invalid dimensions: ${width}x${height} from "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - } else { - this.viewportDebugInfo = `[VIEWPORT PARSE ERROR] Could not parse: "${textContent}". Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - } catch (error) { - const errMsg = error instanceof Error ? error.message : String(error) - this.viewportDebugInfo = `[VIEWPORT EXCEPTION] ${errMsg}. Using fallback: ${this.ctx.screenSize.width}x${this.ctx.screenSize.height}` - console.warn(this.viewportDebugInfo) - } - - // Fallback to config screenSize - return this.ctx.screenSize - } - - /** - * Clear cached viewport (call when tab/window changes or before new task) - */ - clearViewportCache(): void { - this.cachedViewport = null - } - - /** - * Reset all tracked state (call before starting a new task) - */ - reset(): void { - this.cachedViewport = null - this.lastClickCoordinates = null - } - - /** - * Scale normalized coordinate (0-1000) to actual viewport pixel value - * - * How it works: - * - Screenshot is captured at 1028px width with preserved aspect ratio - * - n1 predicts normalized coords (0-1000) for that screenshot - * - Since aspect ratio is preserved, we can scale directly to viewport - * - Formula: actualX = (normalizedX / 1000) * viewport.innerWidth - */ - private async scaleCoordinates( - normalizedX: number, - normalizedY: number, - ): Promise<{ x: number; y: number }> { - const viewport = await this.getViewportSize() - return { - x: Math.round((normalizedX / DEFAULTS.normalizedMax) * viewport.width), - y: Math.round((normalizedY / DEFAULTS.normalizedMax) * viewport.height), - } - } - - /** - * Call an MCP tool - */ - private async callMcp( - name: string, - args: Record = {}, - ): Promise { - const client = new Client({ - name: 'yutori-navigator', - version: '1.0.0', - }) - - const transport = new StreamableHTTPClientTransport( - new URL(this.ctx.mcpUrl), - { - requestInit: { - headers: { 'X-BrowserOS-Source': 'yutori-navigator' }, - }, - }, - ) - - try { - await client.connect(transport) - - const toolCallPromise = client.callTool({ name, arguments: args }) - let timeoutId: ReturnType | null = null - const timeoutPromise = new Promise((_, reject) => { - timeoutId = setTimeout( - () => - reject( - new Error(`MCP tool call timed out after ${MCP_TIMEOUT_MS}ms`), - ), - MCP_TIMEOUT_MS, - ) - }) - - try { - return (await Promise.race([ - toolCallPromise, - timeoutPromise, - ])) as McpToolResult - } finally { - if (timeoutId) clearTimeout(timeoutId) - } - } finally { - try { - await transport.close() - } catch { - // Ignore close errors - } - } - } - - /** - * Execute an n1 action by mapping to MCP tools - * Prioritizes native MCP tools over browser_execute_javascript for reliability - * Returns the result message and optionally the stop answer - */ - async execute( - action: N1Action, - ): Promise<{ success: boolean; message: string; stopAnswer?: string }> { - const { tabId, windowId } = this.ctx - - try { - switch (action.action_type) { - case 'click': { - const [normX, normY] = action.center_coordinates - const viewport = await this.getViewportSize() - const { x, y } = await this.scaleCoordinates(normX, normY) - - // Track coordinates for subsequent type action (n1 type has no coords) - this.lastClickCoordinates = { x, y } - - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - // Return original coordinates + debug info - const debugInfo = `[DEBUG: input=(${normX},${normY}) → viewport=(${x},${y}), viewport=${viewport.width}x${viewport.height}] ${this.viewportDebugInfo}` - return { - success: true, - message: `Clicked at (${normX}, ${normY}). ${debugInfo}`, - } - } - - case 'type': { - const { text, press_enter_after, clear_before_typing } = action - - // n1 type action has no coordinates - it expects element to be focused - // Use last clicked coordinates with browser_type_at_coordinates - if (!this.lastClickCoordinates) { - // Fallback: click center of screen if no prior click - const viewport = await this.getViewportSize() - this.lastClickCoordinates = { - x: Math.round(viewport.width / 2), - y: Math.round(viewport.height / 2), - } - } - - const { x, y } = this.lastClickCoordinates - - // Clear field first if requested using native MCP tools - if (clear_before_typing) { - // Triple-click to select all text in the field - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - // Use Delete key to clear - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Delete', - }) - } - - // Use browser_type_at_coordinates - the proper MCP tool for typing - await this.callMcp('browser_type_at_coordinates', { - tabId, - windowId, - x, - y, - text, - }) - - // Press Enter if requested using native MCP tool - if (press_enter_after) { - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: 'Enter', - }) - } - - // n1 type action has no coordinates - don't include viewport coords in response - return { - success: true, - message: `Typed "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}"`, - } - } - - case 'scroll': { - const { direction, center_coordinates, amount } = action - const [normX, normY] = center_coordinates - const { x, y } = await this.scaleCoordinates(normX, normY) - - // Track coordinates - this.lastClickCoordinates = { x, y } - - // Click at position first to focus element (for scrollable containers) - await this.callMcp('browser_click_coordinates', { - tabId, - windowId, - x, - y, - }) - - // For vertical scroll (up/down): use native MCP scroll tools - // For horizontal scroll (left/right): use JS (no MCP tool available) - if (direction === 'up' || direction === 'down') { - const scrollTool = - direction === 'up' ? 'browser_scroll_up' : 'browser_scroll_down' - - // Calculate how many scroll calls based on amount - // n1 amount 1-2 = ~20% viewport, our tool = 100% viewport - // So we scroll once for small amounts, more for larger - const scrollCount = Math.max(1, Math.round(amount / 5)) - - for (let i = 0; i < scrollCount; i++) { - await this.callMcp(scrollTool, { tabId, windowId }) - // Small delay between scrolls for stability - if (i < scrollCount - 1) { - await new Promise((r) => setTimeout(r, 100)) - } - } - - // Return original normalized coordinates - return { - success: true, - message: `Scrolled ${direction} at (${normX}, ${normY})`, - } - } else { - // Horizontal scroll - no MCP tool, use JS - const viewport = await this.getViewportSize() - const scrollPixels = Math.round( - amount * SCROLL_PERCENT_PER_UNIT * viewport.width, - ) - const scrollCode = - direction === 'left' - ? `window.scrollBy(-${scrollPixels}, 0)` - : `window.scrollBy(${scrollPixels}, 0)` - - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: scrollCode, - }) - - // Return original normalized coordinates - return { - success: true, - message: `Scrolled ${direction} at (${normX}, ${normY})`, - } - } - } - - case 'key_press': { - const { key_comb } = action - - // Map keys to browser_send_keys supported keys - // browser_send_keys supports: Enter, Delete, Backspace, Tab, Escape, - // ArrowUp, ArrowDown, ArrowLeft, ArrowRight, Home, End, PageUp, PageDown - const keyMap: Record = { - Enter: 'Enter', - Escape: 'Escape', - Tab: 'Tab', - Backspace: 'Backspace', - Delete: 'Delete', - ArrowUp: 'ArrowUp', - ArrowDown: 'ArrowDown', - ArrowLeft: 'ArrowLeft', - ArrowRight: 'ArrowRight', - Home: 'Home', - End: 'End', - PageUp: 'PageUp', - PageDown: 'PageDown', - // Alternative names n1 might use - Return: 'Enter', - Esc: 'Escape', - Up: 'ArrowUp', - Down: 'ArrowDown', - Left: 'ArrowLeft', - Right: 'ArrowRight', - } - - const mappedKey = keyMap[key_comb] - if (mappedKey) { - // Use native MCP tool - await this.callMcp('browser_send_keys', { - tabId, - windowId, - key: mappedKey, - }) - } else { - // For complex key combinations (Ctrl+A, etc.), use JavaScript - const parts = key_comb.split('+') - const mainKey = parts.pop() || '' - const modifiers = parts.map((p) => p.toLowerCase()) - - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const event = new KeyboardEvent('keydown', { - key: '${mainKey}', - code: 'Key${mainKey.toUpperCase()}', - ctrlKey: ${modifiers.includes('control') || modifiers.includes('ctrl')}, - shiftKey: ${modifiers.includes('shift')}, - altKey: ${modifiers.includes('alt')}, - metaKey: ${modifiers.includes('meta') || modifiers.includes('cmd')}, - bubbles: true - }); - document.activeElement?.dispatchEvent(event); - `, - }) - } - - return { success: true, message: `Pressed ${key_comb}` } - } - - case 'hover': { - // No dedicated MCP hover tool - use JS - const [normX, normY] = action.center_coordinates - const { x, y } = await this.scaleCoordinates(normX, normY) - - // Track coordinates - this.lastClickCoordinates = { x, y } - - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const elem = document.elementFromPoint(${x}, ${y}); - if (elem) { - const event = new MouseEvent('mouseover', { - bubbles: true, - clientX: ${x}, - clientY: ${y} - }); - elem.dispatchEvent(event); - } - `, - }) - - // Return original normalized coordinates - return { success: true, message: `Hovered at (${normX}, ${normY})` } - } - - case 'drag': { - // No dedicated MCP drag tool - use JS - const [startNormX, startNormY] = action.start_coordinates - const [endNormX, endNormY] = action.center_coordinates - const start = await this.scaleCoordinates(startNormX, startNormY) - const end = await this.scaleCoordinates(endNormX, endNormY) - - // Track end coordinates - this.lastClickCoordinates = end - - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: ` - const startElem = document.elementFromPoint(${start.x}, ${start.y}); - const endElem = document.elementFromPoint(${end.x}, ${end.y}); - if (startElem && endElem) { - const dragStart = new DragEvent('dragstart', { - bubbles: true, - clientX: ${start.x}, - clientY: ${start.y} - }); - const drop = new DragEvent('drop', { - bubbles: true, - clientX: ${end.x}, - clientY: ${end.y} - }); - const dragEnd = new DragEvent('dragend', { bubbles: true }); - startElem.dispatchEvent(dragStart); - endElem.dispatchEvent(drop); - startElem.dispatchEvent(dragEnd); - } - `, - }) - - // Return original normalized coordinates - return { - success: true, - message: `Dragged from (${startNormX}, ${startNormY}) to (${endNormX}, ${endNormY})`, - } - } - - case 'wait': { - // n1 uses this for page loads - await new Promise((resolve) => setTimeout(resolve, 2000)) - return { success: true, message: 'Waited 2 seconds' } - } - - case 'refresh': { - // No dedicated MCP refresh tool - use JS - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: 'location.reload()', - }) - // Wait for page to start reloading - await new Promise((resolve) => setTimeout(resolve, 1000)) - return { success: true, message: 'Refreshed page' } - } - - case 'go_back': { - // No dedicated MCP go_back tool - use JS - await this.callMcp('browser_execute_javascript', { - tabId, - windowId, - code: 'history.back()', - }) - return { success: true, message: 'Navigated back' } - } - - case 'goto_url': { - // Use native MCP navigate tool - await this.callMcp('browser_navigate', { - tabId, - windowId, - url: action.url, - }) - return { success: true, message: `Navigated to ${action.url}` } - } - - case 'read_texts_and_links': { - // Use native MCP tool - const result = await this.callMcp('browser_get_page_content', { - tabId, - windowId, - type: 'text-with-links', - }) - const content = - result.content.find((c) => c.type === 'text')?.text ?? '' - return { - success: true, - message: `Read page content (${content.length} chars)`, - } - } - - case 'stop': { - // Stop action - task is complete, return the answer - return { - success: true, - message: 'Task completed', - stopAnswer: action.answer, - } - } - - default: { - const _exhaustive: never = action - return { - success: false, - message: `Unknown action: ${JSON.stringify(action)}`, - } - } - } - } catch (error) { - const message = error instanceof Error ? error.message : String(error) - return { success: false, message: `Action failed: ${message}` } - } - } - - /** - * Capture a screenshot via MCP with retry logic - * - * Uses Yutori's recommended screenshot size (1280x800) for optimal model performance. - * Now that viewport detection is working correctly, the coordinate mapping will be accurate. - * - * Returns WebP base64 string - */ - async captureScreenshot(retries = 2): Promise { - const { width, height } = DEFAULTS.screenshotSize - - for (let attempt = 0; attempt <= retries; attempt++) { - try { - const result = await this.callMcp('browser_get_screenshot', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - width, - height, - showHighlights: false, - }) - - if (result.isError) { - const errorText = - result.content?.find((c) => c.type === 'text')?.text ?? - 'Unknown error' - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1} failed: ${errorText}, retrying...`, - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - console.warn('Screenshot capture failed:', errorText) - return null - } - - const imageContent = result.content.find((c) => c.type === 'image') - if (imageContent?.data) { - // Convert PNG to WebP for smaller payload (n1 recommends WebP) - try { - const webpBase64 = await convertToWebP(imageContent.data) - return webpBase64 - } catch (conversionError) { - console.warn('WebP conversion failed, using PNG:', conversionError) - return imageContent.data - } - } - - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1}: No image data, retrying...`, - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - return null - } catch (error) { - if (attempt < retries) { - console.warn( - `Screenshot attempt ${attempt + 1} error:`, - error, - 'retrying...', - ) - await new Promise((r) => setTimeout(r, 500)) - continue - } - console.warn('Screenshot capture error:', error) - return null - } - } - return null - } - - /** - * Get current page URL via MCP - */ - async getCurrentUrl(): Promise { - try { - const result = await this.callMcp('browser_execute_javascript', { - tabId: this.ctx.tabId, - windowId: this.ctx.windowId, - code: 'window.location.href', - }) - - const textContent = - result.content.find((c) => c.type === 'text')?.text ?? '' - const urlMatch = textContent.match(/Result:\s*"?([^"\n]+)"?/) - return urlMatch?.[1] ?? 'unknown' - } catch { - return 'unknown' - } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts deleted file mode 100644 index 408327caa..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/agent.ts +++ /dev/null @@ -1,353 +0,0 @@ -/** - * Yutori Navigator n1 Agent - * - * Implements the agent loop that calls Yutori n1 API and executes actions. - * Uses UIMessageStreamEvent format for logging compatibility. - * - * n1 API follows OpenAI Chat Completions interface with special 'observation' role - * for screenshots. Full conversation history must be maintained. - */ - -import { randomUUID } from 'node:crypto' -import { ActionMapper } from './action-mapper' -import { - DEFAULTS, - type N1Action, - type N1ChatCompletionResponse, - type N1Message, - N1ResponseSchema, - YUTORI_API_BASE, - type YutoriNavigatorAgentConfig, -} from './types' - -interface StreamWriter { - write: (data: string) => Promise -} - -type ActionHook = ( - action: N1Action, - result: { success: boolean; message: string }, -) => Promise - -/** - * Emit SSE-formatted UIMessageStreamEvent - */ -function emitEvent( - writer: StreamWriter, - event: Record, -): Promise { - return writer.write(`data: ${JSON.stringify(event)}\n\n`) -} - -export class YutoriNavigatorAgent { - private config: YutoriNavigatorAgentConfig - private actionMapper: ActionMapper - private actionHook?: ActionHook - private messages: N1Message[] = [] - - constructor(config: YutoriNavigatorAgentConfig) { - this.config = config - this.actionMapper = new ActionMapper({ - mcpUrl: config.mcpUrl, - tabId: config.tabId, - windowId: config.windowId, - screenSize: config.screenSize, - }) - } - - /** - * Set a hook to be called after each action execution - */ - setActionHook(hook: ActionHook): void { - this.actionHook = hook - } - - /** - * Build observation message with screenshot and optional URL - */ - private buildObservationMessage( - screenshotBase64: string, - currentUrl?: string, - ): N1Message { - const content: N1Message['content'] = [] - - // Include URL if available (recommended by Yutori for better attribution) - if (currentUrl) { - content.push({ - type: 'text', - text: `Current URL: ${currentUrl}`, - }) - } - - // Add screenshot as base64 data URL (WebP for smaller payload) - content.push({ - type: 'image_url', - image_url: { - url: `data:image/webp;base64,${screenshotBase64}`, - }, - }) - - return { - role: 'observation', - content, - } - } - - /** - * Call the Yutori n1 API - */ - private async callN1Api(): Promise { - const url = `${YUTORI_API_BASE}/chat/completions` - - const response = await fetch(url, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - Authorization: `Bearer ${this.config.apiKey}`, - }, - body: JSON.stringify({ - model: DEFAULTS.model, - messages: this.messages, - temperature: DEFAULTS.temperature, - }), - }) - - if (!response.ok) { - const errorBody = await response.text() - throw new Error( - `Yutori n1 API error: ${response.status} ${response.statusText} - ${errorBody}`, - ) - } - - return response.json() - } - - /** - * Parse n1 response content to extract thoughts and actions - */ - private parseN1Response( - content: string, - ): { thoughts: string; actions: N1Action[] } | null { - try { - const parsed = JSON.parse(content) - const validated = N1ResponseSchema.safeParse(parsed) - - if (validated.success) { - return validated.data - } - - console.warn('n1 response validation failed:', validated.error.message) - // Try to extract what we can - return { - thoughts: parsed.thoughts ?? '', - actions: Array.isArray(parsed.actions) ? parsed.actions : [], - } - } catch (error) { - console.warn('Failed to parse n1 response:', error) - return null - } - } - - /** - * Execute the agent loop - */ - async execute( - query: string, - streamWriter: StreamWriter, - signal: AbortSignal, - ): Promise<{ finalText: string | null; totalActions: number }> { - let totalActions = 0 - let finalText: string | null = null - - // Wait for page to stabilize before first screenshot - await new Promise((resolve) => setTimeout(resolve, 2000)) - - // Capture initial screenshot with retries - let initialScreenshot: string | null = null - for (let attempt = 1; attempt <= 3; attempt++) { - initialScreenshot = await this.actionMapper.captureScreenshot() - if (initialScreenshot) break - console.warn(`Initial screenshot attempt ${attempt} failed, retrying...`) - await new Promise((resolve) => setTimeout(resolve, 1000)) - } - - if (!initialScreenshot) { - throw new Error('Failed to capture initial screenshot after 3 attempts') - } - - // Get initial URL - const initialUrl = await this.actionMapper.getCurrentUrl() - - // Build initial messages - // 1. User message with task - this.messages.push({ - role: 'user', - content: [{ type: 'text', text: query }], - }) - - // 2. Initial observation with screenshot - this.messages.push( - this.buildObservationMessage(initialScreenshot, initialUrl), - ) - - // Emit start event - const messageId = randomUUID() - await emitEvent(streamWriter, { type: 'start', messageId }) - - let finished = false - for (let turn = 0; turn < this.config.turnLimit; turn++) { - if (signal.aborted) { - await emitEvent(streamWriter, { type: 'abort' }) - break - } - - // Start step (turn) - await emitEvent(streamWriter, { type: 'start-step' }) - - // Call n1 API - let response: N1ChatCompletionResponse - try { - response = await this.callN1Api() - } catch (error) { - const errorMsg = error instanceof Error ? error.message : String(error) - await emitEvent(streamWriter, { - type: 'error', - errorText: `API error: ${errorMsg}`, - }) - throw error - } - - // Extract response content - const choice = response.choices?.[0] - if (!choice?.message?.content) { - await emitEvent(streamWriter, { - type: 'error', - errorText: 'Empty response from n1 API', - }) - throw new Error('Empty response from n1 API') - } - - const assistantContent = choice.message.content - - // Parse the JSON response - const parsed = this.parseN1Response(assistantContent) - if (!parsed) { - await emitEvent(streamWriter, { - type: 'error', - errorText: 'Failed to parse n1 response', - }) - throw new Error('Failed to parse n1 response') - } - - const { thoughts, actions } = parsed - - // Emit thoughts as text - if (thoughts) { - finalText = thoughts - const textId = randomUUID() - await emitEvent(streamWriter, { type: 'text-start', id: textId }) - await emitEvent(streamWriter, { - type: 'text-delta', - id: textId, - delta: thoughts, - }) - await emitEvent(streamWriter, { type: 'text-end', id: textId }) - } - - // Check for stop action or no actions - const stopAction = actions.find((a) => a.action_type === 'stop') - if (stopAction && stopAction.action_type === 'stop') { - finalText = stopAction.answer - await emitEvent(streamWriter, { type: 'finish-step' }) - await emitEvent(streamWriter, { - type: 'finish', - finishReason: 'completed', - }) - finished = true - break - } - - if (actions.length === 0) { - await emitEvent(streamWriter, { type: 'finish-step' }) - await emitEvent(streamWriter, { - type: 'finish', - finishReason: 'completed', - }) - finished = true - break - } - - // Add assistant response to conversation history - this.messages.push({ - role: 'assistant', - content: assistantContent, - }) - - // Execute each action - for (const action of actions) { - if (signal.aborted) break - - // Skip stop actions (handled above) - if (action.action_type === 'stop') continue - - const toolCallId = randomUUID() - - // Tool input events - await emitEvent(streamWriter, { - type: 'tool-input-start', - toolCallId, - toolName: action.action_type, - }) - await emitEvent(streamWriter, { - type: 'tool-input-available', - toolCallId, - toolName: action.action_type, - input: action, - }) - - const result = await this.actionMapper.execute(action) - totalActions++ - - // Check if this was a stop action that returned an answer - if (result.stopAnswer) { - finalText = result.stopAnswer - } - - // Tool output event - await emitEvent(streamWriter, { - type: 'tool-output-available', - toolCallId, - output: result, - }) - - // Call action hook (for screenshot capture) - if (this.actionHook) { - await this.actionHook(action, result) - } - } - - // Capture new screenshot and URL for next turn - const newScreenshot = await this.actionMapper.captureScreenshot() - const currentUrl = await this.actionMapper.getCurrentUrl() - - // Add observation for next turn (n1 requires full history) - if (newScreenshot) { - this.messages.push( - this.buildObservationMessage(newScreenshot, currentUrl), - ) - } - - // Finish step (turn) - await emitEvent(streamWriter, { type: 'finish-step' }) - } - - if (!finished && !signal.aborted) { - await emitEvent(streamWriter, { - type: 'finish', - finishReason: 'max_turns', - }) - } - - return { finalText, totalActions } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts deleted file mode 100644 index ec41783a5..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/index.ts +++ /dev/null @@ -1,97 +0,0 @@ -/** - * Yutori Navigator Evaluator - * Implements AgentEvaluator interface for the eval framework - */ - -import { DEFAULT_TIMEOUT_MS } from '../../constants' -import type { TaskMetadata, YutoriNavigatorConfig } from '../../types' -import { resolveEnvValue } from '../../utils/resolve-env' -import { withEvalTimeout } from '../../utils/with-eval-timeout' -import type { AgentContext, AgentEvaluator, AgentResult } from '../types' -import { YutoriNavigatorAgent } from './agent' -import { DEFAULTS } from './types' - -export class YutoriNavigatorEvaluator implements AgentEvaluator { - constructor(private ctx: AgentContext) {} - - async execute(): Promise { - const { config, task, capture, windowId = 0, tabId = 0 } = this.ctx - const agentConfig = config.agent as YutoriNavigatorConfig - - const startTime = Date.now() - const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS - - await capture.messageLogger.logUser(task.query) - - const apiKey = resolveEnvValue(agentConfig.apiKey) - if (!apiKey) { - throw new Error( - `API key not found. Set ${agentConfig.apiKey} environment variable or provide the key directly.`, - ) - } - - const agent = new YutoriNavigatorAgent({ - apiKey, - turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit, - screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize, - tabId, - windowId, - mcpUrl: `${config.browseros.server_url}/mcp`, - }) - - agent.setActionHook(async (_action, _result) => { - try { - await capture.screenshot.capture(capture.getActivePageId()) - } catch (err) { - console.warn('Screenshot capture failed in hook:', err) - } - }) - - const streamWriter = capture.createStreamWriter() - - let finalText: string | null = null - let totalActions = 0 - - const { terminationReason } = await withEvalTimeout( - timeoutMs, - capture, - async (signal) => { - const result = await agent.execute(task.query, streamWriter, signal) - finalText = result.finalText - totalActions = result.totalActions - return result - }, - ) - - const endTime = Date.now() - - const metadata: TaskMetadata = { - query_id: task.query_id, - dataset: task.dataset, - query: task.query, - started_at: new Date(startTime).toISOString(), - completed_at: new Date(endTime).toISOString(), - total_duration_ms: endTime - startTime, - total_steps: totalActions, - termination_reason: terminationReason, - final_answer: finalText ?? capture.getLastAssistantText(), - errors: capture.getErrors(), - warnings: capture.getWarnings(), - agent_config: { - type: 'yutori-navigator', - model: DEFAULTS.model, - turnLimit: agentConfig.turnLimit ?? DEFAULTS.turnLimit, - screenSize: agentConfig.screenSize ?? DEFAULTS.screenSize, - }, - grader_results: {}, - } - - await capture.trajectorySaver.saveMetadata(metadata) - - return { - metadata, - messages: capture.getMessages(), - finalAnswer: finalText ?? capture.getLastAssistantText(), - } - } -} diff --git a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts b/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts deleted file mode 100644 index a1e3903b1..000000000 --- a/packages/browseros-agent/apps/eval/src/agents/yutori-navigator/types.ts +++ /dev/null @@ -1,158 +0,0 @@ -/** - * Types for Yutori Navigator n1 agent - * - * n1 is a pixels-to-actions LLM that follows OpenAI Chat Completions interface. - * Coordinates are normalized to 1000x1000 grid. - * Recommended screenshot size: 1280x800 (WXGA 16:10) - */ - -import { z } from 'zod' - -// n1 action schemas based on API documentation -export const N1ActionSchema = z.discriminatedUnion('action_type', [ - z.object({ - action_type: z.literal('click'), - center_coordinates: z.tuple([z.number(), z.number()]), - }), - z.object({ - action_type: z.literal('scroll'), - direction: z.enum(['up', 'down', 'left', 'right']), - center_coordinates: z.tuple([z.number(), z.number()]), - amount: z.number().int().min(1).max(10), - }), - z.object({ - action_type: z.literal('type'), - text: z.string(), - press_enter_after: z.boolean().optional(), - clear_before_typing: z.boolean().optional(), - }), - z.object({ - action_type: z.literal('key_press'), - key_comb: z.string(), // Playwright keyboard press format - }), - z.object({ - action_type: z.literal('hover'), - center_coordinates: z.tuple([z.number(), z.number()]), - }), - z.object({ - action_type: z.literal('drag'), - start_coordinates: z.tuple([z.number(), z.number()]), - center_coordinates: z.tuple([z.number(), z.number()]), // destination - }), - z.object({ - action_type: z.literal('wait'), - }), - z.object({ - action_type: z.literal('refresh'), - }), - z.object({ - action_type: z.literal('go_back'), - }), - z.object({ - action_type: z.literal('goto_url'), - url: z.string(), - }), - z.object({ - action_type: z.literal('read_texts_and_links'), - }), - z.object({ - action_type: z.literal('stop'), - answer: z.string(), - }), -]) - -export type N1Action = z.infer - -// n1 API response format -export const N1ResponseSchema = z.object({ - thoughts: z.string(), - actions: z.array(N1ActionSchema), -}) - -export type N1Response = z.infer - -// Screen size configuration -export interface ScreenSize { - width: number - height: number -} - -// Context for action execution -export interface ActionContext { - mcpUrl: string - tabId: number - windowId: number - screenSize: ScreenSize -} - -// OpenAI-compatible message types for n1 API -export type N1MessageRole = 'user' | 'assistant' | 'observation' - -export interface N1TextContent { - type: 'text' - text: string -} - -export interface N1ImageContent { - type: 'image_url' - image_url: { - url: string // Can be URL or data:image/webp;base64,... - } -} - -export type N1ContentPart = N1TextContent | N1ImageContent - -export interface N1Message { - role: N1MessageRole - content: string | N1ContentPart[] -} - -export interface N1ChatCompletionRequest { - model: string - messages: N1Message[] - temperature?: number -} - -export interface N1ChatCompletionResponse { - id: string - object: string - created: number - model: string - choices: Array<{ - index: number - message: { - role: 'assistant' - content: string // JSON string containing N1Response - } - finish_reason: string - }> - usage?: { - prompt_tokens: number - completion_tokens: number - total_tokens: number - } -} - -// Agent configuration -export interface YutoriNavigatorAgentConfig { - apiKey: string - turnLimit: number - screenSize: ScreenSize - tabId: number - windowId: number - mcpUrl: string -} - -// Defaults based on Yutori documentation -export const DEFAULTS = { - // WXGA 16:10 - Yutori's recommended screenshot size - screenshotSize: { width: 1280, height: 800 }, - screenSize: { width: 1280, height: 800 }, - turnLimit: 30, - model: 'n1-preview-2025-11', - temperature: 0.3, - // n1 uses 1000x1000 normalized coordinate system - normalizedMax: 1000, -} as const - -export const YUTORI_API_BASE = 'https://api.yutori.com/v1' diff --git a/packages/browseros-agent/apps/eval/src/dashboard/index.html b/packages/browseros-agent/apps/eval/src/dashboard/index.html index 59d7a401d..b65ba5d9f 100644 --- a/packages/browseros-agent/apps/eval/src/dashboard/index.html +++ b/packages/browseros-agent/apps/eval/src/dashboard/index.html @@ -197,8 +197,6 @@

Agent

@@ -287,50 +285,6 @@

Executor

- -
-
- - -
-
-
- - -
-
- - -
-
-
- - -
-
- - -
-
- - -
-
-
- - -
-
- - -
-
-
- - -
-
- @@ -816,20 +770,6 @@

Grader

setVal('cfg-exec-model', exec.model); setVal('cfg-exec-apikey', exec.apiKey); setVal('cfg-exec-baseurl', exec.baseUrl); - } else if (type === 'gemini-computer-use') { - setVal('cfg-gemini-apikey', agent.apiKey); - if (agent.screenSize) { - setVal('cfg-gemini-width', agent.screenSize.width); - setVal('cfg-gemini-height', agent.screenSize.height); - } - setVal('cfg-gemini-turns', agent.turnLimit); - } else if (type === 'yutori-navigator') { - setVal('cfg-yutori-apikey', agent.apiKey); - if (agent.screenSize) { - setVal('cfg-yutori-width', agent.screenSize.width); - setVal('cfg-yutori-height', agent.screenSize.height); - } - setVal('cfg-yutori-turns', agent.turnLimit); } // Infrastructure @@ -913,26 +853,6 @@

Grader

baseUrl: getVal('cfg-exec-baseurl') || undefined, }, }; - } else if (type === 'gemini-computer-use') { - agent = { - type: 'gemini-computer-use', - apiKey: getVal('cfg-gemini-apikey'), - screenSize: { - width: parseInt(getVal('cfg-gemini-width'), 10) || 1440, - height: parseInt(getVal('cfg-gemini-height'), 10) || 900, - }, - turnLimit: parseInt(getVal('cfg-gemini-turns'), 10) || 30, - }; - } else if (type === 'yutori-navigator') { - agent = { - type: 'yutori-navigator', - apiKey: getVal('cfg-yutori-apikey'), - screenSize: { - width: parseInt(getVal('cfg-yutori-width'), 10) || 1280, - height: parseInt(getVal('cfg-yutori-height'), 10) || 800, - }, - turnLimit: parseInt(getVal('cfg-yutori-turns'), 10) || 30, - }; } // Dataset: use dropdown value unless custom is selected diff --git a/packages/browseros-agent/apps/eval/src/types/config.ts b/packages/browseros-agent/apps/eval/src/types/config.ts index bf181535b..b2e9ff387 100644 --- a/packages/browseros-agent/apps/eval/src/types/config.ts +++ b/packages/browseros-agent/apps/eval/src/types/config.ts @@ -19,37 +19,9 @@ export const OrchestratorExecutorConfigSchema = z.object({ }), }) -export const GeminiComputerUseConfigSchema = z.object({ - type: z.literal('gemini-computer-use'), - apiKey: z - .string() - .describe('API key or env var name (e.g., GOOGLE_AI_API_KEY)'), - screenSize: z - .object({ - width: z.number().int().min(800).max(2560).default(1440), - height: z.number().int().min(600).max(1440).default(900), - }) - .optional(), - turnLimit: z.number().int().min(1).max(100).default(30), -}) - -export const YutoriNavigatorConfigSchema = z.object({ - type: z.literal('yutori-navigator'), - apiKey: z.string().describe('API key or env var name (e.g., YUTORI_API_KEY)'), - screenSize: z - .object({ - width: z.number().int().min(800).max(2560).default(1280), - height: z.number().int().min(600).max(1440).default(800), - }) - .optional(), - turnLimit: z.number().int().min(1).max(100).default(30), -}) - export const AgentConfigSchema = z.discriminatedUnion('type', [ SingleAgentConfigSchema, OrchestratorExecutorConfigSchema, - GeminiComputerUseConfigSchema, - YutoriNavigatorConfigSchema, ]) export const EvalConfigSchema = z.object({ @@ -84,9 +56,5 @@ export type SingleAgentConfig = z.infer export type OrchestratorExecutorConfig = z.infer< typeof OrchestratorExecutorConfigSchema > -export type GeminiComputerUseConfig = z.infer< - typeof GeminiComputerUseConfigSchema -> -export type YutoriNavigatorConfig = z.infer export type AgentConfig = z.infer export type EvalConfig = z.infer diff --git a/packages/browseros-agent/apps/eval/src/types/index.ts b/packages/browseros-agent/apps/eval/src/types/index.ts index e55f73a71..84c23a3c4 100644 --- a/packages/browseros-agent/apps/eval/src/types/index.ts +++ b/packages/browseros-agent/apps/eval/src/types/index.ts @@ -4,14 +4,10 @@ export { AgentConfigSchema, type EvalConfig, EvalConfigSchema, - type GeminiComputerUseConfig, - GeminiComputerUseConfigSchema, type OrchestratorExecutorConfig, OrchestratorExecutorConfigSchema, type SingleAgentConfig, SingleAgentConfigSchema, - type YutoriNavigatorConfig, - YutoriNavigatorConfigSchema, } from './config' // Error types export { diff --git a/packages/browseros-agent/apps/eval/src/types/result.ts b/packages/browseros-agent/apps/eval/src/types/result.ts index 29fa3f5c9..5678f8a3e 100644 --- a/packages/browseros-agent/apps/eval/src/types/result.ts +++ b/packages/browseros-agent/apps/eval/src/types/result.ts @@ -13,12 +13,7 @@ export const GraderResultSchema = z.object({ // Agent config in metadata const AgentConfigMetaSchema = z .object({ - type: z.enum([ - 'single', - 'orchestrator-executor', - 'gemini-computer-use', - 'yutori-navigator', - ]), + type: z.enum(['single', 'orchestrator-executor']), model: z.string().optional(), }) .passthrough() diff --git a/packages/browseros-agent/apps/eval/src/utils/config-validator.ts b/packages/browseros-agent/apps/eval/src/utils/config-validator.ts index 8b68ea843..d41a62288 100644 --- a/packages/browseros-agent/apps/eval/src/utils/config-validator.ts +++ b/packages/browseros-agent/apps/eval/src/utils/config-validator.ts @@ -70,13 +70,6 @@ export async function validateConfig( envVarsToCheck.push(config.agent.executor.apiKey) } } - } else if (config.agent.type === 'gemini-computer-use') { - // Gemini Computer Use agent - if (config.agent.apiKey) { - if (/^[A-Z][A-Z0-9_]*$/.test(config.agent.apiKey)) { - envVarsToCheck.push(config.agent.apiKey) - } - } } // Grader API key is checked at runtime - just warn if not set diff --git a/packages/browseros-agent/bun.lock b/packages/browseros-agent/bun.lock index 4ddbed64c..c8b268c9b 100644 --- a/packages/browseros-agent/bun.lock +++ b/packages/browseros-agent/bun.lock @@ -135,8 +135,6 @@ "@aws-sdk/client-s3": "^3.1014.0", "@browseros/server": "workspace:*", "@browseros/shared": "workspace:*", - "@google/gemini-cli-core": "^0.16.0", - "@google/genai": "1.30.0", "@modelcontextprotocol/sdk": "^1.25.2", "ai": "^6.0.94", "hono": "^4.6.0",