Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"agent": {
"type": "clado-action",
"provider": "clado-action",
"model": "qwen3-vl-30b-a3b-instruct",
"apiKey": "",
"baseUrl": "https://clado-ai--clado-browseros-action-001760-actionmodel-generate.modal.run",
"temperature": 0.7
},
"dataset": "../data/local/webbench-2of4-30-visible.jsonl",
"output_dir": "../results",
"num_workers": 5,
"restart_server_per_task": true,
"browseros": {
"server_url": "http://127.0.0.1:9110",
"base_cdp_port": 9010,
"base_server_port": 9110,
"base_extension_port": 9310,
"load_extensions": true,
"headless": true
},
"captcha": {
"api_key_env": "NOPECHA_API_KEY"
},
"graders": ["performance_grader"],
"grader_api_key_env": "OPENROUTER_API_KEY",
"grader_base_url": "https://openrouter.ai/api/v1",
"grader_model": "openai/gpt-4.1",
"timeout_ms": 1800000
}

Large diffs are not rendered by default.

199 changes: 199 additions & 0 deletions packages/browseros-agent/apps/eval/src/agents/clado-action/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/**
* Direct Clado Action evaluator.
*
* Runs the visual action model directly against the full task instruction,
* without an LLM orchestrator in front of it.
*/

import { Browser } from '@browseros/server/browser'
import { CdpBackend } from '@browseros/server/browser/backends/cdp'
import { CaptchaWaiter } from '../../capture/captcha-waiter'
import { DEFAULT_TIMEOUT_MS } from '../../constants'
import type { CladoActionConfig, EvalConfig, TaskMetadata } from '../../types'
import type { UIMessageStreamEvent } from '../../types/message'
import { resolveEnvValue } from '../../utils/resolve-env'
import { withEvalTimeout } from '../../utils/with-eval-timeout'
import { CladoActionExecutor } from '../orchestrator-executor/clado-action-executor'
import type { ExecutorCallbacks } from '../orchestrator-executor/executor'
import type { AgentContext, AgentEvaluator, AgentResult } from '../types'

/**
 * Derives the CDP port for the current worker from its BrowserOS server URL.
 *
 * The offset between the port found in `server_url` and `base_server_port`
 * is applied to `base_cdp_port`. When no port can be found in the URL,
 * `base_cdp_port` is returned unchanged.
 *
 * @param config - Eval configuration; only the `browseros` section is read.
 * @returns The CDP port to connect to.
 */
function extractCdpPort(config: EvalConfig): number {
  const {
    server_url: serverUrl,
    base_cdp_port: baseCdpPort,
    base_server_port: baseServerPort,
  } = config.browseros

  // Accept a port that is followed by a path, query, fragment or the end of
  // the string, so that "http://host:9112/" is not mistaken for "no port"
  // and silently mapped onto the base (worker 0) CDP port.
  const match = serverUrl.match(/:(\d+)(?:[/?#]|$)/)
  const portText = match?.[1]
  if (portText === undefined) return baseCdpPort

  const serverPort = Number.parseInt(portText, 10)
  const workerOffset = serverPort - baseServerPort
  return baseCdpPort + workerOffset
}

/**
 * Evaluator that hands the full task query straight to a
 * `CladoActionExecutor`, records the resulting tool/text events through the
 * capture context, and saves the task metadata.
 */
export class CladoActionEvaluator implements AgentEvaluator {
  constructor(private ctx: AgentContext) {}

  /**
   * Runs the task once and returns its metadata, logged messages and final
   * answer.
   *
   * @returns The saved task metadata, the captured messages and the
   *   executor's final observation (or `null` if none was produced).
   * @throws Error when `config.agent.type` is not `'clado-action'`.
   */
  async execute(): Promise<AgentResult> {
    const { config, task, capture } = this.ctx
    const startTime = Date.now()
    const timeoutMs = config.timeout_ms ?? DEFAULT_TIMEOUT_MS

    await capture.messageLogger.logUser(task.query)

    if (config.agent.type !== 'clado-action') {
      throw new Error('CladoActionEvaluator requires clado-action config')
    }

    const agentConfig = config.agent as CladoActionConfig
    const cdpPort = extractCdpPort(config)
    const cdp = new CdpBackend({ port: cdpPort })
    await cdp.connect()

    // From here on the CDP connection is open, so every remaining step runs
    // under a finally that disconnects it, including setup that may throw
    // before the executor exists.
    try {
      const browser = new Browser(cdp)
      capture.screenshot.setBrowser(browser)

      const captchaWaiter = config.captcha
        ? new CaptchaWaiter({
            waitTimeoutMs: config.captcha.wait_timeout_ms,
            pollIntervalMs: config.captcha.poll_interval_ms,
          })
        : null

      const callbacks: ExecutorCallbacks = {
        // Track which page the tool call targets so screenshots follow it.
        onToolCallStart: ({ input }) => {
          const args = input as Record<string, unknown> | undefined
          if (args && typeof args.page === 'number') {
            capture.setActivePageId(args.page)
          }
        },
        // After each tool call: optionally wait on a captcha, then screenshot.
        onToolCallFinish: async () => {
          try {
            if (captchaWaiter) {
              await captchaWaiter.waitIfCaptchaPresent(
                browser,
                capture.getActivePageId(),
              )
            }
            const screenshotNum = await capture.screenshot.capture(
              capture.getActivePageId(),
            )
            capture.emitEvent(task.query_id, {
              type: 'screenshot-captured',
              screenshot: screenshotNum,
            })
          } catch {
            // Captcha-wait and screenshot failures are non-fatal.
          }
        },
        // Mirror each step's tool calls, tool results and text into the
        // message log and the event stream.
        onStepFinish: async ({ toolCalls, toolResults, text }) => {
          if (toolCalls) {
            for (const tc of toolCalls) {
              const inputEvent: UIMessageStreamEvent = {
                type: 'tool-input-available',
                toolCallId: tc.toolCallId,
                toolName: tc.toolName,
                input: tc.input,
              }
              await capture.messageLogger.logStreamEvent(inputEvent)
              capture.emitEvent(task.query_id, inputEvent)
            }
          }
          if (toolResults) {
            for (const tr of toolResults) {
              const outputEvent: UIMessageStreamEvent = {
                type: 'tool-output-available',
                toolCallId: tr.toolCallId,
                output: tr.output,
              }
              await capture.messageLogger.logStreamEvent(outputEvent)
              capture.emitEvent(task.query_id, outputEvent)
            }
          }
          if (text) {
            const textId = crypto.randomUUID()
            const startEvent: UIMessageStreamEvent = {
              type: 'text-start',
              id: textId,
            }
            const deltaEvent: UIMessageStreamEvent = {
              type: 'text-delta',
              id: textId,
              delta: text,
            }
            const endEvent: UIMessageStreamEvent = {
              type: 'text-end',
              id: textId,
            }
            await capture.messageLogger.logStreamEvent(startEvent)
            await capture.messageLogger.logStreamEvent(deltaEvent)
            await capture.messageLogger.logStreamEvent(endEvent)
            // Only the delta is passed to emitEvent; start/end are logged only.
            capture.emitEvent(task.query_id, deltaEvent)
          }
        },
      }

      const executor = new CladoActionExecutor(
        {
          provider: agentConfig.provider,
          model: agentConfig.model,
          apiKey: resolveEnvValue(agentConfig.apiKey) ?? '',
          baseUrl: agentConfig.baseUrl,
          temperature: agentConfig.temperature,
        },
        config.browseros.server_url,
        undefined,
        undefined,
        this.ctx.initialPageId,
      )

      // The executor is closed before the CDP connection is dropped, matching
      // the original cleanup order.
      try {
        executor.setCallbacks(callbacks)

        let finalAnswer: string | null = null
        let totalSteps = 0

        const { terminationReason, result } = await withEvalTimeout(
          timeoutMs,
          capture,
          async (signal) => {
            const execution = await executor.execute(task.query, signal)
            finalAnswer = execution.observation
            totalSteps = execution.actionsPerformed

            // Any status other than 'done' or 'timeout' is recorded as an
            // agent execution error.
            if (execution.status !== 'done' && execution.status !== 'timeout') {
              capture.addError('agent_execution', execution.observation)
            }

            return execution
          },
        )

        const endTime = Date.now()
        const metadata: TaskMetadata = {
          query_id: task.query_id,
          dataset: task.dataset,
          query: task.query,
          started_at: new Date(startTime).toISOString(),
          completed_at: new Date(endTime).toISOString(),
          total_duration_ms: endTime - startTime,
          total_steps: result?.actionsPerformed ?? totalSteps,
          termination_reason: terminationReason,
          final_answer: finalAnswer,
          errors: capture.getErrors(),
          warnings: capture.getWarnings(),
          device_pixel_ratio: capture.screenshot.getDevicePixelRatio(),
          agent_config: {
            type: 'clado-action',
            model: agentConfig.model,
            temperature: agentConfig.temperature,
          },
          grader_results: {},
        }

        await capture.trajectorySaver.saveMetadata(metadata)

        return {
          metadata,
          messages: capture.getMessages(),
          finalAnswer,
        }
      } finally {
        await executor.close().catch(() => {})
      }
    } finally {
      await cdp.disconnect().catch(() => {})
    }
  }
}
2 changes: 2 additions & 0 deletions packages/browseros-agent/apps/eval/src/agents/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { CladoActionEvaluator } from './clado-action'
import { GeminiComputerUseEvaluator } from './gemini-computer-use'
import { OrchestratorExecutorEvaluator } from './orchestrator-executor'
import { registerAgent } from './registry'
Expand All @@ -10,6 +11,7 @@ registerAgent(
'orchestrator-executor',
(ctx) => new OrchestratorExecutorEvaluator(ctx),
)
registerAgent('clado-action', (ctx) => new CladoActionEvaluator(ctx))
registerAgent(
'gemini-computer-use',
(ctx) => new GeminiComputerUseEvaluator(ctx),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ const PAGE_SCOPED_TOOLS = new Set<string>([

interface CladoActionResponse {
action?: string
final_answer?: string
thinking?: string
x?: number
y?: number
text?: string
Expand All @@ -54,6 +56,8 @@ interface Viewport {

interface CladoAction {
action: string
final_answer?: string
thinking?: string
x?: number
y?: number
text?: string
Expand Down Expand Up @@ -177,7 +181,11 @@ export class CladoActionExecutor {
signal,
)
predictionCalls++
const thinking = this.extractThinking(prediction.raw_response)
const thinking = this.normalizeHistoryThinking(
typeof prediction.thinking === "string"
? prediction.thinking
: this.extractThinking(prediction.raw_response),
)
if (thinking) {
const previous = thinkingTrace[thinkingTrace.length - 1]
if (previous !== thinking) {
Expand Down Expand Up @@ -371,6 +379,10 @@ export class CladoActionExecutor {
instruction,
image_base64: imageBase64,
history: this.formatHistory(actionHistory),
history_actions: actionHistory,
...(typeof this.config.temperature === 'number'
? { temperature: this.config.temperature }
: {}),
}),
signal: requestController.signal,
})
Expand Down Expand Up @@ -401,23 +413,8 @@ export class CladoActionExecutor {
action: actionFromField ?? primaryFromRaw?.action,
}

const normalized: CladoAction[] = []
const primary = this.normalizeActionPayload(mergedPrimary)
if (primary) normalized.push(primary)

for (const candidate of rawActions.slice(1)) {
const parsed = this.normalizeActionPayload(candidate)
if (!parsed) continue
const prev = normalized[normalized.length - 1]
if (
!prev ||
this.getActionSignature(prev) !== this.getActionSignature(parsed)
) {
normalized.push(parsed)
}
}

return normalized
return primary ? [primary] : []
}

private normalizeActionPayload(
Expand All @@ -428,6 +425,9 @@ export class CladoActionExecutor {
}
return {
action: payload.action,
final_answer:
typeof payload.final_answer === "string" ? payload.final_answer : undefined,
thinking: typeof payload.thinking === "string" ? payload.thinking : undefined,
x: typeof payload.x === 'number' ? payload.x : undefined,
y: typeof payload.y === 'number' ? payload.y : undefined,
text: typeof payload.text === 'string' ? payload.text : undefined,
Expand Down Expand Up @@ -791,6 +791,8 @@ export class CladoActionExecutor {

return {
action: prediction.action,
final_answer: prediction.final_answer,
thinking: prediction.thinking,
x: prediction.x,
y: prediction.y,
text: prediction.text,
Expand Down Expand Up @@ -823,6 +825,20 @@ export class CladoActionExecutor {
return merged
}

/**
 * Cleans a "thinking" string before it is recorded: `<thinking>` and
 * `</thinking>` tags (any letter case) are replaced by spaces, whitespace
 * runs are collapsed to a single space, and the result is trimmed.
 *
 * @param thinking - Raw thinking text, possibly undefined or empty.
 * @returns The cleaned text, or `undefined` when the input is empty or
 *   nothing remains after cleaning.
 */
private normalizeHistoryThinking(
  thinking: string | undefined,
): string | undefined {
  const cleaned = thinking
    ? thinking
        .replace(/<\/?thinking>/gi, ' ')
        .replace(/\s+/g, ' ')
        .trim()
    : ''

  return cleaned || undefined
}

private getActionSignature(action: CladoAction): string {
switch (action.action) {
case 'click':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ export class Executor {
private cladoExecutor: CladoActionExecutor | null = null
private stepsUsed = 0
private currentUrl = ''
private configTemplate: ResolvedAgentConfig
private configTemplate: ResolvedAgentConfig & { temperature?: number }
private isCladoAction: boolean
private browser: Browser | null
private serverUrl: string
Expand All @@ -74,7 +74,7 @@ export class Executor {
private callbacks: ExecutorCallbacks

constructor(
configTemplate: ResolvedAgentConfig,
configTemplate: ResolvedAgentConfig & { temperature?: number },
browser: Browser | null,
serverUrl: string,
options?: {
Expand Down Expand Up @@ -107,6 +107,7 @@ export class Executor {
model: this.configTemplate.model,
apiKey: this.configTemplate.apiKey ?? '',
baseUrl: this.configTemplate.baseUrl,
temperature: this.configTemplate.temperature,
},
this.serverUrl,
this.windowId,
Expand Down
Loading
Loading