chore: clean legacy browser tool remnants #141
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Weekly Eval: runs the BrowserOS agent eval suite on a schedule, on pushes
# that touch agent/tool code, and on demand.
name: Weekly Eval

on:
  # Scheduled run.
  schedule:
    # Every Saturday at 06:00 UTC
    - cron: '0 6 * * 6'

  # Re-run whenever server agent or tool sources change on main.
  push:
    branches:
      - main
    paths:
      - 'packages/browseros-agent/apps/server/src/agent/**'
      - 'packages/browseros-agent/apps/server/src/tools/**'

  # Manual run; the config input is optional and falls back to the weekly
  # legacy config.
  workflow_dispatch:
    inputs:
      config:
        description: 'Eval config file (relative to apps/eval/)'
        required: false
        default: 'configs/legacy/browseros-agent-weekly.json'

# The workflow token only needs to read repository contents.
permissions:
  contents: read
jobs:
  # Single job: install BrowserOS and the eval toolchain, run the eval suite
  # under a virtual display, generate reports, and publish the run to R2.
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 360
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install BrowserOS
        run: |
          # Rolling stable channel — see https://cdn.browseros.com/download/BrowserOS.deb
          wget -q -O BrowserOS.deb https://cdn.browseros.com/download/BrowserOS.deb
          # dpkg does not resolve dependencies. If it leaves the package
          # unconfigured, let apt fetch whatever is missing and finish the install.
          sudo dpkg -i BrowserOS.deb || { sudo apt-get update && sudo apt-get install -y -f; }
          browseros --version || echo "BrowserOS installed at $(which browseros)"

      - name: Install Bun
        uses: oven-sh/setup-bun@v2
        with:
          # NOTE(review): 'latest' floats with upstream releases; consider
          # pinning if eval reproducibility matters.
          bun-version: latest

      - name: Install dependencies
        working-directory: packages/browseros-agent
        run: bun install --ignore-scripts

      # Installs the Claude Code CLI only when the selected eval config
      # declares agent.type === 'claude-code'; otherwise the step is a no-op.
      - name: Install Claude Code CLI
        working-directory: packages/browseros-agent/apps/eval
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the user-supplied input is never parsed as shell.
          EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
        run: |
          if bun -e "const config = await Bun.file(process.env.EVAL_CONFIG).json(); process.exit(config.agent?.type === 'claude-code' ? 0 : 1)"; then
            npm install -g @anthropic-ai/claude-code@2.1.119
            echo "Claude Code CLI installed at $(command -v claude)"
            claude --version
          else
            echo "Eval config does not use Claude Code; skipping Claude Code CLI install"
          fi

      - name: Install Python eval dependencies
        # agisdk pinned so silent upstream releases can't shift task definitions
        # or grader behavior. Bump intentionally with a documented re-baseline.
        run: pip install agisdk==0.3.5 requests

      - name: Clone WebArena-Infinity
        # NOTE(review): shallow clone of the default branch head, not a pinned
        # revision; confirm this is intended given the agisdk pin above.
        run: git clone --depth 1 https://github.com/web-arena-x/webarena-infinity.git /tmp/webarena-infinity

      - name: Install xvfb
        run: sudo apt-get update && sudo apt-get install -y xvfb

      - name: Install captcha solver extension
        working-directory: packages/browseros-agent/apps/eval
        run: |
          mkdir -p extensions
          # -f: exit non-zero on an HTTP error instead of saving the error page
          # as the zip; -S: still print the error despite -s.
          curl -fsSL -o /tmp/nopecha.zip \
            https://github.com/NopeCHALLC/nopecha-extension/releases/latest/download/chromium_automation.zip
          unzip -qo /tmp/nopecha.zip -d extensions/nopecha

      # Runs the eval suite only. Report generation and the R2 publish happen
      # in the later steps, which read EVAL_RUN_DIR exported here.
      - name: Run eval
        working-directory: packages/browseros-agent/apps/eval
        env:
          FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          NOPECHA_API_KEY: ${{ secrets.NOPECHA_API_KEY }}
          BROWSEROS_BINARY: /usr/bin/browseros
          WEBARENA_INFINITY_DIR: /tmp/webarena-infinity
          # OpenClaw container runtime is macOS-only; opt the Linux runner
          # into the no-op stub so the server can boot and the eval can run.
          BROWSEROS_SKIP_OPENCLAW: '1'
          EVAL_CONFIG: ${{ github.event.inputs.config || 'configs/legacy/browseros-agent-weekly.json' }}
        run: |
          echo "Running eval with config: $EVAL_CONFIG"
          xvfb-run --auto-servernum --server-args="-screen 0 1440x900x24" bun run src/index.ts suite --config "$EVAL_CONFIG"
          # Capture the run directory so report.html can be generated before the R2 publish step.
          # Picks the lexicographically last summary.json path under results/.
          SUMMARY_PATH="$(find results -name summary.json -type f -print | sort | tail -n 1)"
          if [ -z "$SUMMARY_PATH" ]; then
            echo "No eval run summary found"
            exit 1
          fi
          RUN_DIR="$(dirname "$SUMMARY_PATH")"
          # Export for subsequent steps in this job.
          echo "EVAL_RUN_DIR=$RUN_DIR" >> "$GITHUB_ENV"

      - name: Generate run analysis report
        if: success()
        working-directory: packages/browseros-agent/apps/eval
        env:
          CLAUDE_CODE_OAUTH_TOKEN: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
        run: |
          echo "Generating run report for $EVAL_RUN_DIR"
          bun scripts/generate-report.ts --input "$EVAL_RUN_DIR" --output "$EVAL_RUN_DIR/report.html"

      - name: Publish eval run to R2
        if: success()
        working-directory: packages/browseros-agent/apps/eval
        env:
          EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
          EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
          EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
          EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
          EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
        run: bun run src/index.ts publish --run "$EVAL_RUN_DIR" --target r2

      # Best-effort: a failure here is tolerated (continue-on-error) and the
      # step is capped at five minutes.
      - name: Generate trend report
        if: success()
        timeout-minutes: 5
        continue-on-error: true
        working-directory: packages/browseros-agent
        env:
          EVAL_R2_ACCOUNT_ID: ${{ secrets.EVAL_R2_ACCOUNT_ID }}
          EVAL_R2_ACCESS_KEY_ID: ${{ secrets.EVAL_R2_ACCESS_KEY_ID }}
          EVAL_R2_SECRET_ACCESS_KEY: ${{ secrets.EVAL_R2_SECRET_ACCESS_KEY }}
          EVAL_R2_BUCKET: ${{ secrets.EVAL_R2_BUCKET }}
          EVAL_R2_CDN_BASE_URL: ${{ secrets.EVAL_R2_CDN_BASE_URL }}
        run: bun apps/eval/scripts/weekly-report.ts /tmp/eval-report.html

      - name: Upload trend report as artifact
        if: success()
        uses: actions/upload-artifact@v4
        with:
          name: eval-report-${{ github.run_id }}
          path: /tmp/eval-report.html

      # Runs even when earlier steps failed so startup failures can be diagnosed.
      - name: Upload server stderr logs (for post-mortem on startup failures)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: browseros-server-logs-${{ github.run_id }}
          path: /tmp/browseros-server-logs/
          if-no-files-found: ignore