Commit ad6c78d
ci(bench): gate release benchmark on engine parity thresholds
The Benchmark workflow already runs after every release (on workflow_run completion of Publish). Add a parity gate to the build-benchmark job so drift between the native and wasm engines fails the workflow — catching regressions the benchmark data would otherwise silently record.

The gate runs after the benchmark doc PR is created, so the raw numbers still land in generated/benchmarks/BUILD-BENCHMARKS.md even when parity regresses; only the workflow status goes red to alert maintainers.

Thresholds reference the currently-open parity bugs on v3.9.5:

- File-set gap |wasm - native| ≤ 2 (#1011)
- DB size ratio native/wasm ≤ 1.02 (#1010)
- Full-build edges-phase ratio ≤ 1.30 (#1013)
- Full-build roles-phase ratio ≤ 1.30 (#1013)
- 1-file incremental ratio ≤ 1.50 (#1012)

The gate writes a markdown table to $GITHUB_STEP_SUMMARY showing pass/fail per threshold with a direct link to the tracking issue, so reviewers see the regression at a glance without digging through logs. No behavior change on the passing path — when both engines are within thresholds the step exits 0 silently.

Impact: 2 functions changed, 2 affected
1 parent 176e3f0 commit ad6c78d

2 files changed

Lines changed: 128 additions & 0 deletions


.github/workflows/benchmark.yml

Lines changed: 7 additions & 0 deletions
```diff
@@ -198,6 +198,13 @@ jobs:
             --body "Automated build benchmark update for **${VERSION}** from workflow run [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})."
           fi
 
+      # Engine-parity gate: runs AFTER the doc PR is created so the PR still
+      # records raw benchmark data even when parity regresses. The job status
+      # going red alerts maintainers; the linked issues describe each threshold.
+      - name: Engine parity gate
+        if: steps.existing.outputs.skip != 'true'
+        run: node scripts/benchmark-parity-gate.mjs benchmark-result.json
+
   embedding-benchmark:
     runs-on: ubuntu-latest
     # 7 models x 30 min each = 210 min worst-case; symbols are sampled to 1500 so
```

scripts/benchmark-parity-gate.mjs

Lines changed: 121 additions & 0 deletions
```js
#!/usr/bin/env node
/**
 * Engine parity gate — runs after the release build benchmark.
 *
 * Reads the merged benchmark-result.json (contains `wasm` and `native` blocks)
 * and fails the workflow if the gap between engines breaches a documented
 * threshold. A failure here doesn't block the release (benchmark runs *after*
 * Publish completes); it surfaces regressions to maintainers via the workflow's
 * red status and writes a summary to $GITHUB_STEP_SUMMARY.
 *
 * Thresholds reference the parity bugs open against v3.9.5:
 *   - #1010 DB size / excess ast_nodes
 *   - #1011 Native orchestrator drops files
 *   - #1012 Native 1-file incremental runs globally
 *   - #1013 Native full-build edges/roles phases
 *
 * The per-metric thresholds apply only when BOTH engines produced results.
 * If either engine is missing from the result file, the gate fails
 * immediately: parity cannot be asserted, and the step summary names the
 * missing engine so maintainers see which run to investigate.
 */
import fs from 'node:fs';

const resultFile = process.argv[2];
if (!resultFile) {
  console.error('Usage: benchmark-parity-gate.mjs <benchmark-result.json>');
  process.exit(2);
}

const result = JSON.parse(fs.readFileSync(resultFile, 'utf8'));
const { wasm, native, version } = result;

const summaryFile = process.env.GITHUB_STEP_SUMMARY;
const writeSummary = (text) => {
  if (summaryFile) fs.appendFileSync(summaryFile, text);
};

function line(s = '') {
  console.log(s);
  writeSummary(`${s}\n`);
}

line(`## Engine parity gate — v${version}`);
line('');

if (!wasm || !native) {
  const missing = [!wasm && 'wasm', !native && 'native'].filter(Boolean).join(', ');
  line(`**FAIL:** missing engine result for: ${missing}. Benchmark cannot assert parity.`);
  process.exit(1);
}

// ── Thresholds ─────────────────────────────────────────────────────────
// Each entry:
//   name      — human-readable label
//   actual    — computed metric
//   limit     — ceiling; actual must be ≤ limit
//   formatter — how to render the value
//   tracks    — related issue link shown on failure
const checks = [
  {
    name: 'File-set gap (|wasm − native|)',
    actual: Math.abs(wasm.files - native.files),
    limit: 2,
    formatter: (v) => String(v),
    tracks: '#1011',
  },
  {
    name: 'DB size ratio (native / wasm)',
    actual: native.dbSizeBytes / wasm.dbSizeBytes,
    limit: 1.02,
    formatter: (v) => v.toFixed(3),
    tracks: '#1010',
  },
  {
    name: 'Full-build edges-phase ratio',
    actual: (native.phases?.edgesMs ?? 0) / Math.max(wasm.phases?.edgesMs ?? 1, 1),
    limit: 1.3,
    formatter: (v) => v.toFixed(2),
    tracks: '#1013',
  },
  {
    name: 'Full-build roles-phase ratio',
    actual: (native.phases?.rolesMs ?? 0) / Math.max(wasm.phases?.rolesMs ?? 1, 1),
    limit: 1.3,
    formatter: (v) => v.toFixed(2),
    tracks: '#1013',
  },
  {
    name: '1-file incremental ratio',
    actual:
      (native.oneFileRebuildMs ?? 0) /
      Math.max(wasm.oneFileRebuildMs ?? 1, 1),
    limit: 1.5,
    formatter: (v) => v.toFixed(2),
    tracks: '#1012',
  },
];

line('| Check | Actual | Limit | Status | Tracks |');
line('|---|---:|---:|---|---|');

let failed = 0;
for (const c of checks) {
  const ok = c.actual <= c.limit;
  if (!ok) failed++;
  const status = ok ? ':white_check_mark: pass' : ':x: **fail**';
  line(
    `| ${c.name} | ${c.formatter(c.actual)} | ${c.formatter(c.limit)} | ${status} | ${c.tracks} |`,
  );
}

line('');
if (failed > 0) {
  line(
    `**${failed} parity check(s) failed.** See linked issues for root-cause tracking; the benchmark doc PR (if opened) captures the raw numbers.`,
  );
  process.exit(1);
}

line('All parity checks passed.');
```