diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index dfa9de8d..e08ca022 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -295,9 +295,162 @@ jobs:
       - name: Check compilation
         run: cargo check --workspace
 
+  # ── Pre-publish benchmark gate ──
+  #
+  # Mirrors the gate in publish.yml so every PR catches regressions before
+  # merge instead of at release time. Measures the PR-built native artifact
+  # against the local source as version "dev", appends to the benchmark
+  # history files (in-job only — never committed from CI), and runs the
+  # regression guard against the most recent release baseline.
+  #
+  # Long-running but parallel with the rest of CI, so it does not extend
+  # the critical path for fast-failing checks (lint/typecheck/test).
+  pre-publish-benchmark:
+    name: Pre-publish benchmark gate
+    # Only run on PRs — push-to-main re-runs the same benchmarks the merged
+    # PR already gated on, doubling CI minutes per landed change with no new
+    # signal. Mirrors the `if: github.event_name != 'push'` skip in
+    # publish.yml's equivalent gate.
+    if: github.event_name == 'pull_request'
+    needs: native-host-build
+    runs-on: ubuntu-latest
+    env:
+      # Surface why detectNoChanges returns false on each fast-skip pre-flight
+      # so we can pinpoint the cause of the ~2s incremental-rebuild regression
+      # observed in CI but not locally (#1066). Remove once root cause is fixed.
+      CODEGRAPH_FAST_SKIP_DIAG: "1"
+
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - uses: actions/setup-node@v6
+        with:
+          node-version: "22"
+          cache: "npm"
+
+      - name: Setup Python (for resolution benchmark + tracer validation)
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Setup Go (for resolution benchmark + tracer validation)
+        uses: actions/setup-go@v6
+        with:
+          go-version: "stable"
+          cache: false
+
+      - name: Download PR-built native addon
+        uses: actions/download-artifact@v8
+        with:
+          name: native-host-ubuntu-latest
+          path: crates/codegraph-core/
+
+      - name: Install dependencies
+        timeout-minutes: 20
+        shell: bash
+        run: |
+          for attempt in 1 2 3; do
+            npm install && break
+            if [ "$attempt" -lt 3 ]; then
+              echo "::warning::npm install attempt $attempt failed, retrying in 15s..."
+              sleep 15
+            else
+              echo "::error::npm install failed after 3 attempts"
+              exit 1
+            fi
+          done
+
+      - name: Install native addon over published binary
+        run: node scripts/ci-install-native.mjs
+
+      # Build dist/ so benchmarks load the same compiled JS that ships to npm.
+      # Historical baselines (v3.9.6 and earlier) were measured against dist/
+      # via the post-publish --npm path; running against src/ with --strip-types
+      # changes the load path and introduces version-to-version noise unrelated
+      # to the code under test (#1055).
+      - name: Build TypeScript
+        run: npm run build
+
+      - name: Run build benchmark
+        timeout-minutes: 20
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/benchmark.ts --version dev --dist > benchmark-result.json
+
+      - name: Run resolution benchmark
+        timeout-minutes: 20
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/resolution-benchmark.ts --version dev --dist > resolution-result.json
+
+      - name: Gate on resolution thresholds
+        timeout-minutes: 30
+        run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose
+
+      - name: Run tracer validation (same-file edge recall)
+        timeout-minutes: 10
+        run: npx vitest run tests/benchmarks/resolution/tracer/tracer-validation.test.ts --reporter=verbose
+
+      - name: Merge resolution into build result
+        run: |
+          node -e "
+            const fs = require('fs');
+            const build = JSON.parse(fs.readFileSync('benchmark-result.json', 'utf8'));
+            const resolution = JSON.parse(fs.readFileSync('resolution-result.json', 'utf8'));
+            build.resolution = resolution;
+            fs.writeFileSync('benchmark-result.json', JSON.stringify(build, null, 2));
+          "
+
+      - name: Run query benchmark
+        timeout-minutes: 20
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/query-benchmark.ts --version dev --dist > query-benchmark-result.json
+
+      - name: Run incremental benchmark
+        timeout-minutes: 20
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/incremental-benchmark.ts --version dev --dist > incremental-benchmark-result.json
+
+      - name: Update build report
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG scripts/update-benchmark-report.ts benchmark-result.json
+
+      - name: Update query report
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG scripts/update-query-report.ts query-benchmark-result.json
+
+      - name: Update incremental report
+        run: |
+          STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
+          node $STRIP_FLAG scripts/update-incremental-report.ts incremental-benchmark-result.json
+
+      - name: Regression guard
+        env:
+          RUN_REGRESSION_GUARD: "1"
+        run: npm run test:regression-guard
+
+      # Always upload raw JSON so a failed regression guard is debuggable
+      # without re-running the full benchmark suite locally.
+      - name: Upload benchmark JSON results
+        if: always()
+        uses: actions/upload-artifact@v7
+        with:
+          name: benchmark-results-json
+          path: |
+            benchmark-result.json
+            query-benchmark-result.json
+            incremental-benchmark-result.json
+          if-no-files-found: warn
+
   ci-pipeline:
     if: always()
-    needs: [lint, native-host-build, test, typecheck, audit, verify-imports, rust-check, parity]
+    needs: [lint, native-host-build, test, typecheck, audit, verify-imports, rust-check, parity, pre-publish-benchmark]
     runs-on: ubuntu-latest
     name: CI Testing Pipeline
     steps:
diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts
index 37a6fbb4..337f0a5a 100644
--- a/tests/benchmarks/regression-guard.test.ts
+++ b/tests/benchmarks/regression-guard.test.ts
@@ -100,9 +100,18 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
  * Format: "version:metric-label" (must match the label passed to checkRegression).
  * Resolution keys use: "version:resolution precision" or "version:resolution recall".
  *
- * Entries fire only when `latest.version` matches the prefix, so once a
- * version is no longer the latest in committed history its entries become
- * dead weight and should be removed (last pruned: 3.9.0/3.9.1/3.9.2).
+ * The `version` is the release where the regression was first observed.
+ * When the per-PR gate runs `dev` against that release as baseline, the
+ * exemption applies via the baseline-version fallback in assertNoRegressions
+ * (and the resolution loop) — so a single `3.9.6:Foo` entry covers both
+ * `3.9.6 vs 3.9.5` and every subsequent `dev vs 3.9.6` comparison until
+ * the next release clears the regression and the entry is pruned.
+ *
+ * Entries fire only when `latest.version` matches the prefix (or, for `dev`
+ * latest, when `previous.version` matches via the baseline fallback). Once
+ * a version is no longer the latest in committed history and no longer the
+ * baseline used for `dev` comparisons, its entries become dead weight and
+ * should be removed (last pruned: 3.9.0/3.9.1/3.9.2).
  *
  * - 3.9.6:Build ms/file / 3.9.6:No-op rebuild — WASM full build regressed
  *   (#1036) when PR #1016 expanded AST_TYPE_MAPS from 3 to 23 languages,
@@ -146,6 +155,17 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
  *   sub-100ms metric is still razor-thin. Exempt this release; remove
  *   once 3.11.0+ data confirms CI numbers stabilize under the new
  *   methodology.
+ *
+ * - 3.10.0:fnDeps depth 1 — CI variance on a sub-30ms metric (24.7ms native
+ *   baseline). The fn_deps Rust implementation, fnDepsData JS wrapper, and
+ *   DB schema/indexes are all byte-for-byte unchanged since v3.9.6 (already
+ *   documented in NOISY_METRICS above). Measured runs land at +40–60%
+ *   typically (within the 50% NOISY_METRIC_THRESHOLD), but the per-PR gate
+ *   has seen outliers up to +82% (run 25708925467: 24.7 → 44.9ms) on
+ *   shared runners — the ~20ms absolute delta is the runner noise floor.
+ *   Exempt this release; remove once 3.11.0+ data confirms stabilization
+ *   under the warmup + 5-sample methodology already applied to incremental
+ *   benchmarks.
  */
 const KNOWN_REGRESSIONS = new Set([
   '3.9.6:Build ms/file',
@@ -155,6 +175,7 @@ const KNOWN_REGRESSIONS = new Set([
   '3.9.6:resolution haskell recall',
   '3.10.0:No-op rebuild',
   '3.10.0:1-file rebuild',
+  '3.10.0:fnDeps depth 1',
 ]);
 
 /**
@@ -272,6 +293,13 @@ function findLatestPair(
     if (!hasEngine(history[latestIdx])) continue;
     const latestVersion = history[latestIdx].version;
 
+    // 'dev' represents the current PR build (rolling entry — see
+    // scripts/update-benchmark-report.ts). It has no parseable semver,
+    // so effectiveGap('dev', anyRelease) returns Infinity — without this
+    // bypass, the gap check below would skip dev entirely and the loop
+    // would silently fall through to compare two real releases instead
+    // of dev vs the latest release, defeating the per-PR gate.
+    const isDevLatest = latestVersion === 'dev';
 
     // Find previous non-dev entry with data for this engine, skipping
     // versions with known unreliable benchmark data and versions that
@@ -284,7 +312,10 @@ function findLatestPair(
       if (entry.version === 'dev') continue;
       if (SKIP_VERSIONS.has(entry.version)) continue;
       if (!hasEngine(entry)) continue;
-      if (effectiveGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
+      // Skip the gap check when comparing dev → release: dev is always
+      // the current build, so the most recent comparable release is the
+      // correct baseline regardless of feature-expansion distance.
+      if (!isDevLatest && effectiveGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
       return { latest: history[latestIdx], previous: entry };
     }
     // No valid baseline for this latest — try the next candidate
@@ -329,11 +360,27 @@ function thresholdFor(label: string): number {
   return NOISY_METRICS.has(label) ? NOISY_METRIC_THRESHOLD : REGRESSION_THRESHOLD;
 }
 
-function assertNoRegressions(checks: (RegressionCheck | null)[], version?: string) {
+function assertNoRegressions(
+  checks: (RegressionCheck | null)[],
+  version?: string,
+  baselineVersion?: string,
+) {
   const real = checks.filter(Boolean) as RegressionCheck[];
   const regressions = real.filter((c) => {
     if (c.pctChange <= thresholdFor(c.label)) return false;
     if (version && KNOWN_REGRESSIONS.has(`${version}:${c.label}`)) return false;
+    // When `latest` is the rolling 'dev' build, KNOWN_REGRESSIONS entries
+    // are anchored to the release where the regression was first observed
+    // (e.g. '3.9.6:No-op rebuild'), not to 'dev'. Fall back to the baseline
+    // version so a regression introduced before release N stays exempt for
+    // every PR comparing dev → N until release N+1 clears it.
+    if (
+      version === 'dev' &&
+      baselineVersion &&
+      KNOWN_REGRESSIONS.has(`${baselineVersion}:${c.label}`)
+    ) {
+      return false;
+    }
     return true;
   });
 
@@ -493,6 +540,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
           checkRegression(`1-file rebuild`, cur.oneFileRebuildMs, prev.oneFileRebuildMs),
         ],
         latest.version,
+        previous.version,
       );
     });
   }
@@ -530,6 +578,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
           ),
         ],
         latest.version,
+        previous.version,
       );
     });
   }
@@ -559,22 +608,32 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
           checkRegression(`1-file rebuild`, cur.oneFileRebuildMs, prev.oneFileRebuildMs),
         ],
         latest.version,
+        previous.version,
       );
     });
   }
 
-  // Resolve benchmarks (not engine-specific)
-  const resolveEntries = incrementalHistory.filter(
-    (e) => e.resolve != null && e.version !== 'dev' && !SKIP_VERSIONS.has(e.version),
+  // Resolve benchmarks (not engine-specific). Keep `dev` in the candidate
+  // pool so the per-PR gate (which produces a `dev` resolve entry) covers
+  // import resolution; previously the filter dropped dev outright, leaving
+  // nativeBatchMs / jsFallbackMs blind to PR-introduced regressions.
+  const resolvePair = findLatestPair(
+    incrementalHistory.filter((e) => e.resolve != null),
+    (e) => e.resolve != null,
   );
-  if (resolveEntries.length >= 2) {
-    test(`import resolution — ${resolveEntries[0].version} vs ${resolveEntries[1].version}`, () => {
-      const cur = resolveEntries[0].resolve!;
-      const prev = resolveEntries[1].resolve!;
-      assertNoRegressions([
-        checkRegression(`Native batch resolve`, cur.nativeBatchMs, prev.nativeBatchMs),
-        checkRegression(`JS fallback resolve`, cur.jsFallbackMs, prev.jsFallbackMs),
-      ]);
+  if (resolvePair) {
+    const { latest: latestRes, previous: previousRes } = resolvePair;
+    test(`import resolution — ${latestRes.version} vs ${previousRes.version}`, () => {
+      const cur = latestRes.resolve!;
+      const prev = previousRes.resolve!;
+      assertNoRegressions(
+        [
+          checkRegression(`Native batch resolve`, cur.nativeBatchMs, prev.nativeBatchMs),
+          checkRegression(`JS fallback resolve`, cur.jsFallbackMs, prev.jsFallbackMs),
+        ],
+        latestRes.version,
+        previousRes.version,
+      );
     });
   }
@@ -589,8 +648,8 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
 
     test('has resolve data to compare', () => {
       expect(
-        resolveEntries.length >= 2,
-        'No import-resolution benchmark data with ≥2 non-dev entries to compare',
+        resolvePair != null,
+        'No import-resolution benchmark data with ≥2 comparable entries',
       ).toBe(true);
     });
   });
@@ -644,10 +703,18 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
         const prv = prevRes[lang];
         if (!cur || !prv) continue;
 
+        // When latest is 'dev' (per-PR build), KNOWN_REGRESSIONS keys
+        // are anchored to the baseline release where the regression was
+        // first observed, not to 'dev' — fall back to previousRes.version.
+        const isDev = latestRes.version === 'dev';
+
         const precDrop = prv.precision - cur.precision;
         if (precDrop > PRECISION_DROP_PP) {
           const key = `${latestRes.version}:resolution ${lang} precision`;
-          if (!KNOWN_REGRESSIONS.has(key)) {
+          const fallbackKey = `${previousRes.version}:resolution ${lang} precision`;
+          const isKnown =
+            KNOWN_REGRESSIONS.has(key) || (isDev && KNOWN_REGRESSIONS.has(fallbackKey));
+          if (!isKnown) {
             regressions.push(
               ` ${lang} precision: ${(prv.precision * 100).toFixed(1)}% → ${(cur.precision * 100).toFixed(1)}% (−${(precDrop * 100).toFixed(1)}pp, threshold ${(PRECISION_DROP_PP * 100).toFixed(0)}pp)`,
             );
@@ -657,7 +724,10 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
         const recDrop = prv.recall - cur.recall;
         if (recDrop > RECALL_DROP_PP) {
           const key = `${latestRes.version}:resolution ${lang} recall`;
-          if (!KNOWN_REGRESSIONS.has(key)) {
+          const fallbackKey = `${previousRes.version}:resolution ${lang} recall`;
+          const isKnown =
+            KNOWN_REGRESSIONS.has(key) || (isDev && KNOWN_REGRESSIONS.has(fallbackKey));
+          if (!isKnown) {
             regressions.push(
               ` ${lang} recall: ${(prv.recall * 100).toFixed(1)}% → ${(cur.recall * 100).toFixed(1)}% (−${(recDrop * 100).toFixed(1)}pp, threshold ${(RECALL_DROP_PP * 100).toFixed(0)}pp)`,
             );