Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit. Hold Shift + click to select a range.
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 154 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -295,9 +295,162 @@ jobs:
- name: Check compilation
run: cargo check --workspace

# ── Pre-publish benchmark gate ──
#
# Mirrors the gate in publish.yml so every PR catches regressions before
# merge instead of at release time. Measures the PR-built native artifact
# against the local source as version "dev", appends to the benchmark
# history files (in-job only — never committed from CI), and runs the
# regression guard against the most recent release baseline.
#
# Long-running but parallel with the rest of CI, so it does not extend
# the critical path for fast-failing checks (lint/typecheck/test).
pre-publish-benchmark:
name: Pre-publish benchmark gate
# Only run on PRs — push-to-main re-runs the same benchmarks the merged
# PR already gated on, doubling CI minutes per landed change with no new
# signal. Mirrors the `if: github.event_name != 'push'` skip in
# publish.yml's equivalent gate.
if: github.event_name == 'pull_request'
needs: native-host-build
runs-on: ubuntu-latest
env:
# Surface why detectNoChanges returns false on each fast-skip pre-flight
# so we can pinpoint the cause of the ~2s incremental-rebuild regression
# observed in CI but not locally (#1066). Remove once root cause is fixed.
CODEGRAPH_FAST_SKIP_DIAG: "1"

steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0

- uses: actions/setup-node@v6
with:
node-version: "22"
cache: "npm"

- name: Setup Python (for resolution benchmark + tracer validation)
uses: actions/setup-python@v6
with:
python-version: "3.12"

- name: Setup Go (for resolution benchmark + tracer validation)
uses: actions/setup-go@v6
with:
go-version: "stable"
cache: false

- name: Download PR-built native addon
uses: actions/download-artifact@v8
with:
name: native-host-ubuntu-latest
path: crates/codegraph-core/

- name: Install dependencies
timeout-minutes: 20
shell: bash
run: |
for attempt in 1 2 3; do
npm install && break
if [ "$attempt" -lt 3 ]; then
echo "::warning::npm install attempt $attempt failed, retrying in 15s..."
sleep 15
else
echo "::error::npm install failed after 3 attempts"
exit 1
fi
done

- name: Install native addon over published binary
run: node scripts/ci-install-native.mjs

# Build dist/ so benchmarks load the same compiled JS that ships to npm.
# Historical baselines (v3.9.6 and earlier) were measured against dist/
# via the post-publish --npm path; running against src/ with --strip-types
# changes the load path and introduces version-to-version noise unrelated
# to the code under test (#1055).
- name: Build TypeScript
run: npm run build

- name: Run build benchmark
timeout-minutes: 20
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/benchmark.ts --version dev --dist > benchmark-result.json

- name: Run resolution benchmark
timeout-minutes: 20
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/resolution-benchmark.ts --version dev --dist > resolution-result.json

- name: Gate on resolution thresholds
timeout-minutes: 30
run: npx vitest run tests/benchmarks/resolution/resolution-benchmark.test.ts --reporter=verbose

- name: Run tracer validation (same-file edge recall)
timeout-minutes: 10
run: npx vitest run tests/benchmarks/resolution/tracer/tracer-validation.test.ts --reporter=verbose

- name: Merge resolution into build result
run: |
node -e "
const fs = require('fs');
const build = JSON.parse(fs.readFileSync('benchmark-result.json', 'utf8'));
const resolution = JSON.parse(fs.readFileSync('resolution-result.json', 'utf8'));
build.resolution = resolution;
fs.writeFileSync('benchmark-result.json', JSON.stringify(build, null, 2));
"

- name: Run query benchmark
timeout-minutes: 20
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/query-benchmark.ts --version dev --dist > query-benchmark-result.json

- name: Run incremental benchmark
timeout-minutes: 20
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG --import ./scripts/ts-resolve-loader.js scripts/incremental-benchmark.ts --version dev --dist > incremental-benchmark-result.json
Comment on lines +376 to +416
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Missing timeouts on benchmark execution steps

The four benchmark run steps (Run build benchmark, Run resolution benchmark, Run query benchmark, Run incremental benchmark) have no timeout-minutes, unlike Gate on resolution thresholds (30 min) and Run tracer validation (10 min). If any benchmark script hangs — for example due to an unresponsive subprocess or an infinite loop in new code — the job will occupy the runner for up to 6 hours before GitHub kills it, blocking the ci-pipeline gate for the entire duration. The equivalent steps in publish.yml share the same gap. Adding per-step timeouts (e.g., 15–20 min each) keeps a stuck job from becoming a multi-hour blocker.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 9aca6db — added timeout-minutes: 20 to all four benchmark execution steps (build / resolution / query / incremental) so a hung script can't hold the runner for the GitHub default 6 hours. Same caveat carries over to publish.yml's equivalent steps; left untouched here per one-PR-one-concern (will follow up if needed).


- name: Update build report
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG scripts/update-benchmark-report.ts benchmark-result.json

- name: Update query report
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG scripts/update-query-report.ts query-benchmark-result.json

- name: Update incremental report
run: |
STRIP_FLAG=$(node -e "const [M]=process.versions.node.split('.').map(Number); console.log(M>=23?'--strip-types':'--experimental-strip-types')")
node $STRIP_FLAG scripts/update-incremental-report.ts incremental-benchmark-result.json

- name: Regression guard
env:
RUN_REGRESSION_GUARD: "1"
run: npm run test:regression-guard

# Always upload raw JSON so a failed regression guard is debuggable
# without re-running the full benchmark suite locally.
- name: Upload benchmark JSON results
if: always()
uses: actions/upload-artifact@v7
with:
name: benchmark-results-json
path: |
benchmark-result.json
query-benchmark-result.json
incremental-benchmark-result.json
if-no-files-found: warn

ci-pipeline:
if: always()
needs: [lint, native-host-build, test, typecheck, audit, verify-imports, rust-check, parity]
needs: [lint, native-host-build, test, typecheck, audit, verify-imports, rust-check, parity, pre-publish-benchmark]
runs-on: ubuntu-latest
name: CI Testing Pipeline
steps:
Expand Down
110 changes: 90 additions & 20 deletions tests/benchmarks/regression-guard.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,18 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
* Format: "version:metric-label" (must match the label passed to checkRegression).
* Resolution keys use: "version:resolution <lang> precision" or "version:resolution <lang> recall".
*
* Entries fire only when `latest.version` matches the prefix, so once a
* version is no longer the latest in committed history its entries become
* dead weight and should be removed (last pruned: 3.9.0/3.9.1/3.9.2).
* The `version` is the release where the regression was first observed.
* When the per-PR gate runs `dev` against that release as baseline, the
* exemption applies via the baseline-version fallback in assertNoRegressions
* (and the resolution loop) — so a single `3.9.6:Foo` entry covers both
* `3.9.6 vs 3.9.5` and every subsequent `dev vs 3.9.6` comparison until
* the next release clears the regression and the entry is pruned.
*
* Entries fire only when `latest.version` matches the prefix (or, for `dev`
* latest, when `previous.version` matches via the baseline fallback). Once
* a version is no longer the latest in committed history and no longer the
* baseline used for `dev` comparisons, its entries become dead weight and
* should be removed (last pruned: 3.9.0/3.9.1/3.9.2).
*
* - 3.9.6:Build ms/file / 3.9.6:No-op rebuild — WASM full build regressed
* (#1036) when PR #1016 expanded AST_TYPE_MAPS from 3 to 23 languages,
Expand Down Expand Up @@ -146,6 +155,17 @@ const SKIP_VERSIONS = new Set(['3.8.0']);
* sub-100ms metric is still razor-thin. Exempt this release; remove
* once 3.11.0+ data confirms CI numbers stabilize under the new
* methodology.
*
* - 3.10.0:fnDeps depth 1 — CI variance on a sub-30ms metric (24.7ms native
* baseline). The fn_deps Rust implementation, fnDepsData JS wrapper, and
* DB schema/indexes are all byte-for-byte unchanged since v3.9.6 (already
* documented in NOISY_METRICS above). Measured runs land at +40–60%
* typically (within the 50% NOISY_METRIC_THRESHOLD), but the per-PR gate
* has seen outliers up to +82% (run 25708925467: 24.7 → 44.9ms) on
* shared runners — the ~20ms absolute delta is the runner noise floor.
* Exempt this release; remove once 3.11.0+ data confirms stabilization
* under the warmup + 5-sample methodology already applied to incremental
* benchmarks.
*/
const KNOWN_REGRESSIONS = new Set([
'3.9.6:Build ms/file',
Expand All @@ -155,6 +175,7 @@ const KNOWN_REGRESSIONS = new Set([
'3.9.6:resolution haskell recall',
'3.10.0:No-op rebuild',
'3.10.0:1-file rebuild',
'3.10.0:fnDeps depth 1',
]);

/**
Expand Down Expand Up @@ -272,6 +293,13 @@ function findLatestPair<T extends { version: string }>(
if (!hasEngine(history[latestIdx])) continue;

const latestVersion = history[latestIdx].version;
// 'dev' represents the current PR build (rolling entry — see
// scripts/update-benchmark-report.ts). It has no parseable semver,
// so effectiveGap('dev', anyRelease) returns Infinity — without this
// bypass, the gap check below would skip dev entirely and the loop
// would silently fall through to compare two real releases instead
// of dev vs the latest release, defeating the per-PR gate.
const isDevLatest = latestVersion === 'dev';

// Find previous non-dev entry with data for this engine, skipping
// versions with known unreliable benchmark data and versions that
Expand All @@ -284,7 +312,10 @@ function findLatestPair<T extends { version: string }>(
if (entry.version === 'dev') continue;
if (SKIP_VERSIONS.has(entry.version)) continue;
if (!hasEngine(entry)) continue;
if (effectiveGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
// Skip the gap check when comparing dev → release: dev is always
// the current build, so the most recent comparable release is the
// correct baseline regardless of feature-expansion distance.
if (!isDevLatest && effectiveGap(latestVersion, entry.version) > MAX_VERSION_GAP) continue;
return { latest: history[latestIdx], previous: entry };
}
// No valid baseline for this latest — try the next candidate
Expand Down Expand Up @@ -329,11 +360,27 @@ function thresholdFor(label: string): number {
return NOISY_METRICS.has(label) ? NOISY_METRIC_THRESHOLD : REGRESSION_THRESHOLD;
}

function assertNoRegressions(checks: (RegressionCheck | null)[], version?: string) {
function assertNoRegressions(
checks: (RegressionCheck | null)[],
version?: string,
baselineVersion?: string,
) {
const real = checks.filter(Boolean) as RegressionCheck[];
const regressions = real.filter((c) => {
if (c.pctChange <= thresholdFor(c.label)) return false;
if (version && KNOWN_REGRESSIONS.has(`${version}:${c.label}`)) return false;
// When `latest` is the rolling 'dev' build, KNOWN_REGRESSIONS entries
// are anchored to the release where the regression was first observed
// (e.g. '3.9.6:No-op rebuild'), not to 'dev'. Fall back to the baseline
// version so a regression introduced before release N stays exempt for
// every PR comparing dev → N until release N+1 clears it.
if (
version === 'dev' &&
baselineVersion &&
KNOWN_REGRESSIONS.has(`${baselineVersion}:${c.label}`)
) {
return false;
}
return true;
});

Expand Down Expand Up @@ -493,6 +540,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
checkRegression(`1-file rebuild`, cur.oneFileRebuildMs, prev.oneFileRebuildMs),
],
latest.version,
previous.version,
);
});
}
Expand Down Expand Up @@ -530,6 +578,7 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
),
],
latest.version,
previous.version,
);
});
}
Expand Down Expand Up @@ -559,22 +608,32 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
checkRegression(`1-file rebuild`, cur.oneFileRebuildMs, prev.oneFileRebuildMs),
],
latest.version,
previous.version,
);
});
}

// Resolve benchmarks (not engine-specific)
const resolveEntries = incrementalHistory.filter(
(e) => e.resolve != null && e.version !== 'dev' && !SKIP_VERSIONS.has(e.version),
// Resolve benchmarks (not engine-specific). Keep `dev` in the candidate
// pool so the per-PR gate (which produces a `dev` resolve entry) covers
// import resolution; previously the filter dropped dev outright, leaving
// nativeBatchMs / jsFallbackMs blind to PR-introduced regressions.
const resolvePair = findLatestPair(
incrementalHistory.filter((e) => e.resolve != null),
(e) => e.resolve != null,
);
if (resolveEntries.length >= 2) {
test(`import resolution — ${resolveEntries[0].version} vs ${resolveEntries[1].version}`, () => {
const cur = resolveEntries[0].resolve!;
const prev = resolveEntries[1].resolve!;
assertNoRegressions([
checkRegression(`Native batch resolve`, cur.nativeBatchMs, prev.nativeBatchMs),
checkRegression(`JS fallback resolve`, cur.jsFallbackMs, prev.jsFallbackMs),
]);
if (resolvePair) {
const { latest: latestRes, previous: previousRes } = resolvePair;
test(`import resolution — ${latestRes.version} vs ${previousRes.version}`, () => {
const cur = latestRes.resolve!;
const prev = previousRes.resolve!;
assertNoRegressions(
[
checkRegression(`Native batch resolve`, cur.nativeBatchMs, prev.nativeBatchMs),
checkRegression(`JS fallback resolve`, cur.jsFallbackMs, prev.jsFallbackMs),
],
latestRes.version,
previousRes.version,
);
});
}

Expand All @@ -589,8 +648,8 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {

test('has resolve data to compare', () => {
expect(
resolveEntries.length >= 2,
'No import-resolution benchmark data with ≥2 non-dev entries to compare',
resolvePair != null,
'No import-resolution benchmark data with ≥2 comparable entries',
).toBe(true);
});
});
Expand Down Expand Up @@ -644,10 +703,18 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
const prv = prevRes[lang];
if (!cur || !prv) continue;

// When latest is 'dev' (per-PR build), KNOWN_REGRESSIONS keys
// are anchored to the baseline release where the regression was
// first observed, not to 'dev' — fall back to previousRes.version.
const isDev = latestRes.version === 'dev';

const precDrop = prv.precision - cur.precision;
if (precDrop > PRECISION_DROP_PP) {
const key = `${latestRes.version}:resolution ${lang} precision`;
if (!KNOWN_REGRESSIONS.has(key)) {
const fallbackKey = `${previousRes.version}:resolution ${lang} precision`;
const isKnown =
KNOWN_REGRESSIONS.has(key) || (isDev && KNOWN_REGRESSIONS.has(fallbackKey));
if (!isKnown) {
regressions.push(
` ${lang} precision: ${(prv.precision * 100).toFixed(1)}% → ${(cur.precision * 100).toFixed(1)}% (−${(precDrop * 100).toFixed(1)}pp, threshold ${(PRECISION_DROP_PP * 100).toFixed(0)}pp)`,
);
Expand All @@ -657,7 +724,10 @@ describe.runIf(RUN_REGRESSION_GUARD)('Benchmark regression guard', () => {
const recDrop = prv.recall - cur.recall;
if (recDrop > RECALL_DROP_PP) {
const key = `${latestRes.version}:resolution ${lang} recall`;
if (!KNOWN_REGRESSIONS.has(key)) {
const fallbackKey = `${previousRes.version}:resolution ${lang} recall`;
const isKnown =
KNOWN_REGRESSIONS.has(key) || (isDev && KNOWN_REGRESSIONS.has(fallbackKey));
if (!isKnown) {
regressions.push(
` ${lang} recall: ${(prv.recall * 100).toFixed(1)}% → ${(cur.recall * 100).toFixed(1)}% (−${(recDrop * 100).toFixed(1)}pp, threshold ${(RECALL_DROP_PP * 100).toFixed(0)}pp)`,
);
Expand Down
Loading