Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import path from 'node:path';
import { performance } from 'node:perf_hooks';
import {
acquireAdvisoryLock,
closeDb,
closeDbPair,
getBuildMeta,
initSchema,
Expand Down Expand Up @@ -39,6 +40,7 @@ import {
getInstalledWasmExtensions,
parseFilesWasmForBackfill,
} from '../../parser.js';
import { writeJournalHeader } from '../journal.js';
import { setWorkspaces } from '../resolve.js';
import { PipelineContext } from './context.js';
import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
Expand All @@ -47,7 +49,7 @@ import { buildEdges } from './stages/build-edges.js';
import { buildStructure } from './stages/build-structure.js';
// Pipeline stages
import { collectFiles } from './stages/collect-files.js';
import { detectChanges } from './stages/detect-changes.js';
import { detectChanges, detectNoChanges } from './stages/detect-changes.js';
import { finalize } from './stages/finalize.js';
import { insertNodes } from './stages/insert-nodes.js';
import { parseFiles } from './stages/parse-files.js';
Expand Down Expand Up @@ -1000,6 +1002,42 @@ export async function buildGraph(
try {
setupPipeline(ctx);

// ── JS-side fast-skip for native incremental (#1054) ──────────────
// The Rust orchestrator's internal early-exit fires reliably locally
// but not in CI, where every no-op rebuild was paying the full ~2s
// pipeline cost. A read-only mtime+size check here matches WASM's
// ~20ms early-exit and skips the orchestrator entirely when no
// source files have changed. Tier-2 hashing is left to the native
// side: any mismatch falls through and lets Rust's detect_changes
// remain the source of truth.
if (
ctx.nativeAvailable &&
ctx.engineName === 'native' &&
ctx.incremental &&
!ctx.forceFullRebuild &&
!(ctx.opts as Record<string, unknown>).scope
) {
try {
await collectFiles(ctx);
if (
detectNoChanges(ctx.db, ctx.allFiles, ctx.rootDir, ctx.opts as Record<string, unknown>)
) {
info('No changes detected. Graph is up to date.');
writeJournalHeader(ctx.rootDir, Date.now());
closeDb(ctx.db);
return;
}
} catch (err) {
// Pre-flight is best-effort — any failure falls through to the
// orchestrator, which performs its own complete detection.
// Reset ctx.allFiles so runPipelineStages re-collects under its own
// engine state if we ended up partially populated before throwing.
ctx.allFiles = undefined as unknown as string[];
ctx.discoveredDirs = undefined as unknown as Set<string>;
debug(`native fast-skip pre-flight failed: ${toErrorMessage(err)}`);
}
}
Comment on lines +1020 to +1039
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Redundant collectFiles call on fallthrough to runPipelineStages

When detectNoChanges returns false (changes detected), control falls through to tryNativeOrchestrator. If the native orchestrator itself fails and runPipelineStages is reached, collectFiles is called again at line 901 — doubling the collection work on this path. ctx.allFiles is overwritten with identical data, so there is no functional bug, but the extra traversal adds latency that counteracts the PR's goal of eliminating unnecessary overhead on the non-skip path. Consider guarding the second collectFiles call in runPipelineStages with a check like if (!ctx.allFiles?.length), or resetting ctx.allFiles in the catch block above so the second call is always the canonical one.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 540a420. Guarded the collectFiles stage so it returns early when ctx.allFiles and ctx.discoveredDirs are already populated (and opts.scope is unset). On the fallthrough path (pre-flight ran but changes were detected → tryNativeOrchestrator → JS pipeline fallback), runPipelineStages now reuses the already-collected file list instead of re-walking the filesystem.

To keep the guard correct on the failure path, the buildGraph catch block now resets ctx.allFiles and ctx.discoveredDirs to undefined when the pre-flight throws — so runPipelineStages re-collects under its own engine state if we got partially populated before the error.


// ── Rust orchestrator fast path (#695) ────────────────────────────
// When available, run the entire build pipeline in Rust with zero
// napi crossings (eliminates WAL dual-connection dance). Falls back
Expand Down
9 changes: 9 additions & 0 deletions src/domain/graph/builder/stages/collect-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ function tryFastCollect(
export async function collectFiles(ctx: PipelineContext): Promise<void> {
const { rootDir, config, opts } = ctx;

// Skip when the JS-side fast-skip pre-flight (#1054) already populated the
// file list and changes were detected, causing fallthrough to the native
// orchestrator and then to runPipelineStages. Avoids redoing the filesystem
// walk on the non-skip path (~8ms on 473 files). On pre-flight failure the
// caller resets ctx.allFiles so this guard correctly falls through.
if (!opts.scope && ctx.allFiles?.length && ctx.discoveredDirs?.size) {
return;
}

if (opts.scope) {
// Scoped rebuild: rebuild only specified files.
//
Expand Down
88 changes: 88 additions & 0 deletions src/domain/graph/builder/stages/detect-changes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,94 @@ function handleIncrementalBuild(ctx: PipelineContext): void {
purgeAndAddReverseDeps(ctx, changePaths, reverseDeps);
}

/**
* Read-only pre-flight check for the native orchestrator.
*
* Returns true iff every collected source file has matching mtime+size in
* `file_hashes` and no DB-tracked file has been removed. When true, the
* caller can short-circuit before invoking the native orchestrator —
* matching WASM's ~20 ms early-exit path and avoiding the ~2s flat
* per-call native rebuild overhead seen in CI (#1054).
*
* Intentionally Tier-0/Tier-1 only (journal + mtime/size). Tier-2 content
* hashing is left to the native side: when this returns false the caller
* falls through to the orchestrator, which performs its own complete
* detection and is the source of truth.
*
* Conservatively returns false when CFG or dataflow analysis is enabled
* but the corresponding tables are empty — otherwise the fast-skip would
* silently suppress the pending-analysis pass that the JS path runs via
* `runPendingAnalysis`, and CFG/dataflow data would never populate on
* repos where source files don't change between builds.
*
* Pure read of `db` and the filesystem — never mutates either.
*/
export function detectNoChanges(
db: BetterSqlite3Database,
allFiles: string[],
rootDir: string,
opts?: Record<string, unknown>,
): boolean {
let hasTable = false;
try {
db.prepare('SELECT 1 FROM file_hashes LIMIT 1').get();
hasTable = true;
} catch {
/* table missing — first build */
}
if (!hasTable) return false;

const rows = db.prepare('SELECT file, hash, mtime, size FROM file_hashes').all() as FileHashRow[];
if (rows.length === 0) return false;
const existing = new Map<string, FileHashRow>(rows.map((r) => [r.file, r]));

const currentFiles = new Set<string>();
for (const file of allFiles) {
currentFiles.add(normalizePath(path.relative(rootDir, file)));
}
for (const existingFile of existing.keys()) {
if (!currentFiles.has(existingFile)) return false;
}

for (const file of allFiles) {
const relPath = normalizePath(path.relative(rootDir, file));
const record = existing.get(relPath);
if (!record) return false;
const stat = fileStat(file) as FileStat | undefined;
if (!stat) return false;
const storedMtime = record.mtime || 0;
const storedSize = record.size || 0;
if (storedSize <= 0) return false;
if (Math.floor(stat.mtimeMs) !== storedMtime || stat.size !== storedSize) return false;
}

// Pending-analysis guard: if CFG/dataflow is enabled but the corresponding
// table is empty (analysis newly enabled, or tables wiped between builds),
// fall through so the orchestrator / JS pipeline can run runPendingAnalysis.
// Mirrors the check at the top of runPendingAnalysis (see line ~244).
if (opts) {
if (opts.cfg !== false && hasEmptyAnalysisTable(db, 'cfg_blocks')) return false;
if (opts.dataflow !== false && hasEmptyAnalysisTable(db, 'dataflow')) return false;
}

return true;
}
Comment on lines 512 to +586
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Missing runPendingAnalysis check before early exit

detectNoChanges skips the runPendingAnalysis guard that the existing detectChanges fast-exit path (lines 610–617) always executes. If CFG or dataflow analysis was newly enabled (or tables were wiped) between builds and no source files changed, mtime/size will match file_hashes, detectNoChanges will return true, the function will write the journal header and return — and the pending analysis pass will silently never run. The data stays empty in cfg_blocks/dataflow indefinitely on no-op-rebuild repos.

The existing JS path guards this explicitly: it calls runPendingAnalysis(ctx) before writing the journal. Either the same check needs to be reproduced here (checking cfg_blocks/dataflow row counts against opts.cfg/opts.dataflow), or detectNoChanges must return false conservatively whenever those tables are empty and the corresponding options are enabled.

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 8699471. detectNoChanges now takes an optional opts parameter and conservatively returns false when:

  • opts.cfg !== false and cfg_blocks is empty
  • opts.dataflow !== false and dataflow is empty

This mirrors the same empty-table check that runPendingAnalysis (line ~244) uses to decide whether to run the analysis pass. When the guard fires, the caller falls through to tryNativeOrchestrator (or, on native fallback, to runPipelineStages), which invokes detectChanges and the existing runPendingAnalysis path — populating CFG/dataflow even on no-op-rebuild repos.

The pipeline.ts call site now passes ctx.opts so the guard activates for every fast-skip attempt. Added 6 unit tests covering empty file_hashes, mtime+size match, deleted tracked file, mtime drift, and the new pending-analysis guards for both cfg and dataflow.


/**
* Returns true if `table` exists and has zero rows, matching the empty-table
* semantics of `runPendingAnalysis`. A missing table is treated as empty
* (the conservative outcome), so the caller falls through to the orchestrator
* which will create the schema and populate it.
*/
function hasEmptyAnalysisTable(db: BetterSqlite3Database, table: string): boolean {
try {
const row = db.prepare(`SELECT COUNT(*) as c FROM ${table}`).get() as { c: number } | undefined;
return (row?.c ?? 0) === 0;
} catch {
return true;
}
}

export async function detectChanges(ctx: PipelineContext): Promise<void> {
const start = performance.now();
try {
Expand Down
137 changes: 136 additions & 1 deletion tests/builder/detect-changes.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ import path from 'node:path';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
import { closeDb, initSchema, openDb } from '../../src/db/index.js';
import { PipelineContext } from '../../src/domain/graph/builder/context.js';
import { detectChanges } from '../../src/domain/graph/builder/stages/detect-changes.js';
import {
detectChanges,
detectNoChanges,
} from '../../src/domain/graph/builder/stages/detect-changes.js';
import { writeJournalHeader } from '../../src/domain/graph/journal.js';

let tmpDir: string;
Expand Down Expand Up @@ -142,3 +145,135 @@ describe('detectChanges stage', () => {
fs.rmSync(dir, { recursive: true, force: true });
});
});

describe('detectNoChanges fast-skip', () => {
function seedFile(dir: string, name: string, content: string): string {
const filePath = path.join(dir, name);
fs.writeFileSync(filePath, content);
return filePath;
}

function seedHashRow(
db: ReturnType<typeof openDb>,
relPath: string,
filePath: string,
): { mtime: number; size: number } {
const stat = fs.statSync(filePath);
const mtime = Math.floor(stat.mtimeMs);
db.prepare('INSERT INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)').run(
relPath,
'deadbeef',
mtime,
stat.size,
);
return { mtime, size: stat.size };
}

it('returns false when file_hashes is empty (first build)', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-empty-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');

expect(detectNoChanges(db, [file], dir)).toBe(false);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});

it('returns true when mtime+size match seeded file_hashes', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-match-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');
seedHashRow(db, 'a.js', file);

expect(detectNoChanges(db, [file], dir)).toBe(true);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});

it('returns false when a tracked file has been deleted', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-deleted-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');
seedHashRow(db, 'a.js', file);
seedHashRow(db, 'gone.js', file); // tracked but no longer on disk

expect(detectNoChanges(db, [file], dir)).toBe(false);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});

it('returns false when mtime differs from seeded value', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-mtime-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');
const stat = fs.statSync(file);
db.prepare('INSERT INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)').run(
'a.js',
'deadbeef',
Math.floor(stat.mtimeMs) + 1000, // skewed mtime
stat.size,
);

expect(detectNoChanges(db, [file], dir)).toBe(false);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});

it('returns false when CFG analysis is enabled but cfg_blocks is empty (#1064)', () => {
// Pending-analysis guard: even though mtime+size match, if cfg_blocks
// is empty (analysis newly enabled), the caller must fall through so
// runPendingAnalysis can populate the table.
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-pendingCfg-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');
seedHashRow(db, 'a.js', file);
// cfg_blocks table is created empty by initSchema — that's the trigger.

// Without opts: legacy behaviour — fast-skip returns true.
expect(detectNoChanges(db, [file], dir)).toBe(true);
// With cfg enabled (cfg !== false) and cfg_blocks empty: must return false.
expect(detectNoChanges(db, [file], dir, { cfg: true, dataflow: false })).toBe(false);
// When cfg explicitly disabled (and dataflow disabled too so its guard
// doesn't fire), the empty cfg table is irrelevant.
expect(detectNoChanges(db, [file], dir, { cfg: false, dataflow: false })).toBe(true);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});

it('returns false when dataflow is enabled but dataflow table is empty (#1064)', () => {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-noChange-pendingDf-'));
const dbDir = path.join(dir, '.codegraph');
fs.mkdirSync(dbDir, { recursive: true });
const db = openDb(path.join(dbDir, 'graph.db'));
initSchema(db);
const file = seedFile(dir, 'a.js', 'export const a = 1;');
seedHashRow(db, 'a.js', file);

// Disable cfg so only the dataflow guard is exercised.
expect(detectNoChanges(db, [file], dir, { cfg: false, dataflow: true })).toBe(false);
expect(detectNoChanges(db, [file], dir, { cfg: false, dataflow: false })).toBe(true);

closeDb(db);
fs.rmSync(dir, { recursive: true, force: true });
});
});
Loading