Skip to content

Commit 8fcdb86

Browse files
authored
fix(native): persist file_hashes for dropped/symbol-less files (#1069)
* fix(native): persist file_hashes for dropped/symbol-less files (#1068) The JS-side fast-skip pre-flight (#1054) was permanently rejecting on repos containing optional-language files (e.g. .clj) because their rows were missing from file_hashes: - buildFileHashes / updateFileHashes iterated allSymbols, so files with zero symbols (empty, parser no-op, grammar-missing optional language) never got a hash row. Iterate filesToParse instead. - backfillNativeDroppedFiles now writes file_hashes rows for every file the Rust orchestrator dropped (e.g. .clj when no Rust extractor exists), so the fast-skip pre-flight can match them on rebuild. - Backfill runs on every successful orchestrator pass, not only on full builds. The Rust orchestrator's narrower file_collector treats files outside SUPPORTED_EXTENSIONS as 'removed' and deletes their nodes + file_hashes rows on every incremental run; backfill repairs that. Restructured the function to do the cheap missing-file check before the proxy->better-sqlite3 handoff so no-op rebuilds remain cheap. * fix(native): treat file_hashes gap as missing in backfill (#1069) Greptile feedback: backfillNativeDroppedFiles read 'nodes WHERE kind=file' to decide what's missing, but the fast-skip pre-flight (#1054) rejects on 'file_hashes' gaps. If a DB has a node row but no file_hashes row (e.g. state written by pre-#1068 code), the early-return triggers and the gap persists across rebuilds. Also query file_hashes and treat any expected file absent from EITHER table as missing. The existing upsert path repairs both rows. The file_hashes read is wrapped in try/catch so legacy DBs without the table fall through to the existing recovery path.
1 parent: 29add9f · commit: 8fcdb86

3 files changed

Lines changed: 251 additions & 25 deletions

File tree

src/domain/graph/builder/pipeline.ts

Lines changed: 96 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,14 @@ import {
4343
import { writeJournalHeader } from '../journal.js';
4444
import { setWorkspaces } from '../resolve.js';
4545
import { PipelineContext } from './context.js';
46-
import { batchInsertNodes, collectFiles as collectFilesUtil, loadPathAliases } from './helpers.js';
46+
import {
47+
batchInsertNodes,
48+
collectFiles as collectFilesUtil,
49+
fileHash,
50+
fileStat,
51+
loadPathAliases,
52+
readFileSafe,
53+
} from './helpers.js';
4754
import { NativeDbProxy } from './native-db-proxy.js';
4855
import { buildEdges } from './stages/build-edges.js';
4956
import { buildStructure } from './stages/build-structure.js';
@@ -731,12 +738,15 @@ async function tryNativeOrchestrator(
731738
// stale native binaries). WASM handles those — backfill via WASM so both
732739
// engines process the same file set (#967).
733740
//
734-
// Only runs on full builds: incremental builds only touch changed files,
735-
// which are parsed through parseFilesAuto (which has its own per-file
736-
// backfill), so a full filesystem scan here would be wasted work.
737-
if (result.isFullBuild) {
738-
await backfillNativeDroppedFiles(ctx);
739-
}
741+
// Runs on every successful orchestrator pass (not just full builds): on
742+
// incrementals the orchestrator's change detection treats files outside
743+
// Rust's narrower file_collector as `removed` and deletes their nodes +
744+
// file_hashes rows. Without re-running the backfill we'd lose the symbols
745+
// for those files and permanently break the JS-side fast-skip pre-flight
746+
// (#1054, #1068). The function is cheap (single fs scan + DB query) when
747+
// nothing is missing, and on no-op rebuilds the missing-set is re-derived
748+
// from `nodes`, so it catches whatever Rust just deleted.
749+
await backfillNativeDroppedFiles(ctx);
740750

741751
closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
742752
return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
@@ -747,22 +757,40 @@ async function tryNativeOrchestrator(
747757
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
748758
*/
749759
async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
750-
// Needs a real better-sqlite3 connection for INSERT.
751-
if (ctx.nativeFirstProxy) {
752-
closeNativeDb(ctx, 'pre-parity-backfill');
753-
ctx.db = openDb(ctx.dbPath);
754-
ctx.nativeFirstProxy = false;
755-
}
756-
760+
// Compute the missing-file set FIRST, before any expensive DB handoff.
761+
// NativeDbProxy supports .prepare().all(), so the upfront query works
762+
// whether ctx.db is a proxy or a real better-sqlite3 connection. On
763+
// incremental no-op rebuilds nothing is missing, so we want to early-return
764+
// without paying the close-native / reopen-better-sqlite3 cost.
757765
const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
758766
const expected = new Set(
759767
collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
760768
);
761769

762-
const existingRows = ctx.db
770+
const existingNodeRows = ctx.db
763771
.prepare("SELECT DISTINCT file FROM nodes WHERE kind = 'file'")
764772
.all() as Array<{ file: string }>;
765-
const existing = new Set(existingRows.map((r) => r.file));
773+
const existingNodes = new Set(existingNodeRows.map((r) => r.file));
774+
775+
// Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight
776+
// (#1054) rejects on `file_hashes` gaps, and the two tables can diverge
777+
// (e.g. a DB written by old code where `nodes` was populated but
778+
// `file_hashes` was not). Treating "in nodes but not in file_hashes" as
779+
// missing closes the gap so the backfill repairs the file_hashes row even
780+
// when the node row already exists.
781+
let existingHashes = new Set<string>();
782+
try {
783+
const existingHashRows = ctx.db
784+
.prepare('SELECT DISTINCT file FROM file_hashes')
785+
.all() as Array<{ file: string }>;
786+
existingHashes = new Set(existingHashRows.map((r) => r.file));
787+
} catch (e) {
788+
// file_hashes table may not exist on legacy DBs; treat as fully missing
789+
// so the backfill writes rows on the upsert path below.
790+
debug(
791+
`backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`,
792+
);
793+
}
766794

767795
// Restrict backfill to files with an installed WASM grammar. Extensions in
768796
// LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
@@ -772,14 +800,24 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
772800
const missingRel: string[] = [];
773801
const missingAbs: string[] = [];
774802
for (const rel of expected) {
775-
if (existing.has(rel)) continue;
803+
// A file is "missing" if it's absent from EITHER nodes OR file_hashes.
804+
// Both must be present for fast-skip to work correctly.
805+
if (existingNodes.has(rel) && existingHashes.has(rel)) continue;
776806
const ext = path.extname(rel).toLowerCase();
777807
if (!installedExts.has(ext)) continue;
778808
missingRel.push(rel);
779809
missingAbs.push(path.join(ctx.rootDir, rel));
780810
}
781811
if (missingAbs.length === 0) return;
782812

813+
// Now that we know there's work to do, hand off to better-sqlite3 (needed
814+
// for the INSERT path below).
815+
if (ctx.nativeFirstProxy) {
816+
closeNativeDb(ctx, 'pre-parity-backfill');
817+
ctx.db = openDb(ctx.dbPath);
818+
ctx.nativeFirstProxy = false;
819+
}
820+
783821
// Classify drops so users see per-extension reasons instead of just a count
784822
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
785823
// extractor); `native-extractor-failure` indicates a real native bug since
@@ -856,6 +894,47 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
856894
}
857895
}
858896

897+
// Persist file_hashes rows for every backfilled file. The Rust orchestrator
898+
// only hashes files it parsed itself, so without this step files in
899+
// optional-language extensions (e.g. .clj when no Rust extractor exists)
900+
// would be missing from `file_hashes` — permanently breaking the JS-side
901+
// fast-skip pre-flight (#1054), which rejects on `collected file missing
902+
// from file_hashes` and forces every no-op rebuild back through the full
903+
// ~2s native pipeline (#1068).
904+
//
905+
// Iterates `missingRel` (every collected file the Rust orchestrator
906+
// dropped), not `wasmResults`, so files that produced zero symbols still
907+
// get a row.
908+
try {
909+
const upsertHash = db.prepare(
910+
'INSERT OR REPLACE INTO file_hashes (file, hash, mtime, size) VALUES (?, ?, ?, ?)',
911+
);
912+
const writeHashes = db.transaction(() => {
913+
for (let i = 0; i < missingRel.length; i++) {
914+
const relPath = missingRel[i];
915+
const absPath = missingAbs[i];
916+
if (!relPath || !absPath) continue;
917+
let code: string | null;
918+
try {
919+
code = readFileSafe(absPath);
920+
} catch (e) {
921+
debug(`backfillNativeDroppedFiles: read failed for ${relPath}: ${toErrorMessage(e)}`);
922+
continue;
923+
}
924+
if (code === null) continue;
925+
const stat = fileStat(absPath);
926+
const mtime = stat ? Math.floor(stat.mtimeMs) : 0;
927+
const size = stat ? stat.size : 0;
928+
upsertHash.run(relPath, fileHash(code), mtime, size);
929+
}
930+
});
931+
writeHashes();
932+
} catch (e) {
933+
debug(
934+
`backfillNativeDroppedFiles: file_hashes write failed (table may not exist): ${toErrorMessage(e)}`,
935+
);
936+
}
937+
859938
// Free WASM parse trees from the inline backfill path (#1058).
860939
// `parseFilesWasmInline` sets `symbols._tree` (a live web-tree-sitter Tree
861940
// backed by WASM linear memory) on every result, but these symbols are

src/domain/graph/builder/stages/insert-nodes.ts

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ import path from 'node:path';
1212
import { performance } from 'node:perf_hooks';
1313
import { bulkNodeIdsByFile } from '../../../../db/index.js';
1414
import { debug } from '../../../../infrastructure/logger.js';
15+
import { normalizePath } from '../../../../shared/constants.js';
1516
import { toErrorMessage } from '../../../../shared/errors.js';
1617
import type {
1718
BetterSqlite3Database,
1819
ExtractorOutput,
20+
FileToParse,
1921
MetadataUpdate,
2022
SqliteStatement,
2123
} from '../../../../types.js';
@@ -90,16 +92,30 @@ function marshalSymbolBatches(allSymbols: Map<string, ExtractorOutput>): InsertN
9092
return batches;
9193
}
9294

93-
/** Build file hash entries from parsed symbols and precomputed/metadata sources. */
94-
function buildFileHashes(
95-
allSymbols: Map<string, ExtractorOutput>,
95+
/**
96+
* Build file hash entries for every collected file, including those that
97+
* produced zero symbols (empty files, parsers that silently no-op'd, or
98+
* optional-language extensions whose grammar wasn't installed). Iterating the
99+
* symbol map instead would skip such files and leave them missing from
100+
* `file_hashes`, which permanently breaks the JS-side fast-skip pre-flight on
101+
* any subsequent no-op rebuild (#1068).
102+
*
103+
* Exported for unit testing.
104+
*/
105+
export function buildFileHashes(
106+
filesToParse: FileToParse[],
96107
precomputedData: Map<string, PrecomputedFileData>,
97108
metadataUpdates: MetadataUpdate[],
98109
rootDir: string,
99110
): Array<{ file: string; hash: string; mtime: number; size: number }> {
100111
const fileHashes: Array<{ file: string; hash: string; mtime: number; size: number }> = [];
112+
const seen = new Set<string>();
113+
114+
for (const item of filesToParse) {
115+
const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
116+
if (seen.has(relPath)) continue;
117+
seen.add(relPath);
101118

102-
for (const [relPath] of allSymbols) {
103119
const precomputed = precomputedData.get(relPath);
104120
if (precomputed?._reverseDepOnly) {
105121
continue; // file unchanged, hash already correct
@@ -157,7 +173,7 @@ function tryNativeInsert(ctx: PipelineContext): boolean {
157173
for (const item of filesToParse) {
158174
if (item.relPath) precomputedData.set(item.relPath, item as PrecomputedFileData);
159175
}
160-
const fileHashes = buildFileHashes(allSymbols, precomputedData, metadataUpdates, rootDir);
176+
const fileHashes = buildFileHashes(filesToParse, precomputedData, metadataUpdates, rootDir);
161177

162178
// In native-first mode (single rusqlite connection), no WAL dance is needed.
163179
// In dual-connection mode, checkpoint JS side before native write, then
@@ -321,15 +337,23 @@ function insertChildrenAndEdges(
321337

322338
function updateFileHashes(
323339
_db: BetterSqlite3Database,
324-
allSymbols: Map<string, ExtractorOutput>,
340+
filesToParse: FileToParse[],
325341
precomputedData: Map<string, PrecomputedFileData>,
326342
metadataUpdates: MetadataUpdate[],
327343
rootDir: string,
328344
upsertHash: SqliteStatement | null,
329345
): void {
330346
if (!upsertHash) return;
331347

332-
for (const [relPath] of allSymbols) {
348+
// Iterate every collected file (#1068): files that produced zero symbols
349+
// (empty, parser no-op, or grammar-missing optional language) still need a
350+
// hash row, otherwise the next no-op rebuild's fast-skip pre-flight rejects.
351+
const seen = new Set<string>();
352+
for (const item of filesToParse) {
353+
const relPath = item.relPath ?? normalizePath(path.relative(rootDir, item.file));
354+
if (seen.has(relPath)) continue;
355+
seen.add(relPath);
356+
333357
const precomputed = precomputedData.get(relPath);
334358
if (precomputed?._reverseDepOnly) {
335359
// no-op: file unchanged, hash already correct
@@ -415,7 +439,7 @@ export async function insertNodes(ctx: PipelineContext): Promise<void> {
415439
const insertAll = ctx.db.transaction(() => {
416440
insertDefinitionsAndExports(ctx.db, allSymbols);
417441
insertChildrenAndEdges(ctx.db, allSymbols);
418-
updateFileHashes(ctx.db, allSymbols, precomputedData, metadataUpdates, rootDir, upsertHash);
442+
updateFileHashes(ctx.db, filesToParse, precomputedData, metadataUpdates, rootDir, upsertHash);
419443
});
420444

421445
insertAll();

tests/builder/insert-nodes.test.ts

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/**
2+
* Unit tests for insertNodes helpers.
3+
*
4+
* Regression coverage for #1068: the file-hash builder must emit a row for
5+
* every collected file, even those whose parser produced zero symbols (empty
6+
* files, parser no-op, or optional-language grammar unavailable). Skipping
7+
* symbol-less files would leave the next no-op rebuild's fast-skip pre-flight
8+
* (#1054) rejecting on `collected file missing from file_hashes` and force
9+
* the full ~2s native pipeline.
10+
*/
11+
import fs from 'node:fs';
12+
import os from 'node:os';
13+
import path from 'node:path';
14+
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
15+
import { fileHash } from '../../src/domain/graph/builder/helpers.js';
16+
import { buildFileHashes } from '../../src/domain/graph/builder/stages/insert-nodes.js';
17+
18+
let tmpDir: string;
19+
20+
beforeAll(() => {
21+
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-insert-nodes-'));
22+
fs.writeFileSync(path.join(tmpDir, 'a.js'), 'export const a = 1;');
23+
// Symbol-less file (e.g. registered extension whose grammar wasn't installed,
24+
// or a file the parser silently no-op'd on). Content is arbitrary — the
25+
// hash builder must not care whether parsing produced any symbols.
26+
fs.writeFileSync(path.join(tmpDir, 'b.clj'), '(comment "no symbols")');
27+
});
28+
29+
afterAll(() => {
30+
if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true });
31+
});
32+
33+
describe('buildFileHashes', () => {
34+
it('emits a row for every collected file, including symbol-less ones (#1068)', () => {
35+
const filesToParse = [
36+
{ file: path.join(tmpDir, 'a.js') },
37+
{ file: path.join(tmpDir, 'b.clj') },
38+
];
39+
const result = buildFileHashes(filesToParse, new Map(), [], tmpDir);
40+
41+
const files = result.map((r) => r.file).sort();
42+
expect(files).toEqual(['a.js', 'b.clj']);
43+
for (const row of result) {
44+
expect(row.hash).toMatch(/^[0-9a-f]+$/);
45+
expect(row.size).toBeGreaterThan(0);
46+
expect(row.mtime).toBeGreaterThan(0);
47+
}
48+
});
49+
50+
it('uses precomputed hash when present', () => {
51+
const aPath = path.join(tmpDir, 'a.js');
52+
const precomputedHash = 'deadbeef';
53+
const precomputed = new Map([
54+
[
55+
'a.js',
56+
{
57+
file: aPath,
58+
relPath: 'a.js',
59+
hash: precomputedHash,
60+
stat: { mtime: 12345, size: 99 },
61+
},
62+
],
63+
]);
64+
const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir);
65+
66+
expect(result).toEqual([{ file: 'a.js', hash: precomputedHash, mtime: 12345, size: 99 }]);
67+
});
68+
69+
it('skips files marked _reverseDepOnly (hash already correct)', () => {
70+
const aPath = path.join(tmpDir, 'a.js');
71+
const precomputed = new Map([
72+
[
73+
'a.js',
74+
{
75+
file: aPath,
76+
relPath: 'a.js',
77+
hash: 'unused',
78+
_reverseDepOnly: true,
79+
},
80+
],
81+
]);
82+
const result = buildFileHashes([{ file: aPath, relPath: 'a.js' }], precomputed, [], tmpDir);
83+
84+
expect(result).toEqual([]);
85+
});
86+
87+
it('falls back to reading file from disk when no precomputed data exists', () => {
88+
const aPath = path.join(tmpDir, 'a.js');
89+
const result = buildFileHashes([{ file: aPath }], new Map(), [], tmpDir);
90+
91+
expect(result).toHaveLength(1);
92+
const row = result[0]!;
93+
expect(row.file).toBe('a.js');
94+
expect(row.hash).toBe(fileHash(fs.readFileSync(aPath, 'utf-8')));
95+
});
96+
97+
it('appends metadata-only updates after the file iteration', () => {
98+
const result = buildFileHashes(
99+
[],
100+
new Map(),
101+
[{ relPath: 'meta.js', hash: 'abc', stat: { mtime: 10, size: 20 } }],
102+
tmpDir,
103+
);
104+
105+
expect(result).toEqual([{ file: 'meta.js', hash: 'abc', mtime: 10, size: 20 }]);
106+
});
107+
108+
it('deduplicates when filesToParse contains the same relPath twice', () => {
109+
const aPath = path.join(tmpDir, 'a.js');
110+
const result = buildFileHashes(
111+
[
112+
{ file: aPath, relPath: 'a.js' },
113+
{ file: aPath, relPath: 'a.js' },
114+
],
115+
new Map(),
116+
[],
117+
tmpDir,
118+
);
119+
120+
expect(result).toHaveLength(1);
121+
expect(result[0]!.file).toBe('a.js');
122+
});
123+
});

0 commit comments

Comments (0)