Skip to content

Commit acc916f

Browse files
authored
Merge branch 'main' into fix/incremental-report-preserve-notes
2 parents 934893d + 031ab81 commit acc916f

2 files changed

Lines changed: 302 additions & 9 deletions

File tree

src/domain/graph/builder/pipeline.ts

Lines changed: 154 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
initSchema,
1616
MIGRATIONS,
1717
openDb,
18+
purgeFilesData,
1819
releaseAdvisoryLock,
1920
setBuildMeta,
2021
} from '../../../db/index.js';
@@ -38,6 +39,7 @@ import {
3839
formatDropExtensionSummary,
3940
getActiveEngine,
4041
getInstalledWasmExtensions,
42+
NATIVE_SUPPORTED_EXTENSIONS,
4143
parseFilesWasmForBackfill,
4244
} from '../../parser.js';
4345
import { writeJournalHeader } from '../journal.js';
@@ -659,11 +661,12 @@ async function tryNativeOrchestrator(
659661
if (result.earlyExit) {
660662
info('No changes detected');
661663
// Even on no-op rebuilds, dropped-language files added since the last
662-
// full build are still missing from `nodes`/`file_hashes` (#1083). The
663-
// orchestrator's file_collector skipped them, so its earlyExit doesn't
664-
// imply DB consistency. Run the gap repair before returning.
664+
// full build are still missing from `nodes`/`file_hashes` (#1083), and
665+
// WASM-only files deleted from disk leave stale rows behind (#1073).
666+
// The orchestrator's file_collector skipped them, so its earlyExit
667+
// doesn't imply DB consistency. Run the gap repair before returning.
665668
const gap = detectDroppedLanguageGap(ctx);
666-
if (gap.missingAbs.length > 0) {
669+
if (gap.missingAbs.length > 0 || gap.staleRel.length > 0) {
667670
await backfillNativeDroppedFiles(ctx, gap);
668671
}
669672
closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
@@ -773,7 +776,13 @@ async function tryNativeOrchestrator(
773776
const removedCount = result.removedCount ?? 0;
774777
const changedCount = result.changedCount ?? 0;
775778
const gap = detectDroppedLanguageGap(ctx);
776-
if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) {
779+
if (
780+
result.isFullBuild ||
781+
removedCount > 0 ||
782+
changedCount > 0 ||
783+
gap.missingAbs.length > 0 ||
784+
gap.staleRel.length > 0
785+
) {
777786
await backfillNativeDroppedFiles(ctx, gap);
778787
}
779788

@@ -787,6 +796,101 @@ interface DroppedLanguageGap {
787796
missingRel: string[];
788797
/** Absolute paths, aligned by index with `missingRel`. */
789798
missingAbs: string[];
799+
/**
800+
* Relative paths of WASM-only files present in DB but absent from disk (#1073).
801+
* Rust's `detect_removed_files` filter (#1070) skips these, so the JS-side
802+
* backfill must purge them. Always disjoint from `missingRel`.
803+
*/
804+
staleRel: string[];
805+
}
806+
807+
/**
808+
* Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper
809+
* is pure and unit-testable independently of `getInstalledWasmExtensions` and
810+
* the `NATIVE_SUPPORTED_EXTENSIONS` global state.
811+
*/
812+
export interface WasmOnlyStaleFilesInput {
813+
/** Distinct `file` values from the `nodes` table. */
814+
existingNodes: ReadonlySet<string>;
815+
/** Distinct `file` values from the `file_hashes` table. */
816+
existingHashes: ReadonlySet<string>;
817+
/** Relative paths currently on disk (from `collectFilesUtil`). */
818+
expected: ReadonlySet<string>;
819+
/** Lowercased extensions whose WASM grammar is installed. */
820+
installedExts: ReadonlySet<string>;
821+
/** Extensions covered by the Rust addon — Rust owns deletion for these. */
822+
nativeSupported: ReadonlySet<string>;
823+
}
824+
825+
/**
826+
* Compute the WASM-only files present in the DB but missing from disk (#1073).
827+
*
828+
* Returns relative paths that:
829+
* - appear in `existingNodes` or `existingHashes` (in DB),
830+
* - are absent from `expected` (not on disk),
831+
* - have an extension installed for WASM, AND
832+
* - have an extension NOT covered by `nativeSupported` — Rust's
833+
* `purge_changed_files` handles deletion for natively-supported extensions
834+
* via its own `detect_removed_files`, so the caller must not double-purge.
835+
*
836+
* Extensions are lowercased before lookup to match the registry and Rust's
837+
* `LanguageKind::from_extension` (which normalises case for the languages
838+
* where both cases are conventional, e.g. R's `.r` / `.R`).
839+
*
840+
* DB paths are forced to forward slashes before comparison with `expected`
841+
* (which is always normalised). The on-disk invariant is that DB rows are
842+
* written with forward slashes, but a stale row written by older code on
843+
* Windows could carry back-slashes — normalising here makes the comparison
844+
* platform-safe and prevents false-positive purges of live rows. We replace
845+
* `\\` explicitly (rather than calling `normalizePath`, which only touches
846+
* `path.sep`) so the defence works when running on POSIX against a DB that
847+
* was migrated from Windows.
848+
*
849+
* Exported for unit testing.
850+
*/
851+
export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] {
852+
const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input;
853+
const stale: string[] = [];
854+
const seen = new Set<string>();
855+
const consider = (rawRel: string): void => {
856+
const rel = rawRel.replace(/\\/g, '/');
857+
if (expected.has(rel) || seen.has(rel)) return;
858+
const ext = path.extname(rel).toLowerCase();
859+
if (nativeSupported.has(ext)) return;
860+
if (!installedExts.has(ext)) return;
861+
seen.add(rel);
862+
// Push the ORIGINAL raw path (not the normalised form) so the eventual
863+
// `DELETE FROM nodes WHERE file = ?` predicate in `purgeFilesData`
864+
// matches the actual stored row. The dedup `seen` set keeps the
865+
// normalised form so a file written once with `\` and once with `/`
866+
// is still treated as one entry — but the value the SQL sees has to
867+
// be byte-identical to what's on disk in the DB.
868+
stale.push(rawRel);
869+
};
870+
for (const rel of existingNodes) consider(rel);
871+
for (const rel of existingHashes) consider(rel);
872+
return stale;
873+
}
874+
875+
/**
876+
* Group relative paths by their lowercased extension. Shape matches the bucket
877+
* type that `formatDropExtensionSummary` consumes, so callers can render a
878+
* log-friendly per-extension summary without going through `classifyNativeDrops`
879+
* when the reason is already known (e.g. the stale-purge path where every path
880+
* is guaranteed `unsupported-by-native`).
881+
*/
882+
function groupByExtension(relPaths: Iterable<string>): Map<string, string[]> {
883+
const buckets = new Map<string, string[]>();
884+
for (const rel of relPaths) {
885+
const ext = path.extname(rel).toLowerCase();
886+
let list = buckets.get(ext);
887+
if (!list) {
888+
list = [];
889+
buckets.set(ext, list);
890+
}
891+
list.push(rel);
892+
}
893+
return buckets;
790894
}
791895

792896
/**
@@ -803,6 +907,13 @@ interface DroppedLanguageGap {
803907
* regression — excluding them keeps the warn count in
804908
* `backfillNativeDroppedFiles` meaningful.
805909
*
910+
* Also detects WASM-only files deleted from disk (#1073). Rust's
911+
* `detect_removed_files` filter (#1070) skips files outside its supported
912+
* extensions, so deletions of WASM-only languages don't reach the native
913+
* purge path; the rest of the backfill only inserts rows, so without this
914+
* step stale `nodes`/`file_hashes` rows would linger across incremental
915+
* rebuilds until the next full rebuild.
916+
*
806917
* Cheap (no DB handoff, no parsing): used both to gate the backfill call
807918
* and as its working set. NativeDbProxy supports `.prepare().all()`, so
808919
* this works whether `ctx.db` is a proxy or a real better-sqlite3
@@ -844,13 +955,25 @@ function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap {
844955
missingRel.push(rel);
845956
missingAbs.push(path.join(ctx.rootDir, rel));
846957
}
847-
return { missingRel, missingAbs };
958+
959+
const staleRel = computeWasmOnlyStaleFiles({
960+
existingNodes,
961+
existingHashes,
962+
expected,
963+
installedExts,
964+
nativeSupported: NATIVE_SUPPORTED_EXTENSIONS,
965+
});
966+
967+
return { missingRel, missingAbs, staleRel };
848968
}
849969

850970
/**
851971
* Backfill files that the native orchestrator silently dropped during parse.
852972
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
853973
*
974+
* Also purges stale rows for WASM-only files deleted from disk (#1073), which
975+
* Rust's `detect_removed_files` filter (#1070) skips.
976+
*
854977
* Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller
855978
* can use the same scan for both gating and the actual backfill — avoiding
856979
* a redundant fs walk when the orchestrator's signals already triggered.
@@ -859,8 +982,8 @@ async function backfillNativeDroppedFiles(
859982
ctx: PipelineContext,
860983
gap: DroppedLanguageGap,
861984
): Promise<void> {
862-
const { missingRel, missingAbs } = gap;
863-
if (missingAbs.length === 0) return;
985+
const { missingRel, missingAbs, staleRel } = gap;
986+
if (missingAbs.length === 0 && staleRel.length === 0) return;
864987

865988
// Now that we know there's work to do, hand off to better-sqlite3 (needed
866989
// for the INSERT path below).
@@ -870,6 +993,28 @@ async function backfillNativeDroppedFiles(
870993
ctx.nativeFirstProxy = false;
871994
}
872995

996+
const dbConn = ctx.db as unknown as BetterSqlite3Database;
997+
998+
// Purge WASM-only files that were deleted from disk (#1073). Rust's
999+
// detect_removed_files skips them and the insert path below never visits
1000+
// them, so without this their rows would persist across rebuilds until the
1001+
// next full rebuild reset the DB.
1002+
if (staleRel.length > 0) {
1003+
// `computeWasmOnlyStaleFiles` guarantees every path here has an extension
1004+
// outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would
1005+
// always bucket 100% into `unsupported-by-native`. Build the extension
1006+
// summary directly to avoid a redundant classification pass.
1007+
const staleByExt = groupByExtension(staleRel);
1008+
info(
1009+
`Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`,
1010+
);
1011+
purgeFilesData(dbConn, staleRel);
1012+
}
1013+
1014+
if (missingAbs.length === 0) return;
1015+
1016+
if (missingAbs.length === 0) return;
1017+
8731018
// Classify drops so users see per-extension reasons instead of just a count
8741019
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
8751020
// extractor); `native-extractor-failure` indicates a real native bug since
@@ -918,7 +1063,7 @@ async function backfillNativeDroppedFiles(
9181063
exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
9191064
}
9201065
}
921-
const db = ctx.db as unknown as BetterSqlite3Database;
1066+
const db = dbConn;
9221067
batchInsertNodes(db, rows);
9231068

9241069
// Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
/**
2+
* Unit tests for `computeWasmOnlyStaleFiles` (#1073).
3+
*
4+
* The Rust orchestrator's `detect_removed_files` filter (#1070) skips files
5+
* outside its supported extensions, so deletions of WASM-only languages don't
6+
* reach the native purge path. The JS-side backfill only inserts rows, so
7+
* without this helper a deleted WASM-only file would leak `nodes`/`file_hashes`
8+
* rows until the next full rebuild.
9+
*
10+
* These tests pass the extension sets as parameters so they remain meaningful
11+
* even when every currently-registered language is natively supported
12+
* (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time
13+
* a new WASM-only language enters the registry before its Rust extractor.
14+
*/
15+
import { describe, expect, it } from 'vitest';
16+
import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js';
17+
18+
const NATIVE = new Set(['.ts', '.js', '.r']);
19+
const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']);
20+
21+
describe('computeWasmOnlyStaleFiles', () => {
22+
it('returns WASM-only files present in DB but missing from disk', () => {
23+
const stale = computeWasmOnlyStaleFiles({
24+
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
25+
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
26+
expected: new Set(['src/b.ts']),
27+
installedExts: INSTALLED,
28+
nativeSupported: NATIVE,
29+
});
30+
expect(stale).toEqual(['src/a.gleam']);
31+
});
32+
33+
it('skips natively-supported extensions — Rust owns their deletion path', () => {
34+
const stale = computeWasmOnlyStaleFiles({
35+
existingNodes: new Set(['src/old.ts', 'src/old.r']),
36+
existingHashes: new Set(['src/old.ts', 'src/old.r']),
37+
expected: new Set(),
38+
installedExts: INSTALLED,
39+
nativeSupported: NATIVE,
40+
});
41+
expect(stale).toEqual([]);
42+
});
43+
44+
it('skips extensions with no installed WASM grammar', () => {
45+
// .bar is not in installedExts — neither engine can parse it, so DB rows
46+
// for it can't have been written by this codepath. Leave them alone.
47+
const stale = computeWasmOnlyStaleFiles({
48+
existingNodes: new Set(['src/x.bar']),
49+
existingHashes: new Set(['src/x.bar']),
50+
expected: new Set(),
51+
installedExts: INSTALLED,
52+
nativeSupported: NATIVE,
53+
});
54+
expect(stale).toEqual([]);
55+
});
56+
57+
it('catches files that exist only in file_hashes (nodes missing)', () => {
58+
// Legacy DB shape where file_hashes was written but `nodes` was not — the
59+
// backfill should still recognise the file_hashes row as stale.
60+
const stale = computeWasmOnlyStaleFiles({
61+
existingNodes: new Set(),
62+
existingHashes: new Set(['src/leftover.gleam']),
63+
expected: new Set(),
64+
installedExts: INSTALLED,
65+
nativeSupported: NATIVE,
66+
});
67+
expect(stale).toEqual(['src/leftover.gleam']);
68+
});
69+
70+
it('catches files that exist only in nodes (file_hashes missing)', () => {
71+
const stale = computeWasmOnlyStaleFiles({
72+
existingNodes: new Set(['src/leftover.gleam']),
73+
existingHashes: new Set(),
74+
expected: new Set(),
75+
installedExts: INSTALLED,
76+
nativeSupported: NATIVE,
77+
});
78+
expect(stale).toEqual(['src/leftover.gleam']);
79+
});
80+
81+
it('deduplicates files appearing in both nodes and file_hashes', () => {
82+
const stale = computeWasmOnlyStaleFiles({
83+
existingNodes: new Set(['src/dup.gleam']),
84+
existingHashes: new Set(['src/dup.gleam']),
85+
expected: new Set(),
86+
installedExts: INSTALLED,
87+
nativeSupported: NATIVE,
88+
});
89+
expect(stale).toEqual(['src/dup.gleam']);
90+
});
91+
92+
it('lowercases extensions to match registry/Rust normalisation', () => {
93+
// R is conventionally written `.R` on disk. The registry and the Rust
94+
// `LanguageKind::from_extension` accept both cases; `installedExts` and
95+
// `nativeSupported` carry the lowercase canonical form.
96+
const stale = computeWasmOnlyStaleFiles({
97+
existingNodes: new Set(['src/Plot.R']),
98+
existingHashes: new Set(['src/Plot.R']),
99+
expected: new Set(),
100+
installedExts: INSTALLED,
101+
nativeSupported: NATIVE,
102+
});
103+
// .R lowercases to .r which IS native-supported, so it should be skipped.
104+
expect(stale).toEqual([]);
105+
});
106+
107+
it('returns empty when DB and disk agree', () => {
108+
const stale = computeWasmOnlyStaleFiles({
109+
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
110+
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
111+
expected: new Set(['src/a.gleam', 'src/b.ts']),
112+
installedExts: INSTALLED,
113+
nativeSupported: NATIVE,
114+
});
115+
expect(stale).toEqual([]);
116+
});
117+
118+
it('normalises DB paths with back-slashes against forward-slash expected set', () => {
119+
// Defends against false-positive purges on Windows where a stale DB row
120+
// (written by older code) could carry back-slashes while `expected` is
121+
// always normalised. Without `normalizePath` inside `consider`, the file
122+
// would look stale and be purged even though it exists on disk.
123+
const stale = computeWasmOnlyStaleFiles({
124+
existingNodes: new Set(['src\\live.gleam']),
125+
existingHashes: new Set(['src\\live.gleam']),
126+
expected: new Set(['src/live.gleam']),
127+
installedExts: INSTALLED,
128+
nativeSupported: NATIVE,
129+
});
130+
expect(stale).toEqual([]);
131+
});
132+
133+
it('preserves back-slash form so DELETE matches the actual DB row', () => {
134+
// Counterpart to the previous test: when a back-slash DB row is GENUINELY
135+
// stale (file no longer on disk), the returned path must keep its raw form
136+
// so `purgeFilesData`'s `DELETE FROM nodes WHERE file = ?` matches the
137+
// stored row. Pushing the forward-slash-normalised form would let the
138+
// stale row silently persist — exactly the regression #1073 fixes.
139+
const stale = computeWasmOnlyStaleFiles({
140+
existingNodes: new Set(['src\\dead.gleam']),
141+
existingHashes: new Set(['src\\dead.gleam']),
142+
expected: new Set(),
143+
installedExts: INSTALLED,
144+
nativeSupported: NATIVE,
145+
});
146+
expect(stale).toEqual(['src\\dead.gleam']);
147+
});
148+
});

0 commit comments

Comments
 (0)