-
Notifications
You must be signed in to change notification settings - Fork 10
fix(native): purge stale rows when WASM-only files are deleted #1122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
f26abee
8af09cc
c8fabe1
b7de480
17bf7fd
4c22410
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,7 @@ import { | |||||||||
| initSchema, | ||||||||||
| MIGRATIONS, | ||||||||||
| openDb, | ||||||||||
| purgeFilesData, | ||||||||||
| releaseAdvisoryLock, | ||||||||||
| setBuildMeta, | ||||||||||
| } from '../../../db/index.js'; | ||||||||||
|
|
@@ -38,6 +39,7 @@ import { | |||||||||
| formatDropExtensionSummary, | ||||||||||
| getActiveEngine, | ||||||||||
| getInstalledWasmExtensions, | ||||||||||
| NATIVE_SUPPORTED_EXTENSIONS, | ||||||||||
| parseFilesWasmForBackfill, | ||||||||||
| } from '../../parser.js'; | ||||||||||
| import { writeJournalHeader } from '../journal.js'; | ||||||||||
|
|
@@ -774,6 +776,58 @@ async function tryNativeOrchestrator( | |||||||||
| return formatNativeTimingResult(p, structurePatchMs, analysisTiming); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper | ||||||||||
| * is pure and unit-testable independently of `getInstalledWasmExtensions` and | ||||||||||
| * the `NATIVE_SUPPORTED_EXTENSIONS` global state. | ||||||||||
| */ | ||||||||||
| export interface WasmOnlyStaleFilesInput { | ||||||||||
| /** Distinct `file` values from the `nodes` table. */ | ||||||||||
| existingNodes: ReadonlySet<string>; | ||||||||||
| /** Distinct `file` values from the `file_hashes` table. */ | ||||||||||
| existingHashes: ReadonlySet<string>; | ||||||||||
| /** Relative paths currently on disk (from `collectFilesUtil`). */ | ||||||||||
| expected: ReadonlySet<string>; | ||||||||||
| /** Lowercased extensions whose WASM grammar is installed. */ | ||||||||||
| installedExts: ReadonlySet<string>; | ||||||||||
| /** Extensions covered by the Rust addon — Rust owns deletion for these. */ | ||||||||||
| nativeSupported: ReadonlySet<string>; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Compute the WASM-only files present in the DB but missing from disk (#1073). | ||||||||||
| * | ||||||||||
| * Returns relative paths that: | ||||||||||
| * - appear in `existingNodes` or `existingHashes` (in DB), | ||||||||||
| * - are absent from `expected` (not on disk), | ||||||||||
| * - have an extension installed for WASM, AND | ||||||||||
| * - have an extension NOT covered by `nativeSupported` — Rust's | ||||||||||
| * `purge_changed_files` handles deletion for natively-supported extensions | ||||||||||
| * via its own `detect_removed_files`, so the caller must not double-purge. | ||||||||||
| * | ||||||||||
| * Extensions are lowercased before lookup to match the registry and Rust's | ||||||||||
| * `LanguageKind::from_extension` (which normalises case for the languages | ||||||||||
| * where both cases are conventional, e.g. R's `.r` / `.R`). | ||||||||||
| * | ||||||||||
| * Exported for unit testing. | ||||||||||
| */ | ||||||||||
| export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] { | ||||||||||
| const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input; | ||||||||||
| const stale: string[] = []; | ||||||||||
| const seen = new Set<string>(); | ||||||||||
| const consider = (rel: string): void => { | ||||||||||
| if (expected.has(rel) || seen.has(rel)) return; | ||||||||||
| const ext = path.extname(rel).toLowerCase(); | ||||||||||
| if (nativeSupported.has(ext)) return; | ||||||||||
| if (!installedExts.has(ext)) return; | ||||||||||
| seen.add(rel); | ||||||||||
| stale.push(rel); | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The dedup key should remain the normalised form (so
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 4c22410. |
||||||||||
| }; | ||||||||||
| for (const rel of existingNodes) consider(rel); | ||||||||||
| for (const rel of existingHashes) consider(rel); | ||||||||||
| return stale; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Backfill files that the native orchestrator silently dropped during parse. | ||||||||||
| * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). | ||||||||||
|
|
@@ -830,7 +884,22 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> { | |||||||||
| missingRel.push(rel); | ||||||||||
| missingAbs.push(path.join(ctx.rootDir, rel)); | ||||||||||
| } | ||||||||||
| if (missingAbs.length === 0) return; | ||||||||||
|
|
||||||||||
| // Detect WASM-only files deleted from disk (#1073). The Rust orchestrator's | ||||||||||
| // `detect_removed_files` filter (#1070) skips files outside its supported | ||||||||||
| // extensions, so deletions of WASM-only languages don't reach the native | ||||||||||
| // purge path. JS doesn't notice either — the rest of this function only | ||||||||||
| // inserts rows. Result: stale `nodes`/`file_hashes` rows linger across | ||||||||||
| // incremental rebuilds until the next full rebuild. | ||||||||||
| const staleRel = computeWasmOnlyStaleFiles({ | ||||||||||
| existingNodes, | ||||||||||
| existingHashes, | ||||||||||
| expected, | ||||||||||
| installedExts, | ||||||||||
| nativeSupported: NATIVE_SUPPORTED_EXTENSIONS, | ||||||||||
| }); | ||||||||||
|
|
||||||||||
| if (missingAbs.length === 0 && staleRel.length === 0) return; | ||||||||||
|
|
||||||||||
| // Now that we know there's work to do, hand off to better-sqlite3 (needed | ||||||||||
| // for the INSERT path below). | ||||||||||
|
|
@@ -840,6 +909,29 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> { | |||||||||
| ctx.nativeFirstProxy = false; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| const dbConn = ctx.db as unknown as BetterSqlite3Database; | ||||||||||
|
|
||||||||||
| // Purge WASM-only files that were deleted from disk (#1073). Rust's | ||||||||||
| // detect_removed_files skips them and the insert path below never visits | ||||||||||
| // them, so without this their rows would persist across rebuilds until the | ||||||||||
| // next full rebuild reset the DB. | ||||||||||
| if (staleRel.length > 0) { | ||||||||||
| const { byReason: staleByReason, totals: staleTotals } = classifyNativeDrops(staleRel); | ||||||||||
| info( | ||||||||||
| `Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByReason['unsupported-by-native'])}`, | ||||||||||
| ); | ||||||||||
| // staleRel is restricted above to extensions outside NATIVE_SUPPORTED_EXTENSIONS, | ||||||||||
| // so the native-extractor-failure bucket should always be empty here. | ||||||||||
| if (staleTotals['native-extractor-failure'] > 0) { | ||||||||||
| debug( | ||||||||||
| `backfillNativeDroppedFiles: stale-purge classified ${staleTotals['native-extractor-failure']} native-supported file(s) — unexpected; inspect the filter`, | ||||||||||
| ); | ||||||||||
| } | ||||||||||
| purgeFilesData(dbConn, staleRel); | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 8af09cc. Replaced the |
||||||||||
| } | ||||||||||
|
|
||||||||||
| if (missingAbs.length === 0) return; | ||||||||||
|
|
||||||||||
| // Classify drops so users see per-extension reasons instead of just a count | ||||||||||
| // (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust | ||||||||||
| // extractor); `native-extractor-failure` indicates a real native bug since | ||||||||||
|
|
@@ -888,7 +980,7 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> { | |||||||||
| exportKeys.push([exp.name, exp.kind, relPath, exp.line]); | ||||||||||
| } | ||||||||||
| } | ||||||||||
| const db = ctx.db as unknown as BetterSqlite3Database; | ||||||||||
| const db = dbConn; | ||||||||||
| batchInsertNodes(db, rows); | ||||||||||
|
|
||||||||||
| // Mark exported symbols in batches — mirrors insertDefinitionsAndExports. | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| /** | ||
| * Unit tests for `computeWasmOnlyStaleFiles` (#1073). | ||
| * | ||
| * The Rust orchestrator's `detect_removed_files` filter (#1070) skips files | ||
| * outside its supported extensions, so deletions of WASM-only languages don't | ||
| * reach the native purge path. The JS-side backfill only inserts rows, so | ||
| * without this helper a deleted WASM-only file would leak `nodes`/`file_hashes` | ||
| * rows until the next full rebuild. | ||
| * | ||
| * These tests pass the extension sets as parameters so they remain meaningful | ||
| * even when every currently-registered language is natively supported | ||
| * (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time | ||
| * a new WASM-only language enters the registry before its Rust extractor. | ||
| */ | ||
| import { describe, expect, it } from 'vitest'; | ||
| import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js'; | ||
|
|
||
| const NATIVE = new Set(['.ts', '.js', '.r']); | ||
| const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']); | ||
|
|
||
| describe('computeWasmOnlyStaleFiles', () => { | ||
| it('returns WASM-only files present in DB but missing from disk', () => { | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/a.gleam', 'src/b.ts']), | ||
| existingHashes: new Set(['src/a.gleam', 'src/b.ts']), | ||
| expected: new Set(['src/b.ts']), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual(['src/a.gleam']); | ||
| }); | ||
|
|
||
| it('skips natively-supported extensions — Rust owns their deletion path', () => { | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/old.ts', 'src/old.r']), | ||
| existingHashes: new Set(['src/old.ts', 'src/old.r']), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual([]); | ||
| }); | ||
|
|
||
| it('skips extensions with no installed WASM grammar', () => { | ||
| // .bar is not in installedExts — neither engine can parse it, so DB rows | ||
| // for it can't have been written by this codepath. Leave them alone. | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/x.bar']), | ||
| existingHashes: new Set(['src/x.bar']), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual([]); | ||
| }); | ||
|
|
||
| it('catches files that exist only in file_hashes (nodes missing)', () => { | ||
| // Legacy DB shape where file_hashes was written but `nodes` was not — the | ||
| // backfill should still recognise the file_hashes row as stale. | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(), | ||
| existingHashes: new Set(['src/leftover.gleam']), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual(['src/leftover.gleam']); | ||
| }); | ||
|
|
||
| it('catches files that exist only in nodes (file_hashes missing)', () => { | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/leftover.gleam']), | ||
| existingHashes: new Set(), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual(['src/leftover.gleam']); | ||
| }); | ||
|
|
||
| it('deduplicates files appearing in both nodes and file_hashes', () => { | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/dup.gleam']), | ||
| existingHashes: new Set(['src/dup.gleam']), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual(['src/dup.gleam']); | ||
| }); | ||
|
|
||
| it('lowercases extensions to match registry/Rust normalisation', () => { | ||
| // R is conventionally written `.R` on disk. The registry and the Rust | ||
| // `LanguageKind::from_extension` accept both cases; `installedExts` and | ||
| // `nativeSupported` carry the lowercase canonical form. | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/Plot.R']), | ||
| existingHashes: new Set(['src/Plot.R']), | ||
| expected: new Set(), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| // .R lowercases to .r which IS native-supported, so it should be skipped. | ||
| expect(stale).toEqual([]); | ||
| }); | ||
|
|
||
| it('returns empty when DB and disk agree', () => { | ||
| const stale = computeWasmOnlyStaleFiles({ | ||
| existingNodes: new Set(['src/a.gleam', 'src/b.ts']), | ||
| existingHashes: new Set(['src/a.gleam', 'src/b.ts']), | ||
| expected: new Set(['src/a.gleam', 'src/b.ts']), | ||
| installedExts: INSTALLED, | ||
| nativeSupported: NATIVE, | ||
| }); | ||
| expect(stale).toEqual([]); | ||
| }); | ||
| }); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
expected.has(rel)uses raw DB paths against normalised on-disk pathsexpectedis built withnormalizePath(path.relative(ctx.rootDir, f)), whileexistingNodesandexistingHashesare rawfilecolumn values from the DB. On Windows, a DB row stored with back-slashes (src\a.gleam) would fail theexpected.has(rel)check even when the file still exists on disk, causing a false-positive stale detection and an unintended purge of live rows. The same path-normalisation mismatch already exists in themissingRelloop above (where it causes under-detection rather than over-deletion), so this is a pre-existing assumption — but the consequence is worse on the new purge path. Worth adding anormalizePathcall onrelinsideconsiderif Windows support is ever on the table, or a comment documenting the invariant that DB paths are always forward-slash-normalised.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 8af09cc.
consider()now appliesrawRel.replace(/\\/g, '/')before comparing againstexpected(which is already forward-slash-normalised), so a stale DB row carrying back-slashes — e.g. one migrated from a Windows-built DB — no longer triggers a false-positive purge of a live file. I used an explicit regex replace rather thannormalizePathbecausenormalizePathonly touchespath.sep, so it would be a no-op on POSIX against a Windows-flavoured string. Added a regression test inwasm-only-stale-files.test.tsfor the back-slash case.