Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 122 additions & 2 deletions src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
initSchema,
MIGRATIONS,
openDb,
purgeFilesData,
releaseAdvisoryLock,
setBuildMeta,
} from '../../../db/index.js';
Expand All @@ -38,6 +39,7 @@ import {
formatDropExtensionSummary,
getActiveEngine,
getInstalledWasmExtensions,
NATIVE_SUPPORTED_EXTENSIONS,
parseFilesWasmForBackfill,
} from '../../parser.js';
import { writeJournalHeader } from '../journal.js';
Expand Down Expand Up @@ -774,6 +776,89 @@ async function tryNativeOrchestrator(
return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
}

/**
* Inputs to {@link computeWasmOnlyStaleFiles}. Sets are passed in so the helper
* is pure and unit-testable independently of `getInstalledWasmExtensions` and
* the `NATIVE_SUPPORTED_EXTENSIONS` global state.
*/
export interface WasmOnlyStaleFilesInput {
/** Distinct `file` values from the `nodes` table. */
existingNodes: ReadonlySet<string>;
/** Distinct `file` values from the `file_hashes` table. */
existingHashes: ReadonlySet<string>;
/** Relative paths currently on disk (from `collectFilesUtil`). */
expected: ReadonlySet<string>;
/** Lowercased extensions whose WASM grammar is installed. */
installedExts: ReadonlySet<string>;
/** Extensions covered by the Rust addon — Rust owns deletion for these. */
nativeSupported: ReadonlySet<string>;
}

/**
* Compute the WASM-only files present in the DB but missing from disk (#1073).
*
* Returns relative paths that:
* - appear in `existingNodes` or `existingHashes` (in DB),
* - are absent from `expected` (not on disk),
* - have an extension installed for WASM, AND
* - have an extension NOT covered by `nativeSupported` — Rust's
* `purge_changed_files` handles deletion for natively-supported extensions
* via its own `detect_removed_files`, so the caller must not double-purge.
*
* Extensions are lowercased before lookup to match the registry and Rust's
* `LanguageKind::from_extension` (which normalises case for the languages
* where both cases are conventional, e.g. R's `.r` / `.R`).
*
* DB paths are forced to forward slashes before comparison with `expected`
* (which is always normalised). The on-disk invariant is that DB rows are
* written with forward slashes, but a stale row written by older code on
* Windows could carry back-slashes — normalising here makes the comparison
* platform-safe and prevents false-positive purges of live rows. We replace
* `\\` explicitly (rather than calling `normalizePath`, which only touches
* `path.sep`) so the defence works when running on POSIX against a DB that
* was migrated from Windows.
*
* Exported for unit testing.
*/
export function computeWasmOnlyStaleFiles(input: WasmOnlyStaleFilesInput): string[] {
const { existingNodes, existingHashes, expected, installedExts, nativeSupported } = input;
const stale: string[] = [];
const seen = new Set<string>();
const consider = (rawRel: string): void => {
const rel = rawRel.replace(/\\/g, '/');
if (expected.has(rel) || seen.has(rel)) return;
const ext = path.extname(rel).toLowerCase();
if (nativeSupported.has(ext)) return;
if (!installedExts.has(ext)) return;
seen.add(rel);
stale.push(rel);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 stale.push(rel) stores the forward-slash-normalised path, but purgeFilesData executes DELETE FROM nodes WHERE file = ? with that value. If the DB row was stored with back-slashes (the Windows-migration case the comment describes), the SQL predicate won't match and the DELETE hits 0 rows — the stale row silently persists, which is exactly the regression this PR is trying to fix.

The dedup key should remain the normalised form (so src/a.gleam and src\a.gleam are still treated as one file), but the value pushed into stale must be the original rawRel so the later WHERE file = ? matches the actual stored path. The existing back-slash test only covers the "file still on disk" path; adding a counterpart with an empty expected set would catch this.

Suggested change
seen.add(rel);
stale.push(rel);
seen.add(rel);
stale.push(rawRel);

Fix in Claude Code

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 4c22410. consider() now pushes rawRel (not the normalised rel) into the stale list, so purgeFilesData's DELETE FROM nodes WHERE file = ? is byte-identical to the stored row. The dedup seen set still uses the normalised form so a path written once with \ and once with / is treated as one entry. Added the counterpart regression test (preserves back-slash form so DELETE matches the actual DB row) covering a stale back-slash row with an empty expected set.

};
for (const rel of existingNodes) consider(rel);
for (const rel of existingHashes) consider(rel);
return stale;
}

/**
* Group relative paths by their lowercased extension. Shape matches the bucket
* type that `formatDropExtensionSummary` consumes, so callers can render a
* log-friendly per-extension summary without going through `classifyNativeDrops`
* when the reason is already known (e.g. the stale-purge path where every path
* is guaranteed `unsupported-by-native`).
*/
function groupByExtension(relPaths: Iterable<string>): Map<string, string[]> {
const buckets = new Map<string, string[]>();
for (const rel of relPaths) {
const ext = path.extname(rel).toLowerCase();
let list = buckets.get(ext);
if (!list) {
list = [];
buckets.set(ext, list);
}
list.push(rel);
}
return buckets;
}

/**
* Backfill files that the native orchestrator silently dropped during parse.
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
Expand Down Expand Up @@ -830,7 +915,22 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
missingRel.push(rel);
missingAbs.push(path.join(ctx.rootDir, rel));
}
if (missingAbs.length === 0) return;

// Detect WASM-only files deleted from disk (#1073). The Rust orchestrator's
// `detect_removed_files` filter (#1070) skips files outside its supported
// extensions, so deletions of WASM-only languages don't reach the native
// purge path. JS doesn't notice either — the rest of this function only
// inserts rows. Result: stale `nodes`/`file_hashes` rows linger across
// incremental rebuilds until the next full rebuild.
const staleRel = computeWasmOnlyStaleFiles({
existingNodes,
existingHashes,
expected,
installedExts,
nativeSupported: NATIVE_SUPPORTED_EXTENSIONS,
});

if (missingAbs.length === 0 && staleRel.length === 0) return;

// Now that we know there's work to do, hand off to better-sqlite3 (needed
// for the INSERT path below).
Expand All @@ -840,6 +940,26 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
ctx.nativeFirstProxy = false;
}

const dbConn = ctx.db as unknown as BetterSqlite3Database;

// Purge WASM-only files that were deleted from disk (#1073). Rust's
// detect_removed_files skips them and the insert path below never visits
// them, so without this their rows would persist across rebuilds until the
// next full rebuild reset the DB.
if (staleRel.length > 0) {
// `computeWasmOnlyStaleFiles` guarantees every path here has an extension
// outside NATIVE_SUPPORTED_EXTENSIONS, so `classifyNativeDrops` would
// always bucket 100% into `unsupported-by-native`. Build the extension
// summary directly to avoid a redundant classification pass.
const staleByExt = groupByExtension(staleRel);
info(
`Detected ${staleRel.length} deleted WASM-only file(s) the native orchestrator skipped; purging stale rows: ${formatDropExtensionSummary(staleByExt)}`,
);
purgeFilesData(dbConn, staleRel);
}

if (missingAbs.length === 0) return;

// Classify drops so users see per-extension reasons instead of just a count
// (#1011). `unsupported-by-native` is a legitimate parser limit (no Rust
// extractor); `native-extractor-failure` indicates a real native bug since
Expand Down Expand Up @@ -888,7 +1008,7 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
exportKeys.push([exp.name, exp.kind, relPath, exp.line]);
}
}
const db = ctx.db as unknown as BetterSqlite3Database;
const db = dbConn;
batchInsertNodes(db, rows);

// Mark exported symbols in batches — mirrors insertDefinitionsAndExports.
Expand Down
132 changes: 132 additions & 0 deletions tests/builder/wasm-only-stale-files.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* Unit tests for `computeWasmOnlyStaleFiles` (#1073).
*
* The Rust orchestrator's `detect_removed_files` filter (#1070) skips files
* outside its supported extensions, so deletions of WASM-only languages don't
* reach the native purge path. The JS-side backfill only inserts rows, so
* without this helper a deleted WASM-only file would leak `nodes`/`file_hashes`
* rows until the next full rebuild.
*
* These tests pass the extension sets as parameters so they remain meaningful
* even when every currently-registered language is natively supported
* (i.e. `installedExts == nativeSupported`). The bug surface re-opens any time
* a new WASM-only language enters the registry before its Rust extractor.
*/
import { describe, expect, it } from 'vitest';
import { computeWasmOnlyStaleFiles } from '../../src/domain/graph/builder/pipeline.js';

const NATIVE = new Set(['.ts', '.js', '.r']);
const INSTALLED = new Set(['.ts', '.js', '.r', '.gleam', '.foo']);

describe('computeWasmOnlyStaleFiles', () => {
it('returns WASM-only files present in DB but missing from disk', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
expected: new Set(['src/b.ts']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/a.gleam']);
});

it('skips natively-supported extensions — Rust owns their deletion path', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/old.ts', 'src/old.r']),
existingHashes: new Set(['src/old.ts', 'src/old.r']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('skips extensions with no installed WASM grammar', () => {
// .bar is not in installedExts — neither engine can parse it, so DB rows
// for it can't have been written by this codepath. Leave them alone.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/x.bar']),
existingHashes: new Set(['src/x.bar']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('catches files that exist only in file_hashes (nodes missing)', () => {
// Legacy DB shape where file_hashes was written but `nodes` was not — the
// backfill should still recognise the file_hashes row as stale.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(),
existingHashes: new Set(['src/leftover.gleam']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/leftover.gleam']);
});

it('catches files that exist only in nodes (file_hashes missing)', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/leftover.gleam']),
existingHashes: new Set(),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/leftover.gleam']);
});

it('deduplicates files appearing in both nodes and file_hashes', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/dup.gleam']),
existingHashes: new Set(['src/dup.gleam']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual(['src/dup.gleam']);
});

it('lowercases extensions to match registry/Rust normalisation', () => {
// R is conventionally written `.R` on disk. The registry and the Rust
// `LanguageKind::from_extension` accept both cases; `installedExts` and
// `nativeSupported` carry the lowercase canonical form.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/Plot.R']),
existingHashes: new Set(['src/Plot.R']),
expected: new Set(),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
// .R lowercases to .r which IS native-supported, so it should be skipped.
expect(stale).toEqual([]);
});

it('returns empty when DB and disk agree', () => {
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src/a.gleam', 'src/b.ts']),
existingHashes: new Set(['src/a.gleam', 'src/b.ts']),
expected: new Set(['src/a.gleam', 'src/b.ts']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});

it('normalises DB paths with back-slashes against forward-slash expected set', () => {
// Defends against false-positive purges on Windows where a stale DB row
// (written by older code) could carry back-slashes while `expected` is
// always normalised. Without `normalizePath` inside `consider`, the file
// would look stale and be purged even though it exists on disk.
const stale = computeWasmOnlyStaleFiles({
existingNodes: new Set(['src\\live.gleam']),
existingHashes: new Set(['src\\live.gleam']),
expected: new Set(['src/live.gleam']),
installedExts: INSTALLED,
nativeSupported: NATIVE,
});
expect(stale).toEqual([]);
});
});
Loading