diff --git a/src/ast-analysis/engine.ts b/src/ast-analysis/engine.ts index b623a8cc..c9698943 100644 --- a/src/ast-analysis/engine.ts +++ b/src/ast-analysis/engine.ts @@ -421,7 +421,9 @@ async function ensureWasmTreesIfNeeded( if (needsWasmTrees) { try { const { ensureWasmTrees } = await getParserModule(); - await ensureWasmTrees(fileSymbols, rootDir); + await ensureWasmTrees(fileSymbols, rootDir, (relPath, symbols) => + fileNeedsWasmTree(relPath, symbols, flags), + ); } catch (err: unknown) { debug(`ensureWasmTrees failed: ${toErrorMessage(err)}`); } diff --git a/src/ast-analysis/visitors/ast-store-visitor.ts b/src/ast-analysis/visitors/ast-store-visitor.ts index f825bc10..53a983ea 100644 --- a/src/ast-analysis/visitors/ast-store-visitor.ts +++ b/src/ast-analysis/visitors/ast-store-visitor.ts @@ -131,6 +131,21 @@ function extractChildExpressionText(node: TreeSitterNode): string | null { return truncate(node.text); } +/** + * Count code points cheaply: skip the `[...s]` spread when `s.length` already + * decides the answer. Each code point is 1 or 2 UTF-16 units, so `.length < 2` + * implies `< 2` code points and `.length >= 3` already guarantees `>= 2` code + * points (worst case: one surrogate pair + one BMP char = 2 code points). + * Only `.length === 2` is genuinely ambiguous (could be a single surrogate + * pair = 1 code point, or two BMP chars = 2 code points) and needs the spread. + */ +function codePointCountAtLeast2(s: string): boolean { + const len = s.length; + if (len < 2) return false; + if (len >= 3) return true; + return [...s].length >= 2; +} + /** * Extract string content from a string-literal node, mirroring the native * engine's `build_string_node` (`helpers.rs`). 
Returns `null` when the @@ -142,15 +157,27 @@ function extractStringContent(node: TreeSitterNode, cfg: AstStringConfig): string | null { let s = raw; s = trimLeadingChars(s, '@'); - s = trimLeadingChars(s, cfg.stringPrefixes); + if (cfg.stringPrefixes) s = trimLeadingChars(s, cfg.stringPrefixes); if (isRawString) s = trimLeadingChars(s, 'r#'); s = trimLeadingChars(s, cfg.quoteChars); if (isRawString) s = trimTrailingChars(s, '#'); s = trimTrailingChars(s, cfg.quoteChars); - // Count code points, not UTF-16 code units — matches Rust `chars().count()`. - const codePointCount = [...s].length; - if (codePointCount < 2) return null; + return codePointCountAtLeast2(s) ? s : null; +} + +// Per-astTypeMap cache for the set of node-types that map to kind 'new'. +// Computed once per unique astTypeMap reference (one per language) instead +// of once per file. +const _newTypesCache = new WeakMap<Record<string, string>, Set<string>>(); +function newTypesFor(astTypeMap: Record<string, string>): Set<string> { + let s = _newTypesCache.get(astTypeMap); + if (s) return s; + s = new Set<string>(); + for (const type in astTypeMap) { + if (astTypeMap[type] === 'new') s.add(type); + } + _newTypesCache.set(astTypeMap, s); return s; } @@ -164,11 +191,12 @@ export function createAstStoreVisitor( ): Visitor { const rows: AstStoreRow[] = []; const matched = new Set(); - const newTypes = new Set( - Object.entries(astTypeMap) - .filter(([, kind]) => kind === 'new') - .map(([type]) => type), - ); + const newTypes = newTypesFor(astTypeMap); + // When nodeIdMap is empty, parentNodeId resolution is wasted work — the + // worker passes an empty map and the main thread re-resolves against its + // own DB-populated map in features/ast.ts::collectFileAstRows. Skip the + // findParentDef linear scan in that case. 
+ const skipParentLookup = nodeIdMap.size === 0; function findParentDef(line: number): Definition | null { let best: Definition | null = null; @@ -183,6 +211,7 @@ } function resolveParentNodeId(line: number): number | null { + if (skipParentLookup) return null; const parentDef = findParentDef(line); if (!parentDef) return null; return nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; diff --git a/src/domain/parser.ts b/src/domain/parser.ts index 6aa19c3c..a1aeedeb 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -316,16 +316,23 @@ export function getParser(parsers: Map, filePath: string) * * Name is preserved for caller compatibility; the function now ensures * *analysis data* rather than *trees*. + * + * `needsFn` (optional): when provided, only files for which it returns true are + * re-parsed. Without it the function falls back to "any WASM-parseable file + * without _tree", which was the source of #1036 — a single file missing one + * analysis triggered a full-build re-parse of every WASM-parseable file. */ export async function ensureWasmTrees( fileSymbols: Map<string, any>, rootDir: string, + needsFn?: (relPath: string, symbols: any) => boolean, ): Promise<void> { // Collect files that still need analysis data and are parseable by WASM. 
const pending: Array<{ relPath: string; absPath: string; symbols: any }> = []; for (const [relPath, symbols] of fileSymbols) { if (symbols._tree) continue; // legacy path — leave existing trees alone if (!_extToLang.has(path.extname(relPath).toLowerCase())) continue; + if (needsFn && !needsFn(relPath, symbols)) continue; pending.push({ relPath, absPath: path.join(rootDir, relPath), symbols }); } if (pending.length === 0) return; diff --git a/src/domain/wasm-worker-entry.ts b/src/domain/wasm-worker-entry.ts index c594850b..e8359a21 100644 --- a/src/domain/wasm-worker-entry.ts +++ b/src/domain/wasm-worker-entry.ts @@ -708,18 +708,18 @@ async function handleParse(msg: WorkerParseRequest): Promise; - if (astRows.length > 0) { - // Strip `file` and `parentNodeId` — main thread re-resolves parent IDs - // against its DB in features/ast.ts::collectFileAstRows, and `file` is - // known from the map key. - serializedAstNodes = astRows.map((n) => ({ - line: n.line, - kind: n.kind, - name: n.name ?? '', - text: n.text ?? undefined, - receiver: n.receiver ?? undefined, - })); - } + // Always set an array (even empty) — leaving astNodes undefined makes + // engine.ts::fileNeedsWasmTree treat the file as un-walked and trigger + // a full ensureWasmTrees re-parse of every WASM-parseable file (#1036). + // Strip `file` and `parentNodeId` — main thread re-resolves both in + // features/ast.ts::collectFileAstRows. + serializedAstNodes = astRows.map((n) => ({ + line: n.line, + kind: n.kind, + name: n.name ?? '', + text: n.text ?? undefined, + receiver: n.receiver ?? 
undefined, + })); } if (complexityVisitor) storeComplexityResults(results, defs, entry.id); diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index cf0e9741..15e266da 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -79,6 +79,20 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * * - 3.9.2:Full build — NativeDbProxy overhead causes native full build to * regress from 5206ms to 9403ms (+81%). Fix tracked in PR #906. + * + * - 3.9.6:Build ms/file / 3.9.6:No-op rebuild — WASM full build regressed + * (#1036) when PR #1016 expanded AST_TYPE_MAPS from 3 to 23 languages, + * causing zero-AST-row files to return `astNodes: undefined` and trigger + * a full-corpus re-parse. Fixed by PR #1038. Benchmarks captured before + * the fix landed; will reclear in v3.9.7+ data. + * + * - 3.9.6:Query time — native query benchmark sample-noise blip (29.4 → 47ms) + * above the natural variance of the small target set. Not reproducible + * locally (~30ms steady-state); will be re-validated on v3.9.7+ data. + * + * - 3.9.6:resolution haskell precision/recall — separate Haskell resolver + * regression introduced in 3.9.6, unrelated to #1036 / PR #1038. Tracked + * in #1039. */ const KNOWN_REGRESSIONS = new Set([ '3.9.0:1-file rebuild', @@ -87,6 +101,11 @@ const KNOWN_REGRESSIONS = new Set([ '3.9.0:fnDeps depth 5', '3.9.1:1-file rebuild', '3.9.2:Full build', + '3.9.6:Build ms/file', + '3.9.6:No-op rebuild', + '3.9.6:Query time', + '3.9.6:resolution haskell precision', + '3.9.6:resolution haskell recall', ]); /**