From f04bbb3141dc0581999a294b57107c0740f0318b Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 29 Apr 2026 15:33:22 -0600 Subject: [PATCH 1/4] perf(native): scope incremental rebuild to truly-changed files (#1012) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Native 1-file incremental was 4.3x slower than WASM (876ms vs 203ms) because `run_pipeline` re-parsed the full reverse-dep cone (47 files for a 1-file change), which cascaded into insert/structure/roles/analysis phases. Adopt the WASM save+reconnect strategy: save reverse-dep → changed-file edges before purge, reconnect to new node IDs after Stage 5. Reverse-dep files are no longer re-parsed — only their affected edges are reconstructed. Result: 876ms → 43ms (95% faster, 0.78x WASM). - insertMs: 185ms → 0.2ms - structureMs: 73ms → 3.1ms - rolesMs: 311ms → 18.3ms Edge counts identical between full and 1-file rebuild (37134), confirming no edge loss or duplication. The existing `1-file incremental ratio` gate in `benchmark-parity-gate.mjs` (limit 1.5) enforces the ceiling. --- Cargo.lock | 2 +- crates/codegraph-core/src/build_pipeline.rs | 93 +++++------ crates/codegraph-core/src/change_detection.rs | 149 ++++++++++++++++++ 3 files changed, 189 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb7395f7..fedf6a1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,7 +66,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "codegraph-core" -version = "3.9.4" +version = "3.9.5" dependencies = [ "globset", "ignore", diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 38c71518..0f0787ab 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -180,30 +180,26 @@ pub fn run_pipeline( }); } - // Track reverse-dep files that need re-parsing for edge reconstruction. - // Also track their relative paths so we can exclude them from analysis_scope — - // reverse-dep files are re-parsed for edge rebuilding but their content didn't - // change, so running AST/complexity/CFG/dataflow on them is wasted work (#761). - let mut reverse_dep_abs_paths: Vec = Vec::new(); - let mut reverse_dep_rel_paths: HashSet = HashSet::new(); + // Save reverse-dep → changed-file edges before purge so we can reconnect + // them to new node IDs after Stage 5 (#1012). This matches the WASM/JS + // strategy and lets us skip re-parsing reverse-dep files entirely: + // parse/insert/structure/roles/analysis all scope to truly-changed files. + let mut saved_reverse_dep_edges: Vec = Vec::new(); // Handle full build: clear all graph data if change_result.is_full_build { let has_embeddings = change_detection::has_embeddings(conn); change_detection::clear_all_graph_data(conn, has_embeddings); } else { - // Incremental: find reverse deps and purge changed files - let changed_rel_paths: HashSet = parse_changes - .iter() - .map(|c| c.rel_path.clone()) - .chain(change_result.removed.iter().cloned()) - .collect(); - - let reverse_deps = if opts.no_reverse_deps.unwrap_or(false) { - HashSet::new() - } else { - change_detection::find_reverse_dependencies(conn, &changed_rel_paths, root_dir) - }; + // Incremental: save reverse-dep edges (if reverse-dep tracking is enabled), + // then purge changed files only. + let changed_paths: Vec = + parse_changes.iter().map(|c| c.rel_path.clone()).collect(); + + if !opts.no_reverse_deps.unwrap_or(false) { + saved_reverse_dep_edges = + change_detection::save_reverse_dep_edges(conn, &changed_paths); + } let files_to_purge: Vec = change_result .removed @@ -211,28 +207,20 @@ pub fn run_pipeline( .chain(parse_changes.iter().map(|c| &c.rel_path)) .cloned() .collect(); - let reverse_dep_list: Vec = reverse_deps.iter().cloned().collect(); - change_detection::purge_changed_files(conn, &files_to_purge, &reverse_dep_list); - - // Track reverse-dep absolute paths so we can re-parse them for edge - // rebuilding. Their nodes are still in the DB (only edges were purged), - // but we need fresh FileSymbols so Stage 7 can reconstruct their - // import and call edges. - for rdep in &reverse_dep_list { - let abs = Path::new(root_dir).join(rdep); - if abs.exists() { - reverse_dep_abs_paths.push(abs.to_str().unwrap_or("").to_string()); - reverse_dep_rel_paths.insert(rdep.clone()); - } - } + // Pass empty reverse_dep_files: purge already deletes both directions + // for changed files (which removes the saved reverse-dep → changed-file + // edges from the live table), and other outgoing edges from reverse-dep + // files remain valid and must NOT be deleted — they will be reconnected + // to new target IDs after insert. + change_detection::purge_changed_files(conn, &files_to_purge, &[]); } // ── Stage 4: Parse files ─────────────────────────────────────────── + // Only truly-changed files are parsed. Reverse-dep files are not re-parsed — + // their edges to changed files are reconstructed via save+reconnect (#1012). let t0 = Instant::now(); - let mut files_to_parse: Vec = + let files_to_parse: Vec = parse_changes.iter().map(|c| c.abs_path.clone()).collect(); - // Include reverse-dep files so their edges are rebuilt after purging - files_to_parse.extend(reverse_dep_abs_paths); let parsed = parallel::parse_files_parallel(&files_to_parse, root_dir, include_dataflow, include_ast); @@ -330,32 +318,30 @@ pub fn run_pipeline( // internal logic. We load nodes from DB and pass to the edge builder. build_and_insert_call_edges(conn, &file_symbols, &import_ctx, !change_result.is_full_build); + // Reconnect saved reverse-dep edges to new node IDs (#1012). Mirrors + // `reconnectReverseDepEdges` in build-edges.ts — for each saved edge, + // look up the new target node and recreate the edge with the original + // source_id (still valid; reverse-dep nodes were never purged). + if !saved_reverse_dep_edges.is_empty() { + change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges); + } + timing.edges_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 8: Structure + roles ───────────────────────────────────── let t0 = Instant::now(); let line_count_map = structure::build_line_count_map(&file_symbols, root_dir); + // file_symbols only contains truly-changed files (reverse-deps are not + // re-parsed; their edges are reconnected via save+reconnect — #1012), so + // analysis_scope == changed_files. let changed_files: Vec = file_symbols.keys().cloned().collect(); - // Build analysis_scope excluding reverse-dep files — they were re-parsed for - // edge reconstruction but their content didn't change, so AST/complexity/CFG/ - // dataflow analysis would be redundant (#761). This matches the JS pipeline's - // _reverseDepOnly filtering in run-analyses.ts. let analysis_scope: Option> = if change_result.is_full_build { None } else { - Some( - changed_files - .iter() - .filter(|f| !reverse_dep_rel_paths.contains(f.as_str())) - .cloned() - .collect(), - ) + Some(changed_files.clone()) }; let existing_file_count = structure::get_existing_file_count(conn); - // Use parse_changes.len() for the threshold — changed_files includes - // reverse-dep files added for edge rebuilding, which inflates the count - // and would skip the fast path even for single-file incremental builds. let use_fast_path = !change_result.is_full_build && parse_changes.len() <= FAST_PATH_MAX_CHANGED_FILES && existing_file_count > FAST_PATH_MIN_EXISTING_FILES; @@ -385,11 +371,10 @@ pub fn run_pipeline( timing.structure_ms = t0.elapsed().as_secs_f64() * 1000.0; let t0 = Instant::now(); - // Role classification intentionally uses the full `changed_files` list - // (including reverse-dep files), not `analysis_scope`. Reverse-dep files - // had their edges rebuilt, which can change fan-in/fan-out and therefore - // role assignments — so they must be re-classified even though their - // content didn't change and they are excluded from AST analysis. + // Role classification needs only the truly-changed files — + // `do_classify_incremental` expands to neighbours via the edges table, so + // reverse-dep files are picked up automatically when their fan-in/fan-out + // is affected by edges to/from changed files. let changed_file_list: Option> = if change_result.is_full_build { None } else { diff --git a/crates/codegraph-core/src/change_detection.rs b/crates/codegraph-core/src/change_detection.rs index ff774269..702996b1 100644 --- a/crates/codegraph-core/src/change_detection.rs +++ b/crates/codegraph-core/src/change_detection.rs @@ -345,6 +345,155 @@ fn file_mtime_size(path: &str) -> (i64, i64) { } } +/// A reverse-dep edge captured before purge so it can be reconnected to the +/// new target node ID after the changed file's nodes are re-inserted. +#[derive(Debug, Clone)] +pub struct SavedReverseDepEdge { + pub source_id: i64, + pub tgt_name: String, + pub tgt_kind: String, + pub tgt_file: String, + pub tgt_line: i64, + pub edge_kind: String, + pub confidence: f64, + pub dynamic: i64, +} + +/// Save edges from reverse-dep files → changed files BEFORE purge so they +/// can be reconnected to new target node IDs after node insertion (#1012). +/// +/// Mirrors the JS `purgeAndAddReverseDeps` path in `detect-changes.ts`. By +/// saving the edge topology and reconnecting after insert, we avoid the need +/// to re-parse every reverse-dep file just to rebuild its edges. That re-parse +/// is what made the native pipeline scale parse/insert/structure/roles with +/// the full reverse-dep cone (47 files for a 1-file change) instead of just +/// the truly-changed files (1 file). +pub fn save_reverse_dep_edges( + conn: &Connection, + changed_paths: &[String], +) -> Vec { + let mut saved = Vec::new(); + if changed_paths.is_empty() { + return saved; + } + let changed_set: HashSet<&str> = changed_paths.iter().map(|s| s.as_str()).collect(); + + let mut stmt = match conn.prepare( + "SELECT e.source_id, n_tgt.name, n_tgt.kind, n_tgt.file, n_tgt.line, \ + e.kind, e.confidence, e.dynamic, n_src.file \ + FROM edges e \ + JOIN nodes n_src ON e.source_id = n_src.id \ + JOIN nodes n_tgt ON e.target_id = n_tgt.id \ + WHERE n_tgt.file = ?1 AND n_src.file != n_tgt.file", + ) { + Ok(s) => s, + Err(_) => return saved, + }; + + for changed in changed_paths { + let rows = match stmt.query_map([changed], |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, String>(3)?, + row.get::<_, i64>(4)?, + row.get::<_, String>(5)?, + row.get::<_, f64>(6)?, + row.get::<_, i64>(7)?, + row.get::<_, String>(8)?, + )) + }) { + Ok(r) => r, + Err(_) => continue, + }; + for row in rows.flatten() { + // Skip edges whose source is itself being purged — buildEdges will + // re-emit them with correct new IDs. + if changed_set.contains(row.8.as_str()) { + continue; + } + saved.push(SavedReverseDepEdge { + source_id: row.0, + tgt_name: row.1, + tgt_kind: row.2, + tgt_file: row.3, + tgt_line: row.4, + edge_kind: row.5, + confidence: row.6, + dynamic: row.7, + }); + } + } + saved +} + +/// Reconnect saved reverse-dep edges to the new target node IDs. +/// +/// The source node ID is still valid (reverse-dep nodes were never purged). +/// The target was deleted and re-inserted with a new ID — look it up by +/// (name, kind, file) using nearest-line matching, and recreate the edge. +/// Mirrors `reconnectReverseDepEdges` in `build-edges.ts`. +/// +/// Returns (reconnected, dropped) counts. +pub fn reconnect_reverse_dep_edges( + conn: &Connection, + saved: &[SavedReverseDepEdge], +) -> (usize, usize) { + if saved.is_empty() { + return (0, 0); + } + let tx = match conn.unchecked_transaction() { + Ok(tx) => tx, + Err(_) => return (0, 0), + }; + + let mut reconnected = 0usize; + let mut dropped = 0usize; + { + let mut find_stmt = match tx.prepare( + "SELECT id FROM nodes WHERE name = ?1 AND kind = ?2 AND file = ?3 \ + ORDER BY ABS(line - ?4) LIMIT 1", + ) { + Ok(s) => s, + Err(_) => return (0, 0), + }; + let mut insert_stmt = match tx.prepare( + "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) \ + VALUES (?1, ?2, ?3, ?4, ?5)", + ) { + Ok(s) => s, + Err(_) => return (0, 0), + }; + for s in saved { + match find_stmt.query_row( + rusqlite::params![&s.tgt_name, &s.tgt_kind, &s.tgt_file, s.tgt_line], + |row| row.get::<_, i64>(0), + ) { + Ok(new_id) => { + if insert_stmt + .execute(rusqlite::params![ + s.source_id, + new_id, + &s.edge_kind, + s.confidence, + s.dynamic, + ]) + .is_ok() + { + reconnected += 1; + } + } + Err(_) => { + dropped += 1; + } + } + } + } + let _ = tx.commit(); + (reconnected, dropped) +} + /// Find files that import from changed files (reverse dependencies). pub fn find_reverse_dependencies( conn: &Connection, From 2132d9960a9d14c941e0ef890cb52e97db4a0ae6 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:04:41 -0600 Subject: [PATCH 2/4] fix(native): only count actual inserts in reconnect counter (#1027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit INSERT OR IGNORE returns Ok(0) for duplicate-row no-ops, so the previous `is_ok()` check inflated `reconnected` whenever two saved entries resolved to the same (source_id, new_target_id, kind). Match on the rows-affected count and only increment when n > 0; leave the dropped counter for genuine errors. Diagnostic-only — no functional change to edge data. Impact: 1 functions changed, 0 affected --- crates/codegraph-core/src/change_detection.rs | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/crates/codegraph-core/src/change_detection.rs b/crates/codegraph-core/src/change_detection.rs index 702996b1..9c5833ae 100644 --- a/crates/codegraph-core/src/change_detection.rs +++ b/crates/codegraph-core/src/change_detection.rs @@ -471,17 +471,19 @@ pub fn reconnect_reverse_dep_edges( |row| row.get::<_, i64>(0), ) { Ok(new_id) => { - if insert_stmt - .execute(rusqlite::params![ - s.source_id, - new_id, - &s.edge_kind, - s.confidence, - s.dynamic, - ]) - .is_ok() - { - reconnected += 1; + // INSERT OR IGNORE silently swallows duplicate-row constraint + // errors and returns Ok(0). Only count rows that actually + // inserted so the diagnostic counter isn't inflated by no-ops. + match insert_stmt.execute(rusqlite::params![ + s.source_id, + new_id, + &s.edge_kind, + s.confidence, + s.dynamic, + ]) { + Ok(n) if n > 0 => reconnected += 1, + Ok(_) => {} // duplicate skipped by INSERT OR IGNORE + Err(_) => dropped += 1, } } Err(_) => { From c268648b68640bcf35da5210eb52aec976173b5f Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:04:55 -0600 Subject: [PATCH 3/4] fix(native): log dropped reverse-dep edges during reconnect (#1027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `reconnect_reverse_dep_edges` returns `(reconnected, dropped)` but the caller previously discarded the result. Dropped edges occur when a saved target node can't be re-located after re-insert (e.g. the symbol was renamed or deleted), and silently swallowing them makes partial edge loss during incremental rebuild hard to diagnose. Surface the count via eprintln when dropped > 0 — same observability pattern other native diagnostics use. Impact: 1 functions changed, 0 affected --- crates/codegraph-core/src/build_pipeline.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 0f0787ab..53cafabe 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -323,7 +323,13 @@ pub fn run_pipeline( // look up the new target node and recreate the edge with the original // source_id (still valid; reverse-dep nodes were never purged). if !saved_reverse_dep_edges.is_empty() { - change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges); + let (reconnected, dropped) = + change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges); + if dropped > 0 { + eprintln!( + "[codegraph] reconnect_reverse_dep_edges: {reconnected} reconnected, {dropped} dropped (target nodes not found)" + ); + } } timing.edges_ms = t0.elapsed().as_secs_f64() * 1000.0; From b6baec076410d6216b6230cb028309d0a6528264 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Wed, 29 Apr 2026 20:21:38 -0600 Subject: [PATCH 4/4] fix(native): reclassify reverse-deps of removed files (#1027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a file is removed during incremental build, its nodes are purged along with edges pointing at them. Files that imported the removed file were never re-parsed (correct, by design) but their role records became stale: fan-out dropped silently because do_classify_incremental was only seeded with parsed files. For removal-only builds, classification was skipped entirely. Compute reverse-deps of removed entries before purge (find_reverse_deps with change_result.removed as seed, mirroring WASM/JS) and union them into the seed passed to do_classify_incremental. No re-parse needed — the classifier reads fan-in/fan-out from the post-purge edges table. Impact: 1 functions changed, 0 affected --- crates/codegraph-core/src/build_pipeline.rs | 36 ++++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 53cafabe..92ef2b1f 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -185,6 +185,11 @@ pub fn run_pipeline( // strategy and lets us skip re-parsing reverse-dep files entirely: // parse/insert/structure/roles/analysis all scope to truly-changed files. let mut saved_reverse_dep_edges: Vec = Vec::new(); + // Files that import a removed file. Save+reconnect doesn't apply (the + // target node is gone for good), but their role records go stale because + // edges to the deleted file's nodes get purged in Stage 3. Reclassify them + // in Stage 8 so fan-out reflects reality. (#1027 review) + let mut removal_reverse_deps: Vec = Vec::new(); // Handle full build: clear all graph data if change_result.is_full_build { @@ -199,6 +204,15 @@ pub fn run_pipeline( if !opts.no_reverse_deps.unwrap_or(false) { saved_reverse_dep_edges = change_detection::save_reverse_dep_edges(conn, &changed_paths); + + if !change_result.removed.is_empty() { + let removed_set: HashSet = + change_result.removed.iter().cloned().collect(); + removal_reverse_deps = + change_detection::find_reverse_dependencies(conn, &removed_set, root_dir) + .into_iter() + .collect(); + } } let files_to_purge: Vec = change_result @@ -377,14 +391,26 @@ pub fn run_pipeline( timing.structure_ms = t0.elapsed().as_secs_f64() * 1000.0; let t0 = Instant::now(); - // Role classification needs only the truly-changed files — - // `do_classify_incremental` expands to neighbours via the edges table, so - // reverse-dep files are picked up automatically when their fan-in/fan-out - // is affected by edges to/from changed files. + // Role classification needs the truly-changed files plus reverse-deps of + // any removed files. `do_classify_incremental` expands to neighbours via + // the edges table, so reverse-deps of *changed* files are picked up + // automatically when their fan-in/fan-out is affected. Reverse-deps of + // *removed* files have to be added explicitly — the deleted file's nodes + // are gone, so neighbour expansion can't reach the importer. Without this + // seed, removal-only builds skip role classification entirely. (#1027) let changed_file_list: Option> = if change_result.is_full_build { None } else { - Some(changed_files) + let mut files = changed_files; + if !removal_reverse_deps.is_empty() { + let existing: HashSet = files.iter().cloned().collect(); + for f in removal_reverse_deps { + if !existing.contains(&f) { + files.push(f); + } + } + } + Some(files) }; if let Some(ref files) = changed_file_list { if !files.is_empty() {