diff --git a/Cargo.lock b/Cargo.lock index fb7395f7..fedf6a1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,7 +66,7 @@ checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "codegraph-core" -version = "3.9.4" +version = "3.9.5" dependencies = [ "globset", "ignore", diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 38c71518..92ef2b1f 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -180,30 +180,40 @@ pub fn run_pipeline( }); } - // Track reverse-dep files that need re-parsing for edge reconstruction. - // Also track their relative paths so we can exclude them from analysis_scope — - // reverse-dep files are re-parsed for edge rebuilding but their content didn't - // change, so running AST/complexity/CFG/dataflow on them is wasted work (#761). - let mut reverse_dep_abs_paths: Vec = Vec::new(); - let mut reverse_dep_rel_paths: HashSet = HashSet::new(); + // Save reverse-dep → changed-file edges before purge so we can reconnect + // them to new node IDs after Stage 5 (#1012). This matches the WASM/JS + // strategy and lets us skip re-parsing reverse-dep files entirely: + // parse/insert/structure/roles/analysis all scope to truly-changed files. + let mut saved_reverse_dep_edges: Vec = Vec::new(); + // Files that import a removed file. Save+reconnect doesn't apply (the + // target node is gone for good), but their role records go stale because + // edges to the deleted file's nodes get purged in Stage 3. Reclassify them + // in Stage 8 so fan-out reflects reality. (#1027 review) + let mut removal_reverse_deps: Vec = Vec::new(); // Handle full build: clear all graph data if change_result.is_full_build { let has_embeddings = change_detection::has_embeddings(conn); change_detection::clear_all_graph_data(conn, has_embeddings); } else { - // Incremental: find reverse deps and purge changed files - let changed_rel_paths: HashSet = parse_changes - .iter() - .map(|c| c.rel_path.clone()) - .chain(change_result.removed.iter().cloned()) - .collect(); - - let reverse_deps = if opts.no_reverse_deps.unwrap_or(false) { - HashSet::new() - } else { - change_detection::find_reverse_dependencies(conn, &changed_rel_paths, root_dir) - }; + // Incremental: save reverse-dep edges (if reverse-dep tracking is enabled), + // then purge changed files only. + let changed_paths: Vec = + parse_changes.iter().map(|c| c.rel_path.clone()).collect(); + + if !opts.no_reverse_deps.unwrap_or(false) { + saved_reverse_dep_edges = + change_detection::save_reverse_dep_edges(conn, &changed_paths); + + if !change_result.removed.is_empty() { + let removed_set: HashSet = + change_result.removed.iter().cloned().collect(); + removal_reverse_deps = + change_detection::find_reverse_dependencies(conn, &removed_set, root_dir) + .into_iter() + .collect(); + } + } let files_to_purge: Vec = change_result .removed @@ -211,28 +221,20 @@ pub fn run_pipeline( .chain(parse_changes.iter().map(|c| &c.rel_path)) .cloned() .collect(); - let reverse_dep_list: Vec = reverse_deps.iter().cloned().collect(); - change_detection::purge_changed_files(conn, &files_to_purge, &reverse_dep_list); - - // Track reverse-dep absolute paths so we can re-parse them for edge - // rebuilding. Their nodes are still in the DB (only edges were purged), - // but we need fresh FileSymbols so Stage 7 can reconstruct their - // import and call edges. - for rdep in &reverse_dep_list { - let abs = Path::new(root_dir).join(rdep); - if abs.exists() { - reverse_dep_abs_paths.push(abs.to_str().unwrap_or("").to_string()); - reverse_dep_rel_paths.insert(rdep.clone()); - } - } + // Pass empty reverse_dep_files: purge already deletes both directions + // for changed files (which removes the saved reverse-dep → changed-file + // edges from the live table), and other outgoing edges from reverse-dep + // files remain valid and must NOT be deleted — they will be reconnected + // to new target IDs after insert. + change_detection::purge_changed_files(conn, &files_to_purge, &[]); } // ── Stage 4: Parse files ─────────────────────────────────────────── + // Only truly-changed files are parsed. Reverse-dep files are not re-parsed — + // their edges to changed files are reconstructed via save+reconnect (#1012). let t0 = Instant::now(); - let mut files_to_parse: Vec = + let files_to_parse: Vec = parse_changes.iter().map(|c| c.abs_path.clone()).collect(); - // Include reverse-dep files so their edges are rebuilt after purging - files_to_parse.extend(reverse_dep_abs_paths); let parsed = parallel::parse_files_parallel(&files_to_parse, root_dir, include_dataflow, include_ast); @@ -330,32 +332,36 @@ pub fn run_pipeline( // internal logic. We load nodes from DB and pass to the edge builder. build_and_insert_call_edges(conn, &file_symbols, &import_ctx, !change_result.is_full_build); + // Reconnect saved reverse-dep edges to new node IDs (#1012). Mirrors + // `reconnectReverseDepEdges` in build-edges.ts — for each saved edge, + // look up the new target node and recreate the edge with the original + // source_id (still valid; reverse-dep nodes were never purged). + if !saved_reverse_dep_edges.is_empty() { + let (reconnected, dropped) = + change_detection::reconnect_reverse_dep_edges(conn, &saved_reverse_dep_edges); + if dropped > 0 { + eprintln!( + "[codegraph] reconnect_reverse_dep_edges: {reconnected} reconnected, {dropped} dropped (target nodes not found)" + ); + } + } + timing.edges_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 8: Structure + roles ───────────────────────────────────── let t0 = Instant::now(); let line_count_map = structure::build_line_count_map(&file_symbols, root_dir); + // file_symbols only contains truly-changed files (reverse-deps are not + // re-parsed; their edges are reconnected via save+reconnect — #1012), so + // analysis_scope == changed_files. let changed_files: Vec = file_symbols.keys().cloned().collect(); - // Build analysis_scope excluding reverse-dep files — they were re-parsed for - // edge reconstruction but their content didn't change, so AST/complexity/CFG/ - // dataflow analysis would be redundant (#761). This matches the JS pipeline's - // _reverseDepOnly filtering in run-analyses.ts. let analysis_scope: Option> = if change_result.is_full_build { None } else { - Some( - changed_files - .iter() - .filter(|f| !reverse_dep_rel_paths.contains(f.as_str())) - .cloned() - .collect(), - ) + Some(changed_files.clone()) }; let existing_file_count = structure::get_existing_file_count(conn); - // Use parse_changes.len() for the threshold — changed_files includes - // reverse-dep files added for edge rebuilding, which inflates the count - // and would skip the fast path even for single-file incremental builds. let use_fast_path = !change_result.is_full_build && parse_changes.len() <= FAST_PATH_MAX_CHANGED_FILES && existing_file_count > FAST_PATH_MIN_EXISTING_FILES; @@ -385,15 +391,26 @@ pub fn run_pipeline( timing.structure_ms = t0.elapsed().as_secs_f64() * 1000.0; let t0 = Instant::now(); - // Role classification intentionally uses the full `changed_files` list - // (including reverse-dep files), not `analysis_scope`. Reverse-dep files - // had their edges rebuilt, which can change fan-in/fan-out and therefore - // role assignments — so they must be re-classified even though their - // content didn't change and they are excluded from AST analysis. + // Role classification needs the truly-changed files plus reverse-deps of + // any removed files. `do_classify_incremental` expands to neighbours via + // the edges table, so reverse-deps of *changed* files are picked up + // automatically when their fan-in/fan-out is affected. Reverse-deps of + // *removed* files have to be added explicitly — the deleted file's nodes + // are gone, so neighbour expansion can't reach the importer. Without this + // seed, removal-only builds skip role classification entirely. (#1027) let changed_file_list: Option> = if change_result.is_full_build { None } else { - Some(changed_files) + let mut files = changed_files; + if !removal_reverse_deps.is_empty() { + let existing: HashSet = files.iter().cloned().collect(); + for f in removal_reverse_deps { + if !existing.contains(&f) { + files.push(f); + } + } + } + Some(files) }; if let Some(ref files) = changed_file_list { if !files.is_empty() { diff --git a/crates/codegraph-core/src/change_detection.rs b/crates/codegraph-core/src/change_detection.rs index ff774269..9c5833ae 100644 --- a/crates/codegraph-core/src/change_detection.rs +++ b/crates/codegraph-core/src/change_detection.rs @@ -345,6 +345,157 @@ fn file_mtime_size(path: &str) -> (i64, i64) { } } +/// A reverse-dep edge captured before purge so it can be reconnected to the +/// new target node ID after the changed file's nodes are re-inserted. +#[derive(Debug, Clone)] +pub struct SavedReverseDepEdge { + pub source_id: i64, + pub tgt_name: String, + pub tgt_kind: String, + pub tgt_file: String, + pub tgt_line: i64, + pub edge_kind: String, + pub confidence: f64, + pub dynamic: i64, +} + +/// Save edges from reverse-dep files → changed files BEFORE purge so they +/// can be reconnected to new target node IDs after node insertion (#1012). +/// +/// Mirrors the JS `purgeAndAddReverseDeps` path in `detect-changes.ts`. By +/// saving the edge topology and reconnecting after insert, we avoid the need +/// to re-parse every reverse-dep file just to rebuild its edges. That re-parse +/// is what made the native pipeline scale parse/insert/structure/roles with +/// the full reverse-dep cone (47 files for a 1-file change) instead of just +/// the truly-changed files (1 file). +pub fn save_reverse_dep_edges( + conn: &Connection, + changed_paths: &[String], +) -> Vec { + let mut saved = Vec::new(); + if changed_paths.is_empty() { + return saved; + } + let changed_set: HashSet<&str> = changed_paths.iter().map(|s| s.as_str()).collect(); + + let mut stmt = match conn.prepare( + "SELECT e.source_id, n_tgt.name, n_tgt.kind, n_tgt.file, n_tgt.line, \ + e.kind, e.confidence, e.dynamic, n_src.file \ + FROM edges e \ + JOIN nodes n_src ON e.source_id = n_src.id \ + JOIN nodes n_tgt ON e.target_id = n_tgt.id \ + WHERE n_tgt.file = ?1 AND n_src.file != n_tgt.file", + ) { + Ok(s) => s, + Err(_) => return saved, + }; + + for changed in changed_paths { + let rows = match stmt.query_map([changed], |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, String>(1)?, + row.get::<_, String>(2)?, + row.get::<_, String>(3)?, + row.get::<_, i64>(4)?, + row.get::<_, String>(5)?, + row.get::<_, f64>(6)?, + row.get::<_, i64>(7)?, + row.get::<_, String>(8)?, + )) + }) { + Ok(r) => r, + Err(_) => continue, + }; + for row in rows.flatten() { + // Skip edges whose source is itself being purged — buildEdges will + // re-emit them with correct new IDs. + if changed_set.contains(row.8.as_str()) { + continue; + } + saved.push(SavedReverseDepEdge { + source_id: row.0, + tgt_name: row.1, + tgt_kind: row.2, + tgt_file: row.3, + tgt_line: row.4, + edge_kind: row.5, + confidence: row.6, + dynamic: row.7, + }); + } + } + saved +} + +/// Reconnect saved reverse-dep edges to the new target node IDs. +/// +/// The source node ID is still valid (reverse-dep nodes were never purged). +/// The target was deleted and re-inserted with a new ID — look it up by +/// (name, kind, file) using nearest-line matching, and recreate the edge. +/// Mirrors `reconnectReverseDepEdges` in `build-edges.ts`. +/// +/// Returns (reconnected, dropped) counts. +pub fn reconnect_reverse_dep_edges( + conn: &Connection, + saved: &[SavedReverseDepEdge], +) -> (usize, usize) { + if saved.is_empty() { + return (0, 0); + } + let tx = match conn.unchecked_transaction() { + Ok(tx) => tx, + Err(_) => return (0, 0), + }; + + let mut reconnected = 0usize; + let mut dropped = 0usize; + { + let mut find_stmt = match tx.prepare( + "SELECT id FROM nodes WHERE name = ?1 AND kind = ?2 AND file = ?3 \ + ORDER BY ABS(line - ?4) LIMIT 1", + ) { + Ok(s) => s, + Err(_) => return (0, 0), + }; + let mut insert_stmt = match tx.prepare( + "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) \ + VALUES (?1, ?2, ?3, ?4, ?5)", + ) { + Ok(s) => s, + Err(_) => return (0, 0), + }; + for s in saved { + match find_stmt.query_row( + rusqlite::params![&s.tgt_name, &s.tgt_kind, &s.tgt_file, s.tgt_line], + |row| row.get::<_, i64>(0), + ) { + Ok(new_id) => { + // INSERT OR IGNORE silently swallows duplicate-row constraint + // errors and returns Ok(0). Only count rows that actually + // inserted so the diagnostic counter isn't inflated by no-ops. + match insert_stmt.execute(rusqlite::params![ + s.source_id, + new_id, + &s.edge_kind, + s.confidence, + s.dynamic, + ]) { + Ok(n) if n > 0 => reconnected += 1, + Ok(_) => {} // duplicate skipped by INSERT OR IGNORE + Err(_) => dropped += 1, + } + } + Err(_) => { + dropped += 1; + } + } + } + } + let _ = tx.commit(); + (reconnected, dropped) +} + /// Find files that import from changed files (reverse dependencies). pub fn find_reverse_dependencies( conn: &Connection,