From b17f1bdc0b2bc1629bdb5a1b92c58b79d19af07c Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 29 Apr 2026 22:20:18 -0600 Subject: [PATCH 1/3] perf(native): batch-load file/symbol node IDs in edges phase (#1013) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces per-import and per-file `conn.query_row` calls in `build_import_edges` and `build_and_insert_call_edges` with one-shot HashMap pre-loads. Each `query_row` ran a fresh sqlite3_prepare/step/ finalize cycle; on a ~470-file repo this paid ~2.3k cycles in the import-edge stage alone, dominating the edges phase. Also chunks the import-edge insert into multi-row VALUES batches (199 rows × 5 params), mirroring `edges_db::do_insert_edges`, to remove the per-row prepared-statement bind/step/reset overhead. Self-build benchmark (codegraph on itself, 743 files): edges: 124.5 ms native vs 193 ms wasm (was 310 ms / 179 ms in 3.9.5) roles: 73.7 ms native vs 70 ms wasm (was 269 ms / 62 ms in 3.9.5) Roles converged as a side effect — its prior tail was driven by shared SQLite cache pressure during the same build session. Closes #1013 (docs check acknowledged: internal perf fix, no API/language/feature surface changes) --- crates/codegraph-core/src/build_pipeline.rs | 30 +++-- crates/codegraph-core/src/import_edges.rs | 135 +++++++++++++++----- 2 files changed, 123 insertions(+), 42 deletions(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 92ef2b1f..364b02c8 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -1081,6 +1081,25 @@ fn build_and_insert_call_edges( .map(String::from) .collect(); + // Pre-load every file node ID into a HashMap with one query, replacing + // the per-file `query_row` cycle that paid a fresh sqlite3_prepare for + // each entry in `file_symbols` (#1013). + let file_node_ids: HashMap = { + let mut map = HashMap::new(); + if let Ok(mut stmt) = + conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0") + { + if let Ok(rows) = stmt.query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32)) + }) { + for r in rows.flatten() { + map.insert(r.0, r.1); + } + } + } + map + }; + // Build FileEdgeInput entries for the native edge builder let mut file_entries: Vec = Vec::new(); for (rel_path, symbols) in file_symbols { @@ -1088,14 +1107,9 @@ fn build_and_insert_call_edges( continue; } - // Look up file node ID - let file_node_id: u32 = match conn.query_row( - "SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0", - [rel_path, rel_path], - |row| row.get::<_, i64>(0), - ) { - Ok(id) => id as u32, - Err(_) => continue, + let file_node_id: u32 = match file_node_ids.get(rel_path) { + Some(&id) => id, + None => continue, }; // Build imported names from resolved imports diff --git a/crates/codegraph-core/src/import_edges.rs b/crates/codegraph-core/src/import_edges.rs index f0f517c2..251727a2 100644 --- a/crates/codegraph-core/src/import_edges.rs +++ b/crates/codegraph-core/src/import_edges.rs @@ -153,24 +153,48 @@ pub fn detect_barrel_only_files(ctx: &ImportEdgeContext) -> HashSet { barrel_only } -/// Look up a file node ID from the database. -fn get_file_node_id(conn: &Connection, rel_path: &str) -> Option { - conn.query_row( - "SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0", - [rel_path, rel_path], - |row| row.get(0), - ) - .ok() +/// Load every file node ID into a HashMap in one query — replaces per-import +/// `conn.query_row` lookups that paid the SQLite prepare/execute cycle on each +/// call (#1013). +fn load_file_node_ids(conn: &Connection) -> HashMap { + let mut map = HashMap::new(); + if let Ok(mut stmt) = + conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0") + { + if let Ok(rows) = stmt.query_map([], |row| { + Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)) + }) { + for r in rows.flatten() { + map.insert(r.0, r.1); + } + } + } + map } -/// Look up the first symbol node ID by name and file (for type-only import resolution). -fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option { - conn.query_row( - "SELECT id FROM nodes WHERE name = ? AND file = ? AND kind != 'file' LIMIT 1", - [name, file], - |row| row.get(0), - ) - .ok() +/// Load every (name, file) -> id mapping for non-file nodes in one query. +/// Mirrors the JS `nodesByNameAndFile` lookup map; preserves the first-row +/// semantics of the legacy `LIMIT 1` query by keeping the first ID seen per +/// key. Skipped entirely when no type-only imports exist (saves one full +/// scan of `nodes` on the common case). +fn load_symbol_node_ids(conn: &Connection) -> HashMap<(String, String), i64> { + let mut map = HashMap::new(); + if let Ok(mut stmt) = + conn.prepare("SELECT name, file, id FROM nodes WHERE kind != 'file'") + { + if let Ok(rows) = stmt.query_map([], |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, i64>(2)?, + )) + }) { + for r in rows.flatten() { + map.entry((r.0, r.1)).or_insert(r.2); + } + } + } + map } /// Build import edges from parsed file symbols. @@ -185,10 +209,23 @@ fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec { let mut edges = Vec::new(); + // Pre-load all file node IDs once. Previously this was N x query_row, + // each of which ran a fresh sqlite3_prepare/step/finalize cycle (#1013). + let file_node_ids = load_file_node_ids(conn); + let needs_symbol_map = ctx + .file_symbols + .values() + .any(|s| s.imports.iter().any(|i| i.type_only.unwrap_or(false))); + let symbol_node_ids = if needs_symbol_map { + load_symbol_node_ids(conn) + } else { + HashMap::new() + }; + for (rel_path, symbols) in &ctx.file_symbols { let is_barrel_only = ctx.barrel_only_files.contains(rel_path); - let file_node_id = match get_file_node_id(conn, rel_path) { - Some(id) => id, + let file_node_id = match file_node_ids.get(rel_path) { + Some(&id) => id, None => continue, }; @@ -203,8 +240,8 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec id, + let target_id = match file_node_ids.get(&resolved_path) { + Some(&id) => id, None => continue, }; @@ -238,7 +275,9 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec Vec "imports-type", "dynamic-imports" => "dynamic-imports", @@ -286,7 +325,15 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec tx, Err(_) => return, }; - if let Ok(mut stmt) = tx.prepare( - "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES (?, ?, ?, ?, ?)", - ) { - for e in edges { - let _ = stmt.execute(rusqlite::params![ - e.source_id, - e.target_id, - e.kind, - e.confidence, - e.dynamic, - ]); + + for chunk in edges.chunks(INSERT_CHUNK) { + let placeholders: Vec = (0..chunk.len()) + .map(|i| { + let base = i * 5; + format!( + "(?{},?{},?{},?{},?{})", + base + 1, + base + 2, + base + 3, + base + 4, + base + 5 + ) + }) + .collect(); + let sql = format!( + "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}", + placeholders.join(",") + ); + let mut stmt = match tx.prepare_cached(&sql) { + Ok(s) => s, + Err(_) => continue, + }; + for (i, edge) in chunk.iter().enumerate() { + let base = i * 5; + let _ = stmt.raw_bind_parameter(base + 1, edge.source_id); + let _ = stmt.raw_bind_parameter(base + 2, edge.target_id); + let _ = stmt.raw_bind_parameter(base + 3, edge.kind.as_str()); + let _ = stmt.raw_bind_parameter(base + 4, edge.confidence); + let _ = stmt.raw_bind_parameter(base + 5, edge.dynamic); } + let _ = stmt.raw_execute(); } let _ = tx.commit(); } From 1081b64d82d49c143f638b30ea24c507a268dadd Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 29 Apr 2026 23:01:34 -0600 Subject: [PATCH 2/3] fix(native): bound symbol-node lookup and restore name=file guard (#1028) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address two P2 review concerns from greptile on #1013: 1. **Bounded symbol-node lookup.** `load_symbol_node_ids` previously did an unbounded `SELECT name, file, id FROM nodes WHERE kind != 'file'`, loading every non-file symbol into memory. On 100k+-symbol monorepos this could push hundreds of MB. Now we walk type-only imports up front to collect the distinct `(name, file)` pairs we'll actually need, then issue a chunked `(name, file) IN (...)` query (332 pairs per chunk × 2 binds = 664, safely under `SQLITE_MAX_VARIABLE_NUMBER`). The full-scan path is gone; only the symbols referenced by type-only imports are hit, preserving the one-round-trip win without the memory blow-up. 2. **Restore `name = file` guard on file-node lookups.** The original per-row query bound `rel_path` to both `name = ?` and `file = ?`, matching only nodes where the two columns agreed. The bulk query keyed on `file` alone, so an unrelated file-kind row sharing the same `file` value (different `name`) could silently overwrite the map entry. Add `AND name = file` to both `load_file_node_ids` and the parallel pre-load in `build_and_insert_call_edges` to keep the legacy semantics explicit. --- crates/codegraph-core/src/build_pipeline.rs | 12 +- crates/codegraph-core/src/import_edges.rs | 129 +++++++++++++++----- 2 files changed, 109 insertions(+), 32 deletions(-) diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index 364b02c8..72644d4d 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -1084,11 +1084,17 @@ fn build_and_insert_call_edges( // Pre-load every file node ID into a HashMap with one query, replacing // the per-file `query_row` cycle that paid a fresh sqlite3_prepare for // each entry in `file_symbols` (#1013). + // + // The `name = file` predicate matches the legacy per-row lookup + // (`WHERE name = ? AND file = ?` with both binds set to `rel_path`). + // For file-kind nodes `name` and `file` are conventionally identical, + // but keeping the guard prevents an unrelated row from silently + // overwriting the map entry for `file` (#1028 review). let file_node_ids: HashMap = { let mut map = HashMap::new(); - if let Ok(mut stmt) = - conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0") - { + if let Ok(mut stmt) = conn.prepare( + "SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file", + ) { if let Ok(rows) = stmt.query_map([], |row| { Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32)) }) { diff --git a/crates/codegraph-core/src/import_edges.rs b/crates/codegraph-core/src/import_edges.rs index 251727a2..23a90b70 100644 --- a/crates/codegraph-core/src/import_edges.rs +++ b/crates/codegraph-core/src/import_edges.rs @@ -156,11 +156,17 @@ pub fn detect_barrel_only_files(ctx: &ImportEdgeContext) -> HashSet { /// Load every file node ID into a HashMap in one query — replaces per-import /// `conn.query_row` lookups that paid the SQLite prepare/execute cycle on each /// call (#1013). +/// +/// Includes the explicit `name = file` predicate that matched the legacy +/// per-row lookup (`WHERE name = ? AND file = ?` with both binds set to +/// `rel_path`). For file-kind nodes `name` and `file` are conventionally +/// identical, but keeping the guard prevents an unrelated row from silently +/// overwriting the map entry for `file`. fn load_file_node_ids(conn: &Connection) -> HashMap { let mut map = HashMap::new(); - if let Ok(mut stmt) = - conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0") - { + if let Ok(mut stmt) = conn.prepare( + "SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file", + ) { if let Ok(rows) = stmt.query_map([], |row| { Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)) }) { @@ -172,31 +178,95 @@ fn load_file_node_ids(conn: &Connection) -> HashMap { map } -/// Load every (name, file) -> id mapping for non-file nodes in one query. -/// Mirrors the JS `nodesByNameAndFile` lookup map; preserves the first-row -/// semantics of the legacy `LIMIT 1` query by keeping the first ID seen per -/// key. Skipped entirely when no type-only imports exist (saves one full -/// scan of `nodes` on the common case). -fn load_symbol_node_ids(conn: &Connection) -> HashMap<(String, String), i64> { - let mut map = HashMap::new(); - if let Ok(mut stmt) = - conn.prepare("SELECT name, file, id FROM nodes WHERE kind != 'file'") - { - if let Ok(rows) = stmt.query_map([], |row| { - Ok(( - row.get::<_, String>(0)?, - row.get::<_, String>(1)?, - row.get::<_, i64>(2)?, - )) - }) { - for r in rows.flatten() { - map.entry((r.0, r.1)).or_insert(r.2); +/// Load symbol node IDs for the supplied `(name, file)` pairs in one chunked +/// query. Mirrors the JS `nodesByNameAndFile` lookup map; preserves the +/// first-row semantics of the legacy `LIMIT 1` query by keeping the first ID +/// seen per key. +/// +/// The pairs are pre-computed by walking the type-only imports in +/// `ctx.file_symbols`, so we never scan the entire `nodes` table — even on +/// monorepos with 100k+ symbols, only the small slice actually referenced by +/// type-only imports is hit (#1013, #1028 review). +fn load_symbol_node_ids( + conn: &Connection, + needed_pairs: &HashSet<(String, String)>, +) -> HashMap<(String, String), i64> { + let mut map: HashMap<(String, String), i64> = HashMap::new(); + if needed_pairs.is_empty() { + return map; + } + + // 332 pairs × 2 params + 1 spare = 665 binds, comfortably under + // `SQLITE_MAX_VARIABLE_NUMBER`'s legacy 999 default. + const SYMBOL_LOOKUP_CHUNK: usize = 332; + + let pairs: Vec<&(String, String)> = needed_pairs.iter().collect(); + for chunk in pairs.chunks(SYMBOL_LOOKUP_CHUNK) { + let placeholders: Vec = (0..chunk.len()) + .map(|i| { + let base = i * 2; + format!("(?{},?{})", base + 1, base + 2) + }) + .collect(); + let sql = format!( + "SELECT name, file, id FROM nodes WHERE kind != 'file' AND (name, file) IN ({})", + placeholders.join(",") + ); + let mut params: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(chunk.len() * 2); + for (name, file) in chunk { + params.push(name); + params.push(file); + } + + if let Ok(mut stmt) = conn.prepare(&sql) { + if let Ok(rows) = stmt.query_map(rusqlite::params_from_iter(params.iter()), |row| { + Ok(( + row.get::<_, String>(0)?, + row.get::<_, String>(1)?, + row.get::<_, i64>(2)?, + )) + }) { + for r in rows.flatten() { + map.entry((r.0, r.1)).or_insert(r.2); + } } } } map } +/// Walk type-only imports in `ctx.file_symbols` and return the distinct +/// `(name, file)` pairs that `build_import_edges` will need to look up. +/// Resolves barrel files the same way the edge-building loop does so the +/// pre-computed set matches the actual lookup keys. +fn collect_type_only_lookup_pairs(ctx: &ImportEdgeContext) -> HashSet<(String, String)> { + let mut pairs = HashSet::new(); + for (rel_path, symbols) in &ctx.file_symbols { + let abs_file = Path::new(&ctx.root_dir).join(rel_path); + let abs_str = abs_file.to_str().unwrap_or(""); + for imp in &symbols.imports { + if !imp.type_only.unwrap_or(false) { + continue; + } + let resolved_path = ctx.get_resolved(abs_str, &imp.source); + for name in &imp.names { + let clean_name = name.strip_prefix("* as ").unwrap_or(name); + let mut target_file = resolved_path.clone(); + if ctx.is_barrel_file(&resolved_path) { + let mut visited = HashSet::new(); + if let Some(actual) = + ctx.resolve_barrel_export(&resolved_path, clean_name, &mut visited) + { + target_file = actual; + } + } + pairs.insert((clean_name.to_string(), target_file)); + } + } + } + pairs +} + /// Build import edges from parsed file symbols. /// /// For each file's imports, resolves the target path and creates edges: @@ -212,14 +282,15 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec Date: Wed, 29 Apr 2026 23:07:58 -0600 Subject: [PATCH 3/3] fix(native): surface bind/execute errors in insert_edges (#1028) Greptile flagged two P2 issues in `insert_edges`: 1. Silent bind failures: `let _ = stmt.raw_bind_parameter(...)` silently discarded errors. A failed bind would leave that position unbound (NULL), causing `raw_execute()` to insert ghost edge rows with NULL `source_id`/`target_id`. Bind/execute now run inside a fallible `insert_edge_chunk` helper; failures emit a stderr warning and the chunk is skipped instead of producing partial rows. 2. `prepare_cached` mismatched with dynamic SQL: the SQL string varies with chunk length, so trailing partial chunks were always cache misses. Switched to plain `tx.prepare(&sql)` to match intent. No behavioural change for the success path. cargo test -p codegraph-core --lib: 181 passed. --- crates/codegraph-core/src/import_edges.rs | 89 +++++++++++++++-------- 1 file changed, 57 insertions(+), 32 deletions(-) diff --git a/crates/codegraph-core/src/import_edges.rs b/crates/codegraph-core/src/import_edges.rs index 23a90b70..45847692 100644 --- a/crates/codegraph-core/src/import_edges.rs +++ b/crates/codegraph-core/src/import_edges.rs @@ -404,49 +404,74 @@ const INSERT_CHUNK: usize = 199; /// /// Replaces the previous one-prepared-statement-per-row pattern that paid a /// per-edge bind/step/reset cycle. With the chunked path each chunk runs a -/// single VM execution against a cached statement (#1013). +/// single VM execution against a freshly prepared statement (#1013). +/// +/// Bind/execute errors are surfaced via a stderr warning and the offending +/// chunk is skipped — silently swallowing them previously could produce +/// `NULL` columns in the inserted edge rows. pub fn insert_edges(conn: &Connection, edges: &[EdgeRow]) { if edges.is_empty() { return; } let tx = match conn.unchecked_transaction() { Ok(tx) => tx, - Err(_) => return, + Err(e) => { + eprintln!("[codegraph] insert_edges: failed to start transaction: {e}"); + return; + } }; for chunk in edges.chunks(INSERT_CHUNK) { - let placeholders: Vec = (0..chunk.len()) - .map(|i| { - let base = i * 5; - format!( - "(?{},?{},?{},?{},?{})", - base + 1, - base + 2, - base + 3, - base + 4, - base + 5 - ) - }) - .collect(); - let sql = format!( - "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}", - placeholders.join(",") - ); - let mut stmt = match tx.prepare_cached(&sql) { - Ok(s) => s, - Err(_) => continue, - }; - for (i, edge) in chunk.iter().enumerate() { - let base = i * 5; - let _ = stmt.raw_bind_parameter(base + 1, edge.source_id); - let _ = stmt.raw_bind_parameter(base + 2, edge.target_id); - let _ = stmt.raw_bind_parameter(base + 3, edge.kind.as_str()); - let _ = stmt.raw_bind_parameter(base + 4, edge.confidence); - let _ = stmt.raw_bind_parameter(base + 5, edge.dynamic); + if let Err(e) = insert_edge_chunk(&tx, chunk) { + eprintln!( + "[codegraph] insert_edges: skipped chunk of {} rows due to error: {e}", + chunk.len() + ); } - let _ = stmt.raw_execute(); } - let _ = tx.commit(); + if let Err(e) = tx.commit() { + eprintln!("[codegraph] insert_edges: commit failed: {e}"); + } +} + +/// Bind and execute a single chunk in its own fallible scope so the caller +/// can log the failure and continue with the next chunk. +/// +/// `prepare` (not `prepare_cached`) is used because the SQL string varies +/// with chunk length — caching keyed on dynamic SQL would churn the LRU +/// for every partial trailing chunk and obscure the intent of the cache. +fn insert_edge_chunk( + tx: &rusqlite::Transaction<'_>, + chunk: &[EdgeRow], +) -> rusqlite::Result<()> { + let placeholders: Vec = (0..chunk.len()) + .map(|i| { + let base = i * 5; + format!( + "(?{},?{},?{},?{},?{})", + base + 1, + base + 2, + base + 3, + base + 4, + base + 5 + ) + }) + .collect(); + let sql = format!( + "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}", + placeholders.join(",") + ); + let mut stmt = tx.prepare(&sql)?; + for (i, edge) in chunk.iter().enumerate() { + let base = i * 5; + stmt.raw_bind_parameter(base + 1, edge.source_id)?; + stmt.raw_bind_parameter(base + 2, edge.target_id)?; + stmt.raw_bind_parameter(base + 3, edge.kind.as_str())?; + stmt.raw_bind_parameter(base + 4, edge.confidence)?; + stmt.raw_bind_parameter(base + 5, edge.dynamic)?; + } + stmt.raw_execute()?; + Ok(()) } #[cfg(test)]