Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 28 additions & 8 deletions crates/codegraph-core/src/build_pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1081,21 +1081,41 @@ fn build_and_insert_call_edges(
.map(String::from)
.collect();

// Pre-load every file node ID into a HashMap with one query, replacing
// the per-file `query_row` cycle that paid a fresh sqlite3_prepare for
// each entry in `file_symbols` (#1013).
//
// The `name = file` predicate matches the legacy per-row lookup
// (`WHERE name = ? AND file = ?` with both binds set to `rel_path`).
// For file-kind nodes `name` and `file` are conventionally identical,
// but keeping the guard prevents an unrelated row from silently
// overwriting the map entry for `file` (#1028 review).
let file_node_ids: HashMap<String, u32> = {
let mut map = HashMap::new();
if let Ok(mut stmt) = conn.prepare(
"SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file",
) {
if let Ok(rows) = stmt.query_map([], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32))
}) {
for r in rows.flatten() {
map.insert(r.0, r.1);
}
}
}
map
};

// Build FileEdgeInput entries for the native edge builder
let mut file_entries: Vec<FileEdgeInput> = Vec::new();
for (rel_path, symbols) in file_symbols {
if import_ctx.barrel_only_files.contains(rel_path) {
continue;
}

// Look up file node ID
let file_node_id: u32 = match conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
[rel_path, rel_path],
|row| row.get::<_, i64>(0),
) {
Ok(id) => id as u32,
Err(_) => continue,
let file_node_id: u32 = match file_node_ids.get(rel_path) {
Some(&id) => id,
None => continue,
};

// Build imported names from resolved imports
Expand Down
235 changes: 199 additions & 36 deletions crates/codegraph-core/src/import_edges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -153,24 +153,118 @@ pub fn detect_barrel_only_files(ctx: &ImportEdgeContext) -> HashSet<String> {
barrel_only
}

/// Look up a file node ID from the database.
fn get_file_node_id(conn: &Connection, rel_path: &str) -> Option<i64> {
conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
[rel_path, rel_path],
|row| row.get(0),
)
.ok()
/// Load every file node ID into a HashMap in one query — replaces per-import
/// `conn.query_row` lookups that paid the SQLite prepare/execute cycle on each
/// call (#1013).
///
/// Includes the explicit `name = file` predicate that matched the legacy
/// per-row lookup (`WHERE name = ? AND file = ?` with both binds set to
/// `rel_path`). For file-kind nodes `name` and `file` are conventionally
/// identical, but keeping the guard prevents an unrelated row from silently
/// overwriting the map entry for `file`.
///
/// Best-effort: on a prepare/query failure the error is logged to stderr and
/// an empty (or partial) map is returned, so callers simply skip unresolved
/// files — matching the error-reporting style of `insert_edges`.
fn load_file_node_ids(conn: &Connection) -> HashMap<String, i64> {
    let mut map = HashMap::new();
    let mut stmt = match conn.prepare(
        "SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0 AND name = file",
    ) {
        Ok(stmt) => stmt,
        Err(e) => {
            eprintln!("[codegraph] load_file_node_ids: prepare failed: {e}");
            return map;
        }
    };
    match stmt.query_map([], |row| {
        Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
    }) {
        Ok(rows) => {
            // `flatten` drops individual malformed rows; the well-formed
            // remainder is still collected.
            for (file, id) in rows.flatten() {
                map.insert(file, id);
            }
        }
        Err(e) => eprintln!("[codegraph] load_file_node_ids: query failed: {e}"),
    }
    map
}

/// Look up the first symbol node ID by name and file (for type-only import resolution).
fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64> {
conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND file = ? AND kind != 'file' LIMIT 1",
[name, file],
|row| row.get(0),
)
.ok()
/// Load symbol node IDs for the supplied `(name, file)` pairs in one chunked
/// query. Mirrors the JS `nodesByNameAndFile` lookup map; preserves the
/// first-row semantics of the legacy `LIMIT 1` query by keeping the first ID
/// seen per key.
///
/// The pairs are pre-computed by walking the type-only imports in
/// `ctx.file_symbols`, so we never scan the entire `nodes` table — even on
/// monorepos with 100k+ symbols, only the small slice actually referenced by
/// type-only imports is hit (#1013, #1028 review).
///
/// Best-effort: a failing chunk is logged to stderr and skipped, so a schema
/// or SQL regression is visible instead of silently producing zero edges —
/// matching the error-reporting style of `insert_edges`.
fn load_symbol_node_ids(
    conn: &Connection,
    needed_pairs: &HashSet<(String, String)>,
) -> HashMap<(String, String), i64> {
    let mut map: HashMap<(String, String), i64> = HashMap::new();
    if needed_pairs.is_empty() {
        return map;
    }

    // 332 pairs × 2 params + 1 spare = 665 binds, comfortably under
    // `SQLITE_MAX_VARIABLE_NUMBER`'s legacy 999 default.
    const SYMBOL_LOOKUP_CHUNK: usize = 332;

    let pairs: Vec<&(String, String)> = needed_pairs.iter().collect();
    for chunk in pairs.chunks(SYMBOL_LOOKUP_CHUNK) {
        // Row-value `(name, file) IN ((?,?),…)` requires SQLite >= 3.15
        // (2016) — NOTE(review): satisfied by any bundled rusqlite build;
        // confirm if linking against a system SQLite.
        let placeholders: Vec<String> = (0..chunk.len())
            .map(|i| {
                let base = i * 2;
                format!("(?{},?{})", base + 1, base + 2)
            })
            .collect();
        let sql = format!(
            "SELECT name, file, id FROM nodes WHERE kind != 'file' AND (name, file) IN ({})",
            placeholders.join(",")
        );
        let mut params: Vec<&dyn rusqlite::ToSql> = Vec::with_capacity(chunk.len() * 2);
        for (name, file) in chunk {
            params.push(name);
            params.push(file);
        }

        let mut stmt = match conn.prepare(&sql) {
            Ok(stmt) => stmt,
            Err(e) => {
                eprintln!("[codegraph] load_symbol_node_ids: prepare failed: {e}");
                continue;
            }
        };
        // `params_from_iter` consumes the Vec directly — no extra `&&dyn`
        // reference layer from `params.iter()`.
        let rows = match stmt.query_map(rusqlite::params_from_iter(params), |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, i64>(2)?,
            ))
        }) {
            Ok(rows) => rows,
            Err(e) => {
                eprintln!("[codegraph] load_symbol_node_ids: query failed: {e}");
                continue;
            }
        };
        for (name, file, id) in rows.flatten() {
            // `or_insert` keeps the first ID seen per key — the legacy
            // `LIMIT 1` behavior.
            map.entry((name, file)).or_insert(id);
        }
    }
    map
}

/// Walk type-only imports in `ctx.file_symbols` and return the distinct
/// `(name, file)` pairs that `build_import_edges` will need to look up.
/// Barrel files are resolved exactly the way the edge-building loop resolves
/// them, so the pre-computed key set matches the actual lookup keys.
fn collect_type_only_lookup_pairs(ctx: &ImportEdgeContext) -> HashSet<(String, String)> {
    let mut lookup_keys = HashSet::new();
    for (rel_path, symbols) in &ctx.file_symbols {
        let abs_path = Path::new(&ctx.root_dir).join(rel_path);
        let abs = abs_path.to_str().unwrap_or("");
        // Only type-only imports feed the symbol lookup map.
        for import in symbols
            .imports
            .iter()
            .filter(|imp| imp.type_only.unwrap_or(false))
        {
            let resolved = ctx.get_resolved(abs, &import.source);
            for raw_name in &import.names {
                // Namespace imports are keyed by the bare alias.
                let symbol = raw_name.strip_prefix("* as ").unwrap_or(raw_name);
                // Re-exporting barrels point at the file that actually
                // declares the symbol; fall back to the barrel itself when
                // the export can't be traced.
                let target = if ctx.is_barrel_file(&resolved) {
                    let mut seen = HashSet::new();
                    ctx.resolve_barrel_export(&resolved, symbol, &mut seen)
                        .unwrap_or_else(|| resolved.clone())
                } else {
                    resolved.clone()
                };
                lookup_keys.insert((symbol.to_string(), target));
            }
        }
    }
    lookup_keys
}

/// Build import edges from parsed file symbols.
Expand All @@ -185,10 +279,24 @@ fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64>
pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<EdgeRow> {
let mut edges = Vec::new();

// Pre-load all file node IDs once. Previously this was N x query_row,
// each of which ran a fresh sqlite3_prepare/step/finalize cycle (#1013).
let file_node_ids = load_file_node_ids(conn);
// Only the symbols actually referenced by type-only imports are needed —
// skip the lookup entirely when no type-only imports exist (the common
// case), and otherwise issue a chunked `(name, file) IN (...)` query so
// memory stays bounded even on large monorepos (#1028 review).
let needed_symbol_pairs = collect_type_only_lookup_pairs(ctx);
let symbol_node_ids = if needed_symbol_pairs.is_empty() {
HashMap::new()
} else {
load_symbol_node_ids(conn, &needed_symbol_pairs)
};

for (rel_path, symbols) in &ctx.file_symbols {
let is_barrel_only = ctx.barrel_only_files.contains(rel_path);
let file_node_id = match get_file_node_id(conn, rel_path) {
Some(id) => id,
let file_node_id = match file_node_ids.get(rel_path) {
Some(&id) => id,
None => continue,
};

Expand All @@ -203,8 +311,8 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
}

let resolved_path = ctx.get_resolved(abs_str, &imp.source);
let target_id = match get_file_node_id(conn, &resolved_path) {
Some(id) => id,
let target_id = match file_node_ids.get(&resolved_path) {
Some(&id) => id,
None => continue,
};

Expand Down Expand Up @@ -238,7 +346,9 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
target_file = actual;
}
}
if let Some(sym_id) = get_symbol_node_id(conn, clean_name, &target_file) {
if let Some(&sym_id) =
symbol_node_ids.get(&(clean_name.to_string(), target_file))
{
edges.push(EdgeRow {
source_id: file_node_id,
target_id: sym_id,
Expand All @@ -262,7 +372,7 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
if actual_source != resolved_path
&& resolved_sources.insert(actual_source.clone())
{
if let Some(actual_id) = get_file_node_id(conn, &actual_source) {
if let Some(&actual_id) = file_node_ids.get(&actual_source) {
let through_kind = match edge_kind {
"imports-type" => "imports-type",
"dynamic-imports" => "dynamic-imports",
Expand All @@ -286,29 +396,82 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
edges
}

/// Batch insert edges into the database.
/// 199 rows × 5 params = 995 bind parameters, safely under the legacy
/// `SQLITE_MAX_VARIABLE_NUMBER` default of 999. Mirrors `edges_db::CHUNK`.
const INSERT_CHUNK: usize = 199;

/// Batch insert edges into the database using multi-row VALUES chunks.
///
/// Replaces the previous one-prepared-statement-per-row pattern that paid a
/// per-edge bind/step/reset cycle. With the chunked path each chunk runs a
/// single VM execution against a freshly prepared statement (#1013).
///
/// Bind/execute errors are surfaced via a stderr warning and the offending
/// chunk is skipped — silently swallowing them previously could produce
/// `NULL` columns in the inserted edge rows.
pub fn insert_edges(conn: &Connection, edges: &[EdgeRow]) {
if edges.is_empty() {
return;
}
let tx = match conn.unchecked_transaction() {
Ok(tx) => tx,
Err(_) => return,
Err(e) => {
eprintln!("[codegraph] insert_edges: failed to start transaction: {e}");
return;
}
};
if let Ok(mut stmt) = tx.prepare(
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES (?, ?, ?, ?, ?)",
) {
for e in edges {
let _ = stmt.execute(rusqlite::params![
e.source_id,
e.target_id,
e.kind,
e.confidence,
e.dynamic,
]);

for chunk in edges.chunks(INSERT_CHUNK) {
if let Err(e) = insert_edge_chunk(&tx, chunk) {
eprintln!(
"[codegraph] insert_edges: skipped chunk of {} rows due to error: {e}",
chunk.len()
);
}
}
let _ = tx.commit();
if let Err(e) = tx.commit() {
eprintln!("[codegraph] insert_edges: commit failed: {e}");
}
}

/// Bind and execute one INSERT chunk in its own fallible scope so the caller
/// can log the failure and continue with the next chunk.
///
/// `prepare` (not `prepare_cached`) is used because the SQL string varies
/// with chunk length — caching keyed on dynamic SQL would churn the LRU
/// for every partial trailing chunk and obscure the intent of the cache.
fn insert_edge_chunk(
    tx: &rusqlite::Transaction<'_>,
    chunk: &[EdgeRow],
) -> rusqlite::Result<()> {
    // Unnumbered `?` placeholders are assigned indices 1..N left to right,
    // which is exactly the sequence `raw_bind_parameter` walks below.
    let values = vec!["(?,?,?,?,?)"; chunk.len()].join(",");
    let sql = format!(
        "INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}",
        values
    );
    let mut stmt = tx.prepare(&sql)?;
    let mut slot = 1;
    for row in chunk {
        stmt.raw_bind_parameter(slot, row.source_id)?;
        stmt.raw_bind_parameter(slot + 1, row.target_id)?;
        stmt.raw_bind_parameter(slot + 2, row.kind.as_str())?;
        stmt.raw_bind_parameter(slot + 3, row.confidence)?;
        stmt.raw_bind_parameter(slot + 4, row.dynamic)?;
        slot += 5;
    }
    stmt.raw_execute()?;
    Ok(())
}

#[cfg(test)]
Expand Down
Loading