30 changes: 22 additions & 8 deletions crates/codegraph-core/src/build_pipeline.rs
@@ -1081,21 +1081,35 @@ fn build_and_insert_call_edges(
.map(String::from)
.collect();

// Pre-load every file node ID into a HashMap with one query, replacing
// the per-file `query_row` cycle that paid a fresh sqlite3_prepare for
// each entry in `file_symbols` (#1013).
let file_node_ids: HashMap<String, u32> = {
let mut map = HashMap::new();
if let Ok(mut stmt) =
conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0")
{
if let Ok(rows) = stmt.query_map([], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)? as u32))
}) {
for r in rows.flatten() {
map.insert(r.0, r.1);
}
}
}
map
};
Contributor


P2 Query drops name = ? filter; semantics diverge from old lookup

The original query was:

SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0

Both bind parameters were rel_path, so only nodes where name = file = rel_path were matched.

The new bulk query is SELECT file, id … WHERE kind = 'file' AND line = 0, which no longer checks the name column; if two file-kind nodes share the same file value but different name values, the later row silently overwrites the earlier map entry. The same pattern is repeated in import_edges.rs::load_file_node_ids. While name = file is the overwhelming convention for file nodes, the prior guard was explicit and free; dropping it is a silent behavioral change worth documenting, or worth re-adding as a WHERE name = file predicate if the column is guaranteed redundant.


Contributor Author


Fixed in 1081b64. Added AND name = file to both the load_file_node_ids query in import_edges.rs and the parallel pre-load in build_and_insert_call_edges in build_pipeline.rs. This restores the explicit guard from the legacy per-row lookup (WHERE name = ? AND file = ? with both binds set to rel_path), so an unrelated file-kind row sharing the same file value but different name can no longer silently overwrite the map entry.
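
For reference, a minimal sketch of the amended bulk pre-load with the guard restored, in the same rusqlite style the diff already uses; the exact code landed in 1081b64 may differ in detail:

```rust
use std::collections::HashMap;
use rusqlite::Connection;

/// Bulk pre-load of file-node IDs with the `name = file` guard restored,
/// matching what the old per-row lookup enforced by binding rel_path to
/// both the `name` and `file` columns.
fn load_file_node_ids(conn: &Connection) -> HashMap<String, i64> {
    let mut map = HashMap::new();
    if let Ok(mut stmt) = conn.prepare(
        "SELECT file, id FROM nodes \
         WHERE kind = 'file' AND line = 0 AND name = file",
    ) {
        if let Ok(rows) = stmt.query_map([], |row| {
            Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
        }) {
            for (file, id) in rows.flatten() {
                map.insert(file, id);
            }
        }
    }
    map
}
```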


// Build FileEdgeInput entries for the native edge builder
let mut file_entries: Vec<FileEdgeInput> = Vec::new();
for (rel_path, symbols) in file_symbols {
if import_ctx.barrel_only_files.contains(rel_path) {
continue;
}

// Look up file node ID
let file_node_id: u32 = match conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
[rel_path, rel_path],
|row| row.get::<_, i64>(0),
) {
Ok(id) => id as u32,
Err(_) => continue,
let file_node_id: u32 = match file_node_ids.get(rel_path) {
Some(&id) => id,
None => continue,
};

// Build imported names from resolved imports
135 changes: 101 additions & 34 deletions crates/codegraph-core/src/import_edges.rs
@@ -153,24 +153,48 @@ pub fn detect_barrel_only_files(ctx: &ImportEdgeContext) -> HashSet<String> {
barrel_only
}

/// Look up a file node ID from the database.
fn get_file_node_id(conn: &Connection, rel_path: &str) -> Option<i64> {
conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND kind = 'file' AND file = ? AND line = 0",
[rel_path, rel_path],
|row| row.get(0),
)
.ok()
/// Load every file node ID into a HashMap in one query — replaces per-import
/// `conn.query_row` lookups that paid the SQLite prepare/execute cycle on each
/// call (#1013).
fn load_file_node_ids(conn: &Connection) -> HashMap<String, i64> {
let mut map = HashMap::new();
if let Ok(mut stmt) =
conn.prepare("SELECT file, id FROM nodes WHERE kind = 'file' AND line = 0")
{
if let Ok(rows) = stmt.query_map([], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
}) {
for r in rows.flatten() {
map.insert(r.0, r.1);
}
}
}
map
}

/// Look up the first symbol node ID by name and file (for type-only import resolution).
fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64> {
conn.query_row(
"SELECT id FROM nodes WHERE name = ? AND file = ? AND kind != 'file' LIMIT 1",
[name, file],
|row| row.get(0),
)
.ok()
/// Load every (name, file) -> id mapping for non-file nodes in one query.
/// Mirrors the JS `nodesByNameAndFile` lookup map; preserves the first-row
/// semantics of the legacy `LIMIT 1` query by keeping the first ID seen per
/// key. Skipped entirely when no type-only imports exist (saves one full
/// scan of `nodes` on the common case).
fn load_symbol_node_ids(conn: &Connection) -> HashMap<(String, String), i64> {
let mut map = HashMap::new();
if let Ok(mut stmt) =
conn.prepare("SELECT name, file, id FROM nodes WHERE kind != 'file'")
{
if let Ok(rows) = stmt.query_map([], |row| {
Ok((
row.get::<_, String>(0)?,
row.get::<_, String>(1)?,
row.get::<_, i64>(2)?,
))
}) {
for r in rows.flatten() {
map.entry((r.0, r.1)).or_insert(r.2);
}
}
}
map
}
Contributor


P2 Unbounded full-table scan into memory for symbol nodes

SELECT name, file, id FROM nodes WHERE kind != 'file' has no row limit and loads every non-file symbol into a single HashMap. For large monorepos (100k+ symbols) this can easily push several hundred MB of resident RAM in a single build pass. The old get_symbol_node_id was lazy — it only hit SQLite for the specific (name, file) pairs that were actually needed, and only when type-only imports existed.

Consider capping with a two-phase approach: collect the distinct (name, file) pairs needed while iterating ctx.file_symbols, then issue a single WHERE (name, file) IN (…) query for only those pairs. This keeps the one-round-trip benefit without pulling the entire nodes table into memory.


Contributor Author


Fixed in 1081b64. Replaced the unbounded SELECT name, file, id FROM nodes WHERE kind != 'file' with a chunked (name, file) IN (...) query. build_import_edges now walks type-only imports up front (resolving barrels the same way the edge loop does) to collect the distinct (name, file) pairs that will actually be looked up, then load_symbol_node_ids issues those in 332-pair chunks (664 binds, well under SQLITE_MAX_VARIABLE_NUMBER's legacy 999). The full-table scan path is gone — even on 100k+-symbol monorepos, only the small slice referenced by type-only imports is hit, while preserving the one-round-trip win.
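
A rough sketch of the chunked lookup shape described here, assuming the needed (name, file) pairs have already been collected while walking type-only imports; the function signature, the PAIR_CHUNK name, and the use of rusqlite::params_from_iter are illustrative rather than copied from 1081b64:

```rust
use std::collections::HashMap;
use rusqlite::{params_from_iter, Connection};

/// 332 pairs x 2 binds = 664 parameters per chunk, under the legacy
/// SQLITE_MAX_VARIABLE_NUMBER default of 999.
const PAIR_CHUNK: usize = 332;

/// Look up only the (name, file) pairs that type-only imports reference,
/// issuing one row-value IN query per chunk instead of scanning `nodes`.
fn load_symbol_node_ids(
    conn: &Connection,
    pairs: &[(String, String)],
) -> HashMap<(String, String), i64> {
    let mut map = HashMap::new();
    for chunk in pairs.chunks(PAIR_CHUNK) {
        // Build "(?1,?2),(?3,?4),..." row-value placeholders for this chunk.
        let values: Vec<String> = (0..chunk.len())
            .map(|i| format!("(?{},?{})", i * 2 + 1, i * 2 + 2))
            .collect();
        let sql = format!(
            "SELECT name, file, id FROM nodes \
             WHERE kind != 'file' AND (name, file) IN (VALUES {})",
            values.join(",")
        );
        let mut stmt = match conn.prepare(&sql) {
            Ok(s) => s,
            Err(_) => continue,
        };
        // Flatten the pairs into the bind order the placeholders expect.
        let binds = chunk.iter().flat_map(|(n, f)| [n.as_str(), f.as_str()]);
        if let Ok(rows) = stmt.query_map(params_from_iter(binds), |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, i64>(2)?,
            ))
        }) {
            // Keep the first ID seen per key, preserving the old LIMIT 1 semantics.
            for (name, file, id) in rows.flatten() {
                map.entry((name, file)).or_insert(id);
            }
        }
    }
    map
}
```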


/// Build import edges from parsed file symbols.
@@ -185,10 +209,23 @@ fn get_symbol_node_id(conn: &Connection, name: &str, file: &str) -> Option<i64>
pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<EdgeRow> {
let mut edges = Vec::new();

// Pre-load all file node IDs once. Previously this was N x query_row,
// each of which ran a fresh sqlite3_prepare/step/finalize cycle (#1013).
let file_node_ids = load_file_node_ids(conn);
let needs_symbol_map = ctx
.file_symbols
.values()
.any(|s| s.imports.iter().any(|i| i.type_only.unwrap_or(false)));
let symbol_node_ids = if needs_symbol_map {
load_symbol_node_ids(conn)
} else {
HashMap::new()
};

for (rel_path, symbols) in &ctx.file_symbols {
let is_barrel_only = ctx.barrel_only_files.contains(rel_path);
let file_node_id = match get_file_node_id(conn, rel_path) {
Some(id) => id,
let file_node_id = match file_node_ids.get(rel_path) {
Some(&id) => id,
None => continue,
};

@@ -203,8 +240,8 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
}

let resolved_path = ctx.get_resolved(abs_str, &imp.source);
let target_id = match get_file_node_id(conn, &resolved_path) {
Some(id) => id,
let target_id = match file_node_ids.get(&resolved_path) {
Some(&id) => id,
None => continue,
};

@@ -238,7 +275,9 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
target_file = actual;
}
}
if let Some(sym_id) = get_symbol_node_id(conn, clean_name, &target_file) {
if let Some(&sym_id) =
symbol_node_ids.get(&(clean_name.to_string(), target_file))
{
edges.push(EdgeRow {
source_id: file_node_id,
target_id: sym_id,
@@ -262,7 +301,7 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
if actual_source != resolved_path
&& resolved_sources.insert(actual_source.clone())
{
if let Some(actual_id) = get_file_node_id(conn, &actual_source) {
if let Some(&actual_id) = file_node_ids.get(&actual_source) {
let through_kind = match edge_kind {
"imports-type" => "imports-type",
"dynamic-imports" => "dynamic-imports",
@@ -286,7 +325,15 @@ pub fn build_import_edges(conn: &Connection, ctx: &ImportEdgeContext) -> Vec<Edg
edges
}

/// Batch insert edges into the database.
/// 199 rows × 5 params = 995 bind parameters, safely under the legacy
/// `SQLITE_MAX_VARIABLE_NUMBER` default of 999. Mirrors `edges_db::CHUNK`.
const INSERT_CHUNK: usize = 199;

/// Batch insert edges into the database using multi-row VALUES chunks.
///
/// Replaces the previous one-prepared-statement-per-row pattern that paid a
/// per-edge bind/step/reset cycle. With the chunked path each chunk runs a
/// single VM execution against a cached statement (#1013).
pub fn insert_edges(conn: &Connection, edges: &[EdgeRow]) {
if edges.is_empty() {
return;
@@ -295,18 +342,38 @@ pub fn insert_edges(conn: &Connection, edges: &[EdgeRow]) {
Ok(tx) => tx,
Err(_) => return,
};
if let Ok(mut stmt) = tx.prepare(
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES (?, ?, ?, ?, ?)",
) {
for e in edges {
let _ = stmt.execute(rusqlite::params![
e.source_id,
e.target_id,
e.kind,
e.confidence,
e.dynamic,
]);

for chunk in edges.chunks(INSERT_CHUNK) {
let placeholders: Vec<String> = (0..chunk.len())
.map(|i| {
let base = i * 5;
format!(
"(?{},?{},?{},?{},?{})",
base + 1,
base + 2,
base + 3,
base + 4,
base + 5
)
})
.collect();
let sql = format!(
"INSERT OR IGNORE INTO edges (source_id, target_id, kind, confidence, dynamic) VALUES {}",
placeholders.join(",")
);
let mut stmt = match tx.prepare_cached(&sql) {
Ok(s) => s,
Err(_) => continue,
};
for (i, edge) in chunk.iter().enumerate() {
let base = i * 5;
let _ = stmt.raw_bind_parameter(base + 1, edge.source_id);
let _ = stmt.raw_bind_parameter(base + 2, edge.target_id);
let _ = stmt.raw_bind_parameter(base + 3, edge.kind.as_str());
let _ = stmt.raw_bind_parameter(base + 4, edge.confidence);
let _ = stmt.raw_bind_parameter(base + 5, edge.dynamic);
}
let _ = stmt.raw_execute();
}
let _ = tx.commit();
}