Skip to content

Commit 12683b8

Browse files
committed
feat(graph): queryable computation-bottleneck metrics + indexing perf & fixes
Per-function complexity metadata is now stored on graph nodes and queryable, alongside several indexing performance and correctness fixes developed and validated together (3704 tests, ASan/UBSan clean).

Bottleneck metrics (query via query_graph):
- Tier A (in the extraction AST walk): cyclomatic (complexity), cognitive (nesting-weighted), loop_count, loop_depth (max nested-loop depth), param_count, max_access_depth.
- Tier B (new pre-dump pass, pass_complexity.c): transitive_loop_depth propagated along CALLS edges + a recursive flag (direct self-recursion and mutual-recursion cycles), plus the call-context signals linear_scan_in_loop, alloc_in_loop, recursion_in_loop and unguarded_recursion.
- query_graph and get_architecture tool descriptions document the metrics and the Leiden community clusters.

Cypher engine:
- node_prop exposes arbitrary persisted node properties to WHERE/RETURN.
- Fix projection aliasing: multi-property rows shared a single static buffer so every column returned the last value read; now per-column/rotating buffers.
- Fix a stack-use-after-scope in aggregate RETURN (caller-owned value buffers).

Indexing performance:
- Gate C/C++ #define Macro-node extraction to full mode (it is ~49% of nodes on the Linux kernel); moderate/fast skip it.
- Emit the complexity property block only for Function/Method nodes so the millions of Macro/Field/Variable/Class/Enum nodes no longer carry zeroed fields — large RAM reduction at scale.
- Classify node types via tree-sitter TSSymbol bitsets in cbm_kind_in_set instead of per-node strcmp scans (thread-local cache, strcmp fallback; behaviour-identical).
- Subsample frequent (Zipfian) tokens in the semantic co-occurrence finalize; ~14x faster finalize on the kernel, output unchanged.
- pass_lsp_cross: replace O(n^2) linear dedup with hash-set dedup.

Windows:
- Canonicalize drive-letter case during path normalization so "c:/repo" and "C:/repo" derive the same project key and cache file (#394/#227/#367).
Tests: extraction, pipeline and cypher regressions covering all of the above.
1 parent 64be280 commit 12683b8

23 files changed

Lines changed: 1253 additions & 54 deletions

Makefile.cbm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ PIPELINE_SRCS = \
195195
src/pipeline/pass_k8s.c \
196196
src/pipeline/pass_similarity.c \
197197
src/pipeline/pass_semantic_edges.c \
198+
src/pipeline/pass_complexity.c \
198199
src/pipeline/pass_cross_repo.c \
199200
src/pipeline/artifact.c \
200201
src/pipeline/pass_pkgmap.c

internal/cbm/cbm.c

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <stdint.h> // uint32_t, uint64_t, int64_t
1717
#include <stdlib.h>
1818
#include <string.h>
19+
#include <ctype.h>
1920
#include <time.h> // struct timespec, CLOCK_MONOTONIC
2021

2122
// Atomic counters for profiling parse vs extraction time (nanoseconds).
@@ -28,6 +29,20 @@ static _Atomic uint64_t total_preprocess_ns = 0;
2829
static _Atomic uint64_t total_files_preprocessed = 0;
2930
static _Atomic uint64_t total_files = 0;
3031

32+
// C/C++ preprocessor #define macros are extracted as Macro nodes (#375). On a
33+
// macro-dense codebase (e.g. the Linux kernel: ~2.4M macros, 49% of all nodes)
34+
// this is the dominant extraction cost, so it is gated to the full/advanced
35+
// index modes. Default ON to preserve behavior for direct callers/tests; the
36+
// pipeline sets it from the index mode before extraction. Set once pre-extract,
37+
// read-only during, so a relaxed atomic is sufficient.
38+
static _Atomic int g_extract_macros = 1;
39+
/* Enable (non-zero) or disable (zero) extraction of C/C++ #define Macro nodes.
 * The argument is normalised to 0/1 before being stored in the process-wide
 * flag with relaxed ordering. */
void cbm_set_macro_extraction(int enabled) {
    const int flag = (enabled != 0);
    atomic_store_explicit(&g_extract_macros, flag, memory_order_relaxed);
}
42+
int cbm_macro_extraction_enabled(void) {
43+
return atomic_load_explicit(&g_extract_macros, memory_order_relaxed);
44+
}
45+
3146
#define NSEC_PER_SEC 1000000000ULL
3247
#define USEC_TO_NSEC 1000ULL
3348
/* Use compat.h's cbm_clock_gettime which accepts CLOCK_MONOTONIC (value
@@ -251,6 +266,100 @@ void cbm_shutdown(void) {
251266
cbm_initialized = 0;
252267
}
253268

269+
// --- Bottleneck call-name classification (language-agnostic heuristics) ---
270+
271+
// Compare two NUL-terminated names for equality, ignoring ASCII case.
// Returns true only when both strings have the same length and every
// character pair matches after tolower(). Neither pointer may be NULL.
static bool name_ieq(const char *a, const char *b) {
    while (*a != '\0' && *b != '\0') {
        int lhs = tolower((unsigned char)*a);
        int rhs = tolower((unsigned char)*b);
        if (lhs != rhs) {
            return false;
        }
        a++;
        b++;
    }
    // Equal only if both strings ended at the same position.
    return !*a && !*b;
}
280+
281+
static bool name_in_set(const char *name, const char *const *set) {
282+
for (const char *const *s = set; *s; s++) {
283+
if (name_ieq(name, *s)) {
284+
return true;
285+
}
286+
}
287+
return false;
288+
}
289+
290+
// Does `n` name a linear-scan / membership call? Such a call inside a loop is
// the textbook hidden O(n^2) (cf. Olivo et al., PLDI'15) that syntactic
// loop-depth alone does not reveal. Matching is case-insensitive.
static bool is_linear_scan_name(const char *n) {
    static const char *const scan_names[] = {
        "find",    "indexof",   "contains", "includes", "search",
        "lookup",  "strstr",    "strchr",   "strrchr",  "memchr",
        "find_if", "findindex", "count",    "index",
        NULL, // terminator expected by name_in_set
    };
    return name_in_set(n, scan_names);
}
298+
299+
// Does `n` name an allocation or growable-append call? Repeating one inside a
// loop is the classic accidental reallocation / string-concat O(n^2). The list
// is deliberately conservative: where a name is meaningless for a language it
// simply never matches. Matching is case-insensitive.
static bool is_alloc_name(const char *n) {
    static const char *const alloc_names[] = {
        "malloc",  "calloc",  "realloc",   "strdup",
        "strndup", "append",  "push_back", "emplace_back",
        "concat",  "strcat",  "strncat",   "push",
        "pushback",
        NULL, // terminator expected by name_in_set
    };
    return name_in_set(n, alloc_names);
}
309+
310+
// Count parameters from a signature string like "(int a, Foo* b, cb (*)(int,int))".
// Fallback for languages where param_names isn't populated (e.g. C keeps only the
// signature text).
//
// The parameter list is taken to start after the first '(' in `sig` and to end
// at its matching ')' (or at end of string). Commas are counted only at the top
// nesting level; (), [], {} and <> all open/close a nesting level so commas in
// function-pointer types, array bounds, initialisers and template arguments are
// ignored.
//
// Returns 0 for NULL, for text without '(', for an empty/whitespace-only list
// "()", and for the C spelling "(void)" (any whitespace around `void` allowed).
// A lone parameter whose type merely starts with `void`, e.g. "(void *ctx)",
// counts as 1. Otherwise returns top-level commas + 1.
// Approximate by design (a structural smell, not an exact arity).
static int count_params_from_signature(const char *sig) {
    if (!sig) {
        return 0;
    }
    const char *open = strchr(sig, '(');
    if (!open) {
        return 0;
    }
    const char *list = open + 1;

    int depth = 0;
    int commas = 0;
    bool any = false; /* saw any non-bracket, non-space content in the list */
    for (const char *p = list; *p; p++) {
        char ch = *p;
        if (ch == '(' || ch == '[' || ch == '{' || ch == '<') {
            depth++;
        } else if (ch == ')') {
            if (depth == 0) {
                break; /* closing paren of the parameter list itself */
            }
            depth--;
        } else if (ch == ']' || ch == '}' || ch == '>') {
            if (depth > 0) {
                depth--; /* tolerate unbalanced closers (e.g. a stray '>') */
            }
        } else if (ch == ',' && depth == 0) {
            commas++;
        } else if (!isspace((unsigned char)ch)) {
            any = true;
        }
    }
    if (!any) {
        return 0; /* "()" */
    }
    if (commas == 0) {
        while (isspace((unsigned char)*list)) {
            list++;
        }
        if (strncmp(list, "void", 4) == 0) {
            /* Only a bare `void` means "no parameters". Anything after it other
             * than whitespace (e.g. "void *p", "void* p") is a real parameter. */
            const char *rest = list + 4;
            while (isspace((unsigned char)*rest)) {
                rest++;
            }
            if (*rest == ')' || *rest == '\0') {
                return 0; /* C "(void)" */
            }
        }
    }
    return commas + 1;
}
362+
254363
// --- Main extraction function ---
255364

256365
CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage language,
@@ -396,6 +505,12 @@ CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage
396505
}
397506
atomic_fetch_add(&total_lsp_ns, now_ns() - lsp_start);
398507

508+
// Calls extracted so far all carry ORIGINAL-source line numbers; the C/C++
509+
// preprocessor second pass below appends calls with EXPANDED-source lines,
510+
// which must not be used for the def line-range attribution of the bottleneck
511+
// metrics. Remember the boundary.
512+
int orig_calls_count = result->calls.count;
513+
399514
// Second pass: preprocess C/C++/CUDA and extract additional macro-hidden calls.
400515
// Defs keep original-source line numbers; only CALLS are extracted from expanded source.
401516
if (language == CBM_LANG_C || language == CBM_LANG_CPP || language == CBM_LANG_CUDA) {
@@ -457,6 +572,97 @@ CBMFileResult *cbm_extract_file(const char *source, int source_len, CBMLanguage
457572
atomic_fetch_add(&total_preprocess_ns, now_ns() - pp_start);
458573
}
459574

575+
// Bottleneck call-context metrics. Each call is attributed to the INNERMOST
576+
// enclosing Function/Method def by source-line range (defs and calls in one
577+
// CBMFileResult share the same file). Range matching is used instead of
578+
// enclosing_func_qn string matching because some grammars (notably C, whose
579+
// function_definition has no "name" field) attribute the call's scope to the
580+
// module rather than the function — line ranges are unambiguous and
581+
// language-agnostic. Bounded per file (defs x calls), not a repo-scale scan.
582+
int def_count = result->defs.count;
583+
bool *has_self = def_count > 0 ? calloc((size_t)def_count, sizeof(bool)) : NULL;
584+
bool *has_guarded = def_count > 0 ? calloc((size_t)def_count, sizeof(bool)) : NULL;
585+
586+
// param_count is a standalone structural smell (independent of calls). Prefer
587+
// the parsed param_names array; fall back to counting from the signature text
588+
// for languages (e.g. C) that populate only the signature.
589+
for (int di = 0; di < def_count; di++) {
590+
CBMDefinition *d = &result->defs.items[di];
591+
int pc = 0;
592+
if (d->param_names) {
593+
while (d->param_names[pc]) {
594+
pc++;
595+
}
596+
}
597+
if (pc == 0 && d->signature) {
598+
pc = count_params_from_signature(d->signature);
599+
}
600+
d->param_count = pc;
601+
}
602+
603+
for (int ci = 0; ci < orig_calls_count; ci++) {
604+
const CBMCall *c = &result->calls.items[ci];
605+
if (!c->callee_name || c->start_line <= 0) {
606+
continue;
607+
}
608+
// Innermost enclosing Function/Method def by line range (smallest span).
609+
int best = -1;
610+
int best_span = -1;
611+
for (int di = 0; di < def_count; di++) {
612+
const CBMDefinition *d = &result->defs.items[di];
613+
if (!d->name || !d->label ||
614+
(strcmp(d->label, "Function") != 0 && strcmp(d->label, "Method") != 0)) {
615+
continue;
616+
}
617+
if ((int)d->start_line <= c->start_line && c->start_line <= (int)d->end_line) {
618+
int span = (int)d->end_line - (int)d->start_line;
619+
if (best < 0 || span < best_span) {
620+
best_span = span;
621+
best = di;
622+
}
623+
}
624+
}
625+
if (best < 0) {
626+
continue;
627+
}
628+
CBMDefinition *d = &result->defs.items[best];
629+
// callee_name may be bare ("recur") or qualified ("pkg.recur", "self.recur")
630+
const char *dot = strrchr(c->callee_name, '.');
631+
const char *callee_short = dot ? dot + 1 : c->callee_name;
632+
bool in_loop = c->loop_depth > 0;
633+
634+
if (strcmp(callee_short, d->name) == 0) {
635+
// Direct self-recursion. The call graph omits self-edges (pass_calls
636+
// skips source==target), so detect it here; seeds "recursive".
637+
d->is_recursive = true;
638+
if (has_self) {
639+
has_self[best] = true;
640+
}
641+
if (in_loop) {
642+
d->recursion_in_loop = true; // recursion compounded by a loop
643+
}
644+
if (c->branch_depth > 0 && has_guarded) {
645+
has_guarded[best] = true; // a self-call guarded by some conditional
646+
}
647+
}
648+
if (in_loop && is_linear_scan_name(callee_short)) {
649+
d->linear_scan_in_loop++; // hidden O(n^2): linear scan inside a loop
650+
}
651+
if (in_loop && is_alloc_name(callee_short)) {
652+
d->alloc_in_loop++; // repeated allocation/append inside a loop
653+
}
654+
}
655+
656+
// Recursive with no self-call guarded by any conditional → no obvious base
657+
// case on the recursive path: a stronger "potentially unbounded" signal.
658+
for (int di = 0; di < def_count; di++) {
659+
if (has_self && has_self[di] && !(has_guarded && has_guarded[di])) {
660+
result->defs.items[di].unguarded_recursion = true;
661+
}
662+
}
663+
free(has_self);
664+
free(has_guarded);
665+
460666
uint64_t t2 = now_ns();
461667

462668
result->imports_count = result->imports.count;

internal/cbm/cbm.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,16 @@ typedef struct {
195195
const char *route_path; // HTTP route path from decorator (e.g., "/api/users") or NULL
196196
const char *route_method; // HTTP method from decorator (e.g., "POST") or NULL
197197
int complexity; // cyclomatic complexity
198+
int cognitive; // cognitive complexity (nesting-weighted)
199+
int loop_count; // number of loop constructs in the body
200+
int loop_depth; // max nested-loop depth (bottleneck proxy)
201+
bool is_recursive; // body contains a direct self-call (seed for "recursive")
202+
int param_count; // number of parameters (large = complexity smell)
203+
int max_access_depth; // deepest chained member/subscript access (a.b.c.d)
204+
int linear_scan_in_loop; // count of linear-scan calls (find/contains/indexOf) inside loops
205+
int alloc_in_loop; // count of allocation/append calls inside loops
206+
bool recursion_in_loop; // a self-call occurs inside a loop body
207+
bool unguarded_recursion; // recursive with no self-call guarded by a conditional
198208
int lines; // body line count
199209
uint32_t *fingerprint; // MinHash fingerprint (arena-allocated, K values) or NULL
200210
int fingerprint_k; // number of hash values (CBM_MINHASH_K or 0)
@@ -223,6 +233,9 @@ typedef struct {
223233
const char *second_arg_name; // second argument identifier (handler ref) or NULL
224234
CBMCallArg args[CBM_MAX_CALL_ARGS]; // first N arguments with expressions
225235
int arg_count; // number of captured arguments
236+
int loop_depth; // enclosing loop nesting at the call site
237+
int branch_depth; // enclosing branch nesting at the call site
238+
int start_line; // 1-based source line of the call (for def range-match)
226239
} CBMCall;
227240

228241
typedef struct {
@@ -532,6 +545,12 @@ uint64_t cbm_get_preprocess_ns(void);
532545
uint64_t cbm_get_files_preprocessed(void);
533546
void cbm_reset_profile(void);
534547

548+
// Toggle C/C++ preprocessor Macro-node extraction (#375). The pipeline enables
549+
// it only for full/advanced index modes (it dominates extraction on macro-dense
550+
// codebases). Default ON. Set before extraction; read-only during.
551+
void cbm_set_macro_extraction(int enabled);
552+
int cbm_macro_extraction_enabled(void);
553+
535554
// --- Internal helpers used by extractors ---
536555

537556
// Growable array push functions (arena-allocated, no individual free needed).

internal/cbm/extract_calls.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -727,6 +727,9 @@ void handle_calls(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, Walk
727727
CBMCall call = {0};
728728
call.callee_name = callee;
729729
call.enclosing_func_qn = state->enclosing_func_qn;
730+
call.loop_depth = state->loop_depth; // enclosing loop nesting at this call
731+
call.branch_depth = state->branch_depth; // enclosing branch nesting at this call
732+
call.start_line = (int)ts_node_start_point(node).row + TS_LINE_OFFSET;
730733

731734
TSNode args = ts_node_child_by_field_name(node, TS_FIELD("arguments"));
732735
if (!ts_node_is_null(args)) {

internal/cbm/extract_defs.c

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,6 +1671,17 @@ static void resolve_cpp_trailing_return(CBMArena *a, TSNode func_node, const cha
16711671
}
16721672
}
16731673

1674+
/* Compute and store the structural complexity metrics for a definition. */
1675+
static void set_def_complexity(CBMDefinition *def, TSNode body, const CBMLangSpec *spec) {
1676+
cbm_complexity_t cx;
1677+
cbm_compute_complexity(body, spec->branching_node_types, &cx);
1678+
def->complexity = cx.cyclomatic;
1679+
def->cognitive = cx.cognitive;
1680+
def->loop_count = cx.loop_count;
1681+
def->loop_depth = cx.loop_depth;
1682+
def->max_access_depth = cx.max_access_depth;
1683+
}
1684+
16741685
static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
16751686
CBMArena *a = ctx->arena;
16761687

@@ -1744,7 +1755,7 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
17441755

17451756
// Complexity
17461757
if (spec->branching_node_types && spec->branching_node_types[0]) {
1747-
def.complexity = cbm_count_branching(node, spec->branching_node_types);
1758+
set_def_complexity(&def, node, spec);
17481759
}
17491760

17501761
// MinHash fingerprint
@@ -2277,7 +2288,7 @@ static void push_method_def(CBMExtractCtx *ctx, TSNode child, const char *class_
22772288
def.docstring = extract_docstring(a, child, ctx->source, ctx->language);
22782289

22792290
if (spec->branching_node_types && spec->branching_node_types[0]) {
2280-
def.complexity = cbm_count_branching(child, spec->branching_node_types);
2291+
set_def_complexity(&def, child, spec);
22812292
}
22822293

22832294
// MinHash fingerprint
@@ -2425,7 +2436,7 @@ static void extract_rust_impl(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
24252436
}
24262437

24272438
if (spec->branching_node_types && spec->branching_node_types[0]) {
2428-
def.complexity = cbm_count_branching(child, spec->branching_node_types);
2439+
set_def_complexity(&def, child, spec);
24292440
}
24302441

24312442
// MinHash fingerprint
@@ -3829,7 +3840,11 @@ static void walk_defs(CBMExtractCtx *ctx, TSNode root, const CBMLangSpec *spec,
38293840

38303841
if (is_c_preprocessor_lang(ctx->language) &&
38313842
(strcmp(kind, "preproc_def") == 0 || strcmp(kind, "preproc_function_def") == 0)) {
3832-
extract_c_macro_def(ctx, node);
3843+
// Gated to full/advanced index modes — macros dominate extraction on
3844+
// macro-dense codebases (e.g. the Linux kernel). See #375.
3845+
if (cbm_macro_extraction_enabled()) {
3846+
extract_c_macro_def(ctx, node);
3847+
}
38333848
continue; // the macro body is a preproc_arg — nothing more to extract
38343849
}
38353850

internal/cbm/extract_unified.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ static void recompute_state(WalkState *state, const char *module_qn) {
3636
state->enclosing_class_qn = NULL;
3737
state->inside_call = false;
3838
state->inside_import = false;
39+
state->loop_depth = 0;
40+
state->branch_depth = 0;
3941

4042
for (int i = 0; i < state->scope_top; i++) {
4143
switch (state->scopes[i].kind) {
@@ -51,6 +53,12 @@ static void recompute_state(WalkState *state, const char *module_qn) {
5153
case SCOPE_IMPORT:
5254
state->inside_import = true;
5355
break;
56+
case SCOPE_LOOP:
57+
state->loop_depth++;
58+
break;
59+
case SCOPE_BRANCH:
60+
state->branch_depth++;
61+
break;
5462
default:
5563
break;
5664
}
@@ -686,6 +694,15 @@ static void push_boundary_scopes(CBMExtractCtx *ctx, TSNode node, const CBMLangS
686694
if (spec->import_node_types && cbm_kind_in_set(node, spec->import_node_types)) {
687695
push_scope(state, SCOPE_IMPORT, depth, NULL);
688696
}
697+
/* Loop / branch nesting for bottleneck metrics. Loops are gated on named
698+
* nodes so anonymous `for`/`while` keyword tokens don't count. A loop is NOT
699+
* also counted as a branch (many specs list loops in branching_node_types,
700+
* but a loop is not a base-case guard for the unguarded-recursion signal). */
701+
if (ts_node_is_named(node) && cbm_is_loop_node_type(ts_node_type(node))) {
702+
push_scope(state, SCOPE_LOOP, depth, NULL);
703+
} else if (spec->branching_node_types && cbm_kind_in_set(node, spec->branching_node_types)) {
704+
push_scope(state, SCOPE_BRANCH, depth, NULL);
705+
}
689706
}
690707

691708
void cbm_extract_unified(CBMExtractCtx *ctx) {

0 commit comments

Comments
 (0)