"""
diagnose.py — run this from the neural-lam root after indexing.
Windows-friendly replacement for grep-based inspection.
Usage:
python diagnose.py
"""
import sys
import os
import json

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from core.graph import KnowledgeGraph
from core.indexer import Indexer
from core.enricher import Enricher
from core.query import QueryEngine
from presets.neural_lam import apply
from output.llm_context import format_impact_as_context

NEURAL_LAM_PATH = "./neural_lam"
CACHE = ".codegraph/cache.json"

def reindex():
    print(f"Indexing {NEURAL_LAM_PATH} ...")
    graph = KnowledgeGraph()
    graph.CACHE_FILE = CACHE
    indexer = Indexer(graph, NEURAL_LAM_PATH)
    indexer.index_path(NEURAL_LAM_PATH)
    print(f" Raw: {graph.node_count()} nodes, {graph.edge_count()} edges")
    Enricher(graph).run()
    print(f" After enrichment: {graph.node_count()} nodes, {graph.edge_count()} edges")
    apply(graph)
    print(f" After preset: {graph.node_count()} nodes, {graph.edge_count()} edges")
    graph.save(CACHE)
    return graph

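# Hedged helper sketch, not part of the original diagnose flow: it assumes
# graph.save() writes plain JSON to CACHE, as the .json extension suggests.
# Handy for a quick sanity check of the cache without constructing a graph.
def peek_cache(path=CACHE):
    if not os.path.exists(path):
        print(f"No cache at {path}")
        return
    with open(path, "r", encoding="utf-8") as fh:
        raw = json.load(fh)
    if isinstance(raw, dict):
        print(f"Cache top-level keys: {sorted(raw)}")
    else:
        print(f"Cache is a {type(raw).__name__} with {len(raw)} entries")
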
def print_section(title):
    print(f"\n{'='*60}")
    print(f" {title}")
    print('='*60)

def run(graph):
    engine = QueryEngine(graph)

    # 1. Node type breakdown
    print_section("NODE TYPE BREAKDOWN")
    stats = graph.stats()
    for k, v in stats["node_types"].items():
        print(f" {k:25s} {v}")

    # 2. Key functions we care about
    print_section("KEY FUNCTIONS FOUND")
    targets = [
        "training_step", "validation_step", "forward",
        "__getitem__", "predict_step", "configure_optimizers",
        "encode", "decode", "process",
    ]
    for t in targets:
        matches = graph.resolve_node_id(t)
        for m in matches:
            d = graph.get_node(m)
            hook = " ⚡" if d.get("is_lightning_hook") else ""
            shapes = f" shapes={d['tensor_shapes']}" if d.get("tensor_shapes") else ""
            print(f" {m}{hook}{shapes}")

    # 3. All classes
    print_section("CLASSES")
    for nid, d in sorted(graph.g.nodes(data=True)):
        if d.get("node_type") == "class":
            bases = d.get("base_classes", [])
            print(f" {nid}")
            if bases:
                print(f" inherits: {', '.join(bases)}")

    # 4. Impact reports for the 3 most interesting nodes
    print_section("IMPACT: ARModel.training_step")
    r = engine.impact("training_step")
    if "error" not in r and "ambiguous" not in r:
        print(format_impact_as_context(r))
    else:
        print(json.dumps(r, indent=2))
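    # Note (inferred from the checks above, not from QueryEngine docs):
    # engine.impact() appears to return a dict, carrying an "error" or
    # "ambiguous" key when the name cannot be resolved to a single node;
    # otherwise it can be rendered directly via format_impact_as_context().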
print_section("IMPACT: BaseGraphModel (or GraphLAM) forward")
# forward is ambiguous — find all forward functions and pick the GNN one
forward_nodes = [
nid for nid, d in graph.g.nodes(data=True)
if d.get("name") == "forward" and d.get("node_type") == "function"
]
if not forward_nodes:
print("No forward() functions found in graph.")
elif len(forward_nodes) == 1:
r = engine.impact(forward_nodes[0])
print(format_impact_as_context(r))
else:
print(f"Multiple forward() nodes found ({len(forward_nodes)}):")
for n in forward_nodes:
print(f" {n}")
# Pick the most interesting one — prefer InteractionNet (the GNN core)
gnn_forward = next((n for n in forward_nodes if "interaction" in n.lower()), forward_nodes[0])
print(f"\nShowing impact for: {gnn_forward}")
r = engine.impact(gnn_forward)
print(format_impact_as_context(r))
print_section("IMPACT: WeatherDataset.__getitem__")
# Find the real WeatherDataset.__getitem__ node ID
getitem_candidates = [
nid for nid, d in graph.g.nodes(data=True)
if d.get("name") == "__getitem__"
and "WeatherDataset" in nid
and "Padded" not in nid # exclude the utility PaddedWeatherDataset
]
if getitem_candidates:
r = engine.impact(getitem_candidates[0])
print(format_impact_as_context(r))
else:
r = engine.impact("WeatherDataset.__getitem__")
print(format_impact_as_context(r) if "error" not in r else json.dumps(r, indent=2))
    # 5. Highest centrality nodes — the load-bearing ones
    print_section("HIGHEST IN-DEGREE NODES (most depended upon)")
    scored = []
    for nid in graph.g.nodes:
        d = graph.get_node(nid)
        if d.get("node_type") in ("function", "class"):
            scored.append((graph.in_degree(nid), nid))
    scored.sort(reverse=True)
    for score, nid in scored[:15]:
        print(f" [{score:3d} dependents] {nid}")

if __name__ == "__main__":
    if "--reindex" in sys.argv or not os.path.exists(CACHE):
        graph = reindex()
    else:
        graph = KnowledgeGraph()
        graph.CACHE_FILE = CACHE
        loaded = graph.load(CACHE)
        if not loaded:
            graph = reindex()
        else:
            print(f"Loaded from cache: {graph.node_count()} nodes, {graph.edge_count()} edges")
    run(graph)