gbrain/src/core/engine.ts at master · garrytan/gbrain · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import type {
  Page, PageInput, PageFilters, GetPageOpts,
  Chunk, ChunkInput, StaleChunkRow, StalePageRow,
  SearchResult, SearchOpts,
  Link, GraphNode, GraphPath, RelationalFanoutRow, RelationalFanoutOpts,
  TimelineEntry, TimelineInput, TimelineOpts,
  RawData,
  PageVersion,
  BrainStats, BrainHealth,
  IngestLogEntry, IngestLogInput,
  EngineConfig,
  CodeEdgeInput, CodeEdgeResult,
  EvalCandidate, EvalCandidateInput,
  EvalCaptureFailure, EvalCaptureFailureReason,
  SalienceOpts, SalienceResult, AnomaliesOpts, AnomalyResult,
  EmotionalWeightInputRow, EmotionalWeightWriteRow,
  DomainBankSampleOpts, CorpusSampleOpts, DomainBankRow,
  AdjacencyRow,
  EnrichCandidatesOpts, EnrichCandidate,
} from './types.ts';

/**
 * v0.38: bare row shape returned by `BrainEngine.listAllSources()`.
 * Kept lean (no per-source page_count) so the autopilot tick stays O(1)
 * SQL queries regardless of source count. `sources-ops.SourceListEntry`
 * is the enriched application-layer shape.
 */
export interface SourceRow {
  id: string;
  name: string | null;
  local_path: string | null;
  last_sync_at: Date | null;
  config: Record<string, unknown>;
}

/**
 * Options for `traverseGraph`.
 *
 * `frontierCap`: when set, the BFS recursive term applies a parenthesized
 * `LIMIT N ORDER BY slug,id` so each iteration emits at most N rows. This
 * is the "approximately per-layer" cap discussed in the T8 plan — Postgres'
 * recursive CTE caps per ITERATION, not strictly per BFS LAYER (BFS layer
 * boundaries map to recursive iterations only when fan-out is bounded).
 * For hub-fanout graphs the cap fires early and bounds the work. Default:
 * unset = no cap (back-compat; existing callers see no change).
 *
 * NOTE: a truncation-detection signal (`onTruncation` callback) was
 * designed but the v1 algorithm had both false-positive (organic count ==
 * cap) and false-negative (LIMIT-before-DISTINCT in diamond graphs) cases
 * caught by adversarial review. The signal is deferred until a
 * dedupe-then-cap SQL rewrite + real Postgres parity coverage lands. See
 * TODOS.md → "T8 truncation signal" entry. Callers that need to detect
 * truncation can compare `result.length` against expected fanout bounds
 * as a coarse-but-honest signal in the interim.
 */
export interface TraverseGraphOpts {
  // NOTE(review): sourceId / sourceIds semantics are not documented in this chunk —
  // presumably the same scalar-vs-array scoping described on TrajectoryOpts; confirm.
  sourceId?: string;
  sourceIds?: string[];
  frontierCap?: number;
}

/**
 * v0.27.1: file row for binary-asset metadata. Mirrors the `files` table
 * shape on both engines (Postgres has had it since v0.18; PGLite gets it
 * via migration v36).
 */
export interface FileRow {
  id: number;
  source_id: string;
  page_slug: string | null;
  page_id: number | null;
  filename: string;
  storage_path: string;
  mime_type: string | null;
  size_bytes: number | null;
  content_hash: string;
  metadata: Record<string, unknown>;
  created_at: Date;
}

/**
 * v0.27.1: spec for upsertFile. Identity is (source_id, storage_path).
 * Re-upserting the same identity with a different content_hash updates the
 * row in place (image was replaced); same content_hash is a no-op.
 *
 * Carries the same columns as `FileRow` minus `id` and `created_at`; only
 * `filename`, `storage_path`, and `content_hash` are required.
 */
export interface FileSpec {
  // NOTE(review): the value used when source_id is omitted is not visible here —
  // presumably 'default'; confirm in upsertFile.
  source_id?: string;
  page_slug?: string | null;
  page_id?: number | null;
  filename: string;
  storage_path: string;
  mime_type?: string | null;
  size_bytes?: number | null;
  content_hash: string;
  metadata?: Record<string, unknown>;
}

import type { BatchAuditSite } from './retry.ts';

/**
 * v0.41.18.0 — shared opts for engine batch primitives that self-retry on
 * transient connection errors. Threaded through addLinksBatch /
 * addTimelineEntriesBatch / addTakesBatch / upsertChunks.
 *
 * Retry semantics: each batch primitive wraps its internal SQL in
 * `withRetry(BULK_RETRY_OPTS)` (default `{maxRetries:3, delayMs:1000,
 * delayMaxMs:10000, jitter:'decorrelated'}`). Callers MUST NOT add their own
 * `withRetry` wrapper around these methods — that produces 3×3=9 retry
 * attempts under failure, amplifying load on a recovering circuit breaker.
 * CI lint guard `scripts/check-no-double-retry.sh` enforces the rule.
 *
 * - `auditSite`: typed label for the JSONL audit emission (`~/.gbrain/audit/
 *   batch-retry-YYYY-Www.jsonl`). Must be a member of `BATCH_AUDIT_SITES`
 *   in `src/core/retry.ts`. The CI lint guard `scripts/check-batch-audit-
 *   site.sh` validates every string-literal value at build time.
 * - `signal`: AbortSignal that aborts mid-retry-sleep on SIGTERM/SIGINT.
 *   `MinionWorker.shutdownAbort.signal` is the canonical source.
 */
export interface BatchOpts {
  /** Label recorded in the batch-retry audit log; both fields are optional. */
  auditSite?: BatchAuditSite;
  /** Aborts a pending retry sleep when signalled. */
  signal?: AbortSignal;
}

/**
 * Input row for addLinksBatch.
 *
 * Optional fields default to '' (matches NOT NULL DDL) unless the field's own
 * comment states a different default: `link_source` ('markdown'), the
 * `*_source_id` fields ('default'), and `link_kind` (NULL).
 */
export interface LinkBatchInput {
  from_slug: string;
  to_slug: string;
  link_type?: string;
  context?: string;
  /**
   * Provenance (v0.13+; opened to kebab tags in v114 / #1941). Any lowercase
   * kebab-case value <=64 chars is DB-valid (CHECK `^[a-z][a-z0-9]*(-[a-z0-9]+)*$`),
   * so external derivers stamp their own tag (e.g. 'citation-graph'). The
   * reconciliation-managed built-ins are 'markdown' ([Name](path) refs),
   * 'frontmatter' (YAML-derived, see origin_*), 'mentions', 'wikilink-resolved';
   * 'manual' is for user/tool-created edges. NULL = legacy/unknown (pre-v0.13).
   * Missing on this batch input defaults to 'markdown'. NOTE: the add_link OP
   * (not this engine method) forbids callers from passing the four managed
   * built-ins and defaults omitted to 'manual' — internal callers use the
   * engine directly and keep writing the managed values.
   */
  link_source?: string;
  /** For link_source='frontmatter': slug of the page whose frontmatter created this edge. */
  origin_slug?: string;
  /** Frontmatter field name (e.g. 'key_people', 'investors'). */
  origin_field?: string;
  /**
   * v0.18.0: source id for each endpoint. When omitted, the engine JOINs
   * against `source_id='default'`. Pass explicit values when the edge
   * lives in a non-default source OR crosses sources.
   *
   * Without these fields, the batch JOIN `pages.slug = v.from_slug` fans
   * out across every source containing that slug, silently creating wrong
   * edges in a multi-source brain. The source_id filter eliminates the
   * fan-out. Origin pages (frontmatter provenance) get their own
   * source_id so reconciliation can't delete edges from another source's
   * frontmatter.
   */
  from_source_id?: string;
  to_source_id?: string;
  origin_source_id?: string;
  /**
   * v0.41.18.0 (A10, codex finding #12): distinguishes "plain body mention"
   * (NULL or 'plain') from "verb-pattern-derived typed NER" ('typed_ner')
   * within link_source='mentions'. Backed by v98 schema column. NOT in
   * the links UNIQUE constraint — a row with the same (from, to, type,
   * source, origin) tuple but a different link_kind still collides on that
   * constraint and is skipped (DO NOTHING). Default NULL = legacy / unknown /
   * pre-v98 semantics.
   */
  link_kind?: string;
}

/**
 * Input row for addTimelineEntriesBatch.
 *
 * Optional fields default to '' (matches NOT NULL DDL), except `source_id`,
 * whose omission makes the engine match against `source_id='default'`.
 */
export interface TimelineBatchInput {
  slug: string;
  // NOTE(review): expected date format is not stated in this chunk — presumably
  // ISO 'YYYY-MM-DD' like the take date fields; confirm.
  date: string;
  source?: string;
  summary: string;
  detail?: string;
  /**
   * v0.18.0: source id for the owning page. When omitted, the engine JOINs
   * against `source_id='default'`. Without this, two pages sharing the
   * same slug across sources would fan out timeline rows to both.
   */
  source_id?: string;
}

/**
 * A single dedicated database connection, isolated from the engine's pool.
 *
 * Used by migration paths that need session-level GUCs (e.g.
 * `SET statement_timeout = '600000'` before a `CREATE INDEX CONCURRENTLY`)
 * without leaking into the shared pool, and by write-quiesce designs
 * that need a session-lifetime Postgres advisory lock that survives
 * across transaction boundaries.
 *
 * On Postgres: backed by postgres-js `sql.reserve()`; the same backend
 * process serves every `executeRaw` call within the callback. Released
 * automatically when the callback returns or throws.
 *
 * On PGLite: a thin pass-through. PGLite has no pool, so every call is
 * already on the single backing connection. The interface is still
 * exposed so cross-engine callers don't need to branch.
 *
 * Not safe to call from inside `transaction()`. The transaction holds a
 * different backend; reserving a second one can deadlock on a row the
 * transaction itself is waiting to write.
 */
export interface ReservedConnection {
  /**
   * Run a raw SQL statement on this reserved connection.
   *
   * v0.41.18.0 (A20, codex #7): optional 3rd-arg `opts.signal` lets callers
   * actually cancel a running query. Init nudge (3s wallclock cap) wires an
   * AbortController whose timer fires at 3s; queries that haven't returned
   * by then get cancelled (Postgres: query.cancel(); PGLite: in-process,
   * Promise.race against signal-rejection — documented gap because PGLite
   * has no kernel-level cancellation).
   *
   * @typeParam T - Row shape the caller expects; defaults to a generic record.
   * @param sql - Statement text.
   * @param params - Optional bound parameter values.
   * @param opts - Optional settings; `signal` requests cancellation as described above.
   * @returns The result rows, typed as `T[]`.
   */
  executeRaw<T = Record<string, unknown>>(
    sql: string,
    params?: unknown[],
    opts?: { signal?: AbortSignal },
  ): Promise<T[]>;
}

/**
 * v0.28: Takes — typed/weighted/attributed claims, indexed in Postgres.
 * Markdown is source of truth (fenced table on the page); this row is the
 * derived index. Page-scoped via page_id (NOT slug — slug is unique only
 * within a source). `(page_id, row_num)` is the natural unique key.
 */
// v0.38: TakeKind opens from closed 4-element union to string (T3/T10).
// Pre-v0.38, kinds {fact|take|bet|hunch} were enforced by DB CHECK
// (migrations v41/v48) AND by this TS closed union. Codex outside-voice
// review caught that dropping the CHECK without also widening the TS
// type "moves inconsistency around" — raw SQL and old clients could
// poison rows that runtime-validate cleanly. v0.38 migration v76 drops
// the CHECK; this widens the type. Runtime validation moves to the
// active schema pack's `takes_kinds:` declaration. The annotation
// primitive's seed list in gbrain-base reproduces {fact|take|bet|hunch}
// so existing behavior is unchanged; packs can extend to {finding|
// hypothesis|observation|...} per domain.

// Object wrapper holding a single take kind string.
// NOTE(review): no consumer of this shape is visible in this chunk — confirm it is still used.
export interface TakeKindLiteral { kind: string }
// Open string type for a take's kind; no longer a closed union, so the
// compiler does not restrict values (see the v0.38 note above).
export type TakeKind = string;

/**
 * Input row for addTakesBatch. Only `page_id`, `row_num`, `claim`, `kind`,
 * and `holder` are required; the remaining fields are optional.
 */
export interface TakeBatchInput {
  page_id: number;
  row_num: number;
  claim: string;
  kind: TakeKind;
  holder: string;
  weight?: number;          // 0..1, default 0.5; clamped server-side
  since_date?: string;      // ISO date 'YYYY-MM-DD'
  until_date?: string;      // NOTE(review): presumably the same ISO date format as since_date — confirm
  source?: string;
  superseded_by?: number | null;
  active?: boolean;         // default true
}

/**
 * Take row as returned by listTakes / searchTakes.
 *
 * NOTE(review): `TakeHit` is also documented as the searchTakes result row;
 * confirm which of the two shapes searchTakes actually returns.
 */
export interface Take {
  id: number;
  page_id: number;
  page_slug: string;        // joined from pages
  row_num: number;
  claim: string;
  kind: TakeKind;
  holder: string;
  weight: number;
  since_date: string | null;
  until_date: string | null;
  source: string | null;
  superseded_by: number | null;
  active: boolean;
  resolved_at: string | null;
  resolved_outcome: boolean | null;
  /**
   * v0.30.0: 3-state outcome label. v0.36.1.1 added 'unresolvable' as a 4th
   * state for verdicts where evidence was insufficient to grade. Sits
   * alongside `resolved_outcome` for back-compat. New writes populate both;
   * legacy v0.28-resolved rows have `resolved_quality` backfilled by
   * migration v40 from the boolean. Null on unresolved rows. Schema CHECK
   * (widened in v74) enforces (quality, outcome) consistency:
   * `correct` ↔ `outcome=true`, `incorrect` ↔ `outcome=false`,
   * `partial` ↔ `outcome=NULL`, `unresolvable` ↔ `outcome=NULL`.
   */
  resolved_quality: 'correct' | 'incorrect' | 'partial' | 'unresolvable' | null;
  resolved_value: number | null;
  resolved_unit: string | null;
  resolved_source: string | null;
  resolved_by: string | null;
  created_at: string;
  updated_at: string;
}

/**
 * Filter, sort, and paging options for listing takes. Every field is
 * optional.
 * NOTE(review): presumably consumed by `listTakes` — confirm against the engine interface.
 */
export interface TakesListOpts {
  page_id?: number;
  page_slug?: string;       // resolved via JOIN
  holder?: string;
  kind?: TakeKind;
  active?: boolean;         // default true (only active rows)
  resolved?: boolean;       // true = only resolved; false = only unresolved; undefined = both
  /** Per-token MCP allow-list. Server applies AND holder = ANY($takesHoldersAllowList) when set. */
  takesHoldersAllowList?: string[];
  sortBy?: 'weight' | 'since_date' | 'created_at';
  limit?: number;
  offset?: number;
}

/**
 * Search result row from searchTakes / searchTakesVector. A reduced take
 * projection (identity, claim, kind, holder, weight) plus a rank `score`.
 */
export interface TakeHit {
  take_id: number;
  page_id: number;
  page_slug: string;
  row_num: number;
  claim: string;
  kind: TakeKind;
  holder: string;
  weight: number;
  score: number;            // search rank score (ts_rank for keyword, 1-cos_dist for vector)
}

/**
 * v0.28 stale-takes row (mirrors StaleChunkRow shape). Embedding column
 * intentionally omitted; only the take's identity and claim text are carried.
 */
export interface StaleTakeRow {
  take_id: number;
  page_slug: string;
  row_num: number;
  claim: string;
}

/**
 * Resolution metadata for resolveTake. Supply `quality` (preferred) or the
 * legacy boolean `outcome`; `resolvedBy` is always required.
 */
export interface TakeResolution {
  /**
   * v0.30.0: primary 3-state input; v0.36.1.1 widened to 4-state with
   * 'unresolvable'. When set, takes precedence over `outcome` and the engine
   * writes both columns (quality directly; outcome derived:
   * `correct→true`, `incorrect→false`, `partial→null`, `unresolvable→null`).
   * `unresolvable` marks rows where the judge ran but evidence was
   * insufficient to grade; surfaces in `TakesScorecard.unresolvable_count`.
   */
  quality?: 'correct' | 'incorrect' | 'partial' | 'unresolvable';
  /**
   * v0.28 back-compat input. Keep submitting for v0.28 callers; the engine
   * derives quality (`true→correct`, `false→incorrect`). When `quality` is
   * also set, `quality` wins. When neither is set, the engine throws.
   * Mutually-exclusive with `quality === 'partial'` because partial isn't
   * binary.
   * NOTE(review): 'unresolvable' is likewise non-binary (it maps to a null
   * outcome above) — confirm whether it is also mutually exclusive with `outcome`.
   */
  outcome?: boolean;
  value?: number;
  unit?: string;       // 'usd' | 'pct' | 'count' | other
  source?: string;
  resolvedBy: string;  // slug or 'garry'
}

/** v0.30.0: scorecard aggregate over resolved takes (counts, accuracy, Brier score, rates). */
export interface TakesScorecard {
  // NOTE(review): population counted here is not stated in this chunk — presumably takes of kind 'bet'; confirm.
  total_bets: number;
  /**
   * Count of resolved rows where `resolved_quality IN
   * ('correct','incorrect','partial')`. v0.36.1.1 deliberately keeps this
   * 3-state semantic to preserve historical comparisons. Unresolvable rows
   * land in the sibling `unresolvable_count` field instead.
   */
  resolved: number;
  correct: number;
  incorrect: number;
  partial: number;
  /** Accuracy = correct / (correct + incorrect). NULL when n=0 (n here presumably = correct + incorrect). */
  accuracy: number | null;
  /**
   * Brier score over rows where `resolved_quality IN ('correct','incorrect')`.
   * Maps `correct→1`, `incorrect→0`, computes `mean((weight − outcome)²)`.
   * Lower is better; 0 = perfect; 0.25 = always-50% baseline.
   * Excludes partial AND unresolvable — both hide signal; the dedicated
   * `partial_rate` and `unresolvable_rate` fields surface them separately.
   * NULL when no correct+incorrect rows.
   */
  brier: number | null;
  /** partial / resolved. NULL when n=0 (n here presumably = resolved). */
  partial_rate: number | null;
  /**
   * v0.36.1.1: count of rows where `resolved_quality = 'unresolvable'`.
   * Sibling field to `resolved` so historical comparisons against pre-v80
   * scorecards stay valid; `resolved` retains its 3-state meaning, and
   * unresolvable rows count here separately. Optional for SDK back-compat —
   * downstream consumers constructing TakesScorecard fixtures shouldn't have
   * to update on a hotfix. `finalizeScorecard` always populates it.
   */
  unresolvable_count?: number;
  /**
   * v0.37.2.0: `unresolvable_count / (resolved + unresolvable_count)`. NULL
   * when both are 0. Surfaces the spec's headline calibration signal:
   * "what fraction of grade-attempted takes couldn't be graded?" — high
   * values signal weak evidence retrieval rather than wrong predictions.
   * Optional for SDK back-compat; see `unresolvable_count` note above.
   */
  unresolvable_rate?: number | null;
}

/**
 * Optional filters that scope a takes scorecard.
 * NOTE(review): which take date column `since` / `until` compare against is
 * not visible in this chunk — confirm in the engine implementation.
 */
export interface TakesScorecardOpts {
  holder?: string;
  domainPrefix?: string; // e.g. 'companies/' to scope the scorecard
  since?: string;        // ISO date 'YYYY-MM-DD'
  until?: string;        // ISO date 'YYYY-MM-DD'
}

/**
 * v0.30.0: calibration curve bucket — one weight range with the observed
 * hit rate and the mean predicted weight of the bets that fall inside it.
 */
export interface CalibrationBucket {
  /** Lower bound of the weight bucket, inclusive. */
  bucket_lo: number;
  /** Upper bound, exclusive (except for the final bucket which is inclusive of 1.0). */
  bucket_hi: number;
  /** Count of resolved correct+incorrect bets falling in this weight range. */
  n: number;
  /** correct / n. NULL when n=0. */
  observed: number | null;
  /** mean(weight) within the bucket — what was predicted on average. NULL when n=0. */
  predicted: number | null;
}

/** Optional settings for building a calibration curve (see `CalibrationBucket`). */
export interface CalibrationCurveOpts {
  // Restrict the curve to one holder's takes (presumably; filter semantics not shown here — confirm).
  holder?: string;
  bucketSize?: number; // default 0.1
}

/**
 * Synthesis evidence row input (provenance from think synthesis pages).
 * Links a synthesis page to one cited take, identified by the take's page id
 * and row number.
 */
export interface SynthesisEvidenceInput {
  synthesis_page_id: number;
  take_page_id: number;
  take_row_num: number;
  // NOTE(review): presumably the citation's position within the synthesis page; base (0 vs 1) not shown — confirm.
  citation_index: number;
}

/**
 * Dream-cycle Haiku verdict on whether a transcript is worth processing.
 * Stored form: the decision, the reasons behind it, and when it was judged.
 */
export interface DreamVerdict {
  worth_processing: boolean;
  reasons: string[];
  // Timestamp serialized as a string; exact format is not shown in this chunk.
  judged_at: string;
}

/**
 * Input shape for putDreamVerdict — judged_at defaults to now() server-side,
 * so it is the only `DreamVerdict` field absent here.
 */
export interface DreamVerdictInput {
  worth_processing: boolean;
  reasons: string[];
}

// ============================================================
// v0.31 Hot Memory: facts table + recall surface
// ============================================================

/**
 * Single source of truth for the allowed `facts.kind` values. Both the
 * `FactKind` union and the exported `ALL_FACT_KINDS` array derive from this
 * tuple, so adding a kind here updates the type and the runtime list together.
 */
const FACT_KIND_VALUES = ['event', 'preference', 'commitment', 'belief', 'fact'] as const;

/** Allowed `facts.kind` values. Different decay halflives apply per kind. */
export type FactKind = (typeof FACT_KIND_VALUES)[number];

/** Every `FactKind`, as a read-only array for runtime iteration and validation. */
export const ALL_FACT_KINDS: readonly FactKind[] = FACT_KIND_VALUES;

/** Visibility tier on a fact row. Mirrors takes' world-default ACL contract (D21). */
export type FactVisibility = 'private' | 'world';

/**
 * Status returned by insertFact.
 * NOTE(review): the exact condition behind each value is not visible in this
 * chunk — presumably new row / already present / replaced an older fact; confirm.
 */
export type FactInsertStatus = 'inserted' | 'duplicate' | 'superseded';

/** A fact row read from the facts table. */
export interface FactRow {
  id: number;
  source_id: string;
  entity_slug: string | null;
  fact: string;
  kind: FactKind;
  visibility: FactVisibility;
  /**
   * v0.31.2: salience tier the LLM assigned at extraction time. Surfaces
   * to consumers (recall response, daily-page writer, admin dashboard,
   * agents reading via MCP `_meta.brain_hot_memory`). Pre-v45 brains had
   * no notability column; migration v46 backfills with default 'medium'.
   * NOTE(review): "pre-v45" vs "migration v46" — confirm the version numbers agree.
   */
  notability: 'high' | 'medium' | 'low';
  context: string | null;
  valid_from: Date;
  valid_until: Date | null;
  // Non-null once the fact has expired; list/health docs treat `expired_at IS NULL` as active.
  expired_at: Date | null;
  superseded_by: number | null;
  consolidated_at: Date | null;
  consolidated_into: number | null;
  source: string;
  source_session: string | null;
  confidence: number;
  embedding: Float32Array | null;
  embedded_at: Date | null;
  created_at: Date;
}

/**
 * Input for insertFact. source_id supplied via the ctx arg. Only `fact` and
 * `source` are required; each optional field's default is noted inline.
 */
export interface NewFact {
  fact: string;
  kind?: FactKind;                     // default 'fact'
  entity_slug?: string | null;
  visibility?: FactVisibility;          // default 'private'
  context?: string | null;
  valid_from?: Date;                   // default now()
  valid_until?: Date | null;
  source: string;                       // 'mcp:put_page' | 'mcp:extract_facts' | 'cli:think' | etc
  source_session?: string | null;
  confidence?: number;                  // [0,1], default 1.0
  notability?: 'high' | 'medium' | 'low'; // salience filter for extraction gate
  embedding?: Float32Array | null;     // pre-computed; if null, insertFact computes via gateway
  /**
   * v0.35.4 (D-CDX-5) — typed-claim fields. Optional. When populated,
   * `gbrain eval trajectory` + `find_trajectory` MCP op consume them for
   * chronological regression detection and drift_score. `claim_metric` is
   * normalized to lowercase snake_case by the extraction layer before
   * this method sees it; the engine stores verbatim.
   */
  claim_metric?: string | null;
  claim_value?: number | null;
  claim_unit?: string | null;
  claim_period?: string | null;
  /**
   * v0.40.2.0 — event-shaped row marker ('meeting', 'job_change',
   * 'location_change', etc). Mutually informational with `claim_metric`:
   * a row can have either, both, or neither. Persisted into
   * `facts.event_type` (migration v89). Existing callers don't need to
   * set this — leaving it undefined preserves pre-v0.40 behavior.
   */
  event_type?: string | null;
}

/** Options shared by list-facts methods. Every field is optional. */
export interface FactListOpts {
  /** Hide expired_at IS NOT NULL rows. Default true. */
  activeOnly?: boolean;
  // Paging controls; defaults are not stated in this chunk.
  limit?: number;
  offset?: number;
  /** Restrict to specific kinds. Default: all kinds. */
  kinds?: FactKind[];
  /**
   * Visibility filter. When undefined, returns all. When set, only matches
   * are returned. Remote (untrusted) callers must supply ['world'].
   */
  visibility?: FactVisibility[];
}

/**
 * Per-source operational health snapshot consumed by `gbrain doctor`.
 * The `total_*` counters and `top_entities` are always present; the
 * trailing counters and latency fields are optional.
 */
export interface FactsHealth {
  source_id: string;
  total_active: number;          // facts where expired_at IS NULL
  total_today: number;           // created in last 24h
  total_week: number;            // created in last 7d
  total_expired: number;         // expired_at IS NOT NULL
  total_consolidated: number;    // consolidated_at IS NOT NULL
  top_entities: Array<{ entity_slug: string; count: number }>;
  /** Optional counters fed by the queue / classifier — populated when those modules report. */
  drop_counter?: number;
  classifier_fail_counter?: number;
  p50_latency_ms?: number;
  p99_latency_ms?: number;
}

/**
 * v0.35.4 (D-CDX-6) — Options for `BrainEngine.findTrajectory`.
 *
 * `sourceId` (scalar fast path) and `sourceIds` (federated array) follow
 * the v0.34.1.0 search* pattern: when `sourceIds` is set the engine
 * applies `WHERE source_id = ANY($N::text[])`; otherwise scalar predicate
 * with `sourceId ?? 'default'`.
 *
 * `remote` (D-CDX-1) gates the visibility filter: when true the engine
 * adds `AND visibility = 'world'`, mirroring `recall`'s posture for
 * untrusted callers. Local CLI keeps `remote: false` and sees both
 * private + world facts.
 *
 * `entitySlug` is the only required field.
 */
export interface TrajectoryOpts {
  entitySlug: string;
  /** Single-source scope; default 'default' when both this and sourceIds are unset. */
  sourceId?: string;
  /** Federated array scope (mutually exclusive with sourceId; the array wins when set). */
  sourceIds?: string[];
  /** When true, filters to visibility='world' only. Set by MCP layer from ctx.remote. */
  remote?: boolean;
  /** Metric filter. When set, only facts with this canonical metric label participate. */
  metric?: string;
  /**
   * v0.40.2.0 — kind filter. Default 'all'. Defensive opt that future-proofs
   * the API now that event_type rows live alongside metric rows in the same
   * table. Existing callers (founder-scorecard, eval-trajectory) pass
   * 'metric' explicitly for clarity (no behavior change since their
   * downstream math already skips NULL-metric rows). Richer event-shape
   * filtering (job_change vs meeting vs location) is a v0.40.3+ TODO once
   * the event schema gets structured fields.
   *   - 'metric': only rows with claim_metric IS NOT NULL
   *   - 'event':  only rows with event_type IS NOT NULL
   *   - 'all':    both (default)
   */
  kind?: 'metric' | 'event' | 'all';
  /** Lower bound on valid_from (inclusive). YYYY-MM-DD or full ISO. */
  since?: string | Date;
  /** Upper bound on valid_from (inclusive). YYYY-MM-DD or full ISO. */
  until?: string | Date;
  /** Cap on points returned. Default 100, max 500. */
  limit?: number;
}

/**
 * A single point in an entity's claim trajectory. Carries the typed-claim
 * fields when populated (drives regression detection), the underlying
 * fact text (for display), provenance (source_session, source_markdown_slug),
 * and the raw embedding so the caller can compute drift_score without a
 * second SQL round-trip.
 */
export interface TrajectoryPoint {
  fact_id: number;
  valid_from: Date;
  // Typed-claim fields; each is null when the underlying fact did not carry it.
  metric: string | null;
  value: number | null;
  unit: string | null;
  period: string | null;
  /**
   * v0.40.2.0 — event-shaped row marker (e.g. 'meeting', 'job_change',
   * 'location_change'). Mutually informational with metric: a row can have
   * (a) metric set + event_type null (typed claim like MRR=$50K),
   * (b) metric null + event_type set (event like "last met Marco"), or
   * (c) both null (legacy free-text fact row from pre-v0.35.4 brains).
   * Both founder-scorecard's per-metric math and eval-trajectory's
   * regression analysis already skip null-metric rows, so event-only
   * rows ride through invisibly to those callers.
   * NOTE(review): the `NewFact.event_type` comment also allows both fields
   * to be set; that fourth case is not enumerated here — confirm it is valid.
   */
  event_type: string | null;
  text: string;
  source_session: string | null;
  source_markdown_slug: string | null;
  /** Raw embedding for drift computation; null when the fact was inserted without one. */
  embedding: Float32Array | null;
}

/** Maximum results returned by search operations. Internal bulk operations (listPages) are not clamped. */
export const MAX_SEARCH_LIMIT = 100;

/**
 * Clamp a user-provided search limit to a safe positive integer.
 *
 * Unusable input (missing, null, NaN, ±Infinity, or anything that truncates
 * to zero or below) yields `defaultLimit` unchanged; usable input is floored
 * and capped at `cap`.
 *
 * @param limit - Requested limit; may be fractional or otherwise invalid.
 * @param defaultLimit - Value returned as-is when `limit` is unusable.
 * @param cap - Upper bound applied to a usable `limit`.
 * @returns `defaultLimit`, or an integer in `[1, cap]` when `cap >= 1`.
 */
export function clampSearchLimit(limit: number | undefined, defaultLimit = 20, cap = MAX_SEARCH_LIMIT): number {
  // `== null` also catches a runtime null from untyped callers;
  // Number.isFinite already rejects NaN as well as ±Infinity.
  if (limit == null || !Number.isFinite(limit)) return defaultLimit;
  // Floor BEFORE the positivity check: a fractional value in (0, 1) would
  // otherwise pass the guard and then truncate to a limit of 0.
  const whole = Math.floor(limit);
  if (whole <= 0) return defaultLimit;
  return Math.min(whole, cap);
}

export interface BrainEngine {
  /** Discriminator: lets migrations and other consumers branch on engine kind without instanceof + dynamic imports. */
  readonly kind: 'postgres' | 'pglite';

  // Lifecycle
  connect(config: EngineConfig): Promise<void>;
  disconnect(): Promise<void>;
  /**
   * Recover a dropped connection using the config captured at the last
   * `connect()`. Callers (autopilot health probe, batchRetry) MUST use this
   * instead of `disconnect()` + bare `connect()`: the latter loses the config
   * (#2034 — a bare `connect()` with no args throws `database_url undefined`
   * forever) AND opens a null-connection window. Implemented on BOTH engines
   * for parity so the call is never a silent no-op.
   */
  reconnect(ctx?: { error?: unknown }): Promise<void>;
  initSchema(): Promise<void>;
  transaction<T>(fn: (engine: BrainEngine) => Promise<T>): Promise<T>;
  /**
   * Run `fn` with a dedicated connection (Postgres: reserved backend;
   * PGLite: pass-through). See `ReservedConnection` for semantics and
   * usage constraints. Release is automatic.
   */
  withReservedConnection<T>(fn: (conn: ReservedConnection) => Promise<T>): Promise<T>;

  // Pages CRUD
  /**
   * Fetch a page by slug.
   * v0.26.5: by default soft-deleted rows return null (matches the search
   * filter contract). Pass `opts.includeDeleted: true` to surface them with
   * `deleted_at` populated — used by `gbrain pages purge-deleted` listing,
   * by `restore_page` flow, and by operator diagnostics.
   */
  getPage(slug: string, opts?: GetPageOpts): Promise<Page | null>;
  /**
   * Insert or update a page. When `opts.sourceId` is omitted, the row is
   * written under the schema DEFAULT ('default'). When provided, `source_id`
   * is included in the INSERT column list so ON CONFLICT (source_id, slug)
   * DO UPDATE actually targets the intended row instead of fabricating a
   * duplicate at (default, slug). Multi-source brains MUST pass sourceId.
   */
  putPage(slug: string, page: PageInput, opts?: { sourceId?: string }): Promise<Page>;
  /**
   * v0.41.13 (#1309) — identity-based dedup pre-check for the import pipeline.
   *
   * Returns the first matching `{slug, id}` whose `(source_id, …)` matches
   * the supplied identity signal, OR null when nothing matches.
   *
   * Identity precedence (a row matches if EITHER fires):
   *   - `content_hash = $hash` AND `deleted_at IS NULL`
   *   - `frontmatter->>'id' = $frontmatterId` AND `$frontmatterId IS NOT NULL`
   *     AND `deleted_at IS NULL`
   *
   * Background: the overlapping-ingest-roots bug class (infiniteGameExp,
   * issue #1309) created two pages per file when a user ran `gbrain import
   * /vault/Subdir/` then `gbrain import /vault/` — the slug-shape changed
   * but the content + external ID were identical. Pre-fix, the import
   * pipeline dedup-checked by `getPage(slug)` alone and missed the
   * cross-slug duplicate. This method gives the importer a deterministic
   * way to identify true duplicates BEFORE insert.
   *
   * Per codex review: the optional `?` shape lets existing test doubles
   * compile without changes. Callers must defensively check
   * `engine.findDuplicatePage?.(...)` and fall through on undefined.
   * `deleted_at IS NULL` is deliberate — a soft-deleted page should NOT
   * block a legitimate re-import under a new slug.
   */
  findDuplicatePage?(
    sourceId: string,
    opts: { hash: string; frontmatterId?: string | null },
  ): Promise<{ slug: string; id: number } | null>;
  /**
   * Hard-delete a page row. Cascades to content_chunks, page_links,
   * chunk_relations via existing FK ON DELETE CASCADE.
   *
   * v0.26.5: this is no longer the public-facing `delete_page` op handler —
   * the op now soft-deletes via `softDeletePage` instead. `deletePage` stays
   * as the underlying primitive used by `purgeDeletedPages` and by callers
   * that explicitly want hard-delete semantics (e.g. test setup teardown).
   *
   * v0.18.0+ multi-source: `opts.sourceId` scopes the DELETE so a source-A
   * delete doesn't hard-delete the same-slug pages in sources B/C/D. Without
   * it, the bare DELETE matches every row with that slug across all sources.
   * Cascades through content_chunks / page_links / chunk_relations via FKs.
   *
   * v0.41.19.0 (CDX-11): single-row primitive used by `purgeDeletedPages`,
   * `gbrain sync` (one path per call), test setup teardown, and the v0.41.19.0
   * sync-delete decompose path (when `deletePages` throws on a 500-row batch,
   * the sync loop falls back to per-slug `deletePage` to log unrecoverable
   * failures to `failedFiles`). `gbrain sync` calls this on EVERY run that
   * sees a deleted file — it is NOT admin-only.
   */
  deletePage(slug: string, opts?: { sourceId?: string }): Promise<void>;
  /**
   * v0.41.19.0 — batch delete: single SQL round-trip via
   * `DELETE FROM pages WHERE slug = ANY($1::text[]) AND source_id = $2
   *  RETURNING slug`. Cascades through content_chunks / page_links (×3) /
   * tags / raw_data / timeline_entries / page_versions via FKs declared in
   * `src/schema.sql`. `files.page_id` and `links.origin_page_id` go SET
   * NULL per their FK definitions.
   *
   * SINGLE-BATCH PRIMITIVE: caller is responsible for chunking the input to
   * `<= DELETE_BATCH_SIZE` entries per call (see
   * `src/core/engine-constants.ts`). Matches the `addLinksBatch` convention
   * — engine assumes well-behaved input, caller owns the slicing.
   *
   * Returns the slugs of rows ACTUALLY DELETED (order undefined). Callers
   * use this to filter their own `pagesAffected` tracking so downstream
   * phases don't waste lookups on phantom slugs (paths that were in the
   * deletion list but had no DB row).
   *
   * ATOMICITY: one statement, one transaction. The whole batch commits or
   * the whole batch rolls back. Coarser than the per-row `deletePage`
   * cadence — a mid-loop abort or transient connection failure can roll
   * back up to `DELETE_BATCH_SIZE - 1` successful deletes from the
   * in-flight batch. `gbrain sync` is idempotent (next run picks them up
   * via git diff); other callers should account for the contract.
   *
   * sourceId is REQUIRED (no `'default'` fallback). This is asymmetric with
   * `deletePage` (which keeps the optional/'default' fallback for back-
   * compat). Filed as v0.42+ TODO to tighten `deletePage` to match once a
   * full caller audit confirms every site threads `sourceId`.
   */
  deletePages(slugs: string[], opts: { sourceId: string }): Promise<string[]>;
  /**
   * v0.41.19.0 — batch path → slug resolution. Single SQL round-trip via
   * `SELECT slug, source_path FROM pages WHERE source_path = ANY($1::text[])
   *  AND source_id = $2`. Returns `Map<path, slug>`; paths NOT in the map
   * have no `source_path` row in the DB and the caller is expected to fall
   * back to `resolveSlugForPath(path)` for the path-derived slug.
   *
   * Mirrors the contract of the single-call `resolveSlugByPathOrSourcePath`
   * helper in `src/commands/sync.ts`, batched. As of v0.41.19.0, that
   * single-call helper is implemented on top of this method (one Map
   * allocation per single-path call; negligible cost; one owner of the SQL
   * + fallback semantics).
   *
   * SINGLE-BATCH PRIMITIVE: caller chunks to `<= DELETE_BATCH_SIZE`.
   *
   * Empty `paths` short-circuits to an empty Map without touching the DB.
   */
  resolveSlugsByPaths(
    paths: string[],
    opts: { sourceId: string },
  ): Promise<Map<string, string>>;
  /**
   * v0.26.5 — set `deleted_at = now()` on a page. Returns the slug if a row
   * was soft-deleted, null if no row matched (already soft-deleted OR not found).
   * Idempotent-as-null. The page stays in the DB and cascade rows (chunks,
   * links) stay intact; the autopilot purge phase hard-deletes after 72h.
   */
  softDeletePage(slug: string, opts?: { sourceId?: string }): Promise<{ slug: string } | null>;
  /**
   * v0.26.5 — clear `deleted_at` on a soft-deleted page. Returns true iff a
   * row was restored. False if the slug is unknown OR the page is not
   * currently soft-deleted (idempotent-as-false).
   */
  restorePage(slug: string, opts?: { sourceId?: string }): Promise<boolean>;
  /**
   * v0.26.5 — hard-delete pages whose `deleted_at` is older than the cutoff.
   * Called by the autopilot purge phase and by the `gbrain pages purge-deleted`
   * CLI escape hatch. Cascades through existing FKs.
   */
  purgeDeletedPages(olderThanHours: number): Promise<{ slugs: string[]; count: number }>;
  /**
   * v0.26.5: by default `listPages` excludes soft-deleted rows. Set
   * `filters.includeDeleted: true` to surface them.
   */
  listPages(filters?: PageFilters): Promise<Page[]>;
  /**
   * Fuzzy slug resolver.
   *
   * v0.41.13 (#1436): `opts.sourceId` scopes the search to a single source;
   * `opts.sourceIds` to an array (federated_read OAuth tier). Pre-fix the
   * resolver was unscoped, so MCP `get_page` with `fuzzy: true` would
   * return candidates from sources the caller couldn't actually access.
   * Source-bleed via fuzzy resolution was the bug class infiniteGameExp
   * reported as #1436. When neither opt is set, the original unscoped
   * behavior is preserved for back-compat with internal callers (the
   * `gbrain query --resolve` CLI path, etc.). Field names match the
   * `sourceScopeOpts(ctx)` helper output so callers can spread directly.
   */
  resolveSlugs(partial: string, opts?: { sourceId?: string; sourceIds?: string[] }): Promise<string[]>;
  /**
   * Returns the slug of every page in the brain. Used by batch commands as a
   * mutation-immune iteration source (alternative to listPages OFFSET pagination,
   * which is unstable when ordering by updated_at and writes are happening).
   *
   * v0.31.8 (D12): `opts.sourceId` scopes the result to a single source
   * (used by the source-aware reconcileLinks path so wikilink resolution
   * doesn't span unrelated sources). When omitted, returns the union of
   * slugs across every source (pre-v0.31.8 behavior).
   */
  getAllSlugs(opts?: { sourceId?: string }): Promise<Set<string>>;

  /**
   * v0.32.8: cross-source page enumeration. Returns one row per (slug,
   * source_id) pair across the brain, ordered by (source_id, slug) for
   * deterministic iteration on large brains. Used by extract-takes,
   * extract, and integrity to replace the `getAllSlugs() → getPage(slug)`
   * N+1 pattern, which silently defaulted to source_id='default' and
   * skipped non-default-source pages.
   *
   * Cheap by design: only slug + source_id, not the full Page row. For
   * loops that need page.compiled_truth / timeline / frontmatter, use
   * `forEachPage` from src/core/engine-iter.ts instead.
   */
  listAllPageRefs(): Promise<Array<{ slug: string; source_id: string }>>;

  /**
   * v0.38 — lean per-source enumeration for hot-loop callers (autopilot
   * dispatch, doctor freshness check). Returns the bare row shape sources-ops
   * needs without the N+1 per-source page_count enrichment in
   * `sources-ops.listSources`.
   *
   * Defaults filter out archived sources. When `localPathOnly` is true,
   * also filters `local_path IS NOT NULL` so the autopilot fan-out doesn't
   * dispatch jobs for pure-DB sources whose handler would fall back to
   * the global sync.repo_path (codex r1 P1-4).
   *
   * `config` is returned as `Record<string, unknown>` — both engines
   * already parse the JSONB at the boundary (Postgres-js returns
   * parsed objects; PGLite returns objects via its built-in JSONB
   * codec). Callers reading `config['last_full_cycle_at']` get a string.
   */
  listAllSources(opts?: {
    includeArchived?: boolean;
    localPathOnly?: boolean;
  }): Promise<SourceRow[]>;

  /**
   * v0.38 — atomic JSONB merge into sources.config. Uses Postgres's
   * `config || $patch::jsonb` operator so concurrent writers don't
   * stomp each other (last write wins, but no read-modify-write race).
   *
   * Primary caller: runCycle's exit hook writes
   *   { last_full_cycle_at: '<ISO>' }
   * after a successful per-source cycle so autopilot's freshness gate
   * can read it next tick. Resolves codex round-1 P0-5 (write site for
   * last_full_cycle_at was unspecified pre-PR).
   *
   * Returns true if a row was updated (source exists), false otherwise
   * (silently no-ops on unknown sourceId — caller decides whether that's
   * a problem).
   */
  updateSourceConfig(sourceId: string, patch: Record<string, unknown>): Promise<boolean>;

  /**
   * v0.37.0 — prefix-stratified page sampling for `gbrain brainstorm` / `gbrain lsd`
   * domain-bank module. Takes a caller-supplied prefix list (cached at the domain-bank
   * layer per D3), returns one page per prefix tiebroken by `connection_count`
   * (LEFT JOIN to page_links, count of inbound links).
   *
   * Stale-bias (D5 / LSD): when `opts.staleBias === true`, ROW_NUMBER() ORDER BY
   * prefers pages with `last_retrieved_at IS NULL` (never retrieved) > pages older
   * than `staleThresholdDays` (default 90) > recently-retrieved.
   *
   * Source scoping (D5, codex r2 #2 fix): `sourceId` (scalar) and `sourceIds`
   * (array, wins over scalar) per the [source-id-canonical-thread] pattern.
   * Both threaded from day 1 even though v0.37.0 callers are CLI-local — D7
   * MCP exposure ships zero-refactor.
   *
   * Soft-deleted pages (deleted_at IS NOT NULL) excluded automatically.
   */
  listPrefixSampledPages(opts: DomainBankSampleOpts): Promise<DomainBankRow[]>;

  /**
   * v0.37.0 — corpus-sampling fallback for `gbrain brainstorm` when prefix-stratified
   * can't fill M (small brain, single-prefix corpus). Random sample of N pages with
   * the same exclusion + source-scope semantics as `listPrefixSampledPages`.
   * Deterministic with `opts.seed` set; falls back to RANDOM() otherwise.
   *
   * Returns the same `DomainBankRow` shape so the orchestrator can union both
   * sources of pages and dedup by slug+source_id.
   */
  listCorpusSample(opts: CorpusSampleOpts): Promise<DomainBankRow[]>;

  // Search
  searchKeyword(query: string, opts?: SearchOpts): Promise<SearchResult[]>;
  searchVector(embedding: Float32Array, opts?: SearchOpts): Promise<SearchResult[]>;
  /**
   * Hydrate embeddings for chunks already known by id. v0.36 (D9):
   * optional `column` parameter selects which content_chunks column to
   * fetch from (default 'embedding'). The dynamic-embedding-column
   * search path hands its resolved column name here so cosineReScore
   * rehydrates in the right embedding space — otherwise vector search
   * against `embedding_voyage` would HNSW-rank against Voyage but
   * rescore against OpenAI vectors (NaN / wrong rankings).
   *
   * The column name MUST be regex-validated by the caller
   * (`resolveEmbeddingColumn` rejects bad names). Engines identifier-quote
   * on interpolation as defense in depth (D12).
   */
  getEmbeddingsByChunkIds(ids: number[], column?: string): Promise<Map<number, Float32Array>>;

  // Chunks
  /**
   * Replace the chunk set for a page. Internal page-id lookup is sourceId-
   * scoped when `opts.sourceId` is given; without it, the schema DEFAULT
   * matches and bare-slug lookup blows up if the same slug exists in
   * multiple sources (Postgres 21000).
   *
   * v0.41.18.0: internal SQL wrapped in `withRetry(BULK_RETRY_OPTS)` against
   * transient connection errors (Supavisor circuit-breaker recovery).
   * Idempotent under replay via single-statement DELETE+INSERT in implicit tx
   * — Postgres rolls back automatically on conn drop, so commit-ambiguous
   * failure replays to the same end state. Callers MUST NOT wrap externally;
   * see {@link BatchOpts} retry-contract block.
   */
  upsertChunks(slug: string, chunks: ChunkInput[], opts?: { sourceId?: string } & BatchOpts): Promise<void>;
  /**
   * Read every chunk for a page. `opts.sourceId` source-scopes the page
   * lookup; without it, multi-source brains return chunks from every
   * same-slug source (importCodeFile uses this for incremental embedding
   * reuse, which would then attach the wrong source's embeddings).
   */
  getChunks(slug: string, opts?: { sourceId?: string }): Promise<Chunk[]>;
  /**
   * Count chunks across the brain where embedding IS NULL.
   * Pre-flight short-circuit for `embed --stale` so a 100%-embedded brain
   * does no further work after a single SELECT count(*) (~50 bytes wire).
   *
   * `opts.sourceId` scopes the count to a single source. When omitted,
   * counts across every source in the brain. Operators running
   * `gbrain embed --stale --source media-corpus` expect only that
   * source's NULLs touched; the caller threads `sourceId` here.
   */
  countStaleChunks(opts?: { sourceId?: string; signature?: string }): Promise<number>;
  /**
   * Sum of LENGTH(chunk_text) over stale chunks — the character-count
   * backlog the embed phase / embed-backfill will process. Sibling of
   * countStaleChunks (same stale predicate + embed_skip filter + optional
   * sourceId scope); used by the `gbrain sync --all` cost preview to price
   * the embedding backlog via estimateCostFromChars. Returns 0 on an
   * empty/fully-embedded brain.
   *
   * v0.41.31: `signature` (optional) widens "stale" to ALSO include chunks
   * whose page `embedding_signature` is set AND differs from the current
   * model signature (a model/dims swap). NULL signature is GRANDFATHERED
   * (never counted) so the post-migration corpus isn't flagged en masse.
   * Omit `signature` for the legacy `embedding IS NULL`-only count.
   */
  sumStaleChunkChars(opts?: { sourceId?: string; signature?: string }): Promise<number>;
  /**
   * Stamp `pages.embedding_signature = signature` for one page. Called after
   * a page's chunks are (re)embedded so a later model swap can detect it as
   * stale. Idempotent. No-op if the page doesn't exist.
   */
  setPageEmbeddingSignature(slug: string, opts: { sourceId?: string; signature: string }): Promise<void>;
  /**
   * NULL out the embeddings (and embedded_at) of every chunk whose page
   * `embedding_signature` is set AND differs from `signature` — i.e. pages
   * embedded under a now-stale model. Returns the chunk count invalidated.