From 397e5cff315fdefc1766b9b2e158ecbe927be5c6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:03:39 -0700 Subject: [PATCH 001/215] feat(db): add persistent cluster schema --- packages/api-core/src/db/migrate.test.ts | 15 ++ packages/api-core/src/db/migrate.ts | 256 +++++++++++++++++++++++ 2 files changed, 271 insertions(+) diff --git a/packages/api-core/src/db/migrate.test.ts b/packages/api-core/src/db/migrate.test.ts index 476cc5a..1ad4115 100644 --- a/packages/api-core/src/db/migrate.test.ts +++ b/packages/api-core/src/db/migrate.test.ts @@ -18,6 +18,16 @@ test('migrate creates core tables', () => { assert.ok(names.includes('documents')); assert.ok(names.includes('document_embeddings')); assert.ok(names.includes('thread_vectors')); + assert.ok(names.includes('blobs')); + assert.ok(names.includes('actors')); + assert.ok(names.includes('thread_revisions')); + assert.ok(names.includes('thread_fingerprints')); + assert.ok(names.includes('thread_key_summaries')); + assert.ok(names.includes('similarity_edge_evidence')); + assert.ok(names.includes('cluster_groups')); + assert.ok(names.includes('cluster_memberships')); + assert.ok(names.includes('cluster_overrides')); + assert.ok(names.includes('cluster_events')); assert.ok(names.includes('cluster_runs')); assert.ok(names.includes('repo_sync_state')); assert.ok(names.includes('repo_pipeline_state')); @@ -29,6 +39,11 @@ test('migrate creates core tables', () => { const summaryColumns = db.prepare('pragma table_info(document_summaries)').all() as Array<{ name: string }>; assert.ok(summaryColumns.map((column) => column.name).includes('prompt_version')); + + const clusterMembershipColumns = db.prepare('pragma table_info(cluster_memberships)').all() as Array<{ name: string }>; + const clusterMembershipColumnNames = clusterMembershipColumns.map((column) => column.name); + assert.ok(clusterMembershipColumnNames.includes('state')); + assert.ok(clusterMembershipColumnNames.includes('removed_by')); } finally { db.close(); } diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index 36c4e3e..09d03ea 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -57,6 +57,104 @@ const migrationStatements = [ ) `, ` + create table if not exists blobs ( + id integer primary key, + sha256 text not null unique, + media_type text not null, + compression text not null default 'none', + size_bytes integer not null, + storage_kind text not null, + storage_path text, + inline_text text, + created_at text not null + ) + `, + ` + create table if not exists actors ( + id integer primary key, + provider text not null, + provider_user_id text not null, + login text not null, + display_name text, + actor_type text, + site_admin integer not null default 0, + raw_json_blob_id integer references blobs(id) on delete set null, + first_seen_at text not null, + last_seen_at text not null, + updated_at text not null, + unique(provider, provider_user_id) + ) + `, + ` + create table if not exists actor_repo_stats ( + repo_id integer not null references repositories(id) on delete cascade, + actor_id integer not null references actors(id) on delete cascade, + opened_issues integer not null default 0, + opened_prs integer not null default 0, + comments integer not null default 0, + merged_prs integer not null default 0, + closed_threads integer not null default 0, + first_activity_at text, + last_activity_at text, + trust_tier text, + primary key (repo_id, actor_id) + ) + `, + ` + create table if not exists thread_revisions ( + id integer primary key, + thread_id integer not null references threads(id) on delete cascade, + source_updated_at text, + content_hash text not null, + title_hash text not null, + body_hash text not null, + labels_hash text not null, + raw_json_blob_id integer references blobs(id) on delete set null, + created_at text not null, + unique(thread_id, content_hash) + ) + `, + ` + create table if not exists thread_code_snapshots ( + id integer primary key, + thread_revision_id integer not null unique references thread_revisions(id) on delete cascade, + base_sha text, + head_sha text, + files_changed integer not null default 0, + additions integer not null default 0, + deletions integer not null default 0, + patch_digest text, + raw_diff_blob_id integer references blobs(id) on delete set null, + created_at text not null + ) + `, + ` + create table if not exists thread_changed_files ( + snapshot_id integer not null references thread_code_snapshots(id) on delete cascade, + path text not null, + status text, + additions integer not null default 0, + deletions integer not null default 0, + previous_path text, + patch_blob_id integer references blobs(id) on delete set null, + patch_hash text, + primary key (snapshot_id, path) + ) + `, + ` + create table if not exists thread_hunk_signatures ( + id integer primary key, + snapshot_id integer not null references thread_code_snapshots(id) on delete cascade, + path text not null, + hunk_hash text not null, + context_hash text not null, + added_token_hash text not null, + removed_token_hash text not null, + created_at text not null, + unique(snapshot_id, path, hunk_hash) + ) + `, + ` create table if not exists documents ( id integer primary key, thread_id integer not null unique references threads(id) on delete cascade, @@ -138,6 +236,56 @@ const migrationStatements = [ ) `, ` + create table if not exists thread_fingerprints ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + algorithm_version text not null, + fingerprint_hash text not null, + fingerprint_slug text not null, + title_tokens_json text not null, + body_token_hash text not null, + linked_refs_json text not null, + file_set_hash text not null, + module_buckets_json text not null, + minhash_signature_blob_id integer references blobs(id) on delete set null, + simhash64 text not null, + winnow_hashes_blob_id integer references blobs(id) on delete set null, + feature_json text not null, + created_at text not null, + unique(thread_revision_id, algorithm_version) + ) + `, + ` + create table if not exists thread_key_summaries ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + summary_kind text not null, + prompt_version text not null, + provider text not null, + model text not null, + input_hash text not null, + output_hash text not null, + output_json_blob_id integer references blobs(id) on delete set null, + key_text text not null, + created_at text not null, + unique(thread_revision_id, summary_kind, prompt_version, provider, model) + ) + `, + ` + create table if not exists pipeline_runs ( + id integer primary key, + repo_id integer references repositories(id) on delete cascade, + run_kind text not null, + algorithm_version text, + config_hash text, + status text not null, + started_at text not null, + finished_at text, + stats_json text, + error_text text + ) + `, + ` create table if not exists repo_pipeline_state ( repo_id integer primary key references repositories(id) on delete cascade, summary_model text not null, @@ -225,6 +373,25 @@ const migrationStatements = [ ) `, ` + create table if not exists similarity_edge_evidence ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + left_thread_id integer not null references threads(id) on delete cascade, + right_thread_id integer not null references threads(id) on delete cascade, + algorithm_version text not null, + config_hash text not null, + score real not null, + tier text not null, + state text not null, + breakdown_json text not null, + first_seen_run_id integer references pipeline_runs(id) on delete set null, + last_seen_run_id integer references pipeline_runs(id) on delete set null, + created_at text not null, + updated_at text not null, + unique(repo_id, left_thread_id, right_thread_id, algorithm_version, config_hash) + ) + `, + ` create table if not exists clusters ( id integer primary key, repo_id integer not null references repositories(id) on delete cascade, @@ -242,6 +409,77 @@ const migrationStatements = [ created_at text not null, primary key (cluster_id, thread_id) ) + `, + ` + create table if not exists cluster_groups ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + stable_key text not null, + stable_slug text not null, + status text not null, + cluster_type text, + representative_thread_id integer references threads(id) on delete set null, + title text, + created_at text not null, + updated_at text not null, + closed_at text, + unique(repo_id, stable_key), + unique(repo_id, stable_slug) + ) + `, + ` + create table if not exists cluster_memberships ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + role text not null, + state text not null, + score_to_representative real, + first_seen_run_id integer references pipeline_runs(id) on delete set null, + last_seen_run_id integer references pipeline_runs(id) on delete set null, + added_by text not null, + removed_by text, + added_reason_json text not null, + removed_reason_json text, + created_at text not null, + updated_at text not null, + removed_at text, + primary key (cluster_id, thread_id) + ) + `, + ` + create table if not exists cluster_overrides ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + action text not null, + actor_id integer references actors(id) on delete set null, + reason text, + created_at text not null, + expires_at text, + unique(cluster_id, thread_id, action) + ) + `, + ` + create table if not exists cluster_events ( + id integer primary key, + cluster_id integer not null references cluster_groups(id) on delete cascade, + run_id integer references pipeline_runs(id) on delete set null, + event_type text not null, + actor_kind text not null, + actor_id integer references actors(id) on delete set null, + payload_json text not null, + created_at text not null + ) + `, + ` + create table if not exists cluster_aliases ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + alias_slug text not null, + reason text not null, + created_at text not null, + primary key (cluster_id, alias_slug) + ) ` ]; @@ -292,9 +530,27 @@ export function migrate(db: SqliteDatabase): void { } db.exec('create index if not exists idx_threads_repo_number on threads(repo_id, number)'); + db.exec('create index if not exists idx_blobs_sha256 on blobs(sha256)'); + db.exec('create index if not exists idx_actors_provider_login on actors(provider, login)'); + db.exec('create index if not exists idx_actor_repo_stats_actor on actor_repo_stats(actor_id)'); + db.exec('create index if not exists idx_thread_revisions_thread_created on thread_revisions(thread_id, created_at)'); + db.exec('create index if not exists idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash)'); + db.exec('create index if not exists idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug)'); + db.exec('create index if not exists idx_thread_code_snapshots_revision on thread_code_snapshots(thread_revision_id)'); + db.exec('create index if not exists idx_thread_changed_files_path on thread_changed_files(path)'); + db.exec('create index if not exists idx_thread_hunk_signatures_hash on thread_hunk_signatures(hunk_hash)'); + db.exec('create index if not exists idx_thread_key_summaries_revision_kind on thread_key_summaries(thread_revision_id, summary_kind)'); db.exec('create index if not exists idx_document_summaries_thread_model on document_summaries(thread_id, model)'); db.exec('create index if not exists idx_thread_vectors_basis_model on thread_vectors(basis, model)'); + db.exec('create index if not exists idx_pipeline_runs_repo_kind_id on pipeline_runs(repo_id, run_kind, id)'); db.exec('create index if not exists idx_cluster_runs_repo_status_id on cluster_runs(repo_id, status, id)'); db.exec('create index if not exists idx_clusters_repo_run_id on clusters(repo_id, cluster_run_id, id)'); db.exec('create index if not exists idx_cluster_members_thread_cluster on cluster_members(thread_id, cluster_id)'); + db.exec('create index if not exists idx_similarity_edge_evidence_repo_pair on similarity_edge_evidence(repo_id, left_thread_id, right_thread_id)'); + db.exec('create index if not exists idx_similarity_edge_evidence_repo_state_score on similarity_edge_evidence(repo_id, state, tier, score)'); + db.exec('create index if not exists idx_cluster_groups_repo_status on cluster_groups(repo_id, status)'); + db.exec('create index if not exists idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state)'); + db.exec('create index if not exists idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state)'); + db.exec('create index if not exists idx_cluster_overrides_repo_target on cluster_overrides(repo_id, cluster_id, thread_id, action)'); + db.exec('create index if not exists idx_cluster_events_cluster_created on cluster_events(cluster_id, created_at)'); } From 734abd1dacb28705472ad2bdd86a624415075289 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:04:10 -0700 Subject: [PATCH 002/215] feat(cluster): add deterministic human keys --- .../api-core/src/cluster/human-key.test.ts | 22 +++++++++ packages/api-core/src/cluster/human-key.ts | 45 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 packages/api-core/src/cluster/human-key.test.ts create mode 100644 packages/api-core/src/cluster/human-key.ts diff --git a/packages/api-core/src/cluster/human-key.test.ts b/packages/api-core/src/cluster/human-key.test.ts new file mode 100644 index 0000000..bde1328 --- /dev/null +++ b/packages/api-core/src/cluster/human-key.test.ts @@ -0,0 +1,22 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { humanKeyForValue, humanKeyFromHash, stableHash } from './human-key.js'; + +test('humanKeyForValue returns a stable operator slug and machine hash', () => { + const first = humanKeyForValue('repo:openclaw/openclaw thread:42 title:download stalls'); + const second = humanKeyForValue('repo:openclaw/openclaw thread:42 title:download stalls'); + + assert.equal(first.hash, second.hash); + assert.equal(first.slug, second.slug); + assert.match(first.hash, /^[a-f0-9]{64}$/); + assert.match(first.slug, /^[a-z]+-[a-z]+-[a-z]+-[a-z0-9]{4}$/); +}); + +test('humanKeyFromHash rejects non-SHA256 input', () => { + assert.throws(() => humanKeyFromHash('not-a-hash'), /SHA-256/); +}); + +test('stableHash changes when source material changes', () => { + assert.notEqual(stableHash('thread a'), stableHash('thread b')); +}); diff --git a/packages/api-core/src/cluster/human-key.ts b/packages/api-core/src/cluster/human-key.ts new file mode 100644 index 0000000..551d9bb --- /dev/null +++ b/packages/api-core/src/cluster/human-key.ts @@ -0,0 +1,45 @@ +import crypto from 'node:crypto'; + +const WORDS = [ + 'anchor', 'apex', 'atlas', 'beacon', 'binary', 'bridge', 'cable', 'canvas', + 'cipher', 'clear', 'cloud', 'cobalt', 'comet', 'copper', 'delta', 'drift', + 'ember', 'engine', 'falcon', 'fiber', 'field', 'filter', 'focus', 'forge', + 'frame', 'garden', 'glide', 'harbor', 'helix', 'hollow', 'index', 'island', + 'kernel', 'keystone', 'lantern', 'lattice', 'ledger', 'level', 'maple', 'matrix', + 'meadow', 'merge', 'mirror', 'module', 'needle', 'noble', 'nova', 'orbit', + 'origin', 'parcel', 'patch', 'pillar', 'pixel', 'plume', 'portal', 'pulse', + 'quartz', 'quiet', 'radar', 'raven', 'relay', 'render', 'ripple', 'river', + 'signal', 'silver', 'sketch', 'socket', 'solar', 'span', 'spiral', 'spring', + 'stable', 'stone', 'summit', 'switch', 'thread', 'timber', 'token', 'trace', + 'union', 'vector', 'velvet', 'vertex', 'vessel', 'violet', 'vista', 'wave', + 'willow', 'window', 'yellow', 'zenith', +] as const; + +export type HumanKey = { + hash: string; + slug: string; + checksum: string; +}; + +export function stableHash(value: string): string { + return crypto.createHash('sha256').update(value).digest('hex'); +} + +export function humanKeyFromHash(hash: string): HumanKey { + const normalized = hash.toLowerCase(); + if (!/^[a-f0-9]{64}$/.test(normalized)) { + throw new Error('Human key hash must be a SHA-256 hex digest'); + } + + const indexes = [0, 2, 4].map((offset) => Number.parseInt(normalized.slice(offset, offset + 2), 16) % WORDS.length); + const checksum = Number.parseInt(normalized.slice(6, 12), 16).toString(36).padStart(4, '0').slice(-4); + return { + hash: normalized, + slug: `${WORDS[indexes[0]]}-${WORDS[indexes[1]]}-${WORDS[indexes[2]]}-${checksum}`, + checksum, + }; +} + +export function humanKeyForValue(value: string): HumanKey { + return humanKeyFromHash(stableHash(value)); +} From a3812985209de88a87fec6ac67e6c44775ec15ac Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:04:50 -0700 Subject: [PATCH 003/215] feat(cluster): add deterministic fingerprint algorithms --- .../cluster/fingerprint-algorithms.test.ts | 47 ++++++++ .../src/cluster/fingerprint-algorithms.ts | 107 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 packages/api-core/src/cluster/fingerprint-algorithms.test.ts create mode 100644 packages/api-core/src/cluster/fingerprint-algorithms.ts diff --git a/packages/api-core/src/cluster/fingerprint-algorithms.test.ts b/packages/api-core/src/cluster/fingerprint-algorithms.test.ts new file mode 100644 index 0000000..67b8973 --- /dev/null +++ b/packages/api-core/src/cluster/fingerprint-algorithms.test.ts @@ -0,0 +1,47 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { + buildShingles, + jaccard, + minhashSignature, + minhashSimilarity, + simhash64, + simhashSimilarity, + winnowingFingerprints, +} from './fingerprint-algorithms.js'; + +test('buildShingles creates stable token shingles', () => { + assert.deepEqual(buildShingles(['a', 'b', 'c', 'd'], 3), ['a b c', 'b c d']); + assert.deepEqual(buildShingles(['a', 'b'], 3), ['a b']); +}); + +test('minhash signatures are deterministic and comparable', () => { + const first = minhashSignature(['cache', 'key', 'collision', 'fix'], { permutations: 16, shingleSize: 2 }); + const second = minhashSignature(['cache', 'key', 'collision', 'fix'], { permutations: 16, shingleSize: 2 }); + const different = minhashSignature(['ui', 'button', 'color', 'fix'], { permutations: 16, shingleSize: 2 }); + + assert.deepEqual(first, second); + assert.equal(minhashSimilarity(first, second), 1); + assert.ok(minhashSimilarity(first, different) < 1); +}); + +test('simhash similarity reflects token distance', () => { + const first = simhash64(['download', 'retry', 'timeout', 'hangs']); + const second = simhash64(['download', 'retry', 'timeout', 'stalls']); + const different = simhash64(['theme', 'button', 'contrast', 'color']); + + assert.ok(simhashSimilarity(first, second) > simhashSimilarity(first, different)); +}); + +test('winnowing fingerprints are deterministic selected hashes', () => { + const first = winnowingFingerprints(['a', 'b', 'c', 'd', 'e', 'f'], { kgram: 3, window: 2 }); + const second = winnowingFingerprints(['a', 'b', 'c', 'd', 'e', 'f'], { kgram: 3, window: 2 }); + + assert.deepEqual(first, second); + assert.ok(first.length > 0); +}); + +test('jaccard scores set overlap', () => { + assert.equal(jaccard(new Set(['a', 'b']), new Set(['b', 'c'])), 1 / 3); +}); diff --git a/packages/api-core/src/cluster/fingerprint-algorithms.ts b/packages/api-core/src/cluster/fingerprint-algorithms.ts new file mode 100644 index 0000000..1757020 --- /dev/null +++ b/packages/api-core/src/cluster/fingerprint-algorithms.ts @@ -0,0 +1,107 @@ +import crypto from 'node:crypto'; + +const MASK_64 = (1n << 64n) - 1n; + +function stableHash64(value: string, seed = 0): bigint { + const digest = crypto.createHash('sha256').update(`${seed}:${value}`).digest(); + return digest.readBigUInt64BE(0); +} + +export function buildShingles(tokens: string[], size = 3): string[] { + const normalizedSize = Math.max(1, Math.trunc(size)); + if (tokens.length === 0) return []; + if (tokens.length < normalizedSize) return [tokens.join(' ')]; + const shingles: string[] = []; + for (let index = 0; index <= tokens.length - normalizedSize; index += 1) { + shingles.push(tokens.slice(index, index + normalizedSize).join(' ')); + } + return Array.from(new Set(shingles)); +} + +export function minhashSignature(tokens: string[], params: { permutations?: number; shingleSize?: number } = {}): string[] { + const permutations = Math.max(1, Math.trunc(params.permutations ?? 64)); + const shingles = buildShingles(tokens, params.shingleSize ?? 3); + if (shingles.length === 0) { + return Array.from({ length: permutations }, () => '0'); + } + + const signature: string[] = []; + for (let seed = 0; seed < permutations; seed += 1) { + let minValue: bigint | null = null; + for (const shingle of shingles) { + const value = stableHash64(shingle, seed); + if (minValue === null || value < minValue) { + minValue = value; + } + } + signature.push((minValue ?? 0n).toString(16).padStart(16, '0')); + } + return signature; +} + +export function minhashSimilarity(left: string[], right: string[]): number { + if (left.length === 0 || left.length !== right.length) return 0; + let matches = 0; + for (let index = 0; index < left.length; index += 1) { + if (left[index] === right[index]) matches += 1; + } + return matches / left.length; +} + +export function simhash64(tokens: string[]): string { + const weights = Array.from({ length: 64 }, () => 0); + for (const token of tokens) { + const hash = stableHash64(token) & MASK_64; + for (let bit = 0; bit < 64; bit += 1) { + weights[bit] += ((hash >> BigInt(bit)) & 1n) === 1n ? 1 : -1; + } + } + + let value = 0n; + for (let bit = 0; bit < 64; bit += 1) { + if (weights[bit] >= 0) { + value |= 1n << BigInt(bit); + } + } + return value.toString(16).padStart(16, '0'); +} + +export function simhashSimilarity(leftHex: string, rightHex: string): number { + const left = BigInt(`0x${leftHex}`); + const right = BigInt(`0x${rightHex}`); + let value = left ^ right; + let distance = 0; + while (value > 0n) { + distance += Number(value & 1n); + value >>= 1n; + } + return Math.max(0, 1 - distance / 64); +} + +export function winnowingFingerprints(tokens: string[], params: { kgram?: number; window?: number } = {}): string[] { + const kgram = Math.max(1, Math.trunc(params.kgram ?? 5)); + const window = Math.max(1, Math.trunc(params.window ?? 4)); + const grams = buildShingles(tokens, kgram); + if (grams.length === 0) return []; + const hashes = grams.map((gram) => stableHash64(gram).toString(16).padStart(16, '0')); + if (hashes.length <= window) { + return [hashes.reduce((min, value) => (value < min ? value : min), hashes[0])]; + } + + const selected = new Set(); + for (let start = 0; start <= hashes.length - window; start += 1) { + const slice = hashes.slice(start, start + window); + selected.add(slice.reduce((min, value) => (value < min ? value : min), slice[0])); + } + return Array.from(selected).sort(); +} + +export function jaccard(left: Set, right: Set): number { + if (left.size === 0 && right.size === 0) return 0; + let intersection = 0; + for (const value of left) { + if (right.has(value)) intersection += 1; + } + const union = left.size + right.size - intersection; + return union === 0 ? 0 : intersection / union; +} From 58c8b8d3f3447e60c7c52cd5a43fd4976b3ae4c9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:06:25 -0700 Subject: [PATCH 004/215] feat(cluster): build deterministic thread fingerprints --- .../src/cluster/thread-fingerprint.test.ts | 74 +++++++ .../src/cluster/thread-fingerprint.ts | 185 ++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 packages/api-core/src/cluster/thread-fingerprint.test.ts create mode 100644 packages/api-core/src/cluster/thread-fingerprint.ts diff --git a/packages/api-core/src/cluster/thread-fingerprint.test.ts b/packages/api-core/src/cluster/thread-fingerprint.test.ts new file mode 100644 index 0000000..3034931 --- /dev/null +++ b/packages/api-core/src/cluster/thread-fingerprint.test.ts @@ -0,0 +1,74 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { + buildDeterministicThreadFingerprint, + compareDeterministicFingerprints, + moduleBucket, + tokenize, +} from './thread-fingerprint.js'; + +test('tokenize normalizes text for deterministic fingerprints', () => { + assert.deepEqual(tokenize('Fix: Download retry hangs!'), ['fix', 'download', 'retry', 'hangs']); +}); + +test('moduleBucket groups files by stable path prefix', () => { + assert.equal(moduleBucket('packages/api-core/src/service.ts'), 'packages/api-core/*'); + assert.equal(moduleBucket('README.md'), 'README.md/*'); +}); + +test('buildDeterministicThreadFingerprint is stable without model inputs', () => { + const input = { + threadId: 1, + number: 42, + kind: 'issue' as const, + title: 'Download retry hangs', + body: 'The transfer retries forever after a timeout.', + labels: ['bug'], + linkedRefs: ['42'], + }; + + const first = buildDeterministicThreadFingerprint(input); + const second = buildDeterministicThreadFingerprint(input); + + assert.equal(first.fingerprintHash, second.fingerprintHash); + assert.equal(first.fingerprintSlug, second.fingerprintSlug); + assert.equal(first.algorithmVersion, 'thread-fingerprint-v1'); + assert.ok(first.minhashSignature.length > 0); +}); + +test('compareDeterministicFingerprints scores deterministic overlap features', () => { + const first = buildDeterministicThreadFingerprint({ + threadId: 1, + number: 42, + kind: 'pull_request', + title: 'Fix downloader retry loop', + body: 'Stops retrying forever after transfer timeout.', + labels: ['bug'], + changedFiles: ['packages/api-core/src/download.ts'], + linkedRefs: ['100'], + hunkSignatures: ['h1'], + patchIds: ['p1'], + }); + const second = buildDeterministicThreadFingerprint({ + threadId: 2, + number: 43, + kind: 'pull_request', + title: 'Fix downloader retry loop', + body: 'Stops retrying forever after transfer timeout.', + labels: ['bug'], + changedFiles: ['packages/api-core/src/download.ts'], + linkedRefs: ['100'], + hunkSignatures: ['h1'], + patchIds: ['p1'], + }); + + const breakdown = compareDeterministicFingerprints(first, second); + + assert.equal(breakdown.linkedRefOverlap, 1); + assert.equal(breakdown.fileOverlap, 1); + assert.equal(breakdown.hunkOverlap, 1); + assert.equal(breakdown.patchOverlap, 1); + assert.ok(Math.abs(breakdown.structure - 1) < 1e-9); + assert.equal(breakdown.lineage, 1); +}); diff --git a/packages/api-core/src/cluster/thread-fingerprint.ts b/packages/api-core/src/cluster/thread-fingerprint.ts new file mode 100644 index 0000000..9958e34 --- /dev/null +++ b/packages/api-core/src/cluster/thread-fingerprint.ts @@ -0,0 +1,185 @@ +import { buildShingles, jaccard, minhashSignature, minhashSimilarity, simhash64, simhashSimilarity, winnowingFingerprints } from './fingerprint-algorithms.js'; +import { humanKeyForValue } from './human-key.js'; + +const TOKEN_RE = /[a-zA-Z0-9_]+/g; +const TITLE_STOPWORDS = new Set([ + 'fix', + 'bug', + 'feat', + 'feature', + 'docs', + 'chore', + 'refactor', + 'test', + 'add', + 'update', + 'improve', + 'support', + 'allow', + 'enable', + 'with', + 'from', + 'when', + 'after', + 'before', + 'into', + 'for', + 'the', + 'and', + 'or', +]); + +export type FingerprintInput = { + threadId: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels: string[]; + changedFiles?: string[]; + linkedRefs?: string[]; + hunkSignatures?: string[]; + patchIds?: string[]; +}; + +export type DeterministicThreadFingerprint = { + algorithmVersion: string; + fingerprintHash: string; + fingerprintSlug: string; + titleTokens: string[]; + salientTitleTokens: string[]; + bodyTokens: string[]; + linkedRefs: string[]; + moduleBuckets: string[]; + changedFiles: string[]; + hunkSignatures: string[]; + patchIds: string[]; + minhashSignature: string[]; + simhash64: string; + winnowHashes: string[]; +}; + +export type FingerprintPairBreakdown = { + linkedRefOverlap: number; + titleOverlap: number; + tokenMinhash: number; + tokenSimhash: number; + tokenWinnow: number; + fileOverlap: number; + moduleOverlap: number; + hunkOverlap: number; + patchOverlap: number; + structure: number; + lineage: number; +}; + +export const THREAD_FINGERPRINT_ALGORITHM_VERSION = 'thread-fingerprint-v1'; + +export function tokenize(value: string | null | undefined): string[] { + return Array.from(value?.toLowerCase().matchAll(TOKEN_RE) ?? []).map((match) => match[0]); +} + +export function moduleBucket(path: string, depth = 2): string { + const parts = path.split('/').filter(Boolean); + if (parts.length === 0) return 'root/*'; + return `${parts.slice(0, depth).join('/')}/*`; +} + +function uniqueSorted(values: string[]): string[] { + return Array.from(new Set(values.filter(Boolean))).sort(); +} + +function overlapMin(left: Set, right: Set): number { + if (left.size === 0 || right.size === 0) return 0; + let intersection = 0; + for (const value of left) { + if (right.has(value)) intersection += 1; + } + return intersection / Math.min(left.size, right.size); +} + +export function buildDeterministicThreadFingerprint(input: FingerprintInput): DeterministicThreadFingerprint { + const titleTokens = tokenize(input.title); + const bodyTokens = tokenize(input.body); + const changedFiles = uniqueSorted(input.changedFiles ?? []); + const linkedRefs = uniqueSorted(input.linkedRefs ?? []); + const hunkSignatures = uniqueSorted(input.hunkSignatures ?? []); + const patchIds = uniqueSorted(input.patchIds ?? []); + const moduleBuckets = uniqueSorted(changedFiles.map((path) => moduleBucket(path))); + const salientTitleTokens = uniqueSorted(titleTokens.filter((token) => token.length >= 4 && !TITLE_STOPWORDS.has(token))); + const materialTokens = [ + ...titleTokens, + ...bodyTokens, + ...linkedRefs, + ...changedFiles, + ...hunkSignatures, + ...patchIds, + ]; + const minhash = minhashSignature(materialTokens); + const simhash = simhash64(materialTokens); + const winnow = winnowingFingerprints(materialTokens); + const hashMaterial = JSON.stringify({ + algorithmVersion: THREAD_FINGERPRINT_ALGORITHM_VERSION, + kind: input.kind, + titleTokens, + bodyTokens, + labels: uniqueSorted(input.labels), + linkedRefs, + changedFiles, + hunkSignatures, + patchIds, + minhash, + simhash, + winnow, + }); + const key = humanKeyForValue(hashMaterial); + + return { + algorithmVersion: THREAD_FINGERPRINT_ALGORITHM_VERSION, + fingerprintHash: key.hash, + fingerprintSlug: key.slug, + titleTokens, + salientTitleTokens, + bodyTokens, + linkedRefs, + moduleBuckets, + changedFiles, + hunkSignatures, + patchIds, + minhashSignature: minhash, + simhash64: simhash, + winnowHashes: winnow, + }; +} + +export function compareDeterministicFingerprints( + left: DeterministicThreadFingerprint, + right: DeterministicThreadFingerprint, +): FingerprintPairBreakdown { + const linkedRefOverlap = Math.max( + jaccard(new Set(left.linkedRefs), new Set(right.linkedRefs)), + overlapMin(new Set(left.linkedRefs), new Set(right.linkedRefs)), + ); + const titleOverlap = jaccard(new Set(left.salientTitleTokens), new Set(right.salientTitleTokens)); + const fileOverlap = jaccard(new Set(left.changedFiles), new Set(right.changedFiles)); + const moduleOverlap = jaccard(new Set(left.moduleBuckets), new Set(right.moduleBuckets)); + const hunkOverlap = jaccard(new Set(left.hunkSignatures), new Set(right.hunkSignatures)); + const patchOverlap = overlapMin(new Set(left.patchIds), new Set(right.patchIds)); + return { + linkedRefOverlap, + titleOverlap, + tokenMinhash: minhashSimilarity(left.minhashSignature, right.minhashSignature), + tokenSimhash: simhashSimilarity(left.simhash64, right.simhash64), + tokenWinnow: jaccard(new Set(left.winnowHashes), new Set(right.winnowHashes)), + fileOverlap, + moduleOverlap, + hunkOverlap, + patchOverlap, + structure: 0.7 * hunkOverlap + 0.2 * fileOverlap + 0.1 * moduleOverlap, + lineage: patchOverlap, + }; +} + +export function tokenShinglesForDebug(tokens: string[], size = 3): string[] { + return buildShingles(tokens, size); +} From 95ed7109912b86bfb2816076841fef8acc153918 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:06:59 -0700 Subject: [PATCH 005/215] feat(cluster): score deterministic similarity evidence --- .../src/cluster/evidence-score.test.ts | 73 +++++++++++++++ .../api-core/src/cluster/evidence-score.ts | 92 +++++++++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 packages/api-core/src/cluster/evidence-score.test.ts create mode 100644 packages/api-core/src/cluster/evidence-score.ts diff --git a/packages/api-core/src/cluster/evidence-score.test.ts b/packages/api-core/src/cluster/evidence-score.test.ts new file mode 100644 index 0000000..b7e4151 --- /dev/null +++ b/packages/api-core/src/cluster/evidence-score.test.ts @@ -0,0 +1,73 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { scoreSimilarityEvidence } from './evidence-score.js'; +import { buildDeterministicThreadFingerprint } from './thread-fingerprint.js'; + +function fp(params: { + id: number; + title: string; + body?: string; + files?: string[]; + refs?: string[]; + hunks?: string[]; + patches?: string[]; +}) { + return buildDeterministicThreadFingerprint({ + threadId: params.id, + number: params.id, + kind: 'pull_request', + title: params.title, + body: params.body ?? '', + labels: [], + changedFiles: params.files ?? [], + linkedRefs: params.refs ?? [], + hunkSignatures: params.hunks ?? [], + patchIds: params.patches ?? [], + }); +} + +test('scoreSimilarityEvidence emits strong evidence from deterministic code overlap', () => { + const left = fp({ + id: 1, + title: 'Fix cache key collision', + files: ['packages/api-core/src/cache.ts'], + refs: ['123'], + hunks: ['h1'], + patches: ['p1'], + }); + const right = fp({ + id: 2, + title: 'Fix cache key collision', + files: ['packages/api-core/src/cache.ts'], + refs: ['123'], + hunks: ['h1'], + patches: ['p1'], + }); + + const evidence = scoreSimilarityEvidence(left, right); + + assert.equal(evidence.tier, 'strong'); + assert.ok(evidence.score > 0.7); + assert.equal(evidence.embeddingSimilarity, null); + assert.equal(evidence.llmKeySimilarity, null); +}); + +test('scoreSimilarityEvidence can improve confidence with optional enrichment', () => { + const left = fp({ id: 1, title: 'Fix flaky download retry', body: 'Retries forever after timeout.' }); + const right = fp({ id: 2, title: 'Handle stalled download timeout', body: 'Retry loop never exits.' }); + + const base = scoreSimilarityEvidence(left, right); + const enriched = scoreSimilarityEvidence(left, right, { embeddingSimilarity: 0.95, llmKeySimilarity: 0.95 }); + + assert.ok(enriched.score > base.score); +}); + +test('scoreSimilarityEvidence rejects unrelated deterministic fingerprints', () => { + const left = fp({ id: 1, title: 'Fix cache key collision', files: ['packages/api-core/src/cache.ts'] }); + const right = fp({ id: 2, title: 'Update docs typography', files: ['docs/design.md'] }); + + const evidence = scoreSimilarityEvidence(left, right); + + assert.equal(evidence.tier, 'none'); +}); diff --git a/packages/api-core/src/cluster/evidence-score.ts b/packages/api-core/src/cluster/evidence-score.ts new file mode 100644 index 0000000..7e4d49f --- /dev/null +++ b/packages/api-core/src/cluster/evidence-score.ts @@ -0,0 +1,92 @@ +import { compareDeterministicFingerprints, type DeterministicThreadFingerprint, type FingerprintPairBreakdown } from './thread-fingerprint.js'; + +export type EvidenceTier = 'strong' | 'weak' | 'none'; + +export type OptionalEnrichmentEvidence = { + embeddingSimilarity?: number | null; + llmKeySimilarity?: number | null; +}; + +export type EvidenceScoreConfig = { + minScore: number; + strongScore: number; + weightLineage: number; + weightStructure: number; + weightLinkedRefs: number; + weightTitle: number; + weightMinhash: number; + weightSimhash: number; + weightWinnow: number; + weightEmbedding: number; + weightLlmKey: number; +}; + +export type SimilarityEvidenceBreakdown = FingerprintPairBreakdown & { + embeddingSimilarity: number | null; + llmKeySimilarity: number | null; + score: number; + tier: EvidenceTier; +}; + +export const DEFAULT_EVIDENCE_SCORE_CONFIG: EvidenceScoreConfig = { + minScore: 0.48, + strongScore: 0.74, + weightLineage: 0.25, + weightStructure: 0.22, + weightLinkedRefs: 0.16, + weightTitle: 0.08, + weightMinhash: 0.10, + weightSimhash: 0.08, + weightWinnow: 0.07, + weightEmbedding: 0.02, + weightLlmKey: 0.02, +}; + +function clamp01(value: number | null | undefined): number { + if (value === null || value === undefined || Number.isNaN(value)) return 0; + return Math.max(0, Math.min(1, value)); +} + +export function scoreSimilarityEvidence( + left: DeterministicThreadFingerprint, + right: DeterministicThreadFingerprint, + enrichment: OptionalEnrichmentEvidence = {}, + config: EvidenceScoreConfig = DEFAULT_EVIDENCE_SCORE_CONFIG, +): SimilarityEvidenceBreakdown { + const base = compareDeterministicFingerprints(left, right); + const embeddingSimilarity = enrichment.embeddingSimilarity ?? null; + const llmKeySimilarity = enrichment.llmKeySimilarity ?? null; + const score = + config.weightLineage * base.lineage + + config.weightStructure * base.structure + + config.weightLinkedRefs * base.linkedRefOverlap + + config.weightTitle * base.titleOverlap + + config.weightMinhash * base.tokenMinhash + + config.weightSimhash * base.tokenSimhash + + config.weightWinnow * base.tokenWinnow + + config.weightEmbedding * clamp01(embeddingSimilarity) + + config.weightLlmKey * clamp01(llmKeySimilarity); + + let tier: EvidenceTier = 'none'; + if ( + base.lineage >= 0.8 || + (base.linkedRefOverlap >= 0.8 && (base.structure >= 0.25 || base.titleOverlap >= 0.25)) || + score >= config.strongScore + ) { + tier = 'strong'; + } else if ( + score >= config.minScore || + (base.structure >= 0.5 && base.tokenSimhash >= 0.55) || + (base.linkedRefOverlap >= 0.5 && base.tokenMinhash >= 0.25) + ) { + tier = 'weak'; + } + + return { + ...base, + embeddingSimilarity, + llmKeySimilarity, + score, + tier, + }; +} From cceb9e8bfd2c2a7536f4328efa6e3838326c17ad Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:07:56 -0700 Subject: [PATCH 006/215] feat(cluster): add membership governance rules --- .../api-core/src/cluster/governance.test.ts | 128 +++++++++++++ packages/api-core/src/cluster/governance.ts | 175 ++++++++++++++++++ 2 files changed, 303 insertions(+) create mode 100644 packages/api-core/src/cluster/governance.test.ts create mode 100644 packages/api-core/src/cluster/governance.ts diff --git a/packages/api-core/src/cluster/governance.test.ts b/packages/api-core/src/cluster/governance.test.ts new file mode 100644 index 0000000..ae1868b --- /dev/null +++ b/packages/api-core/src/cluster/governance.test.ts @@ -0,0 +1,128 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { applyClusterGovernance, type ClusterMembership, type DurableCluster } from './governance.js'; + +test('applyClusterGovernance creates a stable cluster for new evidence', () => { + const result = applyClusterGovernance({ + repoId: 1, + existingClusters: [], + existingMemberships: [], + overrides: [], + proposals: [ + { + representativeThreadId: 10, + memberThreadIds: [10, 11], + scoresToRepresentative: new Map([[10, 1], [11, 0.82]]), + }, + ], + }); + + assert.equal(result.clusters.length, 1); + assert.match(result.clusters[0].stableSlug, /^[a-z]+-[a-z]+-[a-z]+-[a-z0-9]{4}$/); + assert.deepEqual(result.clusters[0].memberThreadIds, [10, 11]); + assert.equal(result.events[0].eventType, 'create_cluster'); +}); + +test('applyClusterGovernance reuses existing cluster identity across syncs', () => { + const existingCluster: DurableCluster = { + id: 'focus-bridge-signal-9m', + repoId: 1, + stableKey: 'hash', + stableSlug: 'focus-bridge-signal-9m', + representativeThreadId: 10, + memberThreadIds: [10, 11], + }; + const existingMemberships: ClusterMembership[] = [ + { + clusterId: existingCluster.id, + threadId: 10, + role: 'canonical', + state: 'active', + scoreToRepresentative: 1, + addedBy: 'algo', + removedBy: null, + }, + { + clusterId: existingCluster.id, + threadId: 11, + role: 'related', + state: 'active', + scoreToRepresentative: 0.82, + addedBy: 'algo', + removedBy: null, + }, + ]; + + const result = applyClusterGovernance({ + repoId: 1, + existingClusters: [existingCluster], + existingMemberships, + overrides: [], + proposals: [ + { + representativeThreadId: 10, + memberThreadIds: [10, 11, 12], + scoresToRepresentative: new Map([[10, 1], [11, 0.85], [12, 0.8]]), + }, + ], + }); + + assert.equal(result.clusters[0].id, existingCluster.id); + assert.deepEqual(result.clusters[0].memberThreadIds, [10, 11, 12]); +}); + +test('applyClusterGovernance blocks automatic re-add after maintainer exclusion', () => { + const existingCluster: DurableCluster = { + id: 'focus-bridge-signal-9m', + repoId: 1, + stableKey: 'hash', + stableSlug: 'focus-bridge-signal-9m', + representativeThreadId: 10, + memberThreadIds: [10], + }; + + const result = applyClusterGovernance({ + repoId: 1, + existingClusters: [existingCluster], + existingMemberships: [ + { + clusterId: existingCluster.id, + threadId: 10, + role: 'canonical', + state: 'active', + scoreToRepresentative: 1, + addedBy: 'algo', + removedBy: null, + }, + { + clusterId: existingCluster.id, + threadId: 11, + role: 'related', + state: 'removed_by_user', + scoreToRepresentative: 0.82, + addedBy: 'algo', + removedBy: 'user', + }, + ], + overrides: [ + { + clusterId: existingCluster.id, + threadId: 11, + action: 'exclude', + }, + ], + proposals: [ + { + representativeThreadId: 10, + memberThreadIds: [10, 11], + scoresToRepresentative: new Map([[10, 1], [11, 0.95]]), + }, + ], + }); + + const membership = result.memberships.find((item) => item.threadId === 11); + assert.equal(membership?.state, 'blocked_by_override'); + assert.equal(membership?.removedBy, 'user'); + assert.deepEqual(result.clusters[0].memberThreadIds, [10]); +}); diff --git a/packages/api-core/src/cluster/governance.ts b/packages/api-core/src/cluster/governance.ts new file mode 100644 index 0000000..94840ad --- /dev/null +++ b/packages/api-core/src/cluster/governance.ts @@ -0,0 +1,175 @@ +import { humanKeyForValue } from './human-key.js'; + +export type ClusterMembershipState = 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; +export type ClusterMembershipRole = 'canonical' | 'duplicate' | 'related'; +export type ClusterOverrideAction = 'exclude' | 'force_include' | 'force_canonical'; +export type ClusterEventType = 'create_cluster' | 'add_member' | 'block_member' | 'keep_member' | 'remove_member'; + +export type DurableCluster = { + id: string; + repoId: number; + stableKey: string; + stableSlug: string; + representativeThreadId: number | null; + memberThreadIds: number[]; +}; + +export type ClusterMembership = { + clusterId: string; + threadId: number; + role: ClusterMembershipRole; + state: ClusterMembershipState; + scoreToRepresentative: number | null; + addedBy: 'algo' | 'user' | 'import'; + removedBy: 'algo' | 'user' | null; +}; + +export type ClusterOverride = { + clusterId: string; + threadId: number; + action: ClusterOverrideAction; +}; + +export type ClusterProposal = { + representativeThreadId: number; + memberThreadIds: number[]; + scoresToRepresentative: Map; +}; + +export type ClusterGovernanceInput = { + repoId: number; + existingClusters: DurableCluster[]; + existingMemberships: ClusterMembership[]; + overrides: ClusterOverride[]; + proposals: ClusterProposal[]; +}; + +export type ClusterGovernanceEvent = { + clusterId: string; + eventType: ClusterEventType; + threadId: number | null; + payload: Record; +}; + +export type ClusterGovernanceResult = { + clusters: DurableCluster[]; + memberships: ClusterMembership[]; + events: ClusterGovernanceEvent[]; +}; + +function stableClusterIdentity(repoId: number, representativeThreadId: number): { key: string; slug: string } { + const key = humanKeyForValue(`cluster:${repoId}:${representativeThreadId}`); + return { key: key.hash, slug: key.slug }; +} + +function membershipKey(clusterId: string, threadId: number): string { + return `${clusterId}:${threadId}`; +} + +function findReusableCluster( + proposal: ClusterProposal, + existingClusters: DurableCluster[], + existingMemberships: ClusterMembership[], +): DurableCluster | null { + const activeByThread = new Map(); + for (const membership of existingMemberships) { + if (membership.state === 'active') { + activeByThread.set(membership.threadId, membership.clusterId); + } + } + + const counts = new Map(); + for (const threadId of proposal.memberThreadIds) { + const clusterId = activeByThread.get(threadId); + if (clusterId) counts.set(clusterId, (counts.get(clusterId) ?? 0) + 1); + } + + const winner = Array.from(counts.entries()).sort((left, right) => right[1] - left[1])[0]?.[0]; + if (!winner) return null; + return existingClusters.find((cluster) => cluster.id === winner) ?? null; +} + +export function applyClusterGovernance(input: ClusterGovernanceInput): ClusterGovernanceResult { + const clusters = new Map(input.existingClusters.map((cluster) => [cluster.id, { ...cluster, memberThreadIds: [...cluster.memberThreadIds] }])); + const memberships = new Map(input.existingMemberships.map((membership) => [membershipKey(membership.clusterId, membership.threadId), { ...membership }])); + const overrides = new Map(input.overrides.map((override) => [membershipKey(override.clusterId, override.threadId), override])); + const events: ClusterGovernanceEvent[] = []; + + for (const proposal of input.proposals) { + let cluster = findReusableCluster(proposal, input.existingClusters, input.existingMemberships); + if (!cluster) { + const identity = stableClusterIdentity(input.repoId, proposal.representativeThreadId); + cluster = { + id: identity.slug, + repoId: input.repoId, + stableKey: identity.key, + stableSlug: identity.slug, + representativeThreadId: proposal.representativeThreadId, + memberThreadIds: [], + }; + events.push({ + clusterId: cluster.id, + eventType: 'create_cluster', + threadId: null, + payload: { representativeThreadId: proposal.representativeThreadId }, + }); + } + clusters.set(cluster.id, cluster); + + const proposedMembers = new Set(proposal.memberThreadIds); + for (const threadId of proposedMembers) { + const key = membershipKey(cluster.id, threadId); + const override = overrides.get(key); + if (override?.action === 'exclude') { + memberships.set(key, { + clusterId: cluster.id, + threadId, + role: 'related', + state: 'blocked_by_override', + scoreToRepresentative: proposal.scoresToRepresentative.get(threadId) ?? null, + addedBy: 'algo', + removedBy: 'user', + }); + events.push({ + clusterId: cluster.id, + eventType: 'block_member', + threadId, + payload: { reason: 'manual_exclusion' }, + }); + continue; + } + + const existing = memberships.get(key); + memberships.set(key, { + clusterId: cluster.id, + threadId, + role: threadId === proposal.representativeThreadId || override?.action === 'force_canonical' ? 'canonical' : 'related', + state: 'active', + scoreToRepresentative: proposal.scoresToRepresentative.get(threadId) ?? null, + addedBy: existing?.addedBy ?? (override?.action === 'force_include' || override?.action === 'force_canonical' ? 'user' : 'algo'), + removedBy: null, + }); + events.push({ + clusterId: cluster.id, + eventType: existing?.state === 'active' ? 'keep_member' : 'add_member', + threadId, + payload: { scoreToRepresentative: proposal.scoresToRepresentative.get(threadId) ?? null }, + }); + } + + const activeMembers = Array.from(memberships.values()) + .filter((membership) => membership.clusterId === cluster.id && membership.state === 'active') + .map((membership) => membership.threadId) + .sort((left, right) => left - right); + clusters.set(cluster.id, { + ...cluster, + memberThreadIds: activeMembers, + }); + } + + return { + clusters: Array.from(clusters.values()), + memberships: Array.from(memberships.values()), + events, + }; +} From 60458bfcc9b29946d4bd38a8593dcaf2769199c8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:08:39 -0700 Subject: [PATCH 007/215] feat(db): add content-addressed blob store --- packages/api-core/src/db/blob-store.test.ts | 45 ++++++++ packages/api-core/src/db/blob-store.ts | 116 ++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 packages/api-core/src/db/blob-store.test.ts create mode 100644 packages/api-core/src/db/blob-store.ts diff --git a/packages/api-core/src/db/blob-store.test.ts b/packages/api-core/src/db/blob-store.test.ts new file mode 100644 index 0000000..cefb2b0 --- /dev/null +++ b/packages/api-core/src/db/blob-store.test.ts @@ -0,0 +1,45 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { blobObjectPath, readTextBlob, storeTextBlob } from './blob-store.js'; +import { migrate } from './migrate.js'; +import { openDb } from './sqlite.js'; + +test('storeTextBlob keeps small payloads inline and deduplicates by hash', () => { + const db = openDb(':memory:'); + const storeRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-blob-store-')); + try { + migrate(db); + const first = storeTextBlob(db, storeRoot, '{"ok":true}', { mediaType: 'application/json' }); + const second = storeTextBlob(db, storeRoot, '{"ok":true}', { mediaType: 'application/json' }); + + assert.equal(first.id, second.id); + assert.equal(first.storageKind, 'inline'); + assert.equal(readTextBlob(db, storeRoot, first.id), '{"ok":true}'); + } finally { + db.close(); + } +}); + +test('storeTextBlob writes large payloads to content-addressed files', () => { + const db = openDb(':memory:'); + const storeRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-blob-store-')); + try { + migrate(db); + const payload = 'x'.repeat(128); + const stored = storeTextBlob(db, storeRoot, payload, { + mediaType: 'text/plain', + inlineThresholdBytes: 8, + }); + + assert.equal(stored.storageKind, 'file'); + assert.equal(stored.storagePath, path.relative(storeRoot, blobObjectPath(storeRoot, stored.sha256, 'gzip'))); + assert.ok(fs.existsSync(path.join(storeRoot, stored.storagePath ?? ''))); + assert.equal(readTextBlob(db, storeRoot, stored.id), payload); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/db/blob-store.ts b/packages/api-core/src/db/blob-store.ts new file mode 100644 index 0000000..101d06e --- /dev/null +++ b/packages/api-core/src/db/blob-store.ts @@ -0,0 +1,116 @@ +import crypto from 'node:crypto'; +import fs from 'node:fs'; +import path from 'node:path'; +import zlib from 'node:zlib'; + +import type { SqliteDatabase } from './sqlite.js'; + +export type StoredBlob = { + id: number; + sha256: string; + storageKind: 'inline' | 'file'; + storagePath: string | null; + sizeBytes: number; +}; + +export type StoreBlobOptions = { + mediaType: string; + inlineThresholdBytes?: number; +}; + +function nowIso(): string { + return new Date().toISOString(); +} + +export function blobObjectPath(storeRoot: string, sha256: string, compression: string): string { + const extension = compression === 'gzip' ? '.gz' : ''; + return path.join(storeRoot, 'objects', 'sha256', sha256.slice(0, 2), sha256.slice(2, 4), `${sha256}${extension}`); +} + +export function storeTextBlob( + db: SqliteDatabase, + storeRoot: string, + value: string, + options: StoreBlobOptions, +): StoredBlob { + const raw = Buffer.from(value, 'utf8'); + const sha256 = crypto.createHash('sha256').update(raw).digest('hex'); + const existing = db.prepare('select * from blobs where sha256 = ? limit 1').get(sha256) as + | { + id: number; + sha256: string; + storage_kind: 'inline' | 'file'; + storage_path: string | null; + size_bytes: number; + } + | undefined; + if (existing) { + return { + id: existing.id, + sha256: existing.sha256, + storageKind: existing.storage_kind, + storagePath: existing.storage_path, + sizeBytes: existing.size_bytes, + }; + } + + const inlineThresholdBytes = options.inlineThresholdBytes ?? 4096; + const createdAt = nowIso(); + if (raw.byteLength <= inlineThresholdBytes) { + const result = db + .prepare( + `insert into blobs (sha256, media_type, compression, size_bytes, storage_kind, storage_path, inline_text, created_at) + values (?, ?, 'none', ?, 'inline', null, ?, ?)`, + ) + .run(sha256, options.mediaType, raw.byteLength, value, createdAt); + return { + id: Number(result.lastInsertRowid), + sha256, + storageKind: 'inline', + storagePath: null, + sizeBytes: raw.byteLength, + }; + } + + const objectPath = blobObjectPath(storeRoot, sha256, 'gzip'); + fs.mkdirSync(path.dirname(objectPath), { recursive: true }); + if (!fs.existsSync(objectPath)) { + fs.writeFileSync(objectPath, zlib.gzipSync(raw)); + } + const result = db + .prepare( + `insert into blobs (sha256, media_type, compression, size_bytes, storage_kind, storage_path, inline_text, created_at) + values (?, ?, 'gzip', ?, 'file', ?, null, ?)`, + ) + .run(sha256, options.mediaType, raw.byteLength, path.relative(storeRoot, objectPath), createdAt); + + return { + id: Number(result.lastInsertRowid), + sha256, + storageKind: 'file', + storagePath: path.relative(storeRoot, objectPath), + sizeBytes: raw.byteLength, + }; +} + +export function readTextBlob(db: SqliteDatabase, storeRoot: string, blobId: number): string { + const row = db.prepare('select * from blobs where id = ? limit 1').get(blobId) as + | { + compression: string; + storage_kind: 'inline' | 'file'; + storage_path: string | null; + inline_text: string | null; + } + | undefined; + if (!row) { + throw new Error(`Blob ${blobId} not found`); + } + if (row.storage_kind === 'inline') { + return row.inline_text ?? ''; + } + if (!row.storage_path) { + throw new Error(`Blob ${blobId} has no storage path`); + } + const stored = fs.readFileSync(path.join(storeRoot, row.storage_path)); + return (row.compression === 'gzip' ? zlib.gunzipSync(stored) : stored).toString('utf8'); +} From 4c4bf0c09a3c95c9fe4fd57bd7cbdfb00f7c2a49 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:09:11 -0700 Subject: [PATCH 008/215] feat(cluster): define LLM key summary contract --- .../src/cluster/llm-key-summary.test.ts | 60 +++++++++++++++++++ .../api-core/src/cluster/llm-key-summary.ts | 49 +++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 packages/api-core/src/cluster/llm-key-summary.test.ts create mode 100644 packages/api-core/src/cluster/llm-key-summary.ts diff --git a/packages/api-core/src/cluster/llm-key-summary.test.ts b/packages/api-core/src/cluster/llm-key-summary.test.ts new file mode 100644 index 0000000..bec136c --- /dev/null +++ b/packages/api-core/src/cluster/llm-key-summary.test.ts @@ -0,0 +1,60 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { + LLM_KEY_SUMMARY_PROMPT_VERSION, + LLM_KEY_SUMMARY_SYSTEM_PROMPT, + llmKeyEmbeddingText, + llmKeyInputHash, + parseLlmKeySummary, +} from './llm-key-summary.js'; + +test('parseLlmKeySummary accepts the strict 3-line contract', () => { + const summary = parseLlmKeySummary({ + intent: 'Stop downloads from retrying forever after timeout.', + surface: 'CLI sync downloader and retry loop.', + mechanism: 'Exit retry loop when timeout state is terminal.', + }); + + assert.equal(summary.intent, 'Stop downloads from retrying forever after timeout.'); + assert.equal( + llmKeyEmbeddingText(summary), + [ + 'intent: Stop downloads from retrying forever after timeout.', + 'surface: CLI sync downloader and retry loop.', + 'mechanism: Exit retry loop when timeout state is terminal.', + ].join('\n'), + ); +}); + +test('parseLlmKeySummary rejects missing or oversized fields', () => { + assert.throws( + () => + parseLlmKeySummary({ + intent: 'x'.repeat(121), + surface: 'CLI', + mechanism: 'Patch retry loop.', + }), + /Too big/, + ); +}); + +test('llmKeyInputHash is deterministic and prompt-version scoped', () => { + const first = llmKeyInputHash({ title: 'Fix retry', body: 'Retry forever' }); + const second = llmKeyInputHash({ title: 'Fix retry', body: 'Retry forever' }); + const third = llmKeyInputHash({ + promptVersion: `${LLM_KEY_SUMMARY_PROMPT_VERSION}-next`, + title: 'Fix retry', + body: 'Retry forever', + }); + + assert.equal(first, second); + assert.notEqual(first, third); +}); + +test('LLM_KEY_SUMMARY_SYSTEM_PROMPT requires strict JSON fields', () => { + assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /Return only strict JSON/); + assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /intent/); + assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /surface/); + assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /mechanism/); +}); diff --git a/packages/api-core/src/cluster/llm-key-summary.ts b/packages/api-core/src/cluster/llm-key-summary.ts new file mode 100644 index 0000000..a7b5f78 --- /dev/null +++ b/packages/api-core/src/cluster/llm-key-summary.ts @@ -0,0 +1,49 @@ +import crypto from 'node:crypto'; + +import { z } from 'zod'; + +export const LLM_KEY_SUMMARY_PROMPT_VERSION = 'llm-key-summary-v1'; + +export const LLM_KEY_SUMMARY_SYSTEM_PROMPT = `You produce stable deduplication keys for GitHub issues and pull requests. +Return only strict JSON with exactly these fields: +intent: one sentence, max 120 chars, what outcome is being requested or changed. +surface: one sentence, max 120 chars, affected user/API/module/file area. +mechanism: one sentence, max 160 chars, cause or implementation approach. +Use concrete nouns from the input. Do not mention uncertainty. Do not add advice.`; + +export const llmKeySummarySchema = z.object({ + intent: z.string().trim().min(1).max(120), + surface: z.string().trim().min(1).max(120), + mechanism: z.string().trim().min(1).max(160), +}); + +export type LlmKeySummary = z.infer; + +export function parseLlmKeySummary(value: unknown): LlmKeySummary { + return llmKeySummarySchema.parse(value); +} + +export function llmKeyEmbeddingText(summary: LlmKeySummary): string { + return [`intent: ${summary.intent}`, `surface: ${summary.surface}`, `mechanism: ${summary.mechanism}`].join('\n'); +} + +export function llmKeyInputHash(input: { + promptVersion?: string; + title: string; + body: string | null; + commentsText?: string | null; + diffText?: string | null; +}): string { + return crypto + .createHash('sha256') + .update( + JSON.stringify({ + promptVersion: input.promptVersion ?? LLM_KEY_SUMMARY_PROMPT_VERSION, + title: input.title, + body: input.body ?? '', + commentsText: input.commentsText ?? '', + diffText: input.diffText ?? '', + }), + ) + .digest('hex'); +} From 843eb1d052f7167e246a4d03b12fe77c1cdda9ff Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:10:09 -0700 Subject: [PATCH 009/215] feat(cluster): persist durable evidence state --- .../src/cluster/persistent-store.test.ts | 127 ++++++++++++ .../api-core/src/cluster/persistent-store.ts | 194 ++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 packages/api-core/src/cluster/persistent-store.test.ts create mode 100644 packages/api-core/src/cluster/persistent-store.ts diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts new file mode 100644 index 0000000..af5cc71 --- /dev/null +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -0,0 +1,127 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { migrate } from '../db/migrate.js'; +import { openDb } from '../db/sqlite.js'; +import { scoreSimilarityEvidence } from './evidence-score.js'; +import { + createPipelineRun, + finishPipelineRun, + recordClusterEvent, + upsertClusterGroup, + upsertClusterMembership, + upsertSimilarityEdgeEvidence, +} from './persistent-store.js'; +import { buildDeterministicThreadFingerprint } from './thread-fingerprint.js'; + +function seedRepoAndThreads(db: ReturnType): void { + db.prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', '2026-01-01T00:00:00Z')`, + ).run(); + const insertThread = db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, updated_at + ) values (?, 1, ?, ?, 'pull_request', 'open', ?, '', 'alice', 'User', ?, '[]', '[]', '{}', ?, 0, ?, ?, ?)`, + ); + insertThread.run(10, '10', 10, 'Fix cache collision', 'https://github.com/openclaw/openclaw/pull/10', 'h10', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z'); + insertThread.run(11, '11', 11, 'Fix cache collision', 'https://github.com/openclaw/openclaw/pull/11', 'h11', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z', '2026-01-01T00:00:00Z'); +} + +test('persistent cluster store upserts edge evidence and governed memberships', () => { + const db = openDb(':memory:'); + try { + migrate(db); + seedRepoAndThreads(db); + const runId = createPipelineRun(db, { + repoId: 1, + runKind: 'cluster', + algorithmVersion: 'test-v1', + configHash: 'cfg', + }); + const left = buildDeterministicThreadFingerprint({ + threadId: 10, + number: 10, + kind: 'pull_request', + title: 'Fix cache collision', + body: '', + labels: [], + changedFiles: ['packages/api-core/src/cache.ts'], + linkedRefs: ['123'], + hunkSignatures: ['h1'], + patchIds: ['p1'], + }); + const right = buildDeterministicThreadFingerprint({ + threadId: 11, + number: 11, + kind: 'pull_request', + title: 'Fix cache collision', + body: '', + labels: [], + changedFiles: ['packages/api-core/src/cache.ts'], + linkedRefs: ['123'], + hunkSignatures: ['h1'], + patchIds: ['p1'], + }); + const evidence = scoreSimilarityEvidence(left, right); + assert.notEqual(evidence.tier, 'none'); + + upsertSimilarityEdgeEvidence(db, { + repoId: 1, + leftThreadId: 10, + rightThreadId: 11, + algorithmVersion: 'test-v1', + configHash: 'cfg', + score: evidence.score, + tier: evidence.tier === 'strong' ? 'strong' : 'weak', + breakdown: evidence, + runId, + }); + const clusterId = upsertClusterGroup(db, { + repoId: 1, + stableKey: 'cluster-hash', + stableSlug: 'focus-bridge-signal-9m', + representativeThreadId: 10, + title: 'Fix cache collision', + }); + upsertClusterMembership(db, { + clusterId, + threadId: 10, + role: 'canonical', + state: 'active', + scoreToRepresentative: 1, + runId, + addedBy: 'algo', + }); + upsertClusterMembership(db, { + clusterId, + threadId: 11, + role: 'related', + state: 'active', + scoreToRepresentative: evidence.score, + runId, + addedBy: 'algo', + }); + recordClusterEvent(db, { + clusterId, + runId, + eventType: 'add_member', + actorKind: 'algo', + payload: { threadId: 11 }, + }); + finishPipelineRun(db, runId, { status: 'completed', stats: { edges: 1, clusters: 1 } }); + + const edgeCount = db.prepare('select count(*) as count from similarity_edge_evidence').get() as { count: number }; + const membershipCount = db.prepare('select count(*) as count from cluster_memberships').get() as { count: number }; + const eventCount = db.prepare('select count(*) as count from cluster_events').get() as { count: number }; + const run = db.prepare('select status from pipeline_runs where id = ?').get(runId) as { status: string }; + + assert.equal(edgeCount.count, 1); + assert.equal(membershipCount.count, 2); + assert.equal(eventCount.count, 1); + assert.equal(run.status, 'completed'); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts new file mode 100644 index 0000000..bf225b7 --- /dev/null +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -0,0 +1,194 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { EvidenceTier, SimilarityEvidenceBreakdown } from './evidence-score.js'; + +function nowIso(): string { + return new Date().toISOString(); +} + +export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster'; + +export function createPipelineRun( + db: SqliteDatabase, + params: { + repoId: number; + runKind: PipelineRunKind; + algorithmVersion?: string | null; + configHash?: string | null; + }, +): number { + const result = db + .prepare( + `insert into pipeline_runs (repo_id, run_kind, algorithm_version, config_hash, status, started_at) + values (?, ?, ?, ?, 'running', ?)`, + ) + .run(params.repoId, params.runKind, params.algorithmVersion ?? null, params.configHash ?? null, nowIso()); + return Number(result.lastInsertRowid); +} + +export function finishPipelineRun( + db: SqliteDatabase, + runId: number, + params: { status: 'completed' | 'failed'; stats?: unknown; errorText?: string | null }, +): void { + db.prepare('update pipeline_runs set status = ?, finished_at = ?, stats_json = ?, error_text = ? where id = ?').run( + params.status, + nowIso(), + JSON.stringify(params.stats ?? null), + params.errorText ?? null, + runId, + ); +} + +export function upsertSimilarityEdgeEvidence( + db: SqliteDatabase, + params: { + repoId: number; + leftThreadId: number; + rightThreadId: number; + algorithmVersion: string; + configHash: string; + score: number; + tier: Exclude; + state?: 'active' | 'stale' | 'rejected'; + breakdown: SimilarityEvidenceBreakdown; + runId: number; + }, +): void { + const left = Math.min(params.leftThreadId, params.rightThreadId); + const right = Math.max(params.leftThreadId, params.rightThreadId); + const createdAt = nowIso(); + db.prepare( + `insert into similarity_edge_evidence ( + repo_id, left_thread_id, right_thread_id, algorithm_version, config_hash, + score, tier, state, breakdown_json, first_seen_run_id, last_seen_run_id, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, left_thread_id, right_thread_id, algorithm_version, config_hash) do update set + score = excluded.score, + tier = excluded.tier, + state = excluded.state, + breakdown_json = excluded.breakdown_json, + last_seen_run_id = excluded.last_seen_run_id, + updated_at = excluded.updated_at`, + ).run( + params.repoId, + left, + right, + params.algorithmVersion, + params.configHash, + params.score, + params.tier, + params.state ?? 'active', + JSON.stringify(params.breakdown), + params.runId, + params.runId, + createdAt, + createdAt, + ); +} + +export function upsertClusterGroup( + db: SqliteDatabase, + params: { + repoId: number; + stableKey: string; + stableSlug: string; + status?: 'active' | 'closed' | 'merged' | 'split'; + clusterType?: string | null; + representativeThreadId?: number | null; + title?: string | null; + }, +): number { + const timestamp = nowIso(); + db.prepare( + `insert into cluster_groups ( + repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, stable_key) do update set + stable_slug = excluded.stable_slug, + status = excluded.status, + cluster_type = excluded.cluster_type, + representative_thread_id = excluded.representative_thread_id, + title = excluded.title, + updated_at = excluded.updated_at`, + ).run( + params.repoId, + params.stableKey, + params.stableSlug, + params.status ?? 'active', + params.clusterType ?? null, + params.representativeThreadId ?? null, + params.title ?? null, + timestamp, + timestamp, + ); + const row = db + .prepare('select id from cluster_groups where repo_id = ? and stable_key = ? limit 1') + .get(params.repoId, params.stableKey) as { id: number }; + return row.id; +} + +export function upsertClusterMembership( + db: SqliteDatabase, + params: { + clusterId: number; + threadId: number; + role: 'canonical' | 'duplicate' | 'related'; + state: 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; + scoreToRepresentative?: number | null; + runId?: number | null; + addedBy: 'algo' | 'user' | 'import'; + removedBy?: 'algo' | 'user' | null; + addedReason?: unknown; + removedReason?: unknown; + }, +): void { + const timestamp = nowIso(); + db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, + first_seen_run_id, last_seen_run_id, added_by, removed_by, + added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(cluster_id, thread_id) do update set + role = excluded.role, + state = excluded.state, + score_to_representative = excluded.score_to_representative, + last_seen_run_id = excluded.last_seen_run_id, + removed_by = excluded.removed_by, + removed_reason_json = excluded.removed_reason_json, + updated_at = excluded.updated_at, + removed_at = excluded.removed_at`, + ).run( + params.clusterId, + params.threadId, + params.role, + params.state, + params.scoreToRepresentative ?? null, + params.runId ?? null, + params.runId ?? null, + params.addedBy, + params.removedBy ?? null, + JSON.stringify(params.addedReason ?? null), + JSON.stringify(params.removedReason ?? null), + timestamp, + timestamp, + params.state === 'active' ? null : timestamp, + ); +} + +export function recordClusterEvent( + db: SqliteDatabase, + params: { + clusterId: number; + runId?: number | null; + eventType: string; + actorKind: 'algo' | 'user' | 'import'; + actorId?: number | null; + payload: unknown; + }, +): void { + db.prepare( + `insert into cluster_events (cluster_id, run_id, event_type, actor_kind, actor_id, payload_json, created_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ).run(params.clusterId, params.runId ?? null, params.eventType, params.actorKind, params.actorId ?? null, JSON.stringify(params.payload), nowIso()); +} From 70263630e3953fe1317e4f7741fa187c0791a8bd Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:11:37 -0700 Subject: [PATCH 010/215] feat(cluster): build deterministic cluster graph --- .../src/cluster/deterministic-engine.test.ts | 64 +++++++++ .../src/cluster/deterministic-engine.ts | 127 ++++++++++++++++++ .../api-core/src/cluster/evidence-score.ts | 1 + 3 files changed, 192 insertions(+) create mode 100644 packages/api-core/src/cluster/deterministic-engine.test.ts create mode 100644 packages/api-core/src/cluster/deterministic-engine.ts diff --git a/packages/api-core/src/cluster/deterministic-engine.test.ts b/packages/api-core/src/cluster/deterministic-engine.test.ts new file mode 100644 index 0000000..772209e --- /dev/null +++ b/packages/api-core/src/cluster/deterministic-engine.test.ts @@ -0,0 +1,64 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { buildDeterministicClusterGraph } from './deterministic-engine.js'; + +test('buildDeterministicClusterGraph clusters without embeddings or LLM summaries', () => { + const result = buildDeterministicClusterGraph([ + { + id: 10, + number: 10, + kind: 'issue', + title: 'Download retry hangs forever', + body: 'The transfer retry loop never exits after timeout.', + labels: ['bug'], + }, + { + id: 11, + number: 11, + kind: 'issue', + title: 'Download retry loop never exits', + body: 'Retry hangs forever after timeout.', + labels: ['bug'], + }, + { + id: 12, + number: 12, + kind: 'issue', + title: 'Improve documentation typography', + body: 'Docs heading sizes look inconsistent.', + labels: ['docs'], + }, + ]); + + const duplicateCluster = result.clusters.find((cluster) => cluster.members.includes(10)); + + assert.ok(result.edges.length >= 1); + assert.ok(duplicateCluster); + assert.deepEqual(new Set(duplicateCluster?.members), new Set([10, 11])); +}); + +test('buildDeterministicClusterGraph infers hard refs from text', () => { + const result = buildDeterministicClusterGraph([ + { + id: 10, + number: 10, + kind: 'pull_request', + title: 'Fixes #99', + body: 'Patch retry loop.', + labels: [], + changedFiles: ['packages/api-core/src/retry.ts'], + }, + { + id: 11, + number: 11, + kind: 'issue', + title: 'Retry loop broken', + body: 'See pull/99 and timeout notes.', + labels: [], + changedFiles: ['packages/api-core/src/retry.ts'], + }, + ]); + + assert.equal(result.edges[0]?.tier, 'strong'); +}); diff --git a/packages/api-core/src/cluster/deterministic-engine.ts b/packages/api-core/src/cluster/deterministic-engine.ts new file mode 100644 index 0000000..412a8b0 --- /dev/null +++ b/packages/api-core/src/cluster/deterministic-engine.ts @@ -0,0 +1,127 @@ +import { buildClusters, type SimilarityEdge } from './build.js'; +import { scoreSimilarityEvidence, type SimilarityEvidenceBreakdown } from './evidence-score.js'; +import { buildDeterministicThreadFingerprint, type DeterministicThreadFingerprint } from './thread-fingerprint.js'; + +const REF_RE = /(?:#|issues\/|pull\/)(\d+)/gi; + +export type DeterministicClusterInput = { + id: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels: string[]; + changedFiles?: string[]; + linkedRefs?: string[]; + hunkSignatures?: string[]; + patchIds?: string[]; +}; + +export type DeterministicClusterEdge = SimilarityEdge & { + tier: 'strong' | 'weak'; + breakdown: SimilarityEvidenceBreakdown; +}; + +export type DeterministicClusterResult = { + edges: DeterministicClusterEdge[]; + clusters: Array<{ representativeThreadId: number; members: number[] }>; + fingerprints: Map; +}; + +function extractRefs(value: string | null): string[] { + const refs = new Set(); + for (const match of value?.matchAll(REF_RE) ?? []) { + refs.add(match[1]); + } + return Array.from(refs).sort(); +} + +function bump(index: Map>, key: string, id: number): void { + const bucket = index.get(key) ?? new Set(); + bucket.add(id); + index.set(key, bucket); +} + +function buildCandidatePairs( + fingerprints: Map, + params: { maxBucketSize: number; topK: number }, +): Array<[number, number]> { + const index = new Map>(); + for (const [id, fingerprint] of fingerprints.entries()) { + for (const token of fingerprint.salientTitleTokens) bump(index, `title:${token}`, id); + for (const ref of fingerprint.linkedRefs) bump(index, `ref:${ref}`, id); + for (const file of fingerprint.changedFiles) bump(index, `file:${file}`, id); + for (const module of fingerprint.moduleBuckets) bump(index, `module:${module}`, id); + for (const hunk of fingerprint.hunkSignatures) bump(index, `hunk:${hunk}`, id); + } + + const votes = new Map(); + for (const bucket of index.values()) { + if (bucket.size > params.maxBucketSize) continue; + const ids = Array.from(bucket).sort((left, right) => left - right); + for (let leftIndex = 0; leftIndex < ids.length; leftIndex += 1) { + for (let rightIndex = leftIndex + 1; rightIndex < ids.length; rightIndex += 1) { + const key = `${ids[leftIndex]}:${ids[rightIndex]}`; + votes.set(key, (votes.get(key) ?? 0) + 1); + } + } + } + + return Array.from(votes.entries()) + .sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0])) + .slice(0, fingerprints.size * params.topK) + .map(([key]) => { + const [left, right] = key.split(':').map(Number); + return [left, right] as [number, number]; + }); +} + +export function buildDeterministicClusterGraph( + inputs: DeterministicClusterInput[], + params: { maxBucketSize?: number; topK?: number } = {}, +): DeterministicClusterResult { + const fingerprints = new Map(); + const titleById = new Map(); + for (const input of inputs) { + const inferredRefs = extractRefs(`${input.title}\n${input.body ?? ''}`); + fingerprints.set( + input.id, + buildDeterministicThreadFingerprint({ + ...input, + linkedRefs: Array.from(new Set([...(input.linkedRefs ?? []), ...inferredRefs])).sort(), + }), + ); + titleById.set(input.id, input.title); + } + + const pairs = buildCandidatePairs(fingerprints, { + maxBucketSize: params.maxBucketSize ?? 500, + topK: params.topK ?? 64, + }); + const edges: DeterministicClusterEdge[] = []; + for (const [leftThreadId, rightThreadId] of pairs) { + const left = fingerprints.get(leftThreadId); + const right = fingerprints.get(rightThreadId); + if (!left || !right) continue; + const breakdown = scoreSimilarityEvidence(left, right); + if (breakdown.tier === 'none') continue; + edges.push({ + leftThreadId, + rightThreadId, + score: breakdown.score, + tier: breakdown.tier, + breakdown, + }); + } + + const clusters = buildClusters( + inputs.map((input) => ({ + threadId: input.id, + number: input.number, + title: titleById.get(input.id) ?? input.title, + })), + edges, + ); + + return { edges, clusters, fingerprints }; +} diff --git a/packages/api-core/src/cluster/evidence-score.ts b/packages/api-core/src/cluster/evidence-score.ts index 7e4d49f..6faa754 100644 --- a/packages/api-core/src/cluster/evidence-score.ts +++ b/packages/api-core/src/cluster/evidence-score.ts @@ -76,6 +76,7 @@ export function scoreSimilarityEvidence( tier = 'strong'; } else if ( score >= config.minScore || + (base.titleOverlap >= 0.25 && base.tokenSimhash >= 0.55) || (base.structure >= 0.5 && base.tokenSimhash >= 0.55) || (base.linkedRefOverlap >= 0.5 && base.tokenMinhash >= 0.25) ) { From ee22513911f6b806fbf985cdd97d01b52bc936a4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:14:33 -0700 Subject: [PATCH 011/215] feat(cluster): fallback to deterministic clustering --- .../src/cluster/deterministic-engine.ts | 1 + packages/api-core/src/service.test.ts | 52 +++++++++++++++ packages/api-core/src/service.ts | 66 +++++++++++++++++-- 3 files changed, 112 insertions(+), 7 deletions(-) diff --git a/packages/api-core/src/cluster/deterministic-engine.ts b/packages/api-core/src/cluster/deterministic-engine.ts index 412a8b0..055d3a7 100644 --- a/packages/api-core/src/cluster/deterministic-engine.ts +++ b/packages/api-core/src/cluster/deterministic-engine.ts @@ -88,6 +88,7 @@ export function buildDeterministicClusterGraph( input.id, buildDeterministicThreadFingerprint({ ...input, + threadId: input.id, linkedRefs: Array.from(new Set([...(input.linkedRefs ?? []), ...inferredRefs])).sort(), }), ); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index ab3a36e..91e5c1a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1920,6 +1920,58 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a } }); +test('clusterRepository falls back to deterministic fingerprints when vectors are missing', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Download retry hangs forever', 'The transfer retry loop never exits after timeout.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Download retry loop never exits', 'Retry hangs forever after timeout.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + + const result = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + k: 1, + minScore: 0.1, + }); + + assert.equal(result.edges, 1); + assert.equal(result.clusters, 1); + } finally { + service.close(); + } +}); + test('embedRepository rebuilds a corrupted active vector store during upsert', async () => { const vectors = new Map(); let failNextUpsert = true; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 1b12298..d421be8 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -49,6 +49,7 @@ import { } from '@ghcrawl/api-contract'; import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; +import { buildDeterministicClusterGraph } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { ensureRuntimeDirs, @@ -104,6 +105,7 @@ type CommentSeed = { }; type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary'; +type SimilaritySourceKind = EmbeddingSourceKind | 'deterministic_fingerprint'; type EmbeddingTask = { threadId: number; @@ -1298,7 +1300,7 @@ export class GHCrawlService { try { let items: Array<{ id: number; number: number; title: string }>; - let aggregatedEdges: Map }>; + let aggregatedEdges: Map }>; if (this.isRepoVectorStateCurrent(repository.id)) { const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); @@ -1355,7 +1357,22 @@ export class GHCrawlService { onProgress: params.onProgress, }); } else { - throw new Error(`Vectors for ${repository.fullName} are stale or missing. Run refresh or embed first.`); + const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); + const deterministic = buildDeterministicClusterGraph(deterministicItems, { topK: Math.max(k * 8, 64) }); + items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); + aggregatedEdges = new Map(); + for (const edge of deterministic.edges) { + if (edge.score < minScore) continue; + aggregatedEdges.set(this.edgeKey(edge.leftThreadId, edge.rightThreadId), { + leftThreadId: edge.leftThreadId, + rightThreadId: edge.rightThreadId, + score: edge.score, + sourceKinds: new Set(['deterministic_fingerprint']), + }); + } + params.onProgress?.( + `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${repository.fullName} without embeddings`, + ); } const edges = Array.from(aggregatedEdges.values()).map((entry) => ({ @@ -3966,6 +3983,41 @@ export class GHCrawlService { })); } + private loadDeterministicClusterableThreadMeta(repoId: number): Array<{ + id: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels: string[]; + }> { + const rows = this.db + .prepare( + `select id, number, kind, title, body, labels_json + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + order by number asc`, + ) + .all(repoId) as Array<{ + id: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels_json: string; + }>; + return rows.map((row) => ({ + id: row.id, + number: row.number, + kind: row.kind, + title: row.title, + body: row.body, + labels: parseArray(row.labels_json), + })); + } + private loadNormalizedActiveVectors(repoId: number): Array<{ id: number; number: number; title: string; embedding: number[] }> { return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({ id: row.id, @@ -4133,8 +4185,8 @@ export class GHCrawlService { repoId: number, sourceKinds: EmbeddingSourceKind[], params: { limit: number; minScore: number; onProgress?: (message: string) => void }, - ): Promise }>> { - const aggregated = new Map }>(); + ): Promise }>> { + const aggregated = new Map }>(); const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repoId, sourceKind), 0); if (sourceKinds.length === 0 || totalItems === 0) { @@ -4230,9 +4282,9 @@ export class GHCrawlService { } private mergeSourceKindEdges( - aggregated: Map }>, + aggregated: Map }>, edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, - sourceKind: EmbeddingSourceKind, + sourceKind: SimilaritySourceKind, ): void { for (const edge of edges) { const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId); @@ -4362,7 +4414,7 @@ export class GHCrawlService { private persistClusterRun( repoId: number, runId: number, - aggregatedEdges: Map }>, + aggregatedEdges: Map }>, clusters: Array<{ representativeThreadId: number; members: number[] }>, ): void { const insertEdge = this.db.prepare( From 1b39d95bd9531ebd500d4aa78e2e235c20b5eb70 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:15:34 -0700 Subject: [PATCH 012/215] feat(cluster): persist durable cluster state --- .../api-core/src/cluster/persistent-store.ts | 2 +- packages/api-core/src/service.test.ts | 6 ++ packages/api-core/src/service.ts | 93 +++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index bf225b7..932077a 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -50,7 +50,7 @@ export function upsertSimilarityEdgeEvidence( score: number; tier: Exclude; state?: 'active' | 'stale' | 'rejected'; - breakdown: SimilarityEvidenceBreakdown; + breakdown: SimilarityEvidenceBreakdown | unknown; runId: number; }, ): void { diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 91e5c1a..8bf3931 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1915,6 +1915,12 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a assert.equal(resetCalls, 2); assert.equal(result.edges, 1); assert.equal(result.clusters, 1); + const durableClusters = service.db.prepare('select count(*) as count from cluster_groups').get() as { count: number }; + const durableMemberships = service.db.prepare('select count(*) as count from cluster_memberships').get() as { count: number }; + const durableEdges = service.db.prepare('select count(*) as count from similarity_edge_evidence').get() as { count: number }; + assert.equal(durableClusters.count, 1); + assert.equal(durableMemberships.count, 2); + assert.equal(durableEdges.count, 1); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d421be8..7352e70 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -51,6 +51,15 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildDeterministicClusterGraph } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; +import { humanKeyForValue } from './cluster/human-key.js'; +import { + createPipelineRun, + finishPipelineRun, + recordClusterEvent, + upsertClusterGroup, + upsertClusterMembership, + upsertSimilarityEdgeEvidence, +} from './cluster/persistent-store.js'; import { ensureRuntimeDirs, isLikelyGitHubToken, @@ -1295,6 +1304,19 @@ export class GHCrawlService { }): Promise { const repository = this.requireRepository(params.owner, params.repo); const runId = this.startRun('cluster_runs', repository.id, repository.fullName); + const pipelineRunId = createPipelineRun(this.db, { + repoId: repository.id, + runKind: 'cluster', + algorithmVersion: 'persistent-cluster-v1', + configHash: stableContentHash( + JSON.stringify({ + minScore: params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, + k: params.k ?? 6, + embedModel: this.config.embedModel, + embeddingBasis: this.config.embeddingBasis, + }), + ), + }); const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE; const k = params.k ?? 6; @@ -1388,6 +1410,7 @@ export class GHCrawlService { edges, ); this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); + this.persistDurableClusterState(repository.id, pipelineRunId, aggregatedEdges, clusters); this.pruneOldClusterRuns(repository.id, runId); if (this.isRepoVectorStateCurrent(repository.id)) { this.markRepoClustersCurrent(repository.id); @@ -1397,9 +1420,11 @@ export class GHCrawlService { params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`); this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length }); + finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats: { edges: edges.length, clusters: clusters.length } }); return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length }); } catch (error) { this.finishRun('cluster_runs', runId, 'failed', null, error); + finishPipelineRun(this.db, pipelineRunId, { status: 'failed', errorText: error instanceof Error ? error.message : String(error) }); throw error; } } @@ -4465,6 +4490,74 @@ export class GHCrawlService { })(); } + private persistDurableClusterState( + repoId: number, + pipelineRunId: number, + aggregatedEdges: Map }>, + clusters: Array<{ representativeThreadId: number; members: number[] }>, + ): void { + this.db.transaction(() => { + for (const edge of aggregatedEdges.values()) { + upsertSimilarityEdgeEvidence(this.db, { + repoId, + leftThreadId: edge.leftThreadId, + rightThreadId: edge.rightThreadId, + algorithmVersion: 'persistent-cluster-v1', + configHash: stableContentHash(JSON.stringify({ sources: Array.from(edge.sourceKinds).sort(), model: this.config.embedModel })), + score: edge.score, + tier: edge.score >= DEFAULT_CLUSTER_MIN_SCORE ? 'strong' : 'weak', + state: 'active', + breakdown: { + sources: Array.from(edge.sourceKinds).sort(), + score: edge.score, + }, + runId: pipelineRunId, + }); + } + + for (const cluster of clusters) { + const identity = humanKeyForValue(`repo:${repoId}:cluster-representative:${cluster.representativeThreadId}`); + const clusterId = upsertClusterGroup(this.db, { + repoId, + stableKey: identity.hash, + stableSlug: identity.slug, + status: 'active', + clusterType: cluster.members.length > 1 ? 'duplicate_candidate' : 'singleton_orphan', + representativeThreadId: cluster.representativeThreadId, + title: `Cluster ${identity.slug}`, + }); + for (const memberId of cluster.members) { + const scoreKey = this.edgeKey(cluster.representativeThreadId, memberId); + const score = memberId === cluster.representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); + upsertClusterMembership(this.db, { + clusterId, + threadId: memberId, + role: memberId === cluster.representativeThreadId ? 'canonical' : 'related', + state: 'active', + scoreToRepresentative: score, + runId: pipelineRunId, + addedBy: 'algo', + addedReason: { + source: 'clusterRepository', + representativeThreadId: cluster.representativeThreadId, + }, + }); + recordClusterEvent(this.db, { + clusterId, + runId: pipelineRunId, + eventType: memberId === cluster.representativeThreadId ? 'keep_canonical' : 'upsert_member', + actorKind: 'algo', + payload: { + threadId: memberId, + representativeThreadId: cluster.representativeThreadId, + scoreToRepresentative: score, + }, + }); + } + } + })(); + } + private pruneOldClusterRuns(repoId: number, keepRunId: number): void { this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId); } From 048203e80fdee35ce4e1359a37b20044d07bec7e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:16:14 -0700 Subject: [PATCH 013/215] feat(cluster): honor manual cluster exclusions --- packages/api-core/src/service.test.ts | 21 +++++++++++++ packages/api-core/src/service.ts | 44 +++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 8bf3931..dd56aca 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1921,6 +1921,27 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a assert.equal(durableClusters.count, 1); assert.equal(durableMemberships.count, 2); assert.equal(durableEdges.count, 1); + + const cluster = service.db.prepare('select id from cluster_groups limit 1').get() as { id: number }; + service.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at) + values (?, ?, ?, 'exclude', ?, ?)`, + ) + .run(1, cluster.id, 11, 'maintainer removed from cluster', now); + + await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + k: 1, + minScore: 0.1, + }); + + const blocked = service.db + .prepare('select state, removed_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(cluster.id, 11) as { state: string; removed_by: string | null }; + assert.equal(blocked.state, 'blocked_by_override'); + assert.equal(blocked.removed_by, 'user'); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 7352e70..de89fc4 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -4529,6 +4529,50 @@ export class GHCrawlService { for (const memberId of cluster.members) { const scoreKey = this.edgeKey(cluster.representativeThreadId, memberId); const score = memberId === cluster.representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); + const excluded = this.db + .prepare( + `select 1 + from cluster_overrides + where cluster_id = ? + and thread_id = ? + and action = 'exclude' + and (expires_at is null or expires_at > ?) + limit 1`, + ) + .get(clusterId, memberId, nowIso()); + if (excluded) { + upsertClusterMembership(this.db, { + clusterId, + threadId: memberId, + role: 'related', + state: 'blocked_by_override', + scoreToRepresentative: score, + runId: pipelineRunId, + addedBy: 'algo', + removedBy: 'user', + addedReason: { + source: 'clusterRepository', + representativeThreadId: cluster.representativeThreadId, + }, + removedReason: { + source: 'cluster_overrides', + action: 'exclude', + }, + }); + recordClusterEvent(this.db, { + clusterId, + runId: pipelineRunId, + eventType: 'block_member', + actorKind: 'algo', + payload: { + threadId: memberId, + representativeThreadId: cluster.representativeThreadId, + scoreToRepresentative: score, + reason: 'manual_exclusion', + }, + }); + continue; + } upsertClusterMembership(this.db, { clusterId, threadId: memberId, From 1976fa95530e55434933ed51b7b13a0ef343910a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:19:59 -0700 Subject: [PATCH 014/215] feat(contract): add cluster exclusion action --- packages/api-contract/src/contracts.test.ts | 59 ++++++++++++++++++++- packages/api-contract/src/contracts.ts | 20 +++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 3c88d08..c569f80 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -1,7 +1,14 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { actionRequestSchema, healthResponseSchema, neighborsResponseSchema, searchResponseSchema } from './contracts.js'; +import { + actionRequestSchema, + clusterOverrideResponseSchema, + excludeClusterMemberRequestSchema, + healthResponseSchema, + neighborsResponseSchema, + searchResponseSchema, +} from './contracts.js'; test('health schema accepts configured status payload', () => { const parsed = healthResponseSchema.parse({ @@ -46,6 +53,56 @@ test('action request accepts optional thread number', () => { assert.equal(parsed.threadNumber, 42); }); +test('exclude cluster member request trims optional reason', () => { + const parsed = excludeClusterMemberRequestSchema.parse({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: ' confirmed separate bug ', + }); + + assert.equal(parsed.reason, 'confirmed separate bug'); +}); + +test('cluster override response accepts durable removal state', () => { + const parsed = clusterOverrideResponseSchema.parse({ + ok: true, + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + clusterId: 7, + thread: { + id: 10, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Downloader hangs', + body: 'The transfer never finishes.', + authorLogin: 'alice', + htmlUrl: 'https://github.com/openclaw/openclaw/issues/42', + labels: ['bug'], + updatedAtGh: new Date().toISOString(), + clusterId: null, + }, + action: 'exclude', + state: 'removed_by_user', + message: 'Removed issue #42 from cluster 7.', + }); + + assert.equal(parsed.state, 'removed_by_user'); +}); + test('neighbors schema accepts repository, source thread, and neighbor list', () => { const parsed = neighborsResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 1771e66..8356d10 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -247,6 +247,15 @@ export const closeClusterRequestSchema = z.object({ }); export type CloseClusterRequest = z.infer; +export const excludeClusterMemberRequestSchema = z.object({ + owner: z.string(), + repo: z.string(), + clusterId: z.number().int().positive(), + threadNumber: z.number().int().positive(), + reason: z.string().trim().min(1).optional(), +}); +export type ExcludeClusterMemberRequest = z.infer; + export const closeResponseSchema = z.object({ ok: z.boolean(), repository: repositorySchema, @@ -257,6 +266,17 @@ export const closeResponseSchema = z.object({ }); export type CloseResponse = z.infer; +export const clusterOverrideResponseSchema = z.object({ + ok: z.boolean(), + repository: repositorySchema, + clusterId: z.number().int().positive(), + thread: threadSchema, + action: z.enum(['exclude']), + state: z.enum(['removed_by_user', 'blocked_by_override']), + message: z.string(), +}); +export type ClusterOverrideResponse = z.infer; + export const rerunActionSchema = z.enum(['summarize', 'embed', 'cluster']); export type RerunAction = z.infer; From 130c5fc4493172534e8f467dd35fce866cbdc0b9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:22:16 -0700 Subject: [PATCH 015/215] feat(cluster): add manual exclusion service --- packages/api-core/src/service.test.ts | 75 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 73 ++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index dd56aca..b673995 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2918,6 +2918,81 @@ test('manual cluster closure is hidden from JSON summaries by default but remain } }); +test('excludeThreadFromCluster records a durable manual exclusion', () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }); + + try { + const now = '2026-03-10T12:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 1, '100', 42, 'issue', 'open', 'Issue one', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 10, 'related', 'active', 0.87, null, null, 'algo', null, '{}', null, now, now, null); + + const response = service.excludeThreadFromCluster({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: 'false positive', + }); + + assert.equal(response.ok, true); + assert.equal(response.state, 'removed_by_user'); + assert.equal(response.thread.number, 42); + const override = service.db.prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?').get(7, 10) as { + action: string; + reason: string; + }; + assert.deepEqual(override, { action: 'exclude', reason: 'false positive' }); + const membership = service.db + .prepare('select state, removed_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(7, 10) as { state: string; removed_by: string }; + assert.deepEqual(membership, { state: 'removed_by_user', removed_by: 'user' }); + const event = service.db.prepare('select event_type, actor_kind from cluster_events where cluster_id = ?').get(7) as { + event_type: string; + actor_kind: string; + }; + assert.deepEqual(event, { event_type: 'manual_exclude_member', actor_kind: 'user' }); + } finally { + service.close(); + } +}); + test('syncRepository reconciles stale open threads and marks confirmed closures without re-fetching comments', async () => { let listIssueCommentCalls = 0; let getIssueCalls = 0; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index de89fc4..c3babdd 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -13,6 +13,7 @@ import { actionResponseSchema, authorThreadsResponseSchema, closeResponseSchema, + clusterOverrideResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, @@ -29,11 +30,13 @@ import { type ActionResponse, type AuthorThreadsResponse, type CloseResponse, + type ClusterOverrideResponse, type ClusterDetailResponse, type ClusterDto, type ClusterResultDto, type ClusterSummariesResponse, type ClustersResponse, + type ExcludeClusterMemberRequest, type EmbedResultDto, type HealthResponse, type NeighborsResponse, @@ -890,6 +893,76 @@ export class GHCrawlService { }); } + excludeThreadFromCluster(params: ExcludeClusterMemberRequest): ClusterOverrideResponse { + const repository = this.requireRepository(params.owner, params.repo); + const cluster = this.db + .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') + .get(repository.id, params.clusterId) as { id: number } | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); + } + + const thread = this.db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(repository.id, params.threadNumber) as ThreadRow | undefined; + if (!thread) { + throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); + } + + const existingMembership = this.db + .prepare('select role, score_to_representative from cluster_memberships where cluster_id = ? and thread_id = ? limit 1') + .get(cluster.id, thread.id) as { role: 'canonical' | 'duplicate' | 'related'; score_to_representative: number | null } | undefined; + const timestamp = nowIso(); + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'exclude', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, cluster.id, thread.id, params.reason ?? null, timestamp); + + upsertClusterMembership(this.db, { + clusterId: cluster.id, + threadId: thread.id, + role: existingMembership?.role ?? 'related', + state: 'removed_by_user', + scoreToRepresentative: existingMembership?.score_to_representative ?? null, + addedBy: 'user', + removedBy: 'user', + addedReason: { + source: 'excludeThreadFromCluster', + }, + removedReason: { + source: 'cluster_overrides', + action: 'exclude', + reason: params.reason ?? null, + }, + }); + recordClusterEvent(this.db, { + clusterId: cluster.id, + eventType: 'manual_exclude_member', + actorKind: 'user', + payload: { + threadId: thread.id, + threadNumber: thread.number, + reason: params.reason ?? null, + }, + }); + + return clusterOverrideResponseSchema.parse({ + ok: true, + repository, + clusterId: cluster.id, + thread: threadToDto(thread), + action: 'exclude', + state: 'removed_by_user', + message: `Removed ${thread.kind} #${thread.number} from durable cluster ${cluster.id}.`, + }); + } + async syncRepository( params: SyncOptions, ): Promise { From 283f21757fafdabc23ab0605f88b9e039743a6b5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:23:34 -0700 Subject: [PATCH 016/215] feat(api): expose cluster exclusion action --- packages/api-core/src/api/server.test.ts | 102 ++++++++++++++++++++++- packages/api-core/src/api/server.ts | 14 +++- 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 869e189..8d36163 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -1,7 +1,16 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterSummariesResponseSchema, healthResponseSchema, neighborsResponseSchema, threadsResponseSchema } from '@ghcrawl/api-contract'; +import { + authorThreadsResponseSchema, + closeResponseSchema, + clusterDetailResponseSchema, + clusterOverrideResponseSchema, + clusterSummariesResponseSchema, + healthResponseSchema, + neighborsResponseSchema, + threadsResponseSchema, +} from '@ghcrawl/api-contract'; import { createApiServer } from './server.js'; import { GHCrawlService } from '../service.js'; @@ -389,6 +398,97 @@ test('close-thread and includeClosed thread routes expose locally closed items', } }); +test('exclude cluster member action records a durable override', async () => { + const service = new GHCrawlService({ + config: { + workspaceRoot: process.cwd(), + configDir: '/tmp/ghcrawl-test', + configPath: '/tmp/ghcrawl-test/config.json', + configFileExists: true, + dbPath: ':memory:', + dbPathSource: 'config', + apiPort: 5179, + secretProvider: 'plaintext', + githubTokenSource: 'none', + openaiApiKeySource: 'none', + summaryModel: 'gpt-5-mini', + embedModel: 'text-embedding-3-large', + embeddingBasis: 'title_original', + vectorBackend: 'vectorlite', + embedBatchSize: 8, + embedConcurrency: 10, + embedMaxUnread: 20, + openSearchIndex: 'ghcrawl-threads', + tuiPreferences: {}, + }, + github: { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }, + }); + + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + + const server = createApiServer(service); + try { + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const address = server.address(); + assert(address && typeof address === 'object'); + + const response = await fetch(`http://127.0.0.1:${address.port}/actions/exclude-cluster-member`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: 'not the same defect', + }), + }); + assert.equal(response.status, 200); + const payload = clusterOverrideResponseSchema.parse((await response.json()) as unknown); + assert.equal(payload.state, 'removed_by_user'); + + const override = service.db.prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?').get(7, 10) as { + action: string; + reason: string; + }; + assert.deepEqual(override, { action: 'exclude', reason: 'not the same defect' }); + } finally { + await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); + service.close(); + } +}); + test('server returns 400 for malformed request inputs', async () => { const service = new GHCrawlService({ config: { diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 79032c8..4d152fc 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -1,6 +1,12 @@ import http from 'node:http'; -import { actionRequestSchema, closeClusterRequestSchema, closeThreadRequestSchema, refreshRequestSchema } from '@ghcrawl/api-contract'; +import { + actionRequestSchema, + closeClusterRequestSchema, + closeThreadRequestSchema, + excludeClusterMemberRequestSchema, + refreshRequestSchema, +} from '@ghcrawl/api-contract'; import { ZodError } from 'zod'; import { GHCrawlService, parseRepoParams } from '../service.js'; @@ -191,6 +197,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'POST' && url.pathname === '/actions/exclude-cluster-member') { + const body = excludeClusterMemberRequestSchema.parse(await readBody(req)); + sendJson(res, 200, service.excludeThreadFromCluster(body)); + return; + } + sendJson(res, 404, { error: 'Not found' }); } catch (error) { const message = error instanceof Error ? error.message : String(error); From a72e22f64a2fdc09625297d3b9e1c91b54769535 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:25:10 -0700 Subject: [PATCH 017/215] feat(cli): add cluster exclusion command --- apps/cli/src/main.test.ts | 47 +++++++++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 33 +++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index c018b8d..27fb5e1 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -47,6 +47,7 @@ const publicCommands = [ 'author', 'close-thread', 'close-cluster', + 'exclude-cluster-member', 'embed', 'cluster', 'clusters', @@ -268,6 +269,7 @@ test('agent-facing command help advertises explicit --json', async () => { 'author', 'close-thread', 'close-cluster', + 'exclude-cluster-member', 'embed', 'cluster', 'clusters', @@ -310,6 +312,44 @@ test('compatibility path keeps json-by-default commands working without --json', assert.match(stdout.read(), /"threads"/); }); +test('exclude-cluster-member command forwards durable override inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.excludeThreadFromCluster; + let received: unknown; + + GHCrawlService.prototype.excludeThreadFromCluster = function excludeThreadFromClusterStub(params: unknown) { + received = params; + return { + ok: true, + clusterId: 7, + thread: { number: 42 }, + action: 'exclude', + state: 'removed_by_user', + message: 'removed', + } as never; + }; + + try { + await run(['exclude-cluster-member', 'openclaw/openclaw', '--id', '7', '--number', '42', '--reason', 'false positive'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.excludeThreadFromCluster = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: 'false positive', + }); + assert.match(stdout.read(), /"state": "removed_by_user"/); +}); + test('long-running command progress stays on stderr and payload stays on stdout', async () => { const stdout = createWritableCapture(); const stderr = createWritableCapture(); @@ -387,6 +427,13 @@ test('parseRepoFlags accepts kind filter for threads', () => { assert.equal(parsed.values.kind, 'pull_request'); }); +test('parseRepoFlags accepts exclusion reason', () => { + const parsed = parseRepoFlags('exclude-cluster-member', ['openclaw/openclaw', '--id', '7', '--number', '42', '--reason', 'false positive']); + assert.equal(parsed.owner, 'openclaw'); + assert.equal(parsed.repo, 'openclaw'); + assert.equal(parsed.values.reason, 'false positive'); +}); + test('parseRepoFlags accepts heap diagnostics options', () => { const parsed = parseRepoFlags('cluster', ['openclaw/openclaw', '--heap-snapshot-dir', './tmp/heaps', '--heap-log-interval-ms', '5000']); assert.equal(parsed.owner, 'openclaw'); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 4446002..690f4a0 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -21,6 +21,7 @@ type CommandName = | 'author' | 'close-thread' | 'close-cluster' + | 'exclude-cluster-member' | 'summarize' | 'purge-comments' | 'embed' @@ -191,6 +192,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl close-cluster openclaw/openclaw --id 123 --json'], agentJson: true, }, + { + name: 'exclude-cluster-member', + synopsis: 'exclude-cluster-member --id --number [--reason ] [--json]', + description: 'Remove one issue or PR from a durable cluster and block automatic re-entry.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to exclude', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl exclude-cluster-member openclaw/openclaw --id 123 --number 42 --reason "false positive" --json'], + agentJson: true, + }, { name: 'embed', synopsis: 'embed [--number ] [--json]', @@ -470,6 +484,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo threshold: { type: 'string' }, port: { type: 'string' }, id: { type: 'string' }, + reason: { type: 'string' }, sort: { type: 'string' }, search: { type: 'string' }, 'min-size': { type: 'string' }, @@ -1004,6 +1019,24 @@ export async function run( writeJson(stdout, result); return; } + case 'exclude-cluster-member': { + const { owner, repo, values } = parseRepoFlags('exclude-cluster-member', rest); + if (typeof values.id !== 'string') { + throw new CliUsageError('Missing --id', 'exclude-cluster-member'); + } + if (typeof values.number !== 'string') { + throw new CliUsageError('Missing --number', 'exclude-cluster-member'); + } + const result = getService().excludeThreadFromCluster({ + owner, + repo, + clusterId: parsePositiveInteger('id', values.id, 'exclude-cluster-member'), + threadNumber: parsePositiveInteger('number', values.number, 'exclude-cluster-member'), + reason: typeof values.reason === 'string' ? values.reason : undefined, + }); + writeJson(stdout, result); + return; + } case 'summarize': { const { owner, repo, values } = parseRepoFlags('summarize', rest); const result = await getService().summarizeRepository({ From a98322c301742b5080cc1df7248802d46c5107e0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:26:50 -0700 Subject: [PATCH 018/215] feat(cluster): persist thread fingerprints --- .../src/cluster/persistent-store.test.ts | 59 ++++++++ .../api-core/src/cluster/persistent-store.ts | 132 ++++++++++++++++++ 2 files changed, 191 insertions(+) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index af5cc71..e796df8 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -11,6 +11,8 @@ import { upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, + upsertThreadFingerprint, + upsertThreadRevision, } from './persistent-store.js'; import { buildDeterministicThreadFingerprint } from './thread-fingerprint.js'; @@ -125,3 +127,60 @@ test('persistent cluster store upserts edge evidence and governed memberships', db.close(); } }); + +test('persistent cluster store records thread revisions and deterministic fingerprints', () => { + const db = openDb(':memory:'); + try { + migrate(db); + seedRepoAndThreads(db); + const fingerprint = buildDeterministicThreadFingerprint({ + threadId: 10, + number: 10, + kind: 'pull_request', + title: 'Fix cache collision', + body: 'Cache keys collide across repos.', + labels: ['bug'], + changedFiles: ['packages/api-core/src/cache.ts'], + linkedRefs: ['123'], + hunkSignatures: ['h1'], + patchIds: ['p1'], + }); + const revisionId = upsertThreadRevision(db, { + threadId: 10, + sourceUpdatedAt: '2026-01-01T00:00:00Z', + title: 'Fix cache collision', + body: 'Cache keys collide across repos.', + labels: ['bug'], + rawJson: '{"number":10}', + }); + + upsertThreadFingerprint(db, { threadRevisionId: revisionId, fingerprint }); + upsertThreadFingerprint(db, { threadRevisionId: revisionId, fingerprint }); + + const revisionCount = db.prepare('select count(*) as count from thread_revisions').get() as { count: number }; + const fingerprintRow = db + .prepare( + `select fingerprint_hash, fingerprint_slug, simhash64, minhash_signature_blob_id, winnow_hashes_blob_id + from thread_fingerprints + where thread_revision_id = ?`, + ) + .get(revisionId) as { + fingerprint_hash: string; + fingerprint_slug: string; + simhash64: string; + minhash_signature_blob_id: number; + winnow_hashes_blob_id: number; + }; + const blobCount = db.prepare('select count(*) as count from blobs').get() as { count: number }; + + assert.equal(revisionCount.count, 1); + assert.equal(fingerprintRow.fingerprint_hash, fingerprint.fingerprintHash); + assert.equal(fingerprintRow.fingerprint_slug, fingerprint.fingerprintSlug); + assert.equal(fingerprintRow.simhash64, fingerprint.simhash64); + assert.ok(fingerprintRow.minhash_signature_blob_id > 0); + assert.ok(fingerprintRow.winnow_hashes_blob_id > 0); + assert.equal(blobCount.count, 3); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 932077a..d97664e 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -1,12 +1,144 @@ +import crypto from 'node:crypto'; + import type { SqliteDatabase } from '../db/sqlite.js'; import type { EvidenceTier, SimilarityEvidenceBreakdown } from './evidence-score.js'; +import type { DeterministicThreadFingerprint } from './thread-fingerprint.js'; function nowIso(): string { return new Date().toISOString(); } +function stableHash(value: string): string { + return crypto.createHash('sha256').update(value).digest('hex'); +} + +function jsonHash(value: unknown): string { + return stableHash(JSON.stringify(value)); +} + +function upsertInlineBlob( + db: SqliteDatabase, + params: { + text: string; + mediaType: string; + }, +): number { + const sha256 = stableHash(params.text); + db.prepare( + `insert into blobs (sha256, media_type, compression, size_bytes, storage_kind, storage_path, inline_text, created_at) + values (?, ?, 'none', ?, 'inline', null, ?, ?) + on conflict(sha256) do nothing`, + ).run(sha256, params.mediaType, Buffer.byteLength(params.text), params.text, nowIso()); + const row = db.prepare('select id from blobs where sha256 = ? limit 1').get(sha256) as { id: number }; + return row.id; +} + export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster'; +export function upsertThreadRevision( + db: SqliteDatabase, + params: { + threadId: number; + sourceUpdatedAt?: string | null; + title: string; + body?: string | null; + labels: string[]; + rawJson?: string | null; + }, +): number { + const labels = Array.from(new Set(params.labels)).sort(); + const contentHash = jsonHash({ + title: params.title, + body: params.body ?? '', + labels, + rawJson: params.rawJson ?? null, + }); + const rawJsonBlobId = + params.rawJson && params.rawJson !== '{}' + ? upsertInlineBlob(db, { + text: params.rawJson, + mediaType: 'application/vnd.ghcrawl.thread.raw+json', + }) + : null; + db.prepare( + `insert into thread_revisions ( + thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, raw_json_blob_id, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_id, content_hash) do update set + source_updated_at = excluded.source_updated_at, + raw_json_blob_id = excluded.raw_json_blob_id`, + ).run( + params.threadId, + params.sourceUpdatedAt ?? null, + contentHash, + stableHash(params.title), + stableHash(params.body ?? ''), + jsonHash(labels), + rawJsonBlobId, + nowIso(), + ); + const row = db + .prepare('select id from thread_revisions where thread_id = ? and content_hash = ? limit 1') + .get(params.threadId, contentHash) as { id: number }; + return row.id; +} + +export function upsertThreadFingerprint( + db: SqliteDatabase, + params: { + threadRevisionId: number; + fingerprint: DeterministicThreadFingerprint; + }, +): void { + const minhashBlobId = upsertInlineBlob(db, { + text: JSON.stringify(params.fingerprint.minhashSignature), + mediaType: 'application/vnd.ghcrawl.minhash+json', + }); + const winnowBlobId = upsertInlineBlob(db, { + text: JSON.stringify(params.fingerprint.winnowHashes), + mediaType: 'application/vnd.ghcrawl.winnow+json', + }); + const featureJson = JSON.stringify({ + salientTitleTokens: params.fingerprint.salientTitleTokens, + hunkSignatures: params.fingerprint.hunkSignatures, + patchIds: params.fingerprint.patchIds, + }); + db.prepare( + `insert into thread_fingerprints ( + thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, + title_tokens_json, body_token_hash, linked_refs_json, file_set_hash, module_buckets_json, + minhash_signature_blob_id, simhash64, winnow_hashes_blob_id, feature_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, algorithm_version) do update set + fingerprint_hash = excluded.fingerprint_hash, + fingerprint_slug = excluded.fingerprint_slug, + title_tokens_json = excluded.title_tokens_json, + body_token_hash = excluded.body_token_hash, + linked_refs_json = excluded.linked_refs_json, + file_set_hash = excluded.file_set_hash, + module_buckets_json = excluded.module_buckets_json, + minhash_signature_blob_id = excluded.minhash_signature_blob_id, + simhash64 = excluded.simhash64, + winnow_hashes_blob_id = excluded.winnow_hashes_blob_id, + feature_json = excluded.feature_json`, + ).run( + params.threadRevisionId, + params.fingerprint.algorithmVersion, + params.fingerprint.fingerprintHash, + params.fingerprint.fingerprintSlug, + JSON.stringify(params.fingerprint.titleTokens), + jsonHash(params.fingerprint.bodyTokens), + JSON.stringify(params.fingerprint.linkedRefs), + jsonHash(params.fingerprint.changedFiles), + JSON.stringify(params.fingerprint.moduleBuckets), + minhashBlobId, + params.fingerprint.simhash64, + winnowBlobId, + featureJson, + nowIso(), + ); +} + export function createPipelineRun( db: SqliteDatabase, params: { From a009cec3e0b779f837a2d820c4eddbe842bd3c13 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:27:47 -0700 Subject: [PATCH 019/215] feat(cluster): save fallback fingerprints --- packages/api-core/src/service.test.ts | 4 +++ packages/api-core/src/service.ts | 38 ++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index b673995..c0289c8 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1994,6 +1994,10 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar assert.equal(result.edges, 1); assert.equal(result.clusters, 1); + const revisionCount = service.db.prepare('select count(*) as count from thread_revisions').get() as { count: number }; + const fingerprintCount = service.db.prepare('select count(*) as count from thread_fingerprints').get() as { count: number }; + assert.equal(revisionCount.count, 2); + assert.equal(fingerprintCount.count, 2); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c3babdd..d54724a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -62,7 +62,10 @@ import { upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, + upsertThreadFingerprint, + upsertThreadRevision, } from './cluster/persistent-store.js'; +import type { DeterministicThreadFingerprint } from './cluster/thread-fingerprint.js'; import { ensureRuntimeDirs, isLikelyGitHubToken, @@ -1454,6 +1457,7 @@ export class GHCrawlService { } else { const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); const deterministic = buildDeterministicClusterGraph(deterministicItems, { topK: Math.max(k * 8, 64) }); + this.persistDeterministicFingerprints(deterministicItems, deterministic.fingerprints); items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); aggregatedEdges = new Map(); for (const edge of deterministic.edges) { @@ -4088,10 +4092,12 @@ export class GHCrawlService { title: string; body: string | null; labels: string[]; + rawJson: string; + updatedAtGh: string | null; }> { const rows = this.db .prepare( - `select id, number, kind, title, body, labels_json + `select id, number, kind, title, body, labels_json, raw_json, updated_at_gh from threads where repo_id = ? and state = 'open' @@ -4105,6 +4111,8 @@ export class GHCrawlService { title: string; body: string | null; labels_json: string; + raw_json: string; + updated_at_gh: string | null; }>; return rows.map((row) => ({ id: row.id, @@ -4113,9 +4121,37 @@ export class GHCrawlService { title: row.title, body: row.body, labels: parseArray(row.labels_json), + rawJson: row.raw_json, + updatedAtGh: row.updated_at_gh, })); } + private persistDeterministicFingerprints( + items: Array<{ + id: number; + title: string; + body: string | null; + labels: string[]; + rawJson: string; + updatedAtGh: string | null; + }>, + fingerprints: Map, + ): void { + for (const item of items) { + const fingerprint = fingerprints.get(item.id); + if (!fingerprint) continue; + const revisionId = upsertThreadRevision(this.db, { + threadId: item.id, + sourceUpdatedAt: item.updatedAtGh, + title: item.title, + body: item.body, + labels: item.labels, + rawJson: item.rawJson, + }); + upsertThreadFingerprint(this.db, { threadRevisionId: revisionId, fingerprint }); + } + } + private loadNormalizedActiveVectors(repoId: number): Array<{ id: number; number: number; title: string; embedding: number[] }> { return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({ id: row.id, From dbf7c228b40781e833bac655bc5d925edee738e8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:29:25 -0700 Subject: [PATCH 020/215] feat(cluster): add actor stats store --- .../src/cluster/persistent-store.test.ts | 36 +++++++ .../api-core/src/cluster/persistent-store.ts | 96 +++++++++++++++++++ 2 files changed, 132 insertions(+) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index e796df8..e665a0d 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -7,7 +7,9 @@ import { scoreSimilarityEvidence } from './evidence-score.js'; import { createPipelineRun, finishPipelineRun, + refreshActorRepoStats, recordClusterEvent, + upsertActor, upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, @@ -128,6 +130,40 @@ test('persistent cluster store upserts edge evidence and governed memberships', } }); +test('persistent cluster store upserts actors and recomputes repo stats', () => { + const db = openDb(':memory:'); + try { + migrate(db); + seedRepoAndThreads(db); + const actorId = upsertActor(db, { + providerUserId: 'alice-id', + login: 'alice', + displayName: 'Alice', + actorType: 'User', + rawJson: '{"login":"alice"}', + }); + db.prepare( + `insert into comments ( + thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run(10, 'c1', 'issue_comment', 'alice', 'User', 'confirmed', 0, '{}', '2026-01-02T00:00:00Z', '2026-01-02T00:00:00Z'); + + refreshActorRepoStats(db, 1); + + const actor = db.prepare('select login, display_name from actors where id = ?').get(actorId) as { login: string; display_name: string }; + const stats = db.prepare('select opened_prs, comments, trust_tier from actor_repo_stats where repo_id = ? and actor_id = ?').get(1, actorId) as { + opened_prs: number; + comments: number; + trust_tier: string; + }; + + assert.deepEqual(actor, { login: 'alice', display_name: 'Alice' }); + assert.deepEqual(stats, { opened_prs: 2, comments: 1, trust_tier: 'unknown' }); + } finally { + db.close(); + } +}); + test('persistent cluster store records thread revisions and deterministic fingerprints', () => { const db = openDb(':memory:'); try { diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index d97664e..eb24a0d 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -35,6 +35,102 @@ function upsertInlineBlob( export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster'; +export function upsertActor( + db: SqliteDatabase, + params: { + provider?: 'github'; + providerUserId: string; + login: string; + displayName?: string | null; + actorType?: string | null; + siteAdmin?: boolean; + rawJson?: string | null; + }, +): number { + const timestamp = nowIso(); + const rawJsonBlobId = + params.rawJson && params.rawJson !== '{}' + ? upsertInlineBlob(db, { + text: params.rawJson, + mediaType: 'application/vnd.ghcrawl.actor.raw+json', + }) + : null; + db.prepare( + `insert into actors ( + provider, provider_user_id, login, display_name, actor_type, site_admin, + raw_json_blob_id, first_seen_at, last_seen_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(provider, provider_user_id) do update set + login = excluded.login, + display_name = excluded.display_name, + actor_type = excluded.actor_type, + site_admin = excluded.site_admin, + raw_json_blob_id = excluded.raw_json_blob_id, + last_seen_at = excluded.last_seen_at, + updated_at = excluded.updated_at`, + ).run( + params.provider ?? 'github', + params.providerUserId, + params.login, + params.displayName ?? null, + params.actorType ?? null, + params.siteAdmin ? 1 : 0, + rawJsonBlobId, + timestamp, + timestamp, + timestamp, + ); + const row = db + .prepare('select id from actors where provider = ? and provider_user_id = ? limit 1') + .get(params.provider ?? 'github', params.providerUserId) as { id: number }; + return row.id; +} + +export function refreshActorRepoStats(db: SqliteDatabase, repoId: number): void { + db.prepare('delete from actor_repo_stats where repo_id = ?').run(repoId); + db.prepare( + `insert into actor_repo_stats ( + repo_id, actor_id, opened_issues, opened_prs, comments, merged_prs, closed_threads, first_activity_at, last_activity_at, trust_tier + ) + select + ?, + a.id, + (select count(*) from threads t where t.repo_id = ? and t.kind = 'issue' and lower(t.author_login) = lower(a.login)), + (select count(*) from threads t where t.repo_id = ? and t.kind = 'pull_request' and lower(t.author_login) = lower(a.login)), + (select count(*) + from comments c + join threads t on t.id = c.thread_id + where t.repo_id = ? and lower(c.author_login) = lower(a.login)), + (select count(*) from threads t where t.repo_id = ? and t.kind = 'pull_request' and t.merged_at_gh is not null and lower(t.author_login) = lower(a.login)), + (select count(*) from threads t where t.repo_id = ? and t.closed_at_gh is not null and lower(t.author_login) = lower(a.login)), + (select min(activity_at) + from ( + select created_at_gh as activity_at from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login) + union all + select c.created_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(a.login) + ) + where activity_at is not null), + (select max(activity_at) + from ( + select updated_at_gh as activity_at from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login) + union all + select c.updated_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(a.login) + ) + where activity_at is not null), + case + when a.actor_type = 'Bot' then 'bot' + when (select count(*) from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login)) >= 3 then 'repeat_contributor' + else 'unknown' + end + from actors a + where exists (select 1 from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login)) + or exists ( + select 1 from comments c join threads t on t.id = c.thread_id + where t.repo_id = ? and lower(c.author_login) = lower(a.login) + )`, + ).run(repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId); +} + export function upsertThreadRevision( db: SqliteDatabase, params: { From 479fc7ca836f7ced4ad6258f793e97a9c1097098 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:31:11 -0700 Subject: [PATCH 021/215] feat(sync): capture actor stats --- packages/api-core/src/service.test.ts | 71 ++++++++++++++++++ packages/api-core/src/service.ts | 100 +++++++++++++++++--------- 2 files changed, 138 insertions(+), 33 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index c0289c8..b8d3381 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2997,6 +2997,77 @@ test('excludeThreadFromCluster records a durable manual exclusion', () => { } }); +test('syncRepository records actors and repo stats from thread and comment authors', async () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [ + { + id: 100, + number: 42, + state: 'open', + title: 'Downloader hangs', + body: 'The transfer never finishes.', + html_url: 'https://github.com/openclaw/openclaw/issues/42', + labels: [], + user: { id: 501, login: 'alice', type: 'User', site_admin: false }, + created_at: '2026-03-09T00:00:00Z', + updated_at: '2026-03-09T00:00:00Z', + }, + ], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [ + { + id: 900, + body: 'same here', + user: { id: 502, login: 'bob', type: 'User', site_admin: false }, + created_at: '2026-03-09T01:00:00Z', + updated_at: '2026-03-09T01:00:00Z', + }, + ], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }); + + try { + await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + includeComments: true, + limit: 1, + }); + + const actors = service.db.prepare('select login, actor_type from actors order by login').all() as Array<{ + login: string; + actor_type: string; + }>; + const stats = service.db + .prepare( + `select a.login, s.opened_issues, s.comments + from actor_repo_stats s + join actors a on a.id = s.actor_id + order by a.login`, + ) + .all() as Array<{ login: string; opened_issues: number; comments: number }>; + + assert.deepEqual(actors, [ + { login: 'alice', actor_type: 'User' }, + { login: 'bob', actor_type: 'User' }, + ]); + assert.deepEqual(stats, [ + { login: 'alice', opened_issues: 1, comments: 0 }, + { login: 'bob', opened_issues: 0, comments: 1 }, + ]); + } finally { + service.close(); + } +}); + test('syncRepository reconciles stale open threads and marks confirmed closures without re-fetching comments', async () => { let listIssueCommentCalls = 0; let getIssueCalls = 0; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d54724a..361252d 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -58,7 +58,9 @@ import { humanKeyForValue } from './cluster/human-key.js'; import { createPipelineRun, finishPipelineRun, + refreshActorRepoStats, recordClusterEvent, + upsertActor, upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, @@ -1081,6 +1083,7 @@ export class GHCrawlService { lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt, }; this.writeSyncCursorState(repoId, nextSyncCursor); + refreshActorRepoStats(this.db, repoId); this.finishRun('sync_runs', runId, 'completed', { threadsSynced, @@ -3165,48 +3168,63 @@ export class GHCrawlService { const issueComments = await github.listIssueComments(owner, repo, number, reporter); comments.push( - ...issueComments.map((comment) => ({ - githubId: String(comment.id), - commentType: 'issue_comment', - authorLogin: userLogin(comment), - authorType: userType(comment), - body: String(comment.body ?? ''), - isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }), - rawJson: asJson(comment), - createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, - updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, - })), + ...issueComments.map((comment) => { + this.upsertActorFromPayload(comment); + const authorLogin = userLogin(comment); + const authorType = userType(comment); + return { + githubId: String(comment.id), + commentType: 'issue_comment', + authorLogin, + authorType, + body: String(comment.body ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(comment), + createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, + updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, + }; + }), ); if (isPr) { const reviews = await github.listPullReviews(owner, repo, number, reporter); comments.push( - ...reviews.map((review) => ({ - githubId: String(review.id), - commentType: 'review', - authorLogin: userLogin(review), - authorType: userType(review), - body: String(review.body ?? review.state ?? ''), - isBot: isBotLikeAuthor({ authorLogin: userLogin(review), authorType: userType(review) }), - rawJson: asJson(review), - createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, - updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, - })), + ...reviews.map((review) => { + this.upsertActorFromPayload(review); + const authorLogin = userLogin(review); + const authorType = userType(review); + return { + githubId: String(review.id), + commentType: 'review', + authorLogin, + authorType, + body: String(review.body ?? review.state ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(review), + createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, + updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, + }; + }), ); const reviewComments = await github.listPullReviewComments(owner, repo, number, reporter); comments.push( - ...reviewComments.map((comment) => ({ - githubId: String(comment.id), - commentType: 'review_comment', - authorLogin: userLogin(comment), - authorType: userType(comment), - body: String(comment.body ?? ''), - isBot: isBotLikeAuthor({ authorLogin: userLogin(comment), authorType: userType(comment) }), - rawJson: asJson(comment), - createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, - updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, - })), + ...reviewComments.map((comment) => { + this.upsertActorFromPayload(comment); + const authorLogin = userLogin(comment); + const authorType = userType(comment); + return { + githubId: String(comment.id), + commentType: 'review_comment', + authorLogin, + authorType, + body: String(comment.body ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(comment), + createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, + updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, + }; + }), ); } @@ -3252,6 +3270,21 @@ export class GHCrawlService { return row.id; } + private upsertActorFromPayload(payload: Record): number | null { + const user = payload.user as Record | undefined; + const login = userLogin(payload); + if (!user || !login) return null; + const providerUserId = user.id === undefined || user.id === null ? login : String(user.id); + return upsertActor(this.db, { + providerUserId, + login, + displayName: typeof user.name === 'string' ? user.name : null, + actorType: userType(payload), + siteAdmin: user.site_admin === true, + rawJson: asJson(user), + }); + } + private upsertThread( repoId: number, kind: 'issue' | 'pull_request', @@ -3263,6 +3296,7 @@ export class GHCrawlService { const labels = parseLabels(payload); const assignees = parseAssignees(payload); const contentHash = stableContentHash(`${title}\n${body ?? ''}`); + this.upsertActorFromPayload(payload); this.db .prepare( `insert into threads ( From 6978b21e64f6f82e172bfd63fc1909eb86347d24 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:31:53 -0700 Subject: [PATCH 022/215] feat(contract): add durable cluster schemas --- packages/api-contract/src/contracts.test.ts | 55 +++++++++++++++++++++ packages/api-contract/src/contracts.ts | 29 +++++++++++ 2 files changed, 84 insertions(+) diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index c569f80..683ff89 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -4,6 +4,7 @@ import assert from 'node:assert/strict'; import { actionRequestSchema, clusterOverrideResponseSchema, + durableClustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, neighborsResponseSchema, @@ -103,6 +104,60 @@ test('cluster override response accepts durable removal state', () => { assert.equal(parsed.state, 'removed_by_user'); }); +test('durable clusters response accepts stable slugs and governed member states', () => { + const parsed = durableClustersResponseSchema.parse({ + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + clusters: [ + { + clusterId: 7, + stableKey: 'abc123', + stableSlug: 'trace-alpha-river', + status: 'active', + clusterType: 'duplicate_candidate', + title: 'Cluster trace-alpha-river', + representativeThreadId: 10, + activeCount: 1, + removedCount: 1, + blockedCount: 0, + members: [ + { + thread: { + id: 10, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Downloader hangs', + body: 'The transfer never finishes.', + authorLogin: 'alice', + htmlUrl: 'https://github.com/openclaw/openclaw/issues/42', + labels: ['bug'], + updatedAtGh: new Date().toISOString(), + clusterId: null, + }, + role: 'canonical', + state: 'active', + scoreToRepresentative: 1, + }, + ], + }, + ], + }); + + assert.equal(parsed.clusters[0]?.stableSlug, 'trace-alpha-river'); +}); + test('neighbors schema accepts repository, source thread, and neighbor list', () => { const parsed = neighborsResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 8356d10..afb4a50 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -167,6 +167,35 @@ export const clusterSummariesResponseSchema = z.object({ }); export type ClusterSummariesResponse = z.infer; +export const durableClusterMemberSchema = z.object({ + thread: threadSchema, + role: z.enum(['canonical', 'duplicate', 'related']), + state: z.enum(['active', 'removed_by_user', 'blocked_by_override', 'pending_review', 'stale']), + scoreToRepresentative: z.number().nullable(), +}); +export type DurableClusterMemberDto = z.infer; + +export const durableClusterSchema = z.object({ + clusterId: z.number().int().positive(), + stableKey: z.string(), + stableSlug: z.string(), + status: z.enum(['active', 'closed', 'merged', 'split']), + clusterType: z.string().nullable(), + title: z.string().nullable(), + representativeThreadId: z.number().int().positive().nullable(), + activeCount: z.number().int().nonnegative(), + removedCount: z.number().int().nonnegative(), + blockedCount: z.number().int().nonnegative(), + members: z.array(durableClusterMemberSchema), +}); +export type DurableClusterDto = z.infer; + +export const durableClustersResponseSchema = z.object({ + repository: repositorySchema, + clusters: z.array(durableClusterSchema), +}); +export type DurableClustersResponse = z.infer; + export const threadSummariesSchema = z.object({ problem_summary: z.string().optional(), solution_summary: z.string().optional(), From ade2a8ac7c9a68720ab4217f42f1cffe75e930a8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:33:19 -0700 Subject: [PATCH 023/215] feat(cluster): list durable clusters --- packages/api-core/src/service.test.ts | 56 ++++++++++++++++++ packages/api-core/src/service.ts | 85 +++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index b8d3381..8e5f5fd 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2997,6 +2997,62 @@ test('excludeThreadFromCluster records a durable manual exclusion', () => { } }); +test('listDurableClusters returns stable slugs and governed member states', () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }); + + try { + const now = '2026-03-10T12:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Issue one', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Issue two', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + const insertMembership = service.db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertMembership.run(7, 10, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + insertMembership.run(7, 11, 'related', 'blocked_by_override', 0.87, null, null, 'algo', 'user', '{}', '{}', now, now, now); + + const response = service.listDurableClusters({ owner: 'openclaw', repo: 'openclaw' }); + + assert.equal(response.clusters[0]?.stableSlug, 'trace-alpha-river'); + assert.equal(response.clusters[0]?.activeCount, 1); + assert.equal(response.clusters[0]?.blockedCount, 1); + assert.equal(response.clusters[0]?.members[1]?.state, 'blocked_by_override'); + } finally { + service.close(); + } +}); + test('syncRepository records actors and repo stats from thread and comment authors', async () => { const service = makeTestService({ checkAuth: async () => undefined, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 361252d..b44d1dc 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -18,6 +18,7 @@ import { clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, + durableClustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, @@ -36,6 +37,7 @@ import { type ClusterResultDto, type ClusterSummariesResponse, type ClustersResponse, + type DurableClustersResponse, type ExcludeClusterMemberRequest, type EmbedResultDto, type HealthResponse, @@ -2140,6 +2142,89 @@ export class GHCrawlService { }); } + listDurableClusters(params: { owner: string; repo: string; includeInactive?: boolean; memberLimit?: number }): DurableClustersResponse { + const repository = this.requireRepository(params.owner, params.repo); + const clusterRows = this.db + .prepare( + `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title + from cluster_groups + where repo_id = ? + and (? = 1 or status = 'active') + order by updated_at desc, id asc`, + ) + .all(repository.id, params.includeInactive ? 1 : 0) as Array<{ + id: number; + stable_key: string; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + cluster_type: string | null; + representative_thread_id: number | null; + title: string | null; + }>; + if (clusterRows.length === 0) { + return durableClustersResponseSchema.parse({ repository, clusters: [] }); + } + + const clusterIds = clusterRows.map((row) => row.id); + const placeholders = clusterIds.map(() => '?').join(','); + const memberRows = this.db + .prepare( + `select + cm.cluster_id, + cm.role as membership_role, + cm.state as membership_state, + cm.score_to_representative as membership_score, + t.* + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id in (${placeholders}) + order by + case cm.role when 'canonical' then 0 else 1 end, + case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, + t.number asc`, + ) + .all(...clusterIds) as Array< + ThreadRow & { + cluster_id: number; + membership_role: 'canonical' | 'duplicate' | 'related'; + membership_state: 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; + membership_score: number | null; + } + >; + const membersByCluster = new Map(); + for (const row of memberRows) { + const members = membersByCluster.get(row.cluster_id) ?? []; + members.push(row); + membersByCluster.set(row.cluster_id, members); + } + + return durableClustersResponseSchema.parse({ + repository, + clusters: clusterRows.map((cluster) => { + const rows = membersByCluster.get(cluster.id) ?? []; + const visibleRows = params.memberLimit === undefined ? rows : rows.slice(0, params.memberLimit); + return { + clusterId: cluster.id, + stableKey: cluster.stable_key, + stableSlug: cluster.stable_slug, + status: cluster.status, + clusterType: cluster.cluster_type, + title: cluster.title, + representativeThreadId: cluster.representative_thread_id, + activeCount: rows.filter((row) => row.membership_state === 'active').length, + removedCount: rows.filter((row) => row.membership_state === 'removed_by_user').length, + blockedCount: rows.filter((row) => row.membership_state === 'blocked_by_override').length, + members: visibleRows.map((row) => ({ + thread: threadToDto(row), + role: row.membership_role, + state: row.membership_state, + scoreToRepresentative: row.membership_score, + })), + }; + }), + }); + } + async refreshRepository(params: { owner: string; repo: string; From 6322e95654a9bdd43f97c43ea717136cd9381026 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:34:08 -0700 Subject: [PATCH 024/215] feat(api): expose durable clusters --- packages/api-core/src/api/server.test.ts | 67 ++++++++++++++++++++++++ packages/api-core/src/api/server.ts | 16 ++++++ 2 files changed, 83 insertions(+) diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 8d36163..6dd27ba 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -7,6 +7,7 @@ import { clusterDetailResponseSchema, clusterOverrideResponseSchema, clusterSummariesResponseSchema, + durableClustersResponseSchema, healthResponseSchema, neighborsResponseSchema, threadsResponseSchema, @@ -489,6 +490,72 @@ test('exclude cluster member action records a durable override', async () => { } }); +test('durable clusters endpoint returns stable cluster state', async () => { + const service = new GHCrawlService({ + config: { + workspaceRoot: process.cwd(), + configDir: '/tmp/ghcrawl-test', + configPath: '/tmp/ghcrawl-test/config.json', + configFileExists: true, + dbPath: ':memory:', + dbPathSource: 'config', + apiPort: 5179, + secretProvider: 'plaintext', + githubTokenSource: 'none', + openaiApiKeySource: 'none', + summaryModel: 'gpt-5-mini', + embedModel: 'text-embedding-3-large', + embeddingBasis: 'title_original', + vectorBackend: 'vectorlite', + embedBatchSize: 8, + embedConcurrency: 10, + embedMaxUnread: 20, + openSearchIndex: 'ghcrawl-threads', + tuiPreferences: {}, + }, + github: { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + }, + }); + + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', null, 'Cluster trace-alpha-river', now, now); + + const server = createApiServer(service); + try { + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const address = server.address(); + assert(address && typeof address === 'object'); + + const response = await fetch(`http://127.0.0.1:${address.port}/durable-clusters?owner=openclaw&repo=openclaw`); + assert.equal(response.status, 200); + const payload = durableClustersResponseSchema.parse((await response.json()) as unknown); + assert.equal(payload.clusters[0]?.stableSlug, 'trace-alpha-river'); + } finally { + await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); + service.close(); + } +}); + test('server returns 400 for malformed request inputs', async () => { const service = new GHCrawlService({ config: { diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 4d152fc..d85c095 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -121,6 +121,22 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'GET' && url.pathname === '/durable-clusters') { + const params = parseRepoParams(url); + const includeInactive = url.searchParams.get('includeInactive') === 'true'; + const memberLimitValue = url.searchParams.get('memberLimit'); + sendJson( + res, + 200, + service.listDurableClusters({ + ...params, + includeInactive, + memberLimit: memberLimitValue ? Number(memberLimitValue) : undefined, + }), + ); + return; + } + if (req.method === 'GET' && url.pathname === '/cluster-summaries') { const params = parseRepoParams(url); const sortParam = url.searchParams.get('sort'); From 17680466af3053ac8f44b5e8778c1823ff4c4911 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:35:33 -0700 Subject: [PATCH 025/215] feat(cli): list durable clusters --- apps/cli/src/main.test.ts | 39 +++++++++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 28 ++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 27fb5e1..14fe799 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -51,6 +51,7 @@ const publicCommands = [ 'embed', 'cluster', 'clusters', + 'durable-clusters', 'cluster-detail', 'search', 'neighbors', @@ -273,6 +274,7 @@ test('agent-facing command help advertises explicit --json', async () => { 'embed', 'cluster', 'clusters', + 'durable-clusters', 'cluster-detail', 'search', 'neighbors', @@ -350,6 +352,36 @@ test('exclude-cluster-member command forwards durable override inputs', async () assert.match(stdout.read(), /"state": "removed_by_user"/); }); +test('durable-clusters command forwards stable cluster list options', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.listDurableClusters; + let received: unknown; + + GHCrawlService.prototype.listDurableClusters = function listDurableClustersStub(params: unknown) { + received = params; + return { repository: { fullName: 'openclaw/openclaw' }, clusters: [{ stableSlug: 'trace-alpha-river' }] } as never; + }; + + try { + await run(['durable-clusters', 'openclaw/openclaw', '--include-inactive', '--member-limit', '5'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.listDurableClusters = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + includeInactive: true, + memberLimit: 5, + }); + assert.match(stdout.read(), /trace-alpha-river/); +}); + test('long-running command progress stays on stderr and payload stays on stdout', async () => { const stdout = createWritableCapture(); const stderr = createWritableCapture(); @@ -420,6 +452,13 @@ test('parseRepoFlags accepts include-closed boolean flag', () => { assert.equal(parsed.values['include-closed'], true); }); +test('parseRepoFlags accepts include-inactive durable cluster flag', () => { + const parsed = parseRepoFlags('durable-clusters', ['openclaw/openclaw', '--include-inactive']); + assert.equal(parsed.owner, 'openclaw'); + assert.equal(parsed.repo, 'openclaw'); + assert.equal(parsed.values['include-inactive'], true); +}); + test('parseRepoFlags accepts kind filter for threads', () => { const parsed = parseRepoFlags('threads', ['openclaw/openclaw', '--kind', 'pull_request']); assert.equal(parsed.owner, 'openclaw'); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 690f4a0..22091ae 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -28,6 +28,7 @@ type CommandName = | 'cluster' | 'cluster-experiment' | 'clusters' + | 'durable-clusters' | 'cluster-detail' | 'search' | 'neighbors' @@ -256,6 +257,18 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl cluster-detail openclaw/openclaw --id 123 --member-limit 20 --body-chars 280 --json'], agentJson: true, }, + { + name: 'durable-clusters', + synopsis: 'durable-clusters [--include-inactive] [--member-limit ] [--json]', + description: 'List persistent cluster identities, stable slugs, and governed memberships.', + options: [ + '--include-inactive Include closed, merged, and split durable clusters', + '--member-limit Limit returned members per cluster', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl durable-clusters openclaw/openclaw --member-limit 10 --json'], + agentJson: true, + }, { name: 'search', synopsis: 'search --query [--mode keyword|semantic|hybrid] [--json]', @@ -472,6 +485,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo 'include-comments': { type: 'boolean' }, 'full-reconcile': { type: 'boolean' }, 'include-closed': { type: 'boolean' }, + 'include-inactive': { type: 'boolean' }, kind: { type: 'string' }, number: { type: 'string' }, numbers: { type: 'string' }, @@ -1124,6 +1138,20 @@ export async function run( writeJson(stdout, result); return; } + case 'durable-clusters': { + const { owner, repo, values } = parseRepoFlags('durable-clusters', rest); + const result = getService().listDurableClusters({ + owner, + repo, + includeInactive: values['include-inactive'] === true, + memberLimit: + typeof values['member-limit'] === 'string' + ? parsePositiveInteger('member-limit', values['member-limit'], 'durable-clusters') + : undefined, + }); + writeJson(stdout, result); + return; + } case 'cluster-detail': { const { owner, repo, values } = parseRepoFlags('cluster-detail', rest); if (typeof values.id !== 'string') { From fabed46c64a085fd761dae71b5880a29aed00891 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:40:23 -0700 Subject: [PATCH 026/215] feat(github): fetch pull request files --- packages/api-core/src/api/server.test.ts | 9 ++++ .../api-core/src/cluster/perf.integration.ts | 1 + packages/api-core/src/github/client.ts | 15 +++++++ packages/api-core/src/service.test.ts | 45 +++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 6dd27ba..34dd756 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -48,6 +48,7 @@ test('health endpoint returns contract payload', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -100,6 +101,7 @@ test('neighbors endpoint returns contract payload', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -194,6 +196,7 @@ test('threads endpoint can filter by a bulk number list', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -265,6 +268,7 @@ test('author-threads endpoint returns one author with strongest same-author matc listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -346,6 +350,7 @@ test('close-thread and includeClosed thread routes expose locally closed items', listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -431,6 +436,7 @@ test('exclude cluster member action records a durable override', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -522,6 +528,7 @@ test('durable clusters endpoint returns stable cluster state', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -588,6 +595,7 @@ test('server returns 400 for malformed request inputs', async () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -644,6 +652,7 @@ test('cluster summary and detail endpoints return contract payloads', async () = listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); diff --git a/packages/api-core/src/cluster/perf.integration.ts b/packages/api-core/src/cluster/perf.integration.ts index bce518f..7b9da8e 100644 --- a/packages/api-core/src/cluster/perf.integration.ts +++ b/packages/api-core/src/cluster/perf.integration.ts @@ -181,6 +181,7 @@ function createGitHubStub(): GHCrawlService['github'] { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }; } diff --git a/packages/api-core/src/github/client.ts b/packages/api-core/src/github/client.ts index dc2f733..62740c5 100644 --- a/packages/api-core/src/github/client.ts +++ b/packages/api-core/src/github/client.ts @@ -15,6 +15,7 @@ export type GitHubClient = { ) => Promise>>; getIssue: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise>; getPull: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise>; + listPullFiles: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise>>; listIssueComments: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise>>; listPullReviews: (owner: string, repo: string, number: number, reporter?: GitHubReporter) => Promise>>; listPullReviewComments: ( @@ -196,6 +197,20 @@ export function makeGitHubClient(options: RequestOptions): GitHubClient { return response.data as Record; }); }, + async listPullFiles(owner, repo, number, reporter) { + return paginate( + `GET /repos/${owner}/${repo}/pulls/${number}/files per_page=100`, + undefined, + reporter, + (octokit) => + octokit.paginate.iterator(octokit.rest.pulls.listFiles, { + owner, + repo, + pull_number: number, + per_page: 100, + }) as AsyncIterable>>, + ); + }, async listIssueComments(owner, repo, number, reporter) { return paginate( `GET /repos/${owner}/${repo}/issues/${number}/comments per_page=100`, diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 8e5f5fd..92f05f5 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -72,6 +72,7 @@ test('doctor reports config path and successful auth smoke checks', async () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => { @@ -117,6 +118,7 @@ test('doctor reports invalid token format without attempting auth', async () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -234,6 +236,7 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and listPullReviewCommentCalls += 1; return []; }, + listPullFiles: async () => [], }); try { @@ -364,6 +367,7 @@ test('syncRepository fetches comments, reviews, and review comments when include }, ]; }, + listPullFiles: async () => [], }); try { @@ -402,6 +406,7 @@ test('summarizeRepository excludes hydrated comments by default and reports toke listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, { checkAuth: async () => undefined, @@ -510,6 +515,7 @@ test('summarizeRepository includes hydrated human comments when includeComments listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, { checkAuth: async () => undefined, @@ -617,6 +623,7 @@ test('summarizeRepository prices progress output using the configured summary mo listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, { checkAuth: async () => undefined, @@ -708,6 +715,7 @@ test('purgeComments removes hydrated comments and refreshes canonical documents' listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -805,6 +813,7 @@ test('embedRepository batches multi-source embeddings and skips unchanged inputs listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, { checkAuth: async () => undefined, @@ -912,6 +921,7 @@ test('listNeighbors uses the vectorlite sidecar for current active vectors', asy listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -972,6 +982,7 @@ test('embedRepository prunes closed vectors before reusing current active vector listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1055,6 +1066,7 @@ test('embedRepository truncates oversized inputs before submission', async () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1179,6 +1191,7 @@ test('embedRepository isolates a failing oversized item from a mixed batch and r listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1306,6 +1319,7 @@ test('embedRepository recovers from wrapped maximum input length errors by shrin listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1430,6 +1444,7 @@ test('listNeighbors returns exact nearest neighbors for an embedded thread', () listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -1513,6 +1528,7 @@ test('listAuthorThreads returns one author view with strongest same-author match listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -1586,6 +1602,7 @@ test('clusterRepository emits timed progress updates while identifying similarit listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -1642,6 +1659,7 @@ test('clusterRepository merges source kinds into one edge without directional du listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -1704,6 +1722,7 @@ test('clusterRepository prunes older cluster runs for the repo after a successfu listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -1770,6 +1789,7 @@ test('clusterRepository purges legacy embeddings and inline vector payloads afte listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1875,6 +1895,7 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -1963,6 +1984,7 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, }); @@ -2046,6 +2068,7 @@ test('embedRepository rebuilds a corrupted active vector store during upsert', a listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -2100,6 +2123,7 @@ test('clusterExperiment falls back to active vectors when legacy embeddings are listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => undefined, @@ -2165,6 +2189,7 @@ test('clusterRepository does not retain a parsed embedding cache in-process', as listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2218,6 +2243,7 @@ test('tui snapshot returns mixed issue and pull request counts with default rece listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2318,6 +2344,7 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2421,6 +2448,7 @@ test('getTuiThreadDetail prefers stored cluster neighbors over exact embedding s listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2511,6 +2539,7 @@ test('refreshRepository runs sync, embed, and cluster in order and returns the c listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }, { checkAuth: async () => undefined, @@ -2556,6 +2585,7 @@ test('agent cluster summary and detail dumps expose repo stats, snippets, and su listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2706,6 +2736,7 @@ test('getTuiThreadDetail can skip neighbor loading for fast browse paths', () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2782,6 +2813,7 @@ test('local thread closure updates default thread filters and auto-closes fully listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2867,6 +2899,7 @@ test('manual cluster closure is hidden from JSON summaries by default but remain listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -2932,6 +2965,7 @@ test('excludeThreadFromCluster records a durable manual exclusion', () => { listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3007,6 +3041,7 @@ test('listDurableClusters returns stable slugs and governed member states', () = listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3088,6 +3123,7 @@ test('syncRepository records actors and repo stats from thread and comment autho ], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3195,6 +3231,7 @@ test('syncRepository reconciles stale open threads and marks confirmed closures }, listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3287,6 +3324,7 @@ test('syncRepository treats missing stale pull requests as closed and continues' listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3367,6 +3405,7 @@ test('syncRepository skips stale-open reconciliation for filtered crawls', async listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3431,6 +3470,7 @@ test('syncRepository leaves unseen stale open items alone by default when closed listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3499,6 +3539,7 @@ test('syncRepository performs direct stale-open reconciliation when fullReconcil listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3571,6 +3612,7 @@ test('syncRepository derives the default overlapping since window from the last listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3675,6 +3717,7 @@ test('syncRepository uses an explicit since window for both open and closed over listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3744,6 +3787,7 @@ test('syncRepository skips the closed overlap sweep on the first full scan with listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { @@ -3774,6 +3818,7 @@ test('repository-scoped reads and neighbors do not leak across repos in the same listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], + listPullFiles: async () => [], }); try { From ff780cf98a6de1b3777f0a77f1b79d781e431b3b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:41:03 -0700 Subject: [PATCH 027/215] feat(cluster): derive code hunk signatures --- .../src/cluster/code-signature.test.ts | 65 +++++++++ .../api-core/src/cluster/code-signature.ts | 126 ++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 packages/api-core/src/cluster/code-signature.test.ts create mode 100644 packages/api-core/src/cluster/code-signature.ts diff --git a/packages/api-core/src/cluster/code-signature.test.ts b/packages/api-core/src/cluster/code-signature.test.ts new file mode 100644 index 0000000..6159c00 --- /dev/null +++ b/packages/api-core/src/cluster/code-signature.test.ts @@ -0,0 +1,65 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import { buildCodeSnapshotSignature, extractHunkSignatures, normalizePullFile } from './code-signature.js'; + +test('normalizePullFile extracts stable GitHub file metadata', () => { + const file = normalizePullFile({ + filename: 'packages/api-core/src/service.ts', + status: 'modified', + previous_filename: 'old.ts', + additions: 4, + deletions: 2, + changes: 6, + patch: '@@ -1 +1 @@\n-old\n+new', + sha: 'abc', + }); + + assert.deepEqual(file, { + filename: 'packages/api-core/src/service.ts', + status: 'modified', + previousFilename: 'old.ts', + additions: 4, + deletions: 2, + changes: 6, + patch: '@@ -1 +1 @@\n-old\n+new', + sha: 'abc', + }); +}); + +test('extractHunkSignatures produces deterministic hashes per diff hunk', () => { + const patch = [ + '@@ -1,3 +1,3 @@', + ' export function run() {', + '- return oldValue;', + '+ return newValue;', + ' }', + '@@ -10,2 +10,2 @@', + '-const mode = "slow";', + '+const mode = "fast";', + ].join('\n'); + + const first = extractHunkSignatures('src/run.ts', patch); + const second = extractHunkSignatures('src/run.ts', patch); + + assert.equal(first.length, 2); + assert.deepEqual(first, second); + assert.notEqual(first[0]?.hunkHash, first[1]?.hunkHash); +}); + +test('buildCodeSnapshotSignature returns files, patch digest, and hunk signatures', () => { + const snapshot = buildCodeSnapshotSignature([ + { + filename: 'src/run.ts', + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-old\n+new', + }, + ]); + + assert.equal(snapshot.files.length, 1); + assert.equal(snapshot.hunkSignatures.length, 1); + assert.match(snapshot.patchDigest, /^[a-f0-9]{64}$/); +}); diff --git a/packages/api-core/src/cluster/code-signature.ts b/packages/api-core/src/cluster/code-signature.ts new file mode 100644 index 0000000..d64a338 --- /dev/null +++ b/packages/api-core/src/cluster/code-signature.ts @@ -0,0 +1,126 @@ +import crypto from 'node:crypto'; + +const TOKEN_RE = /[a-zA-Z0-9_.$/-]+/g; + +export type PullFileMetadata = { + filename: string; + status?: string | null; + previousFilename?: string | null; + additions: number; + deletions: number; + changes: number; + patch?: string | null; + sha?: string | null; +}; + +export type HunkSignature = { + path: string; + hunkHash: string; + contextHash: string; + addedTokenHash: string; + removedTokenHash: string; +}; + +export type CodeSnapshotSignature = { + files: PullFileMetadata[]; + patchDigest: string; + hunkSignatures: HunkSignature[]; +}; + +function sha256(value: string): string { + return crypto.createHash('sha256').update(value).digest('hex'); +} + +function tokenize(value: string): string[] { + return Array.from(value.toLowerCase().matchAll(TOKEN_RE)).map((match) => match[0]); +} + +function tokenHash(lines: string[]): string { + return sha256(JSON.stringify(lines.flatMap(tokenize).sort())); +} + +export function normalizePullFile(payload: Record): PullFileMetadata { + return { + filename: String(payload.filename ?? ''), + status: typeof payload.status === 'string' ? payload.status : null, + previousFilename: typeof payload.previous_filename === 'string' ? payload.previous_filename : null, + additions: Number(payload.additions ?? 0), + deletions: Number(payload.deletions ?? 0), + changes: Number(payload.changes ?? 0), + patch: typeof payload.patch === 'string' ? payload.patch : null, + sha: typeof payload.sha === 'string' ? payload.sha : null, + }; +} + +export function extractHunkSignatures(path: string, patch: string | null | undefined): HunkSignature[] { + if (!patch) return []; + + const signatures: HunkSignature[] = []; + let header: string | null = null; + let context: string[] = []; + let added: string[] = []; + let removed: string[] = []; + + function flush(): void { + if (!header) return; + const hunkPayload = JSON.stringify({ + path, + header, + contextHash: tokenHash(context), + addedTokenHash: tokenHash(added), + removedTokenHash: tokenHash(removed), + }); + signatures.push({ + path, + hunkHash: sha256(hunkPayload), + contextHash: tokenHash(context), + addedTokenHash: tokenHash(added), + removedTokenHash: tokenHash(removed), + }); + } + + for (const line of patch.split('\n')) { + if (line.startsWith('@@')) { + flush(); + header = line; + context = []; + added = []; + removed = []; + continue; + } + if (!header || line.startsWith('+++') || line.startsWith('---')) continue; + if (line.startsWith('+')) { + added.push(line.slice(1)); + } else if (line.startsWith('-')) { + removed.push(line.slice(1)); + } else { + context.push(line.startsWith(' ') ? line.slice(1) : line); + } + } + flush(); + + return signatures; +} + +export function buildCodeSnapshotSignature(files: Array>): CodeSnapshotSignature { + const normalizedFiles = files.map(normalizePullFile).filter((file) => file.filename.length > 0); + const hunkSignatures = normalizedFiles.flatMap((file) => extractHunkSignatures(file.filename, file.patch)); + const patchDigest = sha256( + JSON.stringify( + normalizedFiles.map((file) => ({ + filename: file.filename, + status: file.status, + previousFilename: file.previousFilename, + additions: file.additions, + deletions: file.deletions, + patchHash: file.patch ? sha256(file.patch) : null, + })), + ), + ); + + return { + files: normalizedFiles, + patchDigest, + hunkSignatures, + }; +} From 3521bc7589a44ecd291159992f86b9bbe55e8eff Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:41:42 -0700 Subject: [PATCH 028/215] feat(cluster): persist code snapshots --- .../src/cluster/persistent-store.test.ts | 59 ++++++++++++++ .../api-core/src/cluster/persistent-store.ts | 79 +++++++++++++++++++ 2 files changed, 138 insertions(+) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index e665a0d..7c2ff6b 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -3,6 +3,7 @@ import assert from 'node:assert/strict'; import { migrate } from '../db/migrate.js'; import { openDb } from '../db/sqlite.js'; +import { buildCodeSnapshotSignature } from './code-signature.js'; import { scoreSimilarityEvidence } from './evidence-score.js'; import { createPipelineRun, @@ -15,6 +16,7 @@ import { upsertSimilarityEdgeEvidence, upsertThreadFingerprint, upsertThreadRevision, + upsertThreadCodeSnapshot, } from './persistent-store.js'; import { buildDeterministicThreadFingerprint } from './thread-fingerprint.js'; @@ -220,3 +222,60 @@ test('persistent cluster store records thread revisions and deterministic finger db.close(); } }); + +test('persistent cluster store records code snapshots, changed files, and hunk signatures', () => { + const db = openDb(':memory:'); + try { + migrate(db); + seedRepoAndThreads(db); + const revisionId = upsertThreadRevision(db, { + threadId: 10, + sourceUpdatedAt: '2026-01-01T00:00:00Z', + title: 'Fix cache collision', + body: '', + labels: [], + rawJson: '{}', + }); + const signature = buildCodeSnapshotSignature([ + { + filename: 'packages/api-core/src/cache.ts', + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-oldKey\n+newKey', + }, + ]); + + const snapshotId = upsertThreadCodeSnapshot(db, { + threadRevisionId: revisionId, + baseSha: 'base', + headSha: 'head', + signature, + }); + + const snapshot = db.prepare('select files_changed, additions, deletions, patch_digest from thread_code_snapshots where id = ?').get(snapshotId) as { + files_changed: number; + additions: number; + deletions: number; + patch_digest: string; + }; + const file = db.prepare('select path, patch_blob_id from thread_changed_files where snapshot_id = ?').get(snapshotId) as { + path: string; + patch_blob_id: number; + }; + const hunkCount = db.prepare('select count(*) as count from thread_hunk_signatures where snapshot_id = ?').get(snapshotId) as { count: number }; + + assert.deepEqual(snapshot, { + files_changed: 1, + additions: 1, + deletions: 1, + patch_digest: signature.patchDigest, + }); + assert.equal(file.path, 'packages/api-core/src/cache.ts'); + assert.ok(file.patch_blob_id > 0); + assert.equal(hunkCount.count, 1); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index eb24a0d..c57ea6f 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -1,6 +1,7 @@ import crypto from 'node:crypto'; import type { SqliteDatabase } from '../db/sqlite.js'; +import type { CodeSnapshotSignature } from './code-signature.js'; import type { EvidenceTier, SimilarityEvidenceBreakdown } from './evidence-score.js'; import type { DeterministicThreadFingerprint } from './thread-fingerprint.js'; @@ -235,6 +236,84 @@ export function upsertThreadFingerprint( ); } +export function upsertThreadCodeSnapshot( + db: SqliteDatabase, + params: { + threadRevisionId: number; + baseSha?: string | null; + headSha?: string | null; + signature: CodeSnapshotSignature; + }, +): number { + const timestamp = nowIso(); + const filesChanged = params.signature.files.length; + const additions = params.signature.files.reduce((sum, file) => sum + file.additions, 0); + const deletions = params.signature.files.reduce((sum, file) => sum + file.deletions, 0); + db.prepare( + `insert into thread_code_snapshots ( + thread_revision_id, base_sha, head_sha, files_changed, additions, deletions, patch_digest, raw_diff_blob_id, created_at + ) values (?, ?, ?, ?, ?, ?, ?, null, ?) + on conflict(thread_revision_id) do update set + base_sha = excluded.base_sha, + head_sha = excluded.head_sha, + files_changed = excluded.files_changed, + additions = excluded.additions, + deletions = excluded.deletions, + patch_digest = excluded.patch_digest`, + ).run( + params.threadRevisionId, + params.baseSha ?? null, + params.headSha ?? null, + filesChanged, + additions, + deletions, + params.signature.patchDigest, + timestamp, + ); + const snapshot = db + .prepare('select id from thread_code_snapshots where thread_revision_id = ? limit 1') + .get(params.threadRevisionId) as { id: number }; + + db.prepare('delete from thread_changed_files where snapshot_id = ?').run(snapshot.id); + db.prepare('delete from thread_hunk_signatures where snapshot_id = ?').run(snapshot.id); + + const insertFile = db.prepare( + `insert into thread_changed_files ( + snapshot_id, path, status, additions, deletions, previous_path, patch_blob_id, patch_hash + ) values (?, ?, ?, ?, ?, ?, ?, ?)`, + ); + for (const file of params.signature.files) { + const patchBlobId = file.patch + ? upsertInlineBlob(db, { + text: file.patch, + mediaType: 'text/x-diff', + }) + : null; + insertFile.run( + snapshot.id, + file.filename, + file.status ?? null, + file.additions, + file.deletions, + file.previousFilename ?? null, + patchBlobId, + file.patch ? stableHash(file.patch) : null, + ); + } + + const insertHunk = db.prepare( + `insert into thread_hunk_signatures ( + snapshot_id, path, hunk_hash, context_hash, added_token_hash, removed_token_hash, created_at + ) values (?, ?, ?, ?, ?, ?, ?) + on conflict(snapshot_id, path, hunk_hash) do nothing`, + ); + for (const hunk of params.signature.hunkSignatures) { + insertHunk.run(snapshot.id, hunk.path, hunk.hunkHash, hunk.contextHash, hunk.addedTokenHash, hunk.removedTokenHash, timestamp); + } + + return snapshot.id; +} + export function createPipelineRun( db: SqliteDatabase, params: { From 0e13773544e1a2ca13dbb8b92e25433ca98139e1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:42:07 -0700 Subject: [PATCH 029/215] feat(contract): report synced code files --- packages/api-contract/src/contracts.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index afb4a50..b303bf2 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -223,6 +223,7 @@ export const syncResultSchema = z.object({ runId: z.number().int().positive(), threadsSynced: z.number().int().nonnegative(), commentsSynced: z.number().int().nonnegative(), + codeFilesSynced: z.number().int().nonnegative().default(0), threadsClosed: z.number().int().nonnegative(), }); export type SyncResultDto = z.infer; From 2588b3f0a443323729253e435495651aec4d58dc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:43:32 -0700 Subject: [PATCH 030/215] feat(sync): hydrate code snapshots --- packages/api-core/src/service.test.ts | 86 ++++++++++++++++++++++++++- packages/api-core/src/service.ts | 44 +++++++++++++- 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 92f05f5..a994675 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -162,6 +162,7 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and let listIssueCommentCalls = 0; let listPullReviewCalls = 0; let listPullReviewCommentCalls = 0; + let listPullFileCalls = 0; const service = makeTestService({ checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), @@ -236,7 +237,10 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and listPullReviewCommentCalls += 1; return []; }, - listPullFiles: async () => [], + listPullFiles: async () => { + listPullFileCalls += 1; + return []; + }, }); try { @@ -266,6 +270,7 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and assert.equal(listIssueCommentCalls, 0); assert.equal(listPullReviewCalls, 0); assert.equal(listPullReviewCommentCalls, 0); + assert.equal(listPullFileCalls, 0); const rows = service.db .prepare('select number, kind, first_pulled_at, last_pulled_at from threads order by number asc') @@ -390,6 +395,85 @@ test('syncRepository fetches comments, reviews, and review comments when include } }); +test('syncRepository hydrates pull request code snapshots when includeCode is enabled', async () => { + let listPullFileCalls = 0; + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [ + { + id: 101, + number: 43, + state: 'open', + title: 'Downloader PR', + body: 'Implements a fix.', + html_url: 'https://github.com/openclaw/openclaw/pull/43', + labels: [{ name: 'bug' }], + assignees: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/43' }, + user: { login: 'alice', type: 'User' }, + }, + ], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async (_owner, _repo, number) => ({ + id: 101, + number, + state: 'open', + title: 'Downloader PR', + body: 'Implements a fix.', + html_url: `https://github.com/openclaw/openclaw/pull/${number}`, + labels: [{ name: 'bug' }], + assignees: [], + user: { login: 'alice', type: 'User' }, + draft: false, + base: { sha: 'base-sha' }, + head: { sha: 'head-sha' }, + updated_at: '2026-03-09T00:00:00Z', + }), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => { + listPullFileCalls += 1; + return [ + { + filename: 'packages/api-core/src/service.ts', + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-old\n+new', + }, + ]; + }, + }); + + try { + const result = await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + includeCode: true, + }); + + assert.equal(result.codeFilesSynced, 1); + assert.equal(listPullFileCalls, 1); + const snapshot = service.db.prepare('select base_sha, head_sha, files_changed from thread_code_snapshots').get() as { + base_sha: string; + head_sha: string; + files_changed: number; + }; + const file = service.db.prepare('select path from thread_changed_files').get() as { path: string }; + const hunkCount = service.db.prepare('select count(*) as count from thread_hunk_signatures').get() as { count: number }; + assert.deepEqual(snapshot, { base_sha: 'base-sha', head_sha: 'head-sha', files_changed: 1 }); + assert.equal(file.path, 'packages/api-core/src/service.ts'); + assert.equal(hunkCount.count, 1); + } finally { + service.close(); + } +}); + test('summarizeRepository excludes hydrated comments by default and reports token usage', async () => { const summaryInputs: string[] = []; const service = makeTestService( diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b44d1dc..c582136 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -54,6 +54,7 @@ import { } from '@ghcrawl/api-contract'; import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; +import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraph } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue } from './cluster/human-key.js'; @@ -68,6 +69,7 @@ import { upsertSimilarityEdgeEvidence, upsertThreadFingerprint, upsertThreadRevision, + upsertThreadCodeSnapshot, } from './cluster/persistent-store.js'; import type { DeterministicThreadFingerprint } from './cluster/thread-fingerprint.js'; import { @@ -252,6 +254,7 @@ type SyncCursorState = { type SyncRunStats = { threadsSynced: number; commentsSynced: number; + codeFilesSynced: number; threadsClosed: number; threadsClosedFromClosedSweep?: number; threadsClosedFromDirectReconcile?: number; @@ -260,6 +263,7 @@ type SyncRunStats = { effectiveSince: string | null; limit: number | null; includeComments: boolean; + includeCode?: boolean; fullReconcile?: boolean; isFullOpenScan: boolean; isOverlappingOpenScan: boolean; @@ -366,6 +370,7 @@ type SyncOptions = { since?: string; limit?: number; includeComments?: boolean; + includeCode?: boolean; fullReconcile?: boolean; onProgress?: (message: string) => void; startedAt?: string; @@ -456,6 +461,8 @@ function parseSyncRunStats(statsJson: string | null): SyncRunStats | null { effectiveSince: typeof parsed.effectiveSince === 'string' ? parsed.effectiveSince : null, limit: typeof parsed.limit === 'number' ? parsed.limit : null, includeComments: parsed.includeComments === true, + codeFilesSynced: typeof parsed.codeFilesSynced === 'number' ? parsed.codeFilesSynced : 0, + includeCode: parsed.includeCode === true, isFullOpenScan: parsed.isFullOpenScan === true, isOverlappingOpenScan: parsed.isOverlappingOpenScan === true, overlapReferenceAt: typeof parsed.overlapReferenceAt === 'string' ? parsed.overlapReferenceAt : null, @@ -975,6 +982,7 @@ export class GHCrawlService { ): Promise { const crawlStartedAt = params.startedAt ?? nowIso(); const includeComments = params.includeComments ?? false; + const includeCode = params.includeCode ?? false; const github = this.requireGithub(); params.onProgress?.(`[sync] fetching repository metadata for ${params.owner}/${params.repo}`); const reporter = params.onProgress ? (message: string) => params.onProgress?.(message.replace(/^\[github\]/, '[sync/github]')) : undefined; @@ -1000,6 +1008,11 @@ export class GHCrawlService { ? '[sync] comment hydration enabled; fetching issue comments, reviews, and review comments' : '[sync] metadata-only mode; skipping comment, review, and review-comment fetches', ); + params.onProgress?.( + includeCode + ? '[sync] code hydration enabled; fetching pull request file metadata and patch signatures' + : '[sync] code hydration disabled; skipping pull request file fetches', + ); if (isFullOpenScan) { params.onProgress?.('[sync] full open scan; no prior completed overlap/full cursor was found for this repository'); } else if (params.since === undefined && effectiveSince && overlapReferenceAt) { @@ -1013,6 +1026,7 @@ export class GHCrawlService { params.onProgress?.(`[sync] discovered ${items.length} threads to process`); let threadsSynced = 0; let commentsSynced = 0; + let codeFilesSynced = 0; for (const [index, item] of items.entries()) { if (index > 0 && index % SYNC_BATCH_SIZE === 0) { @@ -1026,6 +1040,11 @@ export class GHCrawlService { try { const threadPayload = isPr ? await github.getPull(params.owner, params.repo, number, reporter) : item; const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt); + if (includeCode && isPr) { + const files = await github.listPullFiles(params.owner, params.repo, number, reporter); + this.persistThreadCodeSnapshot(threadId, threadPayload, files); + codeFilesSynced += files.length; + } if (includeComments) { const comments = await this.fetchThreadComments(params.owner, params.repo, number, isPr, reporter); this.replaceComments(threadId, comments); @@ -1090,12 +1109,14 @@ export class GHCrawlService { this.finishRun('sync_runs', runId, 'completed', { threadsSynced, commentsSynced, + codeFilesSynced, threadsClosed, crawlStartedAt, requestedSince: params.since ?? null, effectiveSince: effectiveSince ?? null, limit: params.limit ?? null, includeComments, + includeCode, fullReconcile: params.fullReconcile ?? false, isFullOpenScan, isOverlappingOpenScan, @@ -1104,7 +1125,7 @@ export class GHCrawlService { threadsClosedFromDirectReconcile, reconciledOpenCloseAt, } satisfies SyncRunStats, undefined, finishedAt); - return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, threadsClosed }); + return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, codeFilesSynced, threadsClosed }); } catch (error) { this.finishRun('sync_runs', runId, 'failed', null, error); throw error; @@ -3439,6 +3460,27 @@ export class GHCrawlService { return row.id; } + private persistThreadCodeSnapshot(threadId: number, threadPayload: Record, files: Array>): void { + const title = String(threadPayload.title ?? `#${threadPayload.number}`); + const body = typeof threadPayload.body === 'string' ? threadPayload.body : null; + const revisionId = upsertThreadRevision(this.db, { + threadId, + sourceUpdatedAt: typeof threadPayload.updated_at === 'string' ? threadPayload.updated_at : null, + title, + body, + labels: parseLabels(threadPayload), + rawJson: asJson(threadPayload), + }); + const base = threadPayload.base as Record | undefined; + const head = threadPayload.head as Record | undefined; + upsertThreadCodeSnapshot(this.db, { + threadRevisionId: revisionId, + baseSha: typeof base?.sha === 'string' ? base.sha : null, + headSha: typeof head?.sha === 'string' ? head.sha : null, + signature: buildCodeSnapshotSignature(files), + }); + } + private async applyClosedOverlapSweep(params: { repoId: number; owner: string; From 1afc3dceef5a230aa54e7f57fae31d126789ec9a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:44:07 -0700 Subject: [PATCH 031/215] feat(cluster): strengthen hunk evidence --- packages/api-core/src/cluster/evidence-score.test.ts | 10 ++++++++++ packages/api-core/src/cluster/evidence-score.ts | 1 + 2 files changed, 11 insertions(+) diff --git a/packages/api-core/src/cluster/evidence-score.test.ts b/packages/api-core/src/cluster/evidence-score.test.ts index b7e4151..c88c6ee 100644 --- a/packages/api-core/src/cluster/evidence-score.test.ts +++ b/packages/api-core/src/cluster/evidence-score.test.ts @@ -63,6 +63,16 @@ test('scoreSimilarityEvidence can improve confidence with optional enrichment', assert.ok(enriched.score > base.score); }); +test('scoreSimilarityEvidence treats exact hunk overlap as strong evidence without prose similarity', () => { + const left = fp({ id: 1, title: 'Replace queue scheduler', body: 'Internal refactor.', hunks: ['same-hunk'] }); + const right = fp({ id: 2, title: 'Patch database migrator', body: 'Unrelated words.', hunks: ['same-hunk'] }); + + const evidence = scoreSimilarityEvidence(left, right); + + assert.equal(evidence.tier, 'strong'); + assert.equal(evidence.hunkOverlap, 1); +}); + test('scoreSimilarityEvidence rejects unrelated deterministic fingerprints', () => { const left = fp({ id: 1, title: 'Fix cache key collision', files: ['packages/api-core/src/cache.ts'] }); const right = fp({ id: 2, title: 'Update docs typography', files: ['docs/design.md'] }); diff --git a/packages/api-core/src/cluster/evidence-score.ts b/packages/api-core/src/cluster/evidence-score.ts index 6faa754..5f16df7 100644 --- a/packages/api-core/src/cluster/evidence-score.ts +++ b/packages/api-core/src/cluster/evidence-score.ts @@ -70,6 +70,7 @@ export function scoreSimilarityEvidence( let tier: EvidenceTier = 'none'; if ( base.lineage >= 0.8 || + base.hunkOverlap >= 0.8 || (base.linkedRefOverlap >= 0.8 && (base.structure >= 0.25 || base.titleOverlap >= 0.25)) || score >= config.strongScore ) { From 5cf8aaacd63dced2f9ea7d5f0c1722e147032806 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:45:03 -0700 Subject: [PATCH 032/215] feat(cluster): load code signatures --- packages/api-core/src/service.test.ts | 83 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 68 ++++++++++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index a994675..e93b8d0 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2109,6 +2109,89 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar } }); +test('clusterRepository uses hydrated code hunk signatures without embeddings', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [ + { + id: 100, + number: 42, + state: 'open', + title: 'Rewrite scheduler state', + body: 'Internal cleanup.', + html_url: 'https://github.com/openclaw/openclaw/pull/42', + labels: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/42' }, + user: { login: 'alice', type: 'User' }, + }, + { + id: 101, + number: 43, + state: 'open', + title: 'Patch migration locking', + body: 'Different prose.', + html_url: 'https://github.com/openclaw/openclaw/pull/43', + labels: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/43' }, + user: { login: 'bob', type: 'User' }, + }, + ], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async (_owner, _repo, number) => ({ + id: number, + number, + state: 'open', + title: number === 42 ? 'Rewrite scheduler state' : 'Patch migration locking', + body: number === 42 ? 'Internal cleanup.' : 'Different prose.', + html_url: `https://github.com/openclaw/openclaw/pull/${number}`, + labels: [], + user: { login: number === 42 ? 'alice' : 'bob', type: 'User' }, + draft: false, + base: { sha: 'base-sha' }, + head: { sha: `head-${number}` }, + updated_at: '2026-03-09T00:00:00Z', + }), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [ + { + filename: 'packages/api-core/src/cluster/build.ts', + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-oldCluster\n+newCluster', + }, + ], + }, + }); + + try { + const sync = await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + includeCode: true, + }); + const result = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + minScore: 0.1, + }); + + assert.equal(sync.codeFilesSynced, 2); + assert.equal(result.edges, 1); + assert.equal(result.clusters, 1); + } finally { + service.close(); + } +}); + test('embedRepository rebuilds a corrupted active vector store during upsert', async () => { const vectors = new Map(); let failNextUpsert = true; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c582136..b94b065 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -4255,6 +4255,9 @@ export class GHCrawlService { labels: string[]; rawJson: string; updatedAtGh: string | null; + changedFiles: string[]; + hunkSignatures: string[]; + patchIds: string[]; }> { const rows = this.db .prepare( @@ -4275,6 +4278,7 @@ export class GHCrawlService { raw_json: string; updated_at_gh: string | null; }>; + const codeFeaturesByThread = this.loadLatestCodeFeatures(rows.map((row) => row.id)); return rows.map((row) => ({ id: row.id, number: row.number, @@ -4284,9 +4288,73 @@ export class GHCrawlService { labels: parseArray(row.labels_json), rawJson: row.raw_json, updatedAtGh: row.updated_at_gh, + changedFiles: codeFeaturesByThread.get(row.id)?.changedFiles ?? [], + hunkSignatures: codeFeaturesByThread.get(row.id)?.hunkSignatures ?? [], + patchIds: codeFeaturesByThread.get(row.id)?.patchIds ?? [], })); } + private loadLatestCodeFeatures(threadIds: number[]): Map { + if (threadIds.length === 0) return new Map(); + const placeholders = threadIds.map(() => '?').join(','); + const latestRevisions = this.db + .prepare( + `select thread_id, max(id) as revision_id + from thread_revisions + where thread_id in (${placeholders}) + group by thread_id`, + ) + .all(...threadIds) as Array<{ thread_id: number; revision_id: number }>; + if (latestRevisions.length === 0) return new Map(); + + const revisionToThread = new Map(latestRevisions.map((row) => [row.revision_id, row.thread_id])); + const revisionPlaceholders = latestRevisions.map(() => '?').join(','); + const fileRows = this.db + .prepare( + `select cs.thread_revision_id, cf.path, cf.patch_hash + from thread_code_snapshots cs + join thread_changed_files cf on cf.snapshot_id = cs.id + where cs.thread_revision_id in (${revisionPlaceholders}) + order by cf.path asc`, + ) + .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; path: string; patch_hash: string | null }>; + const hunkRows = this.db + .prepare( + `select cs.thread_revision_id, hs.hunk_hash + from thread_code_snapshots cs + join thread_hunk_signatures hs on hs.snapshot_id = cs.id + where cs.thread_revision_id in (${revisionPlaceholders}) + order by hs.hunk_hash asc`, + ) + .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; hunk_hash: string }>; + + const out = new Map(); + function entry(threadId: number): { changedFiles: string[]; hunkSignatures: string[]; patchIds: string[] } { + const existing = out.get(threadId) ?? { changedFiles: [], hunkSignatures: [], patchIds: [] }; + out.set(threadId, existing); + return existing; + } + for (const row of fileRows) { + const threadId = revisionToThread.get(row.thread_revision_id); + if (threadId === undefined) continue; + const target = entry(threadId); + target.changedFiles.push(row.path); + if (row.patch_hash) target.patchIds.push(row.patch_hash); + } + for (const row of hunkRows) { + const threadId = revisionToThread.get(row.thread_revision_id); + if (threadId === undefined) continue; + entry(threadId).hunkSignatures.push(row.hunk_hash); + } + + for (const target of out.values()) { + target.changedFiles = Array.from(new Set(target.changedFiles)).sort(); + target.hunkSignatures = Array.from(new Set(target.hunkSignatures)).sort(); + target.patchIds = Array.from(new Set(target.patchIds)).sort(); + } + return out; + } + private persistDeterministicFingerprints( items: Array<{ id: number; From 5ca91a256c1538de413a402da2a255ef944dde54 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:45:55 -0700 Subject: [PATCH 033/215] feat(cli): add code hydration flag --- apps/cli/src/main.test.ts | 32 ++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 5 ++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 14fe799..c157771 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -413,6 +413,31 @@ test('long-running command progress stays on stderr and payload stays on stdout' assert.doesNotMatch(stdout.read(), /\[sync] started/); }); +test('sync command forwards include-code hydration flag', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.syncRepository; + let received: unknown; + + GHCrawlService.prototype.syncRepository = async function syncRepositoryStub(params: unknown) { + received = params; + return { runId: 1, threadsSynced: 1, commentsSynced: 0, codeFilesSynced: 1, threadsClosed: 0 } as never; + }; + + try { + await run(['sync', 'openclaw/openclaw', '--include-code'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.syncRepository = original; + context.cleanup(); + } + + assert.equal((received as { includeCode?: boolean }).includeCode, true); + assert.match(stdout.read(), /"codeFilesSynced": 1/); +}); + test('parseOwnerRepo accepts owner slash repo syntax', () => { assert.deepEqual(parseOwnerRepo('openclaw/openclaw'), { owner: 'openclaw', repo: 'openclaw' }); }); @@ -438,6 +463,13 @@ test('parseRepoFlags accepts include-comments boolean flag', () => { assert.equal(parsed.values['include-comments'], true); }); +test('parseRepoFlags accepts include-code boolean flag', () => { + const parsed = parseRepoFlags('sync', ['openclaw/openclaw', '--include-code']); + assert.equal(parsed.owner, 'openclaw'); + assert.equal(parsed.repo, 'openclaw'); + assert.equal(parsed.values['include-code'], true); +}); + test('parseRepoFlags accepts full-reconcile boolean flag', () => { const parsed = parseRepoFlags('sync', ['openclaw/openclaw', '--full-reconcile']); assert.equal(parsed.owner, 'openclaw'); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 22091ae..c0fa4f2 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -125,12 +125,13 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'sync', - synopsis: 'sync [--since ] [--limit ] [--include-comments] [--full-reconcile] [--json]', + synopsis: 'sync [--since ] [--limit ] [--include-comments] [--include-code] [--full-reconcile] [--json]', description: 'Sync open GitHub issues and PRs into the local database.', options: [ '--since Limit sync window using ISO time or 15m/2h/7d/1mo', '--limit Limit the number of synced items', '--include-comments Hydrate issue comments, PR reviews, and review comments', + '--include-code Hydrate pull request file metadata and patch signatures', '--full-reconcile Reconcile stale open items instead of metadata-only incrementals', '--json Emit machine-readable JSON output explicitly', ], @@ -483,6 +484,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo limit: { type: 'string' }, json: { type: 'boolean' }, 'include-comments': { type: 'boolean' }, + 'include-code': { type: 'boolean' }, 'full-reconcile': { type: 'boolean' }, 'include-closed': { type: 'boolean' }, 'include-inactive': { type: 'boolean' }, @@ -950,6 +952,7 @@ export async function run( since: typeof values.since === 'string' ? resolveSinceValue(values.since) : undefined, limit: typeof values.limit === 'string' ? parsePositiveInteger('limit', values.limit, 'sync') : undefined, includeComments: values['include-comments'] === true, + includeCode: values['include-code'] === true, fullReconcile: values['full-reconcile'] === true, onProgress: (message: string) => writeProgress(message, stderr), }); From d6e2eabe54cb0415e92ab57bc75ef356da019de1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:46:16 -0700 Subject: [PATCH 034/215] feat(contract): allow refresh code hydration --- packages/api-contract/src/contracts.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index b303bf2..6e06450 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -247,6 +247,7 @@ export const refreshRequestSchema = z.object({ sync: z.boolean().optional(), embed: z.boolean().optional(), cluster: z.boolean().optional(), + includeCode: z.boolean().optional(), }); export type RefreshRequest = z.infer; From 7ae7fb6079b2113d0e73a698f7980734f4bbbf51 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:47:19 -0700 Subject: [PATCH 035/215] feat(refresh): propagate code hydration --- packages/api-core/src/service.test.ts | 43 ++++++++++++++++++++++++++- packages/api-core/src/service.ts | 2 ++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index e93b8d0..ad3ca30 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -72,7 +72,7 @@ test('doctor reports config path and successful auth smoke checks', async () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], - listPullFiles: async () => [], + listPullFiles: async () => [], }, ai: { checkAuth: async () => { @@ -2742,6 +2742,47 @@ test('refreshRepository runs sync, embed, and cluster in order and returns the c } }); +test('refreshRepository forwards includeCode to sync stage', async () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + let receivedIncludeCode: boolean | undefined; + const originalSyncRepository = service.syncRepository.bind(service); + service.syncRepository = (async (params: Parameters[0]) => { + receivedIncludeCode = params.includeCode; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', '2026-03-09T00:00:00Z'); + return { runId: 1, threadsSynced: 0, commentsSynced: 0, codeFilesSynced: 0, threadsClosed: 0 }; + }) as typeof service.syncRepository; + + try { + await service.refreshRepository({ + owner: 'openclaw', + repo: 'openclaw', + embed: false, + cluster: false, + includeCode: true, + }); + + assert.equal(receivedIncludeCode, true); + } finally { + service.syncRepository = originalSyncRepository; + service.close(); + } +}); + test('agent cluster summary and detail dumps expose repo stats, snippets, and summaries', () => { const service = makeTestService({ checkAuth: async () => undefined, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b94b065..c1cacdd 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -2252,6 +2252,7 @@ export class GHCrawlService { sync?: boolean; embed?: boolean; cluster?: boolean; + includeCode?: boolean; onProgress?: (message: string) => void; }): Promise { const selected = { @@ -2274,6 +2275,7 @@ export class GHCrawlService { sync = await this.syncRepository({ owner: params.owner, repo: params.repo, + includeCode: params.includeCode, onProgress: params.onProgress, }); } From b7a37eb82c2a2e7d1d55a284acdb3deb555e904a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:47:51 -0700 Subject: [PATCH 036/215] feat(cli): refresh code hydration --- apps/cli/src/main.test.ts | 31 +++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 4 +++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index c157771..ffbdb3f 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -438,6 +438,37 @@ test('sync command forwards include-code hydration flag', async () => { assert.match(stdout.read(), /"codeFilesSynced": 1/); }); +test('refresh command forwards include-code hydration flag', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.refreshRepository; + let received: unknown; + + GHCrawlService.prototype.refreshRepository = async function refreshRepositoryStub(params: unknown) { + received = params; + return { + repository: { fullName: 'openclaw/openclaw' }, + selected: { sync: true, embed: true, cluster: true }, + sync: { codeFilesSynced: 1 }, + embed: null, + cluster: null, + } as never; + }; + + try { + await run(['refresh', 'openclaw/openclaw', '--include-code'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.refreshRepository = original; + context.cleanup(); + } + + assert.equal((received as { includeCode?: boolean }).includeCode, true); + assert.match(stdout.read(), /"codeFilesSynced": 1/); +}); + test('parseOwnerRepo accepts owner slash repo syntax', () => { assert.deepEqual(parseOwnerRepo('openclaw/openclaw'), { owner: 'openclaw', repo: 'openclaw' }); }); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index c0fa4f2..2899553 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -140,10 +140,11 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'refresh', - synopsis: 'refresh [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', + synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', description: 'Run sync, embed, and cluster in one staged pipeline.', options: [ '--no-sync Skip the GitHub sync stage', + '--include-code Hydrate pull request file metadata during sync', '--no-embed Skip the embeddings stage', '--no-cluster Skip the clustering stage', '--heap-snapshot-dir Write heap snapshots during long-running work', @@ -969,6 +970,7 @@ export async function run( sync: values['no-sync'] === true ? false : undefined, embed: values['no-embed'] === true ? false : undefined, cluster: values['no-cluster'] === true ? false : undefined, + includeCode: values['include-code'] === true, onProgress: heapDiagnostics?.wrapProgress((message: string) => writeProgress(message, stderr)) ?? ((message: string) => writeProgress(message, stderr)), From b54817b290d4ab78fcbdb09429469dde6e6d74ee Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:48:17 -0700 Subject: [PATCH 037/215] feat(openai): add key summary generation --- packages/api-core/src/openai/provider.ts | 52 ++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index 53a6cc3..5332d7b 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -3,6 +3,8 @@ import { APIConnectionError, APIConnectionTimeoutError, APIError, RateLimitError import { zodTextFormat } from 'openai/helpers/zod'; import { z } from 'zod'; +import { LLM_KEY_SUMMARY_SYSTEM_PROMPT, llmKeySummarySchema, type LlmKeySummary } from '../cluster/llm-key-summary.js'; + export type SummaryResult = { problemSummary: string; solutionSummary: string; @@ -21,6 +23,7 @@ export type SummaryUsage = { export type AiProvider = { checkAuth: () => Promise; summarizeThread: (params: { model: string; text: string }) => Promise<{ summary: SummaryResult; usage?: SummaryUsage }>; + generateKeySummary?: (params: { model: string; text: string }) => Promise<{ summary: LlmKeySummary; usage?: SummaryUsage }>; embedTexts: (params: { model: string; texts: string[]; dimensions?: number }) => Promise; }; @@ -116,6 +119,55 @@ export class OpenAiProvider implements AiProvider { throw new Error(`OpenAI summarization failed after 3 attempts: ${lastError?.message ?? 'unknown error'}`); } + async generateKeySummary(params: { model: string; text: string }): Promise<{ summary: LlmKeySummary; usage?: SummaryUsage }> { + const format = zodTextFormat(llmKeySummarySchema, 'ghcrawl_key_summary'); + let lastError: Error | null = null; + + for (const [attemptIndex, maxOutputTokens] of [240, 400, 600].entries()) { + try { + const response = await this.client.responses.create({ + model: params.model, + input: [ + { + role: 'system', + content: [{ type: 'input_text', text: LLM_KEY_SUMMARY_SYSTEM_PROMPT }], + }, + { + role: 'user', + content: [{ type: 'input_text', text: params.text }], + }, + ], + text: { + format, + verbosity: 'low', + }, + max_output_tokens: maxOutputTokens, + }); + + const raw = response.output_text ?? ''; + return { + summary: llmKeySummarySchema.parse(JSON.parse(raw)), + usage: response.usage + ? { + inputTokens: response.usage.input_tokens, + outputTokens: response.usage.output_tokens, + totalTokens: response.usage.total_tokens, + cachedInputTokens: response.usage.input_tokens_details?.cached_tokens ?? 0, + reasoningTokens: response.usage.output_tokens_details?.reasoning_tokens ?? 0, + } + : undefined, + }; + } catch (error) { + lastError = error instanceof Error ? error : new Error(String(error)); + if (attemptIndex === 2) { + break; + } + } + } + + throw new Error(`OpenAI key summarization failed after 3 attempts: ${lastError?.message ?? 'unknown error'}`); + } + async embedTexts(params: { model: string; texts: string[]; dimensions?: number }): Promise { if (params.texts.length === 0) { return []; From fc7c8f546ed51be57549a849bfc0ab0b548cbc99 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:48:56 -0700 Subject: [PATCH 038/215] feat(cluster): persist key summaries --- .../src/cluster/persistent-store.test.ts | 41 ++++++++++++++++++ .../api-core/src/cluster/persistent-store.ts | 42 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index 7c2ff6b..1c84fd2 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -17,6 +17,7 @@ import { upsertThreadFingerprint, upsertThreadRevision, upsertThreadCodeSnapshot, + upsertThreadKeySummary, } from './persistent-store.js'; import { buildDeterministicThreadFingerprint } from './thread-fingerprint.js'; @@ -279,3 +280,43 @@ test('persistent cluster store records code snapshots, changed files, and hunk s db.close(); } }); + +test('persistent cluster store records structured key summaries', () => { + const db = openDb(':memory:'); + try { + migrate(db); + seedRepoAndThreads(db); + const revisionId = upsertThreadRevision(db, { + threadId: 10, + sourceUpdatedAt: '2026-01-01T00:00:00Z', + title: 'Fix cache collision', + body: '', + labels: [], + rawJson: '{}', + }); + + upsertThreadKeySummary(db, { + threadRevisionId: revisionId, + summaryKind: 'llm_key_3line', + promptVersion: 'llm-key-summary-v1', + provider: 'openai', + model: 'gpt-5-mini', + inputHash: 'input-hash', + summary: { + intent: 'Fix cache collision.', + surface: 'API core cache.', + mechanism: 'Changes cache key derivation.', + }, + }); + + const row = db.prepare('select input_hash, key_text from thread_key_summaries where thread_revision_id = ?').get(revisionId) as { + input_hash: string; + key_text: string; + }; + assert.equal(row.input_hash, 'input-hash'); + assert.match(row.key_text, /intent: Fix cache collision\./); + assert.match(row.key_text, /surface: API core cache\./); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index c57ea6f..3256478 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -3,6 +3,7 @@ import crypto from 'node:crypto'; import type { SqliteDatabase } from '../db/sqlite.js'; import type { CodeSnapshotSignature } from './code-signature.js'; import type { EvidenceTier, SimilarityEvidenceBreakdown } from './evidence-score.js'; +import { llmKeyEmbeddingText, type LlmKeySummary } from './llm-key-summary.js'; import type { DeterministicThreadFingerprint } from './thread-fingerprint.js'; function nowIso(): string { @@ -314,6 +315,47 @@ export function upsertThreadCodeSnapshot( return snapshot.id; } +export function upsertThreadKeySummary( + db: SqliteDatabase, + params: { + threadRevisionId: number; + summaryKind: string; + promptVersion: string; + provider: string; + model: string; + inputHash: string; + summary: LlmKeySummary; + }, +): void { + const outputJson = JSON.stringify(params.summary); + const outputBlobId = upsertInlineBlob(db, { + text: outputJson, + mediaType: 'application/vnd.ghcrawl.key-summary+json', + }); + db.prepare( + `insert into thread_key_summaries ( + thread_revision_id, summary_kind, prompt_version, provider, model, + input_hash, output_hash, output_json_blob_id, key_text, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, summary_kind, prompt_version, provider, model) do update set + input_hash = excluded.input_hash, + output_hash = excluded.output_hash, + output_json_blob_id = excluded.output_json_blob_id, + key_text = excluded.key_text`, + ).run( + params.threadRevisionId, + params.summaryKind, + params.promptVersion, + params.provider, + params.model, + params.inputHash, + stableHash(outputJson), + outputBlobId, + llmKeyEmbeddingText(params.summary), + nowIso(), + ); +} + export function createPipelineRun( db: SqliteDatabase, params: { From c04c70904e5c076f754fcc437218fa8528030b7d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:50:43 -0700 Subject: [PATCH 039/215] feat(cluster): generate key summaries --- packages/api-core/src/service.test.ts | 74 ++++++++++++++++- packages/api-core/src/service.ts | 113 ++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index ad3ca30..f049b85 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -118,7 +118,7 @@ test('doctor reports invalid token format without attempting auth', async () => listIssueComments: async () => [], listPullReviews: async () => [], listPullReviewComments: async () => [], - listPullFiles: async () => [], + listPullFiles: async () => [], }, }); @@ -785,6 +785,78 @@ test('summarizeRepository prices progress output using the configured summary mo } }); +test('generateKeySummaries stores cached 3-line key summaries', async () => { + let calls = 0; + const service = makeTestService( + { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + { + checkAuth: async () => undefined, + summarizeThread: async () => { + throw new Error('not expected'); + }, + generateKeySummary: async () => { + calls += 1; + return { + summary: { + intent: 'Fix retry loop.', + surface: 'Downloader.', + mechanism: 'Changes timeout handling.', + }, + usage: { + inputTokens: 10, + outputTokens: 5, + totalTokens: 15, + cachedInputTokens: 0, + reasoningTokens: 0, + }, + }; + }, + embedTexts: async () => [], + }, + ); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + + const first = await service.generateKeySummaries({ owner: 'openclaw', repo: 'openclaw' }); + const second = await service.generateKeySummaries({ owner: 'openclaw', repo: 'openclaw' }); + + assert.equal(first.generated, 1); + assert.equal(first.totalTokens, 15); + assert.equal(second.skipped, 1); + assert.equal(calls, 1); + const row = service.db.prepare('select key_text from thread_key_summaries').get() as { key_text: string }; + assert.match(row.key_text, /intent: Fix retry loop\./); + } finally { + service.close(); + } +}); + test('purgeComments removes hydrated comments and refreshes canonical documents', () => { const service = makeTestService({ checkAuth: async () => undefined, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c1cacdd..54f7e41 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -58,6 +58,7 @@ import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraph } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue } from './cluster/human-key.js'; +import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { createPipelineRun, finishPipelineRun, @@ -70,6 +71,7 @@ import { upsertThreadFingerprint, upsertThreadRevision, upsertThreadCodeSnapshot, + upsertThreadKeySummary, } from './cluster/persistent-store.js'; import type { DeterministicThreadFingerprint } from './cluster/thread-fingerprint.js'; import { @@ -1281,6 +1283,117 @@ export class GHCrawlService { } } + async generateKeySummaries(params: { + owner: string; + repo: string; + threadNumber?: number; + limit?: number; + onProgress?: (message: string) => void; + }): Promise<{ runId: number; generated: number; skipped: number; inputTokens: number; outputTokens: number; totalTokens: number }> { + const ai = this.requireAi(); + if (!ai.generateKeySummary) { + throw new Error('Configured AI provider does not support key summary generation.'); + } + const repository = this.requireRepository(params.owner, params.repo); + const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); + + try { + let sql = + `select id, number, title, body, labels_json, raw_json, updated_at_gh + from threads + where repo_id = ? and state = 'open'`; + const args: number[] = [repository.id]; + if (params.threadNumber) { + sql += ' and number = ?'; + args.push(params.threadNumber); + } + sql += ' order by number asc'; + if (params.limit) { + sql += ' limit ?'; + args.push(params.limit); + } + + const rows = this.db.prepare(sql).all(...args) as Array<{ + id: number; + number: number; + title: string; + body: string | null; + labels_json: string; + raw_json: string; + updated_at_gh: string | null; + }>; + params.onProgress?.(`[key-summary] loaded ${rows.length} candidate thread(s) for ${repository.fullName}`); + + let generated = 0; + let skipped = 0; + let inputTokens = 0; + let outputTokens = 0; + let totalTokens = 0; + + for (const row of rows) { + const labels = parseArray(row.labels_json); + const inputHash = llmKeyInputHash({ + title: row.title, + body: row.body, + commentsText: null, + diffText: null, + }); + const revisionId = upsertThreadRevision(this.db, { + threadId: row.id, + sourceUpdatedAt: row.updated_at_gh, + title: row.title, + body: row.body, + labels, + rawJson: row.raw_json, + }); + const existing = this.db + .prepare( + `select input_hash + from thread_key_summaries + where thread_revision_id = ? + and summary_kind = 'llm_key_3line' + and prompt_version = ? + and provider = 'openai' + and model = ? + limit 1`, + ) + .get(revisionId, LLM_KEY_SUMMARY_PROMPT_VERSION, this.config.summaryModel) as { input_hash: string } | undefined; + if (existing?.input_hash === inputHash) { + skipped += 1; + continue; + } + + const result = await ai.generateKeySummary({ + model: this.config.summaryModel, + text: [`title: ${row.title}`, `labels: ${labels.join(', ')}`, `body: ${row.body ?? ''}`].join('\n'), + }); + upsertThreadKeySummary(this.db, { + threadRevisionId: revisionId, + summaryKind: 'llm_key_3line', + promptVersion: LLM_KEY_SUMMARY_PROMPT_VERSION, + provider: 'openai', + model: this.config.summaryModel, + inputHash, + summary: result.summary, + }); + generated += 1; + if (result.usage) { + inputTokens += result.usage.inputTokens; + outputTokens += result.usage.outputTokens; + totalTokens += result.usage.totalTokens; + } + params.onProgress?.(`[key-summary] generated ${generated}/${rows.length} thread #${row.number}`); + } + + const payload = { runId, generated, skipped, inputTokens, outputTokens, totalTokens }; + this.finishRun('summary_runs', runId, 'completed', payload); + return payload; + } catch (error) { + this.finishRun('summary_runs', runId, 'failed', null, error); + throw error; + } + } + purgeComments(params: { owner: string; repo: string; From ae6e3634dc07f7e17ac172937c94384930964785 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:51:45 -0700 Subject: [PATCH 040/215] feat(cli): add key summaries command --- apps/cli/src/main.test.ts | 33 +++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 25 +++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index ffbdb3f..fa47a1d 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -49,6 +49,7 @@ const publicCommands = [ 'close-cluster', 'exclude-cluster-member', 'embed', + 'key-summaries', 'cluster', 'clusters', 'durable-clusters', @@ -272,6 +273,7 @@ test('agent-facing command help advertises explicit --json', async () => { 'close-cluster', 'exclude-cluster-member', 'embed', + 'key-summaries', 'cluster', 'clusters', 'durable-clusters', @@ -469,6 +471,37 @@ test('refresh command forwards include-code hydration flag', async () => { assert.match(stdout.read(), /"codeFilesSynced": 1/); }); +test('key-summaries command forwards enrichment options', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.generateKeySummaries; + let received: unknown; + + GHCrawlService.prototype.generateKeySummaries = async function generateKeySummariesStub(params: unknown) { + received = params; + return { runId: 1, generated: 1, skipped: 0, inputTokens: 10, outputTokens: 5, totalTokens: 15 } as never; + }; + + try { + await run(['key-summaries', 'openclaw/openclaw', '--number', '42', '--limit', '1'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.generateKeySummaries = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + threadNumber: 42, + limit: 1, + onProgress: (received as { onProgress?: unknown }).onProgress, + }); + assert.match(stdout.read(), /"generated": 1/); +}); + test('parseOwnerRepo accepts owner slash repo syntax', () => { assert.deepEqual(parseOwnerRepo('openclaw/openclaw'), { owner: 'openclaw', repo: 'openclaw' }); }); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 2899553..39cbb0c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -23,6 +23,7 @@ type CommandName = | 'close-cluster' | 'exclude-cluster-member' | 'summarize' + | 'key-summaries' | 'purge-comments' | 'embed' | 'cluster' @@ -216,6 +217,18 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl embed openclaw/openclaw --json', 'ghcrawl embed openclaw/openclaw --number 42 --json'], agentJson: true, }, + { + name: 'key-summaries', + synopsis: 'key-summaries [--number ] [--limit ] [--json]', + description: 'Generate cached 3-line LLM key summaries for clustering enrichment.', + options: [ + '--number Restrict key summary work to one thread', + '--limit Limit the number of generated summaries', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl key-summaries openclaw/openclaw --limit 25 --json'], + agentJson: true, + }, { name: 'cluster', synopsis: 'cluster [--k ] [--threshold ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', @@ -1068,6 +1081,18 @@ export async function run( writeJson(stdout, result); return; } + case 'key-summaries': { + const { owner, repo, values } = parseRepoFlags('key-summaries', rest); + const result = await getService().generateKeySummaries({ + owner, + repo, + threadNumber: typeof values.number === 'string' ? parsePositiveInteger('number', values.number, 'key-summaries') : undefined, + limit: typeof values.limit === 'string' ? parsePositiveInteger('limit', values.limit, 'key-summaries') : undefined, + onProgress: (message: string) => writeProgress(message, stderr), + }); + writeJson(stdout, result); + return; + } case 'purge-comments': { const { owner, repo, values } = parseRepoFlags('purge-comments', rest); const result = getService().purgeComments({ From 011601b25812061b1a19d55f6bb19ee113133fae Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 20:57:57 -0700 Subject: [PATCH 041/215] feat(embed): support key summary basis --- apps/cli/src/main.ts | 30 +++++--- apps/cli/src/tui/app.test.ts | 4 + apps/cli/src/tui/app.ts | 24 +++--- packages/api-core/src/cluster/edge-worker.ts | 2 +- packages/api-core/src/config.ts | 4 +- packages/api-core/src/service.test.ts | 79 ++++++++++++++++++++ packages/api-core/src/service.ts | 63 ++++++++++++++-- 7 files changed, 177 insertions(+), 29 deletions(-) diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 39cbb0c..01f2f8c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -60,7 +60,7 @@ type ConfigureReport = { configPath: string; updated: boolean; summaryModel: 'gpt-5-mini' | 'gpt-5.4-mini'; - embeddingBasis: 'title_original' | 'title_summary'; + embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; vectorBackend: 'vectorlite'; costEstimateUsd: { sampleThreads: number; @@ -107,11 +107,11 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'configure', - synopsis: 'configure [--summary-model gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary] [--json]', + synopsis: 'configure [--summary-model gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary|llm_key_summary] [--json]', description: 'Show or update persisted summarization and embedding settings.', options: [ '--summary-model Select gpt-5-mini or gpt-5.4-mini for summarization', - '--embedding-basis Select title_original or title_summary for active vectors', + '--embedding-basis Select title_original, title_summary, or llm_key_summary for active vectors', '--json Emit machine-readable JSON output explicitly', ], examples: ['ghcrawl configure', 'ghcrawl configure --summary-model gpt-5.4-mini', 'ghcrawl configure --embedding-basis title_original --json'], @@ -661,7 +661,7 @@ function buildConfigureReport(options: { configPath: string; updated: boolean; summaryModel: 'gpt-5-mini' | 'gpt-5.4-mini'; - embeddingBasis: 'title_original' | 'title_summary'; + embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; vectorBackend: 'vectorlite'; }): ConfigureReport { return { @@ -720,12 +720,18 @@ export function formatDoctorReport(result: DoctorReport): string { } export function formatConfigureReport(result: ConfigureReport): string { - const basisLabel = result.embeddingBasis === 'title_summary' - ? 'title + dedupe summary' - : 'title + original body'; - const summaryModeNote = result.embeddingBasis === 'title_summary' - ? 'enabled automatically during refresh' - : 'disabled by default; enable title_summary to summarize before embedding'; + const basisLabel = + result.embeddingBasis === 'title_summary' + ? 'title + dedupe summary' + : result.embeddingBasis === 'llm_key_summary' + ? 'title + 3-line LLM key summary' + : 'title + original body'; + const summaryModeNote = + result.embeddingBasis === 'title_summary' + ? 'enabled automatically during refresh' + : result.embeddingBasis === 'llm_key_summary' + ? 'requires key-summaries before embedding' + : 'disabled by default; enable title_summary or llm_key_summary before embedding'; const lines = [ 'ghcrawl configure', `config path: ${result.configPath}`, @@ -927,7 +933,7 @@ export async function run( }); const values = parsed.values as RepoCommandValues; const summaryModel = parseEnum('configure', 'summary-model', values['summary-model'], ['gpt-5-mini', 'gpt-5.4-mini']); - const embeddingBasis = parseEnum('configure', 'embedding-basis', values['embedding-basis'], ['title_original', 'title_summary']); + const embeddingBasis = parseEnum('configure', 'embedding-basis', values['embedding-basis'], ['title_original', 'title_summary', 'llm_key_summary']); const current = getConfig(); const stored = readPersistedConfig(loadConfigOptions); const next = { @@ -947,7 +953,7 @@ export async function run( configPath: current.configPath, updated, summaryModel: next.summaryModel as 'gpt-5-mini' | 'gpt-5.4-mini', - embeddingBasis: next.embeddingBasis as 'title_original' | 'title_summary', + embeddingBasis: next.embeddingBasis as 'title_original' | 'title_summary' | 'llm_key_summary', vectorBackend: 'vectorlite', }); const shouldWriteJson = values.json === true || (stdout as NodeJS.WriteStream).isTTY !== true; diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 19289d9..dde75fa 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -211,6 +211,10 @@ test('buildUpdatePipelineHelpContent explains the LLM summary tradeoff for both const enabled = buildUpdatePipelineHelpContent('title_summary'); assert.match(enabled, /LLM summaries: enabled/); assert.match(enabled, /about 50%/); + + const keySummary = buildUpdatePipelineHelpContent('llm_key_summary'); + assert.match(keySummary, /3-line key summaries/); + assert.match(keySummary, /key-summaries/); }); test('buildRefreshCliArgs maps the staged selection to refresh skip flags', () => { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 4a336ca..38a266f 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -8,6 +8,7 @@ import { fileURLToPath } from 'node:url'; import blessed from 'neo-blessed'; import type { + EmbeddingBasis, GHCrawlService, TuiClusterDetail, TuiClusterSortMode, @@ -1379,14 +1380,19 @@ export function buildUpdatePipelineLabels( }); } -export function buildUpdatePipelineHelpContent(embeddingBasis: 'title_original' | 'title_summary'): string { - const summariesEnabled = embeddingBasis === 'title_summary'; - const summaryStatus = summariesEnabled - ? 'LLM summaries: enabled via title_summary.' - : 'LLM summaries: disabled; current basis is title_original.'; - const summaryAction = summariesEnabled - ? 'On openclaw/openclaw this improved non-solo cluster membership by about 50% versus title_original.' - : 'Enable with `ghcrawl configure --embedding-basis title_summary` if you want richer clustering; on openclaw/openclaw that improved non-solo cluster membership by about 50%.'; +export function buildUpdatePipelineHelpContent(embeddingBasis: EmbeddingBasis): string { + const summaryStatus = + embeddingBasis === 'title_summary' + ? 'LLM summaries: enabled via title_summary.' + : embeddingBasis === 'llm_key_summary' + ? '3-line key summaries: active embedding basis.' + : 'LLM summaries: disabled; current basis is title_original.'; + const summaryAction = + embeddingBasis === 'title_summary' + ? 'On openclaw/openclaw this improved non-solo cluster membership by about 50% versus title_original.' + : embeddingBasis === 'llm_key_summary' + ? 'Run `ghcrawl key-summaries` before embedding so the active vectors have deterministic key text.' + : 'Enable with `ghcrawl configure --embedding-basis title_summary` if you want richer clustering; on openclaw/openclaw that improved non-solo cluster membership by about 50%.'; return [ 'Usually you want all three. Run order is fixed: GitHub sync/reconcile -> embeddings -> clusters.', `${summaryStatus} ${summaryAction}`, @@ -1522,7 +1528,7 @@ async function promptHelp(screen: blessed.Widgets.Screen): Promise { async function promptUpdatePipelineSelection( screen: blessed.Widgets.Screen, stats: TuiRepoStats | null, - embeddingBasis: 'title_original' | 'title_summary', + embeddingBasis: EmbeddingBasis, ): Promise { const selection: UpdateTaskSelection = { sync: true, embed: true, cluster: true }; const modalWidth = '76%'; diff --git a/packages/api-core/src/cluster/edge-worker.ts b/packages/api-core/src/cluster/edge-worker.ts index 97b2777..98fcc95 100644 --- a/packages/api-core/src/cluster/edge-worker.ts +++ b/packages/api-core/src/cluster/edge-worker.ts @@ -7,7 +7,7 @@ import { buildSourceKindEdges } from './exact-edges.js'; type WorkerInput = { dbPath: string; repoId: number; - sourceKind: 'title' | 'body' | 'dedupe_summary'; + sourceKind: 'title' | 'body' | 'dedupe_summary' | 'llm_key_summary'; limit: number; minScore: number; }; diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index f57726d..d159709 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -9,7 +9,7 @@ export type SecretProvider = 'plaintext' | 'op'; export type TuiSortPreference = 'recent' | 'size'; export type TuiMinClusterSize = 0 | 1 | 10 | 20 | 50; export type TuiWideLayoutPreference = 'columns' | 'right-stack'; -export type EmbeddingBasis = 'title_original' | 'title_summary'; +export type EmbeddingBasis = 'title_original' | 'title_summary' | 'llm_key_summary'; export type VectorBackend = 'vectorlite'; export type TuiRepositoryPreference = { @@ -181,7 +181,7 @@ function getTuiWideLayoutPreference(value: unknown): TuiWideLayoutPreference | u } function getEmbeddingBasis(value: unknown): EmbeddingBasis | undefined { - return value === 'title_original' || value === 'title_summary' ? value : undefined; + return value === 'title_original' || value === 'title_summary' || value === 'llm_key_summary' ? value : undefined; } function getVectorBackend(value: unknown): VectorBackend | undefined { diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index f049b85..96d3a06 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1061,6 +1061,85 @@ test('embedRepository batches multi-source embeddings and skips unchanged inputs } }); +test('embedRepository can use stored 3-line key summaries as active vector input', async () => { + let embeddedText = ''; + const service = new GHCrawlService({ + config: makeTestConfig({ embeddingBasis: 'llm_key_summary' }), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + ai: { + checkAuth: async () => undefined, + summarizeThread: async () => { + throw new Error('not expected'); + }, + embedTexts: async ({ texts }) => { + embeddedText = texts[0] ?? ''; + return texts.map((_text, index) => makeEmbedding(1, index)); + }, + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, raw_json_blob_id, created_at) + values (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(100, 10, now, 'content-hash', 'title-hash', 'body-hash', 'labels-hash', null, now); + service.db + .prepare( + `insert into thread_key_summaries ( + thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, output_json_blob_id, key_text, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 100, + 'llm_key_3line', + 'llm-key-summary-v1', + 'openai', + 'gpt-5-mini', + 'input-hash', + 'output-hash', + null, + 'intent: Fix retry loop.\nsurface: Downloader.\nmechanism: Changes timeout handling.', + now, + ); + + const result = await service.embedRepository({ owner: 'openclaw', repo: 'openclaw' }); + + assert.equal(result.embedded, 1); + assert.match(embeddedText, /key_summary:/); + assert.match(embeddedText, /intent: Fix retry loop\./); + } finally { + service.close(); + } +}); + test('listNeighbors uses the vectorlite sidecar for current active vectors', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 54f7e41..ba7d99e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -127,7 +127,7 @@ type CommentSeed = { updatedAtGh: string | null; }; -type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary'; +type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary' | 'llm_key_summary'; type SimilaritySourceKind = EmbeddingSourceKind | 'deterministic_fingerprint'; type EmbeddingTask = { @@ -1541,6 +1541,7 @@ export class GHCrawlService { if (this.isRepoVectorStateCurrent(repository.id)) { const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); + const activeSourceKind = this.activeVectorSourceKind(); const activeIds = new Set(vectorItems.map((item) => item.id)); const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k); aggregatedEdges = new Map(); @@ -1570,7 +1571,7 @@ export class GHCrawlService { leftThreadId: Math.min(item.id, neighbor.threadId), rightThreadId: Math.max(item.id, neighbor.threadId), score: neighbor.score, - sourceKinds: new Set(['dedupe_summary']), + sourceKinds: new Set([activeSourceKind]), }); } } @@ -1666,7 +1667,7 @@ export class GHCrawlService { const repository = this.requireRepository(params.owner, params.repo); const loaded = this.loadClusterableThreadMeta(repository.id); const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : []; - const activeSourceKind: EmbeddingSourceKind = this.config.embeddingBasis === 'title_summary' ? 'dedupe_summary' : 'body'; + const activeSourceKind = this.activeVectorSourceKind(); const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0); const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds); const items = useActiveVectors @@ -1878,7 +1879,7 @@ export class GHCrawlService { } // Finalize edge scores using the configured aggregation method - const defaultWeights: Record = { dedupe_summary: 0.5, title: 0.3, body: 0.2 }; + const defaultWeights: Record = { dedupe_summary: 0.5, llm_key_summary: 0.5, title: 0.3, body: 0.2 }; const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) }; const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore); @@ -3953,6 +3954,7 @@ export class GHCrawlService { title: string; body: string | null; dedupeSummary: string | null; + keySummary: string | null; }): ActiveVectorTask | null { const sections = [`title: ${normalizeSummaryText(params.title)}`]; if (this.config.embeddingBasis === 'title_summary') { @@ -3961,6 +3963,12 @@ export class GHCrawlService { return null; } sections.push(`summary: ${summary}`); + } else if (this.config.embeddingBasis === 'llm_key_summary') { + const keySummary = normalizeSummaryText(params.keySummary ?? ''); + if (!keySummary) { + return null; + } + sections.push(`key_summary:\n${keySummary}`); } else { const body = normalizeSummaryText(params.body ?? ''); if (body) { @@ -3986,6 +3994,16 @@ export class GHCrawlService { }; } + private activeVectorSourceKind(): EmbeddingSourceKind { + if (this.config.embeddingBasis === 'title_summary') { + return 'dedupe_summary'; + } + if (this.config.embeddingBasis === 'llm_key_summary') { + return 'llm_key_summary'; + } + return 'body'; + } + private prepareEmbeddingText( text: string, maxEstimatedTokens: number, @@ -4579,6 +4597,7 @@ export class GHCrawlService { body: string | null; }>; const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber); + const keySummaryTexts = this.loadKeySummaryTextMap(repoId, threadNumber); const missingSummaryThreadNumbers: number[] = []; const tasks = rows.flatMap((row) => { const task = this.buildActiveVectorTask({ @@ -4587,11 +4606,12 @@ export class GHCrawlService { title: row.title, body: row.body, dedupeSummary: summaryTexts.get(row.id) ?? null, + keySummary: keySummaryTexts.get(row.id) ?? null, }); if (task) { return [task]; } - if (this.config.embeddingBasis === 'title_summary') { + if (this.config.embeddingBasis === 'title_summary' || this.config.embeddingBasis === 'llm_key_summary') { missingSummaryThreadNumbers.push(row.number); } return []; @@ -4653,6 +4673,39 @@ export class GHCrawlService { return combined; } + private loadKeySummaryTextMap(repoId: number, threadNumber?: number): Map { + let sql = + `select tr.thread_id, ks.key_text + from thread_key_summaries ks + join thread_revisions tr on tr.id = ks.thread_revision_id + join threads t on t.id = tr.thread_id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and ks.summary_kind = 'llm_key_3line' + and ks.prompt_version = ? + and ks.model = ?`; + const args: Array = [repoId, LLM_KEY_SUMMARY_PROMPT_VERSION, this.config.summaryModel]; + if (threadNumber) { + sql += ' and t.number = ?'; + args.push(threadNumber); + } + sql += ' order by tr.id asc'; + + const rows = this.db.prepare(sql).all(...args) as Array<{ + thread_id: number; + key_text: string; + }>; + const combined = new Map(); + for (const row of rows) { + const text = normalizeSummaryText(row.key_text); + if (text) { + combined.set(row.thread_id, text); + } + } + return combined; + } + private edgeKey(leftThreadId: number, rightThreadId: number): string { const left = Math.min(leftThreadId, rightThreadId); const right = Math.max(leftThreadId, rightThreadId); From 0973ac6ae7847bf11039f1f07e8adf87ff46c36f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:00:59 -0700 Subject: [PATCH 042/215] fix(db): spill large patch blobs to object store --- .../src/cluster/persistent-store.test.ts | 54 +++++++++++++++++++ .../api-core/src/cluster/persistent-store.ts | 22 +++++++- packages/api-core/src/service.ts | 5 ++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index 1c84fd2..7e0091f 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -1,5 +1,8 @@ import test from 'node:test'; import assert from 'node:assert/strict'; +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; import { migrate } from '../db/migrate.js'; import { openDb } from '../db/sqlite.js'; @@ -281,6 +284,57 @@ test('persistent cluster store records code snapshots, changed files, and hunk s } }); +test('persistent cluster store keeps large code patches out of SQLite', () => { + const db = openDb(':memory:'); + const storeRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-code-blob-')); + try { + migrate(db); + seedRepoAndThreads(db); + const revisionId = upsertThreadRevision(db, { + threadId: 10, + sourceUpdatedAt: '2026-01-01T00:00:00Z', + title: 'Fix cache collision', + body: '', + labels: [], + rawJson: '{}', + }); + const largePatch = `@@ -1 +1 @@\n-${'oldKey\n'.repeat(800)}+${'newKey\n'.repeat(800)}`; + const signature = buildCodeSnapshotSignature([ + { + filename: 'packages/api-core/src/cache.ts', + status: 'modified', + additions: 800, + deletions: 800, + changes: 1600, + patch: largePatch, + }, + ]); + + const snapshotId = upsertThreadCodeSnapshot(db, { + threadRevisionId: revisionId, + signature, + storeRoot, + }); + + const blob = db + .prepare( + `select b.storage_kind, b.storage_path, b.inline_text + from thread_changed_files f + join blobs b on b.id = f.patch_blob_id + where f.snapshot_id = ?`, + ) + .get(snapshotId) as { storage_kind: string; storage_path: string | null; inline_text: string | null }; + + assert.equal(blob.storage_kind, 'file'); + assert.equal(blob.inline_text, null); + assert.ok(blob.storage_path); + assert.ok(fs.existsSync(path.join(storeRoot, blob.storage_path))); + } finally { + db.close(); + fs.rmSync(storeRoot, { recursive: true, force: true }); + } +}); + test('persistent cluster store records structured key summaries', () => { const db = openDb(':memory:'); try { diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 3256478..b1c4350 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -1,5 +1,6 @@ import crypto from 'node:crypto'; +import { storeTextBlob } from '../db/blob-store.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import type { CodeSnapshotSignature } from './code-signature.js'; import type { EvidenceTier, SimilarityEvidenceBreakdown } from './evidence-score.js'; @@ -35,6 +36,23 @@ function upsertInlineBlob( return row.id; } +function upsertTextBlob( + db: SqliteDatabase, + params: { + text: string; + mediaType: string; + storeRoot?: string; + }, +): number { + if (params.storeRoot) { + return storeTextBlob(db, params.storeRoot, params.text, { + mediaType: params.mediaType, + inlineThresholdBytes: 4096, + }).id; + } + return upsertInlineBlob(db, params); +} + export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster'; export function upsertActor( @@ -244,6 +262,7 @@ export function upsertThreadCodeSnapshot( baseSha?: string | null; headSha?: string | null; signature: CodeSnapshotSignature; + storeRoot?: string; }, ): number { const timestamp = nowIso(); @@ -285,9 +304,10 @@ export function upsertThreadCodeSnapshot( ); for (const file of params.signature.files) { const patchBlobId = file.patch - ? upsertInlineBlob(db, { + ? upsertTextBlob(db, { text: file.patch, mediaType: 'text/x-diff', + storeRoot: params.storeRoot, }) : null; insertFile.run( diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index ba7d99e..9b40fd9 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3594,9 +3594,14 @@ export class GHCrawlService { baseSha: typeof base?.sha === 'string' ? base.sha : null, headSha: typeof head?.sha === 'string' ? head.sha : null, signature: buildCodeSnapshotSignature(files), + storeRoot: this.blobStoreRoot(), }); } + private blobStoreRoot(): string { + return path.join(path.dirname(this.config.dbPath), '.ghcrawl-store'); + } + private async applyClosedOverlapSweep(params: { repoId: number; owner: string; From 6fa8737e6752e430f92a0b0148048c21251c7fbf Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:06:00 -0700 Subject: [PATCH 043/215] feat(cluster): cache latest deterministic fingerprints --- .../src/cluster/deterministic-engine.ts | 4 +- packages/api-core/src/service.test.ts | 74 +++++++++++++++++++ packages/api-core/src/service.ts | 54 ++++++++++++-- 3 files changed, 122 insertions(+), 10 deletions(-) diff --git a/packages/api-core/src/cluster/deterministic-engine.ts b/packages/api-core/src/cluster/deterministic-engine.ts index 055d3a7..00e6ee8 100644 --- a/packages/api-core/src/cluster/deterministic-engine.ts +++ b/packages/api-core/src/cluster/deterministic-engine.ts @@ -28,7 +28,7 @@ export type DeterministicClusterResult = { fingerprints: Map; }; -function extractRefs(value: string | null): string[] { +export function extractDeterministicRefs(value: string | null): string[] { const refs = new Set(); for (const match of value?.matchAll(REF_RE) ?? []) { refs.add(match[1]); @@ -83,7 +83,7 @@ export function buildDeterministicClusterGraph( const fingerprints = new Map(); const titleById = new Map(); for (const input of inputs) { - const inferredRefs = extractRefs(`${input.title}\n${input.body ?? ''}`); + const inferredRefs = extractDeterministicRefs(`${input.title}\n${input.body ?? ''}`); fingerprints.set( input.id, buildDeterministicThreadFingerprint({ diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 96d3a06..e4a1c51 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2260,6 +2260,80 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar } }); +test('clusterRepository materializes only changed deterministic fingerprints', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Download retry hangs forever', 'The transfer retry loop never exits after timeout.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Download retry loop never exits', 'Retry hangs forever after timeout.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + + await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + + const secondMessages: string[] = []; + await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + k: 1, + minScore: 0.1, + onProgress: (message) => secondMessages.push(message), + }); + assert.ok(secondMessages.some((message) => message.includes('[fingerprint] latest revisions computed=0 skipped=2'))); + + service.db + .prepare('update threads set body = ?, content_hash = ?, updated_at_gh = ?, updated_at = ? where id = ?') + .run('The transfer retry loop never exits after a network timeout.', 'hash-42b', '2026-03-10T00:00:00Z', '2026-03-10T00:00:00Z', 10); + + const thirdMessages: string[] = []; + await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + k: 1, + minScore: 0.1, + onProgress: (message) => thirdMessages.push(message), + }); + + const revisionCount = service.db.prepare('select count(*) as count from thread_revisions').get() as { count: number }; + const fingerprintCount = service.db.prepare('select count(*) as count from thread_fingerprints').get() as { count: number }; + assert.ok(thirdMessages.some((message) => message.includes('[fingerprint] latest revisions computed=1 skipped=1'))); + assert.equal(revisionCount.count, 3); + assert.equal(fingerprintCount.count, 3); + } finally { + service.close(); + } +}); + test('clusterRepository uses hydrated code hunk signatures without embeddings', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9b40fd9..2d05645 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -55,7 +55,7 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; -import { buildDeterministicClusterGraph } from './cluster/deterministic-engine.js'; +import { buildDeterministicClusterGraph, extractDeterministicRefs } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; @@ -73,7 +73,10 @@ import { upsertThreadCodeSnapshot, upsertThreadKeySummary, } from './cluster/persistent-store.js'; -import type { DeterministicThreadFingerprint } from './cluster/thread-fingerprint.js'; +import { + buildDeterministicThreadFingerprint, + THREAD_FINGERPRINT_ALGORITHM_VERSION, +} from './cluster/thread-fingerprint.js'; import { ensureRuntimeDirs, isLikelyGitHubToken, @@ -1596,8 +1599,8 @@ export class GHCrawlService { }); } else { const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); + this.materializeLatestDeterministicFingerprints(deterministicItems, params.onProgress); const deterministic = buildDeterministicClusterGraph(deterministicItems, { topK: Math.max(k * 8, 64) }); - this.persistDeterministicFingerprints(deterministicItems, deterministic.fingerprints); items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); aggregatedEdges = new Map(); for (const edge of deterministic.edges) { @@ -4493,20 +4496,25 @@ export class GHCrawlService { return out; } - private persistDeterministicFingerprints( + private materializeLatestDeterministicFingerprints( items: Array<{ id: number; + number: number; + kind: 'issue' | 'pull_request'; title: string; body: string | null; labels: string[]; rawJson: string; updatedAtGh: string | null; + changedFiles: string[]; + hunkSignatures: string[]; + patchIds: string[]; }>, - fingerprints: Map, - ): void { + onProgress?: (message: string) => void, + ): { computed: number; skipped: number } { + let computed = 0; + let skipped = 0; for (const item of items) { - const fingerprint = fingerprints.get(item.id); - if (!fingerprint) continue; const revisionId = upsertThreadRevision(this.db, { threadId: item.id, sourceUpdatedAt: item.updatedAtGh, @@ -4515,8 +4523,38 @@ export class GHCrawlService { labels: item.labels, rawJson: item.rawJson, }); + const existing = this.db + .prepare( + `select id + from thread_fingerprints + where thread_revision_id = ? + and algorithm_version = ? + limit 1`, + ) + .get(revisionId, THREAD_FINGERPRINT_ALGORITHM_VERSION) as { id: number } | undefined; + if (existing) { + skipped += 1; + continue; + } + + const inferredRefs = extractDeterministicRefs(`${item.title}\n${item.body ?? ''}`); + const fingerprint = buildDeterministicThreadFingerprint({ + threadId: item.id, + number: item.number, + kind: item.kind, + title: item.title, + body: item.body, + labels: item.labels, + linkedRefs: inferredRefs, + changedFiles: item.changedFiles, + hunkSignatures: item.hunkSignatures, + patchIds: item.patchIds, + }); upsertThreadFingerprint(this.db, { threadRevisionId: revisionId, fingerprint }); + computed += 1; } + onProgress?.(`[fingerprint] latest revisions computed=${computed} skipped=${skipped}`); + return { computed, skipped }; } private loadNormalizedActiveVectors(repoId: number): Array<{ id: number; number: number; title: string; embedding: number[] }> { From 61f5874cecd6d3a70a12f038c770ce59837e73fc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:08:01 -0700 Subject: [PATCH 044/215] feat(cluster): build deterministic graph from fingerprint cache --- .../src/cluster/deterministic-engine.ts | 33 +++++-- .../api-core/src/cluster/persistent-store.ts | 1 + .../src/cluster/thread-fingerprint.test.ts | 2 +- .../src/cluster/thread-fingerprint.ts | 2 +- packages/api-core/src/service.ts | 97 ++++++++++++++++++- 5 files changed, 125 insertions(+), 10 deletions(-) diff --git a/packages/api-core/src/cluster/deterministic-engine.ts b/packages/api-core/src/cluster/deterministic-engine.ts index 00e6ee8..6d350e5 100644 --- a/packages/api-core/src/cluster/deterministic-engine.ts +++ b/packages/api-core/src/cluster/deterministic-engine.ts @@ -17,6 +17,12 @@ export type DeterministicClusterInput = { patchIds?: string[]; }; +export type DeterministicClusterNode = { + id: number; + number: number; + title: string; +}; + export type DeterministicClusterEdge = SimilarityEdge & { tier: 'strong' | 'weak'; breakdown: SimilarityEvidenceBreakdown; @@ -81,7 +87,6 @@ export function buildDeterministicClusterGraph( params: { maxBucketSize?: number; topK?: number } = {}, ): DeterministicClusterResult { const fingerprints = new Map(); - const titleById = new Map(); for (const input of inputs) { const inferredRefs = extractDeterministicRefs(`${input.title}\n${input.body ?? ''}`); fingerprints.set( @@ -92,9 +97,25 @@ export function buildDeterministicClusterGraph( linkedRefs: Array.from(new Set([...(input.linkedRefs ?? []), ...inferredRefs])).sort(), }), ); - titleById.set(input.id, input.title); } + return buildDeterministicClusterGraphFromFingerprints( + inputs.map((input) => ({ + id: input.id, + number: input.number, + title: input.title, + })), + fingerprints, + params, + ); +} + +export function buildDeterministicClusterGraphFromFingerprints( + nodes: DeterministicClusterNode[], + fingerprints: Map, + params: { maxBucketSize?: number; topK?: number } = {}, +): DeterministicClusterResult { + const titleById = new Map(nodes.map((node) => [node.id, node.title])); const pairs = buildCandidatePairs(fingerprints, { maxBucketSize: params.maxBucketSize ?? 500, topK: params.topK ?? 64, @@ -116,10 +137,10 @@ export function buildDeterministicClusterGraph( } const clusters = buildClusters( - inputs.map((input) => ({ - threadId: input.id, - number: input.number, - title: titleById.get(input.id) ?? input.title, + nodes.map((node) => ({ + threadId: node.id, + number: node.number, + title: titleById.get(node.id) ?? node.title, })), edges, ); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index b1c4350..1f01470 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -216,6 +216,7 @@ export function upsertThreadFingerprint( }); const featureJson = JSON.stringify({ salientTitleTokens: params.fingerprint.salientTitleTokens, + changedFiles: params.fingerprint.changedFiles, hunkSignatures: params.fingerprint.hunkSignatures, patchIds: params.fingerprint.patchIds, }); diff --git a/packages/api-core/src/cluster/thread-fingerprint.test.ts b/packages/api-core/src/cluster/thread-fingerprint.test.ts index 3034931..96323ae 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.test.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.test.ts @@ -33,7 +33,7 @@ test('buildDeterministicThreadFingerprint is stable without model inputs', () => assert.equal(first.fingerprintHash, second.fingerprintHash); assert.equal(first.fingerprintSlug, second.fingerprintSlug); - assert.equal(first.algorithmVersion, 'thread-fingerprint-v1'); + assert.equal(first.algorithmVersion, 'thread-fingerprint-v2'); assert.ok(first.minhashSignature.length > 0); }); diff --git a/packages/api-core/src/cluster/thread-fingerprint.ts b/packages/api-core/src/cluster/thread-fingerprint.ts index 9958e34..ded7ab3 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.ts @@ -73,7 +73,7 @@ export type FingerprintPairBreakdown = { lineage: number; }; -export const THREAD_FINGERPRINT_ALGORITHM_VERSION = 'thread-fingerprint-v1'; +export const THREAD_FINGERPRINT_ALGORITHM_VERSION = 'thread-fingerprint-v2'; export function tokenize(value: string | null | undefined): string[] { return Array.from(value?.toLowerCase().matchAll(TOKEN_RE) ?? []).map((match) => match[0]); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2d05645..4a0fcce 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -55,7 +55,7 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; -import { buildDeterministicClusterGraph, extractDeterministicRefs } from './cluster/deterministic-engine.js'; +import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; @@ -76,6 +76,7 @@ import { import { buildDeterministicThreadFingerprint, THREAD_FINGERPRINT_ALGORITHM_VERSION, + type DeterministicThreadFingerprint, } from './cluster/thread-fingerprint.js'; import { ensureRuntimeDirs, @@ -90,6 +91,7 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { openDb, type SqliteDatabase } from './db/sqlite.js'; +import { readTextBlob } from './db/blob-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; @@ -486,6 +488,16 @@ function parseArray(value: string): string[] { return JSON.parse(value) as string[]; } +function parseStringArrayJson(value: string | null | undefined): string[] { + if (!value) return []; + try { + const parsed = JSON.parse(value) as unknown; + return Array.isArray(parsed) ? parsed.filter((entry): entry is string => typeof entry === 'string') : []; + } catch { + return []; + } +} + function userLogin(payload: Record): string | null { const user = payload.user as Record | undefined; const login = user?.login; @@ -1600,7 +1612,12 @@ export class GHCrawlService { } else { const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); this.materializeLatestDeterministicFingerprints(deterministicItems, params.onProgress); - const deterministic = buildDeterministicClusterGraph(deterministicItems, { topK: Math.max(k * 8, 64) }); + const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); + const deterministic = buildDeterministicClusterGraphFromFingerprints( + deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), + persistedFingerprints, + { topK: Math.max(k * 8, 64) }, + ); items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); aggregatedEdges = new Map(); for (const edge of deterministic.edges) { @@ -4557,6 +4574,82 @@ export class GHCrawlService { return { computed, skipped }; } + private loadLatestDeterministicFingerprints(threadIds: number[]): Map { + if (threadIds.length === 0) return new Map(); + const placeholders = threadIds.map(() => '?').join(','); + const rows = this.db + .prepare( + `select + tr.thread_id, + tf.fingerprint_hash, + tf.fingerprint_slug, + tf.title_tokens_json, + tf.linked_refs_json, + tf.module_buckets_json, + tf.minhash_signature_blob_id, + tf.simhash64, + tf.winnow_hashes_blob_id, + tf.feature_json + from thread_revisions tr + join ( + select thread_id, max(id) as revision_id + from thread_revisions + where thread_id in (${placeholders}) + group by thread_id + ) latest on latest.revision_id = tr.id + join thread_fingerprints tf on tf.thread_revision_id = tr.id + where tf.algorithm_version = ?`, + ) + .all(...threadIds, THREAD_FINGERPRINT_ALGORITHM_VERSION) as Array<{ + thread_id: number; + fingerprint_hash: string; + fingerprint_slug: string; + title_tokens_json: string; + linked_refs_json: string; + module_buckets_json: string; + minhash_signature_blob_id: number | null; + simhash64: string; + winnow_hashes_blob_id: number | null; + feature_json: string; + }>; + + const fingerprints = new Map(); + for (const row of rows) { + const feature = (() => { + try { + return JSON.parse(row.feature_json) as Record; + } catch { + return {}; + } + })(); + const stringFeature = (key: string): string[] => { + const value = feature[key]; + return Array.isArray(value) ? value.filter((entry): entry is string => typeof entry === 'string') : []; + }; + fingerprints.set(row.thread_id, { + algorithmVersion: THREAD_FINGERPRINT_ALGORITHM_VERSION, + fingerprintHash: row.fingerprint_hash, + fingerprintSlug: row.fingerprint_slug, + titleTokens: parseStringArrayJson(row.title_tokens_json), + salientTitleTokens: stringFeature('salientTitleTokens'), + bodyTokens: [], + linkedRefs: parseStringArrayJson(row.linked_refs_json), + moduleBuckets: parseStringArrayJson(row.module_buckets_json), + changedFiles: stringFeature('changedFiles'), + hunkSignatures: stringFeature('hunkSignatures'), + patchIds: stringFeature('patchIds'), + minhashSignature: row.minhash_signature_blob_id + ? parseStringArrayJson(readTextBlob(this.db, this.blobStoreRoot(), row.minhash_signature_blob_id)) + : [], + simhash64: row.simhash64, + winnowHashes: row.winnow_hashes_blob_id + ? parseStringArrayJson(readTextBlob(this.db, this.blobStoreRoot(), row.winnow_hashes_blob_id)) + : [], + }); + } + return fingerprints; + } + private loadNormalizedActiveVectors(repoId: number): Array<{ id: number; number: number; title: string; embedding: number[] }> { return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({ id: row.id, From fd5824fe407997780ef949322f826834d1948c83 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:09:09 -0700 Subject: [PATCH 045/215] feat(sync): refresh fingerprints for touched threads --- packages/api-core/src/service.test.ts | 5 ++++ packages/api-core/src/service.ts | 33 ++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index e4a1c51..1d74e23 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -258,6 +258,7 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and assert.match(messages.join('\n'), /1\/2 issue #42/); assert.match(messages.join('\n'), /2\/2 pull_request #43/); assert.match(messages.join('\n'), /metadata-only mode; skipping comment, review, and review-comment fetches/); + assert.match(messages.join('\n'), /\[fingerprint\] latest revisions computed=2 skipped=0/); assert.equal(service.listRepositories().repositories.length, 1); assert.equal(service.listThreads({ owner: 'openclaw', repo: 'openclaw' }).threads.length, 2); assert.deepEqual( @@ -271,6 +272,8 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and assert.equal(listPullReviewCalls, 0); assert.equal(listPullReviewCommentCalls, 0); assert.equal(listPullFileCalls, 0); + const fingerprintCount = service.db.prepare('select count(*) as count from thread_fingerprints').get() as { count: number }; + assert.equal(fingerprintCount.count, 2); const rows = service.db .prepare('select number, kind, first_pulled_at, last_pulled_at from threads order by number asc') @@ -466,9 +469,11 @@ test('syncRepository hydrates pull request code snapshots when includeCode is en }; const file = service.db.prepare('select path from thread_changed_files').get() as { path: string }; const hunkCount = service.db.prepare('select count(*) as count from thread_hunk_signatures').get() as { count: number }; + const fingerprint = service.db.prepare('select feature_json from thread_fingerprints').get() as { feature_json: string }; assert.deepEqual(snapshot, { base_sha: 'base-sha', head_sha: 'head-sha', files_changed: 1 }); assert.equal(file.path, 'packages/api-core/src/service.ts'); assert.equal(hunkCount.count, 1); + assert.equal(JSON.parse(fingerprint.feature_json).hunkSignatures.length, 1); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 4a0fcce..d79e635 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1044,6 +1044,7 @@ export class GHCrawlService { let threadsSynced = 0; let commentsSynced = 0; let codeFilesSynced = 0; + const fingerprintThreadIds: number[] = []; for (const [index, item] of items.entries()) { if (index > 0 && index % SYNC_BATCH_SIZE === 0) { @@ -1068,6 +1069,7 @@ export class GHCrawlService { commentsSynced += comments.length; } this.refreshDocument(threadId); + fingerprintThreadIds.push(threadId); threadsSynced += 1; } catch (error) { const message = error instanceof Error ? error.message : String(error); @@ -1111,6 +1113,13 @@ export class GHCrawlService { if (threadsClosed > 0) { this.reconcileClusterCloseState(repoId); } + if (fingerprintThreadIds.length > 0) { + const fingerprintItems = this.loadDeterministicClusterableThreadMeta( + repoId, + Array.from(new Set(fingerprintThreadIds)), + ); + this.materializeLatestDeterministicFingerprints(fingerprintItems, params.onProgress); + } const finishedAt = nowIso(); const reconciledOpenCloseAt = shouldSweepClosedOverlap || shouldReconcileMissingOpenThreads ? finishedAt : null; const nextSyncCursor: SyncCursorState = { @@ -4404,7 +4413,7 @@ export class GHCrawlService { })); } - private loadDeterministicClusterableThreadMeta(repoId: number): Array<{ + private loadDeterministicClusterableThreadMeta(repoId: number, threadIds?: number[]): Array<{ id: number; number: number; kind: 'issue' | 'pull_request'; @@ -4417,16 +4426,24 @@ export class GHCrawlService { hunkSignatures: string[]; patchIds: string[]; }> { + let sql = + `select id, number, kind, title, body, labels_json, raw_json, updated_at_gh + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null`; + const args: Array = [repoId]; + if (threadIds && threadIds.length > 0) { + sql += ` and id in (${threadIds.map(() => '?').join(',')})`; + args.push(...threadIds); + } + sql += ' order by number asc'; + const rows = this.db .prepare( - `select id, number, kind, title, body, labels_json, raw_json, updated_at_gh - from threads - where repo_id = ? - and state = 'open' - and closed_at_local is null - order by number asc`, + sql, ) - .all(repoId) as Array<{ + .all(...args) as Array<{ id: number; number: number; kind: 'issue' | 'pull_request'; From 6d13008c96a5356db52bc5b14edba25d3866f81e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:11:52 -0700 Subject: [PATCH 046/215] feat(cluster): merge deterministic and vector evidence --- packages/api-core/src/service.test.ts | 92 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 80 ++++++++++++----------- 2 files changed, 131 insertions(+), 41 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 1d74e23..4eaab61 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2422,6 +2422,98 @@ test('clusterRepository uses hydrated code hunk signatures without embeddings', } }); +test('clusterRepository keeps deterministic hunk edges when active vectors are current', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [ + { + id: 100, + number: 42, + state: 'open', + title: 'Rewrite scheduler state', + body: 'Internal cleanup.', + html_url: 'https://github.com/openclaw/openclaw/pull/42', + labels: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/42' }, + user: { login: 'alice', type: 'User' }, + }, + { + id: 101, + number: 43, + state: 'open', + title: 'Patch migration locking', + body: 'Different prose.', + html_url: 'https://github.com/openclaw/openclaw/pull/43', + labels: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/43' }, + user: { login: 'bob', type: 'User' }, + }, + ], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async (_owner, _repo, number) => ({ + id: number, + number, + state: 'open', + title: number === 42 ? 'Rewrite scheduler state' : 'Patch migration locking', + body: number === 42 ? 'Internal cleanup.' : 'Different prose.', + html_url: `https://github.com/openclaw/openclaw/pull/${number}`, + labels: [], + user: { login: number === 42 ? 'alice' : 'bob', type: 'User' }, + draft: false, + base: { sha: 'base-sha' }, + head: { sha: `head-${number}` }, + updated_at: '2026-03-09T00:00:00Z', + }), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [ + { + filename: 'packages/api-core/src/cluster/build.ts', + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-oldCluster\n+newCluster', + }, + ], + }, + ai: { + checkAuth: async () => undefined, + summarizeThread: async () => { + throw new Error('not expected'); + }, + embedTexts: async ({ texts }) => texts.map((_text, index) => (index === 0 ? makeEmbedding(1, 0) : makeEmbedding(0, 1))), + }, + }); + + try { + await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + includeCode: true, + }); + await service.embedRepository({ owner: 'openclaw', repo: 'openclaw' }); + + const result = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + minScore: 0.1, + }); + + const evidence = service.db.prepare('select breakdown_json from similarity_edge_evidence').get() as { breakdown_json: string }; + assert.equal(result.edges, 1); + assert.deepEqual(JSON.parse(evidence.breakdown_json).sources, ['deterministic_fingerprint']); + } finally { + service.close(); + } +}); + test('embedRepository rebuilds a corrupted active vector store during upsert', async () => { const vectors = new Map(); let failNextUpsert = true; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d79e635..2e7215e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1560,15 +1560,30 @@ export class GHCrawlService { const k = params.k ?? 6; try { - let items: Array<{ id: number; number: number; title: string }>; - let aggregatedEdges: Map }>; + const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); + this.materializeLatestDeterministicFingerprints(deterministicItems, params.onProgress); + const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); + const deterministic = buildDeterministicClusterGraphFromFingerprints( + deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), + persistedFingerprints, + { topK: Math.max(k * 8, 64) }, + ); + const items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); + const aggregatedEdges = new Map }>(); + this.mergeSourceKindEdges( + aggregatedEdges, + deterministic.edges.filter((edge) => edge.score >= minScore), + 'deterministic_fingerprint', + ); + params.onProgress?.( + `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${repository.fullName}`, + ); if (this.isRepoVectorStateCurrent(repository.id)) { const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); const activeSourceKind = this.activeVectorSourceKind(); const activeIds = new Set(vectorItems.map((item) => item.id)); const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k); - aggregatedEdges = new Map(); let processed = 0; let lastProgressAt = Date.now(); @@ -1586,18 +1601,17 @@ export class GHCrawlService { for (const neighbor of neighbors) { if (!activeIds.has(neighbor.threadId)) continue; if (neighbor.score < minScore) continue; - const key = this.edgeKey(item.id, neighbor.threadId); - const existing = aggregatedEdges.get(key); - if (existing) { - existing.score = Math.max(existing.score, neighbor.score); - } else { - aggregatedEdges.set(key, { - leftThreadId: Math.min(item.id, neighbor.threadId), - rightThreadId: Math.max(item.id, neighbor.threadId), - score: neighbor.score, - sourceKinds: new Set([activeSourceKind]), - }); - } + this.mergeSourceKindEdges( + aggregatedEdges, + [ + { + leftThreadId: Math.min(item.id, neighbor.threadId), + rightThreadId: Math.max(item.id, neighbor.threadId), + score: neighbor.score, + }, + ], + activeSourceKind, + ); } processed += 1; const now = Date.now(); @@ -1606,41 +1620,25 @@ export class GHCrawlService { lastProgressAt = now; } } - items = vectorItems; } else if (this.hasLegacyEmbeddings(repository.id)) { const legacy = this.loadClusterableThreadMeta(repository.id); - items = legacy.items; params.onProgress?.( - `[cluster] loaded ${items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`, + `[cluster] loaded ${legacy.items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`, ); - aggregatedEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, { + const legacyEdges = await this.aggregateRepositoryEdges(repository.id, legacy.sourceKinds, { limit: k, minScore, onProgress: params.onProgress, }); - } else { - const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); - this.materializeLatestDeterministicFingerprints(deterministicItems, params.onProgress); - const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); - const deterministic = buildDeterministicClusterGraphFromFingerprints( - deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), - persistedFingerprints, - { topK: Math.max(k * 8, 64) }, - ); - items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); - aggregatedEdges = new Map(); - for (const edge of deterministic.edges) { - if (edge.score < minScore) continue; - aggregatedEdges.set(this.edgeKey(edge.leftThreadId, edge.rightThreadId), { - leftThreadId: edge.leftThreadId, - rightThreadId: edge.rightThreadId, - score: edge.score, - sourceKinds: new Set(['deterministic_fingerprint']), - }); + for (const legacyEdge of legacyEdges.values()) { + for (const sourceKind of legacyEdge.sourceKinds) { + this.mergeSourceKindEdges( + aggregatedEdges, + [{ leftThreadId: legacyEdge.leftThreadId, rightThreadId: legacyEdge.rightThreadId, score: legacyEdge.score }], + sourceKind, + ); + } } - params.onProgress?.( - `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${repository.fullName} without embeddings`, - ); } const edges = Array.from(aggregatedEdges.values()).map((entry) => ({ From 6dd784f2839c16f96fcdd5b5736ad2a0d760004e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 21:54:14 -0700 Subject: [PATCH 047/215] feat(cluster): pin durable canonical member --- apps/cli/src/main.test.ts | 38 ++++++ apps/cli/src/main.ts | 32 +++++ packages/api-contract/src/client.ts | 24 ++++ packages/api-contract/src/contracts.test.ts | 51 ++++++++ packages/api-contract/src/contracts.ts | 13 +- packages/api-core/src/api/server.test.ts | 104 ++++++++++++++++ packages/api-core/src/api/server.ts | 7 ++ packages/api-core/src/service.test.ts | 69 +++++++++++ packages/api-core/src/service.ts | 131 ++++++++++++++++++-- 9 files changed, 456 insertions(+), 13 deletions(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index fa47a1d..b721e9b 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -354,6 +354,44 @@ test('exclude-cluster-member command forwards durable override inputs', async () assert.match(stdout.read(), /"state": "removed_by_user"/); }); +test('set-cluster-canonical command forwards durable override inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.setClusterCanonicalThread; + let received: unknown; + + GHCrawlService.prototype.setClusterCanonicalThread = function setClusterCanonicalThreadStub(params: unknown) { + received = params; + return { + ok: true, + clusterId: 7, + thread: { number: 42 }, + action: 'force_canonical', + state: 'active', + message: 'set', + } as never; + }; + + try { + await run(['set-cluster-canonical', 'openclaw/openclaw', '--id', '7', '--number', '42', '--reason', 'best root issue'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.setClusterCanonicalThread = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: 'best root issue', + }); + assert.match(stdout.read(), /"action": "force_canonical"/); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 01f2f8c..87adfcf 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -22,6 +22,7 @@ type CommandName = | 'close-thread' | 'close-cluster' | 'exclude-cluster-member' + | 'set-cluster-canonical' | 'summarize' | 'key-summaries' | 'purge-comments' @@ -209,6 +210,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl exclude-cluster-member openclaw/openclaw --id 123 --number 42 --reason "false positive" --json'], agentJson: true, }, + { + name: 'set-cluster-canonical', + synopsis: 'set-cluster-canonical --id --number [--reason ] [--json]', + description: 'Pin one durable cluster member as the canonical representative.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to mark canonical', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl set-cluster-canonical openclaw/openclaw --id 123 --number 42 --reason "best root issue" --json'], + agentJson: true, + }, { name: 'embed', synopsis: 'embed [--number ] [--json]', @@ -1075,6 +1089,24 @@ export async function run( writeJson(stdout, result); return; } + case 'set-cluster-canonical': { + const { owner, repo, values } = parseRepoFlags('set-cluster-canonical', rest); + if (typeof values.id !== 'string') { + throw new CliUsageError('Missing --id', 'set-cluster-canonical'); + } + if (typeof values.number !== 'string') { + throw new CliUsageError('Missing --number', 'set-cluster-canonical'); + } + const result = getService().setClusterCanonicalThread({ + owner, + repo, + clusterId: parsePositiveInteger('id', values.id, 'set-cluster-canonical'), + threadNumber: parsePositiveInteger('number', values.number, 'set-cluster-canonical'), + reason: typeof values.reason === 'string' ? values.reason : undefined, + }); + writeJson(stdout, result); + return; + } case 'summarize': { const { owner, repo, values } = parseRepoFlags('summarize', rest); const result = await getService().summarizeRepository({ diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 0381e71..98f06fc 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -6,17 +6,21 @@ import { closeThreadRequestSchema, authorThreadsResponseSchema, clusterDetailResponseSchema, + clusterOverrideResponseSchema, clusterSummariesResponseSchema, clustersResponseSchema, + excludeClusterMemberRequestSchema, healthResponseSchema, refreshRequestSchema, refreshResponseSchema, repositoriesResponseSchema, searchResponseSchema, + setClusterCanonicalRequestSchema, threadsResponseSchema, type ActionRequest, type ActionResponse, type CloseResponse, + type ClusterOverrideResponse, type AuthorThreadsResponse, type ClusterDetailResponse, type ClusterSummariesResponse, @@ -58,6 +62,8 @@ export type GitcrawlClient = { rerun: (request: ActionRequest) => Promise; closeThread: (request: { owner: string; repo: string; threadNumber: number }) => Promise; closeCluster: (request: { owner: string; repo: string; clusterId: number }) => Promise; + excludeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; + setClusterCanonical: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; }; type FetchLike = typeof fetch; @@ -171,5 +177,23 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }); return readJson(res, closeResponseSchema); }, + async excludeClusterMember(request) { + const body = excludeClusterMemberRequestSchema.parse(request); + const res = await fetchImpl(`${normalized}/actions/exclude-cluster-member`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + return readJson(res, clusterOverrideResponseSchema); + }, + async setClusterCanonical(request) { + const body = setClusterCanonicalRequestSchema.parse(request); + const res = await fetchImpl(`${normalized}/actions/set-cluster-canonical`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + return readJson(res, clusterOverrideResponseSchema); + }, }; } diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 683ff89..752bfcf 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -9,6 +9,7 @@ import { healthResponseSchema, neighborsResponseSchema, searchResponseSchema, + setClusterCanonicalRequestSchema, } from './contracts.js'; test('health schema accepts configured status payload', () => { @@ -66,6 +67,18 @@ test('exclude cluster member request trims optional reason', () => { assert.equal(parsed.reason, 'confirmed separate bug'); }); +test('set cluster canonical request trims optional reason', () => { + const parsed = setClusterCanonicalRequestSchema.parse({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: ' best root issue ', + }); + + assert.equal(parsed.reason, 'best root issue'); +}); + test('cluster override response accepts durable removal state', () => { const parsed = clusterOverrideResponseSchema.parse({ ok: true, @@ -104,6 +117,44 @@ test('cluster override response accepts durable removal state', () => { assert.equal(parsed.state, 'removed_by_user'); }); +test('cluster override response accepts force canonical action', () => { + const parsed = clusterOverrideResponseSchema.parse({ + ok: true, + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + clusterId: 7, + thread: { + id: 10, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Downloader hangs', + body: 'The transfer never finishes.', + authorLogin: 'alice', + htmlUrl: 'https://github.com/openclaw/openclaw/issues/42', + labels: ['bug'], + updatedAtGh: new Date().toISOString(), + clusterId: null, + }, + action: 'force_canonical', + state: 'active', + message: 'Set issue #42 as canonical for cluster 7.', + }); + + assert.equal(parsed.action, 'force_canonical'); +}); + test('durable clusters response accepts stable slugs and governed member states', () => { const parsed = durableClustersResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 6e06450..55a1794 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -287,6 +287,15 @@ export const excludeClusterMemberRequestSchema = z.object({ }); export type ExcludeClusterMemberRequest = z.infer; +export const setClusterCanonicalRequestSchema = z.object({ + owner: z.string(), + repo: z.string(), + clusterId: z.number().int().positive(), + threadNumber: z.number().int().positive(), + reason: z.string().trim().min(1).optional(), +}); +export type SetClusterCanonicalRequest = z.infer; + export const closeResponseSchema = z.object({ ok: z.boolean(), repository: repositorySchema, @@ -302,8 +311,8 @@ export const clusterOverrideResponseSchema = z.object({ repository: repositorySchema, clusterId: z.number().int().positive(), thread: threadSchema, - action: z.enum(['exclude']), - state: z.enum(['removed_by_user', 'blocked_by_override']), + action: z.enum(['exclude', 'force_canonical']), + state: z.enum(['active', 'removed_by_user', 'blocked_by_override']), message: z.string(), }); export type ClusterOverrideResponse = z.infer; diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 34dd756..2dce959 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -496,6 +496,110 @@ test('exclude cluster member action records a durable override', async () => { } }); +test('set cluster canonical action records a durable override', async () => { + const service = new GHCrawlService({ + config: { + workspaceRoot: process.cwd(), + configDir: '/tmp/ghcrawl-test', + configPath: '/tmp/ghcrawl-test/config.json', + configFileExists: true, + dbPath: ':memory:', + dbPathSource: 'config', + apiPort: 5179, + secretProvider: 'plaintext', + githubTokenSource: 'none', + openaiApiKeySource: 'none', + summaryModel: 'gpt-5-mini', + embedModel: 'text-embedding-3-large', + embeddingBasis: 'title_original', + vectorBackend: 'vectorlite', + embedBatchSize: 8, + embedConcurrency: 10, + embedMaxUnread: 20, + openSearchIndex: 'ghcrawl-threads', + tuiPreferences: {}, + }, + github: { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Downloader retry loop', 'Retries forever.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 11, 'related', 'active', 0.8, null, null, 'algo', null, '{}', '{}', now, now, null); + + const server = createApiServer(service); + try { + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const address = server.address(); + assert(address && typeof address === 'object'); + + const response = await fetch(`http://127.0.0.1:${address.port}/actions/set-cluster-canonical`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 43, + reason: 'best root issue', + }), + }); + assert.equal(response.status, 200); + const payload = clusterOverrideResponseSchema.parse((await response.json()) as unknown); + assert.equal(payload.action, 'force_canonical'); + + const override = service.db.prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?').get(7, 11) as { + action: string; + reason: string; + }; + const group = service.db.prepare('select representative_thread_id from cluster_groups where id = ?').get(7) as { + representative_thread_id: number; + }; + assert.deepEqual(override, { action: 'force_canonical', reason: 'best root issue' }); + assert.equal(group.representative_thread_id, 11); + } finally { + await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); + service.close(); + } +}); + test('durable clusters endpoint returns stable cluster state', async () => { const service = new GHCrawlService({ config: { diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index d85c095..8c9de24 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -6,6 +6,7 @@ import { closeThreadRequestSchema, excludeClusterMemberRequestSchema, refreshRequestSchema, + setClusterCanonicalRequestSchema, } from '@ghcrawl/api-contract'; import { ZodError } from 'zod'; @@ -219,6 +220,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'POST' && url.pathname === '/actions/set-cluster-canonical') { + const body = setClusterCanonicalRequestSchema.parse(await readBody(req)); + sendJson(res, 200, service.setClusterCanonicalThread(body)); + return; + } + sendJson(res, 404, { error: 'Not found' }); } catch (error) { const message = error instanceof Error ? error.message : String(error); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 4eaab61..3fdf4e4 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2265,6 +2265,75 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar } }); +test('clusterRepository preserves a forced canonical representative on rebuild', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Download retry hangs forever', 'The transfer retry loop never exits after timeout.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Download retry loop never exits', 'Retry hangs forever after timeout.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + + await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + const cluster = service.db.prepare('select id from cluster_groups limit 1').get() as { id: number }; + + const override = service.setClusterCanonicalThread({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: cluster.id, + threadNumber: 43, + reason: 'best root issue', + }); + await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + + const group = service.db.prepare('select representative_thread_id from cluster_groups where id = ?').get(cluster.id) as { + representative_thread_id: number; + }; + const roles = service.db + .prepare('select thread_id, role, added_by from cluster_memberships where cluster_id = ? order by thread_id asc') + .all(cluster.id) as Array<{ thread_id: number; role: string; added_by: string }>; + + assert.equal(override.action, 'force_canonical'); + assert.equal(group.representative_thread_id, 11); + assert.deepEqual(roles, [ + { thread_id: 10, role: 'related', added_by: 'algo' }, + { thread_id: 11, role: 'canonical', added_by: 'user' }, + ]); + } finally { + service.close(); + } +}); + test('clusterRepository materializes only changed deterministic fingerprints', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2e7215e..9cf5185 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -48,6 +48,7 @@ import { type SearchHitDto, type SearchMode, type SearchResponse, + type SetClusterCanonicalRequest, type SyncResultDto, type ThreadDto, type ThreadsResponse, @@ -994,6 +995,93 @@ export class GHCrawlService { }); } + setClusterCanonicalThread(params: SetClusterCanonicalRequest): ClusterOverrideResponse { + const repository = this.requireRepository(params.owner, params.repo); + const cluster = this.db + .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') + .get(repository.id, params.clusterId) as { id: number } | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); + } + + const thread = this.db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(repository.id, params.threadNumber) as ThreadRow | undefined; + if (!thread) { + throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); + } + + const membership = this.db + .prepare('select score_to_representative from cluster_memberships where cluster_id = ? and thread_id = ? limit 1') + .get(cluster.id, thread.id) as { score_to_representative: number | null } | undefined; + if (!membership) { + throw new Error(`Thread #${params.threadNumber} is not a member of durable cluster ${cluster.id}.`); + } + + const timestamp = nowIso(); + this.db.transaction(() => { + this.db + .prepare( + `delete from cluster_overrides + where cluster_id = ? + and action = 'force_canonical' + and thread_id <> ?`, + ) + .run(cluster.id, thread.id); + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'force_canonical', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, cluster.id, thread.id, params.reason ?? null, timestamp); + this.db + .prepare("update cluster_groups set representative_thread_id = ?, updated_at = ? where id = ?") + .run(thread.id, timestamp, cluster.id); + this.db + .prepare("update cluster_memberships set role = 'related', updated_at = ? where cluster_id = ? and role = 'canonical'") + .run(timestamp, cluster.id); + upsertClusterMembership(this.db, { + clusterId: cluster.id, + threadId: thread.id, + role: 'canonical', + state: 'active', + scoreToRepresentative: 1, + addedBy: 'user', + addedReason: { + source: 'setClusterCanonicalThread', + reason: params.reason ?? null, + }, + }); + this.db + .prepare("update cluster_memberships set added_by = 'user', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(timestamp, cluster.id, thread.id); + recordClusterEvent(this.db, { + clusterId: cluster.id, + eventType: 'manual_force_canonical', + actorKind: 'user', + payload: { + threadId: thread.id, + threadNumber: thread.number, + reason: params.reason ?? null, + }, + }); + })(); + + return clusterOverrideResponseSchema.parse({ + ok: true, + repository, + clusterId: cluster.id, + thread: threadToDto(thread), + action: 'force_canonical', + state: 'active', + message: `Set ${thread.kind} #${thread.number} as canonical for durable cluster ${cluster.id}.`, + }); + } + async syncRepository( params: SyncOptions, ): Promise { @@ -5183,9 +5271,29 @@ export class GHCrawlService { representativeThreadId: cluster.representativeThreadId, title: `Cluster ${identity.slug}`, }); + const forcedCanonical = this.db + .prepare( + `select thread_id + from cluster_overrides + where cluster_id = ? + and action = 'force_canonical' + and (expires_at is null or expires_at > ?) + order by created_at desc, id desc + limit 1`, + ) + .get(clusterId, nowIso()) as { thread_id: number } | undefined; + const representativeThreadId = + forcedCanonical && cluster.members.includes(forcedCanonical.thread_id) + ? forcedCanonical.thread_id + : cluster.representativeThreadId; + if (representativeThreadId !== cluster.representativeThreadId) { + this.db + .prepare('update cluster_groups set representative_thread_id = ?, updated_at = ? where id = ?') + .run(representativeThreadId, nowIso(), clusterId); + } for (const memberId of cluster.members) { - const scoreKey = this.edgeKey(cluster.representativeThreadId, memberId); - const score = memberId === cluster.representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); + const scoreKey = this.edgeKey(representativeThreadId, memberId); + const score = memberId === representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); const excluded = this.db .prepare( `select 1 @@ -5209,7 +5317,7 @@ export class GHCrawlService { removedBy: 'user', addedReason: { source: 'clusterRepository', - representativeThreadId: cluster.representativeThreadId, + representativeThreadId, }, removedReason: { source: 'cluster_overrides', @@ -5223,7 +5331,7 @@ export class GHCrawlService { actorKind: 'algo', payload: { threadId: memberId, - representativeThreadId: cluster.representativeThreadId, + representativeThreadId, scoreToRepresentative: score, reason: 'manual_exclusion', }, @@ -5233,25 +5341,26 @@ export class GHCrawlService { upsertClusterMembership(this.db, { clusterId, threadId: memberId, - role: memberId === cluster.representativeThreadId ? 'canonical' : 'related', + role: memberId === representativeThreadId ? 'canonical' : 'related', state: 'active', - scoreToRepresentative: score, + scoreToRepresentative: memberId === representativeThreadId ? 1 : score, runId: pipelineRunId, - addedBy: 'algo', + addedBy: memberId === representativeThreadId && forcedCanonical?.thread_id === memberId ? 'user' : 'algo', addedReason: { source: 'clusterRepository', - representativeThreadId: cluster.representativeThreadId, + representativeThreadId, + forceCanonical: forcedCanonical?.thread_id === memberId, }, }); recordClusterEvent(this.db, { clusterId, runId: pipelineRunId, - eventType: memberId === cluster.representativeThreadId ? 'keep_canonical' : 'upsert_member', + eventType: memberId === representativeThreadId ? 'keep_canonical' : 'upsert_member', actorKind: 'algo', payload: { threadId: memberId, - representativeThreadId: cluster.representativeThreadId, - scoreToRepresentative: score, + representativeThreadId, + scoreToRepresentative: memberId === representativeThreadId ? 1 : score, }, }); } From 9181f6b6d3704da751efa2a892ad267aeebecf39 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:01:52 -0700 Subject: [PATCH 048/215] feat(cluster): force include durable members --- apps/cli/src/main.test.ts | 38 +++++++ apps/cli/src/main.ts | 32 ++++++ packages/api-contract/src/client.ts | 11 ++ packages/api-contract/src/contracts.test.ts | 13 +++ packages/api-contract/src/contracts.ts | 11 +- packages/api-core/src/api/server.ts | 7 ++ packages/api-core/src/service.test.ts | 63 +++++++++++ packages/api-core/src/service.ts | 116 ++++++++++++++++++++ 8 files changed, 290 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index b721e9b..d78efea 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -392,6 +392,44 @@ test('set-cluster-canonical command forwards durable override inputs', async () assert.match(stdout.read(), /"action": "force_canonical"/); }); +test('include-cluster-member command forwards durable override inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.includeThreadInCluster; + let received: unknown; + + GHCrawlService.prototype.includeThreadInCluster = function includeThreadInClusterStub(params: unknown) { + received = params; + return { + ok: true, + clusterId: 7, + thread: { number: 42 }, + action: 'force_include', + state: 'active', + message: 'included', + } as never; + }; + + try { + await run(['include-cluster-member', 'openclaw/openclaw', '--id', '7', '--number', '42', '--reason', 'same root cause'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.includeThreadInCluster = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: 'same root cause', + }); + assert.match(stdout.read(), /"action": "force_include"/); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 87adfcf..a77696e 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -22,6 +22,7 @@ type CommandName = | 'close-thread' | 'close-cluster' | 'exclude-cluster-member' + | 'include-cluster-member' | 'set-cluster-canonical' | 'summarize' | 'key-summaries' @@ -210,6 +211,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl exclude-cluster-member openclaw/openclaw --id 123 --number 42 --reason "false positive" --json'], agentJson: true, }, + { + name: 'include-cluster-member', + synopsis: 'include-cluster-member --id --number [--reason ] [--json]', + description: 'Add one issue or PR to a durable cluster and keep it included across rebuilds.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to include', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl include-cluster-member openclaw/openclaw --id 123 --number 42 --reason "same root cause" --json'], + agentJson: true, + }, { name: 'set-cluster-canonical', synopsis: 'set-cluster-canonical --id --number [--reason ] [--json]', @@ -1089,6 +1103,24 @@ export async function run( writeJson(stdout, result); return; } + case 'include-cluster-member': { + const { owner, repo, values } = parseRepoFlags('include-cluster-member', rest); + if (typeof values.id !== 'string') { + throw new CliUsageError('Missing --id', 'include-cluster-member'); + } + if (typeof values.number !== 'string') { + throw new CliUsageError('Missing --number', 'include-cluster-member'); + } + const result = getService().includeThreadInCluster({ + owner, + repo, + clusterId: parsePositiveInteger('id', values.id, 'include-cluster-member'), + threadNumber: parsePositiveInteger('number', values.number, 'include-cluster-member'), + reason: typeof values.reason === 'string' ? values.reason : undefined, + }); + writeJson(stdout, result); + return; + } case 'set-cluster-canonical': { const { owner, repo, values } = parseRepoFlags('set-cluster-canonical', rest); if (typeof values.id !== 'string') { diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 98f06fc..a37f55d 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -11,6 +11,7 @@ import { clustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, + includeClusterMemberRequestSchema, refreshRequestSchema, refreshResponseSchema, repositoriesResponseSchema, @@ -63,6 +64,7 @@ export type GitcrawlClient = { closeThread: (request: { owner: string; repo: string; threadNumber: number }) => Promise; closeCluster: (request: { owner: string; repo: string; clusterId: number }) => Promise; excludeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; + includeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; setClusterCanonical: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; }; @@ -186,6 +188,15 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }); return readJson(res, clusterOverrideResponseSchema); }, + async includeClusterMember(request) { + const body = includeClusterMemberRequestSchema.parse(request); + const res = await fetchImpl(`${normalized}/actions/include-cluster-member`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + return readJson(res, clusterOverrideResponseSchema); + }, async setClusterCanonical(request) { const body = setClusterCanonicalRequestSchema.parse(request); const res = await fetchImpl(`${normalized}/actions/set-cluster-canonical`, { diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 752bfcf..6edfb4e 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -7,6 +7,7 @@ import { durableClustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, + includeClusterMemberRequestSchema, neighborsResponseSchema, searchResponseSchema, setClusterCanonicalRequestSchema, @@ -79,6 +80,18 @@ test('set cluster canonical request trims optional reason', () => { assert.equal(parsed.reason, 'best root issue'); }); +test('include cluster member request trims optional reason', () => { + const parsed = includeClusterMemberRequestSchema.parse({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + threadNumber: 42, + reason: ' same root cause ', + }); + + assert.equal(parsed.reason, 'same root cause'); +}); + test('cluster override response accepts durable removal state', () => { const parsed = clusterOverrideResponseSchema.parse({ ok: true, diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 55a1794..662513a 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -287,6 +287,15 @@ export const excludeClusterMemberRequestSchema = z.object({ }); export type ExcludeClusterMemberRequest = z.infer; +export const includeClusterMemberRequestSchema = z.object({ + owner: z.string(), + repo: z.string(), + clusterId: z.number().int().positive(), + threadNumber: z.number().int().positive(), + reason: z.string().trim().min(1).optional(), +}); +export type IncludeClusterMemberRequest = z.infer; + export const setClusterCanonicalRequestSchema = z.object({ owner: z.string(), repo: z.string(), @@ -311,7 +320,7 @@ export const clusterOverrideResponseSchema = z.object({ repository: repositorySchema, clusterId: z.number().int().positive(), thread: threadSchema, - action: z.enum(['exclude', 'force_canonical']), + action: z.enum(['exclude', 'force_include', 'force_canonical']), state: z.enum(['active', 'removed_by_user', 'blocked_by_override']), message: z.string(), }); diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 8c9de24..c35b1a9 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -5,6 +5,7 @@ import { closeClusterRequestSchema, closeThreadRequestSchema, excludeClusterMemberRequestSchema, + includeClusterMemberRequestSchema, refreshRequestSchema, setClusterCanonicalRequestSchema, } from '@ghcrawl/api-contract'; @@ -220,6 +221,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'POST' && url.pathname === '/actions/include-cluster-member') { + const body = includeClusterMemberRequestSchema.parse(await readBody(req)); + sendJson(res, 200, service.includeThreadInCluster(body)); + return; + } + if (req.method === 'POST' && url.pathname === '/actions/set-cluster-canonical') { const body = setClusterCanonicalRequestSchema.parse(await readBody(req)); sendJson(res, 200, service.setClusterCanonicalThread(body)); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 3fdf4e4..be94ec7 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2334,6 +2334,69 @@ test('clusterRepository preserves a forced canonical representative on rebuild', } }); +test('clusterRepository preserves a forced include on rebuild', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Download retry hangs forever', 'The transfer retry loop never exits after timeout.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Download retry loop never exits', 'Retry hangs forever after timeout.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Docs typo', 'Fix a typo in documentation.', 'carol', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); + + await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + const cluster = service.db.prepare('select id from cluster_groups limit 1').get() as { id: number }; + + const override = service.includeThreadInCluster({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: cluster.id, + threadNumber: 44, + reason: 'same incident family', + }); + await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + + const membership = service.db + .prepare('select role, state, added_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(cluster.id, 12) as { role: string; state: string; added_by: string }; + + assert.equal(override.action, 'force_include'); + assert.deepEqual(membership, { role: 'related', state: 'active', added_by: 'user' }); + } finally { + service.close(); + } +}); + test('clusterRepository materializes only changed deterministic fingerprints', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9cf5185..6a1d7a5 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -41,6 +41,7 @@ import { type ExcludeClusterMemberRequest, type EmbedResultDto, type HealthResponse, + type IncludeClusterMemberRequest, type NeighborsResponse, type RefreshResponse, type RepositoriesResponse, @@ -995,6 +996,75 @@ export class GHCrawlService { }); } + includeThreadInCluster(params: IncludeClusterMemberRequest): ClusterOverrideResponse { + const repository = this.requireRepository(params.owner, params.repo); + const cluster = this.db + .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') + .get(repository.id, params.clusterId) as { id: number } | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); + } + + const thread = this.db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(repository.id, params.threadNumber) as ThreadRow | undefined; + if (!thread) { + throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); + } + + const timestamp = nowIso(); + this.db.transaction(() => { + this.db + .prepare("delete from cluster_overrides where cluster_id = ? and thread_id = ? and action = 'exclude'") + .run(cluster.id, thread.id); + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'force_include', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, cluster.id, thread.id, params.reason ?? null, timestamp); + upsertClusterMembership(this.db, { + clusterId: cluster.id, + threadId: thread.id, + role: 'related', + state: 'active', + scoreToRepresentative: null, + addedBy: 'user', + addedReason: { + source: 'includeThreadInCluster', + reason: params.reason ?? null, + }, + }); + this.db + .prepare("update cluster_memberships set added_by = 'user', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(timestamp, cluster.id, thread.id); + recordClusterEvent(this.db, { + clusterId: cluster.id, + eventType: 'manual_force_include', + actorKind: 'user', + payload: { + threadId: thread.id, + threadNumber: thread.number, + reason: params.reason ?? null, + }, + }); + })(); + + return clusterOverrideResponseSchema.parse({ + ok: true, + repository, + clusterId: cluster.id, + thread: threadToDto(thread), + action: 'force_include', + state: 'active', + message: `Included ${thread.kind} #${thread.number} in durable cluster ${cluster.id}.`, + }); + } + setClusterCanonicalThread(params: SetClusterCanonicalRequest): ClusterOverrideResponse { const repository = this.requireRepository(params.owner, params.repo); const cluster = this.db @@ -5364,6 +5434,52 @@ export class GHCrawlService { }, }); } + const forcedIncludes = this.db + .prepare( + `select thread_id, reason + from cluster_overrides + where cluster_id = ? + and action = 'force_include' + and (expires_at is null or expires_at > ?) + order by created_at asc, id asc`, + ) + .all(clusterId, nowIso()) as Array<{ thread_id: number; reason: string | null }>; + for (const forced of forcedIncludes) { + if (cluster.members.includes(forced.thread_id)) { + continue; + } + const scoreKey = this.edgeKey(representativeThreadId, forced.thread_id); + const score = forced.thread_id === representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); + upsertClusterMembership(this.db, { + clusterId, + threadId: forced.thread_id, + role: forced.thread_id === representativeThreadId ? 'canonical' : 'related', + state: 'active', + scoreToRepresentative: score, + runId: pipelineRunId, + addedBy: 'user', + addedReason: { + source: 'cluster_overrides', + action: 'force_include', + reason: forced.reason, + }, + }); + this.db + .prepare("update cluster_memberships set added_by = 'user', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(nowIso(), clusterId, forced.thread_id); + recordClusterEvent(this.db, { + clusterId, + runId: pipelineRunId, + eventType: 'force_include_member', + actorKind: 'algo', + payload: { + threadId: forced.thread_id, + representativeThreadId, + scoreToRepresentative: score, + reason: forced.reason, + }, + }); + } } })(); } From 3801655eab643ec7f3b819d904d9f8caa9fbc586 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:07:19 -0700 Subject: [PATCH 049/215] feat(cluster): merge durable clusters --- apps/cli/src/main.test.ts | 39 +++++++ apps/cli/src/main.ts | 34 ++++++ packages/api-contract/src/client.ts | 13 +++ packages/api-contract/src/contracts.test.ts | 33 ++++++ packages/api-contract/src/contracts.ts | 18 +++ packages/api-core/src/api/server.ts | 7 ++ packages/api-core/src/service.test.ts | 83 ++++++++++++++ packages/api-core/src/service.ts | 119 ++++++++++++++++++++ 8 files changed, 346 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index d78efea..41fff2e 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -272,6 +272,9 @@ test('agent-facing command help advertises explicit --json', async () => { 'close-thread', 'close-cluster', 'exclude-cluster-member', + 'include-cluster-member', + 'set-cluster-canonical', + 'merge-clusters', 'embed', 'key-summaries', 'cluster', @@ -430,6 +433,42 @@ test('include-cluster-member command forwards durable override inputs', async () assert.match(stdout.read(), /"action": "force_include"/); }); +test('merge-clusters command forwards durable merge inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.mergeDurableClusters; + let received: unknown; + + GHCrawlService.prototype.mergeDurableClusters = function mergeDurableClustersStub(params: unknown) { + received = params; + return { + ok: true, + sourceClusterId: 7, + targetClusterId: 8, + message: 'merged', + } as never; + }; + + try { + await run(['merge-clusters', 'openclaw/openclaw', '--source', '7', '--target', '8', '--reason', 'same root cause'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.mergeDurableClusters = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + targetClusterId: 8, + reason: 'same root cause', + }); + assert.match(stdout.read(), /"targetClusterId": 8/); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index a77696e..406dde1 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -24,6 +24,7 @@ type CommandName = | 'exclude-cluster-member' | 'include-cluster-member' | 'set-cluster-canonical' + | 'merge-clusters' | 'summarize' | 'key-summaries' | 'purge-comments' @@ -237,6 +238,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl set-cluster-canonical openclaw/openclaw --id 123 --number 42 --reason "best root issue" --json'], agentJson: true, }, + { + name: 'merge-clusters', + synopsis: 'merge-clusters --source --target [--reason ] [--json]', + description: 'Merge one durable cluster into another and preserve the source slug as an alias.', + options: [ + '--source Durable cluster id to merge from', + '--target Durable cluster id to merge into', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl merge-clusters openclaw/openclaw --source 123 --target 456 --reason "same root cause" --json'], + agentJson: true, + }, { name: 'embed', synopsis: 'embed [--number ] [--json]', @@ -542,6 +556,8 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo threshold: { type: 'string' }, port: { type: 'string' }, id: { type: 'string' }, + source: { type: 'string' }, + target: { type: 'string' }, reason: { type: 'string' }, sort: { type: 'string' }, search: { type: 'string' }, @@ -1139,6 +1155,24 @@ export async function run( writeJson(stdout, result); return; } + case 'merge-clusters': { + const { owner, repo, values } = parseRepoFlags('merge-clusters', rest); + if (typeof values.source !== 'string') { + throw new CliUsageError('Missing --source', 'merge-clusters'); + } + if (typeof values.target !== 'string') { + throw new CliUsageError('Missing --target', 'merge-clusters'); + } + const result = getService().mergeDurableClusters({ + owner, + repo, + sourceClusterId: parsePositiveInteger('source', values.source, 'merge-clusters'), + targetClusterId: parsePositiveInteger('target', values.target, 'merge-clusters'), + reason: typeof values.reason === 'string' ? values.reason : undefined, + }); + writeJson(stdout, result); + return; + } case 'summarize': { const { owner, repo, values } = parseRepoFlags('summarize', rest); const result = await getService().summarizeRepository({ diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index a37f55d..52f9864 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -6,12 +6,14 @@ import { closeThreadRequestSchema, authorThreadsResponseSchema, clusterDetailResponseSchema, + clusterMergeResponseSchema, clusterOverrideResponseSchema, clusterSummariesResponseSchema, clustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, includeClusterMemberRequestSchema, + mergeClustersRequestSchema, refreshRequestSchema, refreshResponseSchema, repositoriesResponseSchema, @@ -21,6 +23,7 @@ import { type ActionRequest, type ActionResponse, type CloseResponse, + type ClusterMergeResponse, type ClusterOverrideResponse, type AuthorThreadsResponse, type ClusterDetailResponse, @@ -66,6 +69,7 @@ export type GitcrawlClient = { excludeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; includeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; setClusterCanonical: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; + mergeClusters: (request: { owner: string; repo: string; sourceClusterId: number; targetClusterId: number; reason?: string }) => Promise; }; type FetchLike = typeof fetch; @@ -206,5 +210,14 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }); return readJson(res, clusterOverrideResponseSchema); }, + async mergeClusters(request) { + const body = mergeClustersRequestSchema.parse(request); + const res = await fetchImpl(`${normalized}/actions/merge-clusters`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + return readJson(res, clusterMergeResponseSchema); + }, }; } diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 6edfb4e..c52e63c 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -3,11 +3,13 @@ import assert from 'node:assert/strict'; import { actionRequestSchema, + clusterMergeResponseSchema, clusterOverrideResponseSchema, durableClustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, includeClusterMemberRequestSchema, + mergeClustersRequestSchema, neighborsResponseSchema, searchResponseSchema, setClusterCanonicalRequestSchema, @@ -92,6 +94,18 @@ test('include cluster member request trims optional reason', () => { assert.equal(parsed.reason, 'same root cause'); }); +test('merge clusters request trims optional reason', () => { + const parsed = mergeClustersRequestSchema.parse({ + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + targetClusterId: 8, + reason: ' same root cause ', + }); + + assert.equal(parsed.reason, 'same root cause'); +}); + test('cluster override response accepts durable removal state', () => { const parsed = clusterOverrideResponseSchema.parse({ ok: true, @@ -168,6 +182,25 @@ test('cluster override response accepts force canonical action', () => { assert.equal(parsed.action, 'force_canonical'); }); +test('cluster merge response accepts source and target ids', () => { + const parsed = clusterMergeResponseSchema.parse({ + ok: true, + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + sourceClusterId: 7, + targetClusterId: 8, + message: 'merged', + }); + + assert.equal(parsed.targetClusterId, 8); +}); + test('durable clusters response accepts stable slugs and governed member states', () => { const parsed = durableClustersResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 662513a..1d83166 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -305,6 +305,15 @@ export const setClusterCanonicalRequestSchema = z.object({ }); export type SetClusterCanonicalRequest = z.infer; +export const mergeClustersRequestSchema = z.object({ + owner: z.string(), + repo: z.string(), + sourceClusterId: z.number().int().positive(), + targetClusterId: z.number().int().positive(), + reason: z.string().trim().min(1).optional(), +}); +export type MergeClustersRequest = z.infer; + export const closeResponseSchema = z.object({ ok: z.boolean(), repository: repositorySchema, @@ -326,6 +335,15 @@ export const clusterOverrideResponseSchema = z.object({ }); export type ClusterOverrideResponse = z.infer; +export const clusterMergeResponseSchema = z.object({ + ok: z.boolean(), + repository: repositorySchema, + sourceClusterId: z.number().int().positive(), + targetClusterId: z.number().int().positive(), + message: z.string(), +}); +export type ClusterMergeResponse = z.infer; + export const rerunActionSchema = z.enum(['summarize', 'embed', 'cluster']); export type RerunAction = z.infer; diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index c35b1a9..a5d157d 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -6,6 +6,7 @@ import { closeThreadRequestSchema, excludeClusterMemberRequestSchema, includeClusterMemberRequestSchema, + mergeClustersRequestSchema, refreshRequestSchema, setClusterCanonicalRequestSchema, } from '@ghcrawl/api-contract'; @@ -233,6 +234,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'POST' && url.pathname === '/actions/merge-clusters') { + const body = mergeClustersRequestSchema.parse(await readBody(req)); + sendJson(res, 200, service.mergeDurableClusters(body)); + return; + } + sendJson(res, 404, { error: 'Not found' }); } catch (error) { const message = error instanceof Error ? error.message : String(error); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index be94ec7..000a2a3 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2397,6 +2397,89 @@ test('clusterRepository preserves a forced include on rebuild', async () => { } }); +test('mergeDurableClusters preserves source slug and force-includes active source members', () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Root issue', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Related issue', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + const insertCluster = service.db.prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, 'active', 'duplicate_candidate', ?, ?, ?, ?)`, + ); + insertCluster.run(7, 1, 'source-key', 'source-slug', 11, 'Source cluster', now, now); + insertCluster.run(8, 1, 'target-key', 'target-slug', 10, 'Target cluster', now, now); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 11, 'canonical', 'active', 1, null, null, 'algo', null, '{}', '{}', now, now, null); + + const result = service.mergeDurableClusters({ + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + targetClusterId: 8, + reason: 'same root cause', + }); + + const source = service.db.prepare('select status from cluster_groups where id = ?').get(7) as { status: string }; + const alias = service.db.prepare('select reason from cluster_aliases where cluster_id = ? and alias_slug = ?').get(8, 'source-slug') as { + reason: string; + }; + const override = service.db.prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?').get(8, 11) as { + action: string; + reason: string; + }; + const membership = service.db + .prepare('select state, added_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(8, 11) as { state: string; added_by: string }; + + assert.equal(result.targetClusterId, 8); + assert.equal(source.status, 'merged'); + assert.equal(alias.reason, 'merged_from:7'); + assert.deepEqual(override, { action: 'force_include', reason: 'same root cause' }); + assert.deepEqual(membership, { state: 'active', added_by: 'user' }); + } finally { + service.close(); + } +}); + test('clusterRepository materializes only changed deterministic fingerprints', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 6a1d7a5..6809abc 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -14,6 +14,7 @@ import { authorThreadsResponseSchema, closeResponseSchema, clusterOverrideResponseSchema, + clusterMergeResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, @@ -31,6 +32,7 @@ import { type ActionResponse, type AuthorThreadsResponse, type CloseResponse, + type ClusterMergeResponse, type ClusterOverrideResponse, type ClusterDetailResponse, type ClusterDto, @@ -42,6 +44,7 @@ import { type EmbedResultDto, type HealthResponse, type IncludeClusterMemberRequest, + type MergeClustersRequest, type NeighborsResponse, type RefreshResponse, type RepositoriesResponse, @@ -1152,6 +1155,122 @@ export class GHCrawlService { }); } + mergeDurableClusters(params: MergeClustersRequest): ClusterMergeResponse { + if (params.sourceClusterId === params.targetClusterId) { + throw new Error('Source and target cluster ids must differ.'); + } + const repository = this.requireRepository(params.owner, params.repo); + const clusters = this.db + .prepare( + `select id, stable_slug + from cluster_groups + where repo_id = ? + and id in (?, ?)`, + ) + .all(repository.id, params.sourceClusterId, params.targetClusterId) as Array<{ id: number; stable_slug: string }>; + const source = clusters.find((cluster) => cluster.id === params.sourceClusterId); + const target = clusters.find((cluster) => cluster.id === params.targetClusterId); + if (!source) { + throw new Error(`Durable source cluster ${params.sourceClusterId} was not found for ${repository.fullName}.`); + } + if (!target) { + throw new Error(`Durable target cluster ${params.targetClusterId} was not found for ${repository.fullName}.`); + } + + const timestamp = nowIso(); + const members = this.db + .prepare( + `select thread_id, score_to_representative + from cluster_memberships + where cluster_id = ? + and state = 'active'`, + ) + .all(source.id) as Array<{ thread_id: number; score_to_representative: number | null }>; + const sourceAliases = this.db + .prepare('select alias_slug, reason from cluster_aliases where cluster_id = ?') + .all(source.id) as Array<{ alias_slug: string; reason: string }>; + + this.db.transaction(() => { + const upsertAlias = this.db.prepare( + `insert into cluster_aliases (cluster_id, alias_slug, reason, created_at) + values (?, ?, ?, ?) + on conflict(cluster_id, alias_slug) do update set + reason = excluded.reason`, + ); + upsertAlias.run(target.id, source.stable_slug, `merged_from:${source.id}`, timestamp); + for (const alias of sourceAliases) { + upsertAlias.run(target.id, alias.alias_slug, alias.reason, timestamp); + } + + for (const member of members) { + this.db + .prepare("delete from cluster_overrides where cluster_id = ? and thread_id = ? and action = 'exclude'") + .run(target.id, member.thread_id); + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'force_include', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, target.id, member.thread_id, params.reason ?? `merged from cluster ${source.id}`, timestamp); + upsertClusterMembership(this.db, { + clusterId: target.id, + threadId: member.thread_id, + role: 'related', + state: 'active', + scoreToRepresentative: member.score_to_representative, + addedBy: 'user', + addedReason: { + source: 'mergeDurableClusters', + sourceClusterId: source.id, + reason: params.reason ?? null, + }, + }); + this.db + .prepare("update cluster_memberships set added_by = 'user', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(timestamp, target.id, member.thread_id); + } + + this.db + .prepare("update cluster_groups set status = 'merged', closed_at = ?, updated_at = ? where id = ?") + .run(timestamp, timestamp, source.id); + this.db + .prepare("update cluster_groups set updated_at = ? where id = ?") + .run(timestamp, target.id); + recordClusterEvent(this.db, { + clusterId: source.id, + eventType: 'manual_merge_source', + actorKind: 'user', + payload: { + targetClusterId: target.id, + reason: params.reason ?? null, + }, + }); + recordClusterEvent(this.db, { + clusterId: target.id, + eventType: 'manual_merge_target', + actorKind: 'user', + payload: { + sourceClusterId: source.id, + sourceSlug: source.stable_slug, + movedMemberCount: members.length, + reason: params.reason ?? null, + }, + }); + })(); + + return clusterMergeResponseSchema.parse({ + ok: true, + repository, + sourceClusterId: source.id, + targetClusterId: target.id, + message: `Merged durable cluster ${source.id} into ${target.id}.`, + }); + } + async syncRepository( params: SyncOptions, ): Promise { From 95730c0d561ce0e909c7ab929d642549604f8862 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:12:01 -0700 Subject: [PATCH 050/215] feat(cluster): split durable clusters --- apps/cli/src/main.test.ts | 38 +++++ apps/cli/src/main.ts | 32 ++++ packages/api-contract/src/client.ts | 13 ++ packages/api-contract/src/contracts.test.ts | 35 ++++ packages/api-contract/src/contracts.ts | 19 +++ packages/api-core/src/api/server.ts | 7 + packages/api-core/src/service.test.ts | 99 +++++++++++ packages/api-core/src/service.ts | 176 ++++++++++++++++++++ 8 files changed, 419 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 41fff2e..235cc07 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -275,6 +275,7 @@ test('agent-facing command help advertises explicit --json', async () => { 'include-cluster-member', 'set-cluster-canonical', 'merge-clusters', + 'split-cluster', 'embed', 'key-summaries', 'cluster', @@ -469,6 +470,43 @@ test('merge-clusters command forwards durable merge inputs', async () => { assert.match(stdout.read(), /"targetClusterId": 8/); }); +test('split-cluster command forwards durable split inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.splitDurableCluster; + let received: unknown; + + GHCrawlService.prototype.splitDurableCluster = function splitDurableClusterStub(params: unknown) { + received = params; + return { + ok: true, + sourceClusterId: 7, + newClusterId: 9, + movedCount: 2, + message: 'split', + } as never; + }; + + try { + await run(['split-cluster', 'openclaw/openclaw', '--source', '7', '--numbers', '42,43', '--reason', 'separate root cause'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.splitDurableCluster = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + threadNumbers: [42, 43], + reason: 'separate root cause', + }); + assert.match(stdout.read(), /"newClusterId": 9/); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 406dde1..20b8ba4 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -25,6 +25,7 @@ type CommandName = | 'include-cluster-member' | 'set-cluster-canonical' | 'merge-clusters' + | 'split-cluster' | 'summarize' | 'key-summaries' | 'purge-comments' @@ -251,6 +252,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl merge-clusters openclaw/openclaw --source 123 --target 456 --reason "same root cause" --json'], agentJson: true, }, + { + name: 'split-cluster', + synopsis: 'split-cluster --source --numbers [--reason ] [--json]', + description: 'Split selected active members into a new durable cluster and block automatic re-entry into the source.', + options: [ + '--source Durable cluster id to split from', + '--numbers Issue or PR numbers to move into the new cluster', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl split-cluster openclaw/openclaw --source 123 --numbers 42,43 --reason "separate root cause" --json'], + agentJson: true, + }, { name: 'embed', synopsis: 'embed [--number ] [--json]', @@ -1173,6 +1187,24 @@ export async function run( writeJson(stdout, result); return; } + case 'split-cluster': { + const { owner, repo, values } = parseRepoFlags('split-cluster', rest); + if (typeof values.source !== 'string') { + throw new CliUsageError('Missing --source', 'split-cluster'); + } + if (typeof values.numbers !== 'string') { + throw new CliUsageError('Missing --numbers', 'split-cluster'); + } + const result = getService().splitDurableCluster({ + owner, + repo, + sourceClusterId: parsePositiveInteger('source', values.source, 'split-cluster'), + threadNumbers: parsePositiveIntegerList('numbers', values.numbers, 'split-cluster'), + reason: typeof values.reason === 'string' ? values.reason : undefined, + }); + writeJson(stdout, result); + return; + } case 'summarize': { const { owner, repo, values } = parseRepoFlags('summarize', rest); const result = await getService().summarizeRepository({ diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 52f9864..60492be 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -8,6 +8,7 @@ import { clusterDetailResponseSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, + clusterSplitResponseSchema, clusterSummariesResponseSchema, clustersResponseSchema, excludeClusterMemberRequestSchema, @@ -19,12 +20,14 @@ import { repositoriesResponseSchema, searchResponseSchema, setClusterCanonicalRequestSchema, + splitClusterRequestSchema, threadsResponseSchema, type ActionRequest, type ActionResponse, type CloseResponse, type ClusterMergeResponse, type ClusterOverrideResponse, + type ClusterSplitResponse, type AuthorThreadsResponse, type ClusterDetailResponse, type ClusterSummariesResponse, @@ -70,6 +73,7 @@ export type GitcrawlClient = { includeClusterMember: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; setClusterCanonical: (request: { owner: string; repo: string; clusterId: number; threadNumber: number; reason?: string }) => Promise; mergeClusters: (request: { owner: string; repo: string; sourceClusterId: number; targetClusterId: number; reason?: string }) => Promise; + splitCluster: (request: { owner: string; repo: string; sourceClusterId: number; threadNumbers: number[]; reason?: string }) => Promise; }; type FetchLike = typeof fetch; @@ -219,5 +223,14 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }); return readJson(res, clusterMergeResponseSchema); }, + async splitCluster(request) { + const body = splitClusterRequestSchema.parse(request); + const res = await fetchImpl(`${normalized}/actions/split-cluster`, { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify(body), + }); + return readJson(res, clusterSplitResponseSchema); + }, }; } diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index c52e63c..9ca92e7 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -5,6 +5,7 @@ import { actionRequestSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, + clusterSplitResponseSchema, durableClustersResponseSchema, excludeClusterMemberRequestSchema, healthResponseSchema, @@ -13,6 +14,7 @@ import { neighborsResponseSchema, searchResponseSchema, setClusterCanonicalRequestSchema, + splitClusterRequestSchema, } from './contracts.js'; test('health schema accepts configured status payload', () => { @@ -106,6 +108,19 @@ test('merge clusters request trims optional reason', () => { assert.equal(parsed.reason, 'same root cause'); }); +test('split cluster request trims optional reason', () => { + const parsed = splitClusterRequestSchema.parse({ + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + threadNumbers: [42, 43], + reason: ' separate root cause ', + }); + + assert.equal(parsed.reason, 'separate root cause'); + assert.deepEqual(parsed.threadNumbers, [42, 43]); +}); + test('cluster override response accepts durable removal state', () => { const parsed = clusterOverrideResponseSchema.parse({ ok: true, @@ -201,6 +216,26 @@ test('cluster merge response accepts source and target ids', () => { assert.equal(parsed.targetClusterId, 8); }); +test('cluster split response accepts source and new ids', () => { + const parsed = clusterSplitResponseSchema.parse({ + ok: true, + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + sourceClusterId: 7, + newClusterId: 8, + movedCount: 2, + message: 'split', + }); + + assert.equal(parsed.newClusterId, 8); +}); + test('durable clusters response accepts stable slugs and governed member states', () => { const parsed = durableClustersResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 1d83166..b0ad605 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -314,6 +314,15 @@ export const mergeClustersRequestSchema = z.object({ }); export type MergeClustersRequest = z.infer; +export const splitClusterRequestSchema = z.object({ + owner: z.string(), + repo: z.string(), + sourceClusterId: z.number().int().positive(), + threadNumbers: z.array(z.number().int().positive()).min(1), + reason: z.string().trim().min(1).optional(), +}); +export type SplitClusterRequest = z.infer; + export const closeResponseSchema = z.object({ ok: z.boolean(), repository: repositorySchema, @@ -344,6 +353,16 @@ export const clusterMergeResponseSchema = z.object({ }); export type ClusterMergeResponse = z.infer; +export const clusterSplitResponseSchema = z.object({ + ok: z.boolean(), + repository: repositorySchema, + sourceClusterId: z.number().int().positive(), + newClusterId: z.number().int().positive(), + movedCount: z.number().int().nonnegative(), + message: z.string(), +}); +export type ClusterSplitResponse = z.infer; + export const rerunActionSchema = z.enum(['summarize', 'embed', 'cluster']); export type RerunAction = z.infer; diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index a5d157d..2a79753 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -9,6 +9,7 @@ import { mergeClustersRequestSchema, refreshRequestSchema, setClusterCanonicalRequestSchema, + splitClusterRequestSchema, } from '@ghcrawl/api-contract'; import { ZodError } from 'zod'; @@ -240,6 +241,12 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'POST' && url.pathname === '/actions/split-cluster') { + const body = splitClusterRequestSchema.parse(await readBody(req)); + sendJson(res, 200, service.splitDurableCluster(body)); + return; + } + sendJson(res, 404, { error: 'Not found' }); } catch (error) { const message = error instanceof Error ? error.message : String(error); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 000a2a3..0d88322 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2480,6 +2480,105 @@ test('mergeDurableClusters preserves source slug and force-includes active sourc } }); +test('splitDurableCluster creates a governed cluster and blocks source re-entry', () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Canonical issue', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Remaining issue', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Moved issue', 'body', 'carol', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, 'active', 'duplicate_candidate', ?, ?, ?, ?)`, + ) + .run(7, 1, 'source-key', 'source-slug', 10, 'Source cluster', now, now); + const insertMembership = service.db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, 'active', ?, ?, ?, 'algo', ?, ?, ?, ?, ?, ?)`, + ); + insertMembership.run(7, 10, 'canonical', 1, null, null, null, '{}', '{}', now, now, null); + insertMembership.run(7, 11, 'related', 0.72, null, null, null, '{}', '{}', now, now, null); + insertMembership.run(7, 12, 'related', 0.81, null, null, null, '{}', '{}', now, now, null); + + const result = service.splitDurableCluster({ + owner: 'openclaw', + repo: 'openclaw', + sourceClusterId: 7, + threadNumbers: [42, 44], + reason: 'separate root cause', + }); + + const sourceCanonical = service.db + .prepare('select representative_thread_id from cluster_groups where id = ?') + .get(7) as { representative_thread_id: number }; + const movedSourceMembership = service.db + .prepare('select state, removed_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(7, 10) as { state: string; removed_by: string }; + const remainingSourceMembership = service.db + .prepare('select role, state from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(7, 11) as { role: string; state: string }; + const sourceOverride = service.db + .prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?') + .get(7, 10) as { action: string; reason: string }; + const newCanonical = service.db + .prepare('select role, state, added_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(result.newClusterId, 10) as { role: string; state: string; added_by: string }; + const newRelated = service.db + .prepare('select role, state, added_by from cluster_memberships where cluster_id = ? and thread_id = ?') + .get(result.newClusterId, 12) as { role: string; state: string; added_by: string }; + const newOverride = service.db + .prepare('select action, reason from cluster_overrides where cluster_id = ? and thread_id = ?') + .get(result.newClusterId, 12) as { action: string; reason: string }; + + assert.equal(result.sourceClusterId, 7); + assert.equal(result.movedCount, 2); + assert.equal(sourceCanonical.representative_thread_id, 11); + assert.deepEqual(movedSourceMembership, { state: 'removed_by_user', removed_by: 'user' }); + assert.deepEqual(remainingSourceMembership, { role: 'canonical', state: 'active' }); + assert.deepEqual(sourceOverride, { action: 'exclude', reason: 'separate root cause' }); + assert.deepEqual(newCanonical, { role: 'canonical', state: 'active', added_by: 'user' }); + assert.deepEqual(newRelated, { role: 'related', state: 'active', added_by: 'user' }); + assert.deepEqual(newOverride, { action: 'force_include', reason: 'separate root cause' }); + } finally { + service.close(); + } +}); + test('clusterRepository materializes only changed deterministic fingerprints', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 6809abc..e92cfb1 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -15,6 +15,7 @@ import { closeResponseSchema, clusterOverrideResponseSchema, clusterMergeResponseSchema, + clusterSplitResponseSchema, clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, @@ -34,6 +35,7 @@ import { type CloseResponse, type ClusterMergeResponse, type ClusterOverrideResponse, + type ClusterSplitResponse, type ClusterDetailResponse, type ClusterDto, type ClusterResultDto, @@ -53,6 +55,7 @@ import { type SearchMode, type SearchResponse, type SetClusterCanonicalRequest, + type SplitClusterRequest, type SyncResultDto, type ThreadDto, type ThreadsResponse, @@ -1271,6 +1274,179 @@ export class GHCrawlService { }); } + splitDurableCluster(params: SplitClusterRequest): ClusterSplitResponse { + const threadNumbers = Array.from(new Set(params.threadNumbers)).sort((left, right) => left - right); + const repository = this.requireRepository(params.owner, params.repo); + const source = this.db + .prepare( + `select id, stable_slug + from cluster_groups + where repo_id = ? + and id = ? + limit 1`, + ) + .get(repository.id, params.sourceClusterId) as { id: number; stable_slug: string } | undefined; + if (!source) { + throw new Error(`Durable source cluster ${params.sourceClusterId} was not found for ${repository.fullName}.`); + } + + const placeholders = threadNumbers.map(() => '?').join(', '); + const requestedThreads = this.db + .prepare(`select id, number, title from threads where repo_id = ? and number in (${placeholders})`) + .all(repository.id, ...threadNumbers) as Array<{ id: number; number: number; title: string }>; + const requestedByNumber = new Map(requestedThreads.map((thread) => [thread.number, thread])); + const missingNumbers = threadNumbers.filter((number) => !requestedByNumber.has(number)); + if (missingNumbers.length > 0) { + throw new Error(`Thread(s) ${missingNumbers.map((number) => `#${number}`).join(', ')} were not found for ${repository.fullName}.`); + } + + const activeMembers = this.db + .prepare( + `select cm.thread_id, cm.role, cm.score_to_representative, t.number, t.title + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + and cm.state = 'active' + order by t.number asc`, + ) + .all(source.id) as Array<{ + thread_id: number; + role: 'canonical' | 'duplicate' | 'related'; + score_to_representative: number | null; + number: number; + title: string; + }>; + const selectedThreadIds = new Set(requestedThreads.map((thread) => thread.id)); + const selectedMembers = activeMembers.filter((member) => selectedThreadIds.has(member.thread_id)); + const missingActiveNumbers = threadNumbers.filter((number) => !selectedMembers.some((member) => member.number === number)); + if (missingActiveNumbers.length > 0) { + throw new Error(`Thread(s) ${missingActiveNumbers.map((number) => `#${number}`).join(', ')} are not active members of durable cluster ${source.id}.`); + } + + const remainingMembers = activeMembers.filter((member) => !selectedThreadIds.has(member.thread_id)); + if (remainingMembers.length === 0) { + throw new Error('Split must leave at least one active member in the source cluster.'); + } + + const selectedCanonical = selectedMembers.find((member) => member.role === 'canonical') ?? selectedMembers[0]; + const remainingCanonical = remainingMembers.find((member) => member.role === 'canonical') ?? remainingMembers[0]; + if (!selectedCanonical || !remainingCanonical) { + throw new Error('Split requires selected and remaining active members.'); + } + + const identity = humanKeyForValue(`cluster-split:${repository.id}:${source.id}:${selectedMembers.map((member) => member.thread_id).join(',')}`); + const timestamp = nowIso(); + let newClusterId = 0; + this.db.transaction(() => { + newClusterId = upsertClusterGroup(this.db, { + repoId: repository.id, + stableKey: identity.hash, + stableSlug: identity.slug, + status: 'active', + clusterType: 'duplicate_candidate', + representativeThreadId: selectedCanonical.thread_id, + title: `Split from ${source.stable_slug}`, + }); + + this.db + .prepare('update cluster_groups set representative_thread_id = ?, updated_at = ? where id = ?') + .run(remainingCanonical.thread_id, timestamp, source.id); + this.db + .prepare("update cluster_memberships set role = 'canonical', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(timestamp, source.id, remainingCanonical.thread_id); + + for (const member of selectedMembers) { + const reason = params.reason ?? `split into cluster ${newClusterId}`; + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'exclude', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, source.id, member.thread_id, reason, timestamp); + upsertClusterMembership(this.db, { + clusterId: source.id, + threadId: member.thread_id, + role: member.role, + state: 'removed_by_user', + scoreToRepresentative: member.score_to_representative, + addedBy: 'user', + removedBy: 'user', + addedReason: { + source: 'splitDurableCluster', + newClusterId, + }, + removedReason: { + source: 'cluster_overrides', + action: 'exclude', + reason: params.reason ?? null, + }, + }); + + this.db + .prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) + values (?, ?, ?, 'force_include', ?, ?, null) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + created_at = excluded.created_at, + expires_at = null`, + ) + .run(repository.id, newClusterId, member.thread_id, reason, timestamp); + upsertClusterMembership(this.db, { + clusterId: newClusterId, + threadId: member.thread_id, + role: member.thread_id === selectedCanonical.thread_id ? 'canonical' : 'related', + state: 'active', + scoreToRepresentative: member.thread_id === selectedCanonical.thread_id ? 1 : member.score_to_representative, + addedBy: 'user', + addedReason: { + source: 'splitDurableCluster', + sourceClusterId: source.id, + reason: params.reason ?? null, + }, + }); + this.db + .prepare("update cluster_memberships set added_by = 'user', updated_at = ? where cluster_id = ? and thread_id = ?") + .run(timestamp, newClusterId, member.thread_id); + } + + recordClusterEvent(this.db, { + clusterId: source.id, + eventType: 'manual_split_source', + actorKind: 'user', + payload: { + newClusterId, + movedThreadNumbers: selectedMembers.map((member) => member.number), + reason: params.reason ?? null, + }, + }); + recordClusterEvent(this.db, { + clusterId: newClusterId, + eventType: 'manual_split_target', + actorKind: 'user', + payload: { + sourceClusterId: source.id, + sourceSlug: source.stable_slug, + movedThreadNumbers: selectedMembers.map((member) => member.number), + reason: params.reason ?? null, + }, + }); + })(); + + return clusterSplitResponseSchema.parse({ + ok: true, + repository, + sourceClusterId: source.id, + newClusterId, + movedCount: selectedMembers.length, + message: `Split ${selectedMembers.length} member(s) from durable cluster ${source.id} into ${newClusterId}.`, + }); + } + async syncRepository( params: SyncOptions, ): Promise { From 476a2279fef1f15f568f96044acaa4fefa0d3b83 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:14:44 -0700 Subject: [PATCH 051/215] fix(db): spill large comment payloads --- packages/api-core/src/db/migrate.ts | 8 ++++++++ packages/api-core/src/service.test.ts | 8 ++++++++ packages/api-core/src/service.ts | 22 ++++++++++++++++++---- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index 09d03ea..d51eedc 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -51,6 +51,7 @@ const migrationStatements = [ body text not null, is_bot integer not null default 0, raw_json text not null, + raw_json_blob_id integer references blobs(id) on delete set null, created_at_gh text, updated_at_gh text, unique(thread_id, comment_type, github_id) @@ -505,6 +506,13 @@ export function migrate(db: SqliteDatabase): void { db.exec('alter table threads add column close_reason_local text'); } + const commentColumns = new Set( + (db.prepare('pragma table_info(comments)').all() as Array<{ name: string }>).map((column) => column.name), + ); + if (!commentColumns.has('raw_json_blob_id')) { + db.exec('alter table comments add column raw_json_blob_id integer references blobs(id) on delete set null'); + } + const clusterColumns = new Set( (db.prepare('pragma table_info(clusters)').all() as Array<{ name: string }>).map((column) => column.name), ); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 0d88322..801ca58 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -345,6 +345,7 @@ test('syncRepository fetches comments, reviews, and review comments when include { id: 200, body: 'same here', + payload: 'x'.repeat(5000), created_at: '2026-03-09T00:00:00Z', updated_at: '2026-03-09T00:00:00Z', user: { login: 'bob', type: 'User' }, @@ -393,6 +394,13 @@ test('syncRepository fetches comments, reviews, and review comments when include const commentCount = service.db.prepare('select count(*) as count from comments').get() as { count: number }; assert.equal(commentCount.count, 3); + const largeComment = service.db + .prepare("select raw_json, raw_json_blob_id from comments where comment_type = 'issue_comment' limit 1") + .get() as { raw_json: string; raw_json_blob_id: number | null }; + assert.equal(largeComment.raw_json, '{}'); + assert.equal(typeof largeComment.raw_json_blob_id, 'number'); + const blob = service.db.prepare('select storage_kind from blobs where id = ?').get(largeComment.raw_json_blob_id) as { storage_kind: string }; + assert.equal(blob.storage_kind, 'file'); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e92cfb1..a66be34 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -99,7 +99,7 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { openDb, type SqliteDatabase } from './db/sqlite.js'; -import { readTextBlob } from './db/blob-store.js'; +import { readTextBlob, storeTextBlob } from './db/blob-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; @@ -398,6 +398,7 @@ const SYNC_BATCH_SIZE = 100; const SYNC_BATCH_DELAY_MS = 5000; const STALE_CLOSED_SWEEP_LIMIT = 1000; const CLUSTER_PROGRESS_INTERVAL_MS = 5000; +const RAW_JSON_INLINE_THRESHOLD_BYTES = 4096; const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000; const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3; const EMBED_MAX_ITEM_TOKENS = 7000; @@ -4082,6 +4083,17 @@ export class GHCrawlService { return path.join(path.dirname(this.config.dbPath), '.ghcrawl-store'); } + private rawJsonStorage(rawJson: string, mediaType: string): { inlineJson: string; blobId: number | null } { + if (Buffer.byteLength(rawJson, 'utf8') <= RAW_JSON_INLINE_THRESHOLD_BYTES) { + return { inlineJson: rawJson, blobId: null }; + } + const blob = storeTextBlob(this.db, this.blobStoreRoot(), rawJson, { + mediaType, + inlineThresholdBytes: RAW_JSON_INLINE_THRESHOLD_BYTES, + }); + return { inlineJson: '{}', blobId: blob.id }; + } + private async applyClosedOverlapSweep(params: { repoId: number; owner: string; @@ -4271,12 +4283,13 @@ export class GHCrawlService { private replaceComments(threadId: number, comments: CommentSeed[]): void { const insert = this.db.prepare( `insert into comments ( - thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, raw_json_blob_id, created_at_gh, updated_at_gh + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ); const tx = this.db.transaction((commentRows: CommentSeed[]) => { this.db.prepare('delete from comments where thread_id = ?').run(threadId); for (const comment of commentRows) { + const raw = this.rawJsonStorage(comment.rawJson, `application/vnd.ghcrawl.${comment.commentType}.raw+json`); insert.run( threadId, comment.githubId, @@ -4285,7 +4298,8 @@ export class GHCrawlService { comment.authorType, comment.body, comment.isBot ? 1 : 0, - comment.rawJson, + raw.inlineJson, + raw.blobId, comment.createdAtGh, comment.updatedAtGh, ); From 0844942b5252a2aef025f68209316683004bd8ad Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:19:34 -0700 Subject: [PATCH 052/215] feat(cluster): refresh durable neighborhoods --- apps/cli/src/main.test.ts | 31 ++++++++ apps/cli/src/main.ts | 6 +- .../src/cluster/deterministic-engine.test.ts | 36 +++++++++ .../src/cluster/deterministic-engine.ts | 22 +++++- .../api-core/src/cluster/persistent-store.ts | 2 +- packages/api-core/src/service.test.ts | 74 +++++++++++++++++++ packages/api-core/src/service.ts | 73 +++++++++++++----- 7 files changed, 220 insertions(+), 24 deletions(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 235cc07..decf4d2 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -507,6 +507,37 @@ test('split-cluster command forwards durable split inputs', async () => { assert.match(stdout.read(), /"newClusterId": 9/); }); +test('cluster command forwards neighborhood refresh inputs', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.clusterRepository; + let received: unknown; + + GHCrawlService.prototype.clusterRepository = async function clusterRepositoryStub(params: unknown) { + received = params; + return { runId: 12, edges: 3, clusters: 1 } as never; + }; + + try { + await run(['cluster', 'openclaw/openclaw', '--number', '42', '--k', '4', '--threshold', '0.82'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.clusterRepository = original; + context.cleanup(); + } + + const params = received as { owner: string; repo: string; threadNumber: number; k: number; minScore: number; onProgress?: unknown }; + assert.equal(params.owner, 'openclaw'); + assert.equal(params.repo, 'openclaw'); + assert.equal(params.threadNumber, 42); + assert.equal(params.k, 4); + assert.equal(params.minScore, 0.82); + assert.equal(typeof params.onProgress, 'function'); + assert.match(stdout.read(), /"edges": 3/); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 20b8ba4..81b945c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -287,16 +287,17 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'cluster', - synopsis: 'cluster [--k ] [--threshold ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', + synopsis: 'cluster [--number ] [--k ] [--threshold ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', description: 'Build or refresh local similarity clusters.', options: [ + '--number Refresh only one durable cluster neighborhood', '--k Limit nearest-neighbor fanout', '--threshold Minimum similarity score', '--heap-snapshot-dir Write heap snapshots during long-running work', '--heap-log-interval-ms Emit periodic heap diagnostics', '--json Emit machine-readable JSON output explicitly', ], - examples: ['ghcrawl cluster openclaw/openclaw --json', 'ghcrawl cluster openclaw/openclaw --threshold 0.82 --json'], + examples: ['ghcrawl cluster openclaw/openclaw --json', 'ghcrawl cluster openclaw/openclaw --number 42 --threshold 0.82 --json'], agentJson: true, }, { @@ -1258,6 +1259,7 @@ export async function run( const result = await getService().clusterRepository({ owner, repo, + threadNumber: typeof values.number === 'string' ? parsePositiveInteger('number', values.number, 'cluster') : undefined, k: typeof values.k === 'string' ? parsePositiveInteger('k', values.k, 'cluster') : undefined, minScore: typeof values.threshold === 'string' ? parseFiniteNumber('threshold', values.threshold, 'cluster') : undefined, onProgress: diff --git a/packages/api-core/src/cluster/deterministic-engine.test.ts b/packages/api-core/src/cluster/deterministic-engine.test.ts index 772209e..e1f6ea5 100644 --- a/packages/api-core/src/cluster/deterministic-engine.test.ts +++ b/packages/api-core/src/cluster/deterministic-engine.test.ts @@ -62,3 +62,39 @@ test('buildDeterministicClusterGraph infers hard refs from text', () => { assert.equal(result.edges[0]?.tier, 'strong'); }); + +test('buildDeterministicClusterGraph can limit candidates to a seed neighborhood', () => { + const result = buildDeterministicClusterGraph( + [ + { + id: 10, + number: 10, + kind: 'issue', + title: 'Retry loop hangs', + body: 'Transfer retry loop never exits.', + labels: ['bug'], + }, + { + id: 11, + number: 11, + kind: 'issue', + title: 'Retry loop hangs again', + body: 'Transfer retry loop never exits.', + labels: ['bug'], + }, + { + id: 12, + number: 12, + kind: 'issue', + title: 'Retry loop hangs on timeout', + body: 'Transfer retry loop never exits.', + labels: ['bug'], + }, + ], + { seedThreadIds: [10] }, + ); + + assert.ok(result.edges.length >= 1); + assert.ok(result.edges.every((edge) => edge.leftThreadId === 10 || edge.rightThreadId === 10)); + assert.ok(result.clusters.every((cluster) => cluster.members.includes(10))); +}); diff --git a/packages/api-core/src/cluster/deterministic-engine.ts b/packages/api-core/src/cluster/deterministic-engine.ts index 6d350e5..6f43c30 100644 --- a/packages/api-core/src/cluster/deterministic-engine.ts +++ b/packages/api-core/src/cluster/deterministic-engine.ts @@ -50,7 +50,7 @@ function bump(index: Map>, key: string, id: number): void { function buildCandidatePairs( fingerprints: Map, - params: { maxBucketSize: number; topK: number }, + params: { maxBucketSize: number; topK: number; seedThreadIds?: Set }, ): Array<[number, number]> { const index = new Map>(); for (const [id, fingerprint] of fingerprints.entries()) { @@ -67,6 +67,9 @@ function buildCandidatePairs( const ids = Array.from(bucket).sort((left, right) => left - right); for (let leftIndex = 0; leftIndex < ids.length; leftIndex += 1) { for (let rightIndex = leftIndex + 1; rightIndex < ids.length; rightIndex += 1) { + if (params.seedThreadIds && !params.seedThreadIds.has(ids[leftIndex]) && !params.seedThreadIds.has(ids[rightIndex])) { + continue; + } const key = `${ids[leftIndex]}:${ids[rightIndex]}`; votes.set(key, (votes.get(key) ?? 0) + 1); } @@ -84,7 +87,7 @@ function buildCandidatePairs( export function buildDeterministicClusterGraph( inputs: DeterministicClusterInput[], - params: { maxBucketSize?: number; topK?: number } = {}, + params: { maxBucketSize?: number; topK?: number; seedThreadIds?: number[] } = {}, ): DeterministicClusterResult { const fingerprints = new Map(); for (const input of inputs) { @@ -113,12 +116,14 @@ export function buildDeterministicClusterGraph( export function buildDeterministicClusterGraphFromFingerprints( nodes: DeterministicClusterNode[], fingerprints: Map, - params: { maxBucketSize?: number; topK?: number } = {}, + params: { maxBucketSize?: number; topK?: number; seedThreadIds?: number[] } = {}, ): DeterministicClusterResult { const titleById = new Map(nodes.map((node) => [node.id, node.title])); + const seedThreadIds = params.seedThreadIds ? new Set(params.seedThreadIds) : undefined; const pairs = buildCandidatePairs(fingerprints, { maxBucketSize: params.maxBucketSize ?? 500, topK: params.topK ?? 64, + seedThreadIds, }); const edges: DeterministicClusterEdge[] = []; for (const [leftThreadId, rightThreadId] of pairs) { @@ -136,8 +141,17 @@ export function buildDeterministicClusterGraphFromFingerprints( }); } + const clusterNodeIds = new Set(); + if (seedThreadIds) { + for (const id of seedThreadIds) clusterNodeIds.add(id); + for (const edge of edges) { + clusterNodeIds.add(edge.leftThreadId); + clusterNodeIds.add(edge.rightThreadId); + } + } + const clusterNodes = seedThreadIds ? nodes.filter((node) => clusterNodeIds.has(node.id)) : nodes; const clusters = buildClusters( - nodes.map((node) => ({ + clusterNodes.map((node) => ({ threadId: node.id, number: node.number, title: titleById.get(node.id) ?? node.title, diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 1f01470..2d976a6 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -53,7 +53,7 @@ function upsertTextBlob( return upsertInlineBlob(db, params); } -export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster'; +export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster' | 'cluster_incremental'; export function upsertActor( db: SqliteDatabase, diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 801ca58..b6de130 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -2661,6 +2661,80 @@ test('clusterRepository materializes only changed deterministic fingerprints', a } }); +test('clusterRepository can refresh one durable neighborhood without replacing the full snapshot', async () => { + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + checkAuth: async () => undefined, + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Download retry hangs forever', 'The transfer retry loop never exits after timeout.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Download retry loop never exits', 'Retry hangs forever after timeout.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Improve documentation typography', 'Docs heading sizes look inconsistent.', 'carol', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); + + const full = await service.clusterRepository({ owner: 'openclaw', repo: 'openclaw', k: 1, minScore: 0.1 }); + service.db + .prepare('update threads set body = ?, content_hash = ?, updated_at_gh = ?, updated_at = ? where id = ?') + .run('The transfer retry loop never exits after a network timeout.', 'hash-42b', '2026-03-10T00:00:00Z', '2026-03-10T00:00:00Z', 10); + + const messages: string[] = []; + const incremental = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + threadNumber: 42, + k: 1, + minScore: 0.1, + onProgress: (message) => messages.push(message), + }); + + const fullSnapshotClusters = service.db + .prepare('select count(*) as count from clusters where cluster_run_id = ?') + .get(full.runId) as { count: number }; + const incrementalSnapshotClusters = service.db + .prepare('select count(*) as count from clusters where cluster_run_id = ?') + .get(incremental.runId) as { count: number }; + const incrementalRun = service.db + .prepare("select run_kind from pipeline_runs where run_kind = 'cluster_incremental' order by id desc limit 1") + .get() as { run_kind: string } | undefined; + + assert.ok(messages.some((message) => message.includes('[fingerprint] latest revisions computed=1 skipped=0'))); + assert.ok(messages.some((message) => message.includes('without replacing the full cluster snapshot'))); + assert.ok(fullSnapshotClusters.count > 0); + assert.equal(incrementalSnapshotClusters.count, 0); + assert.equal(incrementalRun?.run_kind, 'cluster_incremental'); + } finally { + service.close(); + } +}); + test('clusterRepository uses hydrated code hunk signatures without embeddings', async () => { const service = new GHCrawlService({ config: makeTestConfig(), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index a66be34..85269e4 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1991,18 +1991,21 @@ export class GHCrawlService { async clusterRepository(params: { owner: string; repo: string; + threadNumber?: number; minScore?: number; k?: number; onProgress?: (message: string) => void; }): Promise { const repository = this.requireRepository(params.owner, params.repo); - const runId = this.startRun('cluster_runs', repository.id, repository.fullName); + const runSubject = params.threadNumber ? `${repository.fullName}#${params.threadNumber}` : repository.fullName; + const runId = this.startRun('cluster_runs', repository.id, runSubject); const pipelineRunId = createPipelineRun(this.db, { repoId: repository.id, - runKind: 'cluster', + runKind: params.threadNumber ? 'cluster_incremental' : 'cluster', algorithmVersion: 'persistent-cluster-v1', configHash: stableContentHash( JSON.stringify({ + threadNumber: params.threadNumber ?? null, minScore: params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, k: params.k ?? 6, embedModel: this.config.embedModel, @@ -2014,15 +2017,32 @@ export class GHCrawlService { const k = params.k ?? 6; try { + const seedThread = params.threadNumber + ? (this.db + .prepare( + `select id, number + from threads + where repo_id = ? + and number = ? + and state = 'open' + and closed_at_local is null + limit 1`, + ) + .get(repository.id, params.threadNumber) as { id: number; number: number } | undefined) + : undefined; + if (params.threadNumber && !seedThread) { + throw new Error(`Open thread #${params.threadNumber} was not found for ${repository.fullName}.`); + } + const seedThreadIds = seedThread ? [seedThread.id] : undefined; const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); - this.materializeLatestDeterministicFingerprints(deterministicItems, params.onProgress); + const fingerprintItems = seedThreadIds ? deterministicItems.filter((item) => seedThreadIds.includes(item.id)) : deterministicItems; + this.materializeLatestDeterministicFingerprints(fingerprintItems, params.onProgress); const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); const deterministic = buildDeterministicClusterGraphFromFingerprints( deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), persistedFingerprints, - { topK: Math.max(k * 8, 64) }, + { topK: Math.max(k * 8, 64), seedThreadIds }, ); - const items = deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })); const aggregatedEdges = new Map }>(); this.mergeSourceKindEdges( aggregatedEdges, @@ -2030,11 +2050,12 @@ export class GHCrawlService { 'deterministic_fingerprint', ); params.onProgress?.( - `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${repository.fullName}`, + `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${runSubject}`, ); if (this.isRepoVectorStateCurrent(repository.id)) { const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); + const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; const activeSourceKind = this.activeVectorSourceKind(); const activeIds = new Set(vectorItems.map((item) => item.id)); const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k); @@ -2042,9 +2063,9 @@ export class GHCrawlService { let lastProgressAt = Date.now(); params.onProgress?.( - `[cluster] loaded ${vectorItems.length} active vector(s) for ${repository.fullName} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`, + `[cluster] loaded ${vectorItems.length} active vector(s), querying ${queryVectorItems.length} for ${runSubject} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`, ); - for (const item of vectorItems) { + for (const item of queryVectorItems) { const neighbors = this.queryNearestWithRecovery(repository.id, repository.fullName, { vector: item.embedding, limit: annQuery.limit, @@ -2070,11 +2091,11 @@ export class GHCrawlService { processed += 1; const now = Date.now(); if (params.onProgress && now - lastProgressAt >= CLUSTER_PROGRESS_INTERVAL_MS) { - params.onProgress(`[cluster] queried ${processed}/${vectorItems.length} vectors current_edges=${aggregatedEdges.size}`); + params.onProgress(`[cluster] queried ${processed}/${queryVectorItems.length} vectors current_edges=${aggregatedEdges.size}`); lastProgressAt = now; } } - } else if (this.hasLegacyEmbeddings(repository.id)) { + } else if (!seedThreadIds && this.hasLegacyEmbeddings(repository.id)) { const legacy = this.loadClusterableThreadMeta(repository.id); params.onProgress?.( `[cluster] loaded ${legacy.items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`, @@ -2103,22 +2124,40 @@ export class GHCrawlService { params.onProgress?.(`[cluster] built ${edges.length} similarity edge(s)`); + const involvedIds = new Set(); + if (seedThreadIds) { + for (const id of seedThreadIds) involvedIds.add(id); + for (const edge of aggregatedEdges.values()) { + involvedIds.add(edge.leftThreadId); + involvedIds.add(edge.rightThreadId); + } + } + const clusterItems = seedThreadIds ? deterministicItems.filter((item) => involvedIds.has(item.id)) : deterministicItems; const clusters = buildClusters( - items.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), + clusterItems.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges, ); - this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); + if (!seedThreadIds) { + this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); + } this.persistDurableClusterState(repository.id, pipelineRunId, aggregatedEdges, clusters); - this.pruneOldClusterRuns(repository.id, runId); - if (this.isRepoVectorStateCurrent(repository.id)) { + if (!seedThreadIds) { + this.pruneOldClusterRuns(repository.id, runId); + } + if (!seedThreadIds && this.isRepoVectorStateCurrent(repository.id)) { this.markRepoClustersCurrent(repository.id); this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress); } - params.onProgress?.(`[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`); + params.onProgress?.( + seedThreadIds + ? `[cluster] persisted ${clusters.length} durable neighborhood cluster(s) without replacing the full cluster snapshot` + : `[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`, + ); - this.finishRun('cluster_runs', runId, 'completed', { edges: edges.length, clusters: clusters.length }); - finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats: { edges: edges.length, clusters: clusters.length } }); + const stats = { edges: edges.length, clusters: clusters.length, threadNumber: params.threadNumber ?? null }; + this.finishRun('cluster_runs', runId, 'completed', stats); + finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats }); return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length }); } catch (error) { this.finishRun('cluster_runs', runId, 'failed', null, error); From 84a6b3e52a071e69d5cb5b2320ae32b554e02748 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:21:20 -0700 Subject: [PATCH 053/215] fix(db): spill full PR diffs --- .../src/cluster/persistent-store.test.ts | 26 ++++++++++++++----- .../api-core/src/cluster/persistent-store.ts | 21 +++++++++++++-- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index 7e0091f..aa01362 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -258,11 +258,12 @@ test('persistent cluster store records code snapshots, changed files, and hunk s signature, }); - const snapshot = db.prepare('select files_changed, additions, deletions, patch_digest from thread_code_snapshots where id = ?').get(snapshotId) as { + const snapshot = db.prepare('select files_changed, additions, deletions, patch_digest, raw_diff_blob_id from thread_code_snapshots where id = ?').get(snapshotId) as { files_changed: number; additions: number; deletions: number; patch_digest: string; + raw_diff_blob_id: number; }; const file = db.prepare('select path, patch_blob_id from thread_changed_files where snapshot_id = ?').get(snapshotId) as { path: string; @@ -270,12 +271,11 @@ test('persistent cluster store records code snapshots, changed files, and hunk s }; const hunkCount = db.prepare('select count(*) as count from thread_hunk_signatures where snapshot_id = ?').get(snapshotId) as { count: number }; - assert.deepEqual(snapshot, { - files_changed: 1, - additions: 1, - deletions: 1, - patch_digest: signature.patchDigest, - }); + assert.equal(snapshot.files_changed, 1); + assert.equal(snapshot.additions, 1); + assert.equal(snapshot.deletions, 1); + assert.equal(snapshot.patch_digest, signature.patchDigest); + assert.ok(snapshot.raw_diff_blob_id > 0); assert.equal(file.path, 'packages/api-core/src/cache.ts'); assert.ok(file.patch_blob_id > 0); assert.equal(hunkCount.count, 1); @@ -329,6 +329,18 @@ test('persistent cluster store keeps large code patches out of SQLite', () => { assert.equal(blob.inline_text, null); assert.ok(blob.storage_path); assert.ok(fs.existsSync(path.join(storeRoot, blob.storage_path))); + const rawDiffBlob = db + .prepare( + `select b.storage_kind, b.storage_path, b.inline_text + from thread_code_snapshots s + join blobs b on b.id = s.raw_diff_blob_id + where s.id = ?`, + ) + .get(snapshotId) as { storage_kind: string; storage_path: string | null; inline_text: string | null }; + assert.equal(rawDiffBlob.storage_kind, 'file'); + assert.equal(rawDiffBlob.inline_text, null); + assert.ok(rawDiffBlob.storage_path); + assert.ok(fs.existsSync(path.join(storeRoot, rawDiffBlob.storage_path))); } finally { db.close(); fs.rmSync(storeRoot, { recursive: true, force: true }); diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 2d976a6..f945151 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -270,17 +270,33 @@ export function upsertThreadCodeSnapshot( const filesChanged = params.signature.files.length; const additions = params.signature.files.reduce((sum, file) => sum + file.additions, 0); const deletions = params.signature.files.reduce((sum, file) => sum + file.deletions, 0); + const rawDiff = params.signature.files + .filter((file) => file.patch) + .map((file) => { + const previous = file.previousFilename ?? file.filename; + return [`diff --git a/${previous} b/${file.filename}`, file.patch].join('\n'); + }) + .join('\n'); + const rawDiffBlobId = + rawDiff.length > 0 + ? upsertTextBlob(db, { + text: rawDiff, + mediaType: 'text/x-diff', + storeRoot: params.storeRoot, + }) + : null; db.prepare( `insert into thread_code_snapshots ( thread_revision_id, base_sha, head_sha, files_changed, additions, deletions, patch_digest, raw_diff_blob_id, created_at - ) values (?, ?, ?, ?, ?, ?, ?, null, ?) + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?) on conflict(thread_revision_id) do update set base_sha = excluded.base_sha, head_sha = excluded.head_sha, files_changed = excluded.files_changed, additions = excluded.additions, deletions = excluded.deletions, - patch_digest = excluded.patch_digest`, + patch_digest = excluded.patch_digest, + raw_diff_blob_id = excluded.raw_diff_blob_id`, ).run( params.threadRevisionId, params.baseSha ?? null, @@ -289,6 +305,7 @@ export function upsertThreadCodeSnapshot( additions, deletions, params.signature.patchDigest, + rawDiffBlobId, timestamp, ); const snapshot = db From 255f5d262548e91b83f2f8f29d85e27d8c521452 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 22:22:27 -0700 Subject: [PATCH 054/215] refactor(ai): name key summary providers --- packages/api-core/src/openai/provider.ts | 3 +++ packages/api-core/src/service.test.ts | 4 +++- packages/api-core/src/service.ts | 7 ++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index 5332d7b..71ee588 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -21,6 +21,7 @@ export type SummaryUsage = { }; export type AiProvider = { + providerName?: string; checkAuth: () => Promise; summarizeThread: (params: { model: string; text: string }) => Promise<{ summary: SummaryResult; usage?: SummaryUsage }>; generateKeySummary?: (params: { model: string; text: string }) => Promise<{ summary: LlmKeySummary; usage?: SummaryUsage }>; @@ -35,6 +36,8 @@ const summarySchema = z.object({ }); export class OpenAiProvider implements AiProvider { + readonly providerName = 'openai'; + private readonly client: OpenAI; constructor(apiKey: string) { diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index b6de130..689b30d 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -813,6 +813,7 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { listPullFiles: async () => [], }, { + providerName: 'test-agent', checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); @@ -863,7 +864,8 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { assert.equal(first.totalTokens, 15); assert.equal(second.skipped, 1); assert.equal(calls, 1); - const row = service.db.prepare('select key_text from thread_key_summaries').get() as { key_text: string }; + const row = service.db.prepare('select provider, key_text from thread_key_summaries').get() as { provider: string; key_text: string }; + assert.equal(row.provider, 'test-agent'); assert.match(row.key_text, /intent: Fix retry loop\./); } finally { service.close(); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 85269e4..b978492 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1772,6 +1772,7 @@ export class GHCrawlService { if (!ai.generateKeySummary) { throw new Error('Configured AI provider does not support key summary generation.'); } + const providerName = ai.providerName ?? 'custom'; const repository = this.requireRepository(params.owner, params.repo); const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); @@ -1831,11 +1832,11 @@ export class GHCrawlService { where thread_revision_id = ? and summary_kind = 'llm_key_3line' and prompt_version = ? - and provider = 'openai' + and provider = ? and model = ? limit 1`, ) - .get(revisionId, LLM_KEY_SUMMARY_PROMPT_VERSION, this.config.summaryModel) as { input_hash: string } | undefined; + .get(revisionId, LLM_KEY_SUMMARY_PROMPT_VERSION, providerName, this.config.summaryModel) as { input_hash: string } | undefined; if (existing?.input_hash === inputHash) { skipped += 1; continue; @@ -1849,7 +1850,7 @@ export class GHCrawlService { threadRevisionId: revisionId, summaryKind: 'llm_key_3line', promptVersion: LLM_KEY_SUMMARY_PROMPT_VERSION, - provider: 'openai', + provider: providerName, model: this.config.summaryModel, inputHash, summary: result.summary, From f8fe3be97d4e575d9eb1c4765596bd21b0a3d2ed Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:17:14 -0700 Subject: [PATCH 055/215] feat(cluster): explain durable clusters --- apps/cli/src/main.test.ts | 39 ++++ apps/cli/src/main.ts | 36 ++++ packages/api-contract/src/client.ts | 14 ++ packages/api-contract/src/contracts.test.ts | 63 +++++++ packages/api-contract/src/contracts.ts | 47 +++++ packages/api-core/src/api/server.ts | 27 +++ packages/api-core/src/service.test.ts | 76 ++++++++ packages/api-core/src/service.ts | 190 ++++++++++++++++++++ 8 files changed, 492 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index decf4d2..a09bfb2 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -282,6 +282,7 @@ test('agent-facing command help advertises explicit --json', async () => { 'clusters', 'durable-clusters', 'cluster-detail', + 'cluster-explain', 'search', 'neighbors', ] as const) { @@ -568,6 +569,44 @@ test('durable-clusters command forwards stable cluster list options', async () = assert.match(stdout.read(), /trace-alpha-river/); }); +test('cluster-explain command forwards evidence options', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.explainDurableCluster; + let received: unknown; + + GHCrawlService.prototype.explainDurableCluster = function explainDurableClusterStub(params: unknown) { + received = params; + return { + repository: { fullName: 'openclaw/openclaw' }, + cluster: { clusterId: 7, stableSlug: 'trace-alpha-river', members: [] }, + evidence: [{ sources: ['deterministic_fingerprint'] }], + overrides: [], + aliases: [], + events: [], + } as never; + }; + + try { + await run(['cluster-explain', 'openclaw/openclaw', '--id', '7', '--member-limit', '4', '--event-limit', '9'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.explainDurableCluster = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + memberLimit: 4, + eventLimit: 9, + }); + assert.match(stdout.read(), /deterministic_fingerprint/); +}); + test('long-running command progress stays on stderr and payload stays on stdout', async () => { const stdout = createWritableCapture(); const stderr = createWritableCapture(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 81b945c..46b3531 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -35,6 +35,7 @@ type CommandName = | 'clusters' | 'durable-clusters' | 'cluster-detail' + | 'cluster-explain' | 'search' | 'neighbors' | 'tui' @@ -329,6 +330,19 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl cluster-detail openclaw/openclaw --id 123 --member-limit 20 --body-chars 280 --json'], agentJson: true, }, + { + name: 'cluster-explain', + synopsis: 'cluster-explain --id [--member-limit ] [--event-limit ] [--json]', + description: 'Explain one durable cluster with evidence, overrides, aliases, and event history.', + options: [ + '--id Durable cluster id to inspect', + '--member-limit Limit member rows and evidence scope', + '--event-limit Limit event history rows', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl cluster-explain openclaw/openclaw --id 123 --member-limit 20 --event-limit 50 --json'], + agentJson: true, + }, { name: 'durable-clusters', synopsis: 'durable-clusters [--include-inactive] [--member-limit ] [--json]', @@ -578,6 +592,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo search: { type: 'string' }, 'min-size': { type: 'string' }, 'member-limit': { type: 'string' }, + 'event-limit': { type: 'string' }, 'body-chars': { type: 'string' }, 'no-sync': { type: 'boolean' }, 'no-embed': { type: 'boolean' }, @@ -1342,6 +1357,27 @@ export async function run( writeJson(stdout, result); return; } + case 'cluster-explain': { + const { owner, repo, values } = parseRepoFlags('cluster-explain', rest); + if (typeof values.id !== 'string') { + throw new CliUsageError('Missing --id', 'cluster-explain'); + } + const result = getService().explainDurableCluster({ + owner, + repo, + clusterId: parsePositiveInteger('id', values.id, 'cluster-explain'), + memberLimit: + typeof values['member-limit'] === 'string' + ? parsePositiveInteger('member-limit', values['member-limit'], 'cluster-explain') + : undefined, + eventLimit: + typeof values['event-limit'] === 'string' + ? parsePositiveInteger('event-limit', values['event-limit'], 'cluster-explain') + : undefined, + }); + writeJson(stdout, result); + return; + } case 'search': { const { owner, repo, values } = parseRepoFlags('search', rest); if (typeof values.query !== 'string') { diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 60492be..8c791b0 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -6,6 +6,7 @@ import { closeThreadRequestSchema, authorThreadsResponseSchema, clusterDetailResponseSchema, + clusterExplainResponseSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, clusterSplitResponseSchema, @@ -30,6 +31,7 @@ import { type ClusterSplitResponse, type AuthorThreadsResponse, type ClusterDetailResponse, + type ClusterExplainResponse, type ClusterSummariesResponse, type ClustersResponse, type HealthResponse, @@ -65,6 +67,7 @@ export type GitcrawlClient = { bodyChars?: number; includeClosed?: boolean; }) => Promise; + explainCluster: (params: { owner: string; repo: string; clusterId: number; memberLimit?: number; eventLimit?: number }) => Promise; refresh: (request: RefreshRequest) => Promise; rerun: (request: ActionRequest) => Promise; closeThread: (request: { owner: string; repo: string; threadNumber: number }) => Promise; @@ -151,6 +154,17 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet const res = await fetchImpl(`${normalized}/cluster-detail?${search.toString()}`); return readJson(res, clusterDetailResponseSchema); }, + async explainCluster(params) { + const search = new URLSearchParams({ + owner: params.owner, + repo: params.repo, + clusterId: String(params.clusterId), + }); + if (params.memberLimit !== undefined) search.set('memberLimit', String(params.memberLimit)); + if (params.eventLimit !== undefined) search.set('eventLimit', String(params.eventLimit)); + const res = await fetchImpl(`${normalized}/cluster-explain?${search.toString()}`); + return readJson(res, clusterExplainResponseSchema); + }, async refresh(request) { const body = refreshRequestSchema.parse(request); const res = await fetchImpl(`${normalized}/actions/refresh`, { diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 9ca92e7..36545db 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -3,6 +3,7 @@ import assert from 'node:assert/strict'; import { actionRequestSchema, + clusterExplainResponseSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, clusterSplitResponseSchema, @@ -290,6 +291,68 @@ test('durable clusters response accepts stable slugs and governed member states' assert.equal(parsed.clusters[0]?.stableSlug, 'trace-alpha-river'); }); +test('cluster explain response accepts evidence and governance records', () => { + const thread = { + id: 10, + repoId: 1, + number: 42, + kind: 'issue' as const, + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Downloader hangs', + body: 'The transfer never finishes.', + authorLogin: 'alice', + htmlUrl: 'https://github.com/openclaw/openclaw/issues/42', + labels: ['bug'], + updatedAtGh: new Date().toISOString(), + clusterId: null, + }; + const parsed = clusterExplainResponseSchema.parse({ + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + cluster: { + clusterId: 7, + stableKey: 'abc123', + stableSlug: 'trace-alpha-river', + status: 'active', + clusterType: 'duplicate_candidate', + title: 'Cluster trace-alpha-river', + representativeThreadId: 10, + activeCount: 1, + removedCount: 0, + blockedCount: 0, + members: [{ thread, role: 'canonical', state: 'active', scoreToRepresentative: 1 }], + }, + aliases: [{ aliasSlug: 'old-slug', reason: 'merged_from:3', createdAt: '2026-03-09T00:00:00Z' }], + overrides: [{ threadNumber: 42, action: 'force_canonical', reason: 'best root issue', createdAt: '2026-03-09T00:00:00Z', expiresAt: null }], + events: [{ eventType: 'keep_canonical', actorKind: 'algo', payload: { threadId: 10 }, createdAt: '2026-03-09T00:00:00Z' }], + evidence: [ + { + leftThreadNumber: 42, + rightThreadNumber: 43, + score: 0.91, + tier: 'strong', + state: 'active', + sources: ['deterministic_fingerprint'], + breakdown: { score: 0.91 }, + lastSeenRunId: 5, + updatedAt: '2026-03-09T00:00:00Z', + }, + ], + }); + + assert.equal(parsed.evidence[0]?.sources[0], 'deterministic_fingerprint'); +}); + test('neighbors schema accepts repository, source thread, and neighbor list', () => { const parsed = neighborsResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index b0ad605..d4d5424 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -219,6 +219,53 @@ export const clusterDetailResponseSchema = z.object({ }); export type ClusterDetailResponse = z.infer; +export const clusterExplainAliasSchema = z.object({ + aliasSlug: z.string(), + reason: z.string(), + createdAt: z.string(), +}); +export type ClusterExplainAliasDto = z.infer; + +export const clusterExplainOverrideSchema = z.object({ + threadNumber: z.number().int().positive(), + action: z.enum(['exclude', 'force_include', 'force_canonical']), + reason: z.string().nullable(), + createdAt: z.string(), + expiresAt: z.string().nullable(), +}); +export type ClusterExplainOverrideDto = z.infer; + +export const clusterExplainEventSchema = z.object({ + eventType: z.string(), + actorKind: z.string(), + payload: z.record(z.string(), z.unknown()).nullable(), + createdAt: z.string(), +}); +export type ClusterExplainEventDto = z.infer; + +export const clusterExplainEvidenceSchema = z.object({ + leftThreadNumber: z.number().int().positive(), + rightThreadNumber: z.number().int().positive(), + score: z.number(), + tier: z.enum(['strong', 'weak']), + state: z.enum(['active', 'stale', 'rejected']), + sources: z.array(z.string()), + breakdown: z.record(z.string(), z.unknown()), + lastSeenRunId: z.number().int().positive().nullable(), + updatedAt: z.string(), +}); +export type ClusterExplainEvidenceDto = z.infer; + +export const clusterExplainResponseSchema = z.object({ + repository: repositorySchema, + cluster: durableClusterSchema, + aliases: z.array(clusterExplainAliasSchema), + overrides: z.array(clusterExplainOverrideSchema), + events: z.array(clusterExplainEventSchema), + evidence: z.array(clusterExplainEvidenceSchema), +}); +export type ClusterExplainResponse = z.infer; + export const syncResultSchema = z.object({ runId: z.number().int().positive(), threadsSynced: z.number().int().nonnegative(), diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 2a79753..038f01d 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -193,6 +193,33 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'GET' && url.pathname === '/cluster-explain') { + const params = parseRepoParams(url); + const clusterIdValue = url.searchParams.get('clusterId'); + if (!clusterIdValue) { + sendJson(res, 400, { error: 'Missing clusterId parameter' }); + return; + } + const clusterId = Number(clusterIdValue); + if (!Number.isInteger(clusterId) || clusterId <= 0) { + sendJson(res, 400, { error: 'Invalid clusterId parameter' }); + return; + } + const memberLimitValue = url.searchParams.get('memberLimit'); + const eventLimitValue = url.searchParams.get('eventLimit'); + sendJson( + res, + 200, + service.explainDurableCluster({ + ...params, + clusterId, + memberLimit: memberLimitValue ? Number(memberLimitValue) : undefined, + eventLimit: eventLimitValue ? Number(eventLimitValue) : undefined, + }), + ); + return; + } + if (req.method === 'POST' && url.pathname === '/actions/rerun') { const body = actionRequestSchema.parse(await readBody(req)); sendJson(res, 200, await service.rerunAction(body)); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 689b30d..a2d8769 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4016,6 +4016,82 @@ test('listDurableClusters returns stable slugs and governed member states', () = } }); +test('explainDurableCluster returns evidence and governance records', () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + const now = '2026-03-10T12:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Issue one', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Issue two', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + const insertMembership = service.db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertMembership.run(7, 10, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + insertMembership.run(7, 11, 'related', 'active', 0.91, null, null, 'algo', null, '{}', null, now, now, null); + service.db + .prepare( + `insert into similarity_edge_evidence ( + repo_id, left_thread_id, right_thread_id, algorithm_version, config_hash, score, tier, state, + breakdown_json, first_seen_run_id, last_seen_run_id, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 10, 11, 'persistent-cluster-v1', 'config', 0.91, 'strong', 'active', '{"sources":["deterministic_fingerprint"],"score":0.91}', null, null, now, now); + service.db + .prepare('insert into cluster_overrides (repo_id, cluster_id, thread_id, action, reason, created_at, expires_at) values (?, ?, ?, ?, ?, ?, ?)') + .run(1, 7, 10, 'force_canonical', 'best root issue', now, null); + service.db + .prepare('insert into cluster_aliases (cluster_id, alias_slug, reason, created_at) values (?, ?, ?, ?)') + .run(7, 'old-slug', 'merged_from:3', now); + service.db + .prepare('insert into cluster_events (cluster_id, run_id, event_type, actor_kind, payload_json, created_at) values (?, ?, ?, ?, ?, ?)') + .run(7, null, 'keep_canonical', 'algo', '{"threadId":10}', now); + + const response = service.explainDurableCluster({ owner: 'openclaw', repo: 'openclaw', clusterId: 7 }); + + assert.equal(response.cluster.stableSlug, 'trace-alpha-river'); + assert.equal(response.evidence[0]?.leftThreadNumber, 42); + assert.equal(response.evidence[0]?.sources[0], 'deterministic_fingerprint'); + assert.equal(response.overrides[0]?.action, 'force_canonical'); + assert.equal(response.aliases[0]?.aliasSlug, 'old-slug'); + assert.deepEqual(response.events[0]?.payload, { threadId: 10 }); + } finally { + service.close(); + } +}); + test('syncRepository records actors and repo stats from thread and comment authors', async () => { const service = makeTestService({ checkAuth: async () => undefined, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b978492..7bdc0f4 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -17,6 +17,7 @@ import { clusterMergeResponseSchema, clusterSplitResponseSchema, clusterDetailResponseSchema, + clusterExplainResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, @@ -37,6 +38,7 @@ import { type ClusterOverrideResponse, type ClusterSplitResponse, type ClusterDetailResponse, + type ClusterExplainResponse, type ClusterDto, type ClusterResultDto, type ClusterSummariesResponse, @@ -507,6 +509,16 @@ function parseStringArrayJson(value: string | null | undefined): string[] { } } +function parseObjectJson(value: string | null | undefined): Record | null { + if (!value) return null; + try { + const parsed = JSON.parse(value) as unknown; + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? (parsed as Record) : null; + } catch { + return null; + } +} + function userLogin(payload: Record): string | null { const user = payload.user as Record | undefined; const login = user?.login; @@ -2881,6 +2893,184 @@ export class GHCrawlService { }); } + explainDurableCluster(params: { owner: string; repo: string; clusterId: number; memberLimit?: number; eventLimit?: number }): ClusterExplainResponse { + const repository = this.requireRepository(params.owner, params.repo); + const cluster = this.db + .prepare( + `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title + from cluster_groups + where repo_id = ? + and id = ? + limit 1`, + ) + .get(repository.id, params.clusterId) as + | { + id: number; + stable_key: string; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + cluster_type: string | null; + representative_thread_id: number | null; + title: string | null; + } + | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); + } + + const allMembers = this.db + .prepare( + `select + cm.role as membership_role, + cm.state as membership_state, + cm.score_to_representative as membership_score, + t.* + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + order by + case cm.role when 'canonical' then 0 else 1 end, + case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, + t.number asc`, + ) + .all(cluster.id) as Array< + ThreadRow & { + membership_role: 'canonical' | 'duplicate' | 'related'; + membership_state: 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; + membership_score: number | null; + } + >; + const visibleMembers = allMembers.slice(0, params.memberLimit ?? 50); + const visibleThreadIds = visibleMembers.map((row) => row.id); + + const aliases = this.db + .prepare( + `select alias_slug, reason, created_at + from cluster_aliases + where cluster_id = ? + order by created_at desc, alias_slug asc`, + ) + .all(cluster.id) as Array<{ alias_slug: string; reason: string; created_at: string }>; + const overrides = this.db + .prepare( + `select t.number, co.action, co.reason, co.created_at, co.expires_at + from cluster_overrides co + join threads t on t.id = co.thread_id + where co.cluster_id = ? + order by co.created_at desc, t.number asc`, + ) + .all(cluster.id) as Array<{ number: number; action: 'exclude' | 'force_include' | 'force_canonical'; reason: string | null; created_at: string; expires_at: string | null }>; + const events = this.db + .prepare( + `select event_type, actor_kind, payload_json, created_at + from cluster_events + where cluster_id = ? + order by created_at desc, id desc + limit ?`, + ) + .all(cluster.id, params.eventLimit ?? 25) as Array<{ event_type: string; actor_kind: string; payload_json: string; created_at: string }>; + + let evidence: Array<{ + leftThreadNumber: number; + rightThreadNumber: number; + score: number; + tier: 'strong' | 'weak'; + state: 'active' | 'stale' | 'rejected'; + sources: string[]; + breakdown: Record; + lastSeenRunId: number | null; + updatedAt: string; + }> = []; + if (visibleThreadIds.length >= 2) { + const placeholders = visibleThreadIds.map(() => '?').join(','); + const rows = this.db + .prepare( + `select + le.number as left_number, + re.number as right_number, + e.score, + e.tier, + e.state, + e.breakdown_json, + e.last_seen_run_id, + e.updated_at + from similarity_edge_evidence e + join threads le on le.id = e.left_thread_id + join threads re on re.id = e.right_thread_id + where e.repo_id = ? + and e.left_thread_id in (${placeholders}) + and e.right_thread_id in (${placeholders}) + order by e.score desc, le.number asc, re.number asc`, + ) + .all(repository.id, ...visibleThreadIds, ...visibleThreadIds) as Array<{ + left_number: number; + right_number: number; + score: number; + tier: 'strong' | 'weak'; + state: 'active' | 'stale' | 'rejected'; + breakdown_json: string; + last_seen_run_id: number | null; + updated_at: string; + }>; + evidence = rows.map((row) => { + const breakdown = parseObjectJson(row.breakdown_json) ?? {}; + const rawSources = breakdown.sources; + return { + leftThreadNumber: row.left_number, + rightThreadNumber: row.right_number, + score: row.score, + tier: row.tier, + state: row.state, + sources: Array.isArray(rawSources) ? rawSources.filter((source): source is string => typeof source === 'string') : [], + breakdown, + lastSeenRunId: row.last_seen_run_id, + updatedAt: row.updated_at, + }; + }); + } + + return clusterExplainResponseSchema.parse({ + repository, + cluster: { + clusterId: cluster.id, + stableKey: cluster.stable_key, + stableSlug: cluster.stable_slug, + status: cluster.status, + clusterType: cluster.cluster_type, + title: cluster.title, + representativeThreadId: cluster.representative_thread_id, + activeCount: allMembers.filter((row) => row.membership_state === 'active').length, + removedCount: allMembers.filter((row) => row.membership_state === 'removed_by_user').length, + blockedCount: allMembers.filter((row) => row.membership_state === 'blocked_by_override').length, + members: visibleMembers.map((row) => ({ + thread: threadToDto(row), + role: row.membership_role, + state: row.membership_state, + scoreToRepresentative: row.membership_score, + })), + }, + aliases: aliases.map((alias) => ({ + aliasSlug: alias.alias_slug, + reason: alias.reason, + createdAt: alias.created_at, + })), + overrides: overrides.map((override) => ({ + threadNumber: override.number, + action: override.action, + reason: override.reason, + createdAt: override.created_at, + expiresAt: override.expires_at, + })), + events: events.map((event) => ({ + eventType: event.event_type, + actorKind: event.actor_kind, + payload: parseObjectJson(event.payload_json), + createdAt: event.created_at, + })), + evidence, + }); + } + async refreshRepository(params: { owner: string; repo: string; From f0018928024260e1aa442ba1c6f8e335501a6a15 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:17:48 -0700 Subject: [PATCH 056/215] docs: document durable cluster governance --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index b409617..0eecfcb 100644 --- a/README.md +++ b/README.md @@ -243,8 +243,10 @@ ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --include-closed --json +ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json ghcrawl cluster-detail owner/repo --id 123 --include-closed --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl search owner/repo --query "download stalls" --json ``` @@ -258,6 +260,25 @@ Use `close-thread` when you know a local issue/PR should be treated as closed be Use `close-cluster` when you want to locally suppress a whole cluster from default JSON exploration without waiting for a rebuild. +## Durable Cluster Governance + +The durable cluster commands operate on stable cluster identities, not one-off run snapshots: + +```bash +ghcrawl durable-clusters owner/repo --member-limit 10 --json +ghcrawl cluster-explain owner/repo --id 123 --json +ghcrawl exclude-cluster-member owner/repo --id 123 --number 42 --reason "false positive" --json +ghcrawl include-cluster-member owner/repo --id 123 --number 42 --reason "same root cause" --json +ghcrawl set-cluster-canonical owner/repo --id 123 --number 42 --reason "best root issue" --json +ghcrawl merge-clusters owner/repo --source 123 --target 456 --reason "same incident" --json +ghcrawl split-cluster owner/repo --source 123 --numbers 42,43 --reason "separate root cause" --json +ghcrawl cluster owner/repo --number 42 --json +``` + +Use `cluster-explain` when you need to answer why a durable cluster exists. It returns the stable slug, aliases, governed memberships, overrides, event history, and pairwise evidence sources such as deterministic fingerprints, hunk overlap, and vector-backed edges. + +Maintainer overrides are sticky. If you exclude a thread from a durable cluster, future clustering records that decision and will not silently re-add it to the same cluster. `cluster --number` refreshes only one durable neighborhood, which is the cheaper path after a small sync or a manual governance edit. + ## Cost To Operate The main variable costs are summarization and embeddings. Embedding pricing is published by OpenAI here: [OpenAI API pricing](https://developers.openai.com/api/docs/pricing#embeddings). @@ -301,6 +322,7 @@ ghcrawl refresh owner/repo ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ``` ### Video Walkthrough From f69e1b737e0cdd7e6bd19939dd5a287cee31fa4a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:24:20 -0700 Subject: [PATCH 057/215] test(api): cover durable cluster explain route --- packages/api-core/src/api/server.test.ts | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 2dce959..5250d35 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -5,6 +5,7 @@ import { authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, + clusterExplainResponseSchema, clusterOverrideResponseSchema, clusterSummariesResponseSchema, durableClustersResponseSchema, @@ -793,6 +794,27 @@ test('cluster summary and detail endpoints return contract payloads', async () = values (?, ?, ?, ?)`, ) .run(100, 10, null, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'active', 'duplicate_candidate', 10, 'Cluster trace-alpha-river', now, now); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 10, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + service.db + .prepare( + `insert into cluster_events (cluster_id, run_id, event_type, actor_kind, payload_json, created_at) + values (?, ?, ?, ?, ?, ?)`, + ) + .run(7, null, 'keep_canonical', 'algo', '{"threadId":10}', now); const server = createApiServer(service); try { @@ -814,6 +836,14 @@ test('cluster summary and detail endpoints return contract payloads', async () = const detail = clusterDetailResponseSchema.parse((await detailResponse.json()) as unknown); assert.equal(detail.cluster.clusterId, 100); assert.equal(detail.members[0]?.thread.number, 42); + + const explainResponse = await fetch( + `http://127.0.0.1:${address.port}/cluster-explain?owner=openclaw&repo=openclaw&clusterId=7`, + ); + assert.equal(explainResponse.status, 200); + const explain = clusterExplainResponseSchema.parse((await explainResponse.json()) as unknown); + assert.equal(explain.cluster.clusterId, 7); + assert.equal(explain.events[0]?.eventType, 'keep_canonical'); } finally { await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); service.close(); From b7f07b59bbc2fe175462651d1770bac65440bdbf Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:24:29 -0700 Subject: [PATCH 058/215] docs: update ghcrawl governance protocol --- SPEC.md | 19 ++++++++++ skills/ghcrawl/SKILL.md | 39 ++++++++++++++++++-- skills/ghcrawl/references/protocol.md | 51 ++++++++++++++++++++++++++- 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/SPEC.md b/SPEC.md index cf99b0f..d6bc713 100644 --- a/SPEC.md +++ b/SPEC.md @@ -131,8 +131,16 @@ The product must keep these machine-facing surfaces working: - `ghcrawl close-cluster owner/repo --id --json` - `ghcrawl embed owner/repo --json` - `ghcrawl cluster owner/repo --json` +- `ghcrawl cluster owner/repo --number --json` - `ghcrawl clusters owner/repo --json` +- `ghcrawl durable-clusters owner/repo --json` - `ghcrawl cluster-detail owner/repo --id --json` +- `ghcrawl cluster-explain owner/repo --id --json` +- `ghcrawl exclude-cluster-member owner/repo --id --number --json` +- `ghcrawl include-cluster-member owner/repo --id --number --json` +- `ghcrawl set-cluster-canonical owner/repo --id --number --json` +- `ghcrawl merge-clusters owner/repo --source --target --json` +- `ghcrawl split-cluster owner/repo --source --numbers --json` - `ghcrawl search owner/repo --query --json` - `ghcrawl neighbors owner/repo --number --json` @@ -146,8 +154,14 @@ The product must keep these machine-facing surfaces working: - `GET /clusters` - `GET /cluster-summaries` - `GET /cluster-detail` +- `GET /cluster-explain` - `POST /actions/rerun` - `POST /actions/refresh` +- `POST /actions/exclude-cluster-member` +- `POST /actions/include-cluster-member` +- `POST /actions/set-cluster-canonical` +- `POST /actions/merge-clusters` +- `POST /actions/split-cluster` ### TUI @@ -174,6 +188,11 @@ That means: - URL - body snippet - stored summary fields when present +- expose durable cluster governance with: + - stable slug and aliases + - maintainer overrides + - pairwise evidence and event history + - explicit merge/split/include/exclude/canonical commands The installable skill lives in: diff --git a/skills/ghcrawl/SKILL.md b/skills/ghcrawl/SKILL.md index 38c44d6..39cc3c5 100644 --- a/skills/ghcrawl/SKILL.md +++ b/skills/ghcrawl/SKILL.md @@ -29,7 +29,7 @@ Current pipeline defaults to keep in mind: - changing summary model or embedding basis with `ghcrawl configure` makes the next refresh rebuild vectors and clusters - opting into `title_summary` can materially improve clustering quality, but it adds OpenAI cost; on `openclaw/openclaw` it improved non-solo cluster membership by about 50% -Also never run `close-thread` or `close-cluster` unless the user explicitly asks you to mark a local thread or cluster closed. +Also never run `close-thread`, `close-cluster`, `exclude-cluster-member`, `include-cluster-member`, `set-cluster-canonical`, `merge-clusters`, or `split-cluster` unless the user explicitly asks you to change local cluster governance. ## When to use this skill @@ -62,7 +62,9 @@ Without explicit user direction to refresh data, prefer these local-only command ```bash ghcrawl threads owner/repo --numbers 12345 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json +ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl author owner/repo --login lqquan --json ghcrawl search owner/repo --query "download stalls" --mode hybrid --json @@ -81,7 +83,7 @@ By default: If the user explicitly wants to inspect those records, add `--include-closed`. -Use `threads --numbers 12345` when you need to find the cluster for one specific issue/PR number. The returned thread record includes `clusterId`. If it is non-null, follow with `cluster-detail --id `. +Use `threads --numbers 12345` when you need to find the cluster for one specific issue/PR number. The returned thread record includes `clusterId`. If it is non-null, follow with `cluster-detail --id ` for snapshot details or `cluster-explain --id ` for durable evidence and governance. Use `configure --json` when you need to confirm the currently selected summary model or embedding basis before suggesting an expensive refresh. @@ -98,6 +100,18 @@ ghcrawl close-cluster owner/repo --id 123 --json If `close-thread` closes the last open item in a cluster, ghcrawl will automatically mark that cluster closed too. +If the user explicitly asks to govern durable cluster membership, use: + +```bash +ghcrawl exclude-cluster-member owner/repo --id 123 --number 42 --reason "false positive" --json +ghcrawl include-cluster-member owner/repo --id 123 --number 42 --reason "same root cause" --json +ghcrawl set-cluster-canonical owner/repo --id 123 --number 42 --reason "best root issue" --json +ghcrawl merge-clusters owner/repo --source 123 --target 456 --reason "same incident" --json +ghcrawl split-cluster owner/repo --source 123 --numbers 42,43 --reason "separate root cause" --json +``` + +Use `cluster --number ` only when the user explicitly asks to refresh one local durable neighborhood after a small sync or governance edit. + ### 2. Check local health only when needed Run: @@ -187,7 +201,26 @@ This returns: - a body snippet - stored summary fields when present -### 7. Optional deeper inspection +### 7. Explain durable evidence and governance + +Use: + +```bash +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json +``` + +This returns: + +- stable durable cluster identity and slug +- governed memberships +- aliases +- maintainer overrides +- event history +- pairwise evidence sources and score breakdowns + +Use this when the user asks why items are together, why an item stayed out, or what a maintainer changed. + +### 8. Optional deeper inspection Use search or neighbors as needed: diff --git a/skills/ghcrawl/references/protocol.md b/skills/ghcrawl/references/protocol.md index 582a4f4..5dae5e9 100644 --- a/skills/ghcrawl/references/protocol.md +++ b/skills/ghcrawl/references/protocol.md @@ -51,6 +51,7 @@ The returned `thread` objects include: If `clusterId` is non-null, follow with: - `ghcrawl cluster-detail owner/repo --id ` +- `ghcrawl cluster-explain owner/repo --id ` when the user asks why the cluster exists or what changed it Useful flags: @@ -156,6 +157,51 @@ Each member includes: By default this hides locally closed clusters; use `--include-closed` when the user explicitly wants them. +### `ghcrawl durable-clusters owner/repo --json` + +Read-only list of durable cluster identities and governed memberships. + +Useful flags: + +- `--include-inactive` +- `--member-limit ` + +Use this when stable cluster slugs, removed members, blocked members, or durable governance state matter more than the latest run snapshot. + +### `ghcrawl cluster-explain owner/repo --id --json` + +Read-only explanation for one durable cluster. + +Useful flags: + +- `--member-limit ` +- `--event-limit ` + +Returns: + +- stable durable identity and slug +- governed memberships +- aliases +- maintainer overrides +- event history +- pairwise evidence sources and score breakdowns + +Use this when the user asks why threads are together, why a thread stayed out, or what maintainer action changed the cluster. + +### Durable governance commands + +These mutate local durable cluster governance. Use them only when the user explicitly asks for that mutation: + +```bash +ghcrawl exclude-cluster-member owner/repo --id 123 --number 42 --reason "false positive" --json +ghcrawl include-cluster-member owner/repo --id 123 --number 42 --reason "same root cause" --json +ghcrawl set-cluster-canonical owner/repo --id 123 --number 42 --reason "best root issue" --json +ghcrawl merge-clusters owner/repo --source 123 --target 456 --reason "same incident" --json +ghcrawl split-cluster owner/repo --source 123 --numbers 42,43 --reason "separate root cause" --json +``` + +After a small sync or governance edit, use `ghcrawl cluster owner/repo --number --json` only when the user explicitly asks to refresh that local durable neighborhood. + ### `ghcrawl close-thread owner/repo --number --json` Marks one local issue/PR closed without waiting for the next GitHub sync. @@ -192,8 +238,10 @@ pnpm --filter ghcrawl cli author owner/repo --login lqquan --json pnpm --filter ghcrawl cli refresh owner/repo pnpm --filter ghcrawl cli clusters owner/repo --min-size 10 --limit 20 --sort recent --json pnpm --filter ghcrawl cli clusters owner/repo --min-size 10 --limit 20 --sort recent --include-closed --json +pnpm --filter ghcrawl cli durable-clusters owner/repo --member-limit 10 --json pnpm --filter ghcrawl cli cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json pnpm --filter ghcrawl cli cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --include-closed --json +pnpm --filter ghcrawl cli cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json pnpm --filter ghcrawl cli close-thread owner/repo --number 42 --json pnpm --filter ghcrawl cli close-cluster owner/repo --id 123 --json ``` @@ -208,4 +256,5 @@ If the supported CLI path still fails, hangs, or returns unusable output, stop a 4. Only if doctor is healthy and the user explicitly asked, run `ghcrawl refresh owner/repo` 5. `ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json` 6. `ghcrawl cluster-detail owner/repo --id --json` -7. optionally `threads`, `author`, `search`, or `neighbors` with `--json` +7. `ghcrawl cluster-explain owner/repo --id --json` when evidence or governance matters +8. optionally `threads`, `author`, `search`, or `neighbors` with `--json` From 435a3be5d7d9390c46864a3c29768150af57804c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:29:03 -0700 Subject: [PATCH 059/215] feat(author): expose actor profile stats --- apps/cli/src/main.test.ts | 64 +++++++ apps/cli/src/main.ts | 4 +- packages/api-contract/src/contracts.test.ts | 41 +++++ packages/api-contract/src/contracts.ts | 35 ++++ packages/api-core/src/api/server.test.ts | 24 +++ packages/api-core/src/api/server.ts | 12 ++ packages/api-core/src/service.test.ts | 6 + packages/api-core/src/service.ts | 177 ++++++++++++++++++++ 8 files changed, 361 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index a09bfb2..837f7e6 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -232,6 +232,70 @@ test('missing required flags exit with code 2 and command-specific hints', async } }); +test('author command returns actor profile stats and threads', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.getAuthor; + let received: unknown; + + GHCrawlService.prototype.getAuthor = function getAuthorStub(params: unknown) { + received = params; + return { + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: '1', + updatedAt: '2026-03-09T00:00:00Z', + }, + authorLogin: 'alice', + actor: { + id: 1, + provider: 'github', + providerUserId: '501', + login: 'alice', + displayName: null, + actorType: 'User', + siteAdmin: false, + firstSeenAt: '2026-03-09T00:00:00Z', + lastSeenAt: '2026-03-09T00:00:00Z', + updatedAt: '2026-03-09T00:00:00Z', + }, + stats: { + openedIssueCount: 1, + openedPullRequestCount: 0, + commentCount: 0, + mergedPullRequestCount: 0, + closedThreadCount: 0, + firstActivityAt: '2026-03-09T00:00:00Z', + lastActivityAt: '2026-03-09T00:00:00Z', + trustTier: 'unknown', + }, + threads: [], + } as never; + }; + + try { + await run(['author', 'openclaw/openclaw', '--login', 'alice', '--include-closed'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.getAuthor = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + login: 'alice', + includeClosed: true, + }); + assert.match(stdout.read(), /"providerUserId": "501"/); + assert.match(stdout.read(), /"openedIssueCount": 1/); +}); + test('invalid enum and value parsing exits with code 2', async () => { const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 46b3531..9dc41b8 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -176,7 +176,7 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ { name: 'author', synopsis: 'author --login [--include-closed] [--json]', - description: 'List local issue and PR records for a single author.', + description: 'Show actor identity, repo stats, and local threads for one author.', options: [ '--login GitHub login to inspect', '--include-closed Include locally closed items', @@ -1096,7 +1096,7 @@ export async function run( if (typeof values.login !== 'string' || values.login.trim().length === 0) { throw new CliUsageError('Missing --login', 'author'); } - const result = getService().listAuthorThreads({ + const result = getService().getAuthor({ owner, repo, login: values.login, diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 36545db..7771ee7 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -3,6 +3,7 @@ import assert from 'node:assert/strict'; import { actionRequestSchema, + authorResponseSchema, clusterExplainResponseSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, @@ -61,6 +62,46 @@ test('action request accepts optional thread number', () => { assert.equal(parsed.threadNumber, 42); }); +test('author response accepts actor identity and repo stats', () => { + const parsed = authorResponseSchema.parse({ + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + authorLogin: 'alice', + actor: { + id: 1, + provider: 'github', + providerUserId: '501', + login: 'alice', + displayName: null, + actorType: 'User', + siteAdmin: false, + firstSeenAt: new Date().toISOString(), + lastSeenAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + }, + stats: { + openedIssueCount: 1, + openedPullRequestCount: 0, + commentCount: 0, + mergedPullRequestCount: 0, + closedThreadCount: 0, + firstActivityAt: new Date().toISOString(), + lastActivityAt: new Date().toISOString(), + trustTier: 'unknown', + }, + threads: [], + }); + + assert.equal(parsed.actor?.providerUserId, '501'); + assert.equal(parsed.stats.openedIssueCount, 1); +}); + test('exclude cluster member request trims optional reason', () => { const parsed = excludeClusterMemberRequestSchema.parse({ owner: 'openclaw', diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index d4d5424..c22cef3 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -80,6 +80,41 @@ export const authorThreadsResponseSchema = z.object({ }); export type AuthorThreadsResponse = z.infer; +export const actorSchema = z.object({ + id: z.number().int().positive(), + provider: z.string(), + providerUserId: z.string(), + login: z.string(), + displayName: z.string().nullable(), + actorType: z.string().nullable(), + siteAdmin: z.boolean(), + firstSeenAt: z.string(), + lastSeenAt: z.string(), + updatedAt: z.string(), +}); +export type ActorDto = z.infer; + +export const authorStatsSchema = z.object({ + openedIssueCount: z.number().int().nonnegative(), + openedPullRequestCount: z.number().int().nonnegative(), + commentCount: z.number().int().nonnegative(), + mergedPullRequestCount: z.number().int().nonnegative(), + closedThreadCount: z.number().int().nonnegative(), + firstActivityAt: z.string().nullable(), + lastActivityAt: z.string().nullable(), + trustTier: z.string().nullable(), +}); +export type AuthorStatsDto = z.infer; + +export const authorResponseSchema = z.object({ + repository: repositorySchema, + authorLogin: z.string(), + actor: actorSchema.nullable(), + stats: authorStatsSchema, + threads: z.array(authorThreadSchema), +}); +export type AuthorResponse = z.infer; + export const searchHitSchema = z.object({ thread: threadSchema, keywordScore: z.number().nullable(), diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index 5250d35..e84c05e 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -2,6 +2,7 @@ import test from 'node:test'; import assert from 'node:assert/strict'; import { + authorResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, @@ -298,6 +299,22 @@ test('author-threads endpoint returns one author with strongest same-author matc values (?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(1, 1, 10, 11, 'exact_cosine', 0.91, '{}', now); + service.db + .prepare( + `insert into actors ( + id, provider, provider_user_id, login, display_name, actor_type, site_admin, raw_json_blob_id, + first_seen_at, last_seen_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'github', '501', 'lqquan', null, 'User', 0, null, now, now, now); + service.db + .prepare( + `insert into actor_repo_stats ( + repo_id, actor_id, opened_issues, opened_prs, comments, merged_prs, closed_threads, + first_activity_at, last_activity_at, trust_tier + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 1, 1, 1, 0, 0, 0, now, now, 'unknown'); const server = createApiServer(service); try { @@ -313,6 +330,13 @@ test('author-threads endpoint returns one author with strongest same-author matc assert.equal(payload.authorLogin, 'lqquan'); assert.deepEqual(payload.threads.map((item) => item.thread.number), [43, 42]); assert.equal(payload.threads[0]?.strongestSameAuthorMatch?.number, 42); + + const authorResponse = await fetch(`http://127.0.0.1:${address.port}/author?owner=openclaw&repo=openclaw&login=lqquan`); + assert.equal(authorResponse.status, 200); + const author = authorResponseSchema.parse((await authorResponse.json()) as unknown); + assert.equal(author.actor?.providerUserId, '501'); + assert.equal(author.stats.openedIssueCount, 1); + assert.deepEqual(author.threads.map((item) => item.thread.number), [43, 42]); } finally { await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); service.close(); diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 038f01d..f5d98b1 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -78,6 +78,18 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'GET' && url.pathname === '/author') { + const params = parseRepoParams(url); + const login = (url.searchParams.get('login') ?? '').trim(); + if (!login) { + sendJson(res, 400, { error: 'Missing login parameter' }); + return; + } + const includeClosed = url.searchParams.get('includeClosed') === 'true'; + sendJson(res, 200, service.getAuthor({ ...params, login, includeClosed })); + return; + } + if (req.method === 'GET' && url.pathname === '/search') { const params = parseRepoParams(url); const query = url.searchParams.get('query'); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index a2d8769..b19b38a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4159,6 +4159,12 @@ test('syncRepository records actors and repo stats from thread and comment autho { login: 'alice', opened_issues: 1, comments: 0 }, { login: 'bob', opened_issues: 0, comments: 1 }, ]); + + const author = service.getAuthor({ owner: 'openclaw', repo: 'openclaw', login: 'alice' }); + assert.equal(author.actor?.providerUserId, '501'); + assert.equal(author.stats.openedIssueCount, 1); + assert.equal(author.stats.commentCount, 0); + assert.deepEqual(author.threads.map((item) => item.thread.number), [42]); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 7bdc0f4..f517da5 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -11,6 +11,7 @@ import { Worker } from 'node:worker_threads'; import { IterableMapper } from '@shutterstock/p-map-iterable'; import { actionResponseSchema, + authorResponseSchema, authorThreadsResponseSchema, closeResponseSchema, clusterOverrideResponseSchema, @@ -32,6 +33,8 @@ import { threadsResponseSchema, type ActionRequest, type ActionResponse, + type AuthorResponse, + type AuthorStatsDto, type AuthorThreadsResponse, type CloseResponse, type ClusterMergeResponse, @@ -610,6 +613,19 @@ function threadToDto(row: ThreadRow, clusterId?: number | null): ThreadDto { }; } +function emptyAuthorStats(): AuthorStatsDto { + return { + openedIssueCount: 0, + openedPullRequestCount: 0, + commentCount: 0, + mergedPullRequestCount: 0, + closedThreadCount: 0, + firstActivityAt: null, + lastActivityAt: null, + trustTier: null, + }; +} + export class GHCrawlService { readonly config: GitcrawlConfig; readonly db: SqliteDatabase; @@ -879,6 +895,167 @@ export class GHCrawlService { }); } + getAuthor(params: { owner: string; repo: string; login: string; includeClosed?: boolean }): AuthorResponse { + const repository = this.requireRepository(params.owner, params.repo); + const normalizedLogin = params.login.trim(); + if (!normalizedLogin) { + return authorResponseSchema.parse({ + repository, + authorLogin: '', + actor: null, + stats: emptyAuthorStats(), + threads: [], + }); + } + + const threads = this.listAuthorThreads(params).threads; + const actorRow = this.db + .prepare( + `select + a.id, + a.provider, + a.provider_user_id, + a.login, + a.display_name, + a.actor_type, + a.site_admin, + a.first_seen_at, + a.last_seen_at, + a.updated_at, + s.opened_issues, + s.opened_prs, + s.comments, + s.merged_prs, + s.closed_threads, + s.first_activity_at, + s.last_activity_at, + s.trust_tier + from actors a + left join actor_repo_stats s on s.actor_id = a.id and s.repo_id = ? + where lower(a.login) = lower(?) + order by s.last_activity_at desc nulls last, a.last_seen_at desc + limit 1`, + ) + .get(repository.id, normalizedLogin) as + | { + id: number; + provider: string; + provider_user_id: string; + login: string; + display_name: string | null; + actor_type: string | null; + site_admin: number; + first_seen_at: string; + last_seen_at: string; + updated_at: string; + opened_issues: number | null; + opened_prs: number | null; + comments: number | null; + merged_prs: number | null; + closed_threads: number | null; + first_activity_at: string | null; + last_activity_at: string | null; + trust_tier: string | null; + } + | undefined; + const fallbackStats = this.computeAuthorStats(repository.id, normalizedLogin); + + return authorResponseSchema.parse({ + repository, + authorLogin: actorRow?.login ?? normalizedLogin, + actor: actorRow + ? { + id: actorRow.id, + provider: actorRow.provider, + providerUserId: actorRow.provider_user_id, + login: actorRow.login, + displayName: actorRow.display_name, + actorType: actorRow.actor_type, + siteAdmin: actorRow.site_admin === 1, + firstSeenAt: actorRow.first_seen_at, + lastSeenAt: actorRow.last_seen_at, + updatedAt: actorRow.updated_at, + } + : null, + stats: { + openedIssueCount: actorRow?.opened_issues ?? fallbackStats.openedIssueCount, + openedPullRequestCount: actorRow?.opened_prs ?? fallbackStats.openedPullRequestCount, + commentCount: actorRow?.comments ?? fallbackStats.commentCount, + mergedPullRequestCount: actorRow?.merged_prs ?? fallbackStats.mergedPullRequestCount, + closedThreadCount: actorRow?.closed_threads ?? fallbackStats.closedThreadCount, + firstActivityAt: actorRow?.first_activity_at ?? fallbackStats.firstActivityAt, + lastActivityAt: actorRow?.last_activity_at ?? fallbackStats.lastActivityAt, + trustTier: actorRow?.trust_tier ?? fallbackStats.trustTier, + }, + threads, + }); + } + + private computeAuthorStats(repoId: number, login: string): ReturnType { + const row = this.db + .prepare( + `select + (select count(*) from threads where repo_id = ? and kind = 'issue' and lower(author_login) = lower(?)) as opened_issues, + (select count(*) from threads where repo_id = ? and kind = 'pull_request' and lower(author_login) = lower(?)) as opened_prs, + (select count(*) from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?)) as comments, + (select count(*) from threads where repo_id = ? and kind = 'pull_request' and merged_at_gh is not null and lower(author_login) = lower(?)) as merged_prs, + (select count(*) from threads where repo_id = ? and closed_at_gh is not null and lower(author_login) = lower(?)) as closed_threads, + (select min(activity_at) + from ( + select created_at_gh as activity_at from threads where repo_id = ? and lower(author_login) = lower(?) + union all + select c.created_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?) + ) + where activity_at is not null) as first_activity_at, + (select max(activity_at) + from ( + select updated_at_gh as activity_at from threads where repo_id = ? and lower(author_login) = lower(?) + union all + select c.updated_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?) + ) + where activity_at is not null) as last_activity_at`, + ) + .get( + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + repoId, + login, + ) as { + opened_issues: number; + opened_prs: number; + comments: number; + merged_prs: number; + closed_threads: number; + first_activity_at: string | null; + last_activity_at: string | null; + }; + + return { + openedIssueCount: row.opened_issues, + openedPullRequestCount: row.opened_prs, + commentCount: row.comments, + mergedPullRequestCount: row.merged_prs, + closedThreadCount: row.closed_threads, + firstActivityAt: row.first_activity_at, + lastActivityAt: row.last_activity_at, + trustTier: row.opened_issues + row.opened_prs >= 3 ? 'repeat_contributor' : null, + }; + } + closeThreadLocally(params: { owner: string; repo: string; threadNumber: number }): CloseResponse { const repository = this.requireRepository(params.owner, params.repo); const row = this.db From e8d6d7c8d8d99609277d711b858ec1483b4f53fe Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:29:45 -0700 Subject: [PATCH 060/215] docs: document author profile surface --- README.md | 1 + SPEC.md | 4 +++- skills/ghcrawl/SKILL.md | 2 ++ skills/ghcrawl/references/protocol.md | 10 ++++++++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0eecfcb..58062df 100644 --- a/README.md +++ b/README.md @@ -320,6 +320,7 @@ The skill is built around the stable JSON CLI surface and is intentionally conse ghcrawl doctor --json ghcrawl refresh owner/repo ghcrawl threads owner/repo --numbers 42,43,44 --json +ghcrawl author owner/repo --login lqquan --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json diff --git a/SPEC.md b/SPEC.md index d6bc713..b06d0f9 100644 --- a/SPEC.md +++ b/SPEC.md @@ -126,7 +126,7 @@ The product must keep these machine-facing surfaces working: - `ghcrawl sync owner/repo --json` - `ghcrawl refresh owner/repo --json` - `ghcrawl threads owner/repo --numbers --json` -- `ghcrawl author owner/repo --login --json` +- `ghcrawl author owner/repo --login --json` for actor identity, repo stats, and authored threads - `ghcrawl close-thread owner/repo --number --json` - `ghcrawl close-cluster owner/repo --id --json` - `ghcrawl embed owner/repo --json` @@ -149,6 +149,8 @@ The product must keep these machine-facing surfaces working: - `GET /health` - `GET /repositories` - `GET /threads` +- `GET /author` +- `GET /author-threads` - `GET /search` - `GET /neighbors` - `GET /clusters` diff --git a/skills/ghcrawl/SKILL.md b/skills/ghcrawl/SKILL.md index 39cc3c5..4c9739e 100644 --- a/skills/ghcrawl/SKILL.md +++ b/skills/ghcrawl/SKILL.md @@ -85,6 +85,8 @@ If the user explicitly wants to inspect those records, add `--include-closed`. Use `threads --numbers 12345` when you need to find the cluster for one specific issue/PR number. The returned thread record includes `clusterId`. If it is non-null, follow with `cluster-detail --id ` for snapshot details or `cluster-explain --id ` for durable evidence and governance. +Use `author --login ` when the user asks about a contributor or maintainer. It returns actor identity, repo-local activity stats, authored threads, and the strongest same-author similarity match for each thread. + Use `configure --json` when you need to confirm the currently selected summary model or embedding basis before suggesting an expensive refresh. Use `threads --numbers ...` when you need a batch of specific issue/PR records. Do not pay the CLI startup cost 10 times for 10 separate single-thread lookups. diff --git a/skills/ghcrawl/references/protocol.md b/skills/ghcrawl/references/protocol.md index 5dae5e9..e8e2585 100644 --- a/skills/ghcrawl/references/protocol.md +++ b/skills/ghcrawl/references/protocol.md @@ -61,14 +61,20 @@ Useful flags: ### `ghcrawl author owner/repo --login --json` -Bulk read path for all open issue/PR records from one author in the local DB. +Read path for one local GitHub actor. -Use this when you want to inspect a user's open items together and see the strongest stored same-author similarity match for each item. +Use this when you want to inspect a user's identity, repo-local activity stats, open authored items, and strongest stored same-author similarity match for each item. Useful flags: - `--include-closed` +Returns: + +- `actor` +- `stats` +- `threads[]` + ### `ghcrawl refresh owner/repo` Runs the staged pipeline in fixed order: From 05bfa6a359e7e9151a3c60909af51481eab9a907 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:30:43 -0700 Subject: [PATCH 061/215] test(cli): cover governance command help --- apps/cli/src/main.test.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 837f7e6..ec23604 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -48,12 +48,17 @@ const publicCommands = [ 'close-thread', 'close-cluster', 'exclude-cluster-member', + 'include-cluster-member', + 'set-cluster-canonical', + 'merge-clusters', + 'split-cluster', 'embed', 'key-summaries', 'cluster', 'clusters', 'durable-clusters', 'cluster-detail', + 'cluster-explain', 'search', 'neighbors', 'tui', From f66b835bb25117972a2a7f0de218397d17ef8ccd Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:34:36 -0700 Subject: [PATCH 062/215] feat(runs): expose pipeline history --- apps/cli/src/main.test.ts | 53 +++++++++++++++++ apps/cli/src/main.ts | 25 ++++++++ packages/api-contract/src/contracts.test.ts | 29 +++++++++ packages/api-contract/src/contracts.ts | 21 +++++++ packages/api-core/src/api/server.test.ts | 65 +++++++++++++++++++++ packages/api-core/src/api/server.ts | 18 ++++++ packages/api-core/src/service.test.ts | 44 ++++++++++++++ packages/api-core/src/service.ts | 47 +++++++++++++++ 8 files changed, 302 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index ec23604..905c360 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -43,6 +43,7 @@ const publicCommands = [ 'version', 'sync', 'refresh', + 'runs', 'threads', 'author', 'close-thread', @@ -301,6 +302,58 @@ test('author command returns actor profile stats and threads', async () => { assert.match(stdout.read(), /"openedIssueCount": 1/); }); +test('runs command returns pipeline history', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.listRunHistory; + let received: unknown; + + GHCrawlService.prototype.listRunHistory = function listRunHistoryStub(params: unknown) { + received = params; + return { + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: '1', + updatedAt: '2026-03-09T00:00:00Z', + }, + runs: [ + { + runId: 1, + runKind: 'cluster', + scope: 'openclaw/openclaw', + status: 'failed', + startedAt: '2026-03-09T00:00:00Z', + finishedAt: '2026-03-09T00:01:00Z', + stats: null, + errorText: 'boom', + }, + ], + } as never; + }; + + try { + await run(['runs', 'openclaw/openclaw', '--kind', 'cluster', '--limit', '5'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.listRunHistory = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + kind: 'cluster', + limit: 5, + }); + assert.match(stdout.read(), /"runKind": "cluster"/); + assert.match(stdout.read(), /"errorText": "boom"/); +}); + test('invalid enum and value parsing exits with code 2', async () => { const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 9dc41b8..c6850cc 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -17,6 +17,7 @@ type CommandName = | 'version' | 'sync' | 'refresh' + | 'runs' | 'threads' | 'author' | 'close-thread' @@ -160,6 +161,18 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl refresh openclaw/openclaw', 'ghcrawl refresh openclaw/openclaw --no-sync --json'], agentJson: true, }, + { + name: 'runs', + synopsis: 'runs [--kind sync|summary|embedding|cluster] [--limit ] [--json]', + description: 'List recent local pipeline runs and failures for one repo.', + options: [ + '--kind sync|summary|embedding|cluster Restrict to one run table', + '--limit Maximum number of records to return', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl runs openclaw/openclaw --limit 20 --json', 'ghcrawl runs openclaw/openclaw --kind cluster --json'], + agentJson: true, + }, { name: 'threads', synopsis: 'threads [--numbers ] [--kind issue|pull_request] [--include-closed] [--json]', @@ -1078,6 +1091,18 @@ export async function run( heapDiagnostics?.dispose(); } } + case 'runs': { + const { owner, repo, values } = parseRepoFlags('runs', rest); + const kind = parseEnum('runs', 'kind', values.kind, ['sync', 'summary', 'embedding', 'cluster']); + const result = getService().listRunHistory({ + owner, + repo, + kind, + limit: typeof values.limit === 'string' ? parsePositiveInteger('limit', values.limit, 'runs') : undefined, + }); + writeJson(stdout, result); + return; + } case 'threads': { const { owner, repo, values } = parseRepoFlags('threads', rest); const kind = parseEnum('threads', 'kind', values.kind, ['issue', 'pull_request']); diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index 7771ee7..a8a3408 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -14,6 +14,7 @@ import { includeClusterMemberRequestSchema, mergeClustersRequestSchema, neighborsResponseSchema, + runHistoryResponseSchema, searchResponseSchema, setClusterCanonicalRequestSchema, splitClusterRequestSchema, @@ -62,6 +63,34 @@ test('action request accepts optional thread number', () => { assert.equal(parsed.threadNumber, 42); }); +test('run history response accepts mixed pipeline records', () => { + const parsed = runHistoryResponseSchema.parse({ + repository: { + id: 1, + owner: 'openclaw', + name: 'openclaw', + fullName: 'openclaw/openclaw', + githubRepoId: null, + updatedAt: new Date().toISOString(), + }, + runs: [ + { + runId: 7, + runKind: 'sync', + scope: 'openclaw/openclaw', + status: 'completed', + startedAt: new Date().toISOString(), + finishedAt: new Date().toISOString(), + stats: { threadsSynced: 2 }, + errorText: null, + }, + ], + }); + + assert.equal(parsed.runs[0]?.runKind, 'sync'); + assert.equal(parsed.runs[0]?.stats?.threadsSynced, 2); +}); + test('author response accepts actor identity and repo stats', () => { const parsed = authorResponseSchema.parse({ repository: { diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index c22cef3..5bec16a 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -52,6 +52,27 @@ export const repositoriesResponseSchema = z.object({ }); export type RepositoriesResponse = z.infer; +export const runKindSchema = z.enum(['sync', 'summary', 'embedding', 'cluster']); +export type RunKind = z.infer; + +export const runRecordSchema = z.object({ + runId: z.number().int().positive(), + runKind: runKindSchema, + scope: z.string(), + status: z.string(), + startedAt: z.string(), + finishedAt: z.string().nullable(), + stats: z.record(z.string(), z.unknown()).nullable(), + errorText: z.string().nullable(), +}); +export type RunRecordDto = z.infer; + +export const runHistoryResponseSchema = z.object({ + repository: repositorySchema, + runs: z.array(runRecordSchema), +}); +export type RunHistoryResponse = z.infer; + export const threadsResponseSchema = z.object({ repository: repositorySchema, threads: z.array(threadSchema), diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index e84c05e..f96c802 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -12,6 +12,7 @@ import { durableClustersResponseSchema, healthResponseSchema, neighborsResponseSchema, + runHistoryResponseSchema, threadsResponseSchema, } from '@ghcrawl/api-contract'; @@ -71,6 +72,70 @@ test('health endpoint returns contract payload', async () => { } }); +test('runs endpoint returns recent pipeline history', async () => { + const service = new GHCrawlService({ + config: { + workspaceRoot: process.cwd(), + configDir: '/tmp/ghcrawl-test', + configPath: '/tmp/ghcrawl-test/config.json', + configFileExists: true, + dbPath: ':memory:', + dbPathSource: 'config', + apiPort: 5179, + secretProvider: 'plaintext', + githubTokenSource: 'none', + openaiApiKeySource: 'none', + summaryModel: 'gpt-5-mini', + embedModel: 'text-embedding-3-large', + embeddingBasis: 'title_original', + vectorBackend: 'vectorlite', + embedBatchSize: 8, + embedConcurrency: 10, + embedMaxUnread: 20, + openSearchIndex: 'ghcrawl-threads', + tuiPreferences: {}, + }, + github: { + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare(`insert into sync_runs (id, repo_id, scope, status, started_at, finished_at, stats_json) values (?, ?, ?, ?, ?, ?, ?)`) + .run(1, 1, 'openclaw/openclaw', 'completed', now, now, '{"threadsSynced":2}'); + + const server = createApiServer(service); + try { + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const address = server.address(); + assert(address && typeof address === 'object'); + + const response = await fetch(`http://127.0.0.1:${address.port}/runs?owner=openclaw&repo=openclaw&kind=sync`); + assert.equal(response.status, 200); + const payload = runHistoryResponseSchema.parse((await response.json()) as unknown); + assert.equal(payload.runs[0]?.runKind, 'sync'); + assert.equal(payload.runs[0]?.stats?.threadsSynced, 2); + } finally { + await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); + service.close(); + } +}); + test('neighbors endpoint returns contract payload', async () => { const service = new GHCrawlService({ config: { diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index f5d98b1..64d2bc8 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -49,6 +49,24 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } + if (req.method === 'GET' && url.pathname === '/runs') { + const params = parseRepoParams(url); + const kindParam = url.searchParams.get('kind'); + const kind = + kindParam === 'sync' || kindParam === 'summary' || kindParam === 'embedding' || kindParam === 'cluster' ? kindParam : undefined; + const limitValue = url.searchParams.get('limit'); + sendJson( + res, + 200, + service.listRunHistory({ + ...params, + kind, + limit: limitValue ? Number(limitValue) : undefined, + }), + ); + return; + } + if (req.method === 'GET' && url.pathname === '/threads') { const params = parseRepoParams(url); const kindParam = url.searchParams.get('kind'); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index b19b38a..c49e723 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -157,6 +157,50 @@ test('doctor explains when secrets are expected from 1Password CLI env injection } }); +test('listRunHistory returns recent runs across pipeline tables', () => { + const service = makeTestService({ + checkAuth: async () => undefined, + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', '2026-03-09T00:00:00Z'); + service.db + .prepare(`insert into sync_runs (id, repo_id, scope, status, started_at, finished_at, stats_json) values (?, ?, ?, ?, ?, ?, ?)`) + .run(1, 1, 'openclaw/openclaw', 'completed', '2026-03-09T00:00:00Z', '2026-03-09T00:01:00Z', '{"threadsSynced":2}'); + service.db + .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at, error_text) values (?, ?, ?, ?, ?, ?, ?)`) + .run(2, 1, 'openclaw/openclaw', 'failed', '2026-03-09T00:02:00Z', '2026-03-09T00:03:00Z', 'boom'); + + const allRuns = service.listRunHistory({ owner: 'openclaw', repo: 'openclaw' }); + assert.deepEqual( + allRuns.runs.map((run) => [run.runKind, run.status]), + [ + ['cluster', 'failed'], + ['sync', 'completed'], + ], + ); + assert.equal(allRuns.runs[1]?.stats?.threadsSynced, 2); + + const syncRuns = service.listRunHistory({ owner: 'openclaw', repo: 'openclaw', kind: 'sync' }); + assert.deepEqual(syncRuns.runs.map((run) => run.runKind), ['sync']); + } finally { + service.close(); + } +}); + test('syncRepository defaults to metadata-only mode, preserves thread kind, and tracks first/last pull timestamps', async () => { const messages: string[] = []; let listIssueCommentCalls = 0; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f517da5..2f8cd48 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -28,6 +28,7 @@ import { neighborsResponseSchema, refreshResponseSchema, repositoriesResponseSchema, + runHistoryResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, @@ -56,6 +57,8 @@ import { type RefreshResponse, type RepositoriesResponse, type RepositoryDto, + type RunHistoryResponse, + type RunKind, type SearchHitDto, type SearchMode, type SearchResponse, @@ -736,6 +739,50 @@ export class GHCrawlService { return repositoriesResponseSchema.parse({ repositories: rows.map(repositoryToDto) }); } + listRunHistory(params: { owner: string; repo: string; kind?: RunKind; limit?: number }): RunHistoryResponse { + const repository = this.requireRepository(params.owner, params.repo); + const limit = Math.min(Math.max(params.limit ?? 20, 1), 200); + const tables: Array<{ kind: RunKind; table: RunTable }> = [ + { kind: 'sync', table: 'sync_runs' }, + { kind: 'summary', table: 'summary_runs' }, + { kind: 'embedding', table: 'embedding_runs' }, + { kind: 'cluster', table: 'cluster_runs' }, + ]; + const selectedTables = params.kind ? tables.filter((entry) => entry.kind === params.kind) : tables; + const sql = selectedTables + .map( + (entry) => + `select '${entry.kind}' as run_kind, id, scope, status, started_at, finished_at, stats_json, error_text from ${entry.table} where repo_id = ?`, + ) + .join(' union all '); + const rows = this.db + .prepare(`select * from (${sql}) order by started_at desc, id desc limit ?`) + .all(...selectedTables.map(() => repository.id), limit) as Array<{ + run_kind: RunKind; + id: number; + scope: string; + status: string; + started_at: string; + finished_at: string | null; + stats_json: string | null; + error_text: string | null; + }>; + + return runHistoryResponseSchema.parse({ + repository, + runs: rows.map((row) => ({ + runId: row.id, + runKind: row.run_kind, + scope: row.scope, + status: row.status, + startedAt: row.started_at, + finishedAt: row.finished_at, + stats: parseObjectJson(row.stats_json), + errorText: row.error_text, + })), + }); + } + listThreads(params: { owner: string; repo: string; kind?: 'issue' | 'pull_request'; numbers?: number[]; includeClosed?: boolean }): ThreadsResponse { const repository = this.requireRepository(params.owner, params.repo); const clusterIds = new Map(); From 2cfe4bc5db4e3dfbfa465a37fa4455f9e84bbb89 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Thu, 23 Apr 2026 23:34:43 -0700 Subject: [PATCH 063/215] docs: document run history surface --- README.md | 1 + SPEC.md | 2 ++ docs/PLAN.md | 2 +- skills/ghcrawl/SKILL.md | 3 +++ skills/ghcrawl/references/protocol.md | 37 +++++++++++++++++++++++---- 5 files changed, 39 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 58062df..ff8e6ea 100644 --- a/README.md +++ b/README.md @@ -319,6 +319,7 @@ The skill is built around the stable JSON CLI surface and is intentionally conse ```bash ghcrawl doctor --json ghcrawl refresh owner/repo +ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl author owner/repo --login lqquan --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json diff --git a/SPEC.md b/SPEC.md index b06d0f9..e1f52cd 100644 --- a/SPEC.md +++ b/SPEC.md @@ -125,6 +125,7 @@ The product must keep these machine-facing surfaces working: - `ghcrawl doctor --json` - `ghcrawl sync owner/repo --json` - `ghcrawl refresh owner/repo --json` +- `ghcrawl runs owner/repo --json` - `ghcrawl threads owner/repo --numbers --json` - `ghcrawl author owner/repo --login --json` for actor identity, repo stats, and authored threads - `ghcrawl close-thread owner/repo --number --json` @@ -148,6 +149,7 @@ The product must keep these machine-facing surfaces working: - `GET /health` - `GET /repositories` +- `GET /runs` - `GET /threads` - `GET /author` - `GET /author-threads` diff --git a/docs/PLAN.md b/docs/PLAN.md index 41c4308..df60ddb 100644 --- a/docs/PLAN.md +++ b/docs/PLAN.md @@ -124,7 +124,7 @@ Decision note: - [x] Preserve package boundaries so future web code stays HTTP-only and does not import `api-core`. - [ ] Add any missing read endpoints we want before UI work: - neighbors - - run history + - [x] run history - thread detail with summaries and optional hydrated comments - [ ] Build the deferred Vite web app only after the API shape settles. - [ ] Use `shadcn/ui` primitives with a custom visual system rather than stock styling. diff --git a/skills/ghcrawl/SKILL.md b/skills/ghcrawl/SKILL.md index 4c9739e..e4d5537 100644 --- a/skills/ghcrawl/SKILL.md +++ b/skills/ghcrawl/SKILL.md @@ -60,6 +60,7 @@ Start with local read-only commands: Without explicit user direction to refresh data, prefer these local-only commands: ```bash +ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 12345 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json ghcrawl durable-clusters owner/repo --member-limit 10 --json @@ -87,6 +88,8 @@ Use `threads --numbers 12345` when you need to find the cluster for one specific Use `author --login ` when the user asks about a contributor or maintainer. It returns actor identity, repo-local activity stats, authored threads, and the strongest same-author similarity match for each thread. +Use `runs` when freshness, repeated failures, or background pipeline status matters. It returns recent sync, summary, embedding, and cluster runs with status, timestamps, stats, and errors. + Use `configure --json` when you need to confirm the currently selected summary model or embedding basis before suggesting an expensive refresh. Use `threads --numbers ...` when you need a batch of specific issue/PR records. Do not pay the CLI startup cost 10 times for 10 separate single-thread lookups. diff --git a/skills/ghcrawl/references/protocol.md b/skills/ghcrawl/references/protocol.md index e8e2585..a608d50 100644 --- a/skills/ghcrawl/references/protocol.md +++ b/skills/ghcrawl/references/protocol.md @@ -92,6 +92,31 @@ Optional skips: Do not run this unless the user explicitly asked for a refresh/rebuild. +### `ghcrawl runs owner/repo --json` + +Read-only run history for one repo. + +Use this when sync freshness, repeated failures, or pipeline status matters. + +Useful flags: + +- `--kind sync|summary|embedding|cluster` +- `--limit ` + +Returns: + +- `repository` +- `runs[]` + +Each run includes: + +- `runKind` +- `status` +- `startedAt` +- `finishedAt` +- `stats` +- `errorText` + ### `ghcrawl clusters owner/repo --json` Useful flags: @@ -237,6 +262,7 @@ If `ghcrawl` is not installed globally: ```bash pnpm --filter ghcrawl cli doctor --json pnpm --filter ghcrawl cli configure --json +pnpm --filter ghcrawl cli runs owner/repo --limit 20 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 12345 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 42,43,44 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 42,43,44 --include-closed --json @@ -256,11 +282,12 @@ If the supported CLI path still fails, hangs, or returns unusable output, stop a ## Suggested analysis flow -1. Start read-only with `clusters`, `cluster-detail`, `threads`, `author`, `search`, or `neighbors` +1. Start read-only with `clusters`, `cluster-detail`, `threads`, `author`, `runs`, `search`, or `neighbors` 2. Only if API-backed work is needed or a read-only request failed, run `ghcrawl doctor --json` 3. If auth is unavailable, stay read-only 4. Only if doctor is healthy and the user explicitly asked, run `ghcrawl refresh owner/repo` -5. `ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json` -6. `ghcrawl cluster-detail owner/repo --id --json` -7. `ghcrawl cluster-explain owner/repo --id --json` when evidence or governance matters -8. optionally `threads`, `author`, `search`, or `neighbors` with `--json` +5. `ghcrawl runs owner/repo --limit 20 --json` when freshness or failures matter +6. `ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json` +7. `ghcrawl cluster-detail owner/repo --id --json` +8. `ghcrawl cluster-explain owner/repo --id --json` when evidence or governance matters +9. optionally `threads`, `author`, `search`, or `neighbors` with `--json` From 0b2e6e3fbf7c9d04c854b20618a1a676ab377b7f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 00:03:50 -0700 Subject: [PATCH 064/215] refactor: use bare token crawler mode --- apps/cli/src/init-wizard.test.ts | 185 --------- apps/cli/src/init-wizard.ts | 348 ----------------- apps/cli/src/main.test.ts | 74 +--- apps/cli/src/main.ts | 57 +-- apps/cli/src/tui/app.test.ts | 1 - apps/cli/src/tui/app.ts | 113 +----- package.json | 6 - packages/api-contract/src/client.ts | 9 - packages/api-contract/src/contracts.test.ts | 41 -- packages/api-contract/src/contracts.ts | 48 --- packages/api-core/src/api/server.test.ts | 127 ------- packages/api-core/src/api/server.ts | 24 -- .../api-core/src/cluster/perf.integration.ts | 2 - .../src/cluster/persistent-store.test.ts | 36 -- .../api-core/src/cluster/persistent-store.ts | 103 +---- packages/api-core/src/config.test.ts | 20 +- packages/api-core/src/config.ts | 39 +- packages/api-core/src/db/migrate.test.ts | 1 - packages/api-core/src/db/migrate.ts | 35 -- packages/api-core/src/github/client.ts | 6 - packages/api-core/src/openai/provider.ts | 5 - packages/api-core/src/service.test.ts | 237 +----------- packages/api-core/src/service.ts | 357 +----------------- scripts/op-run.mjs | 135 ------- scripts/run-all-prompt-experiments.mjs | 2 +- scripts/run-cluster-experiments.mjs | 2 +- scripts/summarize-single.mjs | 4 +- 27 files changed, 42 insertions(+), 1975 deletions(-) delete mode 100644 apps/cli/src/init-wizard.test.ts delete mode 100644 apps/cli/src/init-wizard.ts delete mode 100644 scripts/op-run.mjs diff --git a/apps/cli/src/init-wizard.test.ts b/apps/cli/src/init-wizard.test.ts deleted file mode 100644 index e13ce1e..0000000 --- a/apps/cli/src/init-wizard.test.ts +++ /dev/null @@ -1,185 +0,0 @@ -import test from 'node:test'; -import assert from 'node:assert/strict'; -import fs from 'node:fs'; -import os from 'node:os'; -import path from 'node:path'; - -import { readPersistedConfig, writePersistedConfig } from '@ghcrawl/api-core'; - -import { runInitWizard, type InitPrompter } from './init-wizard.js'; - -function makeTestEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { - return { - ...process.env, - XDG_CONFIG_HOME: undefined, - APPDATA: undefined, - ...overrides, - }; -} - -function makePrompter(overrides: Partial = {}): InitPrompter { - return { - intro: async () => undefined, - note: async () => undefined, - select: async () => 'plaintext', - text: async () => { - throw new Error('unexpected text prompt'); - }, - confirm: async () => true, - password: async () => { - throw new Error('unexpected password prompt'); - }, - outro: async () => undefined, - cancel: () => undefined, - ...overrides, - }; -} - -test('runInitWizard skips prompting when config already has both API keys', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ HOME: home }); - writePersistedConfig( - { - githubToken: 'ghp_testtoken1234567890', - openaiApiKey: 'sk-proj-testkey1234567890', - }, - { env }, - ); - - const result = await runInitWizard({ - env, - prompter: makePrompter(), - isInteractive: true, - }); - - assert.equal(result.changed, false); - assert.equal(fs.existsSync(result.configPath), true); -}); - -test('runInitWizard prompts for missing keys and writes the config file', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ HOME: home }); - const prompts: string[] = []; - - const result = await runInitWizard({ - env, - prompter: makePrompter({ - select: async () => 'plaintext', - password: async ({ message }) => { - prompts.push(message); - return message.includes('GitHub') ? 'ghp_testtoken1234567890' : 'sk-proj-testkey1234567890'; - }, - }), - isInteractive: true, - }); - - assert.equal(result.changed, true); - assert.deepEqual(prompts, ['GitHub personal access token', 'OpenAI API key']); - - const persisted = readPersistedConfig({ env }); - assert.equal(persisted.data.githubToken, 'ghp_testtoken1234567890'); - assert.equal(persisted.data.openaiApiKey, 'sk-proj-testkey1234567890'); -}); - -test('runInitWizard can persist detected environment keys without prompting for secrets', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ - HOME: home, - GITHUB_TOKEN: 'ghp_envtoken1234567890', - OPENAI_API_KEY: 'sk-proj-envkey1234567890', - }); - - const result = await runInitWizard({ - env, - prompter: makePrompter({ - select: async () => 'plaintext', - confirm: async () => true, - }), - isInteractive: true, - }); - - assert.equal(result.changed, true); - const persisted = readPersistedConfig({ env }); - assert.equal(persisted.data.githubToken, 'ghp_envtoken1234567890'); - assert.equal(persisted.data.openaiApiKey, 'sk-proj-envkey1234567890'); -}); - -test('runInitWizard can configure 1Password CLI metadata without persisting plaintext keys', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ HOME: home }); - const notes: Array<{ title?: string; message: string }> = []; - const confirms: string[] = []; - - const result = await runInitWizard({ - env, - prompter: makePrompter({ - select: async () => 'op', - text: async ({ message }) => (message.includes('vault') ? 'Private' : 'ghcrawl'), - note: async (message, title) => { - notes.push({ title, message }); - }, - confirm: async ({ message }) => { - confirms.push(message); - return true; - }, - }), - isInteractive: true, - }); - - assert.equal(result.changed, true); - const persisted = readPersistedConfig({ env }); - assert.equal(persisted.data.secretProvider, 'op'); - assert.equal(persisted.data.opVaultName, 'Private'); - assert.equal(persisted.data.opItemName, 'ghcrawl'); - assert.equal(persisted.data.githubToken, undefined); - assert.equal(persisted.data.openaiApiKey, undefined); - assert.equal( - notes.some((entry) => entry.title === '1Password Setup' && entry.message.includes('op://Private/ghcrawl/GITHUB_TOKEN')), - true, - ); - assert.equal(notes.some((entry) => entry.title === 'Next Commands' && entry.message.includes('ghcrawl-op()')), true); - assert.equal(notes.some((entry) => entry.title === 'Next Commands' && entry.message.includes('ghcrawl-op doctor')), true); - assert.equal(notes.some((entry) => entry.title === 'Next Commands' && entry.message.includes('ghcrawl-op sync org/repo')), true); - assert.equal(notes.some((entry) => entry.title === 'Responsibility' && entry.message.includes('accept no liability')), true); - assert.equal(confirms.some((message) => message.includes('I created the Secure Note')), true); - assert.equal(confirms.some((message) => message.includes('I copied those commands')), true); - assert.equal(confirms.some((message) => message.includes('accept full responsibility')), true); -}); - -test('runInitWizard accepts empty 1Password vault and item input as defaults', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ HOME: home }); - - await runInitWizard({ - env, - prompter: makePrompter({ - select: async () => 'op', - text: async () => '', - confirm: async () => true, - }), - isInteractive: true, - }); - - const persisted = readPersistedConfig({ env }); - assert.equal(persisted.data.opVaultName, 'Private'); - assert.equal(persisted.data.opItemName, 'ghcrawl'); -}); - -test('runInitWizard accepts undefined 1Password text responses as defaults', async () => { - const home = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-init-test-')); - const env = makeTestEnv({ HOME: home }); - - await runInitWizard({ - env, - prompter: makePrompter({ - select: async () => 'op', - text: async () => undefined, - confirm: async () => true, - }), - isInteractive: true, - }); - - const persisted = readPersistedConfig({ env }); - assert.equal(persisted.data.opVaultName, 'Private'); - assert.equal(persisted.data.opItemName, 'ghcrawl'); -}); diff --git a/apps/cli/src/init-wizard.ts b/apps/cli/src/init-wizard.ts deleted file mode 100644 index d6e0e06..0000000 --- a/apps/cli/src/init-wizard.ts +++ /dev/null @@ -1,348 +0,0 @@ -import { cancel, confirm, intro, isCancel, note, outro, password, select, text } from '@clack/prompts'; -import { - loadConfig, - readPersistedConfig, - writePersistedConfig, - isLikelyGitHubToken, - isLikelyOpenAiApiKey, -} from '@ghcrawl/api-core'; - -type InitSecretMode = 'plaintext' | 'op'; - -export type InitWizardResult = { - configPath: string; - changed: boolean; -}; - -export type InitPrompter = { - intro: (message: string) => Promise | void; - note: (message: string, title?: string) => Promise | void; - select: (options: { - message: string; - initialValue?: string; - options: Array<{ value: string; label: string; hint?: string }>; - }) => Promise; - text: (options: { - message: string; - placeholder?: string; - validate?: (value: string) => string | undefined; - }) => Promise; - confirm: (options: { message: string; initialValue?: boolean }) => Promise; - password: (options: { message: string; validate?: (value: string) => string | undefined }) => Promise; - outro: (message: string) => Promise | void; - cancel: (message: string) => void; -}; - -function resolveTextValue(value: string | symbol | undefined, fallback: string): string | symbol { - if (isCancel(value)) { - return value; - } - if (typeof value !== 'string') { - return fallback; - } - const trimmed = value.trim(); - return trimmed.length > 0 ? trimmed : fallback; -} - -export function createClackInitPrompter(): InitPrompter { - return { - intro, - note, - select, - text: async (options) => { - const validate = options.validate; - return text({ - message: options.message, - placeholder: options.placeholder, - validate: validate - ? (value) => validate(value ?? '') - : undefined, - }); - }, - confirm, - password: async (options) => { - const validate = options.validate; - return password({ - message: options.message, - validate: validate - ? (value) => validate(value ?? '') - : undefined, - }); - }, - outro, - cancel, - }; -} - -export async function runInitWizard( - options: { - cwd?: string; - env?: NodeJS.ProcessEnv; - reconfigure?: boolean; - configPathOverride?: string; - workspaceRootOverride?: string; - prompter?: InitPrompter; - isInteractive?: boolean; - } = {}, -): Promise { - const cwd = options.cwd ?? process.cwd(); - const env = options.env ?? process.env; - const reconfigure = options.reconfigure ?? false; - const prompter = options.prompter ?? createClackInitPrompter(); - const configOptions = { - cwd, - env, - configPathOverride: options.configPathOverride, - workspaceRootOverride: options.workspaceRootOverride, - }; - const current = loadConfig(configOptions); - const stored = readPersistedConfig(configOptions); - - const hasStoredGithub = Boolean(stored.data.githubToken); - const hasStoredOpenAi = Boolean(stored.data.openaiApiKey); - if (!reconfigure && hasStoredGithub && hasStoredOpenAi) { - return { configPath: current.configPath, changed: false }; - } - - const isInteractive = options.isInteractive ?? (process.stdin.isTTY && process.stdout.isTTY); - if (!isInteractive) { - throw new Error(`ghcrawl init requires a TTY. Create ${current.configPath} manually or set environment variables first.`); - } - - await prompter.intro('ghcrawl init'); - await prompter.note( - [ - `Config file: ${current.configPath}`, - '', - 'Secret storage modes:', - '- Plaintext config: writes both keys to ~/.config/ghcrawl/config.json', - '- 1Password CLI: keeps keys out of the config file and expects you to run ghcrawl through an op wrapper', - '', - 'GitHub token recommendation:', - '- Fine-grained PAT scoped to the repos you want to crawl', - '- Repository permissions: Metadata (read), Issues (read), Pull requests (read)', - '- For private repos with a classic PAT, repo is the safe fallback', - '', - 'OpenAI key recommendation:', - '- Standard API key for the project/account you want to bill', - ].join('\n'), - 'Setup', - ); - - const nextConfig = { ...stored.data }; - let changed = false; - - const secretMode = await prompter.select({ - message: 'How should ghcrawl get your GitHub and OpenAI secrets?', - initialValue: stored.data.secretProvider ?? (hasStoredGithub && hasStoredOpenAi ? 'plaintext' : 'op'), - options: [ - { - value: 'plaintext', - label: 'Store plaintext keys in ~/.config/ghcrawl/config.json', - hint: 'simpler, but you are responsible for any bills caused by misuse', - }, - { - value: 'op', - label: 'Keep keys in 1Password CLI and run through op', - hint: 'recommended if you already use op', - }, - ], - }); - if (isCancel(secretMode) || (secretMode !== 'plaintext' && secretMode !== 'op')) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - - if (secretMode === 'plaintext') { - await prompter.note( - [ - 'Plaintext storage warning:', - '- ghcrawl will write both API keys to ~/.config/ghcrawl/config.json', - '- anyone who can read that file can use your keys', - '- any OpenAI/API bills caused by misuse are your responsibility', - ].join('\n'), - 'Security', - ); - - if (reconfigure || !hasStoredGithub) { - const detectedGithub = env.GITHUB_TOKEN; - let githubToken = stored.data.githubToken; - let usedDetectedGithub = false; - if (detectedGithub && (!githubToken || reconfigure)) { - const useDetected = await prompter.confirm({ - message: 'Persist the detected GITHUB_TOKEN environment value to the ghcrawl config file?', - initialValue: true, - }); - if (isCancel(useDetected)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - if (useDetected) { - if (isLikelyGitHubToken(detectedGithub)) { - githubToken = detectedGithub; - usedDetectedGithub = true; - } else { - await prompter.note('The detected GITHUB_TOKEN value does not look like a GitHub PAT, so init will prompt for it instead.', 'GitHub token'); - } - } - } - if (!githubToken || (reconfigure && !usedDetectedGithub)) { - const value = await prompter.password({ - message: 'GitHub personal access token', - validate: (candidate) => (isLikelyGitHubToken(candidate) ? undefined : 'Enter a GitHub PAT like ghp_... or github_pat_...'), - }); - if (isCancel(value)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - githubToken = value; - } - nextConfig.githubToken = githubToken; - changed = true; - } - - if (reconfigure || !hasStoredOpenAi) { - const detectedOpenAi = env.OPENAI_API_KEY; - let openaiApiKey = stored.data.openaiApiKey; - let usedDetectedOpenAi = false; - if (detectedOpenAi && (!openaiApiKey || reconfigure)) { - const useDetected = await prompter.confirm({ - message: 'Persist the detected OPENAI_API_KEY environment value to the ghcrawl config file?', - initialValue: true, - }); - if (isCancel(useDetected)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - if (useDetected) { - if (isLikelyOpenAiApiKey(detectedOpenAi)) { - openaiApiKey = detectedOpenAi; - usedDetectedOpenAi = true; - } else { - await prompter.note('The detected OPENAI_API_KEY value does not look like an OpenAI API key, so init will prompt for it instead.', 'OpenAI key'); - } - } - } - if (!openaiApiKey || (reconfigure && !usedDetectedOpenAi)) { - const value = await prompter.password({ - message: 'OpenAI API key', - validate: (candidate) => (isLikelyOpenAiApiKey(candidate) ? undefined : 'Enter an OpenAI API key like sk-...'), - }); - if (isCancel(value)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - openaiApiKey = value; - } - nextConfig.openaiApiKey = openaiApiKey; - changed = true; - } - - nextConfig.secretProvider = 'plaintext'; - nextConfig.opVaultName = undefined; - nextConfig.opItemName = undefined; - } else { - const defaultVaultName = stored.data.opVaultName ?? 'Private'; - const vaultNameInput = await prompter.text({ - message: '1Password vault name', - placeholder: defaultVaultName, - }); - const vaultName = resolveTextValue(vaultNameInput, defaultVaultName); - if (isCancel(vaultName)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - const defaultItemName = stored.data.opItemName ?? 'ghcrawl'; - const itemNameInput = await prompter.text({ - message: '1Password item name', - placeholder: defaultItemName, - }); - const itemName = resolveTextValue(itemNameInput, defaultItemName); - if (isCancel(itemName)) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - - nextConfig.secretProvider = 'op'; - nextConfig.opVaultName = vaultName.trim(); - nextConfig.opItemName = itemName.trim(); - nextConfig.githubToken = undefined; - nextConfig.openaiApiKey = undefined; - changed = true; - - const opReferenceBase = `op://${nextConfig.opVaultName}/${nextConfig.opItemName}`; - await prompter.note( - [ - 'Create a 1Password Secure Note with:', - `- Vault: ${nextConfig.opVaultName}`, - `- Item: ${nextConfig.opItemName}`, - '', - 'Add concealed fields named exactly:', - '- GITHUB_TOKEN', - '- OPENAI_API_KEY', - '', - 'Secret refs:', - `- ${opReferenceBase}/GITHUB_TOKEN`, - `- ${opReferenceBase}/OPENAI_API_KEY`, - ].join('\n'), - '1Password Setup', - ); - const readyNote = await prompter.confirm({ - message: 'I created the Secure Note with those exact field names and secret refs.', - initialValue: true, - }); - if (isCancel(readyNote) || readyNote !== true) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - - await prompter.note( - [ - 'After saving that Secure Note, run ghcrawl through an op-backed shell helper:', - '', - 'ghcrawl-op() {', - ` env GITHUB_TOKEN=\"$(op read '${opReferenceBase}/GITHUB_TOKEN')\" \\`, - ` OPENAI_API_KEY=\"$(op read '${opReferenceBase}/OPENAI_API_KEY')\" \\`, - ' ghcrawl "$@"', - '}', - '', - 'Examples:', - '- ghcrawl-op doctor', - '- ghcrawl-op tui', - '- ghcrawl-op sync org/repo', - ].join('\n'), - 'Next Commands', - ); - const readyCommands = await prompter.confirm({ - message: 'I copied those commands and I am ready to save this ghcrawl config.', - initialValue: true, - }); - if (isCancel(readyCommands) || readyCommands !== true) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - } - - await prompter.note( - [ - 'Responsibility attestation:', - '- You are responsible for obtaining and using GitHub and OpenAI API keys in compliance with the agreements and usage policies for those platforms.', - '- You and any employer or organization you operate this tool for accept full responsibility for monitoring API usage, spend, and access.', - '- You are fully responsible for storing your API keys securely and for any misuse, theft, or unexpected spend caused by those keys.', - '- The creators and contributors of ghcrawl accept no liability for API charges, account actions, data loss, or misuse resulting from operation of this tool.', - ].join('\n'), - 'Responsibility', - ); - const acceptResponsibility = await prompter.confirm({ - message: 'I understand and accept full responsibility for using ghcrawl and for securing any API keys it uses.', - initialValue: false, - }); - if (isCancel(acceptResponsibility) || acceptResponsibility !== true) { - prompter.cancel('init cancelled'); - throw new Error('init cancelled'); - } - - const result = writePersistedConfig(nextConfig, configOptions); - await prompter.outro(`Saved ghcrawl config to ${result.configPath}`); - return { configPath: result.configPath, changed }; -} diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 905c360..1fbfef0 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -37,7 +37,6 @@ function makeRunContext(): { env: NodeJS.ProcessEnv; cwd: string; cleanup: () => } const publicCommands = [ - 'init', 'doctor', 'configure', 'version', @@ -45,7 +44,6 @@ const publicCommands = [ 'refresh', 'runs', 'threads', - 'author', 'close-thread', 'close-cluster', 'exclude-cluster-member', @@ -224,7 +222,6 @@ test('unknown command exits with code 2 and a top-level help hint', async () => test('missing required flags exit with code 2 and command-specific hints', async () => { const cases = [ - { argv: ['author', 'openclaw/openclaw'], message: /Missing --login/, hint: /Run 'ghcrawl author --help' for usage\./ }, { argv: ['close-thread', 'openclaw/openclaw'], message: /Missing --number/, hint: /Run 'ghcrawl close-thread --help' for usage\./ }, { argv: ['cluster-detail', 'openclaw/openclaw'], message: /Missing --id/, hint: /Run 'ghcrawl cluster-detail --help' for usage\./ }, ]; @@ -238,70 +235,6 @@ test('missing required flags exit with code 2 and command-specific hints', async } }); -test('author command returns actor profile stats and threads', async () => { - const stdout = createWritableCapture(); - const context = makeRunContext(); - const original = GHCrawlService.prototype.getAuthor; - let received: unknown; - - GHCrawlService.prototype.getAuthor = function getAuthorStub(params: unknown) { - received = params; - return { - repository: { - id: 1, - owner: 'openclaw', - name: 'openclaw', - fullName: 'openclaw/openclaw', - githubRepoId: '1', - updatedAt: '2026-03-09T00:00:00Z', - }, - authorLogin: 'alice', - actor: { - id: 1, - provider: 'github', - providerUserId: '501', - login: 'alice', - displayName: null, - actorType: 'User', - siteAdmin: false, - firstSeenAt: '2026-03-09T00:00:00Z', - lastSeenAt: '2026-03-09T00:00:00Z', - updatedAt: '2026-03-09T00:00:00Z', - }, - stats: { - openedIssueCount: 1, - openedPullRequestCount: 0, - commentCount: 0, - mergedPullRequestCount: 0, - closedThreadCount: 0, - firstActivityAt: '2026-03-09T00:00:00Z', - lastActivityAt: '2026-03-09T00:00:00Z', - trustTier: 'unknown', - }, - threads: [], - } as never; - }; - - try { - await run(['author', 'openclaw/openclaw', '--login', 'alice', '--include-closed'], stdout.stream, { - env: context.env, - cwd: context.cwd, - }); - } finally { - GHCrawlService.prototype.getAuthor = original; - context.cleanup(); - } - - assert.deepEqual(received, { - owner: 'openclaw', - repo: 'openclaw', - login: 'alice', - includeClosed: true, - }); - assert.match(stdout.read(), /"providerUserId": "501"/); - assert.match(stdout.read(), /"openedIssueCount": 1/); -}); - test('runs command returns pipeline history', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); @@ -390,7 +323,6 @@ test('agent-facing command help advertises explicit --json', async () => { 'sync', 'refresh', 'threads', - 'author', 'close-thread', 'close-cluster', 'exclude-cluster-member', @@ -959,15 +891,13 @@ test('formatDoctorReport renders a human-readable health summary', () => { github: { configured: true, source: 'config', - formatOk: true, - authOk: true, + tokenPresent: true, error: null, }, openai: { configured: false, source: 'none', - formatOk: false, - authOk: false, + tokenPresent: false, error: 'missing', }, vectorlite: { diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index c6850cc..b998d07 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -7,11 +7,9 @@ import { fileURLToPath } from 'node:url'; import { createApiServer, GHCrawlService, loadConfig, readPersistedConfig, writePersistedConfig, type LoadConfigOptions } from '@ghcrawl/api-core'; import { createHeapDiagnostics, type HeapDiagnostics } from './heap-diagnostics.js'; -import { runInitWizard } from './init-wizard.js'; import { startTui } from './tui/app.js'; type CommandName = - | 'init' | 'doctor' | 'configure' | 'version' @@ -19,7 +17,6 @@ type CommandName = | 'refresh' | 'runs' | 'threads' - | 'author' | 'close-thread' | 'close-cluster' | 'exclude-cluster-member' @@ -96,13 +93,6 @@ type ParsedRepoFlags = { owner: string; repo: string; values: RepoCommandValues const CLI_VERSION = loadCliVersion(); const COMMAND_SPECS: readonly CommandSpec[] = [ - { - name: 'init', - synopsis: 'init [--reconfigure]', - description: 'Configure secrets and local runtime paths.', - options: ['--reconfigure Re-run setup even if config already exists'], - examples: ['ghcrawl init', 'ghcrawl init --reconfigure'], - }, { name: 'doctor', synopsis: 'doctor [--json]', @@ -186,18 +176,6 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl threads openclaw/openclaw --numbers 42,43,44 --json', 'ghcrawl threads openclaw/openclaw --numbers 42 --include-closed --json'], agentJson: true, }, - { - name: 'author', - synopsis: 'author --login [--include-closed] [--json]', - description: 'Show actor identity, repo stats, and local threads for one author.', - options: [ - '--login GitHub login to inspect', - '--include-closed Include locally closed items', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl author openclaw/openclaw --login lqquan --json'], - agentJson: true, - }, { name: 'close-thread', synopsis: 'close-thread --number [--json]', @@ -777,8 +755,7 @@ export function formatDoctorReport(result: DoctorReport): string { 'GitHub', ` configured: ${formatBooleanStatus(result.github.configured)}`, ` source: ${result.github.source}`, - ` format ok: ${formatBooleanStatus(result.github.formatOk)}`, - ` auth ok: ${formatBooleanStatus(result.github.authOk)}`, + ` token present: ${formatBooleanStatus(result.github.tokenPresent)}`, ]; if (result.github.error) { lines.push(` note: ${result.github.error}`); @@ -788,8 +765,7 @@ export function formatDoctorReport(result: DoctorReport): string { 'OpenAI', ` configured: ${formatBooleanStatus(result.openai.configured)}`, ` source: ${result.openai.source}`, - ` format ok: ${formatBooleanStatus(result.openai.formatOk)}`, - ` auth ok: ${formatBooleanStatus(result.openai.authOk)}`, + ` token present: ${formatBooleanStatus(result.openai.tokenPresent)}`, ); if (result.openai.error) { lines.push(` note: ${result.openai.error}`); @@ -984,21 +960,6 @@ export async function run( try { switch (commandSpec.name) { - case 'init': { - const parsed = parseArgsForCommand('init', rest, { - reconfigure: { type: 'boolean' }, - }); - const values = parsed.values as RepoCommandValues; - await runInitWizard({ - reconfigure: values.reconfigure === true, - cwd, - env, - configPathOverride: parsedGlobals.configPathOverride, - workspaceRootOverride: parsedGlobals.workspaceRootOverride, - }); - writeJson(stdout, getService().init()); - return; - } case 'doctor': { const parsed = parseArgsForCommand('doctor', rest, { json: { type: 'boolean' }, @@ -1116,20 +1077,6 @@ export async function run( writeJson(stdout, result); return; } - case 'author': { - const { owner, repo, values } = parseRepoFlags('author', rest); - if (typeof values.login !== 'string' || values.login.trim().length === 0) { - throw new CliUsageError('Missing --login', 'author'); - } - const result = getService().getAuthor({ - owner, - repo, - login: values.login, - includeClosed: values['include-closed'] === true, - }); - writeJson(stdout, result); - return; - } case 'close-thread': { const { owner, repo, values } = parseRepoFlags('close-thread', rest); if (typeof values.number !== 'string') { diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index dde75fa..3106103 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -193,7 +193,6 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /#\s+jump directly to an issue or PR number/); assert.match(content, /g\s+start the staged update pipeline in the background/); assert.match(content, /p\s+open the repository browser/); - assert.match(content, /u\s+show all open threads for the selected author/); assert.match(content, /l\s+toggle wide layout/); assert.match(content, /x\s+show or hide locally closed clusters and members/); assert.match(content, /h or \?\s+open this help popup/); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 38a266f..632cc55 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -55,12 +55,6 @@ type RepositoryChoice = label: string; }; -type AuthorThreadChoice = { - threadId: number; - clusterId: number | null | undefined; - label: string; -}; - type Widgets = { screen: blessed.Widgets.Screen; header: blessed.Widgets.BoxElement; @@ -404,7 +398,7 @@ export async function startTui(params: StartTuiParams): Promise { footerLines.unshift(''); } footerLines.push( - `${status} | jobs:${activeJobs} | h/? help # jump g update p repos u author / filter s sort f min l layout x closed`, + `${status} | jobs:${activeJobs} | h/? help # jump g update p repos / filter s sort f min l layout x closed`, ); footerLines.push( `Tab focus arrows move-or-scroll PgUp/PgDn page r refresh o open q quit`, @@ -707,36 +701,6 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; - const promptAuthorThreads = (): void => { - if (modalOpen) return; - const authorLogin = threadDetail?.thread.authorLogin?.trim() ?? ''; - if (!authorLogin) { - status = 'Selected thread has no author login'; - render(); - return; - } - - void (async () => { - modalOpen = true; - try { - const response = params.service.listAuthorThreads({ - owner: currentRepository.owner, - repo: currentRepository.repo, - login: authorLogin, - }); - const choice = await promptAuthorThreadChoice(widgets.screen, response.authorLogin, response.threads); - if (!choice) { - render(); - return; - } - jumpToThread(choice.threadId, choice.clusterId); - updateFocus('members'); - } finally { - modalOpen = false; - } - })(); - }; - const openHelp = (): void => { if (modalOpen) return; void (async () => { @@ -1148,10 +1112,6 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen) return; openSelectedThread(); }); - widgets.screen.key(['u'], () => { - if (modalOpen) return; - promptAuthorThreads(); - }); widgets.screen.on('resize', () => render()); widgets.screen.on('destroy', () => { @@ -1426,7 +1386,6 @@ export function buildHelpContent(): string { '{bold}Actions{/bold}', 'g start the staged update pipeline in the background (GitHub, embeddings, clusters)', 'p open the repository browser / sync a new repository', - 'u show all open threads for the selected author', 'o open the selected thread URL in your browser', '', '{bold}Help And Exit{/bold}', @@ -1619,76 +1578,6 @@ export function getRepositoryChoices(service: Pick['threads'], -): Promise { - const choices: AuthorThreadChoice[] = threads.map((item) => { - const match = item.strongestSameAuthorMatch; - const matchLabel = match ? ` sim:${(match.score * 100).toFixed(1)}% -> #${match.number}` : ' sim:none'; - const clusterLabel = item.thread.clusterId ? `C${item.thread.clusterId}` : 'C-'; - return { - threadId: item.thread.id, - clusterId: item.thread.clusterId, - label: `#${item.thread.number} ${item.thread.kind === 'pull_request' ? 'pr' : 'issue'} ${clusterLabel}${matchLabel} ${item.thread.title}`, - }; - }); - - const box = blessed.list({ - parent: screen, - border: 'line', - label: ` @${authorLogin} Threads `, - keys: true, - vi: true, - mouse: false, - top: 'center', - left: 'center', - width: '80%', - height: '70%', - style: { - border: { fg: '#fde74c' }, - item: { fg: 'white' }, - selected: { bg: '#fde74c', fg: 'black', bold: true }, - }, - items: choices.length > 0 ? choices.map((choice) => choice.label) : ['No open threads for this author'], - }); - const help = blessed.box({ - parent: screen, - bottom: 0, - left: 0, - width: '100%', - height: 1, - content: 'Enter jumps to the selected thread. Esc cancels.', - style: { fg: 'black', bg: '#fde74c' }, - }); - - box.focus(); - box.select(0); - screen.render(); - - return await new Promise((resolve) => { - const teardown = (): void => { - screen.off('keypress', handleKeypress); - box.destroy(); - help.destroy(); - screen.render(); - }; - const finish = (value: AuthorThreadChoice | null): void => { - teardown(); - resolve(value); - }; - const handleKeypress = (_char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { - if (key.name === 'escape' || key.name === 'q') { - finish(null); - } - }; - - screen.on('keypress', handleKeypress); - box.on('select', (_item, index) => finish(choices[index] ?? null)); - }); -} - async function promptRepositoryChoice( screen: blessed.Widgets.Screen, service: GHCrawlService, diff --git a/package.json b/package.json index 92dd420..e81a8fd 100644 --- a/package.json +++ b/package.json @@ -13,14 +13,8 @@ "clean:dist": "node ./scripts/clean-workspace.mjs dist", "clean:tsbuildinfo": "node ./scripts/clean-workspace.mjs tsbuildinfo", "cli": "node ./apps/cli/bin/ghcrawl.js", - "bootstrap": "node ./apps/cli/bin/ghcrawl.js init", "doctor": "node ./apps/cli/bin/ghcrawl.js doctor", "health": "node ./apps/cli/bin/ghcrawl.js doctor", - "op:exec": "node ./scripts/op-run.mjs exec", - "op:shell": "node ./scripts/op-run.mjs shell", - "op:doctor": "node ./scripts/op-run.mjs exec -- doctor", - "op:health": "node ./scripts/op-run.mjs exec -- doctor", - "op:tui": "node ./scripts/op-run.mjs exec -- tui", "sync": "node ./apps/cli/bin/ghcrawl.js sync", "refresh": "node ./apps/cli/bin/ghcrawl.js refresh", "embed": "node ./apps/cli/bin/ghcrawl.js embed", diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 8c791b0..64bed6d 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -4,7 +4,6 @@ import { closeClusterRequestSchema, closeResponseSchema, closeThreadRequestSchema, - authorThreadsResponseSchema, clusterDetailResponseSchema, clusterExplainResponseSchema, clusterMergeResponseSchema, @@ -29,7 +28,6 @@ import { type ClusterMergeResponse, type ClusterOverrideResponse, type ClusterSplitResponse, - type AuthorThreadsResponse, type ClusterDetailResponse, type ClusterExplainResponse, type ClusterSummariesResponse, @@ -47,7 +45,6 @@ export type GitcrawlClient = { health: () => Promise; listRepositories: () => Promise; listThreads: (params: { owner: string; repo: string; kind?: 'issue' | 'pull_request'; numbers?: number[]; includeClosed?: boolean }) => Promise; - listAuthorThreads: (params: { owner: string; repo: string; login: string; includeClosed?: boolean }) => Promise; search: (params: { owner: string; repo: string; query: string; mode?: SearchMode }) => Promise; listClusters: (params: { owner: string; repo: string; includeClosed?: boolean }) => Promise; listClusterSummaries: (params: { @@ -110,12 +107,6 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet const res = await fetchImpl(`${normalized}/threads?${search.toString()}`); return readJson(res, threadsResponseSchema); }, - async listAuthorThreads(params) { - const search = new URLSearchParams({ owner: params.owner, repo: params.repo, login: params.login }); - if (params.includeClosed) search.set('includeClosed', 'true'); - const res = await fetchImpl(`${normalized}/author-threads?${search.toString()}`); - return readJson(res, authorThreadsResponseSchema); - }, async search(params) { const search = new URLSearchParams({ owner: params.owner, diff --git a/packages/api-contract/src/contracts.test.ts b/packages/api-contract/src/contracts.test.ts index a8a3408..a0ff99d 100644 --- a/packages/api-contract/src/contracts.test.ts +++ b/packages/api-contract/src/contracts.test.ts @@ -3,7 +3,6 @@ import assert from 'node:assert/strict'; import { actionRequestSchema, - authorResponseSchema, clusterExplainResponseSchema, clusterMergeResponseSchema, clusterOverrideResponseSchema, @@ -91,46 +90,6 @@ test('run history response accepts mixed pipeline records', () => { assert.equal(parsed.runs[0]?.stats?.threadsSynced, 2); }); -test('author response accepts actor identity and repo stats', () => { - const parsed = authorResponseSchema.parse({ - repository: { - id: 1, - owner: 'openclaw', - name: 'openclaw', - fullName: 'openclaw/openclaw', - githubRepoId: null, - updatedAt: new Date().toISOString(), - }, - authorLogin: 'alice', - actor: { - id: 1, - provider: 'github', - providerUserId: '501', - login: 'alice', - displayName: null, - actorType: 'User', - siteAdmin: false, - firstSeenAt: new Date().toISOString(), - lastSeenAt: new Date().toISOString(), - updatedAt: new Date().toISOString(), - }, - stats: { - openedIssueCount: 1, - openedPullRequestCount: 0, - commentCount: 0, - mergedPullRequestCount: 0, - closedThreadCount: 0, - firstActivityAt: new Date().toISOString(), - lastActivityAt: new Date().toISOString(), - trustTier: 'unknown', - }, - threads: [], - }); - - assert.equal(parsed.actor?.providerUserId, '501'); - assert.equal(parsed.stats.openedIssueCount, 1); -}); - test('exclude cluster member request trims optional reason', () => { const parsed = excludeClusterMemberRequestSchema.parse({ owner: 'openclaw', diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 5bec16a..4ec7e76 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -88,54 +88,6 @@ export const neighborSchema = z.object({ }); export type NeighborDto = z.infer; -export const authorThreadSchema = z.object({ - thread: threadSchema, - strongestSameAuthorMatch: neighborSchema.nullable(), -}); -export type AuthorThreadDto = z.infer; - -export const authorThreadsResponseSchema = z.object({ - repository: repositorySchema, - authorLogin: z.string(), - threads: z.array(authorThreadSchema), -}); -export type AuthorThreadsResponse = z.infer; - -export const actorSchema = z.object({ - id: z.number().int().positive(), - provider: z.string(), - providerUserId: z.string(), - login: z.string(), - displayName: z.string().nullable(), - actorType: z.string().nullable(), - siteAdmin: z.boolean(), - firstSeenAt: z.string(), - lastSeenAt: z.string(), - updatedAt: z.string(), -}); -export type ActorDto = z.infer; - -export const authorStatsSchema = z.object({ - openedIssueCount: z.number().int().nonnegative(), - openedPullRequestCount: z.number().int().nonnegative(), - commentCount: z.number().int().nonnegative(), - mergedPullRequestCount: z.number().int().nonnegative(), - closedThreadCount: z.number().int().nonnegative(), - firstActivityAt: z.string().nullable(), - lastActivityAt: z.string().nullable(), - trustTier: z.string().nullable(), -}); -export type AuthorStatsDto = z.infer; - -export const authorResponseSchema = z.object({ - repository: repositorySchema, - authorLogin: z.string(), - actor: actorSchema.nullable(), - stats: authorStatsSchema, - threads: z.array(authorThreadSchema), -}); -export type AuthorResponse = z.infer; - export const searchHitSchema = z.object({ thread: threadSchema, keywordScore: z.number().nullable(), diff --git a/packages/api-core/src/api/server.test.ts b/packages/api-core/src/api/server.test.ts index f96c802..a3affda 100644 --- a/packages/api-core/src/api/server.test.ts +++ b/packages/api-core/src/api/server.test.ts @@ -2,8 +2,6 @@ import test from 'node:test'; import assert from 'node:assert/strict'; import { - authorResponseSchema, - authorThreadsResponseSchema, closeResponseSchema, clusterDetailResponseSchema, clusterExplainResponseSchema, @@ -29,7 +27,6 @@ test('health endpoint returns contract payload', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -43,7 +40,6 @@ test('health endpoint returns contract payload', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -82,7 +78,6 @@ test('runs endpoint returns recent pipeline history', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -96,7 +91,6 @@ test('runs endpoint returns recent pipeline history', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -146,7 +140,6 @@ test('neighbors endpoint returns contract payload', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -160,7 +153,6 @@ test('neighbors endpoint returns contract payload', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -241,7 +233,6 @@ test('threads endpoint can filter by a bulk number list', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -255,7 +246,6 @@ test('threads endpoint can filter by a bulk number list', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -303,111 +293,6 @@ test('threads endpoint can filter by a bulk number list', async () => { } }); -test('author-threads endpoint returns one author with strongest same-author matches', async () => { - const service = new GHCrawlService({ - config: { - workspaceRoot: process.cwd(), - configDir: '/tmp/ghcrawl-test', - configPath: '/tmp/ghcrawl-test/config.json', - configFileExists: true, - dbPath: ':memory:', - dbPathSource: 'config', - apiPort: 5179, - secretProvider: 'plaintext', - githubTokenSource: 'none', - openaiApiKeySource: 'none', - summaryModel: 'gpt-5-mini', - embedModel: 'text-embedding-3-large', - embeddingBasis: 'title_original', - vectorBackend: 'vectorlite', - embedBatchSize: 8, - embedConcurrency: 10, - embedMaxUnread: 20, - openSearchIndex: 'ghcrawl-threads', - tuiPreferences: {}, - }, - github: { - checkAuth: async () => undefined, - getRepo: async () => ({}), - listRepositoryIssues: async () => [], - getIssue: async () => ({}), - getPull: async () => ({}), - listIssueComments: async () => [], - listPullReviews: async () => [], - listPullReviewComments: async () => [], - listPullFiles: async () => [], - }, - }); - - const now = '2026-03-09T00:00:00Z'; - service.db - .prepare( - `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) - values (?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); - const insertThread = service.db.prepare( - `insert into threads ( - id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, - labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, - merged_at_gh, first_pulled_at, last_pulled_at, updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ); - insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'lqquan', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); - insertThread.run(11, 1, '101', 43, 'pull_request', 'open', 'Fix downloader hang', 'Implements a fix.', 'lqquan', 'User', 'https://github.com/openclaw/openclaw/pull/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); - service.db - .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) - .run(1, 1, 'openclaw/openclaw', 'completed', now, now); - service.db - .prepare( - `insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at) - values (?, ?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 1, 10, 11, 'exact_cosine', 0.91, '{}', now); - service.db - .prepare( - `insert into actors ( - id, provider, provider_user_id, login, display_name, actor_type, site_admin, raw_json_blob_id, - first_seen_at, last_seen_at, updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 'github', '501', 'lqquan', null, 'User', 0, null, now, now, now); - service.db - .prepare( - `insert into actor_repo_stats ( - repo_id, actor_id, opened_issues, opened_prs, comments, merged_prs, closed_threads, - first_activity_at, last_activity_at, trust_tier - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 1, 1, 1, 0, 0, 0, now, now, 'unknown'); - - const server = createApiServer(service); - try { - await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); - const address = server.address(); - assert(address && typeof address === 'object'); - - const response = await fetch( - `http://127.0.0.1:${address.port}/author-threads?owner=openclaw&repo=openclaw&login=lqquan`, - ); - assert.equal(response.status, 200); - const payload = authorThreadsResponseSchema.parse((await response.json()) as unknown); - assert.equal(payload.authorLogin, 'lqquan'); - assert.deepEqual(payload.threads.map((item) => item.thread.number), [43, 42]); - assert.equal(payload.threads[0]?.strongestSameAuthorMatch?.number, 42); - - const authorResponse = await fetch(`http://127.0.0.1:${address.port}/author?owner=openclaw&repo=openclaw&login=lqquan`); - assert.equal(authorResponse.status, 200); - const author = authorResponseSchema.parse((await authorResponse.json()) as unknown); - assert.equal(author.actor?.providerUserId, '501'); - assert.equal(author.stats.openedIssueCount, 1); - assert.deepEqual(author.threads.map((item) => item.thread.number), [43, 42]); - } finally { - await new Promise((resolve, reject) => server.close((error) => (error ? reject(error) : resolve()))); - service.close(); - } -}); - test('close-thread and includeClosed thread routes expose locally closed items', async () => { const service = new GHCrawlService({ config: { @@ -418,7 +303,6 @@ test('close-thread and includeClosed thread routes expose locally closed items', dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -432,7 +316,6 @@ test('close-thread and includeClosed thread routes expose locally closed items', tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -504,7 +387,6 @@ test('exclude cluster member action records a durable override', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -518,7 +400,6 @@ test('exclude cluster member action records a durable override', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -596,7 +477,6 @@ test('set cluster canonical action records a durable override', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -610,7 +490,6 @@ test('set cluster canonical action records a durable override', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -700,7 +579,6 @@ test('durable clusters endpoint returns stable cluster state', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -714,7 +592,6 @@ test('durable clusters endpoint returns stable cluster state', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -767,7 +644,6 @@ test('server returns 400 for malformed request inputs', async () => { dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -781,7 +657,6 @@ test('server returns 400 for malformed request inputs', async () => { tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -824,7 +699,6 @@ test('cluster summary and detail endpoints return contract payloads', async () = dbPath: ':memory:', dbPathSource: 'config', apiPort: 5179, - secretProvider: 'plaintext', githubTokenSource: 'none', openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -838,7 +712,6 @@ test('cluster summary and detail endpoints return contract payloads', async () = tuiPreferences: {}, }, github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 64d2bc8..0f82d06 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -84,30 +84,6 @@ export function createApiServer(service: GHCrawlService): http.Server { return; } - if (req.method === 'GET' && url.pathname === '/author-threads') { - const params = parseRepoParams(url); - const login = (url.searchParams.get('login') ?? '').trim(); - if (!login) { - sendJson(res, 400, { error: 'Missing login parameter' }); - return; - } - const includeClosed = url.searchParams.get('includeClosed') === 'true'; - sendJson(res, 200, service.listAuthorThreads({ ...params, login, includeClosed })); - return; - } - - if (req.method === 'GET' && url.pathname === '/author') { - const params = parseRepoParams(url); - const login = (url.searchParams.get('login') ?? '').trim(); - if (!login) { - sendJson(res, 400, { error: 'Missing login parameter' }); - return; - } - const includeClosed = url.searchParams.get('includeClosed') === 'true'; - sendJson(res, 200, service.getAuthor({ ...params, login, includeClosed })); - return; - } - if (req.method === 'GET' && url.pathname === '/search') { const params = parseRepoParams(url); const query = url.searchParams.get('query'); diff --git a/packages/api-core/src/cluster/perf.integration.ts b/packages/api-core/src/cluster/perf.integration.ts index 7b9da8e..65dca79 100644 --- a/packages/api-core/src/cluster/perf.integration.ts +++ b/packages/api-core/src/cluster/perf.integration.ts @@ -173,7 +173,6 @@ function buildSuggestedBaseline(result: PerfRunResult): SuggestedBaseline | null function createGitHubStub(): GHCrawlService['github'] { return { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -197,7 +196,6 @@ function createService(dbPath: string): GHCrawlService { apiPort: 5179, githubToken: 'ghp_testtoken1234567890', githubTokenSource: 'config', - secretProvider: 'plaintext', tuiPreferences: {}, openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index aa01362..b745c9a 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -11,9 +11,7 @@ import { scoreSimilarityEvidence } from './evidence-score.js'; import { createPipelineRun, finishPipelineRun, - refreshActorRepoStats, recordClusterEvent, - upsertActor, upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, @@ -136,40 +134,6 @@ test('persistent cluster store upserts edge evidence and governed memberships', } }); -test('persistent cluster store upserts actors and recomputes repo stats', () => { - const db = openDb(':memory:'); - try { - migrate(db); - seedRepoAndThreads(db); - const actorId = upsertActor(db, { - providerUserId: 'alice-id', - login: 'alice', - displayName: 'Alice', - actorType: 'User', - rawJson: '{"login":"alice"}', - }); - db.prepare( - `insert into comments ( - thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ).run(10, 'c1', 'issue_comment', 'alice', 'User', 'confirmed', 0, '{}', '2026-01-02T00:00:00Z', '2026-01-02T00:00:00Z'); - - refreshActorRepoStats(db, 1); - - const actor = db.prepare('select login, display_name from actors where id = ?').get(actorId) as { login: string; display_name: string }; - const stats = db.prepare('select opened_prs, comments, trust_tier from actor_repo_stats where repo_id = ? and actor_id = ?').get(1, actorId) as { - opened_prs: number; - comments: number; - trust_tier: string; - }; - - assert.deepEqual(actor, { login: 'alice', display_name: 'Alice' }); - assert.deepEqual(stats, { opened_prs: 2, comments: 1, trust_tier: 'unknown' }); - } finally { - db.close(); - } -}); - test('persistent cluster store records thread revisions and deterministic fingerprints', () => { const db = openDb(':memory:'); try { diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index f945151..0e1317a 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -55,102 +55,6 @@ function upsertTextBlob( export type PipelineRunKind = 'sync' | 'fingerprint' | 'enrich' | 'edge' | 'cluster' | 'cluster_incremental'; -export function upsertActor( - db: SqliteDatabase, - params: { - provider?: 'github'; - providerUserId: string; - login: string; - displayName?: string | null; - actorType?: string | null; - siteAdmin?: boolean; - rawJson?: string | null; - }, -): number { - const timestamp = nowIso(); - const rawJsonBlobId = - params.rawJson && params.rawJson !== '{}' - ? upsertInlineBlob(db, { - text: params.rawJson, - mediaType: 'application/vnd.ghcrawl.actor.raw+json', - }) - : null; - db.prepare( - `insert into actors ( - provider, provider_user_id, login, display_name, actor_type, site_admin, - raw_json_blob_id, first_seen_at, last_seen_at, updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(provider, provider_user_id) do update set - login = excluded.login, - display_name = excluded.display_name, - actor_type = excluded.actor_type, - site_admin = excluded.site_admin, - raw_json_blob_id = excluded.raw_json_blob_id, - last_seen_at = excluded.last_seen_at, - updated_at = excluded.updated_at`, - ).run( - params.provider ?? 'github', - params.providerUserId, - params.login, - params.displayName ?? null, - params.actorType ?? null, - params.siteAdmin ? 1 : 0, - rawJsonBlobId, - timestamp, - timestamp, - timestamp, - ); - const row = db - .prepare('select id from actors where provider = ? and provider_user_id = ? limit 1') - .get(params.provider ?? 'github', params.providerUserId) as { id: number }; - return row.id; -} - -export function refreshActorRepoStats(db: SqliteDatabase, repoId: number): void { - db.prepare('delete from actor_repo_stats where repo_id = ?').run(repoId); - db.prepare( - `insert into actor_repo_stats ( - repo_id, actor_id, opened_issues, opened_prs, comments, merged_prs, closed_threads, first_activity_at, last_activity_at, trust_tier - ) - select - ?, - a.id, - (select count(*) from threads t where t.repo_id = ? and t.kind = 'issue' and lower(t.author_login) = lower(a.login)), - (select count(*) from threads t where t.repo_id = ? and t.kind = 'pull_request' and lower(t.author_login) = lower(a.login)), - (select count(*) - from comments c - join threads t on t.id = c.thread_id - where t.repo_id = ? and lower(c.author_login) = lower(a.login)), - (select count(*) from threads t where t.repo_id = ? and t.kind = 'pull_request' and t.merged_at_gh is not null and lower(t.author_login) = lower(a.login)), - (select count(*) from threads t where t.repo_id = ? and t.closed_at_gh is not null and lower(t.author_login) = lower(a.login)), - (select min(activity_at) - from ( - select created_at_gh as activity_at from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login) - union all - select c.created_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(a.login) - ) - where activity_at is not null), - (select max(activity_at) - from ( - select updated_at_gh as activity_at from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login) - union all - select c.updated_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(a.login) - ) - where activity_at is not null), - case - when a.actor_type = 'Bot' then 'bot' - when (select count(*) from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login)) >= 3 then 'repeat_contributor' - else 'unknown' - end - from actors a - where exists (select 1 from threads t where t.repo_id = ? and lower(t.author_login) = lower(a.login)) - or exists ( - select 1 from comments c join threads t on t.id = c.thread_id - where t.repo_id = ? and lower(c.author_login) = lower(a.login) - )`, - ).run(repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId, repoId); -} - export function upsertThreadRevision( db: SqliteDatabase, params: { @@ -570,12 +474,11 @@ export function recordClusterEvent( runId?: number | null; eventType: string; actorKind: 'algo' | 'user' | 'import'; - actorId?: number | null; payload: unknown; }, ): void { db.prepare( - `insert into cluster_events (cluster_id, run_id, event_type, actor_kind, actor_id, payload_json, created_at) - values (?, ?, ?, ?, ?, ?, ?)`, - ).run(params.clusterId, params.runId ?? null, params.eventType, params.actorKind, params.actorId ?? null, JSON.stringify(params.payload), nowIso()); + `insert into cluster_events (cluster_id, run_id, event_type, actor_kind, payload_json, created_at) + values (?, ?, ?, ?, ?, ?)`, + ).run(params.clusterId, params.runId ?? null, params.eventType, params.actorKind, JSON.stringify(params.payload), nowIso()); } diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index 7b7895f..4cc1b77 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -7,8 +7,6 @@ import path from 'node:path'; import { getConfigPath, getTuiRepositoryPreference, - isLikelyGitHubToken, - isLikelyOpenAiApiKey, loadConfig, readPersistedConfig, writeTuiRepositoryPreference, @@ -218,7 +216,7 @@ test('config path override redirects persisted config reads and writes', () => { assert.equal(loaded.configDir, path.dirname(overridePath)); }); -test('loadConfig restores op metadata and repository tui preferences', () => { +test('loadConfig restores repository tui preferences', () => { const home = makeTempHome(); const workspace = fs.mkdtempSync(path.join(os.tmpdir(), 'ghcrawl-workspace-')); fs.writeFileSync(path.join(workspace, 'pnpm-workspace.yaml'), 'packages:\n - "packages/*"\n'); @@ -229,9 +227,6 @@ test('loadConfig restores op metadata and repository tui preferences', () => { writePersistedConfig( { - secretProvider: 'op', - opVaultName: 'PwrDrvr LLC', - opItemName: 'ghcrawl', tuiPreferences: { 'openclaw/openclaw': { minClusterSize: 1, @@ -244,9 +239,6 @@ test('loadConfig restores op metadata and repository tui preferences', () => { ); const config = loadConfig({ cwd: workspace, env }); - assert.equal(config.secretProvider, 'op'); - assert.equal(config.opVaultName, 'PwrDrvr LLC'); - assert.equal(config.opItemName, 'ghcrawl'); assert.deepEqual(getTuiRepositoryPreference(config, 'openclaw', 'openclaw'), { minClusterSize: 1, sortMode: 'size', @@ -316,13 +308,3 @@ test('loadConfig rejects invalid embed queue settings', () => { }), ); }); - -test('token format helpers match expected API key shapes', () => { - assert.equal(isLikelyGitHubToken('ghp_testtoken1234567890'), true); - assert.equal(isLikelyGitHubToken('github_pat_1234567890abcdefghijklmnopqrstuvwxyz'), true); - assert.equal(isLikelyGitHubToken('not-a-token'), false); - - assert.equal(isLikelyOpenAiApiKey('sk-proj-testkey1234567890'), true); - assert.equal(isLikelyOpenAiApiKey('sk-testkey1234567890'), true); - assert.equal(isLikelyOpenAiApiKey('openai-key'), false); -}); diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index d159709..a8c6553 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -5,7 +5,6 @@ import path from 'node:path'; import dotenv from 'dotenv'; export type ConfigValueSource = 'env' | 'config' | 'dotenv' | 'default' | 'none'; -export type SecretProvider = 'plaintext' | 'op'; export type TuiSortPreference = 'recent' | 'size'; export type TuiMinClusterSize = 0 | 1 | 10 | 20 | 50; export type TuiWideLayoutPreference = 'columns' | 'right-stack'; @@ -21,9 +20,6 @@ export type TuiRepositoryPreference = { export type PersistedGitcrawlConfig = { githubToken?: string; openaiApiKey?: string; - secretProvider?: SecretProvider; - opVaultName?: string; - opItemName?: string; dbPath?: string; apiPort?: number; summaryModel?: string; @@ -50,9 +46,6 @@ export type GitcrawlConfig = { githubTokenSource: ConfigValueSource; openaiApiKey?: string; openaiApiKeySource: ConfigValueSource; - secretProvider: SecretProvider; - opVaultName?: string; - opItemName?: string; summaryModel: string; embedModel: string; embeddingBasis: EmbeddingBasis; @@ -164,10 +157,6 @@ function getNumber(value: unknown): number | undefined { return typeof value === 'number' && Number.isFinite(value) ? value : undefined; } -function getSecretProvider(value: unknown): SecretProvider | undefined { - return value === 'plaintext' || value === 'op' ? value : undefined; -} - function getTuiSortPreference(value: unknown): TuiSortPreference | undefined { return value === 'recent' || value === 'size' ? value : undefined; } @@ -226,9 +215,6 @@ export function readPersistedConfig(options: LoadConfigOptions = {}): LoadedStor data: { githubToken: getString(raw.githubToken), openaiApiKey: getString(raw.openaiApiKey), - secretProvider: getSecretProvider(raw.secretProvider), - opVaultName: getString(raw.opVaultName), - opItemName: getString(raw.opItemName), dbPath: getString(raw.dbPath), apiPort: getNumber(raw.apiPort), summaryModel: getString(raw.summaryModel), @@ -273,14 +259,6 @@ function parseIntegerSetting(name: string, raw: string): number { return parsed; } -export function isLikelyGitHubToken(value: string): boolean { - return /^(gh[pousr]_[A-Za-z0-9_]+|github_pat_[A-Za-z0-9_]+)$/.test(value.trim()); -} - -export function isLikelyOpenAiApiKey(value: string): boolean { - return /^sk-[A-Za-z0-9._-]+$/.test(value.trim()); -} - export function loadConfig(options: LoadConfigOptions = {}): GitcrawlConfig { const cwd = options.cwd ?? process.cwd(); const env = options.env ?? process.env; @@ -398,9 +376,6 @@ export function loadConfig(options: LoadConfigOptions = {}): GitcrawlConfig { githubTokenSource: githubToken.source, openaiApiKey: openaiApiKey.value, openaiApiKeySource: openaiApiKey.source, - secretProvider: stored.data.secretProvider ?? 'plaintext', - opVaultName: stored.data.opVaultName, - opItemName: stored.data.opItemName, summaryModel: summaryModel.value ?? 'gpt-5-mini', embedModel: embedModel.value ?? 'text-embedding-3-large', embeddingBasis: embeddingBasis.value ?? 'title_original', @@ -453,24 +428,14 @@ export function writeTuiRepositoryPreference( export function requireGithubToken(config: GitcrawlConfig): string { if (!config.githubToken) { - if (config.secretProvider === 'op' && config.opVaultName && config.opItemName) { - throw new Error( - `Missing GitHub token in the environment. This config is set to use 1Password CLI via ${config.opVaultName}/${config.opItemName}; run ghcrawl through your op wrapper or set GITHUB_TOKEN. Expected config at ${config.configPath}`, - ); - } - throw new Error(`Missing GitHub token. Run ghcrawl init or set GITHUB_TOKEN. Expected config at ${config.configPath}`); + throw new Error(`Missing GitHub token. Set GITHUB_TOKEN or add githubToken to ${config.configPath}.`); } return config.githubToken; } export function requireOpenAiKey(config: GitcrawlConfig): string { if (!config.openaiApiKey) { - if (config.secretProvider === 'op' && config.opVaultName && config.opItemName) { - throw new Error( - `Missing OpenAI API key in the environment. This config is set to use 1Password CLI via ${config.opVaultName}/${config.opItemName}; run ghcrawl through your op wrapper or set OPENAI_API_KEY. Expected config at ${config.configPath}`, - ); - } - throw new Error(`Missing OpenAI API key. Run ghcrawl init or set OPENAI_API_KEY. Expected config at ${config.configPath}`); + throw new Error(`Missing OpenAI API key. Set OPENAI_API_KEY or add openaiApiKey to ${config.configPath}.`); } return config.openaiApiKey; } diff --git a/packages/api-core/src/db/migrate.test.ts b/packages/api-core/src/db/migrate.test.ts index 1ad4115..cc71748 100644 --- a/packages/api-core/src/db/migrate.test.ts +++ b/packages/api-core/src/db/migrate.test.ts @@ -19,7 +19,6 @@ test('migrate creates core tables', () => { assert.ok(names.includes('document_embeddings')); assert.ok(names.includes('thread_vectors')); assert.ok(names.includes('blobs')); - assert.ok(names.includes('actors')); assert.ok(names.includes('thread_revisions')); assert.ok(names.includes('thread_fingerprints')); assert.ok(names.includes('thread_key_summaries')); diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index d51eedc..1fb744d 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -71,37 +71,6 @@ const migrationStatements = [ ) `, ` - create table if not exists actors ( - id integer primary key, - provider text not null, - provider_user_id text not null, - login text not null, - display_name text, - actor_type text, - site_admin integer not null default 0, - raw_json_blob_id integer references blobs(id) on delete set null, - first_seen_at text not null, - last_seen_at text not null, - updated_at text not null, - unique(provider, provider_user_id) - ) - `, - ` - create table if not exists actor_repo_stats ( - repo_id integer not null references repositories(id) on delete cascade, - actor_id integer not null references actors(id) on delete cascade, - opened_issues integer not null default 0, - opened_prs integer not null default 0, - comments integer not null default 0, - merged_prs integer not null default 0, - closed_threads integer not null default 0, - first_activity_at text, - last_activity_at text, - trust_tier text, - primary key (repo_id, actor_id) - ) - `, - ` create table if not exists thread_revisions ( id integer primary key, thread_id integer not null references threads(id) on delete cascade, @@ -454,7 +423,6 @@ const migrationStatements = [ cluster_id integer not null references cluster_groups(id) on delete cascade, thread_id integer not null references threads(id) on delete cascade, action text not null, - actor_id integer references actors(id) on delete set null, reason text, created_at text not null, expires_at text, @@ -468,7 +436,6 @@ const migrationStatements = [ run_id integer references pipeline_runs(id) on delete set null, event_type text not null, actor_kind text not null, - actor_id integer references actors(id) on delete set null, payload_json text not null, created_at text not null ) @@ -539,8 +506,6 @@ export function migrate(db: SqliteDatabase): void { db.exec('create index if not exists idx_threads_repo_number on threads(repo_id, number)'); db.exec('create index if not exists idx_blobs_sha256 on blobs(sha256)'); - db.exec('create index if not exists idx_actors_provider_login on actors(provider, login)'); - db.exec('create index if not exists idx_actor_repo_stats_actor on actor_repo_stats(actor_id)'); db.exec('create index if not exists idx_thread_revisions_thread_created on thread_revisions(thread_id, created_at)'); db.exec('create index if not exists idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash)'); db.exec('create index if not exists idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug)'); diff --git a/packages/api-core/src/github/client.ts b/packages/api-core/src/github/client.ts index 62740c5..2eeb073 100644 --- a/packages/api-core/src/github/client.ts +++ b/packages/api-core/src/github/client.ts @@ -3,7 +3,6 @@ import { throttling } from '@octokit/plugin-throttling'; import { Octokit } from 'octokit'; export type GitHubClient = { - checkAuth: (reporter?: GitHubReporter) => Promise; getRepo: (owner: string, repo: string, reporter?: GitHubReporter) => Promise>; listRepositoryIssues: ( owner: string, @@ -157,11 +156,6 @@ export function makeGitHubClient(options: RequestOptions): GitHubClient { } return { - async checkAuth(reporter) { - await request('GET /rate_limit', reporter, async (octokit) => { - await octokit.request('GET /rate_limit'); - }); - }, async getRepo(owner, repo, reporter) { return request(`GET /repos/${owner}/${repo}`, reporter, async (octokit) => { const response = await octokit.rest.repos.get({ owner, repo }); diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index 71ee588..db7204e 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -22,7 +22,6 @@ export type SummaryUsage = { export type AiProvider = { providerName?: string; - checkAuth: () => Promise; summarizeThread: (params: { model: string; text: string }) => Promise<{ summary: SummaryResult; usage?: SummaryUsage }>; generateKeySummary?: (params: { model: string; text: string }) => Promise<{ summary: LlmKeySummary; usage?: SummaryUsage }>; embedTexts: (params: { model: string; texts: string[]; dimensions?: number }) => Promise; @@ -44,10 +43,6 @@ export class OpenAiProvider implements AiProvider { this.client = new OpenAI({ apiKey }); } - async checkAuth(): Promise { - await this.client.models.list(); - } - async summarizeThread(params: { model: string; text: string }): Promise<{ summary: SummaryResult; usage?: SummaryUsage }> { const format = zodTextFormat(summarySchema, 'ghcrawl_thread_summary'); let lastError: Error | null = null; diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index c49e723..d2c6c32 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -19,7 +19,6 @@ function makeTestConfig(overrides: Partial = {}): GHCr apiPort: 5179, githubToken: 'ghp_testtoken1234567890', githubTokenSource: 'config', - secretProvider: 'plaintext', tuiPreferences: {}, openaiApiKeySource: 'none', summaryModel: 'gpt-5-mini', @@ -53,18 +52,13 @@ function makeEmbedding(seed: number, variant = 0): number[] { }); } -test('doctor reports config path and successful auth smoke checks', async () => { - let githubChecked = 0; - let openAiChecked = 0; +test('doctor reports config path and token presence without network auth checks', async () => { const service = new GHCrawlService({ config: makeTestConfig({ openaiApiKey: 'sk-proj-testkey1234567890', openaiApiKeySource: 'config', }), github: { - checkAuth: async () => { - githubChecked += 1; - }, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -75,9 +69,6 @@ test('doctor reports config path and successful auth smoke checks', async () => listPullFiles: async () => [], }, ai: { - checkAuth: async () => { - openAiChecked += 1; - }, summarizeThread: async () => { throw new Error('not expected'); }, @@ -88,29 +79,22 @@ test('doctor reports config path and successful auth smoke checks', async () => try { const result = await service.doctor(); assert.equal(result.health.configPath, service.config.configPath); - assert.equal(result.github.formatOk, true); - assert.equal(result.github.authOk, true); - assert.equal(result.openai.formatOk, true); - assert.equal(result.openai.authOk, true); + assert.equal(result.github.tokenPresent, true); + assert.equal(result.openai.tokenPresent, true); assert.equal(result.vectorlite.configured, true); assert.equal(result.vectorlite.runtimeOk, true); - assert.equal(githubChecked, 1); - assert.equal(openAiChecked, 1); } finally { service.close(); } }); -test('doctor reports invalid token format without attempting auth', async () => { - let githubChecked = 0; +test('doctor reports missing GitHub token without attempting network auth', async () => { const service = new GHCrawlService({ config: makeTestConfig({ - githubToken: 'not-a-token', + githubToken: undefined, + githubTokenSource: 'none', }), github: { - checkAuth: async () => { - githubChecked += 1; - }, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -122,36 +106,11 @@ test('doctor reports invalid token format without attempting auth', async () => }, }); - try { - const result = await service.doctor(); - assert.equal(result.github.formatOk, false); - assert.equal(result.github.authOk, false); - assert.match(result.github.error ?? '', /does not look like a GitHub personal access token/); - assert.equal(githubChecked, 0); - } finally { - service.close(); - } -}); - -test('doctor explains when secrets are expected from 1Password CLI env injection', async () => { - const service = new GHCrawlService({ - config: makeTestConfig({ - githubToken: undefined, - githubTokenSource: 'none', - openaiApiKey: undefined, - openaiApiKeySource: 'none', - secretProvider: 'op', - opVaultName: 'PwrDrvr LLC', - opItemName: 'ghcrawl', - }), - }); - try { const result = await service.doctor(); assert.equal(result.github.configured, false); - assert.match(result.github.error ?? '', /1Password CLI/); - assert.equal(result.openai.configured, false); - assert.match(result.openai.error ?? '', /OPENAI_API_KEY/); + assert.equal(result.github.tokenPresent, false); + assert.match(result.github.error ?? '', /GITHUB_TOKEN/); } finally { service.close(); } @@ -159,7 +118,6 @@ test('doctor explains when secrets are expected from 1Password CLI env injection test('listRunHistory returns recent runs across pipeline tables', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -208,7 +166,6 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and let listPullReviewCommentCalls = 0; let listPullFileCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, _since, limit) => [ @@ -309,9 +266,6 @@ test('syncRepository defaults to metadata-only mode, preserves thread kind, and service.listThreads({ owner: 'openclaw', repo: 'openclaw', numbers: [43, 42, 999] }).threads.map((thread) => thread.number), [43, 42], ); - const authorThreads = service.listAuthorThreads({ owner: 'openclaw', repo: 'openclaw', login: 'alice' }); - assert.equal(authorThreads.authorLogin, 'alice'); - assert.deepEqual(authorThreads.threads.map((item) => item.thread.number), [43, 42]); assert.equal(listIssueCommentCalls, 0); assert.equal(listPullReviewCalls, 0); assert.equal(listPullReviewCommentCalls, 0); @@ -351,7 +305,6 @@ test('syncRepository fetches comments, reviews, and review comments when include let listPullReviewCommentCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -453,7 +406,6 @@ test('syncRepository fetches comments, reviews, and review comments when include test('syncRepository hydrates pull request code snapshots when includeCode is enabled', async () => { let listPullFileCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -535,7 +487,6 @@ test('summarizeRepository excludes hydrated comments by default and reports toke const summaryInputs: string[] = []; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -550,7 +501,6 @@ test('summarizeRepository excludes hydrated comments by default and reports toke listPullFiles: async () => [], }, { - checkAuth: async () => undefined, summarizeThread: async ({ text }) => { summaryInputs.push(text); return { @@ -644,7 +594,6 @@ test('summarizeRepository includes hydrated human comments when includeComments const summaryInputs: string[] = []; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -659,7 +608,6 @@ test('summarizeRepository includes hydrated human comments when includeComments listPullFiles: async () => [], }, { - checkAuth: async () => undefined, summarizeThread: async ({ text }) => { summaryInputs.push(text); return { @@ -752,7 +700,6 @@ test('summarizeRepository prices progress output using the configured summary mo const progress: string[] = []; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -767,7 +714,6 @@ test('summarizeRepository prices progress output using the configured summary mo listPullFiles: async () => [], }, { - checkAuth: async () => undefined, summarizeThread: async () => ({ summary: { problemSummary: 'Problem', @@ -846,7 +792,6 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { let calls = 0; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -858,7 +803,6 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { }, { providerName: 'test-agent', - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -918,7 +862,6 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { test('purgeComments removes hydrated comments and refreshes canonical documents', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1016,7 +959,6 @@ test('embedRepository batches multi-source embeddings and skips unchanged inputs const embedCalls: string[][] = []; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1031,7 +973,6 @@ test('embedRepository batches multi-source embeddings and skips unchanged inputs listPullFiles: async () => [], }, { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1125,7 +1066,6 @@ test('embedRepository can use stored 3-line key summaries as active vector input const service = new GHCrawlService({ config: makeTestConfig({ embeddingBasis: 'llm_key_summary' }), github: { - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -1136,7 +1076,6 @@ test('embedRepository can use stored 3-line key summaries as active vector input listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1203,7 +1142,6 @@ test('listNeighbors uses the vectorlite sidecar for current active vectors', asy const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1218,7 +1156,6 @@ test('listNeighbors uses the vectorlite sidecar for current active vectors', asy listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1264,7 +1201,6 @@ test('embedRepository prunes closed vectors before reusing current active vector const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1279,7 +1215,6 @@ test('embedRepository prunes closed vectors before reusing current active vector listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1348,7 +1283,6 @@ test('embedRepository truncates oversized inputs before submission', async () => embedMaxUnread: 2, }), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1363,7 +1297,6 @@ test('embedRepository truncates oversized inputs before submission', async () => listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1473,7 +1406,6 @@ test('embedRepository isolates a failing oversized item from a mixed batch and r embedMaxUnread: 2, }), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1488,7 +1420,6 @@ test('embedRepository isolates a failing oversized item from a mixed batch and r listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1601,7 +1532,6 @@ test('embedRepository recovers from wrapped maximum input length errors by shrin embedMaxUnread: 2, }), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -1616,7 +1546,6 @@ test('embedRepository recovers from wrapped maximum input length errors by shrin listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -1730,7 +1659,6 @@ test('embedRepository recovers from wrapped maximum input length errors by shrin test('listNeighbors returns exact nearest neighbors for an embedded thread', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -1812,72 +1740,6 @@ test('listNeighbors returns exact nearest neighbors for an embedded thread', () } }); -test('listAuthorThreads returns one author view with strongest same-author match from stored cluster edges', () => { - const service = makeTestService({ - checkAuth: async () => undefined, - getRepo: async () => ({}), - listRepositoryIssues: async () => [], - getIssue: async () => ({}), - getPull: async () => ({}), - listIssueComments: async () => [], - listPullReviews: async () => [], - listPullReviewComments: async () => [], - listPullFiles: async () => [], - }); - - try { - const now = '2026-03-09T00:00:00Z'; - service.db - .prepare( - `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) - values (?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); - - const insertThread = service.db.prepare( - `insert into threads ( - id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, - labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, - merged_at_gh, first_pulled_at, last_pulled_at, updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ); - insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'lqquan', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); - insertThread.run(11, 1, '101', 43, 'pull_request', 'open', 'Fix downloader hang', 'Implements a fix.', 'lqquan', 'User', 'https://github.com/openclaw/openclaw/pull/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); - insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Retry issue', 'Retries are broken.', 'other', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); - - service.db - .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) - .run(1, 1, 'openclaw/openclaw', 'completed', now, now); - service.db - .prepare( - `insert into clusters (id, repo_id, cluster_run_id, representative_thread_id, member_count, created_at) - values (?, ?, ?, ?, ?, ?)`, - ) - .run(100, 1, 1, 10, 2, now); - service.db - .prepare(`insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)`) - .run(100, 10, null, now); - service.db - .prepare(`insert into cluster_members (cluster_id, thread_id, score_to_representative, created_at) values (?, ?, ?, ?)`) - .run(100, 11, 0.91, now); - service.db - .prepare( - `insert into similarity_edges (repo_id, cluster_run_id, left_thread_id, right_thread_id, method, score, explanation_json, created_at) - values (?, ?, ?, ?, ?, ?, ?, ?)`, - ) - .run(1, 1, 10, 11, 'exact_cosine', 0.91, '{}', now); - - const result = service.listAuthorThreads({ owner: 'openclaw', repo: 'openclaw', login: 'lqquan' }); - - assert.deepEqual(result.threads.map((item) => item.thread.number), [43, 42]); - assert.equal(result.threads[0]?.strongestSameAuthorMatch?.number, 42); - assert.equal(result.threads[0]?.strongestSameAuthorMatch?.score, 0.91); - assert.equal(result.threads[1]?.strongestSameAuthorMatch?.number, 43); - } finally { - service.close(); - } -}); - test('clusterRepository emits timed progress updates while identifying similarities', async () => { const messages: string[] = []; const originalDateNow = Date.now; @@ -1888,7 +1750,6 @@ test('clusterRepository emits timed progress updates while identifying similarit }; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -1945,7 +1806,6 @@ test('clusterRepository emits timed progress updates while identifying similarit test('clusterRepository merges source kinds into one edge without directional duplicates', async () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -2008,7 +1868,6 @@ test('clusterRepository merges source kinds into one edge without directional du test('clusterRepository prunes older cluster runs for the repo after a successful rebuild', async () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -2071,7 +1930,6 @@ test('clusterRepository purges legacy embeddings and inline vector payloads afte const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2086,7 +1944,6 @@ test('clusterRepository purges legacy embeddings and inline vector payloads afte listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -2177,7 +2034,6 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a config: makeTestConfig(), vectorStore, github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2192,7 +2048,6 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -2266,7 +2121,6 @@ test('clusterRepository falls back to deterministic fingerprints when vectors ar const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2323,7 +2177,6 @@ test('clusterRepository preserves a forced canonical representative on rebuild', const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2392,7 +2245,6 @@ test('clusterRepository preserves a forced include on rebuild', async () => { const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2455,7 +2307,6 @@ test('mergeDurableClusters preserves source slug and force-includes active sourc const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2538,7 +2389,6 @@ test('splitDurableCluster creates a governed cluster and blocks source re-entry' const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2637,7 +2487,6 @@ test('clusterRepository materializes only changed deterministic fingerprints', a const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2711,7 +2560,6 @@ test('clusterRepository can refresh one durable neighborhood without replacing t const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -2785,7 +2633,6 @@ test('clusterRepository uses hydrated code hunk signatures without embeddings', const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -2868,7 +2715,6 @@ test('clusterRepository keeps deterministic hunk edges when active vectors are c const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -2926,7 +2772,6 @@ test('clusterRepository keeps deterministic hunk edges when active vectors are c ], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -2987,7 +2832,6 @@ test('embedRepository rebuilds a corrupted active vector store during upsert', a config: makeTestConfig(), vectorStore, github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -3002,7 +2846,6 @@ test('embedRepository rebuilds a corrupted active vector store during upsert', a listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -3042,7 +2885,6 @@ test('clusterExperiment falls back to active vectors when legacy embeddings are const service = new GHCrawlService({ config: makeTestConfig(), github: { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -3057,7 +2899,6 @@ test('clusterExperiment falls back to active vectors when legacy embeddings are listPullFiles: async () => [], }, ai: { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -3112,7 +2953,6 @@ test('clusterExperiment falls back to active vectors when legacy embeddings are test('clusterRepository does not retain a parsed embedding cache in-process', async () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3166,7 +3006,6 @@ test('clusterRepository does not retain a parsed embedding cache in-process', as test('tui snapshot returns mixed issue and pull request counts with default recent sort and filters', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3267,7 +3106,6 @@ test('tui snapshot returns mixed issue and pull request counts with default rece test('tui cluster detail and thread detail expose members, summaries, and neighbors', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3371,7 +3209,6 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb test('getTuiThreadDetail prefers stored cluster neighbors over exact embedding search', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3436,7 +3273,6 @@ test('refreshRepository runs sync, embed, and cluster in order and returns the c const messages: string[] = []; const service = makeTestService( { - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -3473,7 +3309,6 @@ test('refreshRepository runs sync, embed, and cluster in order and returns the c listPullFiles: async () => [], }, { - checkAuth: async () => undefined, summarizeThread: async () => { throw new Error('not expected'); }, @@ -3508,7 +3343,6 @@ test('refreshRepository runs sync, embed, and cluster in order and returns the c test('refreshRepository forwards includeCode to sync stage', async () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3549,7 +3383,6 @@ test('refreshRepository forwards includeCode to sync stage', async () => { test('agent cluster summary and detail dumps expose repo stats, snippets, and summaries', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3696,7 +3529,6 @@ test('agent cluster summary and detail dumps expose repo stats, snippets, and su test('getTuiThreadDetail can skip neighbor loading for fast browse paths', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [], getIssue: async () => { @@ -3777,7 +3609,6 @@ test('getTuiThreadDetail can skip neighbor loading for fast browse paths', () => test('local thread closure updates default thread filters and auto-closes fully closed clusters', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3863,7 +3694,6 @@ test('local thread closure updates default thread filters and auto-closes fully test('manual cluster closure is hidden from JSON summaries by default but remains visible in the tui snapshot', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -3929,7 +3759,6 @@ test('manual cluster closure is hidden from JSON summaries by default but remain test('excludeThreadFromCluster records a durable manual exclusion', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -4005,7 +3834,6 @@ test('excludeThreadFromCluster records a durable manual exclusion', () => { test('listDurableClusters returns stable slugs and governed member states', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -4062,7 +3890,6 @@ test('listDurableClusters returns stable slugs and governed member states', () = test('explainDurableCluster returns evidence and governance records', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({}), listRepositoryIssues: async () => [], getIssue: async () => ({}), @@ -4136,9 +3963,8 @@ test('explainDurableCluster returns evidence and governance records', () => { } }); -test('syncRepository records actors and repo stats from thread and comment authors', async () => { +test('syncRepository keeps source author fields without building actor profiles', async () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => [ { @@ -4182,33 +4008,17 @@ test('syncRepository records actors and repo stats from thread and comment autho limit: 1, }); - const actors = service.db.prepare('select login, actor_type from actors order by login').all() as Array<{ - login: string; - actor_type: string; - }>; - const stats = service.db - .prepare( - `select a.login, s.opened_issues, s.comments - from actor_repo_stats s - join actors a on a.id = s.actor_id - order by a.login`, - ) - .all() as Array<{ login: string; opened_issues: number; comments: number }>; - - assert.deepEqual(actors, [ - { login: 'alice', actor_type: 'User' }, - { login: 'bob', actor_type: 'User' }, - ]); - assert.deepEqual(stats, [ - { login: 'alice', opened_issues: 1, comments: 0 }, - { login: 'bob', opened_issues: 0, comments: 1 }, - ]); + const thread = service.db.prepare('select author_login, author_type from threads where number = 42').get() as { + author_login: string; + author_type: string; + }; + const comment = service.db.prepare('select author_login, author_type from comments where github_id = ?').get('900') as { + author_login: string; + author_type: string; + }; - const author = service.getAuthor({ owner: 'openclaw', repo: 'openclaw', login: 'alice' }); - assert.equal(author.actor?.providerUserId, '501'); - assert.equal(author.stats.openedIssueCount, 1); - assert.equal(author.stats.commentCount, 0); - assert.deepEqual(author.threads.map((item) => item.thread.number), [42]); + assert.deepEqual(thread, { author_login: 'alice', author_type: 'User' }); + assert.deepEqual(comment, { author_login: 'bob', author_type: 'User' }); } finally { service.close(); } @@ -4221,7 +4031,6 @@ test('syncRepository reconciles stale open threads and marks confirmed closures let closedListCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, _since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4329,7 +4138,6 @@ test('syncRepository treats missing stale pull requests as closed and continues' const messages: string[] = []; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async () => { listRepositoryIssuesCalls += 1; @@ -4416,7 +4224,6 @@ test('syncRepository skips stale-open reconciliation for filtered crawls', async let getIssueCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, _since, limit) => { listRepositoryIssuesCalls += 1; @@ -4481,7 +4288,6 @@ test('syncRepository leaves unseen stale open items alone by default when closed let getIssueCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, _since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4547,7 +4353,6 @@ test('syncRepository performs direct stale-open reconciliation when fullReconcil let openListCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, _since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4625,7 +4430,6 @@ test('syncRepository derives the default overlapping since window from the last const closedSinceValues: Array = []; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4727,7 +4531,6 @@ test('syncRepository uses an explicit since window for both open and closed over let openListCalls = 0; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4800,7 +4603,6 @@ test('syncRepository skips the closed overlap sweep on the first full scan with const closedSinceValues: Array = []; const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), listRepositoryIssues: async (_owner, _repo, since, _limit, _reporter, state = 'open') => { if (state === 'closed') { @@ -4860,7 +4662,6 @@ test('syncRepository skips the closed overlap sweep on the first full scan with test('repository-scoped reads and neighbors do not leak across repos in the same database', () => { const service = makeTestService({ - checkAuth: async () => undefined, getRepo: async () => ({ id: 1, full_name: 'owner-one/repo-one' }), listRepositoryIssues: async () => [], getIssue: async () => { diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2f8cd48..8ecda19 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -11,8 +11,6 @@ import { Worker } from 'node:worker_threads'; import { IterableMapper } from '@shutterstock/p-map-iterable'; import { actionResponseSchema, - authorResponseSchema, - authorThreadsResponseSchema, closeResponseSchema, clusterOverrideResponseSchema, clusterMergeResponseSchema, @@ -34,9 +32,6 @@ import { threadsResponseSchema, type ActionRequest, type ActionResponse, - type AuthorResponse, - type AuthorStatsDto, - type AuthorThreadsResponse, type CloseResponse, type ClusterMergeResponse, type ClusterOverrideResponse, @@ -78,9 +73,7 @@ import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-k import { createPipelineRun, finishPipelineRun, - refreshActorRepoStats, recordClusterEvent, - upsertActor, upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, @@ -96,8 +89,6 @@ import { } from './cluster/thread-fingerprint.js'; import { ensureRuntimeDirs, - isLikelyGitHubToken, - isLikelyOpenAiApiKey, loadConfig, requireGithubToken, requireOpenAiKey, @@ -369,15 +360,13 @@ export type DoctorResult = { github: { configured: boolean; source: ConfigValueSource; - formatOk: boolean; - authOk: boolean; + tokenPresent: boolean; error: string | null; }; openai: { configured: boolean; source: ConfigValueSource; - formatOk: boolean; - authOk: boolean; + tokenPresent: boolean; error: string | null; }; vectorlite: { @@ -616,19 +605,6 @@ function threadToDto(row: ThreadRow, clusterId?: number | null): ThreadDto { }; } -function emptyAuthorStats(): AuthorStatsDto { - return { - openedIssueCount: 0, - openedPullRequestCount: 0, - commentCount: 0, - mergedPullRequestCount: 0, - closedThreadCount: 0, - firstActivityAt: null, - lastActivityAt: null, - trustTier: null, - }; -} - export class GHCrawlService { readonly config: GitcrawlConfig; readonly db: SqliteDatabase; @@ -677,47 +653,20 @@ export class GHCrawlService { const github = { configured: Boolean(this.config.githubToken), source: this.config.githubTokenSource, - formatOk: this.config.githubToken ? isLikelyGitHubToken(this.config.githubToken) : false, - authOk: false, + tokenPresent: Boolean(this.config.githubToken), error: null as string | null, }; const openai = { configured: Boolean(this.config.openaiApiKey), source: this.config.openaiApiKeySource, - formatOk: this.config.openaiApiKey ? isLikelyOpenAiApiKey(this.config.openaiApiKey) : false, - authOk: false, + tokenPresent: Boolean(this.config.openaiApiKey), error: null as string | null, }; - if (!github.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) { - github.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so GITHUB_TOKEN is present in the environment.`; + if (!github.configured) { + github.error = 'Set GITHUB_TOKEN to crawl GitHub data.'; } - if (!openai.configured && this.config.secretProvider === 'op' && this.config.opVaultName && this.config.opItemName) { - openai.error = `Configured for 1Password CLI via ${this.config.opVaultName}/${this.config.opItemName}; run ghcrawl through your op wrapper so OPENAI_API_KEY is present in the environment.`; - } - if (github.configured) { - if (!github.formatOk) { - github.error = 'Token format does not look like a GitHub personal access token.'; - } else { - try { - await this.requireGithub().checkAuth(); - github.authOk = true; - } catch (error) { - github.error = error instanceof Error ? error.message : String(error); - } - } - } - - if (openai.configured) { - if (!openai.formatOk) { - openai.error = 'Key format does not look like an OpenAI API key.'; - } else { - try { - await this.requireAi().checkAuth(); - openai.authOk = true; - } catch (error) { - openai.error = error instanceof Error ? error.message : String(error); - } - } + if (!openai.configured) { + openai.error = 'Set OPENAI_API_KEY only for summary or embedding commands.'; } const vectorliteHealth = this.vectorStore.checkRuntime(); @@ -834,275 +783,6 @@ export class GHCrawlService { }); } - listAuthorThreads(params: { owner: string; repo: string; login: string; includeClosed?: boolean }): AuthorThreadsResponse { - const repository = this.requireRepository(params.owner, params.repo); - const normalizedLogin = params.login.trim(); - if (!normalizedLogin) { - return authorThreadsResponseSchema.parse({ - repository, - authorLogin: '', - threads: [], - }); - } - - const clusterIds = new Map(); - const clusterRows = this.db - .prepare( - `select cm.thread_id, cm.cluster_id - from cluster_members cm - join clusters c on c.id = cm.cluster_id - where c.repo_id = ? and c.cluster_run_id = ( - select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1 - )`, - ) - .all(repository.id, repository.id) as Array<{ thread_id: number; cluster_id: number }>; - for (const row of clusterRows) clusterIds.set(row.thread_id, row.cluster_id); - - const rows = this.db - .prepare( - `select * - from threads - where repo_id = ? and lower(author_login) = lower(?) - ${params.includeClosed ? '' : "and state = 'open' and closed_at_local is null"} - order by updated_at_gh desc, number desc`, - ) - .all(repository.id, normalizedLogin) as ThreadRow[]; - - const latestRun = this.getLatestClusterRun(repository.id); - const strongestByThread = new Map['threads'][number]['strongestSameAuthorMatch']>>(); - if (latestRun && rows.length > 1) { - const edges = this.db - .prepare( - `select - se.left_thread_id, - se.right_thread_id, - se.score, - t1.number as left_number, - t1.kind as left_kind, - t1.title as left_title, - t2.number as right_number, - t2.kind as right_kind, - t2.title as right_title - from similarity_edges se - join threads t1 on t1.id = se.left_thread_id - join threads t2 on t2.id = se.right_thread_id - where se.repo_id = ? - and se.cluster_run_id = ? - and lower(t1.author_login) = lower(?) - and lower(t2.author_login) = lower(?) - ${params.includeClosed ? '' : "and t1.state = 'open' and t1.closed_at_local is null and t2.state = 'open' and t2.closed_at_local is null"}`, - ) - .all(repository.id, latestRun.id, normalizedLogin, normalizedLogin) as Array<{ - left_thread_id: number; - right_thread_id: number; - score: number; - left_number: number; - left_kind: 'issue' | 'pull_request'; - left_title: string; - right_number: number; - right_kind: 'issue' | 'pull_request'; - right_title: string; - }>; - - const updateStrongest = ( - sourceThreadId: number, - match: { threadId: number; number: number; kind: 'issue' | 'pull_request'; title: string; score: number }, - ): void => { - const previous = strongestByThread.get(sourceThreadId); - if (!previous || match.score > previous.score) { - strongestByThread.set(sourceThreadId, match); - } - }; - - for (const edge of edges) { - updateStrongest(edge.left_thread_id, { - threadId: edge.right_thread_id, - number: edge.right_number, - kind: edge.right_kind, - title: edge.right_title, - score: edge.score, - }); - updateStrongest(edge.right_thread_id, { - threadId: edge.left_thread_id, - number: edge.left_number, - kind: edge.left_kind, - title: edge.left_title, - score: edge.score, - }); - } - } - - return authorThreadsResponseSchema.parse({ - repository, - authorLogin: normalizedLogin, - threads: rows.map((row) => ({ - thread: threadToDto(row, clusterIds.get(row.id) ?? null), - strongestSameAuthorMatch: strongestByThread.get(row.id) ?? null, - })), - }); - } - - getAuthor(params: { owner: string; repo: string; login: string; includeClosed?: boolean }): AuthorResponse { - const repository = this.requireRepository(params.owner, params.repo); - const normalizedLogin = params.login.trim(); - if (!normalizedLogin) { - return authorResponseSchema.parse({ - repository, - authorLogin: '', - actor: null, - stats: emptyAuthorStats(), - threads: [], - }); - } - - const threads = this.listAuthorThreads(params).threads; - const actorRow = this.db - .prepare( - `select - a.id, - a.provider, - a.provider_user_id, - a.login, - a.display_name, - a.actor_type, - a.site_admin, - a.first_seen_at, - a.last_seen_at, - a.updated_at, - s.opened_issues, - s.opened_prs, - s.comments, - s.merged_prs, - s.closed_threads, - s.first_activity_at, - s.last_activity_at, - s.trust_tier - from actors a - left join actor_repo_stats s on s.actor_id = a.id and s.repo_id = ? - where lower(a.login) = lower(?) - order by s.last_activity_at desc nulls last, a.last_seen_at desc - limit 1`, - ) - .get(repository.id, normalizedLogin) as - | { - id: number; - provider: string; - provider_user_id: string; - login: string; - display_name: string | null; - actor_type: string | null; - site_admin: number; - first_seen_at: string; - last_seen_at: string; - updated_at: string; - opened_issues: number | null; - opened_prs: number | null; - comments: number | null; - merged_prs: number | null; - closed_threads: number | null; - first_activity_at: string | null; - last_activity_at: string | null; - trust_tier: string | null; - } - | undefined; - const fallbackStats = this.computeAuthorStats(repository.id, normalizedLogin); - - return authorResponseSchema.parse({ - repository, - authorLogin: actorRow?.login ?? normalizedLogin, - actor: actorRow - ? { - id: actorRow.id, - provider: actorRow.provider, - providerUserId: actorRow.provider_user_id, - login: actorRow.login, - displayName: actorRow.display_name, - actorType: actorRow.actor_type, - siteAdmin: actorRow.site_admin === 1, - firstSeenAt: actorRow.first_seen_at, - lastSeenAt: actorRow.last_seen_at, - updatedAt: actorRow.updated_at, - } - : null, - stats: { - openedIssueCount: actorRow?.opened_issues ?? fallbackStats.openedIssueCount, - openedPullRequestCount: actorRow?.opened_prs ?? fallbackStats.openedPullRequestCount, - commentCount: actorRow?.comments ?? fallbackStats.commentCount, - mergedPullRequestCount: actorRow?.merged_prs ?? fallbackStats.mergedPullRequestCount, - closedThreadCount: actorRow?.closed_threads ?? fallbackStats.closedThreadCount, - firstActivityAt: actorRow?.first_activity_at ?? fallbackStats.firstActivityAt, - lastActivityAt: actorRow?.last_activity_at ?? fallbackStats.lastActivityAt, - trustTier: actorRow?.trust_tier ?? fallbackStats.trustTier, - }, - threads, - }); - } - - private computeAuthorStats(repoId: number, login: string): ReturnType { - const row = this.db - .prepare( - `select - (select count(*) from threads where repo_id = ? and kind = 'issue' and lower(author_login) = lower(?)) as opened_issues, - (select count(*) from threads where repo_id = ? and kind = 'pull_request' and lower(author_login) = lower(?)) as opened_prs, - (select count(*) from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?)) as comments, - (select count(*) from threads where repo_id = ? and kind = 'pull_request' and merged_at_gh is not null and lower(author_login) = lower(?)) as merged_prs, - (select count(*) from threads where repo_id = ? and closed_at_gh is not null and lower(author_login) = lower(?)) as closed_threads, - (select min(activity_at) - from ( - select created_at_gh as activity_at from threads where repo_id = ? and lower(author_login) = lower(?) - union all - select c.created_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?) - ) - where activity_at is not null) as first_activity_at, - (select max(activity_at) - from ( - select updated_at_gh as activity_at from threads where repo_id = ? and lower(author_login) = lower(?) - union all - select c.updated_at_gh as activity_at from comments c join threads t on t.id = c.thread_id where t.repo_id = ? and lower(c.author_login) = lower(?) - ) - where activity_at is not null) as last_activity_at`, - ) - .get( - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - repoId, - login, - ) as { - opened_issues: number; - opened_prs: number; - comments: number; - merged_prs: number; - closed_threads: number; - first_activity_at: string | null; - last_activity_at: string | null; - }; - - return { - openedIssueCount: row.opened_issues, - openedPullRequestCount: row.opened_prs, - commentCount: row.comments, - mergedPullRequestCount: row.merged_prs, - closedThreadCount: row.closed_threads, - firstActivityAt: row.first_activity_at, - lastActivityAt: row.last_activity_at, - trustTier: row.opened_issues + row.opened_prs >= 3 ? 'repeat_contributor' : null, - }; - } - closeThreadLocally(params: { owner: string; repo: string; threadNumber: number }): CloseResponse { const repository = this.requireRepository(params.owner, params.repo); const row = this.db @@ -1820,8 +1500,6 @@ export class GHCrawlService { lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt, }; this.writeSyncCursorState(repoId, nextSyncCursor); - refreshActorRepoStats(this.db, repoId); - this.finishRun('sync_runs', runId, 'completed', { threadsSynced, commentsSynced, @@ -4326,7 +4004,6 @@ export class GHCrawlService { const issueComments = await github.listIssueComments(owner, repo, number, reporter); comments.push( ...issueComments.map((comment) => { - this.upsertActorFromPayload(comment); const authorLogin = userLogin(comment); const authorType = userType(comment); return { @@ -4347,7 +4024,6 @@ export class GHCrawlService { const reviews = await github.listPullReviews(owner, repo, number, reporter); comments.push( ...reviews.map((review) => { - this.upsertActorFromPayload(review); const authorLogin = userLogin(review); const authorType = userType(review); return { @@ -4367,7 +4043,6 @@ export class GHCrawlService { const reviewComments = await github.listPullReviewComments(owner, repo, number, reporter); comments.push( ...reviewComments.map((comment) => { - this.upsertActorFromPayload(comment); const authorLogin = userLogin(comment); const authorType = userType(comment); return { @@ -4427,21 +4102,6 @@ export class GHCrawlService { return row.id; } - private upsertActorFromPayload(payload: Record): number | null { - const user = payload.user as Record | undefined; - const login = userLogin(payload); - if (!user || !login) return null; - const providerUserId = user.id === undefined || user.id === null ? login : String(user.id); - return upsertActor(this.db, { - providerUserId, - login, - displayName: typeof user.name === 'string' ? user.name : null, - actorType: userType(payload), - siteAdmin: user.site_admin === true, - rawJson: asJson(user), - }); - } - private upsertThread( repoId: number, kind: 'issue' | 'pull_request', @@ -4453,7 +4113,6 @@ export class GHCrawlService { const labels = parseLabels(payload); const assignees = parseAssignees(payload); const contentHash = stableContentHash(`${title}\n${body ?? ''}`); - this.upsertActorFromPayload(payload); this.db .prepare( `insert into threads ( diff --git a/scripts/op-run.mjs b/scripts/op-run.mjs deleted file mode 100644 index 6430200..0000000 --- a/scripts/op-run.mjs +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env node -import fs from 'node:fs'; -import os from 'node:os'; -import path from 'node:path'; -import { execFileSync, spawn } from 'node:child_process'; - -function resolveHomeDirectory(env) { - return path.resolve(env.HOME ?? env.USERPROFILE ?? os.homedir()); -} - -function getConfigPath(env = process.env) { - if (env.XDG_CONFIG_HOME) { - return path.resolve(env.XDG_CONFIG_HOME, 'ghcrawl', 'config.json'); - } - if (process.platform === 'win32' && env.APPDATA) { - return path.resolve(env.APPDATA, 'ghcrawl', 'config.json'); - } - return path.join(resolveHomeDirectory(env), '.config', 'ghcrawl', 'config.json'); -} - -function readConfig(configPath) { - if (!fs.existsSync(configPath)) { - throw new Error(`Missing ghcrawl config at ${configPath}. Run pnpm bootstrap first.`); - } - return JSON.parse(fs.readFileSync(configPath, 'utf8')); -} - -function requireOpConfig(config, configPath) { - if (config.secretProvider !== 'op') { - throw new Error( - `ghcrawl is not configured for 1Password CLI in ${configPath}. Re-run pnpm bootstrap and choose the 1Password CLI option.`, - ); - } - if (!config.opVaultName || !config.opItemName) { - throw new Error(`Missing opVaultName/opItemName in ${configPath}. Re-run pnpm bootstrap.`); - } - return { - vaultName: config.opVaultName, - itemName: config.opItemName, - }; -} - -function readSecret(reference) { - return execFileSync('op', ['read', reference], { - encoding: 'utf8', - stdio: ['ignore', 'pipe', 'inherit'], - }).trim(); -} - -function loadOpEnv(env = process.env) { - const configPath = getConfigPath(env); - const config = readConfig(configPath); - const { vaultName, itemName } = requireOpConfig(config, configPath); - return { - ...env, - GITHUB_TOKEN: readSecret(`op://${vaultName}/${itemName}/GITHUB_TOKEN`), - OPENAI_API_KEY: readSecret(`op://${vaultName}/${itemName}/OPENAI_API_KEY`), - }; -} - -function runWithEnv(command, args, env = process.env) { - const child = spawn(command, args, { - stdio: 'inherit', - env: loadOpEnv(env), - shell: false, - }); - child.on('exit', (code, signal) => { - if (signal) { - process.kill(process.pid, signal); - return; - } - process.exit(code ?? 0); - }); -} - -function runShell(env = process.env) { - const shell = - process.platform === 'win32' - ? env.ComSpec ?? 'cmd.exe' - : env.SHELL ?? '/bin/zsh'; - runWithEnv(shell, [], env); -} - -function main(argv = process.argv.slice(2)) { - const [mode, ...rest] = argv; - if (!mode || mode === '--help' || mode === '-h') { - process.stdout.write( - [ - 'Usage:', - ' node scripts/op-run.mjs exec -- ', - ' node scripts/op-run.mjs shell', - '', - 'Examples:', - ' pnpm op:doctor', - ' pnpm op:tui', - ' pnpm op:exec -- sync openclaw/openclaw', - ' pnpm op:shell', - '', - ].join('\n'), - ); - return; - } - - if (mode === 'shell') { - runShell(); - return; - } - - if (mode === 'exec') { - const args = rest[0] === '--' ? rest.slice(1) : rest; - if (args.length === 0) { - throw new Error('Missing ghcrawl arguments. Example: pnpm op:exec -- doctor'); - } - runWithEnv('pnpm', ['--filter', 'ghcrawl', 'cli', ...args]); - return; - } - - if (mode === 'run') { - const args = rest[0] === '--' ? rest.slice(1) : rest; - if (args.length === 0) { - throw new Error('Missing command. Example: node scripts/op-run.mjs run -- node scripts/my-script.mjs'); - } - runWithEnv(args[0], args.slice(1)); - return; - } - - throw new Error(`Unknown mode: ${mode}`); -} - -try { - main(); -} catch (error) { - process.stderr.write(`${error instanceof Error ? error.message : String(error)}\n`); - process.exit(1); -} diff --git a/scripts/run-all-prompt-experiments.mjs b/scripts/run-all-prompt-experiments.mjs index a532f5a..471ff12 100644 --- a/scripts/run-all-prompt-experiments.mjs +++ b/scripts/run-all-prompt-experiments.mjs @@ -1,7 +1,7 @@ #!/usr/bin/env node /** * Run all prompt experiments sequentially. - * Usage: node scripts/op-run.mjs run -- node scripts/run-all-prompt-experiments.mjs + * Usage: node scripts/run-all-prompt-experiments.mjs */ import fs from 'node:fs'; import path from 'node:path'; diff --git a/scripts/run-cluster-experiments.mjs b/scripts/run-cluster-experiments.mjs index 9bbf8c2..bbf9a24 100644 --- a/scripts/run-cluster-experiments.mjs +++ b/scripts/run-cluster-experiments.mjs @@ -1,7 +1,7 @@ #!/usr/bin/env node /** * Run all clustering experiments sequentially. - * Usage: node scripts/op-run.mjs run -- node scripts/run-cluster-experiments.mjs + * Usage: node scripts/run-cluster-experiments.mjs */ import fs from 'node:fs'; import path from 'node:path'; diff --git a/scripts/summarize-single.mjs b/scripts/summarize-single.mjs index 4f94bbc..e20aaf2 100644 --- a/scripts/summarize-single.mjs +++ b/scripts/summarize-single.mjs @@ -6,7 +6,7 @@ * node scripts/summarize-single.mjs [--prompt-file ] * node scripts/summarize-single.mjs [--prompt ""] * - * Requires OPENAI_API_KEY in environment (use pnpm op:shell or op:exec). + * Requires OPENAI_API_KEY in the environment. */ import fs from 'node:fs'; import path from 'node:path'; @@ -89,7 +89,7 @@ try { // Call OpenAI directly with optional prompt override const apiKey = process.env.OPENAI_API_KEY; if (!apiKey) { - throw new Error('OPENAI_API_KEY not set. Use pnpm op:shell or set the env var.'); + throw new Error('OPENAI_API_KEY not set. Set the env var before running this script.'); } const { default: OpenAI } = await import('openai'); From acbe950727d32aada2aaa62ac51cf3f8dbe236d5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 00:03:57 -0700 Subject: [PATCH 065/215] docs: document bare token workflow --- CONTRIBUTING.md | 10 --- README.md | 68 ++++----------- SPEC.md | 9 +- apps/cli/README.md | 117 ++++++++++++++------------ docs/DESIGN.md | 7 +- skills/ghcrawl/SKILL.md | 5 -- skills/ghcrawl/references/protocol.md | 32 ++----- 7 files changed, 91 insertions(+), 157 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9773617..f692305 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,7 +6,6 @@ This file is for maintainers and contributors working from source. ```bash pnpm install -pnpm bootstrap pnpm health ``` @@ -23,15 +22,6 @@ pnpm typecheck pnpm test ``` -If you configured 1Password CLI support in init: - -```bash -pnpm op:doctor -pnpm op:tui -pnpm op:exec -- sync openclaw/openclaw -pnpm op:shell -``` - ## Release Flow This repo uses tag-driven releases from the GitHub Releases UI. diff --git a/README.md b/README.md index ff8e6ea..d4b541f 100644 --- a/README.md +++ b/README.md @@ -23,27 +23,23 @@ If you are working from source or maintaining the repo, use [CONTRIBUTING.md](./ ## Requirements -Normal `ghcrawl` use needs both: +Normal `ghcrawl` crawl use needs: - a GitHub personal access token -- an OpenAI API key -GitHub is required to crawl issue and PR data. OpenAI is required for embeddings and the maintainer clustering and search workflow. If you already have a populated local DB you can still browse it without live keys, but a fresh `sync` + `embed` + `cluster` or `refresh` run needs both. +OpenAI is optional and only needed when you run summary or embedding workflows. Deterministic sync, fingerprinting, and clustering can run without it. ## Quick Start ```bash -ghcrawl init +export GITHUB_TOKEN=github_pat_... ghcrawl configure ghcrawl doctor ghcrawl refresh owner/repo ghcrawl tui owner/repo ``` -`ghcrawl init` runs the setup wizard. It can either: - -- save plaintext keys in `~/.config/ghcrawl/config.json` -- or guide you through a 1Password CLI (`op`) setup that keeps keys out of the config file +`ghcrawl` reads bare tokens from environment variables, `.env.local`, or `~/.config/ghcrawl/config.json`. No setup wizard or external secret provider is required. `ghcrawl refresh owner/repo` is the main pipeline command. It pulls the latest open GitHub issues and pull requests, summarizes changed items only when the active embedding basis depends on summaries, refreshes vectors, and rebuilds the clusters you browse in the TUI. @@ -120,10 +116,10 @@ ghcrawl refresh owner/repo ### TUI Screenshots -| User open issue/PR list modal | Refresh modal | +| Issue/PR list modal | Refresh modal | | --- | --- | | ![User open issue and PR list modal](./docs/images/ghcrawl-tui-user-modal.png) | ![GitHub, embed, and cluster refresh modal](./docs/images/ghcrawl-tui-refresh-modal.png) | -| Press `u` to open the current user's issue and PR list modal. | Press `g` to open the GitHub/embed/cluster refresh modal. | +| Browse open issue and PR records from local SQLite. | Press `g` to open the GitHub/embed/cluster refresh modal. | | Closed members in a cluster | Fully closed cluster | | --- | --- | @@ -149,24 +145,20 @@ ghcrawl cluster owner/repo # rebuild local related-work clusters from the curre Run them in that order. If your embedding basis is `title_summary`, `refresh` automatically inserts the summarize stage before embed for you. With the default `title_original` basis, `refresh` does not summarize unless you run `summarize` explicitly. -## Init And Doctor +## Tokens And Doctor First run: ```bash -ghcrawl init +export GITHUB_TOKEN=github_pat_... ghcrawl doctor ``` -`init` behavior: +Token loading order: -- prompts you to choose one of two secret-storage modes: - - `plaintext`: saves both keys to `~/.config/ghcrawl/config.json` - - `1Password CLI`: stores only vault and item metadata and tells you how to run `ghcrawl` through `op` -- if you choose plaintext storage, init warns that anyone who can read that file can use your keys and that resulting API charges are your responsibility -- if you choose 1Password CLI mode, init tells you to create a Secure Note with concealed fields named: - - `GITHUB_TOKEN` - - `OPENAI_API_KEY` +- environment variables: `GITHUB_TOKEN`, `OPENAI_API_KEY` +- workspace `.env.local` +- user config: `~/.config/ghcrawl/config.json` GitHub token guidance: @@ -181,10 +173,9 @@ GitHub token guidance: - config file presence and path - local DB path wiring -- GitHub token presence, token-shape validation, and a live auth smoke check -- OpenAI key presence, key-shape validation, and a live auth smoke check +- GitHub token presence +- OpenAI key presence for optional summary and embedding commands - `vectorlite` runtime readiness -- if init is configured for 1Password CLI but you forgot to run through your `op` wrapper, doctor tells you that explicitly ## Configure @@ -206,31 +197,6 @@ Changing the summary model or embedding basis makes the next `refresh` rebuild v If you opt into `title_summary`, ghcrawl summarizes before embedding and uses `title + dedupe summary` as the active vector text. On `openclaw/openclaw`, that improved non-solo cluster membership by about 50% versus `title_original`, but it adds OpenAI spend. A first summarize of roughly `18k` open issues and PRs in that repo typically costs about `$15-$30` with `gpt-5-mini`; later refreshes are usually much cheaper because only changed items need summaries. -### 1Password CLI Example - -If you choose 1Password CLI mode, create a 1Password Secure Note with concealed fields named exactly: - -- `GITHUB_TOKEN` -- `OPENAI_API_KEY` - -Then add this wrapper to `~/.zshrc`: - -```bash -ghcrawl-op() { - env GITHUB_TOKEN="$(op read 'op://Private/ghcrawl/GITHUB_TOKEN')" \ - OPENAI_API_KEY="$(op read 'op://Private/ghcrawl/OPENAI_API_KEY')" \ - ghcrawl "$@" -} -``` - -Then use: - -```bash -ghcrawl-op doctor -ghcrawl-op refresh owner/repo -ghcrawl-op tui owner/repo -``` - ## Using The CLI To Extract JSON Data These commands are intended more for scripts, bots, and agent integrations than for normal day-to-day terminal browsing: @@ -238,7 +204,6 @@ These commands are intended more for scripts, bots, and agent integrations than ```bash ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json -ghcrawl author owner/repo --login lqquan --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --json @@ -252,8 +217,6 @@ ghcrawl search owner/repo --query "download stalls" --json Use `threads --numbers ...` when you want several specific issue or PR records in one CLI call instead of paying process startup overhead repeatedly. -Use `author --login ...` when you want all currently open issue/PR records from one user plus the strongest stored same-author similarity match for each item. - By default, JSON list commands filter out locally closed issues/PRs and completely closed clusters. Use `--include-closed` when you need to inspect those records too. Use `close-thread` when you know a local issue/PR should be treated as closed before the next GitHub sync catches up. If that was the last open item in its cluster, `ghcrawl` automatically marks the cluster closed too. @@ -312,7 +275,7 @@ npx skills add -g pwrdrvr/ghcrawl The skill is built around the stable JSON CLI surface and is intentionally conservative: - default mode assumes no valid API keys and stays read-only -- API-backed operations only become available after `ghcrawl doctor --json` shows healthy auth +- API-backed operations only need the relevant bare token in env, `.env.local`, or config JSON - even then, `refresh`, `sync`, `embed`, and `cluster` should only run when the user explicitly asks for them - JSON list commands hide locally closed issues/PRs and closed clusters by default unless `--include-closed` is passed @@ -321,7 +284,6 @@ ghcrawl doctor --json ghcrawl refresh owner/repo ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl author owner/repo --login lqquan --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json diff --git a/SPEC.md b/SPEC.md index e1f52cd..e08e79e 100644 --- a/SPEC.md +++ b/SPEC.md @@ -52,9 +52,7 @@ These are settled unless the user explicitly changes them: - sync default: metadata-only - comment hydration: opt-in - kNN strategy: exact local cosine search first -- secret modes: - - plaintext config storage - - 1Password CLI metadata + env injection +- token input: read bare `GITHUB_TOKEN` / `OPENAI_API_KEY` from env, `.env.local`, or config JSON ## Local Environment Contract @@ -127,7 +125,6 @@ The product must keep these machine-facing surfaces working: - `ghcrawl refresh owner/repo --json` - `ghcrawl runs owner/repo --json` - `ghcrawl threads owner/repo --numbers --json` -- `ghcrawl author owner/repo --login --json` for actor identity, repo stats, and authored threads - `ghcrawl close-thread owner/repo --number --json` - `ghcrawl close-cluster owner/repo --id --json` - `ghcrawl embed owner/repo --json` @@ -151,8 +148,6 @@ The product must keep these machine-facing surfaces working: - `GET /repositories` - `GET /runs` - `GET /threads` -- `GET /author` -- `GET /author-threads` - `GET /search` - `GET /neighbors` - `GET /clusters` @@ -206,7 +201,7 @@ The installable skill lives in: - keep DB-backed operational state in SQLite, not in config - keep user preferences in config -- keep secret values out of repo files +- keep token values out of repo files - default to stable machine-readable interfaces before adding new UI affordances - prefer exact local search until there is measured evidence that a separate vector service is required diff --git a/apps/cli/README.md b/apps/cli/README.md index 80dc101..15dc0ac 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -25,29 +25,46 @@ If you are working from source or maintaining the repo, use [CONTRIBUTING.md](ht ## Requirements -Normal `ghcrawl` use needs both: +Normal `ghcrawl` crawl use needs: - a GitHub personal access token -- an OpenAI API key -GitHub is required to crawl issue and PR data. OpenAI is required for embeddings and the maintainer clustering and search workflow. If you already have a populated local DB you can still browse it without live keys, but a fresh `sync` + `embed` + `cluster` or `refresh` run needs both. +OpenAI is optional and only needed when you run summary or embedding workflows. Deterministic sync, fingerprinting, and clustering can run without it. ## Quick Start ```bash -ghcrawl init +export GITHUB_TOKEN=github_pat_... ghcrawl configure ghcrawl doctor ghcrawl refresh owner/repo ghcrawl tui owner/repo ``` -`ghcrawl init` runs the setup wizard. It can either: +`ghcrawl` reads bare tokens from environment variables, `.env.local`, or `~/.config/ghcrawl/config.json`. No setup wizard or external secret provider is required. -- save plaintext keys in `~/.config/ghcrawl/config.json` -- or guide you through a 1Password CLI (`op`) setup that keeps keys out of the config file +`ghcrawl refresh owner/repo` is the main pipeline command. It pulls the latest open GitHub issues and pull requests, summarizes changed items only when the active embedding basis depends on summaries, refreshes vectors, and rebuilds the clusters you browse in the TUI. -`ghcrawl refresh owner/repo` is the main pipeline command. It pulls the latest open GitHub issues and pull requests, summarizes changed items when the active embedding basis depends on summaries, refreshes vectors, and rebuilds the clusters you browse in the TUI. +## One-Time Migration + +Upgrading to this release changes the local vector and cluster pipeline: + +- vectors now use a persistent `vectorlite` sidecar index +- the active vector is one vector per open thread +- old multi-row `document_embeddings` are removed after the first successful rebuild + +For an existing repo, the one-time migration command is: + +```bash +ghcrawl refresh owner/repo +``` + +Important notes: + +- `refresh` performs the migration; plain `sync` does not +- with the default `title_original` basis, the migration rebuilds vectors and clusters without running LLM summaries +- if you switch to `title_summary`, `refresh` also runs the summarize step before embedding +- after the first successful migration refresh, ghcrawl removes legacy embeddings, compacts the local DB, and rebuilds clusters from the current vectors ## Typical Commands @@ -101,10 +118,10 @@ ghcrawl refresh owner/repo ### TUI Screenshots -| User open issue/PR list modal | Refresh modal | +| Issue/PR list modal | Refresh modal | | --- | --- | | ![User open issue and PR list modal](https://raw.githubusercontent.com/pwrdrvr/ghcrawl/main/docs/images/ghcrawl-tui-user-modal.png) | ![GitHub, embed, and cluster refresh modal](https://raw.githubusercontent.com/pwrdrvr/ghcrawl/main/docs/images/ghcrawl-tui-refresh-modal.png) | -| Press `u` to open the current user's issue and PR list modal. | Press `g` to open the GitHub/embed/cluster refresh modal. | +| Browse open issue and PR records from local SQLite. | Press `g` to open the GitHub/embed/cluster refresh modal. | | Closed members in a cluster | Fully closed cluster | | --- | --- | @@ -128,26 +145,22 @@ ghcrawl embed owner/repo # generate or refresh the single active vector per t ghcrawl cluster owner/repo # rebuild local related-work clusters from the current vectors (local-only, but can take ~10 minutes on a ~12k issue/PR repo) ``` -Run them in that order. If your embedding basis is `title_summary`, `refresh` automatically inserts the summarize stage before embed for you. +Run them in that order. If your embedding basis is `title_summary`, `refresh` automatically inserts the summarize stage before embed for you. With the default `title_original` basis, `refresh` does not summarize unless you run `summarize` explicitly. -## Init And Doctor +## Tokens And Doctor First run: ```bash -ghcrawl init +export GITHUB_TOKEN=github_pat_... ghcrawl doctor ``` -`init` behavior: +Token loading order: -- prompts you to choose one of two secret-storage modes: - - `plaintext`: saves both keys to `~/.config/ghcrawl/config.json` - - `1Password CLI`: stores only vault and item metadata and tells you how to run `ghcrawl` through `op` -- if you choose plaintext storage, init warns that anyone who can read that file can use your keys and that resulting API charges are your responsibility -- if you choose 1Password CLI mode, init tells you to create a Secure Note with concealed fields named: - - `GITHUB_TOKEN` - - `OPENAI_API_KEY` +- environment variables: `GITHUB_TOKEN`, `OPENAI_API_KEY` +- workspace `.env.local` +- user config: `~/.config/ghcrawl/config.json` GitHub token guidance: @@ -162,10 +175,9 @@ GitHub token guidance: - config file presence and path - local DB path wiring -- GitHub token presence, token-shape validation, and a live auth smoke check -- OpenAI key presence, key-shape validation, and a live auth smoke check +- GitHub token presence +- OpenAI key presence for optional summary and embedding commands - `vectorlite` runtime readiness -- if init is configured for 1Password CLI but you forgot to run through your `op` wrapper, doctor tells you that explicitly ## Configure @@ -180,35 +192,12 @@ ghcrawl configure --embedding-basis title_original Current defaults: - summary model: `gpt-5-mini` -- embedding basis: `title_summary` (`title + dedupe summary`) +- embedding basis: `title_original` (`title + original body`) - vector backend: `vectorlite` Changing the summary model or embedding basis makes the next `refresh` rebuild vectors and clusters for that repo. -### 1Password CLI Example - -If you choose 1Password CLI mode, create a 1Password Secure Note with concealed fields named exactly: - -- `GITHUB_TOKEN` -- `OPENAI_API_KEY` - -Then add this wrapper to `~/.zshrc`: - -```bash -ghcrawl-op() { - env GITHUB_TOKEN="$(op read 'op://Private/ghcrawl/GITHUB_TOKEN')" \ - OPENAI_API_KEY="$(op read 'op://Private/ghcrawl/OPENAI_API_KEY')" \ - ghcrawl "$@" -} -``` - -Then use: - -```bash -ghcrawl-op doctor -ghcrawl-op refresh owner/repo -ghcrawl-op tui owner/repo -``` +If you opt into `title_summary`, ghcrawl summarizes before embedding and uses `title + dedupe summary` as the active vector text. On `openclaw/openclaw`, that improved non-solo cluster membership by about 50% versus `title_original`, but it adds OpenAI spend. A first summarize of roughly `18k` open issues and PRs in that repo typically costs about `$15-$30` with `gpt-5-mini`; later refreshes are usually much cheaper because only changed items need summaries. ## Using The CLI To Extract JSON Data @@ -217,26 +206,44 @@ These commands are intended more for scripts, bots, and agent integrations than ```bash ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json -ghcrawl author owner/repo --login lqquan --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --include-closed --json +ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json ghcrawl cluster-detail owner/repo --id 123 --include-closed --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl search owner/repo --query "download stalls" --json ``` Use `threads --numbers ...` when you want several specific issue or PR records in one CLI call instead of paying process startup overhead repeatedly. -Use `author --login ...` when you want all currently open issue/PR records from one user plus the strongest stored same-author similarity match for each item. - By default, JSON list commands filter out locally closed issues/PRs and completely closed clusters. Use `--include-closed` when you need to inspect those records too. Use `close-thread` when you know a local issue/PR should be treated as closed before the next GitHub sync catches up. If that was the last open item in its cluster, `ghcrawl` automatically marks the cluster closed too. Use `close-cluster` when you want to locally suppress a whole cluster from default JSON exploration without waiting for a rebuild. +## Durable Cluster Governance + +The durable cluster commands operate on stable cluster identities, not one-off run snapshots: + +```bash +ghcrawl durable-clusters owner/repo --member-limit 10 --json +ghcrawl cluster-explain owner/repo --id 123 --json +ghcrawl exclude-cluster-member owner/repo --id 123 --number 42 --reason "false positive" --json +ghcrawl include-cluster-member owner/repo --id 123 --number 42 --reason "same root cause" --json +ghcrawl set-cluster-canonical owner/repo --id 123 --number 42 --reason "best root issue" --json +ghcrawl merge-clusters owner/repo --source 123 --target 456 --reason "same incident" --json +ghcrawl split-cluster owner/repo --source 123 --numbers 42,43 --reason "separate root cause" --json +ghcrawl cluster owner/repo --number 42 --json +``` + +Use `cluster-explain` when you need to answer why a durable cluster exists. It returns the stable slug, aliases, governed memberships, overrides, event history, and pairwise evidence sources such as deterministic fingerprints, hunk overlap, and vector-backed edges. + +Maintainer overrides are sticky. If you exclude a thread from a durable cluster, future clustering records that decision and will not silently re-add it to the same cluster. `cluster --number` refreshes only one durable neighborhood, which is the cheaper path after a small sync or a manual governance edit. + ## Cost To Operate The main variable costs are summarization and embeddings. Embedding pricing is published by OpenAI here: [OpenAI API pricing](https://developers.openai.com/api/docs/pricing#embeddings). @@ -270,16 +277,18 @@ npx skills add -g pwrdrvr/ghcrawl The skill is built around the stable JSON CLI surface and is intentionally conservative: - default mode assumes no valid API keys and stays read-only -- API-backed operations only become available after `ghcrawl doctor --json` shows healthy auth +- API-backed operations only need the relevant bare token in env, `.env.local`, or config JSON - even then, `refresh`, `sync`, `embed`, and `cluster` should only run when the user explicitly asks for them - JSON list commands hide locally closed issues/PRs and closed clusters by default unless `--include-closed` is passed ```bash ghcrawl doctor --json ghcrawl refresh owner/repo +ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ``` ### Video Walkthrough @@ -300,7 +309,7 @@ The agent and build contract for this repo lives in [SPEC.md](https://github.com - `embed` defaults to `text-embedding-3-large` with `dimensions=1024` - `embed` maintains one active vector per thread, stored in a persistent `vectorlite` sidecar index - `embed` stores an input hash per thread and will not resubmit unchanged text for re-embedding -- the default embedding basis is `title + dedupe summary`; use `ghcrawl configure` to switch to `title + original body` +- the default embedding basis is `title + original body`; use `ghcrawl configure --embedding-basis title_summary` if you want to summarize before embedding - `sync --since` accepts ISO timestamps and relative durations like `15m`, `2h`, `7d`, and `1mo` - `sync --limit ` is the best smoke-test path on a busy repository - `tui` remembers sort order and min cluster size per repository in the persisted config file diff --git a/docs/DESIGN.md b/docs/DESIGN.md index c910d55..e0ab9f8 100644 --- a/docs/DESIGN.md +++ b/docs/DESIGN.md @@ -12,7 +12,7 @@ Use `discrawl` as the main product pattern: - local-first - deterministic CLI entry points -- explicit `init` / `doctor` / `sync` style commands +- explicit `doctor` / `sync` style commands - SQLite as the canonical local store - optional higher-level search on top of the local store @@ -96,7 +96,7 @@ Reasoning: Primary interface should feel like `discrawl`: ```bash -ghcrawl init +export GITHUB_TOKEN=github_pat_... ghcrawl doctor ghcrawl sync --owner openclaw --repo openclaw ghcrawl summarize --since 30d @@ -108,8 +108,7 @@ ghcrawl serve Recommended initial commands: -- `init`: write config and local paths -- `doctor`: verify env, GitHub auth, OpenAI auth, DB, and optional OpenSearch reachability +- `doctor`: verify token presence, DB, and vector runtime readiness - `sync`: fetch repository data into SQLite - `summarize`: generate or refresh thread summaries - `embed`: generate embeddings for summary documents diff --git a/skills/ghcrawl/SKILL.md b/skills/ghcrawl/SKILL.md index e4d5537..cdfe0bd 100644 --- a/skills/ghcrawl/SKILL.md +++ b/skills/ghcrawl/SKILL.md @@ -67,7 +67,6 @@ ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl author owner/repo --login lqquan --json ghcrawl search owner/repo --query "download stalls" --mode hybrid --json ghcrawl neighbors owner/repo --number 42 --limit 10 --json ghcrawl configure --json @@ -86,16 +85,12 @@ If the user explicitly wants to inspect those records, add `--include-closed`. Use `threads --numbers 12345` when you need to find the cluster for one specific issue/PR number. The returned thread record includes `clusterId`. If it is non-null, follow with `cluster-detail --id ` for snapshot details or `cluster-explain --id ` for durable evidence and governance. -Use `author --login ` when the user asks about a contributor or maintainer. It returns actor identity, repo-local activity stats, authored threads, and the strongest same-author similarity match for each thread. - Use `runs` when freshness, repeated failures, or background pipeline status matters. It returns recent sync, summary, embedding, and cluster runs with status, timestamps, stats, and errors. Use `configure --json` when you need to confirm the currently selected summary model or embedding basis before suggesting an expensive refresh. Use `threads --numbers ...` when you need a batch of specific issue/PR records. Do not pay the CLI startup cost 10 times for 10 separate single-thread lookups. -Use `author --login ...` when you need one author's open threads and their strongest stored same-author similarity matches in one call. - If the user explicitly asks to mark a local issue/PR or cluster closed, use: ```bash diff --git a/skills/ghcrawl/references/protocol.md b/skills/ghcrawl/references/protocol.md index a608d50..2580880 100644 --- a/skills/ghcrawl/references/protocol.md +++ b/skills/ghcrawl/references/protocol.md @@ -10,19 +10,20 @@ Do not start with `ghcrawl --help` or ` --help`. Use the command sur ### `ghcrawl doctor --json` -Health and auth smoke check. +Local setup and token-presence check. Use this only when needed. Treat the result as a gate: -- If GitHub/OpenAI auth is missing or unhealthy, stay read-only. -- If GitHub/OpenAI auth is healthy, API-backed commands are available, but still require explicit user direction. +- If the GitHub token is missing, stay read-only. +- If the GitHub token is present, API-backed GitHub commands are available, but still require explicit user direction. +- If the OpenAI key is missing, avoid summary and embedding commands. Do not call this automatically on every skill invocation. Use it when: - the user explicitly asked for API-backed work -- or a read-only request failed and local setup/auth may be the reason +- or a read-only request failed and local setup may be the reason -If the user asked only for read-only analysis, missing auth is not itself a blocker. Work from the existing local dataset through the CLI. +If the user asked only for read-only analysis, missing tokens are not themselves a blocker. Work from the existing local dataset through the CLI. ### `ghcrawl configure --json` @@ -59,22 +60,6 @@ Useful flags: - `--kind issue|pull_request` - `--include-closed` -### `ghcrawl author owner/repo --login --json` - -Read path for one local GitHub actor. - -Use this when you want to inspect a user's identity, repo-local activity stats, open authored items, and strongest stored same-author similarity match for each item. - -Useful flags: - -- `--include-closed` - -Returns: - -- `actor` -- `stats` -- `threads[]` - ### `ghcrawl refresh owner/repo` Runs the staged pipeline in fixed order: @@ -266,7 +251,6 @@ pnpm --filter ghcrawl cli runs owner/repo --limit 20 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 12345 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 42,43,44 --json pnpm --filter ghcrawl cli threads owner/repo --numbers 42,43,44 --include-closed --json -pnpm --filter ghcrawl cli author owner/repo --login lqquan --json pnpm --filter ghcrawl cli refresh owner/repo pnpm --filter ghcrawl cli clusters owner/repo --min-size 10 --limit 20 --sort recent --json pnpm --filter ghcrawl cli clusters owner/repo --min-size 10 --limit 20 --sort recent --include-closed --json @@ -282,7 +266,7 @@ If the supported CLI path still fails, hangs, or returns unusable output, stop a ## Suggested analysis flow -1. Start read-only with `clusters`, `cluster-detail`, `threads`, `author`, `runs`, `search`, or `neighbors` +1. Start read-only with `clusters`, `cluster-detail`, `threads`, `runs`, `search`, or `neighbors` 2. Only if API-backed work is needed or a read-only request failed, run `ghcrawl doctor --json` 3. If auth is unavailable, stay read-only 4. Only if doctor is healthy and the user explicitly asked, run `ghcrawl refresh owner/repo` @@ -290,4 +274,4 @@ If the supported CLI path still fails, hangs, or returns unusable output, stop a 6. `ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json` 7. `ghcrawl cluster-detail owner/repo --id --json` 8. `ghcrawl cluster-explain owner/repo --id --json` when evidence or governance matters -9. optionally `threads`, `author`, `search`, or `neighbors` with `--json` +9. optionally `threads`, `search`, or `neighbors` with `--json` From 7716f284f9da8192c0d4e39f131d6814e938950b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 00:45:59 -0700 Subject: [PATCH 066/215] fix(summary): continue after key summary failures --- packages/api-core/src/service.ts | 34 ++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 8ecda19..01af448 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1681,7 +1681,16 @@ export class GHCrawlService { threadNumber?: number; limit?: number; onProgress?: (message: string) => void; - }): Promise<{ runId: number; generated: number; skipped: number; inputTokens: number; outputTokens: number; totalTokens: number }> { + }): Promise<{ + runId: number; + generated: number; + skipped: number; + failed: number; + inputTokens: number; + outputTokens: number; + totalTokens: number; + errorSamples: Array<{ number: number; error: string }>; + }> { const ai = this.requireAi(); if (!ai.generateKeySummary) { throw new Error('Configured AI provider does not support key summary generation.'); @@ -1719,9 +1728,11 @@ export class GHCrawlService { let generated = 0; let skipped = 0; + let failed = 0; let inputTokens = 0; let outputTokens = 0; let totalTokens = 0; + const errorSamples: Array<{ number: number; error: string }> = []; for (const row of rows) { const labels = parseArray(row.labels_json); @@ -1756,10 +1767,21 @@ export class GHCrawlService { continue; } - const result = await ai.generateKeySummary({ - model: this.config.summaryModel, - text: [`title: ${row.title}`, `labels: ${labels.join(', ')}`, `body: ${row.body ?? ''}`].join('\n'), - }); + let result: Awaited>>; + try { + result = await ai.generateKeySummary({ + model: this.config.summaryModel, + text: [`title: ${row.title}`, `labels: ${labels.join(', ')}`, `body: ${row.body ?? ''}`].join('\n'), + }); + } catch (error) { + failed += 1; + const message = error instanceof Error ? error.message : String(error); + if (errorSamples.length < 10) { + errorSamples.push({ number: row.number, error: message }); + } + params.onProgress?.(`[key-summary] failed thread #${row.number}: ${message}`); + continue; + } upsertThreadKeySummary(this.db, { threadRevisionId: revisionId, summaryKind: 'llm_key_3line', @@ -1778,7 +1800,7 @@ export class GHCrawlService { params.onProgress?.(`[key-summary] generated ${generated}/${rows.length} thread #${row.number}`); } - const payload = { runId, generated, skipped, inputTokens, outputTokens, totalTokens }; + const payload = { runId, generated, skipped, failed, inputTokens, outputTokens, totalTokens, errorSamples }; this.finishRun('summary_runs', runId, 'completed', payload); return payload; } catch (error) { From 8a6b3115295a2fe5dce54b072f239f972659661a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:11:50 -0700 Subject: [PATCH 067/215] fix(tui): simplify local cluster browser --- apps/cli/src/tui/app.test.ts | 90 +--- apps/cli/src/tui/app.ts | 582 +------------------------- apps/cli/src/tui/state.test.ts | 3 +- apps/cli/src/tui/state.ts | 4 +- packages/api-core/src/config.test.ts | 2 +- packages/api-core/src/config.ts | 6 +- packages/api-core/src/service.test.ts | 7 +- packages/api-core/src/service.ts | 2 +- 8 files changed, 39 insertions(+), 657 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 3106103..10b287a 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -1,14 +1,10 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import type { TuiClusterDetail, TuiRepoStats, TuiThreadDetail } from '@ghcrawl/api-core'; +import type { TuiClusterDetail, TuiThreadDetail } from '@ghcrawl/api-core'; import { - buildRefreshCliArgs, buildHelpContent, - buildUpdatePipelineHelpContent, - buildUpdatePipelineLabels, - describeUpdateTask, escapeBlessedText, formatClusterDateColumn, getRepositoryChoices, @@ -131,59 +127,6 @@ test('getRepositoryChoices sorts by most recent update and includes the new-repo assert.equal(choices.at(-1)?.kind, 'new'); }); -test('describeUpdateTask reports stale embeddings relative to GitHub sync', () => { - const stats: TuiRepoStats = { - openIssueCount: 10, - openPullRequestCount: 5, - lastGithubReconciliationAt: '2026-03-09T14:00:00Z', - lastEmbedRefreshAt: '2026-03-09T12:00:00Z', - staleEmbedThreadCount: 0, - staleEmbedSourceCount: 0, - latestClusterRunId: 7, - latestClusterRunFinishedAt: '2026-03-09T14:30:00Z', - }; - - assert.equal(describeUpdateTask('embed', stats, new Date('2026-03-09T15:00:00Z')), 'outdated: GitHub is newer by 2h'); -}); - -test('describeUpdateTask reports stale clusters relative to embed refresh', () => { - const stats: TuiRepoStats = { - openIssueCount: 10, - openPullRequestCount: 5, - lastGithubReconciliationAt: '2026-03-09T14:00:00Z', - lastEmbedRefreshAt: '2026-03-09T15:00:00Z', - staleEmbedThreadCount: 0, - staleEmbedSourceCount: 0, - latestClusterRunId: 7, - latestClusterRunFinishedAt: '2026-03-09T12:00:00Z', - }; - - assert.equal(describeUpdateTask('cluster', stats, new Date('2026-03-09T16:00:00Z')), 'outdated: embeddings are newer by 3h'); -}); - -test('buildUpdatePipelineLabels marks the selected tasks and includes task guidance', () => { - const stats: TuiRepoStats = { - openIssueCount: 10, - openPullRequestCount: 5, - lastGithubReconciliationAt: '2026-03-09T14:00:00Z', - lastEmbedRefreshAt: '2026-03-09T15:00:00Z', - staleEmbedThreadCount: 2, - staleEmbedSourceCount: 4, - latestClusterRunId: 7, - latestClusterRunFinishedAt: '2026-03-09T12:00:00Z', - }; - - const labels = buildUpdatePipelineLabels( - stats, - { sync: true, embed: true, cluster: false }, - new Date('2026-03-09T16:00:00Z'), - ); - - assert.match(labels[0] ?? '', /^\[x\] GitHub sync\/reconcile up to date, last 2h ago$/); - assert.match(labels[1] ?? '', /^\[x\] Embed refresh outdated: 2 stale, last 1h ago$/); - assert.match(labels[2] ?? '', /^\[ \] Cluster rebuild outdated: embeddings are newer by 3h$/); -}); - test('buildHelpContent includes the full key command list', () => { const content = buildHelpContent(); @@ -191,7 +134,8 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /Left \/ Right\s+cycle focus backward or forward across panes/); assert.match(content, /Up \/ Down\s+move selection, or scroll detail when detail is focused/); assert.match(content, /#\s+jump directly to an issue or PR number/); - assert.match(content, /g\s+start the staged update pipeline in the background/); + assert.match(content, /TUI only reads local SQLite/); + assert.match(content, /default cluster filter is 1\+/); assert.match(content, /p\s+open the repository browser/); assert.match(content, /l\s+toggle wide layout/); assert.match(content, /x\s+show or hide locally closed clusters and members/); @@ -200,31 +144,3 @@ test('buildHelpContent includes the full key command list', () => { assert.doesNotMatch(content, /j \/ k/); assert.match(content, /This popup scrolls\./); }); - -test('buildUpdatePipelineHelpContent explains the LLM summary tradeoff for both modes', () => { - const disabled = buildUpdatePipelineHelpContent('title_original'); - assert.match(disabled, /LLM summaries: disabled/); - assert.match(disabled, /configure --embedding-basis title_summary/); - assert.match(disabled, /\$15-\$30/); - - const enabled = buildUpdatePipelineHelpContent('title_summary'); - assert.match(enabled, /LLM summaries: enabled/); - assert.match(enabled, /about 50%/); - - const keySummary = buildUpdatePipelineHelpContent('llm_key_summary'); - assert.match(keySummary, /3-line key summaries/); - assert.match(keySummary, /key-summaries/); -}); - -test('buildRefreshCliArgs maps the staged selection to refresh skip flags', () => { - assert.deepEqual(buildRefreshCliArgs({ owner: 'openclaw', repo: 'openclaw' }, { sync: true, embed: true, cluster: true }), [ - 'refresh', - 'openclaw/openclaw', - ]); - assert.deepEqual(buildRefreshCliArgs({ owner: 'openclaw', repo: 'openclaw' }, { sync: false, embed: true, cluster: false }), [ - 'refresh', - 'openclaw/openclaw', - '--no-sync', - '--no-cluster', - ]); -}); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 632cc55..b3ef2bf 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1,18 +1,11 @@ -import { spawn, type ChildProcessByStdio } from 'node:child_process'; -import { existsSync } from 'node:fs'; -import { createRequire } from 'node:module'; -import path from 'node:path'; -import type { Readable } from 'node:stream'; -import { fileURLToPath } from 'node:url'; +import { spawn } from 'node:child_process'; import blessed from 'neo-blessed'; import type { - EmbeddingBasis, GHCrawlService, TuiClusterDetail, TuiClusterSortMode, - TuiRepoStats, TuiSnapshot, TuiThreadDetail, TuiWideLayoutPreference, @@ -69,28 +62,6 @@ type ThreadDetailCacheEntry = { hasNeighbors: boolean; }; -type UpdateTaskSelection = { - sync: boolean; - embed: boolean; - cluster: boolean; -}; - -type BackgroundJobResult = { - code: number | null; - signal: NodeJS.Signals | null; - stdout: string; - error: Error | null; -}; - -type BackgroundRefreshJob = { - child: ChildProcessByStdio; - repo: RepositoryTarget; - selection: UpdateTaskSelection; - stdoutBuffer: string; - terminatedByUser: boolean; - exitPromise: Promise; -}; - export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { const term = env.TERM; if (!term) { @@ -111,31 +82,6 @@ function createScreen(options: Parameters[0]): blessed.Wi const ACTIVITY_LOG_LIMIT = 200; const FOOTER_LOG_LINES = 3; -const UPDATE_TASK_ORDER: Array = ['sync', 'embed', 'cluster']; - -export function buildRefreshCliArgs(target: RepositoryTarget, selection: UpdateTaskSelection): string[] { - const args = ['refresh', `${target.owner}/${target.repo}`]; - if (!selection.sync) args.push('--no-sync'); - if (!selection.embed) args.push('--no-embed'); - if (!selection.cluster) args.push('--no-cluster'); - return args; -} - -function createCliLaunch(args: string[]): { command: string; args: string[] } { - const here = path.dirname(fileURLToPath(import.meta.url)); - const distEntrypoint = path.resolve(here, '..', 'main.js'); - if (existsSync(distEntrypoint)) { - return { command: process.execPath, args: [distEntrypoint, ...args] }; - } - - const sourceEntrypoint = path.resolve(here, '..', 'main.ts'); - const require = createRequire(import.meta.url); - const tsxLoader = require.resolve('tsx'); - return { - command: process.execPath, - args: ['--conditions=development', '--import', tsxLoader, sourceEntrypoint, ...args], - }; -} export async function startTui(params: StartTuiParams): Promise { const selectedRepository = params.owner && params.repo ? { owner: params.owner, repo: params.repo } : null; @@ -145,7 +91,7 @@ export async function startTui(params: StartTuiParams): Promise { let focusPane: TuiFocusPane = 'clusters'; const initialPreference = selectedRepository ? getTuiRepositoryPreference(params.service.config, currentRepository.owner, currentRepository.repo) - : { sortMode: 'recent' as TuiClusterSortMode, minClusterSize: 10 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; + : { sortMode: 'recent' as TuiClusterSortMode, minClusterSize: 1 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; let minSize: TuiMinSizeFilter = initialPreference.minClusterSize; let wideLayout: TuiWideLayoutPreference = initialPreference.wideLayout; @@ -164,12 +110,7 @@ export async function startTui(params: StartTuiParams): Promise { const activityLines: string[] = []; const clusterDetailCache = new Map(); const threadDetailCache = new Map(); - let syncJobRunning = false; - let embedJobRunning = false; - let clusterJobRunning = false; - let activeJob: BackgroundRefreshJob | null = null; let modalOpen = false; - let exitRequested = false; const clearCaches = (): void => { clusterDetailCache.clear(); @@ -190,7 +131,8 @@ export async function startTui(params: StartTuiParams): Promise { clusterItems = snapshot.clusters.map((cluster, index) => { clusterIndexById.set(cluster.clusterId, index); const updated = formatClusterDateColumn(cluster.latestUpdatedAt); - const label = `${String(cluster.totalCount).padStart(3, ' ')} C${String(cluster.clusterId).padStart(5, ' ')} ${String(cluster.pullRequestCount).padStart(2, ' ')}P/${String(cluster.issueCount).padStart(2, ' ')}I ${updated} ${cluster.displayTitle}`; + const meta = `${cluster.totalCount} items C${cluster.clusterId} ${cluster.pullRequestCount}P/${cluster.issueCount}I ${updated}`; + const label = `${cluster.displayTitle} ${meta}`; return cluster.isClosed ? `{gray-fg}${escapeBlessedText(label)}{/gray-fg}` : escapeBlessedText(label); }); widgets.clusters.setItems(clusterItems); @@ -204,12 +146,6 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; - const setActiveJobFlags = (selection: UpdateTaskSelection | null): void => { - syncJobRunning = selection?.sync === true; - embedJobRunning = selection?.embed === true; - clusterJobRunning = selection?.cluster === true; - }; - const loadClusterDetail = (clusterId: number): TuiClusterDetail => { const cached = clusterDetailCache.get(clusterId); if (cached) return cached; @@ -389,16 +325,13 @@ export async function startTui(params: StartTuiParams): Promise { widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane)); updatePaneStyles(widgets, focusPane); - const activeJobs = [syncJobRunning ? 'sync' : null, embedJobRunning ? 'embed' : null, clusterJobRunning ? 'cluster' : null] - .filter(Boolean) - .join(', ') || 'idle'; const logLines = activityLines.slice(-FOOTER_LOG_LINES); const footerLines = [...logLines]; while (footerLines.length < FOOTER_LOG_LINES) { footerLines.unshift(''); } footerLines.push( - `${status} | jobs:${activeJobs} | h/? help # jump g update p repos / filter s sort f min l layout x closed`, + `${status} | h/? help # jump p repos / filter s sort f min l layout x closed`, ); footerLines.push( `Tab focus arrows move-or-scroll PgUp/PgDn page r refresh o open q quit`, @@ -417,133 +350,6 @@ export async function startTui(params: StartTuiParams): Promise { widgets.screen.render(); }; - const consumeStreamLines = ( - stream: NodeJS.ReadableStream, - onLine: (line: string) => void, - ): void => { - let buffer = ''; - stream.setEncoding('utf8'); - stream.on('data', (chunk: string) => { - buffer += chunk; - while (true) { - const newlineIndex = buffer.indexOf('\n'); - if (newlineIndex === -1) break; - const line = buffer.slice(0, newlineIndex).replace(/\r$/, '').trimEnd(); - buffer = buffer.slice(newlineIndex + 1); - if (line.length > 0) onLine(line); - } - }); - stream.on('end', () => { - const line = buffer.replace(/\r$/, '').trim(); - if (line.length > 0) onLine(line); - }); - }; - - const finalizeBackgroundJob = (job: BackgroundRefreshJob): void => { - void (async () => { - const result = await job.exitPromise; - if (activeJob === job) { - activeJob = null; - } - setActiveJobFlags(null); - - if (job.terminatedByUser) { - pushActivity(`[jobs] update pipeline terminated for ${job.repo.owner}/${job.repo.repo}`); - } else if (result.error) { - pushActivity(`[jobs] update pipeline failed for ${job.repo.owner}/${job.repo.repo}: ${result.error.message}`); - } else if (result.code === 0) { - pushActivity(`[jobs] update pipeline complete for ${job.repo.owner}/${job.repo.repo}`); - try { - const parsed = JSON.parse(result.stdout.trim()) as { - sync?: { threadsSynced?: number; threadsClosed?: number } | null; - embed?: { embedded?: number } | null; - cluster?: { clusters?: number; edges?: number } | null; - }; - const summaryParts = [ - parsed.sync ? `sync:${parsed.sync.threadsSynced ?? 0} threads` : null, - parsed.sync ? `closed:${parsed.sync.threadsClosed ?? 0}` : null, - parsed.embed ? `embed:${parsed.embed.embedded ?? 0}` : null, - parsed.cluster ? `cluster:${parsed.cluster.clusters ?? 0}` : null, - parsed.cluster ? `edges:${parsed.cluster.edges ?? 0}` : null, - ].filter((value): value is string => value !== null); - if (summaryParts.length > 0) { - pushActivity(`[jobs] result ${summaryParts.join(' ')}`); - } - } catch { - // Ignore malformed stdout; progress is already visible in the activity log. - } - if (currentRepository.owner === job.repo.owner && currentRepository.repo === job.repo.repo) { - refreshAll(true); - } - } else { - const exitSuffix = - result.signal !== null ? `signal=${result.signal}` : `code=${result.code ?? 1}`; - pushActivity(`[jobs] update pipeline failed for ${job.repo.owner}/${job.repo.repo}: exited ${exitSuffix}`); - } - - status = 'Ready'; - if (!exitRequested) { - render(); - } - })(); - }; - - const startBackgroundUpdatePipeline = (target: RepositoryTarget, selection: UpdateTaskSelection): boolean => { - if (activeJob !== null) { - pushActivity('[jobs] another update pipeline is already running'); - return false; - } - if (!selection.sync && !selection.embed && !selection.cluster) { - pushActivity('[jobs] select at least one update step'); - return false; - } - - const cliArgs = buildRefreshCliArgs(target, selection); - const launch = createCliLaunch(cliArgs); - const child = spawn(launch.command, launch.args, { - env: process.env, - stdio: ['ignore', 'pipe', 'pipe'], - }); - - const job: BackgroundRefreshJob = { - child, - repo: target, - selection, - stdoutBuffer: '', - terminatedByUser: false, - exitPromise: new Promise((resolve) => { - let resolved = false; - const finish = (result: BackgroundJobResult): void => { - if (resolved) return; - resolved = true; - resolve(result); - }; - child.on('error', (error) => { - finish({ code: null, signal: null, stdout: job.stdoutBuffer, error }); - }); - child.on('close', (code, signal) => { - finish({ code, signal, stdout: job.stdoutBuffer, error: null }); - }); - }), - }; - - child.stdout.setEncoding('utf8'); - child.stdout.on('data', (chunk: string) => { - job.stdoutBuffer += chunk; - }); - consumeStreamLines(child.stderr, (line) => pushActivity(line, { raw: true })); - - activeJob = job; - setActiveJobFlags(selection); - status = `Running update pipeline for ${target.owner}/${target.repo}`; - pushActivity( - `[jobs] starting update pipeline for ${target.owner}/${target.repo}: ${UPDATE_TASK_ORDER.filter((task) => selection[task]).join(' -> ')}`, - ); - render(); - finalizeBackgroundJob(job); - return true; - }; - const moveSelection = (delta: -1 | 1, options?: { steps?: number; wrap?: boolean }): void => { if (!snapshot) return; const steps = Math.max(1, options?.steps ?? 1); @@ -714,116 +520,11 @@ export async function startTui(params: StartTuiParams): Promise { })(); }; - const promptConfirm = async (label: string, message: string): Promise => { - const box = blessed.box({ - parent: widgets.screen, - border: 'line', - label: ` ${label} `, - tags: true, - top: 'center', - left: 'center', - width: '68%', - height: 9, - padding: { - left: 1, - right: 1, - }, - style: { - border: { fg: '#fde74c' }, - fg: 'white', - bg: '#101522', - }, - content: `${message}\n\nPress y or Enter to confirm. Press n or Esc to cancel.`, - }); - - widgets.screen.render(); - - return await new Promise((resolve) => { - const finish = (value: boolean): void => { - widgets.screen.off('keypress', handleKeypress); - box.destroy(); - widgets.screen.render(); - resolve(value); - }; - const handleKeypress = (char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { - if (key.name === 'enter' || char.toLowerCase() === 'y') { - finish(true); - return; - } - if (key.name === 'escape' || char.toLowerCase() === 'n' || key.name === 'q') { - finish(false); - } - }; - - widgets.screen.on('keypress', handleKeypress); - }); - }; - const requestQuit = (): void => { if (modalOpen) return; - void (async () => { - if (activeJob === null) { - widgets.screen.destroy(); - return; - } - - modalOpen = true; - try { - const confirmed = await promptConfirm( - 'Stop Update Pipeline', - `A background update pipeline is still running for ${activeJob.repo.owner}/${activeJob.repo.repo}.\nQuitting now will send SIGTERM to that refresh process and wait for it to exit.`, - ); - if (!confirmed) { - render(); - return; - } - - exitRequested = true; - status = 'Stopping background update pipeline'; - pushActivity(`[jobs] stopping update pipeline for ${activeJob.repo.owner}/${activeJob.repo.repo}`); - render(); - activeJob.terminatedByUser = true; - activeJob.child.kill('SIGTERM'); - await activeJob.exitPromise; - widgets.screen.destroy(); - } finally { - modalOpen = false; - } - })(); - }; - - const promptUpdatePipeline = (): void => { - if (modalOpen || hasActiveJobs()) { - if (hasActiveJobs()) { - pushActivity('[jobs] update pipeline is unavailable while another job is running'); - } - return; - } - - void (async () => { - modalOpen = true; - try { - const selection = await promptUpdatePipelineSelection( - widgets.screen, - snapshot?.stats ?? null, - params.service.config.embeddingBasis, - ); - if (!selection) { - render(); - return; - } - const selectedTasks = UPDATE_TASK_ORDER.filter((task) => selection[task]).join(' -> '); - pushActivity(`[jobs] queued update pipeline: ${selectedTasks}`); - startBackgroundUpdatePipeline(currentRepository, selection); - updateFocus('clusters'); - } finally { - modalOpen = false; - } - })(); + widgets.screen.destroy(); }; - const hasActiveJobs = (): boolean => activeJob !== null; - const persistRepositoryPreference = (): void => { writeTuiRepositoryPreference(params.service.config, { owner: currentRepository.owner, @@ -902,26 +603,8 @@ export async function startTui(params: StartTuiParams): Promise { refreshAll(false); }; - const runRepositoryBootstrap = (target: RepositoryTarget): boolean => { - if (hasActiveJobs()) { - pushActivity('[repo] repository setup is blocked while jobs are already running'); - return false; - } - - setRepositoryPending(target, { - minClusterSize: 1, - status: `Preparing ${target.owner}/${target.repo}`, - }); - pushActivity(`[repo] opened ${target.owner}/${target.repo}; starting initial update pipeline in the background`); - return startBackgroundUpdatePipeline(target, { sync: true, embed: true, cluster: true }); - }; - const browseRepositories = (): void => { if (modalOpen) return; - if (hasActiveJobs()) { - pushActivity('[repo] repository switching is disabled while jobs are running'); - return; - } void (async () => { modalOpen = true; @@ -946,7 +629,11 @@ export async function startTui(params: StartTuiParams): Promise { render(); return; } - runRepositoryBootstrap(target); + setRepositoryPending(target, { + minClusterSize: 1, + status: `No local data for ${target.owner}/${target.repo}; run sync/embed/cluster in the CLI, then press r`, + }); + pushActivity(`[repo] selected ${target.owner}/${target.repo}; run ghcrawl sync/embed/cluster from the shell`); updateFocus('clusters'); } catch (error) { status = 'Repository action failed'; @@ -982,10 +669,11 @@ export async function startTui(params: StartTuiParams): Promise { if (!target) { return false; } - const ready = runRepositoryBootstrap(target); - if (!ready) { - return false; - } + setRepositoryPending(target, { + minClusterSize: 1, + status: `No local data for ${target.owner}/${target.repo}; run sync/embed/cluster in the CLI, then press r`, + }); + pushActivity(`[repo] selected ${target.owner}/${target.repo}; run ghcrawl sync/embed/cluster from the shell`); updateFocus('clusters'); return true; } catch (error) { @@ -1099,10 +787,6 @@ export async function startTui(params: StartTuiParams): Promise { openHelp(); }); widgets.screen.key(['p'], () => browseRepositories()); - widgets.screen.key(['g'], () => { - if (modalOpen) return; - promptUpdatePipeline(); - }); widgets.screen.key(['r'], () => { if (modalOpen) return; status = 'Refreshing'; @@ -1130,7 +814,6 @@ export async function startTui(params: StartTuiParams): Promise { return; } } - pushActivity('[jobs] press g to run the staged update pipeline: GitHub sync, embeddings, then clusters'); updateFocus('clusters'); await new Promise((resolve) => widgets.screen.once('destroy', () => resolve())); @@ -1289,79 +972,6 @@ function openUrl(url: string): void { child.unref(); } -export function describeUpdateTask( - task: keyof UpdateTaskSelection, - stats: TuiRepoStats | null, - now: Date = new Date(), -): string { - if (!stats) { - if (task === 'sync') return 'recommended'; - if (task === 'embed') return 'recommended after sync'; - return 'recommended after embeddings'; - } - - if (task === 'sync') { - return stats.lastGithubReconciliationAt - ? `up to date, last ${formatRelativeTime(stats.lastGithubReconciliationAt, now)}` - : 'never run'; - } - - if (task === 'embed') { - if (!stats.lastEmbedRefreshAt) return 'never run'; - if (stats.staleEmbedThreadCount > 0) { - return `outdated: ${stats.staleEmbedThreadCount} stale, last ${formatRelativeTime(stats.lastEmbedRefreshAt, now)}`; - } - const syncMs = parseDateOrNull(stats.lastGithubReconciliationAt); - const embedMs = parseDateOrNull(stats.lastEmbedRefreshAt); - if (syncMs !== null && embedMs !== null && embedMs < syncMs) { - return `outdated: GitHub is newer by ${formatAge(syncMs - embedMs)}`; - } - return `up to date, last ${formatRelativeTime(stats.lastEmbedRefreshAt, now)}`; - } - - if (!stats.latestClusterRunFinishedAt) return 'never run'; - const embedMs = parseDateOrNull(stats.lastEmbedRefreshAt); - const clusterMs = parseDateOrNull(stats.latestClusterRunFinishedAt); - if (embedMs !== null && clusterMs !== null && clusterMs < embedMs) { - return `outdated: embeddings are newer by ${formatAge(embedMs - clusterMs)}`; - } - return `up to date, last ${formatRelativeTime(stats.latestClusterRunFinishedAt, now)}`; -} - -export function buildUpdatePipelineLabels( - stats: TuiRepoStats | null, - selection: UpdateTaskSelection, - now: Date = new Date(), -): string[] { - return UPDATE_TASK_ORDER.map((task) => { - const mark = selection[task] ? '[x]' : '[ ]'; - const title = task === 'sync' ? 'GitHub sync/reconcile' : task === 'embed' ? 'Embed refresh' : 'Cluster rebuild'; - return `${mark} ${title} ${describeUpdateTask(task, stats, now)}`; - }); -} - -export function buildUpdatePipelineHelpContent(embeddingBasis: EmbeddingBasis): string { - const summaryStatus = - embeddingBasis === 'title_summary' - ? 'LLM summaries: enabled via title_summary.' - : embeddingBasis === 'llm_key_summary' - ? '3-line key summaries: active embedding basis.' - : 'LLM summaries: disabled; current basis is title_original.'; - const summaryAction = - embeddingBasis === 'title_summary' - ? 'On openclaw/openclaw this improved non-solo cluster membership by about 50% versus title_original.' - : embeddingBasis === 'llm_key_summary' - ? 'Run `ghcrawl key-summaries` before embedding so the active vectors have deterministic key text.' - : 'Enable with `ghcrawl configure --embedding-basis title_summary` if you want richer clustering; on openclaw/openclaw that improved non-solo cluster membership by about 50%.'; - return [ - 'Usually you want all three. Run order is fixed: GitHub sync/reconcile -> embeddings -> clusters.', - `${summaryStatus} ${summaryAction}`, - 'A first summarize of ~18k open issues/PRs in openclaw/openclaw typically costs about $15-$30 with gpt-5-mini.', - 'Later refreshes are usually much cheaper because only changed items need summaries.', - 'Toggle with space, move with arrows, Enter to start, Esc to cancel.', - ].join('\n'); -} - export function buildHelpContent(): string { return [ '{bold}ghcrawl TUI Help{/bold}', @@ -1384,16 +994,17 @@ export function buildHelpContent(): string { 'r refresh the current local view from SQLite', '', '{bold}Actions{/bold}', - 'g start the staged update pipeline in the background (GitHub, embeddings, clusters)', - 'p open the repository browser / sync a new repository', + 'p open the repository browser / select another local repository', 'o open the selected thread URL in your browser', '', '{bold}Help And Exit{/bold}', 'h or ? open this help popup', - 'q quit the TUI (or close this popup); warns if a background update is running', + 'q quit the TUI or close this popup', 'Esc close this popup', '', '{bold}Notes{/bold}', + 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', + 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', 'The footer only shows the short command list. Open help to see the full list.', 'This popup scrolls. Use arrows, PgUp/PgDn, Home, and End if it does not fit.', @@ -1484,85 +1095,6 @@ async function promptHelp(screen: blessed.Widgets.Screen): Promise { }); } -async function promptUpdatePipelineSelection( - screen: blessed.Widgets.Screen, - stats: TuiRepoStats | null, - embeddingBasis: EmbeddingBasis, -): Promise { - const selection: UpdateTaskSelection = { sync: true, embed: true, cluster: true }; - const modalWidth = '76%'; - const box = blessed.list({ - parent: screen, - border: 'line', - label: ' Update Pipeline ', - keys: true, - vi: true, - mouse: false, - top: 'center', - left: 'center', - width: modalWidth, - height: 14, - style: { - border: { fg: '#5bc0eb' }, - item: { fg: 'white' }, - selected: { bg: '#5bc0eb', fg: 'black', bold: true }, - }, - items: buildUpdatePipelineLabels(stats, selection), - }); - const help = blessed.box({ - parent: screen, - top: 'center-5', - left: 'center', - width: modalWidth, - height: 7, - style: { fg: 'white', bg: '#101522' }, - content: buildUpdatePipelineHelpContent(embeddingBasis), - }); - - box.focus(); - box.select(0); - screen.render(); - - return await new Promise((resolve) => { - const getSelectedIndex = (): number => { - const selectedIndex = (box as blessed.Widgets.ListElement & { selected?: number }).selected; - return typeof selectedIndex === 'number' && selectedIndex >= 0 ? selectedIndex : 0; - }; - const refreshItems = (): void => { - const selectedIndex = getSelectedIndex(); - box.setItems(buildUpdatePipelineLabels(stats, selection)); - box.select(selectedIndex); - screen.render(); - }; - const finish = (value: UpdateTaskSelection | null): void => { - screen.off('keypress', handleKeypress); - box.destroy(); - help.destroy(); - screen.render(); - resolve(value); - }; - const handleKeypress = (_char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { - if (key.name === 'escape' || key.name === 'q') { - finish(null); - return; - } - if (key.name === 'space') { - const index = getSelectedIndex(); - const task = UPDATE_TASK_ORDER[index]; - if (!task) return; - selection[task] = !selection[task]; - if (!selection.sync && !selection.embed && !selection.cluster) { - selection[task] = true; - } - refreshItems(); - } - }; - - screen.on('keypress', handleKeypress); - box.on('select', () => finish({ ...selection })); - }); -} - export function getRepositoryChoices(service: Pick, now: Date = new Date()): RepositoryChoice[] { const repositories = service.listRepositories().repositories .slice() @@ -1574,7 +1106,7 @@ export function getRepositoryChoices(service: Pick((resolve) => { - prompt.input('Repository to sync (owner/repo)', '', (_error, value) => { + prompt.input('Repository to open (owner/repo)', '', (_error, value) => { prompt.destroy(); const parsed = parseOwnerRepoValue((value ?? '').trim()); resolve(parsed); @@ -1672,52 +1204,6 @@ async function promptRepositoryInput(screen: blessed.Widgets.Screen): Promise { - log?.log(`[setup] starting initial setup for ${target.owner}/${target.repo}`); - footer?.setContent('Running initial sync, embed, and cluster. This can take a while.'); - screen.render(); - - try { - const reporter = (message: string): void => { - log?.log(message); - screen.render(); - }; - await service.syncRepository({ - owner: target.owner, - repo: target.repo, - onProgress: reporter, - }); - await service.embedRepository({ - owner: target.owner, - repo: target.repo, - onProgress: reporter, - }); - await service.clusterRepository({ - owner: target.owner, - repo: target.repo, - onProgress: reporter, - }); - writeTuiRepositoryPreference(service.config, { - owner: target.owner, - repo: target.repo, - minClusterSize: 1, - sortMode: 'recent', - wideLayout: 'columns', - }); - log?.log('[setup] initial setup complete'); - return true; - } catch (error) { - log?.log(`[setup] failed: ${error instanceof Error ? error.message : String(error)}`); - return false; - } -} - export function parseOwnerRepoValue(value: string): { owner: string; repo: string } | null { const parts = value.trim().split('/'); if (parts.length !== 2 || !parts[0] || !parts[1]) { @@ -1730,12 +1216,6 @@ function formatActivityTimestamp(now: Date = new Date()): string { return now.toISOString().slice(11, 19); } -function parseDateOrNull(value: string | null | undefined): number | null { - if (!value) return null; - const parsed = Date.parse(value); - return Number.isNaN(parsed) ? null : parsed; -} - export function formatClusterDateColumn(value: string | null, locales?: Intl.LocalesArgument): string { if (!value) return 'unknown'; const parsed = new Date(value); @@ -1757,24 +1237,6 @@ export function formatClusterDateColumn(value: string | null, locales?: Intl.Loc return `${date} ${hour}:${minute}`; } -function formatAge(diffMs: number): string { - const safeDiffMs = Math.max(0, diffMs); - const minuteMs = 60_000; - const hourMs = 60 * minuteMs; - const dayMs = 24 * hourMs; - - if (safeDiffMs < hourMs) { - return `${Math.max(1, Math.floor(safeDiffMs / minuteMs))}m`; - } - if (safeDiffMs < dayMs) { - return `${Math.floor(safeDiffMs / hourMs)}h`; - } - if (safeDiffMs < 14 * dayMs) { - return `${Math.floor(safeDiffMs / dayMs)}d`; - } - return `${Math.floor(safeDiffMs / dayMs)}d`; -} - function formatRelativeTime(value: string | null, now: Date = new Date()): string { if (!value) return 'never'; const parsed = new Date(value); diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 26d38d9..6df8898 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -10,7 +10,8 @@ test('cycleSortMode toggles recent and size', () => { }); test('cycleMinSizeFilter rotates through presets', () => { - assert.equal(cycleMinSizeFilter(1), 10); + assert.equal(cycleMinSizeFilter(1), 2); + assert.equal(cycleMinSizeFilter(2), 10); assert.equal(cycleMinSizeFilter(10), 20); assert.equal(cycleMinSizeFilter(20), 50); assert.equal(cycleMinSizeFilter(50), 0); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 696b944..7316120 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -1,14 +1,14 @@ import type { TuiClusterDetail, TuiClusterSortMode, TuiClusterSummary } from '@ghcrawl/api-core'; export type TuiFocusPane = 'clusters' | 'members' | 'detail'; -export type TuiMinSizeFilter = 0 | 1 | 10 | 20 | 50; +export type TuiMinSizeFilter = 0 | 1 | 2 | 10 | 20 | 50; export type MemberListRow = | { key: string; label: string; selectable: false } | { key: string; label: string; selectable: true; threadId: number }; export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['recent', 'size']; -export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 10, 20, 50, 0]; +export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 2, 10, 20, 50, 0]; export const FOCUS_PANE_ORDER: TuiFocusPane[] = ['clusters', 'members', 'detail']; export function cycleSortMode(current: TuiClusterSortMode): TuiClusterSortMode { diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index 4cc1b77..af9c9ad 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -271,7 +271,7 @@ test('writeTuiRepositoryPreference persists sort and min cluster size by reposit wideLayout: 'right-stack', }); assert.deepEqual(getTuiRepositoryPreference(reloaded, 'other', 'repo'), { - minClusterSize: 10, + minClusterSize: 1, sortMode: 'recent', wideLayout: 'columns', }); diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index a8c6553..2ff12f3 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -6,7 +6,7 @@ import dotenv from 'dotenv'; export type ConfigValueSource = 'env' | 'config' | 'dotenv' | 'default' | 'none'; export type TuiSortPreference = 'recent' | 'size'; -export type TuiMinClusterSize = 0 | 1 | 10 | 20 | 50; +export type TuiMinClusterSize = 0 | 1 | 2 | 10 | 20 | 50; export type TuiWideLayoutPreference = 'columns' | 'right-stack'; export type EmbeddingBasis = 'title_original' | 'title_summary' | 'llm_key_summary'; export type VectorBackend = 'vectorlite'; @@ -162,7 +162,7 @@ function getTuiSortPreference(value: unknown): TuiSortPreference | undefined { } function getTuiMinClusterSize(value: unknown): TuiMinClusterSize | undefined { - return value === 0 || value === 1 || value === 10 || value === 20 || value === 50 ? value : undefined; + return value === 0 || value === 1 || value === 2 || value === 10 || value === 20 || value === 50 ? value : undefined; } function getTuiWideLayoutPreference(value: unknown): TuiWideLayoutPreference | undefined { @@ -396,7 +396,7 @@ export function ensureRuntimeDirs(config: GitcrawlConfig): void { } export function getTuiRepositoryPreference(config: GitcrawlConfig, owner: string, repo: string): TuiRepositoryPreference { - return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 10, sortMode: 'recent', wideLayout: 'columns' }; + return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'recent', wideLayout: 'columns' }; } export function writeTuiRepositoryPreference( diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index d2c6c32..3681820 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3004,7 +3004,7 @@ test('clusterRepository does not retain a parsed embedding cache in-process', as } }); -test('tui snapshot returns mixed issue and pull request counts with default recent sort and filters', () => { +test('tui snapshot returns mixed issue and pull request counts with default visible cluster filter', () => { const service = makeTestService({ getRepo: async () => ({}), listRepositoryIssues: async () => [], @@ -3077,7 +3077,10 @@ test('tui snapshot returns mixed issue and pull request counts with default rece assert.equal(snapshot.stats.staleEmbedThreadCount, 5); assert.equal(snapshot.stats.staleEmbedSourceCount, 5); assert.equal(snapshot.stats.latestClusterRunId, 1); - assert.equal(snapshot.clusters.length, 0); + assert.deepEqual( + snapshot.clusters.map((cluster) => cluster.clusterId), + [101, 100], + ); const allSnapshot = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); assert.deepEqual( diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 01af448..439369c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3183,7 +3183,7 @@ export class GHCrawlService { const includeClosedClusters = params.includeClosedClusters ?? true; const clusters = this.listRawTuiClusters(repository.id, latestRun.id) .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed)) - .filter((cluster) => cluster.totalCount >= (params.minSize ?? 10)) + .filter((cluster) => cluster.totalCount >= (params.minSize ?? 1)) .filter((cluster) => { const search = params.search?.trim().toLowerCase(); if (!search) return true; From b4ed2b436ebc5be2ab2667d36475d50d8a6bba33 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:21:25 -0700 Subject: [PATCH 068/215] fix(tui): improve cluster browsing ergonomics --- apps/cli/src/tui/app.test.ts | 29 +++++ apps/cli/src/tui/app.ts | 160 ++++++++++++++++++++------- apps/cli/src/tui/state.test.ts | 4 +- apps/cli/src/tui/state.ts | 9 +- packages/api-core/src/config.test.ts | 2 +- packages/api-core/src/config.ts | 2 +- packages/api-core/src/service.ts | 2 +- 7 files changed, 156 insertions(+), 52 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 10b287a..828557e 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -7,6 +7,8 @@ import { buildHelpContent, escapeBlessedText, formatClusterDateColumn, + formatClusterListLabel, + formatClusterShortName, getRepositoryChoices, parseOwnerRepoValue, renderDetailPane, @@ -94,6 +96,31 @@ test('formatClusterDateColumn follows locale month/day ordering while keeping fi assert.equal(formatClusterDateColumn(iso, 'en-GB'), '10-03 16:04'); }); +test('formatClusterListLabel keeps counts first and adds a short cluster name', () => { + const label = formatClusterListLabel({ + clusterId: 1507, + displayTitle: 'Fix: dedupe section title/desc in single-section config view', + isClosed: false, + closedAtLocal: null, + closeReasonLocal: null, + totalCount: 3, + issueCount: 0, + pullRequestCount: 3, + latestUpdatedAt: '2026-04-24T07:29:02', + representativeThreadId: 252, + representativeNumber: 55342, + representativeKind: 'issue', + searchText: 'fix dedupe section', + }); + + assert.match(label, /3 items\s+C1507\s+3P\/0I\s+04-24 07:29\s+Fix: dedupe section/); +}); + +test('formatClusterShortName returns the first meaningful words', () => { + assert.equal(formatClusterShortName('[codex] fix agent session-id routing'), 'codex fix agent'); + assert.equal(formatClusterShortName(''), 'untitled'); +}); + test('getRepositoryChoices sorts by most recent update and includes the new-repo action', () => { const service = { listRepositories() { @@ -136,6 +163,8 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /#\s+jump directly to an issue or PR number/); assert.match(content, /TUI only reads local SQLite/); assert.match(content, /default cluster filter is 1\+/); + assert.match(content, /default sort is size/); + assert.match(content, /Mouse clicks focus panes/); assert.match(content, /p\s+open the repository browser/); assert.match(content, /l\s+toggle wide layout/); assert.match(content, /x\s+show or hide locally closed clusters and members/); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index b3ef2bf..dda6c71 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -5,6 +5,7 @@ import blessed from 'neo-blessed'; import type { GHCrawlService, TuiClusterDetail, + TuiClusterSummary, TuiClusterSortMode, TuiSnapshot, TuiThreadDetail, @@ -19,7 +20,6 @@ import { findSelectableIndex, moveSelectableIndex, preserveSelectedId, - selectedThreadIdFromRow, type MemberListRow, type TuiFocusPane, type TuiMinSizeFilter, @@ -89,9 +89,10 @@ export async function startTui(params: StartTuiParams): Promise { const widgets = createWidgets(currentRepository.owner, currentRepository.repo); let focusPane: TuiFocusPane = 'clusters'; + let isRendering = false; const initialPreference = selectedRepository ? getTuiRepositoryPreference(params.service.config, currentRepository.owner, currentRepository.repo) - : { sortMode: 'recent' as TuiClusterSortMode, minClusterSize: 1 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; + : { sortMode: 'size' as TuiClusterSortMode, minClusterSize: 1 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; let minSize: TuiMinSizeFilter = initialPreference.minClusterSize; let wideLayout: TuiWideLayoutPreference = initialPreference.wideLayout; @@ -130,9 +131,7 @@ export async function startTui(params: StartTuiParams): Promise { clusterIndexById = new Map(); clusterItems = snapshot.clusters.map((cluster, index) => { clusterIndexById.set(cluster.clusterId, index); - const updated = formatClusterDateColumn(cluster.latestUpdatedAt); - const meta = `${cluster.totalCount} items C${cluster.clusterId} ${cluster.pullRequestCount}P/${cluster.issueCount}I ${updated}`; - const label = `${cluster.displayTitle} ${meta}`; + const label = formatClusterListLabel(cluster); return cluster.isClosed ? `{gray-fg}${escapeBlessedText(label)}{/gray-fg}` : escapeBlessedText(label); }); widgets.clusters.setItems(clusterItems); @@ -315,12 +314,17 @@ export async function startTui(params: StartTuiParams): Promise { `{bold}${repoLabel}{/bold} {cyan-fg}${snapshot?.stats.openPullRequestCount ?? 0} PR{/cyan-fg} {green-fg}${snapshot?.stats.openIssueCount ?? 0} issues{/green-fg} GH:${ghStatus} Emb:${embedStatus} Cl:${clusterStatus} sort:${sortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} layout:${wideLayout === 'columns' ? 'cols' : 'stack'} closed:${showClosed ? 'shown' : 'hidden'} filter:${search || 'none'}`, ); - const clusterIndex = snapshot && selectedClusterId !== null ? Math.max(0, clusterIndexById.get(selectedClusterId) ?? -1) : 0; - widgets.clusters.select(clusterIndex); + isRendering = true; + try { + const clusterIndex = snapshot && selectedClusterId !== null ? Math.max(0, clusterIndexById.get(selectedClusterId) ?? -1) : 0; + widgets.clusters.select(clusterIndex); - widgets.members.setItems(memberRows.length > 0 ? memberRows.map((row) => row.label) : ['No members']); - if (memberIndex >= 0) { - widgets.members.select(memberIndex); + widgets.members.setItems(memberRows.length > 0 ? memberRows.map((row) => row.label) : ['No members']); + if (memberIndex >= 0) { + widgets.members.select(memberIndex); + } + } finally { + isRendering = false; } widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane)); @@ -331,10 +335,10 @@ export async function startTui(params: StartTuiParams): Promise { footerLines.unshift(''); } footerLines.push( - `${status} | h/? help # jump p repos / filter s sort f min l layout x closed`, + `${status} | focus:${focusPane} sort:${sortMode} h/? help # jump p repos / filter s sort f min`, ); footerLines.push( - `Tab focus arrows move-or-scroll PgUp/PgDn page r refresh o open q quit`, + `Tab focus mouse click/select/scroll PgUp/PgDn page l layout x closed r refresh o open q quit`, ); widgets.footer.setContent(footerLines.join('\n')); widgets.screen.render(); @@ -363,26 +367,7 @@ export async function startTui(params: StartTuiParams): Promise { } else { nextIndex = Math.max(0, Math.min(snapshot.clusters.length - 1, nextIndex)); } - selectedClusterId = snapshot.clusters[nextIndex]?.clusterId ?? null; - if (selectedClusterId !== null) { - try { - clusterDetail = loadClusterDetail(selectedClusterId); - } catch { - status = 'Cluster data changed; refreshing view'; - refreshAll(true); - return; - } - memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed }); - selectedMemberThreadId = preserveSelectedId( - memberRows.filter((row) => row.selectable).map((row) => row.threadId), - null, - ); - memberIndex = findSelectableIndex(memberRows, selectedMemberThreadId); - loadSelectedThreadDetail(false); - resetDetailScroll(); - } - status = selectedClusterId !== null ? `Cluster ${selectedClusterId} (${nextIndex + 1}/${snapshot.clusters.length})` : `Cluster ${nextIndex + 1}/${snapshot.clusters.length}`; - render(); + selectClusterIndex(nextIndex); return; } @@ -396,12 +381,7 @@ export async function startTui(params: StartTuiParams): Promise { } nextIndex = candidateIndex; } - memberIndex = nextIndex; - selectedMemberThreadId = selectedThreadIdFromRow(memberRows, memberIndex); - loadSelectedThreadDetail(false); - resetDetailScroll(); - status = selectedMemberThreadId !== null ? `Selected #${threadDetail?.thread.number ?? '?'}` : 'No selectable member'; - render(); + selectMemberIndex(nextIndex); } }; @@ -418,6 +398,49 @@ export async function startTui(params: StartTuiParams): Promise { moveSelection(delta, { steps: getFocusedListPageSize(), wrap: false }); }; + const selectClusterIndex = (nextIndex: number): void => { + if (!snapshot || snapshot.clusters.length === 0) return; + const boundedIndex = Math.max(0, Math.min(snapshot.clusters.length - 1, nextIndex)); + selectedClusterId = snapshot.clusters[boundedIndex]?.clusterId ?? null; + if (selectedClusterId !== null) { + try { + clusterDetail = loadClusterDetail(selectedClusterId); + } catch { + status = 'Cluster data changed; refreshing view'; + refreshAll(true); + return; + } + memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed }); + selectedMemberThreadId = preserveSelectedId( + memberRows.filter((row) => row.selectable).map((row) => row.threadId), + null, + ); + memberIndex = findSelectableIndex(memberRows, selectedMemberThreadId); + loadSelectedThreadDetail(false); + resetDetailScroll(); + } + status = + selectedClusterId !== null + ? `Cluster ${selectedClusterId} (${boundedIndex + 1}/${snapshot.clusters.length})` + : `Cluster ${boundedIndex + 1}/${snapshot.clusters.length}`; + render(); + }; + + const selectMemberIndex = (nextIndex: number): void => { + if (memberRows.length === 0) return; + const row = memberRows[nextIndex]; + if (!row?.selectable) { + render(); + return; + } + memberIndex = nextIndex; + selectedMemberThreadId = row.threadId; + loadSelectedThreadDetail(false); + resetDetailScroll(); + status = selectedMemberThreadId !== null ? `Selected #${threadDetail?.thread.number ?? '?'}` : 'No selectable member'; + render(); + }; + const promptFilter = (): void => { modalOpen = true; const prompt = blessed.prompt({ @@ -796,6 +819,32 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen) return; openSelectedThread(); }); + widgets.clusters.on('select item', (_item, index) => { + if (isRendering || modalOpen) return; + focusPane = 'clusters'; + widgets.clusters.focus(); + selectClusterIndex(Number(index)); + }); + widgets.clusters.on('select', () => { + if (isRendering || modalOpen) return; + updateFocus('members'); + }); + widgets.members.on('select item', (_item, index) => { + if (isRendering || modalOpen) return; + focusPane = 'members'; + widgets.members.focus(); + selectMemberIndex(Number(index)); + }); + widgets.members.on('select', () => { + if (isRendering || modalOpen) return; + loadSelectedThreadDetail(true); + status = selectedMemberThreadId !== null ? `Loaded neighbors for #${threadDetail?.thread.number ?? '?'}` : status; + updateFocus('detail'); + }); + widgets.detail.on('click', () => { + if (modalOpen) return; + updateFocus('detail'); + }); widgets.screen.on('resize', () => render()); widgets.screen.on('destroy', () => { @@ -825,6 +874,7 @@ function createWidgets(owner: string, repo: string): Widgets { fullUnicode: true, dockBorders: true, autoPadding: false, + mouse: true, title: owner && repo ? `ghcrawl ${owner}/${repo}` : 'ghcrawl', }); const header = blessed.box({ @@ -838,6 +888,7 @@ function createWidgets(owner: string, repo: string): Widgets { label: ' Clusters ', tags: true, keys: false, + mouse: true, style: { border: { fg: '#5bc0eb' }, item: { fg: 'white' }, @@ -851,6 +902,7 @@ function createWidgets(owner: string, repo: string): Widgets { label: ' Members ', tags: true, keys: false, + mouse: true, style: { border: { fg: '#9bc53d' }, item: { fg: 'white' }, @@ -866,6 +918,7 @@ function createWidgets(owner: string, repo: string): Widgets { scrollable: true, alwaysScroll: true, keys: false, + mouse: true, scrollbar: { ch: ' ' }, style: { border: { fg: '#fde74c' }, @@ -882,9 +935,16 @@ function createWidgets(owner: string, repo: string): Widgets { } function updatePaneStyles(widgets: Widgets, focus: TuiFocusPane): void { + widgets.clusters.setLabel(`${focus === 'clusters' ? '[*]' : '[ ]'} Clusters `); + widgets.members.setLabel(`${focus === 'members' ? '[*]' : '[ ]'} Members `); + widgets.detail.setLabel(`${focus === 'detail' ? '[*]' : '[ ]'} Detail `); widgets.clusters.style.border = { fg: focus === 'clusters' ? 'white' : '#5bc0eb' }; widgets.members.style.border = { fg: focus === 'members' ? 'white' : '#9bc53d' }; widgets.detail.style.border = { fg: focus === 'detail' ? 'white' : '#fde74c' }; + widgets.clusters.style.selected = + focus === 'clusters' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#23445c', fg: 'white', bold: true }; + widgets.members.style.selected = + focus === 'members' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#33521e', fg: 'white', bold: true }; } export function renderDetailPane( @@ -981,6 +1041,7 @@ export function buildHelpContent(): string { 'Left / Right cycle focus backward or forward across panes', 'Up / Down move selection, or scroll detail when detail is focused', 'Enter clusters -> members, members -> detail', + 'Mouse click a pane or row to focus/select; wheel scrolls lists and detail', 'PgUp / PgDn page through the focused pane or this help popup faster', 'Home / End jump to the top or bottom of detail or help', '', @@ -1005,6 +1066,8 @@ export function buildHelpContent(): string { '{bold}Notes{/bold}', 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', + 'The default sort is size. Press s to toggle size and recent.', + 'Mouse clicks focus panes; clicking an already selected row advances to the next pane.', 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', 'The footer only shows the short command list. Open help to see the full list.', 'This popup scrolls. Use arrows, PgUp/PgDn, Home, and End if it does not fit.', @@ -1022,7 +1085,7 @@ async function promptHelp(screen: blessed.Widgets.Screen): Promise { alwaysScroll: true, keys: true, vi: true, - mouse: false, + mouse: true, top: 'center', left: 'center', width: modalWidth, @@ -1121,7 +1184,7 @@ async function promptRepositoryChoice( label: ' Repositories ', keys: true, vi: true, - mouse: false, + mouse: true, top: 'center', left: 'center', width: '70%', @@ -1212,6 +1275,23 @@ export function parseOwnerRepoValue(value: string): { owner: string; repo: strin return { owner: parts[0], repo: parts[1] }; } +export function formatClusterListLabel(cluster: TuiClusterSummary): string { + const countLabel = `${cluster.totalCount} ${cluster.totalCount === 1 ? 'item' : 'items'}`.padStart(7); + const mixLabel = `${cluster.pullRequestCount}P/${cluster.issueCount}I`.padStart(6); + const updated = formatClusterDateColumn(cluster.latestUpdatedAt); + return `${countLabel} C${cluster.clusterId} ${mixLabel} ${updated} ${formatClusterShortName(cluster.displayTitle)}`; +} + +export function formatClusterShortName(title: string, maxWords = 3): string { + const words = title + .replace(/[\[\]{}()<>]/g, ' ') + .split(/\s+/) + .map((word) => word.trim()) + .filter(Boolean) + .slice(0, maxWords); + return words.join(' ') || 'untitled'; +} + function formatActivityTimestamp(now: Date = new Date()): string { return now.toISOString().slice(11, 19); } diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 6df8898..460611f 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -4,9 +4,9 @@ import assert from 'node:assert/strict'; import { buildMemberRows, cycleFocusPane, cycleMinSizeFilter, cycleSortMode, findSelectableIndex, moveSelectableIndex, preserveSelectedId, applyClusterFilters } from './state.js'; import type { TuiClusterDetail, TuiClusterSummary } from '@ghcrawl/api-core'; -test('cycleSortMode toggles recent and size', () => { - assert.equal(cycleSortMode('recent'), 'size'); +test('cycleSortMode toggles size and recent', () => { assert.equal(cycleSortMode('size'), 'recent'); + assert.equal(cycleSortMode('recent'), 'size'); }); test('cycleMinSizeFilter rotates through presets', () => { diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 7316120..404d5b6 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -7,13 +7,13 @@ export type MemberListRow = | { key: string; label: string; selectable: false } | { key: string; label: string; selectable: true; threadId: number }; -export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['recent', 'size']; +export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['size', 'recent']; export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 2, 10, 20, 50, 0]; export const FOCUS_PANE_ORDER: TuiFocusPane[] = ['clusters', 'members', 'detail']; export function cycleSortMode(current: TuiClusterSortMode): TuiClusterSortMode { const index = SORT_MODE_ORDER.indexOf(current); - return SORT_MODE_ORDER[(index + 1) % SORT_MODE_ORDER.length] ?? 'recent'; + return SORT_MODE_ORDER[(index + 1) % SORT_MODE_ORDER.length] ?? 'size'; } export function cycleMinSizeFilter(current: TuiMinSizeFilter): TuiMinSizeFilter { @@ -103,11 +103,6 @@ export function moveSelectableIndex(rows: MemberListRow[], currentIndex: number, return currentIndex; } -export function selectedThreadIdFromRow(rows: MemberListRow[], index: number): number | null { - const row = rows[index]; - return row && row.selectable ? row.threadId : null; -} - function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sortMode: TuiClusterSortMode): number { const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0; const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0; diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index af9c9ad..a2cae72 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -272,7 +272,7 @@ test('writeTuiRepositoryPreference persists sort and min cluster size by reposit }); assert.deepEqual(getTuiRepositoryPreference(reloaded, 'other', 'repo'), { minClusterSize: 1, - sortMode: 'recent', + sortMode: 'size', wideLayout: 'columns', }); }); diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index 2ff12f3..6f0f81f 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -396,7 +396,7 @@ export function ensureRuntimeDirs(config: GitcrawlConfig): void { } export function getTuiRepositoryPreference(config: GitcrawlConfig, owner: string, repo: string): TuiRepositoryPreference { - return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'recent', wideLayout: 'columns' }; + return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'size', wideLayout: 'columns' }; } export function writeTuiRepositoryPreference( diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 439369c..cd6289c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3189,7 +3189,7 @@ export class GHCrawlService { if (!search) return true; return cluster.searchText.includes(search); }) - .sort((left, right) => this.compareTuiClusterSummary(left, right, params.sort ?? 'recent')); + .sort((left, right) => this.compareTuiClusterSummary(left, right, params.sort ?? 'size')); return { repository, From 61485e00d24a1b73249b8b196e32a043df7327c1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:28:47 -0700 Subject: [PATCH 069/215] fix(cluster): bound automatic cluster components --- apps/cli/src/main.ts | 7 ++++- apps/cli/src/tui/app.test.ts | 5 ++-- apps/cli/src/tui/app.ts | 21 +++++++++++++-- packages/api-core/src/cluster/build.test.ts | 29 ++++++++++++++++++++- packages/api-core/src/service.ts | 8 +++++- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index b998d07..2d64f7f 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -279,12 +279,13 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'cluster', - synopsis: 'cluster [--number ] [--k ] [--threshold ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', + synopsis: 'cluster [--number ] [--k ] [--threshold ] [--max-cluster-size ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', description: 'Build or refresh local similarity clusters.', options: [ '--number Refresh only one durable cluster neighborhood', '--k Limit nearest-neighbor fanout', '--threshold Minimum similarity score', + '--max-cluster-size Soft cap for automatic cluster components before starting a new component', '--heap-snapshot-dir Write heap snapshots during long-running work', '--heap-log-interval-ms Emit periodic heap diagnostics', '--json Emit machine-readable JSON output explicitly', @@ -1249,6 +1250,10 @@ export async function run( threadNumber: typeof values.number === 'string' ? parsePositiveInteger('number', values.number, 'cluster') : undefined, k: typeof values.k === 'string' ? parsePositiveInteger('k', values.k, 'cluster') : undefined, minScore: typeof values.threshold === 'string' ? parseFiniteNumber('threshold', values.threshold, 'cluster') : undefined, + maxClusterSize: + typeof values['max-cluster-size'] === 'string' + ? parsePositiveInteger('max-cluster-size', values['max-cluster-size'], 'cluster') + : undefined, onProgress: heapDiagnostics?.wrapProgress((message: string) => writeProgress(message, stderr)) ?? ((message: string) => writeProgress(message, stderr)), diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 828557e..c204c21 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -113,11 +113,12 @@ test('formatClusterListLabel keeps counts first and adds a short cluster name', searchText: 'fix dedupe section', }); - assert.match(label, /3 items\s+C1507\s+3P\/0I\s+04-24 07:29\s+Fix: dedupe section/); + assert.match(label, /3 items\s+dedupe section title\/des\s+C1507\s+3P\/0I\s+04-24 07:29/); }); test('formatClusterShortName returns the first meaningful words', () => { - assert.equal(formatClusterShortName('[codex] fix agent session-id routing'), 'codex fix agent'); + assert.equal(formatClusterShortName('[codex] fix agent session-id routing'), 'agent session-id routing'); + assert.equal(formatClusterShortName('fix(agents): exclude volatile inbound metadata'), 'agents exclude volatile'); assert.equal(formatClusterShortName(''), 'untitled'); }); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index dda6c71..6258840 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1279,7 +1279,7 @@ export function formatClusterListLabel(cluster: TuiClusterSummary): string { const countLabel = `${cluster.totalCount} ${cluster.totalCount === 1 ? 'item' : 'items'}`.padStart(7); const mixLabel = `${cluster.pullRequestCount}P/${cluster.issueCount}I`.padStart(6); const updated = formatClusterDateColumn(cluster.latestUpdatedAt); - return `${countLabel} C${cluster.clusterId} ${mixLabel} ${updated} ${formatClusterShortName(cluster.displayTitle)}`; + return `${countLabel} ${formatClusterShortName(cluster.displayTitle).padEnd(24).slice(0, 24)} C${cluster.clusterId} ${mixLabel} ${updated}`; } export function formatClusterShortName(title: string, maxWords = 3): string { @@ -1287,11 +1287,28 @@ export function formatClusterShortName(title: string, maxWords = 3): string { .replace(/[\[\]{}()<>]/g, ' ') .split(/\s+/) .map((word) => word.trim()) - .filter(Boolean) + .map((word) => word.replace(/^[:/#-]+|[:/#-]+$/g, '')) + .filter((word) => word && !CLUSTER_SHORT_NAME_STOPWORDS.has(word.toLowerCase())) .slice(0, maxWords); return words.join(' ') || 'untitled'; } +const CLUSTER_SHORT_NAME_STOPWORDS = new Set([ + 'ai', + 'assisted', + 'bug', + 'chore', + 'codex', + 'docs', + 'feat', + 'feature', + 'fix', + 'issue', + 'pr', + 'refactor', + 'test', +]); + function formatActivityTimestamp(now: Date = new Date()): string { return now.toISOString().slice(11, 19); } diff --git a/packages/api-core/src/cluster/build.test.ts b/packages/api-core/src/cluster/build.test.ts index a6dd858..600cda8 100644 --- a/packages/api-core/src/cluster/build.test.ts +++ b/packages/api-core/src/cluster/build.test.ts @@ -1,7 +1,7 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { buildClusters } from './build.js'; +import { buildClusters, buildSizeBoundedClusters } from './build.js'; test('buildClusters groups connected components', () => { const clusters = buildClusters( @@ -16,3 +16,30 @@ test('buildClusters groups connected components', () => { assert.equal(clusters.length, 2); assert.deepEqual(clusters[0]?.members, [1, 2]); }); + +test('buildSizeBoundedClusters prevents weak chains from forming catch-all clusters', () => { + const nodes = Array.from({ length: 6 }, (_, index) => ({ + threadId: index + 1, + number: index + 10, + title: `thread ${index + 1}`, + })); + const clusters = buildSizeBoundedClusters( + nodes, + [ + { leftThreadId: 1, rightThreadId: 2, score: 0.95 }, + { leftThreadId: 2, rightThreadId: 3, score: 0.94 }, + { leftThreadId: 3, rightThreadId: 4, score: 0.82 }, + { leftThreadId: 4, rightThreadId: 5, score: 0.81 }, + { leftThreadId: 5, rightThreadId: 6, score: 0.8 }, + ], + { maxClusterSize: 3 }, + ); + + assert.deepEqual( + clusters.map((cluster) => cluster.members), + [ + [1, 2, 3], + [4, 5, 6], + ], + ); +}); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index cd6289c..81564ac 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -409,6 +409,7 @@ const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; const DEFAULT_CLUSTER_MIN_SCORE = 0.78; +const DEFAULT_CLUSTER_MAX_SIZE = 24; const VECTORLITE_CLUSTER_EXPANDED_K = 24; const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; @@ -1930,6 +1931,7 @@ export class GHCrawlService { repo: string; threadNumber?: number; minScore?: number; + maxClusterSize?: number; k?: number; onProgress?: (message: string) => void; }): Promise { @@ -1944,6 +1946,8 @@ export class GHCrawlService { JSON.stringify({ threadNumber: params.threadNumber ?? null, minScore: params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, + maxClusterSize: params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE, + clusterMode: 'size_bounded', k: params.k ?? 6, embedModel: this.config.embedModel, embeddingBasis: this.config.embeddingBasis, @@ -1951,6 +1955,7 @@ export class GHCrawlService { ), }); const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE; + const maxClusterSize = params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE; const k = params.k ?? 6; try { @@ -2070,9 +2075,10 @@ export class GHCrawlService { } } const clusterItems = seedThreadIds ? deterministicItems.filter((item) => involvedIds.has(item.id)) : deterministicItems; - const clusters = buildClusters( + const clusters = buildSizeBoundedClusters( clusterItems.map((item) => ({ threadId: item.id, number: item.number, title: item.title })), edges, + { maxClusterSize }, ); if (!seedThreadIds) { this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); From 5c0e877230a93cc5795e1aafdf6d2d267a8b210a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:47:24 -0700 Subject: [PATCH 070/215] fix(summary): speed up key summary enrichment --- packages/api-core/src/openai/provider.ts | 2 +- packages/api-core/src/service.ts | 102 +++++++++++++++++++---- 2 files changed, 89 insertions(+), 15 deletions(-) diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index db7204e..ab995da 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -121,7 +121,7 @@ export class OpenAiProvider implements AiProvider { const format = zodTextFormat(llmKeySummarySchema, 'ghcrawl_key_summary'); let lastError: Error | null = null; - for (const [attemptIndex, maxOutputTokens] of [240, 400, 600].entries()) { + for (const [attemptIndex, maxOutputTokens] of [600, 900, 1200].entries()) { try { const response = await this.client.responses.create({ model: params.model, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 81564ac..0d79b3d 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -167,6 +167,14 @@ type ActiveVectorTask = { wasTruncated: boolean; }; +type KeySummaryTask = { + threadId: number; + threadNumber: number; + revisionId: number; + inputHash: string; + text: string; +}; + type ActiveVectorRow = ThreadRow & { basis: EmbeddingBasis; model: string; @@ -405,6 +413,9 @@ const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]'; const EMBED_CONTEXT_RETRY_ATTEMPTS = 5; const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9; const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95; +const KEY_SUMMARY_MAX_BODY_CHARS = 6000; +const KEY_SUMMARY_CONCURRENCY = 8; +const KEY_SUMMARY_MAX_UNREAD = 16; const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; @@ -1696,6 +1707,7 @@ export class GHCrawlService { if (!ai.generateKeySummary) { throw new Error('Configured AI provider does not support key summary generation.'); } + const generateKeySummary = ai.generateKeySummary; const providerName = ai.providerName ?? 'custom'; const repository = this.requireRepository(params.owner, params.repo); const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); @@ -1734,12 +1746,18 @@ export class GHCrawlService { let outputTokens = 0; let totalTokens = 0; const errorSamples: Array<{ number: number; error: string }> = []; + const tasks: KeySummaryTask[] = []; for (const row of rows) { const labels = parseArray(row.labels_json); - const inputHash = llmKeyInputHash({ + const text = this.buildKeySummaryInputText({ title: row.title, + labels, body: row.body, + }); + const inputHash = llmKeyInputHash({ + title: row.title, + body: text, commentsText: null, diffText: null, }); @@ -1768,28 +1786,72 @@ export class GHCrawlService { continue; } - let result: Awaited>>; - try { - result = await ai.generateKeySummary({ - model: this.config.summaryModel, - text: [`title: ${row.title}`, `labels: ${labels.join(', ')}`, `body: ${row.body ?? ''}`].join('\n'), - }); - } catch (error) { + tasks.push({ + threadId: row.id, + threadNumber: row.number, + revisionId, + inputHash, + text, + }); + } + + params.onProgress?.( + `[key-summary] pending=${tasks.length} skipped=${skipped} concurrency=${KEY_SUMMARY_CONCURRENCY} max_body_chars=${KEY_SUMMARY_MAX_BODY_CHARS}`, + ); + + const mapper = new IterableMapper( + tasks, + async (task: KeySummaryTask) => { + try { + const result = await generateKeySummary({ + model: this.config.summaryModel, + text: task.text, + }); + return { task, result, error: null }; + } catch (error) { + return { + task, + result: null, + error: error instanceof Error ? error : new Error(String(error)), + }; + } + }, + { + concurrency: KEY_SUMMARY_CONCURRENCY, + maxUnread: KEY_SUMMARY_MAX_UNREAD, + }, + ); + + for await (const item of mapper) { + const { task } = item; + if (item.error) { failed += 1; - const message = error instanceof Error ? error.message : String(error); + const message = item.error.message; if (errorSamples.length < 10) { - errorSamples.push({ number: row.number, error: message }); + errorSamples.push({ number: task.threadNumber, error: message }); } - params.onProgress?.(`[key-summary] failed thread #${row.number}: ${message}`); + params.onProgress?.(`[key-summary] failed thread #${task.threadNumber}: ${message}`); continue; } + + const result = item.result; + if (!result) { + failed += 1; + const message = 'AI provider returned no key summary result'; + if (errorSamples.length < 10) { + errorSamples.push({ number: task.threadNumber, error: message }); + } + params.onProgress?.(`[key-summary] failed thread #${task.threadNumber}: ${message}`); + continue; + } + upsertThreadKeySummary(this.db, { - threadRevisionId: revisionId, + threadRevisionId: task.revisionId, summaryKind: 'llm_key_3line', promptVersion: LLM_KEY_SUMMARY_PROMPT_VERSION, provider: providerName, model: this.config.summaryModel, - inputHash, + inputHash: task.inputHash, summary: result.summary, }); generated += 1; @@ -1798,7 +1860,10 @@ export class GHCrawlService { outputTokens += result.usage.outputTokens; totalTokens += result.usage.totalTokens; } - params.onProgress?.(`[key-summary] generated ${generated}/${rows.length} thread #${row.number}`); + const completed = generated + failed; + params.onProgress?.( + `[key-summary] generated ${generated}/${tasks.length} failed=${failed} completed=${completed}/${tasks.length} thread #${task.threadNumber}`, + ); } const payload = { runId, generated, skipped, failed, inputTokens, outputTokens, totalTokens, errorSamples }; @@ -1810,6 +1875,15 @@ export class GHCrawlService { } } + private buildKeySummaryInputText(params: { title: string; labels: string[]; body: string | null }): string { + const body = normalizeSummaryText(params.body ?? ''); + const truncatedBody = + body.length > KEY_SUMMARY_MAX_BODY_CHARS + ? `${body.slice(0, KEY_SUMMARY_MAX_BODY_CHARS)}\n\n[truncated for key summary]` + : body; + return [`title: ${params.title}`, `labels: ${params.labels.join(', ')}`, `body: ${truncatedBody}`].join('\n'); + } + purgeComments(params: { owner: string; repo: string; From 52990e05649610f666c4a3a86d2489fa5be6ade4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:48:11 -0700 Subject: [PATCH 071/215] fix(summary): preserve provider context in key summaries --- packages/api-core/src/service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 0d79b3d..8c59fc3 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1707,7 +1707,7 @@ export class GHCrawlService { if (!ai.generateKeySummary) { throw new Error('Configured AI provider does not support key summary generation.'); } - const generateKeySummary = ai.generateKeySummary; + const generateKeySummary = ai.generateKeySummary.bind(ai); const providerName = ai.providerName ?? 'custom'; const repository = this.requireRepository(params.owner, params.repo); const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); From 04c89aaf27d906c909a4bdec4321cc0b7ab11d62 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 09:52:39 -0700 Subject: [PATCH 072/215] fix(summary): increase key summary concurrency --- packages/api-core/src/service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 8c59fc3..4b76316 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -414,8 +414,8 @@ const EMBED_CONTEXT_RETRY_ATTEMPTS = 5; const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9; const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95; const KEY_SUMMARY_MAX_BODY_CHARS = 6000; -const KEY_SUMMARY_CONCURRENCY = 8; -const KEY_SUMMARY_MAX_UNREAD = 16; +const KEY_SUMMARY_CONCURRENCY = 24; +const KEY_SUMMARY_MAX_UNREAD = 48; const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; From ae53492efe78e78eb9dc7a5b093b277557d9f674 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 10:24:44 -0700 Subject: [PATCH 073/215] fix(openai): minimize reasoning for summaries --- packages/api-core/src/openai/provider.ts | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index ab995da..bef9157 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -83,10 +83,16 @@ export class OpenAiProvider implements AiProvider { format, verbosity: 'low', }, + reasoning: { + effort: 'minimal', + }, max_output_tokens: maxOutputTokens, }); const raw = response.output_text ?? ''; + if (!raw.trim()) { + throw new Error(`empty structured output${response.incomplete_details?.reason ? ` (${response.incomplete_details.reason})` : ''}`); + } const parsed = summarySchema.parse(JSON.parse(raw)); return { @@ -139,10 +145,16 @@ export class OpenAiProvider implements AiProvider { format, verbosity: 'low', }, + reasoning: { + effort: 'minimal', + }, max_output_tokens: maxOutputTokens, }); const raw = response.output_text ?? ''; + if (!raw.trim()) { + throw new Error(`empty structured output${response.incomplete_details?.reason ? ` (${response.incomplete_details.reason})` : ''}`); + } return { summary: llmKeySummarySchema.parse(JSON.parse(raw)), usage: response.usage From 04d8cf31c64792d19b810dabea501a8621914163 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 10:51:53 -0700 Subject: [PATCH 074/215] fix(cluster): show dictionary cluster names --- .../api-core/src/cluster/governance.test.ts | 2 +- .../api-core/src/cluster/human-key.test.ts | 3 +- packages/api-core/src/cluster/human-key.ts | 45 ++++++++++++----- packages/api-core/src/service.test.ts | 4 +- packages/api-core/src/service.ts | 50 ++++++++++++------- 5 files changed, 70 insertions(+), 34 deletions(-) diff --git a/packages/api-core/src/cluster/governance.test.ts b/packages/api-core/src/cluster/governance.test.ts index ae1868b..73f63ea 100644 --- a/packages/api-core/src/cluster/governance.test.ts +++ b/packages/api-core/src/cluster/governance.test.ts @@ -19,7 +19,7 @@ test('applyClusterGovernance creates a stable cluster for new evidence', () => { }); assert.equal(result.clusters.length, 1); - assert.match(result.clusters[0].stableSlug, /^[a-z]+-[a-z]+-[a-z]+-[a-z0-9]{4}$/); + assert.match(result.clusters[0].stableSlug, /^[a-z]+-[a-z]+-[a-z]+$/); assert.deepEqual(result.clusters[0].memberThreadIds, [10, 11]); assert.equal(result.events[0].eventType, 'create_cluster'); }); diff --git a/packages/api-core/src/cluster/human-key.test.ts b/packages/api-core/src/cluster/human-key.test.ts index bde1328..fae857d 100644 --- a/packages/api-core/src/cluster/human-key.test.ts +++ b/packages/api-core/src/cluster/human-key.test.ts @@ -10,7 +10,8 @@ test('humanKeyForValue returns a stable operator slug and machine hash', () => { assert.equal(first.hash, second.hash); assert.equal(first.slug, second.slug); assert.match(first.hash, /^[a-f0-9]{64}$/); - assert.match(first.slug, /^[a-z]+-[a-z]+-[a-z]+-[a-z0-9]{4}$/); + assert.match(first.slug, /^[a-z]+-[a-z]+-[a-z]+$/); + assert.match(first.checksum, /^[a-z0-9]{4}$/); }); test('humanKeyFromHash rejects non-SHA256 input', () => { diff --git a/packages/api-core/src/cluster/human-key.ts b/packages/api-core/src/cluster/human-key.ts index 551d9bb..a2513d4 100644 --- a/packages/api-core/src/cluster/human-key.ts +++ b/packages/api-core/src/cluster/human-key.ts @@ -1,18 +1,37 @@ import crypto from 'node:crypto'; const WORDS = [ - 'anchor', 'apex', 'atlas', 'beacon', 'binary', 'bridge', 'cable', 'canvas', - 'cipher', 'clear', 'cloud', 'cobalt', 'comet', 'copper', 'delta', 'drift', - 'ember', 'engine', 'falcon', 'fiber', 'field', 'filter', 'focus', 'forge', - 'frame', 'garden', 'glide', 'harbor', 'helix', 'hollow', 'index', 'island', - 'kernel', 'keystone', 'lantern', 'lattice', 'ledger', 'level', 'maple', 'matrix', - 'meadow', 'merge', 'mirror', 'module', 'needle', 'noble', 'nova', 'orbit', - 'origin', 'parcel', 'patch', 'pillar', 'pixel', 'plume', 'portal', 'pulse', - 'quartz', 'quiet', 'radar', 'raven', 'relay', 'render', 'ripple', 'river', - 'signal', 'silver', 'sketch', 'socket', 'solar', 'span', 'spiral', 'spring', - 'stable', 'stone', 'summit', 'switch', 'thread', 'timber', 'token', 'trace', - 'union', 'vector', 'velvet', 'vertex', 'vessel', 'violet', 'vista', 'wave', - 'willow', 'window', 'yellow', 'zenith', + 'able', 'acid', 'acre', 'actor', 'acute', 'admin', 'aisle', 'album', + 'alert', 'alias', 'amber', 'angle', 'apple', 'apron', 'array', 'asset', + 'atlas', 'audio', 'badge', 'basic', 'batch', 'beach', 'beacon', 'bench', + 'binary', 'block', 'bonus', 'border', 'branch', 'bridge', 'brief', 'buffer', + 'build', 'bundle', 'cable', 'cache', 'canal', 'canvas', 'carbon', 'cargo', + 'cedar', 'center', 'chance', 'change', 'charge', 'chart', 'cipher', 'circle', + 'civic', 'clear', 'client', 'cloud', 'cobalt', 'column', 'comet', 'common', + 'copper', 'corner', 'course', 'credit', 'crisp', 'cycle', 'daily', 'data', + 'delta', 'detail', 'device', 'domain', 'draft', 'drift', 'driver', 'early', + 'earth', 'echo', 'edge', 'ember', 'engine', 'entry', 'error', 'event', + 'fabric', 'factor', 'field', 'filter', 'final', 'focus', 'forge', 'format', + 'frame', 'fresh', 'future', 'garden', 'gentle', 'glide', 'golden', 'graph', + 'grid', 'group', 'harbor', 'header', 'helix', 'hidden', 'hollow', 'honest', + 'icon', 'index', 'input', 'island', 'kernel', 'key', 'keystone', 'label', + 'lantern', 'laser', 'latest', 'lattice', 'layer', 'ledger', 'level', 'light', + 'limit', 'linear', 'local', 'logic', 'major', 'maple', 'margin', 'matrix', + 'meadow', 'medium', 'memory', 'merge', 'method', 'mirror', 'mobile', 'module', + 'motion', 'native', 'needle', 'noble', 'normal', 'notion', 'nova', 'number', + 'object', 'ocean', 'offset', 'olive', 'online', 'option', 'orbit', 'origin', + 'output', 'packet', 'panel', 'parcel', 'patch', 'pattern', 'phase', 'pillar', + 'pixel', 'plain', 'planet', 'plume', 'point', 'portal', 'prime', 'profile', + 'prompt', 'proper', 'public', 'pulse', 'query', 'quartz', 'quiet', 'radar', + 'range', 'rapid', 'record', 'region', 'relay', 'render', 'reply', 'report', + 'result', 'ripple', 'river', 'route', 'sample', 'schema', 'screen', 'script', + 'search', 'second', 'section', 'secure', 'select', 'shadow', 'signal', 'silver', + 'simple', 'single', 'sketch', 'socket', 'solar', 'source', 'space', 'span', + 'spiral', 'spring', 'stable', 'static', 'status', 'steady', 'stone', 'stream', + 'strict', 'studio', 'subtle', 'summit', 'switch', 'system', 'table', 'target', + 'thread', 'timber', 'token', 'trace', 'transit', 'union', 'update', 'usage', + 'valid', 'vector', 'velvet', 'vertex', 'vessel', 'view', 'violet', 'virtual', + 'vista', 'visual', 'volume', 'wave', 'window', 'yellow', 'zenith', 'zero', ] as const; export type HumanKey = { @@ -35,7 +54,7 @@ export function humanKeyFromHash(hash: string): HumanKey { const checksum = Number.parseInt(normalized.slice(6, 12), 16).toString(36).padStart(4, '0').slice(-4); return { hash: normalized, - slug: `${WORDS[indexes[0]]}-${WORDS[indexes[1]]}-${WORDS[indexes[2]]}-${checksum}`, + slug: `${WORDS[indexes[0]]}-${WORDS[indexes[1]]}-${WORDS[indexes[2]]}`, checksum, }; } diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 3681820..3fb571e 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3089,7 +3089,7 @@ test('tui snapshot returns mixed issue and pull request counts with default visi ); assert.equal(allSnapshot.clusters[0].issueCount, 2); assert.equal(allSnapshot.clusters[0].pullRequestCount, 1); - assert.equal(allSnapshot.clusters[0].displayTitle, 'Recent issue cluster'); + assert.match(allSnapshot.clusters[0]?.displayTitle ?? '', /^[a-z]+-[a-z]+-[a-z]+ Recent issue cluster$/); const sizeSorted = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0, sort: 'size' }); assert.deepEqual( @@ -3512,7 +3512,7 @@ test('agent cluster summary and detail dumps expose repo stats, snippets, and su const summaries = service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); assert.equal(summaries.stats.openIssueCount, 1); assert.equal(summaries.clusters.length, 1); - assert.equal(summaries.clusters[0]?.displayTitle, 'Downloader hangs'); + assert.match(summaries.clusters[0]?.displayTitle ?? '', /^[a-z]+-[a-z]+-[a-z]+ Downloader hangs$/); const detail = service.getClusterDetailDump({ owner: 'openclaw', diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 4b76316..8d15f17 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3996,21 +3996,24 @@ export class GHCrawlService { search_text: string | null; }>; - return rows.map((row) => ({ - clusterId: row.cluster_id, - displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`, - isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count, - closedAtLocal: row.closed_at_local, - closeReasonLocal: row.close_reason_local, - totalCount: row.member_count, - issueCount: row.issue_count, - pullRequestCount: row.pull_request_count, - latestUpdatedAt: row.latest_updated_at, - representativeThreadId: row.representative_thread_id, - representativeNumber: row.representative_number, - representativeKind: row.representative_kind, - searchText: `${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), - })); + return rows.map((row) => { + const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + return { + clusterId: row.cluster_id, + displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), + isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count, + closedAtLocal: row.closed_at_local, + closeReasonLocal: row.close_reason_local, + totalCount: row.member_count, + issueCount: row.issue_count, + pullRequestCount: row.pull_request_count, + latestUpdatedAt: row.latest_updated_at, + representativeThreadId: row.representative_thread_id, + representativeNumber: row.representative_number, + representativeKind: row.representative_kind, + searchText: `${clusterName} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), + }; + }); } private getRawTuiClusterSummary(repoId: number, clusterRunId: number, clusterId: number): TuiClusterSummary | null { @@ -4067,9 +4070,10 @@ export class GHCrawlService { return null; } + const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); return { clusterId: row.cluster_id, - displayTitle: row.representative_title ?? `Cluster ${row.cluster_id}`, + displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count, closedAtLocal: row.closed_at_local, closeReasonLocal: row.close_reason_local, @@ -4080,10 +4084,22 @@ export class GHCrawlService { representativeThreadId: row.representative_thread_id, representativeNumber: row.representative_number, representativeKind: row.representative_kind, - searchText: `${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), + searchText: `${clusterName} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), }; } + private clusterHumanName(repoId: number, representativeThreadId: number | null, clusterId: number): string { + return humanKeyForValue( + representativeThreadId === null + ? `repo:${repoId}:cluster:${clusterId}` + : `repo:${repoId}:cluster-representative:${representativeThreadId}`, + ).slug; + } + + private clusterDisplayTitle(clusterName: string, representativeTitle: string | null, clusterId: number): string { + return `${clusterName} ${representativeTitle ?? `Cluster ${clusterId}`}`; + } + private compareTuiClusterSummary(left: TuiClusterSummary, right: TuiClusterSummary, sort: TuiClusterSortMode): number { const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0; const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0; From 849163d87b689300dd502caf1309e67015c65c72 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 10:53:44 -0700 Subject: [PATCH 075/215] fix(cluster): tighten issue pr edges --- packages/api-core/src/service.test.ts | 60 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 58 ++++++++++++++++++++++---- 2 files changed, 111 insertions(+), 7 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 3fb571e..daddd05 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -1866,6 +1866,66 @@ test('clusterRepository merges source kinds into one edge without directional du } }); +test('clusterRepository drops weak issue/pr semantic edges', async () => { + const service = makeTestService({ + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Cache invalidation fails', 'Cache entries remain stale.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'pull_request', 'open', 'Worker cleanup', 'Moves worker code.', 'bob', 'User', 'https://github.com/openclaw/openclaw/pull/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Cache invalidation stale entries', 'Stale cache entries are not removed.', 'carol', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); + + const insertEmbedding = service.db.prepare( + `insert into document_embeddings (thread_id, source_kind, model, dimensions, content_hash, embedding_json, created_at, updated_at) + values (?, ?, ?, ?, ?, ?, ?, ?)`, + ); + for (const sourceKind of ['title', 'body', 'dedupe_summary'] as const) { + insertEmbedding.run(10, sourceKind, 'text-embedding-3-large', 2, `hash-42-${sourceKind}`, '[1,0]', now, now); + insertEmbedding.run(11, sourceKind, 'text-embedding-3-large', 2, `hash-43-${sourceKind}`, '[0.8,-0.6]', now, now); + insertEmbedding.run(12, sourceKind, 'text-embedding-3-large', 2, `hash-44-${sourceKind}`, '[0.83,0.56]', now, now); + } + + const result = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + k: 2, + minScore: 0.78, + }); + + const edges = service.db.prepare( + 'select left_thread_id, right_thread_id from similarity_edges where cluster_run_id = ? order by left_thread_id, right_thread_id', + ).all(result.runId) as Array<{ left_thread_id: number; right_thread_id: number }>; + + assert.deepEqual(edges, [{ left_thread_id: 10, right_thread_id: 12 }]); + assert.equal(result.edges, 1); + } finally { + service.close(); + } +}); + test('clusterRepository prunes older cluster runs for the repo after a successful rebuild', async () => { const service = makeTestService({ getRepo: async () => ({}), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 8d15f17..5c6dff4 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -141,6 +141,12 @@ type CommentSeed = { type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary' | 'llm_key_summary'; type SimilaritySourceKind = EmbeddingSourceKind | 'deterministic_fingerprint'; +type AggregatedClusterEdge = { + leftThreadId: number; + rightThreadId: number; + score: number; + sourceKinds: Set; +}; type EmbeddingTask = { threadId: number; @@ -420,6 +426,7 @@ const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; const DEFAULT_CLUSTER_MIN_SCORE = 0.78; +const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.88; const DEFAULT_CLUSTER_MAX_SIZE = 24; const VECTORLITE_CLUSTER_EXPANDED_K = 24; const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; @@ -2022,6 +2029,7 @@ export class GHCrawlService { minScore: params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, maxClusterSize: params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE, clusterMode: 'size_bounded', + crossKindMinScore: Math.max(params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE), k: params.k ?? 6, embedModel: this.config.embedModel, embeddingBasis: this.config.embeddingBasis, @@ -2029,6 +2037,7 @@ export class GHCrawlService { ), }); const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE; + const crossKindMinScore = Math.max(minScore, DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE); const maxClusterSize = params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE; const k = params.k ?? 6; @@ -2059,7 +2068,7 @@ export class GHCrawlService { persistedFingerprints, { topK: Math.max(k * 8, 64), seedThreadIds }, ); - const aggregatedEdges = new Map }>(); + const aggregatedEdges = new Map(); this.mergeSourceKindEdges( aggregatedEdges, deterministic.edges.filter((edge) => edge.score >= minScore), @@ -2132,6 +2141,14 @@ export class GHCrawlService { } } + const threadKinds = new Map(deterministicItems.map((item) => [item.id, item.kind])); + const droppedCrossKindEdges = this.pruneWeakCrossKindEdges(aggregatedEdges, threadKinds, crossKindMinScore); + if (droppedCrossKindEdges > 0) { + params.onProgress?.( + `[cluster] dropped ${droppedCrossKindEdges} weak issue/pr edge(s) below cross_kind_min_score=${crossKindMinScore}`, + ); + } + const edges = Array.from(aggregatedEdges.values()).map((entry) => ({ leftThreadId: entry.leftThreadId, rightThreadId: entry.rightThreadId, @@ -2172,7 +2189,13 @@ export class GHCrawlService { : `[cluster] persisted ${clusters.length} cluster(s) and pruned older cluster runs`, ); - const stats = { edges: edges.length, clusters: clusters.length, threadNumber: params.threadNumber ?? null }; + const stats = { + edges: edges.length, + clusters: clusters.length, + threadNumber: params.threadNumber ?? null, + droppedCrossKindEdges, + crossKindMinScore, + }; this.finishRun('cluster_runs', runId, 'completed', stats); finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats }); return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length }); @@ -5565,8 +5588,8 @@ export class GHCrawlService { repoId: number, sourceKinds: EmbeddingSourceKind[], params: { limit: number; minScore: number; onProgress?: (message: string) => void }, - ): Promise }>> { - const aggregated = new Map }>(); + ): Promise> { + const aggregated = new Map(); const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repoId, sourceKind), 0); if (sourceKinds.length === 0 || totalItems === 0) { @@ -5662,7 +5685,7 @@ export class GHCrawlService { } private mergeSourceKindEdges( - aggregated: Map }>, + aggregated: Map, edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, sourceKind: SimilaritySourceKind, ): void { @@ -5683,6 +5706,27 @@ export class GHCrawlService { } } + private pruneWeakCrossKindEdges( + aggregated: Map, + threadKinds: Map, + crossKindMinScore: number, + ): number { + let dropped = 0; + for (const [key, edge] of aggregated) { + const leftKind = threadKinds.get(edge.leftThreadId); + const rightKind = threadKinds.get(edge.rightThreadId); + if (!leftKind || !rightKind || leftKind === rightKind) { + continue; + } + if (edge.sourceKinds.has('deterministic_fingerprint') || edge.score >= crossKindMinScore) { + continue; + } + aggregated.delete(key); + dropped += 1; + } + return dropped; + } + private collectSourceKindScores( perSourceScores: Map }>, edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, @@ -5794,7 +5838,7 @@ export class GHCrawlService { private persistClusterRun( repoId: number, runId: number, - aggregatedEdges: Map }>, + aggregatedEdges: Map, clusters: Array<{ representativeThreadId: number; members: number[] }>, ): void { const insertEdge = this.db.prepare( @@ -5848,7 +5892,7 @@ export class GHCrawlService { private persistDurableClusterState( repoId: number, pipelineRunId: number, - aggregatedEdges: Map }>, + aggregatedEdges: Map, clusters: Array<{ representativeThreadId: number; members: number[] }>, ): void { this.db.transaction(() => { From 2c155ec73a8edad0aee4d9a516291294de20bc2b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 11:40:40 -0700 Subject: [PATCH 076/215] fix(sync): avoid pull fetches for metadata sync --- packages/api-core/src/service.test.ts | 19 ++----------------- packages/api-core/src/service.ts | 3 ++- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index daddd05..e6bdb00 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4225,23 +4225,8 @@ test('syncRepository treats missing stale pull requests as closed and continues' getIssue: async () => { throw new Error('not expected'); }, - getPull: async (_owner, _repo, number) => { + getPull: async () => { getPullCalls += 1; - if (getPullCalls === 1) { - return { - id: 101, - number, - state: 'open', - title: 'Fix downloader hang', - body: 'Implements a fix.', - html_url: `https://github.com/openclaw/openclaw/pull/${number}`, - labels: [{ name: 'bug' }], - assignees: [], - user: { login: 'bob', type: 'User' }, - draft: false, - updated_at: '2026-03-09T00:00:00Z', - }; - } throw Object.assign(new Error('GitHub request failed for GET /repos/openclaw/openclaw/pulls/43: Not Found'), { status: 404, }); @@ -4275,7 +4260,7 @@ test('syncRepository treats missing stale pull requests as closed and continues' assert.equal(after.state, 'closed'); assert.ok(after.closed_at_gh); assert.ok(after.last_pulled_at); - assert.equal(getPullCalls, 2); + assert.equal(getPullCalls, 1); assert.match(messages.join('\n'), /missing on GitHub; marking it closed locally and continuing/); } finally { service.close(); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 5c6dff4..40ab765 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1445,7 +1445,8 @@ export class GHCrawlService { const kind = isPr ? 'pull_request' : 'issue'; params.onProgress?.(`[sync] ${index + 1}/${items.length} ${kind} #${number}`); try { - const threadPayload = isPr ? await github.getPull(params.owner, params.repo, number, reporter) : item; + const shouldFetchPullPayload = isPr && includeCode; + const threadPayload = shouldFetchPullPayload ? await github.getPull(params.owner, params.repo, number, reporter) : item; const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt); if (includeCode && isPr) { const files = await github.listPullFiles(params.owner, params.repo, number, reporter); From 172f2b8847fd8b29cc244c1f5b062f5312116cd9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 11:42:07 -0700 Subject: [PATCH 077/215] fix(sync): speed metadata crawls --- packages/api-core/src/github/client.ts | 2 +- packages/api-core/src/service.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/github/client.ts b/packages/api-core/src/github/client.ts index 2eeb073..9e46a53 100644 --- a/packages/api-core/src/github/client.ts +++ b/packages/api-core/src/github/client.ts @@ -74,7 +74,7 @@ function formatResetTime(resetSeconds: string | null | undefined): string | null export function makeGitHubClient(options: RequestOptions): GitHubClient { const userAgent = options.userAgent ?? 'ghcrawl'; const timeoutMs = options.timeoutMs ?? 30_000; - const pageDelayMs = options.pageDelayMs ?? 5000; + const pageDelayMs = options.pageDelayMs ?? 250; const BaseOctokit = Octokit.plugin(retry, throttling); function createOctokit(reporter?: GitHubReporter) { diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 40ab765..24c0991 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1436,7 +1436,7 @@ export class GHCrawlService { const fingerprintThreadIds: number[] = []; for (const [index, item] of items.entries()) { - if (index > 0 && index % SYNC_BATCH_SIZE === 0) { + if ((includeComments || includeCode) && index > 0 && index % SYNC_BATCH_SIZE === 0) { params.onProgress?.(`[sync] batch boundary reached at ${index} threads; sleeping 5s before continuing`); await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS)); } From 6b31fbb4e3a430ba09970c968a3fb63aa8768b77 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 12:33:32 -0700 Subject: [PATCH 078/215] fix(openai): clamp key summary fields --- .../src/cluster/llm-key-summary.test.ts | 19 +++++++++++++--- .../api-core/src/cluster/llm-key-summary.ts | 22 +++++++++++++++---- packages/api-core/src/openai/provider.ts | 9 ++++++-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/packages/api-core/src/cluster/llm-key-summary.test.ts b/packages/api-core/src/cluster/llm-key-summary.test.ts index bec136c..1ec133f 100644 --- a/packages/api-core/src/cluster/llm-key-summary.test.ts +++ b/packages/api-core/src/cluster/llm-key-summary.test.ts @@ -27,18 +27,31 @@ test('parseLlmKeySummary accepts the strict 3-line contract', () => { ); }); -test('parseLlmKeySummary rejects missing or oversized fields', () => { +test('parseLlmKeySummary rejects missing fields', () => { assert.throws( () => parseLlmKeySummary({ - intent: 'x'.repeat(121), + intent: '', surface: 'CLI', mechanism: 'Patch retry loop.', }), - /Too big/, + /Too small/, ); }); +test('parseLlmKeySummary clamps oversized fields deterministically', () => { + const summary = parseLlmKeySummary({ + intent: 'x'.repeat(140), + surface: 'y'.repeat(140), + mechanism: 'z'.repeat(180), + }); + + assert.equal(summary.intent.length, 120); + assert.equal(summary.surface.length, 120); + assert.equal(summary.mechanism.length, 160); + assert.equal(summary.intent.at(-1), '.'); +}); + test('llmKeyInputHash is deterministic and prompt-version scoped', () => { const first = llmKeyInputHash({ title: 'Fix retry', body: 'Retry forever' }); const second = llmKeyInputHash({ title: 'Fix retry', body: 'Retry forever' }); diff --git a/packages/api-core/src/cluster/llm-key-summary.ts b/packages/api-core/src/cluster/llm-key-summary.ts index a7b5f78..1084db1 100644 --- a/packages/api-core/src/cluster/llm-key-summary.ts +++ b/packages/api-core/src/cluster/llm-key-summary.ts @@ -12,15 +12,29 @@ mechanism: one sentence, max 160 chars, cause or implementation approach. Use concrete nouns from the input. Do not mention uncertainty. Do not add advice.`; export const llmKeySummarySchema = z.object({ - intent: z.string().trim().min(1).max(120), - surface: z.string().trim().min(1).max(120), - mechanism: z.string().trim().min(1).max(160), + intent: z.string().trim().min(1), + surface: z.string().trim().min(1), + mechanism: z.string().trim().min(1), }); export type LlmKeySummary = z.infer; export function parseLlmKeySummary(value: unknown): LlmKeySummary { - return llmKeySummarySchema.parse(value); + const summary = llmKeySummarySchema.parse(value); + return { + intent: clampSentence(summary.intent, 120), + surface: clampSentence(summary.surface, 120), + mechanism: clampSentence(summary.mechanism, 160), + }; +} + +function clampSentence(value: string, maxLength: number): string { + const normalized = value.replace(/\s+/g, ' ').trim(); + if (normalized.length <= maxLength) { + return normalized; + } + + return normalized.slice(0, maxLength - 1).trimEnd() + '.'; } export function llmKeyEmbeddingText(summary: LlmKeySummary): string { diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index bef9157..daf9904 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -3,7 +3,12 @@ import { APIConnectionError, APIConnectionTimeoutError, APIError, RateLimitError import { zodTextFormat } from 'openai/helpers/zod'; import { z } from 'zod'; -import { LLM_KEY_SUMMARY_SYSTEM_PROMPT, llmKeySummarySchema, type LlmKeySummary } from '../cluster/llm-key-summary.js'; +import { + LLM_KEY_SUMMARY_SYSTEM_PROMPT, + llmKeySummarySchema, + parseLlmKeySummary, + type LlmKeySummary, +} from '../cluster/llm-key-summary.js'; export type SummaryResult = { problemSummary: string; @@ -156,7 +161,7 @@ export class OpenAiProvider implements AiProvider { throw new Error(`empty structured output${response.incomplete_details?.reason ? ` (${response.incomplete_details.reason})` : ''}`); } return { - summary: llmKeySummarySchema.parse(JSON.parse(raw)), + summary: parseLlmKeySummary(JSON.parse(raw)), usage: response.usage ? { inputTokens: response.usage.input_tokens, From 48dc9210a9aba082c2d42458ec33a619996accb3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 12:38:52 -0700 Subject: [PATCH 079/215] fix(cluster): disambiguate durable slugs --- packages/api-core/src/cluster/human-key.test.ts | 3 ++- packages/api-core/src/cluster/human-key.ts | 4 ++++ packages/api-core/src/service.ts | 6 +++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/cluster/human-key.test.ts b/packages/api-core/src/cluster/human-key.test.ts index fae857d..3c370a5 100644 --- a/packages/api-core/src/cluster/human-key.test.ts +++ b/packages/api-core/src/cluster/human-key.test.ts @@ -1,7 +1,7 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { humanKeyForValue, humanKeyFromHash, stableHash } from './human-key.js'; +import { humanKeyForValue, humanKeyFromHash, humanKeyStableSlug, stableHash } from './human-key.js'; test('humanKeyForValue returns a stable operator slug and machine hash', () => { const first = humanKeyForValue('repo:openclaw/openclaw thread:42 title:download stalls'); @@ -12,6 +12,7 @@ test('humanKeyForValue returns a stable operator slug and machine hash', () => { assert.match(first.hash, /^[a-f0-9]{64}$/); assert.match(first.slug, /^[a-z]+-[a-z]+-[a-z]+$/); assert.match(first.checksum, /^[a-z0-9]{4}$/); + assert.match(humanKeyStableSlug(first), /^[a-z]+-[a-z]+-[a-z]+-[a-z0-9]{4}$/); }); test('humanKeyFromHash rejects non-SHA256 input', () => { diff --git a/packages/api-core/src/cluster/human-key.ts b/packages/api-core/src/cluster/human-key.ts index a2513d4..d88300b 100644 --- a/packages/api-core/src/cluster/human-key.ts +++ b/packages/api-core/src/cluster/human-key.ts @@ -62,3 +62,7 @@ export function humanKeyFromHash(hash: string): HumanKey { export function humanKeyForValue(value: string): HumanKey { return humanKeyFromHash(stableHash(value)); } + +export function humanKeyStableSlug(key: HumanKey): string { + return `${key.slug}-${key.checksum}`; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 24c0991..1797567 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -68,7 +68,7 @@ import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from '. import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; -import { humanKeyForValue } from './cluster/human-key.js'; +import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { createPipelineRun, @@ -1277,7 +1277,7 @@ export class GHCrawlService { newClusterId = upsertClusterGroup(this.db, { repoId: repository.id, stableKey: identity.hash, - stableSlug: identity.slug, + stableSlug: humanKeyStableSlug(identity), status: 'active', clusterType: 'duplicate_candidate', representativeThreadId: selectedCanonical.thread_id, @@ -5920,7 +5920,7 @@ export class GHCrawlService { const clusterId = upsertClusterGroup(this.db, { repoId, stableKey: identity.hash, - stableSlug: identity.slug, + stableSlug: humanKeyStableSlug(identity), status: 'active', clusterType: cluster.members.length > 1 ? 'duplicate_candidate' : 'singleton_orphan', representativeThreadId: cluster.representativeThreadId, From b9d159607d309be6b39d23f75e3bccb5e187e844 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 12:46:21 -0700 Subject: [PATCH 080/215] fix(cluster): weight file and key evidence --- .../api-core/src/cluster/persistent-store.ts | 1 + .../src/cluster/thread-fingerprint.ts | 25 ++++++++++- packages/api-core/src/service.ts | 45 ++++++++++++++----- 3 files changed, 60 insertions(+), 11 deletions(-) diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 0e1317a..0144dd3 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -123,6 +123,7 @@ export function upsertThreadFingerprint( changedFiles: params.fingerprint.changedFiles, hunkSignatures: params.fingerprint.hunkSignatures, patchIds: params.fingerprint.patchIds, + featureHash: params.fingerprint.featureHash, }); db.prepare( `insert into thread_fingerprints ( diff --git a/packages/api-core/src/cluster/thread-fingerprint.ts b/packages/api-core/src/cluster/thread-fingerprint.ts index ded7ab3..5a58e88 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.ts @@ -1,5 +1,5 @@ import { buildShingles, jaccard, minhashSignature, minhashSimilarity, simhash64, simhashSimilarity, winnowingFingerprints } from './fingerprint-algorithms.js'; -import { humanKeyForValue } from './human-key.js'; +import { humanKeyForValue, stableHash } from './human-key.js'; const TOKEN_RE = /[a-zA-Z0-9_]+/g; const TITLE_STOPWORDS = new Set([ @@ -54,6 +54,7 @@ export type DeterministicThreadFingerprint = { changedFiles: string[]; hunkSignatures: string[]; patchIds: string[]; + featureHash: string; minhashSignature: string[]; simhash64: string; winnowHashes: string[]; @@ -85,6 +86,26 @@ export function moduleBucket(path: string, depth = 2): string { return `${parts.slice(0, depth).join('/')}/*`; } +export function fingerprintFeatureHash(input: { + linkedRefs: string[]; + changedFiles: string[]; + moduleBuckets?: string[]; + hunkSignatures: string[]; + patchIds: string[]; +}): string { + const changedFiles = uniqueSorted(input.changedFiles); + const moduleBuckets = uniqueSorted(input.moduleBuckets ?? changedFiles.map((path) => moduleBucket(path))); + return stableHash( + JSON.stringify({ + linkedRefs: uniqueSorted(input.linkedRefs), + changedFiles, + moduleBuckets, + hunkSignatures: uniqueSorted(input.hunkSignatures), + patchIds: uniqueSorted(input.patchIds), + }), + ); +} + function uniqueSorted(values: string[]): string[] { return Array.from(new Set(values.filter(Boolean))).sort(); } @@ -107,6 +128,7 @@ export function buildDeterministicThreadFingerprint(input: FingerprintInput): De const patchIds = uniqueSorted(input.patchIds ?? []); const moduleBuckets = uniqueSorted(changedFiles.map((path) => moduleBucket(path))); const salientTitleTokens = uniqueSorted(titleTokens.filter((token) => token.length >= 4 && !TITLE_STOPWORDS.has(token))); + const featureHash = fingerprintFeatureHash({ linkedRefs, changedFiles, moduleBuckets, hunkSignatures, patchIds }); const materialTokens = [ ...titleTokens, ...bodyTokens, @@ -146,6 +168,7 @@ export function buildDeterministicThreadFingerprint(input: FingerprintInput): De changedFiles, hunkSignatures, patchIds, + featureHash, minhashSignature: minhash, simhash64: simhash, winnowHashes: winnow, diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 1797567..48f6e1b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -84,6 +84,7 @@ import { } from './cluster/persistent-store.js'; import { buildDeterministicThreadFingerprint, + fingerprintFeatureHash, THREAD_FINGERPRINT_ALGORITHM_VERSION, type DeterministicThreadFingerprint, } from './cluster/thread-fingerprint.js'; @@ -425,9 +426,10 @@ const KEY_SUMMARY_MAX_UNREAD = 48; const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; -const DEFAULT_CLUSTER_MIN_SCORE = 0.78; -const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.88; -const DEFAULT_CLUSTER_MAX_SIZE = 24; +const DEFAULT_CLUSTER_MIN_SCORE = 0.76; +const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.48; +const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.9; +const DEFAULT_CLUSTER_MAX_SIZE = 48; const VECTORLITE_CLUSTER_EXPANDED_K = 24; const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; @@ -2038,9 +2040,10 @@ export class GHCrawlService { ), }); const minScore = params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE; + const deterministicMinScore = Math.min(minScore, DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE); const crossKindMinScore = Math.max(minScore, DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE); const maxClusterSize = params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE; - const k = params.k ?? 6; + const k = params.k ?? 12; try { const seedThread = params.threadNumber @@ -2072,7 +2075,12 @@ export class GHCrawlService { const aggregatedEdges = new Map(); this.mergeSourceKindEdges( aggregatedEdges, - deterministic.edges.filter((edge) => edge.score >= minScore), + deterministic.edges + .filter((edge) => edge.tier === 'strong' || edge.score >= deterministicMinScore) + .map((edge) => ({ + ...edge, + score: Math.max(edge.score, edge.tier === 'strong' ? 0.94 : Math.min(0.86, minScore + 0.04)), + })), 'deterministic_fingerprint', ); params.onProgress?.( @@ -5277,21 +5285,37 @@ export class GHCrawlService { labels: item.labels, rawJson: item.rawJson, }); + const inferredRefs = extractDeterministicRefs(`${item.title}\n${item.body ?? ''}`); + const featureHash = fingerprintFeatureHash({ + linkedRefs: inferredRefs, + changedFiles: item.changedFiles, + hunkSignatures: item.hunkSignatures, + patchIds: item.patchIds, + }); const existing = this.db .prepare( - `select id + `select id, feature_json from thread_fingerprints where thread_revision_id = ? and algorithm_version = ? limit 1`, ) - .get(revisionId, THREAD_FINGERPRINT_ALGORITHM_VERSION) as { id: number } | undefined; + .get(revisionId, THREAD_FINGERPRINT_ALGORITHM_VERSION) as { id: number; feature_json: string } | undefined; if (existing) { - skipped += 1; - continue; + const existingFeatureHash = (() => { + try { + const feature = JSON.parse(existing.feature_json) as Record; + return typeof feature.featureHash === 'string' ? feature.featureHash : null; + } catch { + return null; + } + })(); + if (existingFeatureHash === featureHash) { + skipped += 1; + continue; + } } - const inferredRefs = extractDeterministicRefs(`${item.title}\n${item.body ?? ''}`); const fingerprint = buildDeterministicThreadFingerprint({ threadId: item.id, number: item.number, @@ -5375,6 +5399,7 @@ export class GHCrawlService { changedFiles: stringFeature('changedFiles'), hunkSignatures: stringFeature('hunkSignatures'), patchIds: stringFeature('patchIds'), + featureHash: typeof feature.featureHash === 'string' ? feature.featureHash : '', minhashSignature: row.minhash_signature_blob_id ? parseStringArrayJson(readTextBlob(this.db, this.blobStoreRoot(), row.minhash_signature_blob_id)) : [], From 8b181f015276fc888c543bd49e34be2ffeee2ebd Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:04:05 -0700 Subject: [PATCH 081/215] fix(cluster): cap deterministic candidates --- packages/api-core/src/service.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 48f6e1b..310a619 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -2070,7 +2070,11 @@ export class GHCrawlService { const deterministic = buildDeterministicClusterGraphFromFingerprints( deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), persistedFingerprints, - { topK: Math.max(k * 8, 64), seedThreadIds }, + { + maxBucketSize: seedThreadIds ? 500 : 200, + topK: seedThreadIds ? Math.max(k * 8, 64) : 32, + seedThreadIds, + }, ); const aggregatedEdges = new Map(); this.mergeSourceKindEdges( From 0b99785bbaefb4414c22561ac1fcdeed760682b0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:14:05 -0700 Subject: [PATCH 082/215] fix(cluster): strengthen file evidence scoring --- .../api-core/src/cluster/evidence-score.ts | 19 +++++++++++-------- .../src/cluster/thread-fingerprint.ts | 2 +- packages/api-core/src/service.ts | 8 ++++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/packages/api-core/src/cluster/evidence-score.ts b/packages/api-core/src/cluster/evidence-score.ts index 5f16df7..79437ef 100644 --- a/packages/api-core/src/cluster/evidence-score.ts +++ b/packages/api-core/src/cluster/evidence-score.ts @@ -29,17 +29,17 @@ export type SimilarityEvidenceBreakdown = FingerprintPairBreakdown & { }; export const DEFAULT_EVIDENCE_SCORE_CONFIG: EvidenceScoreConfig = { - minScore: 0.48, + minScore: 0.36, strongScore: 0.74, - weightLineage: 0.25, - weightStructure: 0.22, - weightLinkedRefs: 0.16, - weightTitle: 0.08, + weightLineage: 0.18, + weightStructure: 0.36, + weightLinkedRefs: 0.14, + weightTitle: 0.10, weightMinhash: 0.10, weightSimhash: 0.08, - weightWinnow: 0.07, - weightEmbedding: 0.02, - weightLlmKey: 0.02, + weightWinnow: 0.04, + weightEmbedding: 0.03, + weightLlmKey: 0.03, }; function clamp01(value: number | null | undefined): number { @@ -71,12 +71,15 @@ export function scoreSimilarityEvidence( if ( base.lineage >= 0.8 || base.hunkOverlap >= 0.8 || + (base.fileOverlap >= 0.8 && (base.titleOverlap >= 0.15 || base.tokenSimhash >= 0.5 || base.tokenMinhash >= 0.2)) || (base.linkedRefOverlap >= 0.8 && (base.structure >= 0.25 || base.titleOverlap >= 0.25)) || score >= config.strongScore ) { tier = 'strong'; } else if ( score >= config.minScore || + base.fileOverlap >= 0.4 || + (base.moduleOverlap >= 0.5 && base.titleOverlap >= 0.15) || (base.titleOverlap >= 0.25 && base.tokenSimhash >= 0.55) || (base.structure >= 0.5 && base.tokenSimhash >= 0.55) || (base.linkedRefOverlap >= 0.5 && base.tokenMinhash >= 0.25) diff --git a/packages/api-core/src/cluster/thread-fingerprint.ts b/packages/api-core/src/cluster/thread-fingerprint.ts index 5a58e88..26b6908 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.ts @@ -198,7 +198,7 @@ export function compareDeterministicFingerprints( moduleOverlap, hunkOverlap, patchOverlap, - structure: 0.7 * hunkOverlap + 0.2 * fileOverlap + 0.1 * moduleOverlap, + structure: Math.max(hunkOverlap, patchOverlap, fileOverlap, 0.65 * moduleOverlap), lineage: patchOverlap, }; } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 310a619..7704f5e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -426,10 +426,10 @@ const KEY_SUMMARY_MAX_UNREAD = 48; const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; -const DEFAULT_CLUSTER_MIN_SCORE = 0.76; -const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.48; +const DEFAULT_CLUSTER_MIN_SCORE = 0.74; +const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.36; const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.9; -const DEFAULT_CLUSTER_MAX_SIZE = 48; +const DEFAULT_CLUSTER_MAX_SIZE = 64; const VECTORLITE_CLUSTER_EXPANDED_K = 24; const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; @@ -2043,7 +2043,7 @@ export class GHCrawlService { const deterministicMinScore = Math.min(minScore, DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE); const crossKindMinScore = Math.max(minScore, DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE); const maxClusterSize = params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE; - const k = params.k ?? 12; + const k = params.k ?? 16; try { const seedThread = params.threadNumber From 8f42f4c3ec46a866bbdee0b678569505bd6d5ac5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:35:33 -0700 Subject: [PATCH 083/215] feat(openai): add purpose key summaries --- README.md | 7 ++++--- apps/cli/src/main.test.ts | 7 ++++++- apps/cli/src/main.ts | 21 +++++++++++-------- docs/DESIGN.md | 4 ++-- .../src/cluster/llm-key-summary.test.ts | 9 +++++++- .../api-core/src/cluster/llm-key-summary.ts | 12 +++++++++-- .../src/cluster/persistent-store.test.ts | 4 +++- packages/api-core/src/config.test.ts | 6 +++++- packages/api-core/src/config.ts | 4 ++-- packages/api-core/src/service.test.ts | 9 ++++---- 10 files changed, 57 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index d4b541f..2b5b090 100644 --- a/README.md +++ b/README.md @@ -183,13 +183,13 @@ Use `configure` to inspect or change the active summary model and embedding basi ```bash ghcrawl configure -ghcrawl configure --summary-model gpt-5.4-mini +ghcrawl configure --summary-model gpt-5.4 ghcrawl configure --embedding-basis title_original ``` Current defaults: -- summary model: `gpt-5-mini` +- summary model: `gpt-5.4` - embedding basis: `title_original` (`title + original body`) - vector backend: `vectorlite` @@ -250,10 +250,11 @@ On a real local run against roughly `12k` issues plus about `1.2x` related PR an For one-time summary migration planning on a repo around the size of `openclaw/openclaw` (`~20k` issues and PRs), `ghcrawl configure` reports these operator estimates using the April 1, 2026 USD pricing assumptions for this release: +- `gpt-5.4`: not estimated locally in this release - `gpt-5-mini`: about **$12 USD** one time - `gpt-5.4-mini`: about **$30 USD** one time -`gpt-5-mini` is the default to keep that migration cost lower. `gpt-5.4-mini` is available when you want higher-quality summaries and are comfortable with the higher one-time spend. +`gpt-5.4` is the default summary model. The mini model estimates are kept as operator planning references for lower-cost migrations. This screenshot is the reference point for that estimate: diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 1fbfef0..728504e 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -28,6 +28,10 @@ function makeRunContext(): { env: NodeJS.ProcessEnv; cwd: string; cleanup: () => env: { ...process.env, HOME: home, + GITHUB_TOKEN: undefined, + OPENAI_API_KEY: undefined, + GHCRAWL_SUMMARY_MODEL: undefined, + GHCRAWL_API_PORT: undefined, XDG_CONFIG_HOME: undefined, APPDATA: undefined, }, @@ -190,8 +194,9 @@ test('configure prints current persisted settings and cost estimates', async () } assert.match(stdout.read(), /ghcrawl configure/); - assert.match(stdout.read(), /summary model: gpt-5-mini/); + assert.match(stdout.read(), /summary model: gpt-5\.4/); assert.match(stdout.read(), /embedding basis: title_original/); + assert.match(stdout.read(), /gpt-5\.4: not estimated locally/); assert.match(stdout.read(), /gpt-5\.4-mini: ~\$30 USD/); }); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 2d64f7f..4091ade 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -62,12 +62,13 @@ type DoctorReport = DoctorResult & { type ConfigureReport = { configPath: string; updated: boolean; - summaryModel: 'gpt-5-mini' | 'gpt-5.4-mini'; + summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; vectorBackend: 'vectorlite'; costEstimateUsd: { sampleThreads: number; pricingDate: string; + gpt54: number | null; gpt5Mini: number; gpt54Mini: number; }; @@ -103,14 +104,14 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'configure', - synopsis: 'configure [--summary-model gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary|llm_key_summary] [--json]', + synopsis: 'configure [--summary-model gpt-5.4|gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary|llm_key_summary] [--json]', description: 'Show or update persisted summarization and embedding settings.', options: [ - '--summary-model Select gpt-5-mini or gpt-5.4-mini for summarization', + '--summary-model Select gpt-5.4, gpt-5-mini, or gpt-5.4-mini for summarization', '--embedding-basis Select title_original, title_summary, or llm_key_summary for active vectors', '--json Emit machine-readable JSON output explicitly', ], - examples: ['ghcrawl configure', 'ghcrawl configure --summary-model gpt-5.4-mini', 'ghcrawl configure --embedding-basis title_original --json'], + examples: ['ghcrawl configure', 'ghcrawl configure --summary-model gpt-5.4', 'ghcrawl configure --embedding-basis title_original --json'], agentJson: true, }, { @@ -268,7 +269,7 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ { name: 'key-summaries', synopsis: 'key-summaries [--number ] [--limit ] [--json]', - description: 'Generate cached 3-line LLM key summaries for clustering enrichment.', + description: 'Generate cached structured LLM key summaries for clustering enrichment.', options: [ '--number Restrict key summary work to one thread', '--limit Limit the number of generated summaries', @@ -726,7 +727,7 @@ function parseEnum(command: CommandName, flagName: string, val function buildConfigureReport(options: { configPath: string; updated: boolean; - summaryModel: 'gpt-5-mini' | 'gpt-5.4-mini'; + summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; vectorBackend: 'vectorlite'; }): ConfigureReport { @@ -735,6 +736,7 @@ function buildConfigureReport(options: { costEstimateUsd: { sampleThreads: 20_000, pricingDate: 'April 1, 2026', + gpt54: null, gpt5Mini: 12, gpt54Mini: 30, }, @@ -788,7 +790,7 @@ export function formatConfigureReport(result: ConfigureReport): string { result.embeddingBasis === 'title_summary' ? 'title + dedupe summary' : result.embeddingBasis === 'llm_key_summary' - ? 'title + 3-line LLM key summary' + ? 'title + structured LLM key summary' : 'title + original body'; const summaryModeNote = result.embeddingBasis === 'title_summary' @@ -809,6 +811,7 @@ export function formatConfigureReport(result: ConfigureReport): string { '', `Estimated one-time summary cost for ~${result.costEstimateUsd.sampleThreads.toLocaleString()} threads`, ` pricing date: ${result.costEstimateUsd.pricingDate}`, + ` gpt-5.4: ${result.costEstimateUsd.gpt54 === null ? 'not estimated locally' : `~$${result.costEstimateUsd.gpt54.toFixed(0)} USD`}`, ` gpt-5-mini: ~$${result.costEstimateUsd.gpt5Mini.toFixed(0)} USD`, ` gpt-5.4-mini: ~$${result.costEstimateUsd.gpt54Mini.toFixed(0)} USD`, '', @@ -981,7 +984,7 @@ export async function run( json: { type: 'boolean' }, }); const values = parsed.values as RepoCommandValues; - const summaryModel = parseEnum('configure', 'summary-model', values['summary-model'], ['gpt-5-mini', 'gpt-5.4-mini']); + const summaryModel = parseEnum('configure', 'summary-model', values['summary-model'], ['gpt-5.4', 'gpt-5-mini', 'gpt-5.4-mini']); const embeddingBasis = parseEnum('configure', 'embedding-basis', values['embedding-basis'], ['title_original', 'title_summary', 'llm_key_summary']); const current = getConfig(); const stored = readPersistedConfig(loadConfigOptions); @@ -1001,7 +1004,7 @@ export async function run( const result = buildConfigureReport({ configPath: current.configPath, updated, - summaryModel: next.summaryModel as 'gpt-5-mini' | 'gpt-5.4-mini', + summaryModel: next.summaryModel as 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini', embeddingBasis: next.embeddingBasis as 'title_original' | 'title_summary' | 'llm_key_summary', vectorBackend: 'vectorlite', }); diff --git a/docs/DESIGN.md b/docs/DESIGN.md index e0ab9f8..7e070b4 100644 --- a/docs/DESIGN.md +++ b/docs/DESIGN.md @@ -136,7 +136,7 @@ Environment variables: - `OPENAI_API_KEY` - `GHCRAWL_DB_PATH` with default `data/ghcrawl.db` - `GHCRAWL_API_PORT` with default `5179` -- `GHCRAWL_SUMMARY_MODEL` with default `gpt-5-mini` +- `GHCRAWL_SUMMARY_MODEL` with default `gpt-5.4` - `GHCRAWL_EMBED_MODEL` with default `text-embedding-3-small` - `GHCRAWL_OPENSEARCH_URL` optional - `GHCRAWL_OPENSEARCH_INDEX` optional @@ -278,7 +278,7 @@ Use OpenAI for two distinct jobs: Default models: -- summarization: `gpt-5-mini` +- summarization: `gpt-5.4` - embeddings: `text-embedding-3-small` Relevant official constraints to design around: diff --git a/packages/api-core/src/cluster/llm-key-summary.test.ts b/packages/api-core/src/cluster/llm-key-summary.test.ts index 1ec133f..9b03533 100644 --- a/packages/api-core/src/cluster/llm-key-summary.test.ts +++ b/packages/api-core/src/cluster/llm-key-summary.test.ts @@ -9,17 +9,20 @@ import { parseLlmKeySummary, } from './llm-key-summary.js'; -test('parseLlmKeySummary accepts the strict 3-line contract', () => { +test('parseLlmKeySummary accepts the strict key-summary contract', () => { const summary = parseLlmKeySummary({ + purpose: 'Prevent stalled downloads from blocking repository sync.', intent: 'Stop downloads from retrying forever after timeout.', surface: 'CLI sync downloader and retry loop.', mechanism: 'Exit retry loop when timeout state is terminal.', }); + assert.equal(summary.purpose, 'Prevent stalled downloads from blocking repository sync.'); assert.equal(summary.intent, 'Stop downloads from retrying forever after timeout.'); assert.equal( llmKeyEmbeddingText(summary), [ + 'purpose: Prevent stalled downloads from blocking repository sync.', 'intent: Stop downloads from retrying forever after timeout.', 'surface: CLI sync downloader and retry loop.', 'mechanism: Exit retry loop when timeout state is terminal.', @@ -31,6 +34,7 @@ test('parseLlmKeySummary rejects missing fields', () => { assert.throws( () => parseLlmKeySummary({ + purpose: 'Keep sync reliable.', intent: '', surface: 'CLI', mechanism: 'Patch retry loop.', @@ -41,11 +45,13 @@ test('parseLlmKeySummary rejects missing fields', () => { test('parseLlmKeySummary clamps oversized fields deterministically', () => { const summary = parseLlmKeySummary({ + purpose: 'w'.repeat(180), intent: 'x'.repeat(140), surface: 'y'.repeat(140), mechanism: 'z'.repeat(180), }); + assert.equal(summary.purpose.length, 160); assert.equal(summary.intent.length, 120); assert.equal(summary.surface.length, 120); assert.equal(summary.mechanism.length, 160); @@ -67,6 +73,7 @@ test('llmKeyInputHash is deterministic and prompt-version scoped', () => { test('LLM_KEY_SUMMARY_SYSTEM_PROMPT requires strict JSON fields', () => { assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /Return only strict JSON/); + assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /purpose/); assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /intent/); assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /surface/); assert.match(LLM_KEY_SUMMARY_SYSTEM_PROMPT, /mechanism/); diff --git a/packages/api-core/src/cluster/llm-key-summary.ts b/packages/api-core/src/cluster/llm-key-summary.ts index 1084db1..31f8856 100644 --- a/packages/api-core/src/cluster/llm-key-summary.ts +++ b/packages/api-core/src/cluster/llm-key-summary.ts @@ -2,16 +2,18 @@ import crypto from 'node:crypto'; import { z } from 'zod'; -export const LLM_KEY_SUMMARY_PROMPT_VERSION = 'llm-key-summary-v1'; +export const LLM_KEY_SUMMARY_PROMPT_VERSION = 'llm-key-summary-v2'; export const LLM_KEY_SUMMARY_SYSTEM_PROMPT = `You produce stable deduplication keys for GitHub issues and pull requests. Return only strict JSON with exactly these fields: +purpose: one sentence, max 160 chars, issue or feature summary describing why this exists or what user need it serves. intent: one sentence, max 120 chars, what outcome is being requested or changed. surface: one sentence, max 120 chars, affected user/API/module/file area. mechanism: one sentence, max 160 chars, cause or implementation approach. Use concrete nouns from the input. Do not mention uncertainty. Do not add advice.`; export const llmKeySummarySchema = z.object({ + purpose: z.string().trim().min(1), intent: z.string().trim().min(1), surface: z.string().trim().min(1), mechanism: z.string().trim().min(1), @@ -22,6 +24,7 @@ export type LlmKeySummary = z.infer; export function parseLlmKeySummary(value: unknown): LlmKeySummary { const summary = llmKeySummarySchema.parse(value); return { + purpose: clampSentence(summary.purpose, 160), intent: clampSentence(summary.intent, 120), surface: clampSentence(summary.surface, 120), mechanism: clampSentence(summary.mechanism, 160), @@ -38,7 +41,12 @@ function clampSentence(value: string, maxLength: number): string { } export function llmKeyEmbeddingText(summary: LlmKeySummary): string { - return [`intent: ${summary.intent}`, `surface: ${summary.surface}`, `mechanism: ${summary.mechanism}`].join('\n'); + return [ + `purpose: ${summary.purpose}`, + `intent: ${summary.intent}`, + `surface: ${summary.surface}`, + `mechanism: ${summary.mechanism}`, + ].join('\n'); } export function llmKeyInputHash(input: { diff --git a/packages/api-core/src/cluster/persistent-store.test.ts b/packages/api-core/src/cluster/persistent-store.test.ts index b745c9a..d6e34c8 100644 --- a/packages/api-core/src/cluster/persistent-store.test.ts +++ b/packages/api-core/src/cluster/persistent-store.test.ts @@ -330,9 +330,10 @@ test('persistent cluster store records structured key summaries', () => { summaryKind: 'llm_key_3line', promptVersion: 'llm-key-summary-v1', provider: 'openai', - model: 'gpt-5-mini', + model: 'gpt-5.4', inputHash: 'input-hash', summary: { + purpose: 'Keep cache entries distinct for repeated API reads.', intent: 'Fix cache collision.', surface: 'API core cache.', mechanism: 'Changes cache key derivation.', @@ -344,6 +345,7 @@ test('persistent cluster store records structured key summaries', () => { key_text: string; }; assert.equal(row.input_hash, 'input-hash'); + assert.match(row.key_text, /purpose: Keep cache entries distinct/); assert.match(row.key_text, /intent: Fix cache collision\./); assert.match(row.key_text, /surface: API core cache\./); } finally { diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index a2cae72..f30bed7 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -20,6 +20,10 @@ function makeTempHome(): string { function makeTestEnv(overrides: NodeJS.ProcessEnv = {}): NodeJS.ProcessEnv { return { ...process.env, + GITHUB_TOKEN: undefined, + OPENAI_API_KEY: undefined, + GHCRAWL_SUMMARY_MODEL: undefined, + GHCRAWL_API_PORT: undefined, XDG_CONFIG_HOME: undefined, APPDATA: undefined, ...overrides, @@ -53,7 +57,7 @@ test('loadConfig prefers persisted config and stores defaults under the user con assert.equal(config.githubTokenSource, 'config'); assert.equal(config.openaiApiKeySource, 'config'); assert.equal(config.dbPath, path.join(home, '.config', 'ghcrawl', 'ghcrawl.db')); - assert.equal(config.summaryModel, 'gpt-5-mini'); + assert.equal(config.summaryModel, 'gpt-5.4'); assert.equal(config.embeddingBasis, 'title_original'); assert.equal(config.vectorBackend, 'vectorlite'); }); diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index 6f0f81f..71deed5 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -323,7 +323,7 @@ export function loadConfig(options: LoadConfigOptions = {}): GitcrawlConfig { { source: 'env', value: getEnvString(env, 'GHCRAWL_SUMMARY_MODEL', 'GHCRAWL_SUMMARY_MODEL') }, { source: 'config', value: stored.data.summaryModel }, { source: 'dotenv', value: getDotenvString(dotenvValues, 'GHCRAWL_SUMMARY_MODEL', 'GHCRAWL_SUMMARY_MODEL') }, - { source: 'default', value: 'gpt-5-mini' }, + { source: 'default', value: 'gpt-5.4' }, ); const embedModel = pickDefined( { source: 'env', value: getEnvString(env, 'GHCRAWL_EMBED_MODEL', 'GHCRAWL_EMBED_MODEL') }, @@ -376,7 +376,7 @@ export function loadConfig(options: LoadConfigOptions = {}): GitcrawlConfig { githubTokenSource: githubToken.source, openaiApiKey: openaiApiKey.value, openaiApiKeySource: openaiApiKey.source, - summaryModel: summaryModel.value ?? 'gpt-5-mini', + summaryModel: summaryModel.value ?? 'gpt-5.4', embedModel: embedModel.value ?? 'text-embedding-3-large', embeddingBasis: embeddingBasis.value ?? 'title_original', vectorBackend: vectorBackend.value ?? 'vectorlite', diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index e6bdb00..11240ef 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -788,7 +788,7 @@ test('summarizeRepository prices progress output using the configured summary mo } }); -test('generateKeySummaries stores cached 3-line key summaries', async () => { +test('generateKeySummaries stores cached structured key summaries', async () => { let calls = 0; const service = makeTestService( { @@ -810,6 +810,7 @@ test('generateKeySummaries stores cached 3-line key summaries', async () => { calls += 1; return { summary: { + purpose: 'Keep downloads from hanging repository sync.', intent: 'Fix retry loop.', surface: 'Downloader.', mechanism: 'Changes timeout handling.', @@ -1061,7 +1062,7 @@ test('embedRepository batches multi-source embeddings and skips unchanged inputs } }); -test('embedRepository can use stored 3-line key summaries as active vector input', async () => { +test('embedRepository can use stored structured key summaries as active vector input', async () => { let embeddedText = ''; const service = new GHCrawlService({ config: makeTestConfig({ embeddingBasis: 'llm_key_summary' }), @@ -1118,13 +1119,13 @@ test('embedRepository can use stored 3-line key summaries as active vector input .run( 100, 'llm_key_3line', - 'llm-key-summary-v1', + 'llm-key-summary-v2', 'openai', 'gpt-5-mini', 'input-hash', 'output-hash', null, - 'intent: Fix retry loop.\nsurface: Downloader.\nmechanism: Changes timeout handling.', + 'purpose: Keep downloads from hanging repository sync.\nintent: Fix retry loop.\nsurface: Downloader.\nmechanism: Changes timeout handling.', now, ); From 13ce16d2a41ffc7b583e7b9c872d3c01f47b9488 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:35:48 -0700 Subject: [PATCH 084/215] fix(cluster): ignore broad setup patches --- .../src/cluster/code-signature.test.ts | 34 +++++++++++++++++++ .../api-core/src/cluster/code-signature.ts | 32 +++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/cluster/code-signature.test.ts b/packages/api-core/src/cluster/code-signature.test.ts index 6159c00..6113fec 100644 --- a/packages/api-core/src/cluster/code-signature.test.ts +++ b/packages/api-core/src/cluster/code-signature.test.ts @@ -63,3 +63,37 @@ test('buildCodeSnapshotSignature returns files, patch digest, and hunk signature assert.equal(snapshot.hunkSignatures.length, 1); assert.match(snapshot.patchDigest, /^[a-f0-9]{64}$/); }); + +test('buildCodeSnapshotSignature keeps metadata but skips broad setup patches', () => { + const snapshot = buildCodeSnapshotSignature([ + { + filename: 'pnpm-lock.yaml', + status: 'modified', + additions: 5_000, + deletions: 5_000, + changes: 10_000, + patch: '@@ -1 +1 @@\n-old\n+new', + }, + ]); + + assert.equal(snapshot.files.length, 1); + assert.equal(snapshot.files[0]?.filename, 'pnpm-lock.yaml'); + assert.equal(snapshot.hunkSignatures.length, 0); + assert.match(snapshot.patchDigest, /^[a-f0-9]{64}$/); +}); + +test('buildCodeSnapshotSignature skips hunk extraction for massive snapshots', () => { + const files = Array.from({ length: 101 }, (_, index) => ({ + filename: `src/file-${index}.ts`, + status: 'modified', + additions: 1, + deletions: 1, + changes: 2, + patch: '@@ -1 +1 @@\n-old\n+new', + })); + + const snapshot = buildCodeSnapshotSignature(files); + + assert.equal(snapshot.files.length, 101); + assert.equal(snapshot.hunkSignatures.length, 0); +}); diff --git a/packages/api-core/src/cluster/code-signature.ts b/packages/api-core/src/cluster/code-signature.ts index d64a338..0a5b7cb 100644 --- a/packages/api-core/src/cluster/code-signature.ts +++ b/packages/api-core/src/cluster/code-signature.ts @@ -1,6 +1,11 @@ import crypto from 'node:crypto'; const TOKEN_RE = /[a-zA-Z0-9_.$/-]+/g; +const MAX_PATCH_CHARS_FOR_HUNKS = 120_000; +const MAX_PATCH_CHARS_PER_FILE = 20_000; +const MAX_FILES_FOR_HUNK_EXTRACTION = 100; +const GENERATED_OR_SETUP_PATH_RE = + /(^|\/)(package-lock\.json|pnpm-lock\.yaml|yarn\.lock|bun\.lockb|npm-shrinkwrap\.json|Cargo\.lock|Gemfile\.lock|poetry\.lock|go\.sum|dist|build|coverage|vendor|generated)(\/|$)/i; export type PullFileMetadata = { filename: string; @@ -104,7 +109,17 @@ export function extractHunkSignatures(path: string, patch: string | null | undef export function buildCodeSnapshotSignature(files: Array>): CodeSnapshotSignature { const normalizedFiles = files.map(normalizePullFile).filter((file) => file.filename.length > 0); - const hunkSignatures = normalizedFiles.flatMap((file) => extractHunkSignatures(file.filename, file.patch)); + const totalPatchChars = normalizedFiles.reduce((total, file) => total + (file.patch?.length ?? 0), 0); + const shouldExtractHunks = + normalizedFiles.length <= MAX_FILES_FOR_HUNK_EXTRACTION && totalPatchChars <= MAX_PATCH_CHARS_FOR_HUNKS; + const hunkSignatures = shouldExtractHunks + ? normalizedFiles.flatMap((file) => { + if (isPatchTooBroadForHunks(file)) { + return []; + } + return extractHunkSignatures(file.filename, file.patch); + }) + : []; const patchDigest = sha256( JSON.stringify( normalizedFiles.map((file) => ({ @@ -113,7 +128,7 @@ export function buildCodeSnapshotSignature(files: Array> previousFilename: file.previousFilename, additions: file.additions, deletions: file.deletions, - patchHash: file.patch ? sha256(file.patch) : null, + patchHash: shouldHashPatch(file) ? sha256(file.patch ?? '') : null, })), ), ); @@ -124,3 +139,16 @@ export function buildCodeSnapshotSignature(files: Array> hunkSignatures, }; } + +function isPatchTooBroadForHunks(file: PullFileMetadata): boolean { + return ( + !file.patch || + file.patch.length > MAX_PATCH_CHARS_PER_FILE || + file.changes > 2_000 || + GENERATED_OR_SETUP_PATH_RE.test(file.filename) + ); +} + +function shouldHashPatch(file: PullFileMetadata): boolean { + return Boolean(file.patch) && file.patch!.length <= MAX_PATCH_CHARS_PER_FILE && !GENERATED_OR_SETUP_PATH_RE.test(file.filename); +} From 3604e88f971f40f942e7fee22d5e986ccadad554 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:35:57 -0700 Subject: [PATCH 085/215] fix(cluster): dampen broad module evidence --- .../src/cluster/evidence-score.test.ts | 17 +++++++++++ .../api-core/src/cluster/evidence-score.ts | 2 +- .../src/cluster/thread-fingerprint.test.ts | 28 +++++++++++++++++++ .../src/cluster/thread-fingerprint.ts | 14 ++++++++-- 4 files changed, 57 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/cluster/evidence-score.test.ts b/packages/api-core/src/cluster/evidence-score.test.ts index c88c6ee..f4da156 100644 --- a/packages/api-core/src/cluster/evidence-score.test.ts +++ b/packages/api-core/src/cluster/evidence-score.test.ts @@ -81,3 +81,20 @@ test('scoreSimilarityEvidence rejects unrelated deterministic fingerprints', () assert.equal(evidence.tier, 'none'); }); + +test('scoreSimilarityEvidence rejects broad same-module maintenance without close title evidence', () => { + const left = fp({ + id: 1, + title: 'Fix cron missing job state', + files: Array.from({ length: 30 }, (_, index) => `packages/api-core/src/cron-${index}.ts`), + }); + const right = fp({ + id: 2, + title: 'Clear session model override', + files: Array.from({ length: 30 }, (_, index) => `packages/api-core/src/session-${index}.ts`), + }); + + const evidence = scoreSimilarityEvidence(left, right); + + assert.equal(evidence.tier, 'none'); +}); diff --git a/packages/api-core/src/cluster/evidence-score.ts b/packages/api-core/src/cluster/evidence-score.ts index 79437ef..5ee58ba 100644 --- a/packages/api-core/src/cluster/evidence-score.ts +++ b/packages/api-core/src/cluster/evidence-score.ts @@ -79,7 +79,7 @@ export function scoreSimilarityEvidence( } else if ( score >= config.minScore || base.fileOverlap >= 0.4 || - (base.moduleOverlap >= 0.5 && base.titleOverlap >= 0.15) || + (base.moduleOverlap >= 0.5 && base.titleOverlap >= 0.25 && base.tokenSimhash >= 0.55) || (base.titleOverlap >= 0.25 && base.tokenSimhash >= 0.55) || (base.structure >= 0.5 && base.tokenSimhash >= 0.55) || (base.linkedRefOverlap >= 0.5 && base.tokenMinhash >= 0.25) diff --git a/packages/api-core/src/cluster/thread-fingerprint.test.ts b/packages/api-core/src/cluster/thread-fingerprint.test.ts index 96323ae..aee55c6 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.test.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.test.ts @@ -72,3 +72,31 @@ test('compareDeterministicFingerprints scores deterministic overlap features', ( assert.ok(Math.abs(breakdown.structure - 1) < 1e-9); assert.equal(breakdown.lineage, 1); }); + +test('compareDeterministicFingerprints dampens broad file overlap', () => { + const shared = 'packages/api-core/src/service.ts'; + const first = buildDeterministicThreadFingerprint({ + threadId: 1, + number: 42, + kind: 'pull_request', + title: 'Fix cron missing job state', + body: '', + labels: [], + changedFiles: [shared, ...Array.from({ length: 60 }, (_, index) => `packages/api-core/src/a-${index}.ts`)], + }); + const second = buildDeterministicThreadFingerprint({ + threadId: 2, + number: 43, + kind: 'pull_request', + title: 'Fix session model override', + body: '', + labels: [], + changedFiles: [shared, ...Array.from({ length: 60 }, (_, index) => `packages/api-core/src/b-${index}.ts`)], + }); + + const breakdown = compareDeterministicFingerprints(first, second); + + assert.ok(breakdown.fileOverlap < 0.01); + assert.ok(breakdown.moduleOverlap < 1); + assert.ok(breakdown.structure < 0.3); +}); diff --git a/packages/api-core/src/cluster/thread-fingerprint.ts b/packages/api-core/src/cluster/thread-fingerprint.ts index 26b6908..0cce5d2 100644 --- a/packages/api-core/src/cluster/thread-fingerprint.ts +++ b/packages/api-core/src/cluster/thread-fingerprint.ts @@ -184,8 +184,11 @@ export function compareDeterministicFingerprints( overlapMin(new Set(left.linkedRefs), new Set(right.linkedRefs)), ); const titleOverlap = jaccard(new Set(left.salientTitleTokens), new Set(right.salientTitleTokens)); - const fileOverlap = jaccard(new Set(left.changedFiles), new Set(right.changedFiles)); - const moduleOverlap = jaccard(new Set(left.moduleBuckets), new Set(right.moduleBuckets)); + const maxChangedFiles = Math.max(left.changedFiles.length, right.changedFiles.length); + const fileBreadthPenalty = breadthPenalty(maxChangedFiles, 40); + const moduleBreadthPenalty = breadthPenalty(maxChangedFiles, 12); + const fileOverlap = jaccard(new Set(left.changedFiles), new Set(right.changedFiles)) * fileBreadthPenalty; + const moduleOverlap = jaccard(new Set(left.moduleBuckets), new Set(right.moduleBuckets)) * moduleBreadthPenalty; const hunkOverlap = jaccard(new Set(left.hunkSignatures), new Set(right.hunkSignatures)); const patchOverlap = overlapMin(new Set(left.patchIds), new Set(right.patchIds)); return { @@ -198,11 +201,16 @@ export function compareDeterministicFingerprints( moduleOverlap, hunkOverlap, patchOverlap, - structure: Math.max(hunkOverlap, patchOverlap, fileOverlap, 0.65 * moduleOverlap), + structure: Math.max(hunkOverlap, patchOverlap, fileOverlap, 0.25 * moduleOverlap), lineage: patchOverlap, }; } +function breadthPenalty(count: number, freeCount: number): number { + if (count <= freeCount) return 1; + return freeCount / count; +} + export function tokenShinglesForDebug(tokens: string[], size = 3): string[] { return buildShingles(tokens, size); } From bb478ab3405037e0662b9f2bebad361e42299dc3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 13:41:17 -0700 Subject: [PATCH 086/215] fix(tui): show stale cluster runs --- packages/api-core/src/service.test.ts | 20 ++++++++++++++++++++ packages/api-core/src/service.ts | 4 ---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 11240ef..5ea1e3d 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3108,6 +3108,26 @@ test('tui snapshot returns mixed issue and pull request counts with default visi service.db .prepare(`insert into embedding_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) .run(1, 1, 'openclaw/openclaw', 'completed', now, '2026-03-09T13:00:00Z'); + service.db + .prepare( + `insert into repo_pipeline_state ( + repo_id, summary_model, summary_prompt_version, embedding_basis, embed_model, embed_dimensions, + embed_pipeline_version, vector_backend, vectors_current_at, clusters_current_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 1, + 'previous-summary-model', + 'previous-summary-prompt', + 'title_original', + 'text-embedding-3-large', + 1024, + 'previous-embed-pipeline', + 'vectorlite', + '2026-03-09T13:00:00Z', + '2026-03-09T14:30:00Z', + now, + ); service.db .prepare( `insert into clusters (id, repo_id, cluster_run_id, representative_thread_id, member_count, created_at) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 7704f5e..9621276 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3887,10 +3887,6 @@ export class GHCrawlService { } private getLatestClusterRun(repoId: number): { id: number; finished_at: string | null } | null { - const state = this.getRepoPipelineState(repoId); - if (state && !this.isRepoClusterStateCurrent(repoId)) { - return null; - } return ( (this.db .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") From 2fd372c31844b9ecbd58a78a05c3fe1237c88682 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:01:02 -0700 Subject: [PATCH 087/215] fix(cluster): tighten default topology --- apps/cli/src/main.test.ts | 13 ++++- apps/cli/src/main.ts | 1 + packages/api-core/src/service.test.ts | 68 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 65 +++++++++++++++++++++---- 4 files changed, 137 insertions(+), 10 deletions(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 728504e..2462940 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -579,7 +579,7 @@ test('cluster command forwards neighborhood refresh inputs', async () => { }; try { - await run(['cluster', 'openclaw/openclaw', '--number', '42', '--k', '4', '--threshold', '0.82'], stdout.stream, { + await run(['cluster', 'openclaw/openclaw', '--number', '42', '--k', '4', '--threshold', '0.82', '--max-cluster-size', '24'], stdout.stream, { env: context.env, cwd: context.cwd, }); @@ -588,12 +588,21 @@ test('cluster command forwards neighborhood refresh inputs', async () => { context.cleanup(); } - const params = received as { owner: string; repo: string; threadNumber: number; k: number; minScore: number; onProgress?: unknown }; + const params = received as { + owner: string; + repo: string; + threadNumber: number; + k: number; + minScore: number; + maxClusterSize: number; + onProgress?: unknown; + }; assert.equal(params.owner, 'openclaw'); assert.equal(params.repo, 'openclaw'); assert.equal(params.threadNumber, 42); assert.equal(params.k, 4); assert.equal(params.minScore, 0.82); + assert.equal(params.maxClusterSize, 24); assert.equal(typeof params.onProgress, 'function'); assert.match(stdout.read(), /"edges": 3/); }); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 4091ade..0ed5e6c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -576,6 +576,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo backend: { type: 'string' }, 'candidate-k': { type: 'string' }, threshold: { type: 'string' }, + 'max-cluster-size': { type: 'string' }, port: { type: 'string' }, id: { type: 'string' }, source: { type: 'string' }, diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 5ea1e3d..c8d9570 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3012,6 +3012,74 @@ test('clusterExperiment falls back to active vectors when legacy embeddings are } }); +test('clusterRepository can reuse stale active vectors for offline reclustering', async () => { + const progress: string[] = []; + const service = new GHCrawlService({ + config: makeTestConfig(), + github: { + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + ai: { + summarizeThread: async () => { + throw new Error('not expected'); + }, + embedTexts: async ({ texts }) => texts.map((_text, index) => (index === 0 ? makeEmbedding(1, 0) : makeEmbedding(0.99, 0.01))), + }, + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Downloader hangs', 'The transfer never finishes.', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Downloader retry issue', 'The transfer retries forever.', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + + await service.embedRepository({ owner: 'openclaw', repo: 'openclaw' }); + service.db.prepare("update repo_pipeline_state set summary_model = 'previous-model' where repo_id = 1").run(); + + const result = await service.clusterRepository({ + owner: 'openclaw', + repo: 'openclaw', + minScore: 0.8, + k: 1, + onProgress: (message) => progress.push(message), + }); + + assert.equal(result.edges, 1); + assert.equal(result.clusters, 1); + assert.ok(progress.some((message) => message.includes('stale active vector'))); + const state = service.db.prepare('select clusters_current_at from repo_pipeline_state where repo_id = 1').get() as { + clusters_current_at: string | null; + }; + assert.equal(state.clusters_current_at, null); + } finally { + service.close(); + } +}); + test('clusterRepository does not retain a parsed embedding cache in-process', async () => { const service = makeTestService({ getRepo: async () => ({}), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9621276..863a3dc 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -426,10 +426,10 @@ const KEY_SUMMARY_MAX_UNREAD = 48; const SUMMARY_PROMPT_VERSION = 'v1'; const ACTIVE_EMBED_DIMENSIONS = 1024; const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; -const DEFAULT_CLUSTER_MIN_SCORE = 0.74; +const DEFAULT_CLUSTER_MIN_SCORE = 0.8; const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.36; -const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.9; -const DEFAULT_CLUSTER_MAX_SIZE = 64; +const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.93; +const DEFAULT_CLUSTER_MAX_SIZE = 40; const VECTORLITE_CLUSTER_EXPANDED_K = 24; const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; @@ -2033,7 +2033,7 @@ export class GHCrawlService { maxClusterSize: params.maxClusterSize ?? DEFAULT_CLUSTER_MAX_SIZE, clusterMode: 'size_bounded', crossKindMinScore: Math.max(params.minScore ?? DEFAULT_CLUSTER_MIN_SCORE, DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE), - k: params.k ?? 6, + k: params.k ?? 16, embedModel: this.config.embedModel, embeddingBasis: this.config.embeddingBasis, }), @@ -2091,8 +2091,9 @@ export class GHCrawlService { `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${runSubject}`, ); - if (this.isRepoVectorStateCurrent(repository.id)) { - const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); + const vectorStateCurrent = this.isRepoVectorStateCurrent(repository.id); + const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); + if (vectorItems.length > 0) { const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; const activeSourceKind = this.activeVectorSourceKind(); const activeIds = new Set(vectorItems.map((item) => item.id)); @@ -2101,7 +2102,7 @@ export class GHCrawlService { let lastProgressAt = Date.now(); params.onProgress?.( - `[cluster] loaded ${vectorItems.length} active vector(s), querying ${queryVectorItems.length} for ${runSubject} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`, + `[cluster] loaded ${vectorItems.length} ${vectorStateCurrent ? 'current' : 'stale'} active vector(s), querying ${queryVectorItems.length} for ${runSubject} backend=${this.config.vectorBackend} k=${k} query_limit=${annQuery.limit} candidateK=${annQuery.candidateK} efSearch=${annQuery.efSearch ?? 'default'} minScore=${minScore}`, ); for (const item of queryVectorItems) { const neighbors = this.queryNearestWithRecovery(repository.id, repository.fullName, { @@ -2184,6 +2185,7 @@ export class GHCrawlService { edges, { maxClusterSize }, ); + const clusterQuality = this.summarizeClusterQuality(clusters, threadKinds, maxClusterSize); if (!seedThreadIds) { this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); } @@ -2191,7 +2193,7 @@ export class GHCrawlService { if (!seedThreadIds) { this.pruneOldClusterRuns(repository.id, runId); } - if (!seedThreadIds && this.isRepoVectorStateCurrent(repository.id)) { + if (!seedThreadIds && vectorStateCurrent) { this.markRepoClustersCurrent(repository.id); this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress); } @@ -2208,6 +2210,7 @@ export class GHCrawlService { threadNumber: params.threadNumber ?? null, droppedCrossKindEdges, crossKindMinScore, + ...clusterQuality, }; this.finishRun('cluster_runs', runId, 'completed', stats); finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats }); @@ -6123,6 +6126,52 @@ export class GHCrawlService { }; } + private summarizeClusterQuality( + clusters: Array<{ representativeThreadId: number; members: number[] }>, + threadKinds: Map, + maxClusterSize: number, + ): { + maxClusterSize: number; + maxObservedClusterSize: number; + maxedClusterCount: number; + mixedKindClusterCount: number; + singletonClusterCount: number; + nonSingletonClusterCount: number; + } { + let maxObservedClusterSize = 0; + let maxedClusterCount = 0; + let mixedKindClusterCount = 0; + let singletonClusterCount = 0; + + for (const cluster of clusters) { + const size = cluster.members.length; + maxObservedClusterSize = Math.max(maxObservedClusterSize, size); + if (size >= maxClusterSize) maxedClusterCount += 1; + if (size === 1) singletonClusterCount += 1; + + let hasIssue = false; + let hasPullRequest = false; + for (const memberId of cluster.members) { + const kind = threadKinds.get(memberId); + hasIssue ||= kind === 'issue'; + hasPullRequest ||= kind === 'pull_request'; + if (hasIssue && hasPullRequest) { + mixedKindClusterCount += 1; + break; + } + } + } + + return { + maxClusterSize, + maxObservedClusterSize, + maxedClusterCount, + mixedKindClusterCount, + singletonClusterCount, + nonSingletonClusterCount: clusters.length - singletonClusterCount, + }; + } + private upsertSummary(threadId: number, contentHash: string, summaryKind: string, summaryText: string): void { this.db .prepare( From bbd436924a4e14be8440de34bc344c535f499490 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:12:48 -0700 Subject: [PATCH 088/215] fix(tui): tighten cluster and detail panes --- apps/cli/src/tui/app.test.ts | 34 ++++++- apps/cli/src/tui/app.ts | 166 ++++++++++++++++++++++---------- apps/cli/src/tui/layout.test.ts | 4 +- apps/cli/src/tui/layout.ts | 2 +- apps/cli/src/tui/state.test.ts | 10 +- apps/cli/src/tui/state.ts | 25 ++++- 6 files changed, 182 insertions(+), 59 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index c204c21..2725217 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -11,8 +11,10 @@ import { formatClusterShortName, getRepositoryChoices, parseOwnerRepoValue, + renderMarkdownForTerminal, renderDetailPane, resolveBlessedTerminal, + splitClusterDisplayTitle, } from './app.js'; test('escapeBlessedText escapes blessed tag delimiters', () => { @@ -70,13 +72,14 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e }; const rendered = renderDetailPane(detail, cluster, 'detail'); - assert.match(rendered, /Cluster 1 \(#42 representative issue\)/); + assert.match(rendered, /C1 \(#42 representative issue\)/); assert.match(rendered, /Bad \\{bold\\}title\\{\/bold\\}/); assert.match(rendered, /LLM Summary:/); + assert.match(rendered, /Main/); assert.match(rendered, /Body with \\{red-fg\\}tags\\{\/red-fg\\}/); assert.match(rendered, /Summary \\{yellow-fg\\}text\\{\/yellow-fg\\}/); assert.match(rendered, /Neighbor \\{blue-fg\\}title\\{\/blue-fg\\}/); - assert.ok(rendered.indexOf('LLM Summary:') < rendered.indexOf('{bold}Body{/bold}')); + assert.ok(rendered.indexOf('LLM Summary:') < rendered.indexOf('{bold}Main{/bold}')); }); test('parseOwnerRepoValue accepts owner slash repo values and rejects invalid ones', () => { @@ -96,10 +99,10 @@ test('formatClusterDateColumn follows locale month/day ordering while keeping fi assert.equal(formatClusterDateColumn(iso, 'en-GB'), '10-03 16:04'); }); -test('formatClusterListLabel keeps counts first and adds a short cluster name', () => { +test('formatClusterListLabel keeps counts first and splits cluster name from title', () => { const label = formatClusterListLabel({ clusterId: 1507, - displayTitle: 'Fix: dedupe section title/desc in single-section config view', + displayTitle: 'alpha-beta-gamma Fix: dedupe section title/desc in single-section config view', isClosed: false, closedAtLocal: null, closeReasonLocal: null, @@ -113,7 +116,9 @@ test('formatClusterListLabel keeps counts first and adds a short cluster name', searchText: 'fix dedupe section', }); - assert.match(label, /3 items\s+dedupe section title\/des\s+C1507\s+3P\/0I\s+04-24 07:29/); + assert.match(label, /^\s*3\s+alpha-beta-gamma\s+Fix: dedupe section title\/desc/); + assert.match(label, /0I\/3P/); + assert.doesNotMatch(label, /items/); }); test('formatClusterShortName returns the first meaningful words', () => { @@ -122,6 +127,25 @@ test('formatClusterShortName returns the first meaningful words', () => { assert.equal(formatClusterShortName(''), 'untitled'); }); +test('splitClusterDisplayTitle separates stable slug from representative title', () => { + assert.deepEqual(splitClusterDisplayTitle('alpha-beta-gamma Fix gateway timeout'), { + name: 'alpha-beta-gamma', + title: 'Fix gateway timeout', + }); + assert.equal(splitClusterDisplayTitle('Fix gateway timeout').name, 'gateway timeout'); +}); + +test('renderMarkdownForTerminal formats common markdown without exposing blessed tags', () => { + const rendered = renderMarkdownForTerminal( + ['# Heading {boom}', '- **bold** and `code`', '[site](https://example.com/path)', 'https://example.com/raw'].join('\n'), + ); + + assert.match(rendered, /\{bold\}Heading \\{boom\\}\{\/bold\}/); + assert.match(rendered, /- \{bold\}bold\{\/bold\} and \{yellow-fg\}code\{\/yellow-fg\}/); + assert.match(rendered, /\x1B\]8;;https:\/\/example\.com\/path/); + assert.match(rendered, /\x1B\]8;;https:\/\/example\.com\/raw/); +}); + test('getRepositoryChoices sorts by most recent update and includes the new-repo action', () => { const service = { listRepositories() { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 6258840..37d2bd0 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -18,6 +18,7 @@ import { cycleMinSizeFilter, cycleSortMode, findSelectableIndex, + formatRelativeTime, moveSelectableIndex, preserveSelectedId, type MemberListRow, @@ -81,7 +82,7 @@ function createScreen(options: Parameters[0]): blessed.Wi } const ACTIVITY_LOG_LIMIT = 200; -const FOOTER_LOG_LINES = 3; +const FOOTER_LOG_LINES = 1; export async function startTui(params: StartTuiParams): Promise { const selectedRepository = params.owner && params.repo ? { owner: params.owner, repo: params.repo } : null; @@ -329,17 +330,10 @@ export async function startTui(params: StartTuiParams): Promise { widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane)); updatePaneStyles(widgets, focusPane); - const logLines = activityLines.slice(-FOOTER_LOG_LINES); - const footerLines = [...logLines]; - while (footerLines.length < FOOTER_LOG_LINES) { - footerLines.unshift(''); - } - footerLines.push( - `${status} | focus:${focusPane} sort:${sortMode} h/? help # jump p repos / filter s sort f min`, - ); - footerLines.push( - `Tab focus mouse click/select/scroll PgUp/PgDn page l layout x closed r refresh o open q quit`, - ); + const footerLines = [ + activityLines.at(-1) ?? status, + `focus:${focusPane} sort:${sortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} Tab focus / filter s sort f min # jump o open h help q quit`, + ]; widgets.footer.setContent(footerLines.join('\n')); widgets.screen.render(); }; @@ -955,12 +949,19 @@ export function renderDetailPane( if (!clusterDetail) { return 'No cluster selected.\n\nRun `ghcrawl cluster owner/repo` if you have not clustered this repository yet.'; } + const clusterTitle = splitClusterDisplayTitle(clusterDetail.displayTitle); if (!threadDetail) { const representativeLabel = clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` : ''; - return `{bold}Cluster ${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}{/bold}\n${escapeBlessedText(clusterDetail.displayTitle)}\n\nSelect a member to inspect thread details.`; + return [ + `{bold}Cluster ${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}{/bold}`, + `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg}`, + escapeBlessedText(clusterTitle.title), + '', + 'Select a member to inspect thread details.', + ].join('\n'); } const thread = threadDetail.thread; @@ -968,7 +969,7 @@ export function renderDetailPane( clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` : ''; - const labels = thread.labels.length > 0 ? escapeBlessedText(thread.labels.join(', ')) : 'none'; + const labels = thread.labels.length > 0 ? thread.labels.map((label) => `{cyan-fg}${escapeBlessedText(label)}{/cyan-fg}`).join(' ') : 'none'; const closedLabel = thread.isClosed ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() : '{bold}Closed:{/bold} no'; @@ -986,20 +987,19 @@ export function renderDetailPane( : focusPane === 'detail' ? 'No neighbors available.' : 'Neighbors load when the detail pane is focused.'; + const body = renderMarkdownForTerminal(thread.body ?? '(no body)'); return [ - `{bold}Cluster ${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}{/bold}`, + `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, + `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, '', - `{bold}${thread.kind} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, - '', - `{bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, - closedLabel, - `{bold}Updated:{/bold} ${thread.updatedAtGh ?? 'unknown'}`, + `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, `{bold}Labels:{/bold} ${labels}`, - `{bold}URL:{/bold} ${escapeBlessedText(thread.htmlUrl)}`, + `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, + '', summaries ? `\n\n${summaries}` : '', '', - `{bold}Body{/bold}`, - escapeBlessedText(thread.body ?? '(no body)'), + `{bold}Main{/bold}`, + body, `\n\n{bold}Neighbors{/bold}\n${neighbors}`, ] .filter(Boolean) @@ -1010,6 +1010,95 @@ export function escapeBlessedText(value: string): string { return value.replace(/\\/g, '\\\\').replace(/\{/g, '\\{').replace(/\}/g, '\\}'); } +export function splitClusterDisplayTitle(displayTitle: string): { name: string; title: string } { + const match = displayTitle.match(/^([a-z]+(?:-[a-z]+){2})\s{2,}(.+)$/); + if (match) { + return { name: match[1] ?? 'cluster', title: match[2] ?? displayTitle }; + } + return { name: formatClusterShortName(displayTitle), title: displayTitle || 'Untitled cluster' }; +} + +export function renderMarkdownForTerminal(markdown: string): string { + let inFence = false; + const rendered = markdown.split(/\r?\n/).map((line) => { + if (/^```/.test(line.trim())) { + inFence = !inFence; + return '{gray-fg}--- code ---{/gray-fg}'; + } + if (inFence) { + return `{gray-fg}${escapeBlessedText(line)}{/gray-fg}`; + } + const heading = line.match(/^(#{1,6})\s+(.+)$/); + if (heading) { + return `{bold}${escapeBlessedText(heading[2] ?? '')}{/bold}`; + } + const quote = line.match(/^>\s?(.*)$/); + if (quote) { + return `{gray-fg}> ${renderInlineMarkdown(quote[1] ?? '')}{/gray-fg}`; + } + const listItem = line.match(/^(\s*)([-*+]|\d+[.)])\s+(.+)$/); + if (listItem) { + const indent = listItem[1] ?? ''; + return `${indent}- ${renderInlineMarkdown(listItem[3] ?? '')}`; + } + return renderInlineMarkdown(line); + }); + return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); +} + +type InlineMarkdownSegment = + | { kind: 'text'; value: string } + | { kind: 'link'; label: string; url: string }; + +function renderInlineMarkdown(value: string): string { + const segments: InlineMarkdownSegment[] = []; + const markdownLinkPattern = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g; + let cursor = 0; + + for (const match of value.matchAll(markdownLinkPattern)) { + const index = match.index ?? 0; + if (index > cursor) { + pushBareLinkSegments(value.slice(cursor, index), segments); + } + segments.push({ kind: 'link', label: match[1] ?? '', url: match[2] ?? '' }); + cursor = index + match[0].length; + } + + if (cursor < value.length) { + pushBareLinkSegments(value.slice(cursor), segments); + } + + return segments.map((segment) => (segment.kind === 'link' ? formatTerminalLink(segment.url, segment.label) : renderInlineText(segment.value))).join(''); +} + +function pushBareLinkSegments(value: string, segments: InlineMarkdownSegment[]): void { + const bareLinkPattern = /https?:\/\/[^\s)]+/g; + let cursor = 0; + for (const match of value.matchAll(bareLinkPattern)) { + const index = match.index ?? 0; + if (index > cursor) { + segments.push({ kind: 'text', value: value.slice(cursor, index) }); + } + const url = match[0]; + segments.push({ kind: 'link', label: url, url }); + cursor = index + url.length; + } + if (cursor < value.length) { + segments.push({ kind: 'text', value: value.slice(cursor) }); + } +} + +function renderInlineText(value: string): string { + return escapeBlessedText(value) + .replace(/`([^`]+)`/g, '{yellow-fg}$1{/yellow-fg}') + .replace(/\*\*([^*]+)\*\*/g, '{bold}$1{/bold}'); +} + +function formatTerminalLink(url: string, label: string): string { + const escapedUrl = url.replace(/[\u0007\u001b]/g, ''); + return `\u001b]8;;${escapedUrl}\u0007${escapeBlessedText(label)}\u001b]8;;\u0007`; +} + function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListElement, rect: { top: number; left: number; width: number; height: number }): void { element.top = rect.top; element.left = rect.left; @@ -1276,10 +1365,11 @@ export function parseOwnerRepoValue(value: string): { owner: string; repo: strin } export function formatClusterListLabel(cluster: TuiClusterSummary): string { - const countLabel = `${cluster.totalCount} ${cluster.totalCount === 1 ? 'item' : 'items'}`.padStart(7); - const mixLabel = `${cluster.pullRequestCount}P/${cluster.issueCount}I`.padStart(6); - const updated = formatClusterDateColumn(cluster.latestUpdatedAt); - return `${countLabel} ${formatClusterShortName(cluster.displayTitle).padEnd(24).slice(0, 24)} C${cluster.clusterId} ${mixLabel} ${updated}`; + const countLabel = String(cluster.totalCount).padStart(3); + const mixLabel = `${cluster.issueCount}I/${cluster.pullRequestCount}P`.padStart(7); + const updated = formatRelativeTime(cluster.latestUpdatedAt).padStart(8); + const title = splitClusterDisplayTitle(cluster.displayTitle); + return `${countLabel} ${title.name.padEnd(22).slice(0, 22)} ${title.title.padEnd(56).slice(0, 56)} ${mixLabel} ${updated}`; } export function formatClusterShortName(title: string, maxWords = 3): string { @@ -1333,25 +1423,3 @@ export function formatClusterDateColumn(value: string | null, locales?: Intl.Loc return `${date} ${hour}:${minute}`; } - -function formatRelativeTime(value: string | null, now: Date = new Date()): string { - if (!value) return 'never'; - const parsed = new Date(value); - if (Number.isNaN(parsed.getTime())) return value; - const diffMs = Math.max(0, now.getTime() - parsed.getTime()); - const minuteMs = 60_000; - const hourMs = 60 * minuteMs; - const dayMs = 24 * hourMs; - - if (diffMs < hourMs) { - const minutes = Math.max(1, Math.floor(diffMs / minuteMs)); - return `${minutes}m ago`; - } - if (diffMs < dayMs) { - return `${Math.floor(diffMs / hourMs)}h ago`; - } - if (diffMs < 14 * dayMs) { - return `${Math.floor(diffMs / dayMs)}d ago`; - } - return parsed.toISOString().slice(0, 10); -} diff --git a/apps/cli/src/tui/layout.test.ts b/apps/cli/src/tui/layout.test.ts index 24f12f6..1196d72 100644 --- a/apps/cli/src/tui/layout.test.ts +++ b/apps/cli/src/tui/layout.test.ts @@ -7,8 +7,8 @@ test('computeTuiLayout uses wide mode for large terminals', () => { const layout = computeTuiLayout(160, 40); assert.equal(layout.mode, 'wide-columns'); assert.equal(layout.clusters.top, 1); - assert.equal(layout.footer.top, 35); - assert.equal(layout.footer.height, 5); + assert.equal(layout.footer.top, 38); + assert.equal(layout.footer.height, 2); }); test('computeTuiLayout can stack members and detail on the right in wide mode', () => { diff --git a/apps/cli/src/tui/layout.ts b/apps/cli/src/tui/layout.ts index e7f358a..2ed4e0d 100644 --- a/apps/cli/src/tui/layout.ts +++ b/apps/cli/src/tui/layout.ts @@ -20,7 +20,7 @@ export type TuiLayout = { export function computeTuiLayout(width: number, height: number, wideMode: TuiWideLayoutMode = 'columns'): TuiLayout { const safeWidth = Math.max(60, width); const safeHeight = Math.max(16, height); - const footerHeight = 5; + const footerHeight = 2; const contentTop = 1; const contentHeight = Math.max(6, safeHeight - 1 - footerHeight); const header = { top: 0, left: 0, width: safeWidth, height: 1 }; diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 460611f..8633c6e 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -1,7 +1,7 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { buildMemberRows, cycleFocusPane, cycleMinSizeFilter, cycleSortMode, findSelectableIndex, moveSelectableIndex, preserveSelectedId, applyClusterFilters } from './state.js'; +import { buildMemberRows, cycleFocusPane, cycleMinSizeFilter, cycleSortMode, findSelectableIndex, formatRelativeTime, moveSelectableIndex, preserveSelectedId, applyClusterFilters } from './state.js'; import type { TuiClusterDetail, TuiClusterSummary } from '@ghcrawl/api-core'; test('cycleSortMode toggles size and recent', () => { @@ -23,6 +23,13 @@ test('cycleFocusPane moves forward and backward', () => { assert.equal(cycleFocusPane('clusters', -1), 'detail'); }); +test('formatRelativeTime returns compact human readable ages', () => { + const now = new Date('2026-04-24T12:00:00Z'); + assert.equal(formatRelativeTime('2026-04-24T11:58:00Z', now), '2m ago'); + assert.equal(formatRelativeTime('2026-04-24T06:00:00Z', now), '6h ago'); + assert.equal(formatRelativeTime('2026-04-18T12:00:00Z', now), '6d ago'); +}); + test('applyClusterFilters sorts by recent and size and respects min size/search', () => { const clusters: TuiClusterSummary[] = [ { @@ -123,6 +130,7 @@ test('buildMemberRows groups issues and pull requests and selection skips header const rows = buildMemberRows(detail); assert.equal(rows[0]?.selectable, false); + assert.match(rows[1]?.label ?? '', /#42\s+\d+d ago|#42\s+2026-03-09/); assert.equal(findSelectableIndex(rows, 10), 1); assert.equal(moveSelectableIndex(rows, 1, 1), 3); }); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 404d5b6..fdd3ff1 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -113,11 +113,34 @@ function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sort } function formatMemberLabel(number: number, title: string, updatedAtGh: string | null, isClosed: boolean): string { - const updated = updatedAtGh ? updatedAtGh.slice(5, 16).replace('T', ' ') : 'unknown'; + const updated = formatRelativeTime(updatedAtGh); const label = escapeBlessedInline(`#${number} ${updated} ${title}`); return isClosed ? `{gray-fg}${label}{/gray-fg}` : label; } +export function formatRelativeTime(value: string | null, now: Date = new Date()): string { + if (!value) return 'never'; + const parsed = new Date(value); + if (Number.isNaN(parsed.getTime())) return value; + const diffMs = Math.max(0, now.getTime() - parsed.getTime()); + const minuteMs = 60_000; + const hourMs = 60 * minuteMs; + const dayMs = 24 * hourMs; + + if (diffMs < minuteMs) return 'now'; + if (diffMs < hourMs) { + const minutes = Math.max(1, Math.floor(diffMs / minuteMs)); + return `${minutes}m ago`; + } + if (diffMs < dayMs) { + return `${Math.floor(diffMs / hourMs)}h ago`; + } + if (diffMs < 14 * dayMs) { + return `${Math.floor(diffMs / dayMs)}d ago`; + } + return parsed.toISOString().slice(0, 10); +} + function escapeBlessedInline(value: string): string { return value.replace(/\\/g, '\\\\').replace(/\{/g, '\\{').replace(/\}/g, '\\}'); } From 548b3f184499e366cedb9287ffcad5bd7aaf973e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:20:16 -0700 Subject: [PATCH 089/215] feat(tui): add thread context menu --- apps/cli/src/tui/app.test.ts | 37 +++++++++- apps/cli/src/tui/app.ts | 131 ++++++++++++++++++++++++++++++++++- 2 files changed, 165 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 2725217..e4f177a 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -4,6 +4,7 @@ import assert from 'node:assert/strict'; import type { TuiClusterDetail, TuiThreadDetail } from '@ghcrawl/api-core'; import { + buildThreadContextMenuItems, buildHelpContent, escapeBlessedText, formatClusterDateColumn, @@ -146,6 +147,40 @@ test('renderMarkdownForTerminal formats common markdown without exposing blessed assert.match(rendered, /\x1B\]8;;https:\/\/example\.com\/raw/); }); +test('buildThreadContextMenuItems exposes thread actions for right-click menus', () => { + const items = buildThreadContextMenuItems({ + thread: { + id: 1, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Example', + body: null, + authorLogin: 'dev', + htmlUrl: 'https://example.com/42', + labels: [], + updatedAtGh: '2026-03-09T00:00:00Z', + clusterId: 1, + }, + summaries: {}, + neighbors: [], + }); + + assert.deepEqual( + items.map((item) => item.action), + ['open', 'copy-url', 'copy-title', 'copy-markdown-link', 'load-neighbors', 'close'], + ); +}); + +test('buildThreadContextMenuItems only closes when no thread is selected', () => { + assert.deepEqual(buildThreadContextMenuItems(null), [{ label: 'Close', action: 'close' }]); +}); + test('getRepositoryChoices sorts by most recent update and includes the new-repo action', () => { const service = { listRepositories() { @@ -189,7 +224,7 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /TUI only reads local SQLite/); assert.match(content, /default cluster filter is 1\+/); assert.match(content, /default sort is size/); - assert.match(content, /Mouse clicks focus panes/); + assert.match(content, /right-click threads for actions/); assert.match(content, /p\s+open the repository browser/); assert.match(content, /l\s+toggle wide layout/); assert.match(content, /x\s+show or hide locally closed clusters and members/); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 37d2bd0..5bcdefc 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1,4 +1,4 @@ -import { spawn } from 'node:child_process'; +import { spawn, spawnSync } from 'node:child_process'; import blessed from 'neo-blessed'; @@ -63,6 +63,17 @@ type ThreadDetailCacheEntry = { hasNeighbors: boolean; }; +type MouseEventArg = blessed.Widgets.Events.IMouseEventArg & { + button?: 'left' | 'middle' | 'right' | 'unknown'; +}; + +export type ThreadContextAction = 'open' | 'copy-url' | 'copy-title' | 'copy-markdown-link' | 'load-neighbors' | 'close'; + +export type ThreadContextMenuItem = { + label: string; + action: ThreadContextAction; +}; + export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { const term = env.TERM; if (!term) { @@ -524,6 +535,73 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; + const openThreadContextMenu = (event?: MouseEventArg): void => { + if (modalOpen || !threadDetail) { + return; + } + modalOpen = true; + const items = buildThreadContextMenuItems(threadDetail); + const width = 30; + const height = items.length + 2; + const screenWidth = Number(widgets.screen.width); + const screenHeight = Number(widgets.screen.height); + const left = Math.max(0, Math.min((event?.x ?? Math.floor(screenWidth * 0.72)) - 1, screenWidth - width)); + const top = Math.max(0, Math.min((event?.y ?? Math.floor(screenHeight * 0.35)) - 1, screenHeight - height)); + const menu = blessed.list({ + parent: widgets.screen, + border: 'line', + label: ' Thread ', + top, + left, + width, + height, + tags: true, + keys: true, + mouse: true, + items: items.map((item) => item.label), + style: { + border: { fg: '#fde74c' }, + selected: { bg: '#f7f7ff', fg: 'black', bold: true }, + item: { fg: 'white' }, + bg: '#101522', + }, + }); + + const closeMenu = (): void => { + menu.destroy(); + modalOpen = false; + render(); + }; + const runAction = (action: ThreadContextAction): void => { + const selectedThread = threadDetail?.thread; + if (!selectedThread) { + closeMenu(); + return; + } + if (action === 'open') { + openUrl(selectedThread.htmlUrl); + status = `Opened ${selectedThread.htmlUrl}`; + } else if (action === 'copy-url') { + status = copyTextToClipboard(selectedThread.htmlUrl) ? 'Copied URL' : 'Clipboard copy failed'; + } else if (action === 'copy-title') { + status = copyTextToClipboard(`#${selectedThread.number} ${selectedThread.title}`) ? 'Copied title' : 'Clipboard copy failed'; + } else if (action === 'copy-markdown-link') { + const markdownLink = `[#${selectedThread.number} ${selectedThread.title}](${selectedThread.htmlUrl})`; + status = copyTextToClipboard(markdownLink) ? 'Copied markdown link' : 'Clipboard copy failed'; + } else if (action === 'load-neighbors') { + loadSelectedThreadDetail(true); + status = `Loaded neighbors for #${threadDetail?.thread.number ?? selectedThread.number}`; + focusPane = 'detail'; + } + closeMenu(); + }; + + menu.key(['escape', 'q'], closeMenu); + menu.on('select', (_item, index) => runAction(items[Number(index)]?.action ?? 'close')); + menu.focus(); + widgets.screen.render(); + }; + const openHelp = (): void => { if (modalOpen) return; void (async () => { @@ -835,10 +913,31 @@ export async function startTui(params: StartTuiParams): Promise { status = selectedMemberThreadId !== null ? `Loaded neighbors for #${threadDetail?.thread.number ?? '?'}` : status; updateFocus('detail'); }); + widgets.members.on('mousedown', (event: MouseEventArg) => { + if (isRendering || modalOpen || event.button !== 'right') return; + focusPane = 'members'; + widgets.members.focus(); + const itemIndex = Number(event.y) - Number(widgets.members.atop) - 2 + Number(widgets.members.getScroll()); + const row = Number.isInteger(itemIndex) && itemIndex >= 0 && itemIndex < memberRows.length ? memberRows[itemIndex] : null; + if (!row?.selectable) { + status = 'Right-click a thread row'; + render(); + return; + } + if (row.threadId !== selectedMemberThreadId) { + selectMemberIndex(itemIndex); + } + openThreadContextMenu(event); + }); widgets.detail.on('click', () => { if (modalOpen) return; updateFocus('detail'); }); + widgets.detail.on('mousedown', (event: MouseEventArg) => { + if (modalOpen || event.button !== 'right') return; + updateFocus('detail'); + openThreadContextMenu(event); + }); widgets.screen.on('resize', () => render()); widgets.screen.on('destroy', () => { @@ -1099,6 +1198,20 @@ function formatTerminalLink(url: string, label: string): string { return `\u001b]8;;${escapedUrl}\u0007${escapeBlessedText(label)}\u001b]8;;\u0007`; } +export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null): ThreadContextMenuItem[] { + if (!threadDetail) { + return [{ label: 'Close', action: 'close' }]; + } + return [ + { label: 'Open in browser', action: 'open' }, + { label: 'Copy URL', action: 'copy-url' }, + { label: 'Copy title', action: 'copy-title' }, + { label: 'Copy Markdown link', action: 'copy-markdown-link' }, + { label: 'Load neighbors', action: 'load-neighbors' }, + { label: 'Close', action: 'close' }, + ]; +} + function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListElement, rect: { top: number; left: number; width: number; height: number }): void { element.top = rect.top; element.left = rect.left; @@ -1121,6 +1234,20 @@ function openUrl(url: string): void { child.unref(); } +function copyTextToClipboard(value: string): boolean { + const copyCommand = + process.platform === 'darwin' + ? { command: 'pbcopy', args: [] } + : process.platform === 'win32' + ? { command: 'clip', args: [] } + : { command: 'xclip', args: ['-selection', 'clipboard'] }; + const result = spawnSync(copyCommand.command, copyCommand.args, { + input: value, + stdio: ['pipe', 'ignore', 'ignore'], + }); + return result.status === 0; +} + export function buildHelpContent(): string { return [ '{bold}ghcrawl TUI Help{/bold}', @@ -1130,7 +1257,7 @@ export function buildHelpContent(): string { 'Left / Right cycle focus backward or forward across panes', 'Up / Down move selection, or scroll detail when detail is focused', 'Enter clusters -> members, members -> detail', - 'Mouse click a pane or row to focus/select; wheel scrolls lists and detail', + 'Mouse click to focus/select; right-click threads for actions; wheel scrolls lists and detail', 'PgUp / PgDn page through the focused pane or this help popup faster', 'Home / End jump to the top or bottom of detail or help', '', From 144e0a50001427e0c9ce0995db25d1c66d0bcde9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:21:17 -0700 Subject: [PATCH 090/215] fix(tui): render links plainly --- apps/cli/src/tui/app.test.ts | 5 +++-- apps/cli/src/tui/app.ts | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index e4f177a..c6b8bf2 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -143,8 +143,9 @@ test('renderMarkdownForTerminal formats common markdown without exposing blessed assert.match(rendered, /\{bold\}Heading \\{boom\\}\{\/bold\}/); assert.match(rendered, /- \{bold\}bold\{\/bold\} and \{yellow-fg\}code\{\/yellow-fg\}/); - assert.match(rendered, /\x1B\]8;;https:\/\/example\.com\/path/); - assert.match(rendered, /\x1B\]8;;https:\/\/example\.com\/raw/); + assert.match(rendered, /site /); + assert.match(rendered, /https:\/\/example\.com\/raw/); + assert.doesNotMatch(rendered, /\x1B\]8;;/); }); test('buildThreadContextMenuItems exposes thread actions for right-click menus', () => { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 5bcdefc..e8ab6aa 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1194,8 +1194,14 @@ function renderInlineText(value: string): string { } function formatTerminalLink(url: string, label: string): string { - const escapedUrl = url.replace(/[\u0007\u001b]/g, ''); - return `\u001b]8;;${escapedUrl}\u0007${escapeBlessedText(label)}\u001b]8;;\u0007`; + const safeUrl = stripTerminalControls(url); + const safeLabel = stripTerminalControls(label); + const visibleLink = safeLabel && safeLabel !== safeUrl ? `${safeLabel} <${safeUrl}>` : safeUrl; + return escapeBlessedText(visibleLink); +} + +function stripTerminalControls(value: string): string { + return value.replace(/[\u0000-\u001f\u007f]/g, ''); } export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null): ThreadContextMenuItem[] { From 67559a41200c06f5977268310ed32406f3996e20 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:34:41 -0700 Subject: [PATCH 091/215] fix(tui): improve pane mouse interactions --- apps/cli/src/tui/app.test.ts | 16 +- apps/cli/src/tui/app.ts | 260 +++++++++++++++++++++++++-------- apps/cli/src/tui/state.test.ts | 9 +- apps/cli/src/tui/state.ts | 22 ++- 4 files changed, 240 insertions(+), 67 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index c6b8bf2..958703e 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -8,6 +8,7 @@ import { buildHelpContent, escapeBlessedText, formatClusterDateColumn, + formatClusterListHeader, formatClusterListLabel, formatClusterShortName, getRepositoryChoices, @@ -83,6 +84,14 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.ok(rendered.indexOf('LLM Summary:') < rendered.indexOf('{bold}Main{/bold}')); }); +test('renderDetailPane gives useful empty detail content before a cluster is selected', () => { + const rendered = renderDetailPane(null, null, 'clusters'); + + assert.match(rendered, /No repository selected/); + assert.match(rendered, /s sort/); + assert.match(rendered, /right-click any pane/); +}); + test('parseOwnerRepoValue accepts owner slash repo values and rejects invalid ones', () => { assert.deepEqual(parseOwnerRepoValue('openclaw/openclaw'), { owner: 'openclaw', repo: 'openclaw' }); assert.equal(parseOwnerRepoValue('openclaw'), null); @@ -122,6 +131,11 @@ test('formatClusterListLabel keeps counts first and splits cluster name from tit assert.doesNotMatch(label, /items/); }); +test('formatClusterListHeader marks the active clickable sort column', () => { + assert.match(formatClusterListHeader('size'), /cnt↓/); + assert.match(formatClusterListHeader('recent'), /updated↓/); +}); + test('formatClusterShortName returns the first meaningful words', () => { assert.equal(formatClusterShortName('[codex] fix agent session-id routing'), 'agent session-id routing'); assert.equal(formatClusterShortName('fix(agents): exclude volatile inbound metadata'), 'agents exclude volatile'); @@ -225,7 +239,7 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /TUI only reads local SQLite/); assert.match(content, /default cluster filter is 1\+/); assert.match(content, /default sort is size/); - assert.match(content, /right-click threads for actions/); + assert.match(content, /right-click opens pane actions/); assert.match(content, /p\s+open the repository browser/); assert.match(content, /l\s+toggle wide layout/); assert.match(content, /x\s+show or hide locally closed clusters and members/); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index e8ab6aa..ed03133 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -74,6 +74,11 @@ export type ThreadContextMenuItem = { action: ThreadContextAction; }; +type ContextMenuItem = { + label: string; + run: () => boolean | void; +}; + export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { const term = env.TERM; if (!term) { @@ -94,6 +99,8 @@ function createScreen(options: Parameters[0]): blessed.Wi const ACTIVITY_LOG_LIMIT = 200; const FOOTER_LOG_LINES = 1; +const CLUSTER_LIST_HEADER_INDEX = 0; +const CLUSTER_LIST_FIRST_ITEM_INDEX = 1; export async function startTui(params: StartTuiParams): Promise { const selectedRepository = params.owner && params.repo ? { owner: params.owner, repo: params.repo } : null; @@ -124,6 +131,7 @@ export async function startTui(params: StartTuiParams): Promise { const clusterDetailCache = new Map(); const threadDetailCache = new Map(); let modalOpen = false; + let suppressNextClusterSelect = false; const clearCaches = (): void => { clusterDetailCache.clear(); @@ -141,11 +149,12 @@ export async function startTui(params: StartTuiParams): Promise { } clusterIndexById = new Map(); - clusterItems = snapshot.clusters.map((cluster, index) => { - clusterIndexById.set(cluster.clusterId, index); + clusterItems = [`{bold}${formatClusterListHeader(sortMode)}{/bold}`]; + clusterItems.push(...snapshot.clusters.map((cluster, index) => { + clusterIndexById.set(cluster.clusterId, index + CLUSTER_LIST_FIRST_ITEM_INDEX); const label = formatClusterListLabel(cluster); return cluster.isClosed ? `{gray-fg}${escapeBlessedText(label)}{/gray-fg}` : escapeBlessedText(label); - }); + })); widgets.clusters.setItems(clusterItems); }; @@ -328,7 +337,8 @@ export async function startTui(params: StartTuiParams): Promise { isRendering = true; try { - const clusterIndex = snapshot && selectedClusterId !== null ? Math.max(0, clusterIndexById.get(selectedClusterId) ?? -1) : 0; + const clusterIndex = + snapshot && selectedClusterId !== null ? Math.max(CLUSTER_LIST_FIRST_ITEM_INDEX, clusterIndexById.get(selectedClusterId) ?? -1) : CLUSTER_LIST_HEADER_INDEX; widgets.clusters.select(clusterIndex); widgets.members.setItems(memberRows.length > 0 ? memberRows.map((row) => row.label) : ['No members']); @@ -339,7 +349,7 @@ export async function startTui(params: StartTuiParams): Promise { isRendering = false; } - widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane)); + widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane, snapshot)); updatePaneStyles(widgets, focusPane); const footerLines = [ activityLines.at(-1) ?? status, @@ -365,12 +375,16 @@ export async function startTui(params: StartTuiParams): Promise { const wrap = options?.wrap ?? true; if (focusPane === 'clusters') { if (snapshot.clusters.length === 0) return; - const currentIndex = Math.max(0, selectedClusterId === null ? -1 : (clusterIndexById.get(selectedClusterId) ?? -1)); + const currentIndex = Math.max( + CLUSTER_LIST_FIRST_ITEM_INDEX, + selectedClusterId === null ? CLUSTER_LIST_FIRST_ITEM_INDEX : (clusterIndexById.get(selectedClusterId) ?? CLUSTER_LIST_FIRST_ITEM_INDEX), + ); let nextIndex = currentIndex + delta * steps; if (wrap) { - nextIndex = ((nextIndex % snapshot.clusters.length) + snapshot.clusters.length) % snapshot.clusters.length; + const relativeIndex = nextIndex - CLUSTER_LIST_FIRST_ITEM_INDEX; + nextIndex = ((relativeIndex % snapshot.clusters.length) + snapshot.clusters.length) % snapshot.clusters.length + CLUSTER_LIST_FIRST_ITEM_INDEX; } else { - nextIndex = Math.max(0, Math.min(snapshot.clusters.length - 1, nextIndex)); + nextIndex = Math.max(CLUSTER_LIST_FIRST_ITEM_INDEX, Math.min(snapshot.clusters.length, nextIndex)); } selectClusterIndex(nextIndex); return; @@ -403,10 +417,34 @@ export async function startTui(params: StartTuiParams): Promise { moveSelection(delta, { steps: getFocusedListPageSize(), wrap: false }); }; + const setSortMode = (nextSortMode: TuiClusterSortMode): void => { + if (sortMode === nextSortMode) { + return; + } + sortMode = nextSortMode; + persistRepositoryPreference(); + status = `Sort: ${sortMode}`; + refreshAll(true); + }; + + const toggleSortMode = (): void => { + setSortMode(cycleSortMode(sortMode)); + }; + + const toggleClosedVisibility = (): void => { + showClosed = !showClosed; + status = showClosed ? 'Showing closed clusters and members' : 'Hiding closed clusters and members'; + refreshAll(true); + }; + const selectClusterIndex = (nextIndex: number): void => { if (!snapshot || snapshot.clusters.length === 0) return; - const boundedIndex = Math.max(0, Math.min(snapshot.clusters.length - 1, nextIndex)); - selectedClusterId = snapshot.clusters[boundedIndex]?.clusterId ?? null; + if (nextIndex === CLUSTER_LIST_HEADER_INDEX) { + toggleSortMode(); + return; + } + const snapshotIndex = Math.max(0, Math.min(snapshot.clusters.length - 1, nextIndex - CLUSTER_LIST_FIRST_ITEM_INDEX)); + selectedClusterId = snapshot.clusters[snapshotIndex]?.clusterId ?? null; if (selectedClusterId !== null) { try { clusterDetail = loadClusterDetail(selectedClusterId); @@ -426,8 +464,8 @@ export async function startTui(params: StartTuiParams): Promise { } status = selectedClusterId !== null - ? `Cluster ${selectedClusterId} (${boundedIndex + 1}/${snapshot.clusters.length})` - : `Cluster ${boundedIndex + 1}/${snapshot.clusters.length}`; + ? `Cluster ${selectedClusterId} (${snapshotIndex + 1}/${snapshot.clusters.length})` + : `Cluster ${snapshotIndex + 1}/${snapshot.clusters.length}`; render(); }; @@ -535,13 +573,12 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; - const openThreadContextMenu = (event?: MouseEventArg): void => { - if (modalOpen || !threadDetail) { + const openContextMenu = (label: string, items: ContextMenuItem[], event?: MouseEventArg): void => { + if (modalOpen || items.length === 0) { return; } modalOpen = true; - const items = buildThreadContextMenuItems(threadDetail); - const width = 30; + const width = Math.max(26, Math.min(42, Math.max(...items.map((item) => item.label.length)) + 4)); const height = items.length + 2; const screenWidth = Number(widgets.screen.width); const screenHeight = Number(widgets.screen.height); @@ -550,7 +587,6 @@ export async function startTui(params: StartTuiParams): Promise { const menu = blessed.list({ parent: widgets.screen, border: 'line', - label: ' Thread ', top, left, width, @@ -558,6 +594,7 @@ export async function startTui(params: StartTuiParams): Promise { tags: true, keys: true, mouse: true, + label: ` ${label} `, items: items.map((item) => item.label), style: { border: { fg: '#fde74c' }, @@ -572,36 +609,92 @@ export async function startTui(params: StartTuiParams): Promise { modalOpen = false; render(); }; - const runAction = (action: ThreadContextAction): void => { - const selectedThread = threadDetail?.thread; - if (!selectedThread) { - closeMenu(); - return; - } - if (action === 'open') { - openUrl(selectedThread.htmlUrl); - status = `Opened ${selectedThread.htmlUrl}`; - } else if (action === 'copy-url') { - status = copyTextToClipboard(selectedThread.htmlUrl) ? 'Copied URL' : 'Clipboard copy failed'; - } else if (action === 'copy-title') { - status = copyTextToClipboard(`#${selectedThread.number} ${selectedThread.title}`) ? 'Copied title' : 'Clipboard copy failed'; - } else if (action === 'copy-markdown-link') { - const markdownLink = `[#${selectedThread.number} ${selectedThread.title}](${selectedThread.htmlUrl})`; - status = copyTextToClipboard(markdownLink) ? 'Copied markdown link' : 'Clipboard copy failed'; - } else if (action === 'load-neighbors') { - loadSelectedThreadDetail(true); - status = `Loaded neighbors for #${threadDetail?.thread.number ?? selectedThread.number}`; - focusPane = 'detail'; - } - closeMenu(); - }; - menu.key(['escape', 'q'], closeMenu); - menu.on('select', (_item, index) => runAction(items[Number(index)]?.action ?? 'close')); + menu.on('select', (_item, index) => { + const item = items[Number(index)]; + closeMenu(); + const shouldRender = item?.run(); + if (shouldRender !== false) { + render(); + } + }); menu.focus(); widgets.screen.render(); }; + const threadContextItems = (): ContextMenuItem[] => { + const selectedThread = threadDetail?.thread; + if (!selectedThread) { + return [{ label: 'Close', run: () => undefined }]; + } + return buildThreadContextMenuItems(threadDetail).map((item) => ({ + label: item.label, + run: () => { + if (item.action === 'open') { + openUrl(selectedThread.htmlUrl); + status = `Opened ${selectedThread.htmlUrl}`; + } else if (item.action === 'copy-url') { + status = copyTextToClipboard(selectedThread.htmlUrl) ? 'Copied URL' : 'Clipboard copy failed'; + } else if (item.action === 'copy-title') { + status = copyTextToClipboard(`#${selectedThread.number} ${selectedThread.title}`) ? 'Copied title' : 'Clipboard copy failed'; + } else if (item.action === 'copy-markdown-link') { + const markdownLink = `[#${selectedThread.number} ${selectedThread.title}](${selectedThread.htmlUrl})`; + status = copyTextToClipboard(markdownLink) ? 'Copied markdown link' : 'Clipboard copy failed'; + } else if (item.action === 'load-neighbors') { + loadSelectedThreadDetail(true); + status = `Loaded neighbors for #${threadDetail?.thread.number ?? selectedThread.number}`; + focusPane = 'detail'; + } + }, + })); + }; + + const clusterContextItems = (): ContextMenuItem[] => { + const selectedCluster = clusterDetail; + const title = selectedCluster ? splitClusterDisplayTitle(selectedCluster.displayTitle) : null; + return [ + ...(selectedCluster + ? [ + { label: 'Focus members', run: () => updateFocus('members') }, + { + label: 'Copy cluster id', + run: () => { + status = copyTextToClipboard(String(selectedCluster.clusterId)) ? `Copied cluster ${selectedCluster.clusterId}` : 'Clipboard copy failed'; + }, + }, + { + label: 'Copy cluster title', + run: () => { + status = copyTextToClipboard(title?.title ?? selectedCluster.displayTitle) ? 'Copied cluster title' : 'Clipboard copy failed'; + }, + }, + ] + : []), + { label: 'Sort by size', run: () => setSortMode('size') }, + { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, + { label: 'Filter clusters', run: promptFilter }, + { label: 'Refresh', run: () => refreshAll(true) }, + { label: 'Help', run: openHelp }, + ]; + }; + + const globalContextItems = (): ContextMenuItem[] => [ + { label: 'Refresh', run: () => refreshAll(true) }, + { label: 'Repository browser', run: browseRepositories }, + { label: 'Sort by size', run: () => setSortMode('size') }, + { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, + { label: 'Help', run: openHelp }, + { + label: 'Quit', + run: () => { + requestQuit(); + return false; + }, + }, + ]; + const openHelp = (): void => { if (modalOpen) return; void (async () => { @@ -844,10 +937,7 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.screen.key(['s'], () => { if (modalOpen) return; - sortMode = cycleSortMode(sortMode); - persistRepositoryPreference(); - status = `Sort: ${sortMode}`; - refreshAll(false); + toggleSortMode(); }); widgets.screen.key(['f'], () => { if (modalOpen) return; @@ -865,9 +955,7 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.screen.key(['x'], () => { if (modalOpen) return; - showClosed = !showClosed; - status = showClosed ? 'Showing closed clusters and members' : 'Hiding closed clusters and members'; - refreshAll(true); + toggleClosedVisibility(); }); widgets.screen.key(['/'], () => { if (modalOpen) return; @@ -893,6 +981,10 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.clusters.on('select item', (_item, index) => { if (isRendering || modalOpen) return; + if (suppressNextClusterSelect) { + suppressNextClusterSelect = false; + return; + } focusPane = 'clusters'; widgets.clusters.focus(); selectClusterIndex(Number(index)); @@ -901,6 +993,25 @@ export async function startTui(params: StartTuiParams): Promise { if (isRendering || modalOpen) return; updateFocus('members'); }); + widgets.clusters.on('mousedown', (event: MouseEventArg) => { + if (isRendering || modalOpen) return; + const itemIndex = getListItemIndexFromMouse(widgets.clusters, event); + if (event.button === 'left' && itemIndex === CLUSTER_LIST_HEADER_INDEX) { + suppressNextClusterSelect = true; + const relativeX = Number(event.x) - Number(widgets.clusters.aleft) - 2; + setSortMode(relativeX <= 5 ? 'size' : relativeX >= 88 ? 'recent' : cycleSortMode(sortMode)); + return; + } + if (event.button !== 'right') return; + focusPane = 'clusters'; + widgets.clusters.focus(); + if (itemIndex !== null && itemIndex >= CLUSTER_LIST_FIRST_ITEM_INDEX) { + selectClusterIndex(itemIndex); + } else { + render(); + } + openContextMenu('Cluster', clusterContextItems(), event); + }); widgets.members.on('select item', (_item, index) => { if (isRendering || modalOpen) return; focusPane = 'members'; @@ -917,17 +1028,16 @@ export async function startTui(params: StartTuiParams): Promise { if (isRendering || modalOpen || event.button !== 'right') return; focusPane = 'members'; widgets.members.focus(); - const itemIndex = Number(event.y) - Number(widgets.members.atop) - 2 + Number(widgets.members.getScroll()); - const row = Number.isInteger(itemIndex) && itemIndex >= 0 && itemIndex < memberRows.length ? memberRows[itemIndex] : null; + const itemIndex = getListItemIndexFromMouse(widgets.members, event); + const row = itemIndex !== null && itemIndex >= 0 && itemIndex < memberRows.length ? memberRows[itemIndex] : null; if (!row?.selectable) { - status = 'Right-click a thread row'; - render(); + openContextMenu('Members', clusterContextItems(), event); return; } if (row.threadId !== selectedMemberThreadId) { - selectMemberIndex(itemIndex); + selectMemberIndex(itemIndex ?? 0); } - openThreadContextMenu(event); + openContextMenu('Thread', threadContextItems(), event); }); widgets.detail.on('click', () => { if (modalOpen) return; @@ -936,7 +1046,15 @@ export async function startTui(params: StartTuiParams): Promise { widgets.detail.on('mousedown', (event: MouseEventArg) => { if (modalOpen || event.button !== 'right') return; updateFocus('detail'); - openThreadContextMenu(event); + openContextMenu(threadDetail ? 'Thread' : clusterDetail ? 'Cluster' : 'ghcrawl', threadDetail ? threadContextItems() : clusterDetail ? clusterContextItems() : globalContextItems(), event); + }); + widgets.header.on('mousedown', (event: MouseEventArg) => { + if (modalOpen || event.button !== 'right') return; + openContextMenu('ghcrawl', globalContextItems(), event); + }); + widgets.footer.on('mousedown', (event: MouseEventArg) => { + if (modalOpen || event.button !== 'right') return; + openContextMenu('ghcrawl', globalContextItems(), event); }); widgets.screen.on('resize', () => render()); @@ -973,6 +1091,7 @@ function createWidgets(owner: string, repo: string): Widgets { const header = blessed.box({ parent: screen, tags: true, + mouse: true, style: { fg: 'white', bg: '#0d1321' }, }); const clusters = blessed.list({ @@ -1021,6 +1140,7 @@ function createWidgets(owner: string, repo: string): Widgets { const footer = blessed.box({ parent: screen, tags: false, + mouse: true, style: { fg: 'black', bg: '#5bc0eb' }, }); @@ -1044,9 +1164,20 @@ export function renderDetailPane( threadDetail: TuiThreadDetail | null, clusterDetail: TuiClusterDetail | null, focusPane: TuiFocusPane, + snapshot?: TuiSnapshot | null, ): string { if (!clusterDetail) { - return 'No cluster selected.\n\nRun `ghcrawl cluster owner/repo` if you have not clustered this repository yet.'; + const repoLabel = snapshot?.repository.fullName ?? 'No repository selected'; + const clusterCount = snapshot?.clusters.length ?? 0; + return [ + `{bold}${escapeBlessedText(repoLabel)}{/bold}`, + '', + clusterCount > 0 ? `${clusterCount} clusters loaded. Click a cluster or press Enter to inspect members.` : 'No clusters visible in this view.', + '', + `{bold}Controls{/bold}`, + 's sort f min size / filter x closed r refresh', + 'right-click any pane for actions', + ].join('\n'); } const clusterTitle = splitClusterDisplayTitle(clusterDetail.displayTitle); if (!threadDetail) { @@ -1225,6 +1356,11 @@ function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListEle element.height = rect.height; } +function getListItemIndexFromMouse(list: blessed.Widgets.ListElement, event: MouseEventArg): number | null { + const itemIndex = Number(event.y) - Number(list.atop) - 2 + Number(list.getScroll()); + return Number.isInteger(itemIndex) ? itemIndex : null; +} + function openUrl(url: string): void { const launch = process.platform === 'darwin' @@ -1263,7 +1399,7 @@ export function buildHelpContent(): string { 'Left / Right cycle focus backward or forward across panes', 'Up / Down move selection, or scroll detail when detail is focused', 'Enter clusters -> members, members -> detail', - 'Mouse click to focus/select; right-click threads for actions; wheel scrolls lists and detail', + 'Mouse click to focus/select; click cluster header to sort; right-click opens pane actions; wheel scrolls', 'PgUp / PgDn page through the focused pane or this help popup faster', 'Home / End jump to the top or bottom of detail or help', '', @@ -1289,7 +1425,7 @@ export function buildHelpContent(): string { 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', 'The default sort is size. Press s to toggle size and recent.', - 'Mouse clicks focus panes; clicking an already selected row advances to the next pane.', + 'Mouse clicks focus panes; clicking an already selected row advances to the next pane. Right-click works on every pane.', 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', 'The footer only shows the short command list. Open help to see the full list.', 'This popup scrolls. Use arrows, PgUp/PgDn, Home, and End if it does not fit.', @@ -1505,6 +1641,12 @@ export function formatClusterListLabel(cluster: TuiClusterSummary): string { return `${countLabel} ${title.name.padEnd(22).slice(0, 22)} ${title.title.padEnd(56).slice(0, 56)} ${mixLabel} ${updated}`; } +export function formatClusterListHeader(sortMode: TuiClusterSortMode): string { + const countLabel = (sortMode === 'size' ? 'cnt↓' : 'cnt').padStart(3); + const updated = (sortMode === 'recent' ? 'updated↓' : 'updated').padStart(8); + return `${countLabel} ${'cluster'.padEnd(22)} ${'title'.padEnd(56)} ${'mix'.padStart(7)} ${updated}`; +} + export function formatClusterShortName(title: string, maxWords = 3): string { const words = title .replace(/[\[\]{}()<>]/g, ' ') diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 8633c6e..0ff891a 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -28,6 +28,8 @@ test('formatRelativeTime returns compact human readable ages', () => { assert.equal(formatRelativeTime('2026-04-24T11:58:00Z', now), '2m ago'); assert.equal(formatRelativeTime('2026-04-24T06:00:00Z', now), '6h ago'); assert.equal(formatRelativeTime('2026-04-18T12:00:00Z', now), '6d ago'); + assert.equal(formatRelativeTime('2026-03-12T12:00:00Z', now), '43d ago'); + assert.equal(formatRelativeTime('2026-01-12T12:00:00Z', now), '3mo ago'); }); test('applyClusterFilters sorts by recent and size and respects min size/search', () => { @@ -118,8 +120,8 @@ test('buildMemberRows groups issues and pull requests and selection skips header id: 11, number: 43, kind: 'pull_request', - isClosed: false, - title: 'PR one', + isClosed: true, + title: '[Bug]: PR one', updatedAtGh: '2026-03-09T10:00:00Z', htmlUrl: 'https://example.com/43', labels: ['bug'], @@ -130,7 +132,8 @@ test('buildMemberRows groups issues and pull requests and selection skips header const rows = buildMemberRows(detail); assert.equal(rows[0]?.selectable, false); - assert.match(rows[1]?.label ?? '', /#42\s+\d+d ago|#42\s+2026-03-09/); + assert.match(rows[1]?.label ?? '', /#42\s+\d+d ago/); + assert.match(rows[3]?.label ?? '', /closed\s+\d+d ago\s+Bug: PR one/); assert.equal(findSelectableIndex(rows, 10), 1); assert.equal(moveSelectableIndex(rows, 1, 1), 3); }); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index fdd3ff1..0089357 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -5,7 +5,7 @@ export type TuiMinSizeFilter = 0 | 1 | 2 | 10 | 20 | 50; export type MemberListRow = | { key: string; label: string; selectable: false } - | { key: string; label: string; selectable: true; threadId: number }; + | { key: string; label: string; selectable: true; threadId: number; isClosed: boolean; kind: 'issue' | 'pull_request' }; export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['size', 'recent']; export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 2, 10, 20, 50, 0]; @@ -62,6 +62,8 @@ export function buildMemberRows(detail: TuiClusterDetail | null, options?: { inc label: formatMemberLabel(issue.number, issue.title, issue.updatedAtGh, issue.isClosed), selectable: true, threadId: issue.id, + isClosed: issue.isClosed, + kind: issue.kind, }); } } @@ -74,6 +76,8 @@ export function buildMemberRows(detail: TuiClusterDetail | null, options?: { inc label: formatMemberLabel(pullRequest.number, pullRequest.title, pullRequest.updatedAtGh, pullRequest.isClosed), selectable: true, threadId: pullRequest.id, + isClosed: pullRequest.isClosed, + kind: pullRequest.kind, }); } } @@ -114,7 +118,8 @@ function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sort function formatMemberLabel(number: number, title: string, updatedAtGh: string | null, isClosed: boolean): string { const updated = formatRelativeTime(updatedAtGh); - const label = escapeBlessedInline(`#${number} ${updated} ${title}`); + const status = isClosed ? 'closed ' : ''; + const label = escapeBlessedInline(`#${number} ${status}${updated} ${normalizeMemberTitle(title)}`); return isClosed ? `{gray-fg}${label}{/gray-fg}` : label; } @@ -135,10 +140,19 @@ export function formatRelativeTime(value: string | null, now: Date = new Date()) if (diffMs < dayMs) { return `${Math.floor(diffMs / hourMs)}h ago`; } - if (diffMs < 14 * dayMs) { + if (diffMs < 60 * dayMs) { return `${Math.floor(diffMs / dayMs)}d ago`; } - return parsed.toISOString().slice(0, 10); + const monthMs = 30 * dayMs; + const yearMs = 365 * dayMs; + if (diffMs < 2 * yearMs) { + return `${Math.max(1, Math.floor(diffMs / monthMs))}mo ago`; + } + return `${Math.max(1, Math.floor(diffMs / yearMs))}y ago`; +} + +function normalizeMemberTitle(title: string): string { + return title.replace(/^\[([^\]]{1,30})\]:?\s+/, '$1: '); } function escapeBlessedInline(value: string): string { From 1706b650db55c1baffd45205ef2b4a5341185903 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:42:48 -0700 Subject: [PATCH 092/215] fix(tui): align member list columns --- apps/cli/src/tui/state.test.ts | 26 +++++++++++++++++++++----- apps/cli/src/tui/state.ts | 12 +++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 0ff891a..c35f017 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -1,7 +1,18 @@ import test from 'node:test'; import assert from 'node:assert/strict'; -import { buildMemberRows, cycleFocusPane, cycleMinSizeFilter, cycleSortMode, findSelectableIndex, formatRelativeTime, moveSelectableIndex, preserveSelectedId, applyClusterFilters } from './state.js'; +import { + applyClusterFilters, + buildMemberRows, + cycleFocusPane, + cycleMinSizeFilter, + cycleSortMode, + findSelectableIndex, + formatMemberListHeader, + formatRelativeTime, + moveSelectableIndex, + preserveSelectedId, +} from './state.js'; import type { TuiClusterDetail, TuiClusterSummary } from '@ghcrawl/api-core'; test('cycleSortMode toggles size and recent', () => { @@ -132,8 +143,13 @@ test('buildMemberRows groups issues and pull requests and selection skips header const rows = buildMemberRows(detail); assert.equal(rows[0]?.selectable, false); - assert.match(rows[1]?.label ?? '', /#42\s+\d+d ago/); - assert.match(rows[3]?.label ?? '', /closed\s+\d+d ago\s+Bug: PR one/); - assert.equal(findSelectableIndex(rows, 10), 1); - assert.equal(moveSelectableIndex(rows, 1, 1), 3); + assert.match(rows[0]?.label ?? '', /number\s+state\s+updated\s+title/); + assert.match(rows[2]?.label ?? '', /#42\s+open\s+\d+d ago\s+Issue one/); + assert.match(rows[4]?.label ?? '', /closed\s+\d+d ago\s+Bug: PR one/); + assert.equal(findSelectableIndex(rows, 10), 2); + assert.equal(moveSelectableIndex(rows, 2, 1), 4); +}); + +test('formatMemberListHeader aligns the member table columns', () => { + assert.equal(formatMemberListHeader(), 'number state updated title'); }); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 0089357..441e45a 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -52,7 +52,7 @@ export function buildMemberRows(detail: TuiClusterDetail | null, options?: { inc const visibleMembers = includeClosedMembers ? detail.members : detail.members.filter((member) => !member.isClosed); const issues = visibleMembers.filter((member) => member.kind === 'issue'); const pullRequests = visibleMembers.filter((member) => member.kind === 'pull_request'); - const rows: MemberListRow[] = []; + const rows: MemberListRow[] = [{ key: 'members-table-header', label: `{bold}${formatMemberListHeader()}{/bold}`, selectable: false }]; if (issues.length > 0) { rows.push({ key: 'issues-header', label: `ISSUES (${issues.length})`, selectable: false }); @@ -118,11 +118,17 @@ function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sort function formatMemberLabel(number: number, title: string, updatedAtGh: string | null, isClosed: boolean): string { const updated = formatRelativeTime(updatedAtGh); - const status = isClosed ? 'closed ' : ''; - const label = escapeBlessedInline(`#${number} ${status}${updated} ${normalizeMemberTitle(title)}`); + const numberLabel = `#${number}`.padEnd(8).slice(0, 8); + const status = (isClosed ? 'closed' : 'open').padEnd(7); + const age = updated.padEnd(8).slice(0, 8); + const label = escapeBlessedInline(`${numberLabel}${status}${age}${normalizeMemberTitle(title)}`); return isClosed ? `{gray-fg}${label}{/gray-fg}` : label; } +export function formatMemberListHeader(): string { + return `${'number'.padEnd(8)}${'state'.padEnd(7)}${'updated'.padEnd(8)}title`; +} + export function formatRelativeTime(value: string | null, now: Date = new Date()): string { if (!value) return 'never'; const parsed = new Date(value); From 1d613a4766c1e9e7a81536244acfb9fa7d0c3e22 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:44:20 -0700 Subject: [PATCH 093/215] fix(tui): add quick filter context actions --- apps/cli/src/tui/app.test.ts | 4 ++-- apps/cli/src/tui/app.ts | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 958703e..ff621cb 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -132,8 +132,8 @@ test('formatClusterListLabel keeps counts first and splits cluster name from tit }); test('formatClusterListHeader marks the active clickable sort column', () => { - assert.match(formatClusterListHeader('size'), /cnt↓/); - assert.match(formatClusterListHeader('recent'), /updated↓/); + assert.match(formatClusterListHeader('size'), /cnt\*/); + assert.match(formatClusterListHeader('recent'), /updated\*/); }); test('formatClusterShortName returns the first meaningful words', () => { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index ed03133..c9c370f 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -437,6 +437,16 @@ export async function startTui(params: StartTuiParams): Promise { refreshAll(true); }; + const setMinSize = (nextMinSize: TuiMinSizeFilter): void => { + if (minSize === nextMinSize) { + return; + } + minSize = nextMinSize; + persistRepositoryPreference(); + status = `Min size: ${minSize === 0 ? 'all' : `${minSize}+`}`; + refreshAll(true); + }; + const selectClusterIndex = (nextIndex: number): void => { if (!snapshot || snapshot.clusters.length === 0) return; if (nextIndex === CLUSTER_LIST_HEADER_INDEX) { @@ -672,6 +682,9 @@ export async function startTui(params: StartTuiParams): Promise { : []), { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: 'Min size 1+', run: () => setMinSize(1) }, + { label: 'Min size 10+', run: () => setMinSize(10) }, + { label: 'Min size all', run: () => setMinSize(0) }, { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, { label: 'Filter clusters', run: promptFilter }, { label: 'Refresh', run: () => refreshAll(true) }, @@ -684,6 +697,9 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Repository browser', run: browseRepositories }, { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: 'Min size 1+', run: () => setMinSize(1) }, + { label: 'Min size 10+', run: () => setMinSize(10) }, + { label: 'Min size all', run: () => setMinSize(0) }, { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, { label: 'Help', run: openHelp }, { @@ -941,10 +957,7 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.screen.key(['f'], () => { if (modalOpen) return; - minSize = cycleMinSizeFilter(minSize); - persistRepositoryPreference(); - status = `Min size: ${minSize === 0 ? 'all' : `${minSize}+`}`; - refreshAll(false); + setMinSize(cycleMinSizeFilter(minSize)); }); widgets.screen.key(['l'], () => { if (modalOpen) return; @@ -1642,8 +1655,8 @@ export function formatClusterListLabel(cluster: TuiClusterSummary): string { } export function formatClusterListHeader(sortMode: TuiClusterSortMode): string { - const countLabel = (sortMode === 'size' ? 'cnt↓' : 'cnt').padStart(3); - const updated = (sortMode === 'recent' ? 'updated↓' : 'updated').padStart(8); + const countLabel = (sortMode === 'size' ? 'cnt*' : 'cnt').padStart(3); + const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padStart(8); return `${countLabel} ${'cluster'.padEnd(22)} ${'title'.padEnd(56)} ${'mix'.padStart(7)} ${updated}`; } From a352fd2d6f8e36a71232a576420f7c13410f8098 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 14:59:49 -0700 Subject: [PATCH 094/215] fix(tui): stabilize cluster header sorting --- apps/cli/src/tui/app.test.ts | 9 ++++++ apps/cli/src/tui/app.ts | 54 ++++++++++++++++++++++++++++++------ 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index ff621cb..0c9cd35 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -16,6 +16,7 @@ import { renderMarkdownForTerminal, renderDetailPane, resolveBlessedTerminal, + resolveClusterHeaderSortFromClick, splitClusterDisplayTitle, } from './app.js'; @@ -136,6 +137,14 @@ test('formatClusterListHeader marks the active clickable sort column', () => { assert.match(formatClusterListHeader('recent'), /updated\*/); }); +test('resolveClusterHeaderSortFromClick maps visible header regions to stable sort choices', () => { + assert.equal(resolveClusterHeaderSortFromClick(0, 120, 'recent'), 'size'); + assert.equal(resolveClusterHeaderSortFromClick(115, 120, 'size'), 'recent'); + assert.equal(resolveClusterHeaderSortFromClick(24, 120, 'size'), 'recent'); + assert.equal(resolveClusterHeaderSortFromClick(24, 120, 'recent'), 'size'); + assert.equal(resolveClusterHeaderSortFromClick(52, 60, 'size'), 'recent'); +}); + test('formatClusterShortName returns the first meaningful words', () => { assert.equal(formatClusterShortName('[codex] fix agent session-id routing'), 'agent session-id routing'); assert.equal(formatClusterShortName('fix(agents): exclude volatile inbound metadata'), 'agents exclude volatile'); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index c9c370f..b982b9f 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -101,6 +101,16 @@ const ACTIVITY_LOG_LIMIT = 200; const FOOTER_LOG_LINES = 1; const CLUSTER_LIST_HEADER_INDEX = 0; const CLUSTER_LIST_FIRST_ITEM_INDEX = 1; +const CLUSTER_COUNT_WIDTH = 3; +const CLUSTER_NAME_WIDTH = 22; +const CLUSTER_TITLE_WIDTH = 56; +const CLUSTER_MIX_WIDTH = 7; +const CLUSTER_UPDATED_WIDTH = 8; +const CLUSTER_COLUMN_GAP = 2; +const CLUSTER_NAME_START = CLUSTER_COUNT_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_TITLE_START = CLUSTER_NAME_START + CLUSTER_NAME_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_MIX_START = CLUSTER_TITLE_START + CLUSTER_TITLE_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_UPDATED_START = CLUSTER_MIX_START + CLUSTER_MIX_WIDTH + CLUSTER_COLUMN_GAP; export async function startTui(params: StartTuiParams): Promise { const selectedRepository = params.owner && params.repo ? { owner: params.owner, repo: params.repo } : null; @@ -1011,8 +1021,9 @@ export async function startTui(params: StartTuiParams): Promise { const itemIndex = getListItemIndexFromMouse(widgets.clusters, event); if (event.button === 'left' && itemIndex === CLUSTER_LIST_HEADER_INDEX) { suppressNextClusterSelect = true; - const relativeX = Number(event.x) - Number(widgets.clusters.aleft) - 2; - setSortMode(relativeX <= 5 ? 'size' : relativeX >= 88 ? 'recent' : cycleSortMode(sortMode)); + const relativeX = Math.max(0, Number(event.x) - Number(widgets.clusters.aleft) - 2); + const innerWidth = Math.max(1, Number(widgets.clusters.width) - 2); + setSortMode(resolveClusterHeaderSortFromClick(relativeX, innerWidth, sortMode)); return; } if (event.button !== 'right') return; @@ -1647,17 +1658,42 @@ export function parseOwnerRepoValue(value: string): { owner: string; repo: strin } export function formatClusterListLabel(cluster: TuiClusterSummary): string { - const countLabel = String(cluster.totalCount).padStart(3); - const mixLabel = `${cluster.issueCount}I/${cluster.pullRequestCount}P`.padStart(7); - const updated = formatRelativeTime(cluster.latestUpdatedAt).padStart(8); + const countLabel = String(cluster.totalCount).padStart(CLUSTER_COUNT_WIDTH); + const mixLabel = `${cluster.issueCount}I/${cluster.pullRequestCount}P`.padStart(CLUSTER_MIX_WIDTH); + const updated = formatRelativeTime(cluster.latestUpdatedAt).padStart(CLUSTER_UPDATED_WIDTH); const title = splitClusterDisplayTitle(cluster.displayTitle); - return `${countLabel} ${title.name.padEnd(22).slice(0, 22)} ${title.title.padEnd(56).slice(0, 56)} ${mixLabel} ${updated}`; + return [ + countLabel, + title.name.padEnd(CLUSTER_NAME_WIDTH).slice(0, CLUSTER_NAME_WIDTH), + title.title.padEnd(CLUSTER_TITLE_WIDTH).slice(0, CLUSTER_TITLE_WIDTH), + mixLabel, + updated, + ].join(' '); } export function formatClusterListHeader(sortMode: TuiClusterSortMode): string { - const countLabel = (sortMode === 'size' ? 'cnt*' : 'cnt').padStart(3); - const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padStart(8); - return `${countLabel} ${'cluster'.padEnd(22)} ${'title'.padEnd(56)} ${'mix'.padStart(7)} ${updated}`; + const countLabel = (sortMode === 'size' ? 'cnt*' : 'cnt').padStart(CLUSTER_COUNT_WIDTH); + const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padStart(CLUSTER_UPDATED_WIDTH); + return [ + countLabel, + 'cluster'.padEnd(CLUSTER_NAME_WIDTH), + 'title'.padEnd(CLUSTER_TITLE_WIDTH), + 'mix'.padStart(CLUSTER_MIX_WIDTH), + updated, + ].join(' '); +} + +export function resolveClusterHeaderSortFromClick(relativeX: number, visibleWidth: number, currentSortMode: TuiClusterSortMode): TuiClusterSortMode { + if (relativeX < CLUSTER_NAME_START) { + return 'size'; + } + + const visibleUpdatedStart = Math.min(CLUSTER_UPDATED_START, Math.max(CLUSTER_NAME_START, visibleWidth - CLUSTER_UPDATED_WIDTH - CLUSTER_COLUMN_GAP)); + if (relativeX >= visibleUpdatedStart) { + return 'recent'; + } + + return cycleSortMode(currentSortMode); } export function formatClusterShortName(title: string, maxWords = 3): string { From eb68bf331260b5d5ff237041fd84c5f19f610847 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:01:03 -0700 Subject: [PATCH 095/215] fix(tui): clarify summary sections --- apps/cli/src/tui/app.test.ts | 19 +++++++++++++++++-- apps/cli/src/tui/app.ts | 26 ++++++++++++++++++++------ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 0c9cd35..812f4d2 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -17,6 +17,7 @@ import { renderDetailPane, resolveBlessedTerminal, resolveClusterHeaderSortFromClick, + renderSummarySections, splitClusterDisplayTitle, } from './app.js'; @@ -77,12 +78,12 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e const rendered = renderDetailPane(detail, cluster, 'detail'); assert.match(rendered, /C1 \(#42 representative issue\)/); assert.match(rendered, /Bad \\{bold\\}title\\{\/bold\\}/); - assert.match(rendered, /LLM Summary:/); + assert.match(rendered, /Cluster signal:/); assert.match(rendered, /Main/); assert.match(rendered, /Body with \\{red-fg\\}tags\\{\/red-fg\\}/); assert.match(rendered, /Summary \\{yellow-fg\\}text\\{\/yellow-fg\\}/); assert.match(rendered, /Neighbor \\{blue-fg\\}title\\{\/blue-fg\\}/); - assert.ok(rendered.indexOf('LLM Summary:') < rendered.indexOf('{bold}Main{/bold}')); + assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main{/bold}')); }); test('renderDetailPane gives useful empty detail content before a cluster is selected', () => { @@ -171,6 +172,20 @@ test('renderMarkdownForTerminal formats common markdown without exposing blessed assert.doesNotMatch(rendered, /\x1B\]8;;/); }); +test('renderSummarySections orders and labels LLM summaries for scanning', () => { + const rendered = renderSummarySections({ + dedupe_summary: 'same failure mode', + problem_summary: '**cron** timeout', + maintainer_signal_summary: 'needs owner', + solution_summary: 'raise timeout', + }); + + assert.ok(rendered.indexOf('Purpose:') < rendered.indexOf('Solution:')); + assert.ok(rendered.indexOf('Solution:') < rendered.indexOf('Maintainer signal:')); + assert.ok(rendered.indexOf('Maintainer signal:') < rendered.indexOf('Cluster signal:')); + assert.match(rendered, /\{bold\}cron\{\/bold\} timeout/); +}); + test('buildThreadContextMenuItems exposes thread actions for right-click menus', () => { const items = buildThreadContextMenuItems({ thread: { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index b982b9f..274164b 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1227,12 +1227,7 @@ export function renderDetailPane( const closedLabel = thread.isClosed ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() : '{bold}Closed:{/bold} no'; - const summaries = Object.entries(threadDetail.summaries) - .map(([key, value]) => { - const label = key === 'dedupe_summary' ? 'LLM Summary' : key; - return `{bold}${escapeBlessedText(label)}:{/bold}\n${escapeBlessedText(value)}`; - }) - .join('\n\n'); + const summaries = renderSummarySections(threadDetail.summaries); const neighbors = threadDetail.neighbors.length > 0 ? threadDetail.neighbors @@ -1300,6 +1295,25 @@ export function renderMarkdownForTerminal(markdown: string): string { return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); } +type SummaryKey = NonNullable; + +const SUMMARY_SECTION_ORDER: SummaryKey[] = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary']; + +export function renderSummarySections(summaries: TuiThreadDetail['summaries']): string { + return SUMMARY_SECTION_ORDER.flatMap((key) => { + const value = summaries[key]; + if (!value) return []; + return [`{bold}${formatSummaryLabel(key)}:{/bold}\n${renderMarkdownForTerminal(value)}`]; + }).join('\n\n'); +} + +function formatSummaryLabel(key: SummaryKey): string { + if (key === 'problem_summary') return 'Purpose'; + if (key === 'solution_summary') return 'Solution'; + if (key === 'maintainer_signal_summary') return 'Maintainer signal'; + return 'Cluster signal'; +} + type InlineMarkdownSegment = | { kind: 'text'; value: string } | { kind: 'link'; label: string; url: string }; From ffca42fe6371c94d42ae119829b6660510f0b6f5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:05:52 -0700 Subject: [PATCH 096/215] feat(tui): add sortable member headers --- apps/cli/src/tui/app.test.ts | 2 + apps/cli/src/tui/app.ts | 65 ++++++++++++++++++--- apps/cli/src/tui/state.test.ts | 77 +++++++++++++++++++++++++ apps/cli/src/tui/state.ts | 102 ++++++++++++++++++++++++--------- 4 files changed, 211 insertions(+), 35 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 812f4d2..f7645aa 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -263,6 +263,8 @@ test('buildHelpContent includes the full key command list', () => { assert.match(content, /TUI only reads local SQLite/); assert.match(content, /default cluster filter is 1\+/); assert.match(content, /default sort is size/); + assert.match(content, /m\s+cycle member sort mode/); + assert.match(content, /click the member header to sort/); assert.match(content, /right-click opens pane actions/); assert.match(content, /p\s+open the repository browser/); assert.match(content, /l\s+toggle wide layout/); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 274164b..81cef84 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -14,6 +14,7 @@ import type { import { getTuiRepositoryPreference, writeTuiRepositoryPreference } from '@ghcrawl/api-core'; import { buildMemberRows, + cycleMemberSortMode, cycleFocusPane, cycleMinSizeFilter, cycleSortMode, @@ -21,8 +22,10 @@ import { formatRelativeTime, moveSelectableIndex, preserveSelectedId, + resolveMemberHeaderSortFromClick, type MemberListRow, type TuiFocusPane, + type TuiMemberSortMode, type TuiMinSizeFilter, } from './state.js'; import { computeTuiLayout } from './layout.js'; @@ -123,6 +126,7 @@ export async function startTui(params: StartTuiParams): Promise { ? getTuiRepositoryPreference(params.service.config, currentRepository.owner, currentRepository.repo) : { sortMode: 'size' as TuiClusterSortMode, minClusterSize: 1 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; + let memberSortMode: TuiMemberSortMode = 'kind'; let minSize: TuiMinSizeFilter = initialPreference.minClusterSize; let wideLayout: TuiWideLayoutPreference = initialPreference.wideLayout; let showClosed = true; @@ -142,6 +146,7 @@ export async function startTui(params: StartTuiParams): Promise { const threadDetailCache = new Map(); let modalOpen = false; let suppressNextClusterSelect = false; + let suppressNextMemberSelect = false; const clearCaches = (): void => { clusterDetailCache.clear(); @@ -229,7 +234,7 @@ export async function startTui(params: StartTuiParams): Promise { refreshAll(true); return false; } - memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed }); + memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed, sortMode: memberSortMode }); selectedMemberThreadId = threadId; memberIndex = findSelectableIndex(memberRows, selectedMemberThreadId); loadSelectedThreadDetail(false); @@ -289,7 +294,7 @@ export async function startTui(params: StartTuiParams): Promise { } if (selectedClusterId !== null && clusterDetail) { - memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed }); + memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed, sortMode: memberSortMode }); selectedMemberThreadId = preserveSelectedId( memberRows.filter((row) => row.selectable).map((row) => row.threadId), previousMemberId, @@ -342,7 +347,7 @@ export async function startTui(params: StartTuiParams): Promise { ? `#${snapshot.stats.latestClusterRunId} ${formatRelativeTime(snapshot.stats.latestClusterRunFinishedAt ?? null)}` : 'never'; widgets.header.setContent( - `{bold}${repoLabel}{/bold} {cyan-fg}${snapshot?.stats.openPullRequestCount ?? 0} PR{/cyan-fg} {green-fg}${snapshot?.stats.openIssueCount ?? 0} issues{/green-fg} GH:${ghStatus} Emb:${embedStatus} Cl:${clusterStatus} sort:${sortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} layout:${wideLayout === 'columns' ? 'cols' : 'stack'} closed:${showClosed ? 'shown' : 'hidden'} filter:${search || 'none'}`, + `{bold}${repoLabel}{/bold} {cyan-fg}${snapshot?.stats.openPullRequestCount ?? 0} PR{/cyan-fg} {green-fg}${snapshot?.stats.openIssueCount ?? 0} issues{/green-fg} GH:${ghStatus} Emb:${embedStatus} Cl:${clusterStatus} sort:${sortMode} members:${memberSortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} layout:${wideLayout === 'columns' ? 'cols' : 'stack'} closed:${showClosed ? 'shown' : 'hidden'} filter:${search || 'none'}`, ); isRendering = true; @@ -363,7 +368,7 @@ export async function startTui(params: StartTuiParams): Promise { updatePaneStyles(widgets, focusPane); const footerLines = [ activityLines.at(-1) ?? status, - `focus:${focusPane} sort:${sortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} Tab focus / filter s sort f min # jump o open h help q quit`, + `focus:${focusPane} sort:${sortMode} members:${memberSortMode} min:${minSize === 0 ? 'all' : `${minSize}+`} Tab focus / filter s sort m members f min o open h help`, ]; widgets.footer.setContent(footerLines.join('\n')); widgets.screen.render(); @@ -447,6 +452,29 @@ export async function startTui(params: StartTuiParams): Promise { refreshAll(true); }; + const setMemberSortMode = (nextMemberSortMode: TuiMemberSortMode): void => { + if (memberSortMode === nextMemberSortMode) { + return; + } + const previousMemberId = selectedMemberThreadId; + memberSortMode = nextMemberSortMode; + if (clusterDetail) { + memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed, sortMode: memberSortMode }); + selectedMemberThreadId = preserveSelectedId( + memberRows.filter((row) => row.selectable).map((row) => row.threadId), + previousMemberId, + ); + memberIndex = findSelectableIndex(memberRows, selectedMemberThreadId); + loadSelectedThreadDetail(false); + } + status = `Member sort: ${memberSortMode}`; + render(); + }; + + const toggleMemberSortMode = (): void => { + setMemberSortMode(cycleMemberSortMode(memberSortMode)); + }; + const setMinSize = (nextMinSize: TuiMinSizeFilter): void => { if (minSize === nextMinSize) { return; @@ -473,7 +501,7 @@ export async function startTui(params: StartTuiParams): Promise { refreshAll(true); return; } - memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed }); + memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed, sortMode: memberSortMode }); selectedMemberThreadId = preserveSelectedId( memberRows.filter((row) => row.selectable).map((row) => row.threadId), null, @@ -692,6 +720,10 @@ export async function startTui(params: StartTuiParams): Promise { : []), { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: 'Member sort grouped', run: () => setMemberSortMode('kind') }, + { label: 'Member sort recent', run: () => setMemberSortMode('recent') }, + { label: 'Member sort number', run: () => setMemberSortMode('number') }, + { label: 'Member sort state', run: () => setMemberSortMode('state') }, { label: 'Min size 1+', run: () => setMinSize(1) }, { label: 'Min size 10+', run: () => setMinSize(10) }, { label: 'Min size all', run: () => setMinSize(0) }, @@ -707,6 +739,8 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Repository browser', run: browseRepositories }, { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, + { label: 'Member sort grouped', run: () => setMemberSortMode('kind') }, + { label: 'Member sort recent', run: () => setMemberSortMode('recent') }, { label: 'Min size 1+', run: () => setMinSize(1) }, { label: 'Min size 10+', run: () => setMinSize(10) }, { label: 'Min size all', run: () => setMinSize(0) }, @@ -965,6 +999,10 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen) return; toggleSortMode(); }); + widgets.screen.key(['m'], () => { + if (modalOpen) return; + toggleMemberSortMode(); + }); widgets.screen.key(['f'], () => { if (modalOpen) return; setMinSize(cycleMinSizeFilter(minSize)); @@ -1038,6 +1076,10 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.members.on('select item', (_item, index) => { if (isRendering || modalOpen) return; + if (suppressNextMemberSelect) { + suppressNextMemberSelect = false; + return; + } focusPane = 'members'; widgets.members.focus(); selectMemberIndex(Number(index)); @@ -1049,10 +1091,17 @@ export async function startTui(params: StartTuiParams): Promise { updateFocus('detail'); }); widgets.members.on('mousedown', (event: MouseEventArg) => { - if (isRendering || modalOpen || event.button !== 'right') return; + if (isRendering || modalOpen) return; focusPane = 'members'; widgets.members.focus(); const itemIndex = getListItemIndexFromMouse(widgets.members, event); + if (event.button === 'left' && itemIndex === 0) { + suppressNextMemberSelect = true; + const relativeX = Math.max(0, Number(event.x) - Number(widgets.members.aleft) - 2); + setMemberSortMode(resolveMemberHeaderSortFromClick(relativeX, memberSortMode)); + return; + } + if (event.button !== 'right') return; const row = itemIndex !== null && itemIndex >= 0 && itemIndex < memberRows.length ? memberRows[itemIndex] : null; if (!row?.selectable) { openContextMenu('Members', clusterContextItems(), event); @@ -1437,13 +1486,14 @@ export function buildHelpContent(): string { 'Left / Right cycle focus backward or forward across panes', 'Up / Down move selection, or scroll detail when detail is focused', 'Enter clusters -> members, members -> detail', - 'Mouse click to focus/select; click cluster header to sort; right-click opens pane actions; wheel scrolls', + 'Mouse click to focus/select; click list headers to sort; right-click opens pane actions; wheel scrolls', 'PgUp / PgDn page through the focused pane or this help popup faster', 'Home / End jump to the top or bottom of detail or help', '', '{bold}Views And Filters{/bold}', '# jump directly to an issue or PR number', 's cycle cluster sort mode', + 'm cycle member sort mode', 'f cycle minimum cluster size filter', 'l toggle wide layout: columns vs. wide-left stacked-right', 'x show or hide locally closed clusters and members', @@ -1463,6 +1513,7 @@ export function buildHelpContent(): string { 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', 'The default sort is size. Press s to toggle size and recent.', + 'Member rows default to issue/PR grouping. Press m or click the member header to sort by updated, number, state, or title.', 'Mouse clicks focus panes; clicking an already selected row advances to the next pane. Right-click works on every pane.', 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', 'The footer only shows the short command list. Open help to see the full list.', diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index c35f017..ccc7362 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -5,6 +5,7 @@ import { applyClusterFilters, buildMemberRows, cycleFocusPane, + cycleMemberSortMode, cycleMinSizeFilter, cycleSortMode, findSelectableIndex, @@ -12,6 +13,7 @@ import { formatRelativeTime, moveSelectableIndex, preserveSelectedId, + resolveMemberHeaderSortFromClick, } from './state.js'; import type { TuiClusterDetail, TuiClusterSummary } from '@ghcrawl/api-core'; @@ -29,6 +31,14 @@ test('cycleMinSizeFilter rotates through presets', () => { assert.equal(cycleMinSizeFilter(0), 1); }); +test('cycleMemberSortMode rotates through member sort modes', () => { + assert.equal(cycleMemberSortMode('kind'), 'recent'); + assert.equal(cycleMemberSortMode('recent'), 'number'); + assert.equal(cycleMemberSortMode('number'), 'state'); + assert.equal(cycleMemberSortMode('state'), 'title'); + assert.equal(cycleMemberSortMode('title'), 'kind'); +}); + test('cycleFocusPane moves forward and backward', () => { assert.equal(cycleFocusPane('clusters', 1), 'members'); assert.equal(cycleFocusPane('clusters', -1), 'detail'); @@ -152,4 +162,71 @@ test('buildMemberRows groups issues and pull requests and selection skips header test('formatMemberListHeader aligns the member table columns', () => { assert.equal(formatMemberListHeader(), 'number state updated title'); + assert.equal(formatMemberListHeader('recent'), 'number state updated*title'); +}); + +test('buildMemberRows can sort members by recent without group headers', () => { + const detail: TuiClusterDetail = { + clusterId: 1, + displayTitle: 'Cluster 1', + isClosed: false, + closedAtLocal: null, + closeReasonLocal: null, + totalCount: 3, + issueCount: 2, + pullRequestCount: 1, + latestUpdatedAt: '2026-03-09T11:00:00Z', + representativeThreadId: 10, + representativeNumber: 42, + representativeKind: 'issue', + members: [ + { + id: 10, + number: 42, + kind: 'issue', + isClosed: false, + title: 'Issue one', + updatedAtGh: '2026-03-09T09:00:00Z', + htmlUrl: 'https://example.com/42', + labels: [], + clusterScore: null, + }, + { + id: 11, + number: 43, + kind: 'pull_request', + isClosed: false, + title: 'PR one', + updatedAtGh: '2026-03-09T11:00:00Z', + htmlUrl: 'https://example.com/43', + labels: [], + clusterScore: null, + }, + { + id: 12, + number: 44, + kind: 'issue', + isClosed: true, + title: 'Issue closed', + updatedAtGh: '2026-03-09T10:00:00Z', + htmlUrl: 'https://example.com/44', + labels: [], + clusterScore: null, + }, + ], + }; + + const rows = buildMemberRows(detail, { sortMode: 'recent' }); + assert.equal(rows.length, 4); + assert.match(rows[1]?.label ?? '', /#43/); + assert.match(rows[2]?.label ?? '', /#44/); + assert.match(rows[3]?.label ?? '', /#42/); +}); + +test('resolveMemberHeaderSortFromClick maps member header columns to sort modes', () => { + assert.equal(resolveMemberHeaderSortFromClick(0, 'kind'), 'number'); + assert.equal(resolveMemberHeaderSortFromClick(8, 'kind'), 'state'); + assert.equal(resolveMemberHeaderSortFromClick(15, 'kind'), 'recent'); + assert.equal(resolveMemberHeaderSortFromClick(23, 'kind'), 'title'); + assert.equal(resolveMemberHeaderSortFromClick(23, 'title'), 'kind'); }); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 441e45a..391dcd2 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -2,15 +2,24 @@ import type { TuiClusterDetail, TuiClusterSortMode, TuiClusterSummary } from '@g export type TuiFocusPane = 'clusters' | 'members' | 'detail'; export type TuiMinSizeFilter = 0 | 1 | 2 | 10 | 20 | 50; +export type TuiMemberSortMode = 'kind' | 'recent' | 'number' | 'state' | 'title'; export type MemberListRow = | { key: string; label: string; selectable: false } | { key: string; label: string; selectable: true; threadId: number; isClosed: boolean; kind: 'issue' | 'pull_request' }; export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['size', 'recent']; +export const MEMBER_SORT_MODE_ORDER: TuiMemberSortMode[] = ['kind', 'recent', 'number', 'state', 'title']; export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 2, 10, 20, 50, 0]; export const FOCUS_PANE_ORDER: TuiFocusPane[] = ['clusters', 'members', 'detail']; +const MEMBER_NUMBER_WIDTH = 8; +const MEMBER_STATE_WIDTH = 7; +const MEMBER_UPDATED_WIDTH = 8; +const MEMBER_STATE_START = MEMBER_NUMBER_WIDTH; +const MEMBER_UPDATED_START = MEMBER_STATE_START + MEMBER_STATE_WIDTH; +const MEMBER_TITLE_START = MEMBER_UPDATED_START + MEMBER_UPDATED_WIDTH; + export function cycleSortMode(current: TuiClusterSortMode): TuiClusterSortMode { const index = SORT_MODE_ORDER.indexOf(current); return SORT_MODE_ORDER[(index + 1) % SORT_MODE_ORDER.length] ?? 'size'; @@ -21,6 +30,11 @@ export function cycleMinSizeFilter(current: TuiMinSizeFilter): TuiMinSizeFilter return MIN_SIZE_FILTER_ORDER[(index + 1) % MIN_SIZE_FILTER_ORDER.length] ?? 10; } +export function cycleMemberSortMode(current: TuiMemberSortMode): TuiMemberSortMode { + const index = MEMBER_SORT_MODE_ORDER.indexOf(current); + return MEMBER_SORT_MODE_ORDER[(index + 1) % MEMBER_SORT_MODE_ORDER.length] ?? 'kind'; +} + export function cycleFocusPane(current: TuiFocusPane, direction: 1 | -1 = 1): TuiFocusPane { const index = FOCUS_PANE_ORDER.indexOf(current); const next = (index + direction + FOCUS_PANE_ORDER.length) % FOCUS_PANE_ORDER.length; @@ -46,45 +60,65 @@ export function preserveSelectedId(ids: number[], selectedId: number | null): nu return ids[0] ?? null; } -export function buildMemberRows(detail: TuiClusterDetail | null, options?: { includeClosedMembers?: boolean }): MemberListRow[] { +export function buildMemberRows(detail: TuiClusterDetail | null, options?: { includeClosedMembers?: boolean; sortMode?: TuiMemberSortMode }): MemberListRow[] { if (!detail) return []; const includeClosedMembers = options?.includeClosedMembers ?? true; + const sortMode = options?.sortMode ?? 'kind'; const visibleMembers = includeClosedMembers ? detail.members : detail.members.filter((member) => !member.isClosed); + const rows: MemberListRow[] = [{ key: 'members-table-header', label: `{bold}${formatMemberListHeader(sortMode)}{/bold}`, selectable: false }]; + + if (sortMode !== 'kind') { + appendMemberRows(rows, sortMembers(visibleMembers, sortMode)); + return rows; + } + const issues = visibleMembers.filter((member) => member.kind === 'issue'); const pullRequests = visibleMembers.filter((member) => member.kind === 'pull_request'); - const rows: MemberListRow[] = [{ key: 'members-table-header', label: `{bold}${formatMemberListHeader()}{/bold}`, selectable: false }]; - if (issues.length > 0) { rows.push({ key: 'issues-header', label: `ISSUES (${issues.length})`, selectable: false }); - for (const issue of issues) { - rows.push({ - key: `thread-${issue.id}`, - label: formatMemberLabel(issue.number, issue.title, issue.updatedAtGh, issue.isClosed), - selectable: true, - threadId: issue.id, - isClosed: issue.isClosed, - kind: issue.kind, - }); - } + appendMemberRows(rows, issues); } if (pullRequests.length > 0) { rows.push({ key: 'pulls-header', label: `PULL REQUESTS (${pullRequests.length})`, selectable: false }); - for (const pullRequest of pullRequests) { - rows.push({ - key: `thread-${pullRequest.id}`, - label: formatMemberLabel(pullRequest.number, pullRequest.title, pullRequest.updatedAtGh, pullRequest.isClosed), - selectable: true, - threadId: pullRequest.id, - isClosed: pullRequest.isClosed, - kind: pullRequest.kind, - }); - } + appendMemberRows(rows, pullRequests); } return rows; } +type TuiMember = TuiClusterDetail['members'][number]; + +function appendMemberRows(rows: MemberListRow[], members: TuiMember[]): void { + for (const member of members) { + rows.push({ + key: `thread-${member.id}`, + label: formatMemberLabel(member.number, member.title, member.updatedAtGh, member.isClosed), + selectable: true, + threadId: member.id, + isClosed: member.isClosed, + kind: member.kind, + }); + } +} + +function sortMembers(members: TuiMember[], sortMode: TuiMemberSortMode): TuiMember[] { + return members.slice().sort((left, right) => { + const leftTime = left.updatedAtGh ? Date.parse(left.updatedAtGh) : 0; + const rightTime = right.updatedAtGh ? Date.parse(right.updatedAtGh) : 0; + if (sortMode === 'recent') { + return rightTime - leftTime || right.number - left.number; + } + if (sortMode === 'number') { + return left.number - right.number; + } + if (sortMode === 'state') { + return Number(left.isClosed) - Number(right.isClosed) || rightTime - leftTime || left.number - right.number; + } + return normalizeMemberTitle(left.title).localeCompare(normalizeMemberTitle(right.title)) || left.number - right.number; + }); +} + export function findSelectableIndex(rows: MemberListRow[], threadId: number | null): number { if (threadId !== null) { const index = rows.findIndex((row) => row.selectable && row.threadId === threadId); @@ -118,15 +152,27 @@ function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sort function formatMemberLabel(number: number, title: string, updatedAtGh: string | null, isClosed: boolean): string { const updated = formatRelativeTime(updatedAtGh); - const numberLabel = `#${number}`.padEnd(8).slice(0, 8); - const status = (isClosed ? 'closed' : 'open').padEnd(7); - const age = updated.padEnd(8).slice(0, 8); + const numberLabel = `#${number}`.padEnd(MEMBER_NUMBER_WIDTH).slice(0, MEMBER_NUMBER_WIDTH); + const status = (isClosed ? 'closed' : 'open').padEnd(MEMBER_STATE_WIDTH); + const age = updated.padEnd(MEMBER_UPDATED_WIDTH).slice(0, MEMBER_UPDATED_WIDTH); const label = escapeBlessedInline(`${numberLabel}${status}${age}${normalizeMemberTitle(title)}`); return isClosed ? `{gray-fg}${label}{/gray-fg}` : label; } -export function formatMemberListHeader(): string { - return `${'number'.padEnd(8)}${'state'.padEnd(7)}${'updated'.padEnd(8)}title`; +export function formatMemberListHeader(sortMode: TuiMemberSortMode = 'kind'): string { + const number = (sortMode === 'number' ? 'number*' : 'number').padEnd(MEMBER_NUMBER_WIDTH); + const state = (sortMode === 'state' ? 'state*' : 'state').padEnd(MEMBER_STATE_WIDTH); + const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padEnd(MEMBER_UPDATED_WIDTH); + const title = sortMode === 'title' ? 'title*' : 'title'; + return `${number}${state}${updated}${title}`; +} + +export function resolveMemberHeaderSortFromClick(relativeX: number, currentSortMode: TuiMemberSortMode): TuiMemberSortMode { + if (relativeX < MEMBER_STATE_START) return 'number'; + if (relativeX < MEMBER_UPDATED_START) return 'state'; + if (relativeX < MEMBER_TITLE_START) return 'recent'; + if (currentSortMode === 'title') return 'kind'; + return 'title'; } export function formatRelativeTime(value: string | null, now: Date = new Date()): string { From 6bf93ce863eb929fed80544b3e9e819280aabda6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:07:32 -0700 Subject: [PATCH 097/215] fix(tui): expose referenced links in context menu --- apps/cli/src/tui/app.test.ts | 34 ++++++++++++++++++++-- apps/cli/src/tui/app.ts | 55 +++++++++++++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index f7645aa..ebc82b9 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -11,6 +11,7 @@ import { formatClusterListHeader, formatClusterListLabel, formatClusterShortName, + getThreadReferenceLinks, getRepositoryChoices, parseOwnerRepoValue, renderMarkdownForTerminal, @@ -199,7 +200,7 @@ test('buildThreadContextMenuItems exposes thread actions for right-click menus', closedAtLocal: null, closeReasonLocal: null, title: 'Example', - body: null, + body: 'See [run](https://example.com/run) and https://example.com/raw.', authorLogin: 'dev', htmlUrl: 'https://example.com/42', labels: [], @@ -212,7 +213,7 @@ test('buildThreadContextMenuItems exposes thread actions for right-click menus', assert.deepEqual( items.map((item) => item.action), - ['open', 'copy-url', 'copy-title', 'copy-markdown-link', 'load-neighbors', 'close'], + ['open', 'copy-url', 'copy-title', 'copy-markdown-link', 'open-first-link', 'copy-first-link', 'load-neighbors', 'close'], ); }); @@ -220,6 +221,35 @@ test('buildThreadContextMenuItems only closes when no thread is selected', () => assert.deepEqual(buildThreadContextMenuItems(null), [{ label: 'Close', action: 'close' }]); }); +test('getThreadReferenceLinks extracts unique body and summary links', () => { + const links = getThreadReferenceLinks({ + thread: { + id: 1, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Example', + body: 'See [run](https://example.com/run), https://example.com/raw.', + authorLogin: 'dev', + htmlUrl: 'https://example.com/42', + labels: [], + updatedAtGh: '2026-03-09T00:00:00Z', + clusterId: 1, + }, + summaries: { + dedupe_summary: 'same as https://example.com/raw and https://example.com/summary', + }, + neighbors: [], + }); + + assert.deepEqual(links, ['https://example.com/run', 'https://example.com/raw', 'https://example.com/summary']); +}); + test('getRepositoryChoices sorts by most recent update and includes the new-repo action', () => { const service = { listRepositories() { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 81cef84..27eaee3 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -70,7 +70,15 @@ type MouseEventArg = blessed.Widgets.Events.IMouseEventArg & { button?: 'left' | 'middle' | 'right' | 'unknown'; }; -export type ThreadContextAction = 'open' | 'copy-url' | 'copy-title' | 'copy-markdown-link' | 'load-neighbors' | 'close'; +export type ThreadContextAction = + | 'open' + | 'copy-url' + | 'copy-title' + | 'copy-markdown-link' + | 'open-first-link' + | 'copy-first-link' + | 'load-neighbors' + | 'close'; export type ThreadContextMenuItem = { label: string; @@ -688,6 +696,17 @@ export async function startTui(params: StartTuiParams): Promise { } else if (item.action === 'copy-markdown-link') { const markdownLink = `[#${selectedThread.number} ${selectedThread.title}](${selectedThread.htmlUrl})`; status = copyTextToClipboard(markdownLink) ? 'Copied markdown link' : 'Clipboard copy failed'; + } else if (item.action === 'open-first-link') { + const url = getThreadReferenceLinks(threadDetail).at(0); + if (url) { + openUrl(url); + status = `Opened ${url}`; + } else { + status = 'No referenced links found'; + } + } else if (item.action === 'copy-first-link') { + const url = getThreadReferenceLinks(threadDetail).at(0); + status = url ? (copyTextToClipboard(url) ? 'Copied referenced link' : 'Clipboard copy failed') : 'No referenced links found'; } else if (item.action === 'load-neighbors') { loadSelectedThreadDetail(true); status = `Loaded neighbors for #${threadDetail?.thread.number ?? selectedThread.number}`; @@ -1344,6 +1363,33 @@ export function renderMarkdownForTerminal(markdown: string): string { return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); } +export function getThreadReferenceLinks(threadDetail: TuiThreadDetail | null): string[] { + if (!threadDetail) return []; + return uniqueStrings([ + ...extractMarkdownLinks(threadDetail.thread.body ?? ''), + ...Object.values(threadDetail.summaries).flatMap((summary) => extractMarkdownLinks(summary ?? '')), + ]).filter((url) => url !== threadDetail.thread.htmlUrl); +} + +function extractMarkdownLinks(markdown: string): string[] { + const urls: string[] = []; + for (const match of markdown.matchAll(/\[[^\]]+\]\((https?:\/\/[^)\s]+)\)/g)) { + urls.push(stripTrailingUrlPunctuation(match[1] ?? '')); + } + for (const match of markdown.matchAll(/(^|[\s(<])(https?:\/\/[^\s<>)]+)/g)) { + urls.push(stripTrailingUrlPunctuation(match[2] ?? '')); + } + return urls.filter(Boolean); +} + +function stripTrailingUrlPunctuation(url: string): string { + return url.replace(/[.,;:!?]+$/g, ''); +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values)]; +} + type SummaryKey = NonNullable; const SUMMARY_SECTION_ORDER: SummaryKey[] = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary']; @@ -1426,11 +1472,18 @@ export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null if (!threadDetail) { return [{ label: 'Close', action: 'close' }]; } + const referenceLinks = getThreadReferenceLinks(threadDetail); return [ { label: 'Open in browser', action: 'open' }, { label: 'Copy URL', action: 'copy-url' }, { label: 'Copy title', action: 'copy-title' }, { label: 'Copy Markdown link', action: 'copy-markdown-link' }, + ...(referenceLinks.length > 0 + ? [ + { label: 'Open first body link', action: 'open-first-link' as const }, + { label: 'Copy first body link', action: 'copy-first-link' as const }, + ] + : []), { label: 'Load neighbors', action: 'load-neighbors' }, { label: 'Close', action: 'close' }, ]; From 430c07cfbe15aa95ec174017aa10fbd5016be219 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:08:37 -0700 Subject: [PATCH 098/215] fix(tui): list referenced links in detail --- apps/cli/src/tui/app.test.ts | 4 +++- apps/cli/src/tui/app.ts | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index ebc82b9..0103d08 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -55,7 +55,7 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e closedAtLocal: null, closeReasonLocal: null, title: 'Bad {bold}title{/bold}', - body: 'Body with {red-fg}tags{/red-fg}', + body: 'Body with {red-fg}tags{/red-fg} and https://example.com/body-link', authorLogin: 'dev{cyan-fg}', htmlUrl: 'https://example.com/{oops}', labels: ['bug{green-fg}'], @@ -82,6 +82,8 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.match(rendered, /Cluster signal:/); assert.match(rendered, /Main/); assert.match(rendered, /Body with \\{red-fg\\}tags\\{\/red-fg\\}/); + assert.match(rendered, /Links/); + assert.match(rendered, /1\. https:\/\/example\.com\/body-link/); assert.match(rendered, /Summary \\{yellow-fg\\}text\\{\/yellow-fg\\}/); assert.match(rendered, /Neighbor \\{blue-fg\\}title\\{\/blue-fg\\}/); assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main{/bold}')); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 27eaee3..d7b4ccb 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1305,6 +1305,9 @@ export function renderDetailPane( ? 'No neighbors available.' : 'Neighbors load when the detail pane is focused.'; const body = renderMarkdownForTerminal(thread.body ?? '(no body)'); + const referenceLinks = getThreadReferenceLinks(threadDetail); + const linksSection = + referenceLinks.length > 0 ? `\n\n{bold}Links{/bold}\n${referenceLinks.map((url, index) => `${index + 1}. ${escapeBlessedText(url)}`).join('\n')}` : ''; return [ `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, @@ -1317,6 +1320,7 @@ export function renderDetailPane( '', `{bold}Main{/bold}`, body, + linksSection, `\n\n{bold}Neighbors{/bold}\n${neighbors}`, ] .filter(Boolean) From e99afb5eff8113e125f660206354ba555f3543de Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:14:10 -0700 Subject: [PATCH 099/215] fix(tui): persist member sort preference --- apps/cli/src/tui/app.ts | 12 ++++++++++-- apps/cli/src/tui/state.ts | 4 ++-- packages/api-core/src/config.test.ts | 5 +++++ packages/api-core/src/config.ts | 22 +++++++++++++++++++--- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index d7b4ccb..361854d 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -132,9 +132,14 @@ export async function startTui(params: StartTuiParams): Promise { let isRendering = false; const initialPreference = selectedRepository ? getTuiRepositoryPreference(params.service.config, currentRepository.owner, currentRepository.repo) - : { sortMode: 'size' as TuiClusterSortMode, minClusterSize: 1 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference }; + : { + sortMode: 'size' as TuiClusterSortMode, + memberSortMode: 'kind' as TuiMemberSortMode, + minClusterSize: 1 as TuiMinSizeFilter, + wideLayout: 'columns' as TuiWideLayoutPreference, + }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; - let memberSortMode: TuiMemberSortMode = 'kind'; + let memberSortMode: TuiMemberSortMode = initialPreference.memberSortMode; let minSize: TuiMinSizeFilter = initialPreference.minClusterSize; let wideLayout: TuiWideLayoutPreference = initialPreference.wideLayout; let showClosed = true; @@ -466,6 +471,7 @@ export async function startTui(params: StartTuiParams): Promise { } const previousMemberId = selectedMemberThreadId; memberSortMode = nextMemberSortMode; + persistRepositoryPreference(); if (clusterDetail) { memberRows = buildMemberRows(clusterDetail, { includeClosedMembers: showClosed, sortMode: memberSortMode }); selectedMemberThreadId = preserveSelectedId( @@ -798,6 +804,7 @@ export async function startTui(params: StartTuiParams): Promise { repo: currentRepository.repo, minClusterSize: minSize, sortMode, + memberSortMode, wideLayout, }); }; @@ -841,6 +848,7 @@ export async function startTui(params: StartTuiParams): Promise { const preference = getTuiRepositoryPreference(params.service.config, target.owner, target.repo); minSize = overrides?.minClusterSize ?? preference.minClusterSize; sortMode = overrides?.sortMode ?? preference.sortMode; + memberSortMode = preference.memberSortMode; wideLayout = preference.wideLayout; persistRepositoryPreference(); clearCaches(); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 391dcd2..dfd2897 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -1,8 +1,8 @@ -import type { TuiClusterDetail, TuiClusterSortMode, TuiClusterSummary } from '@ghcrawl/api-core'; +import type { TuiClusterDetail, TuiClusterSortMode, TuiClusterSummary, TuiMemberSortPreference } from '@ghcrawl/api-core'; export type TuiFocusPane = 'clusters' | 'members' | 'detail'; export type TuiMinSizeFilter = 0 | 1 | 2 | 10 | 20 | 50; -export type TuiMemberSortMode = 'kind' | 'recent' | 'number' | 'state' | 'title'; +export type TuiMemberSortMode = TuiMemberSortPreference; export type MemberListRow = | { key: string; label: string; selectable: false } diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index f30bed7..7581991 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -235,6 +235,7 @@ test('loadConfig restores repository tui preferences', () => { 'openclaw/openclaw': { minClusterSize: 1, sortMode: 'size', + memberSortMode: 'recent', wideLayout: 'right-stack', }, }, @@ -246,6 +247,7 @@ test('loadConfig restores repository tui preferences', () => { assert.deepEqual(getTuiRepositoryPreference(config, 'openclaw', 'openclaw'), { minClusterSize: 1, sortMode: 'size', + memberSortMode: 'recent', wideLayout: 'right-stack', }); }); @@ -265,6 +267,7 @@ test('writeTuiRepositoryPreference persists sort and min cluster size by reposit repo: 'openclaw', minClusterSize: 1, sortMode: 'size', + memberSortMode: 'title', wideLayout: 'right-stack', }); @@ -272,11 +275,13 @@ test('writeTuiRepositoryPreference persists sort and min cluster size by reposit assert.deepEqual(getTuiRepositoryPreference(reloaded, 'openclaw', 'openclaw'), { minClusterSize: 1, sortMode: 'size', + memberSortMode: 'title', wideLayout: 'right-stack', }); assert.deepEqual(getTuiRepositoryPreference(reloaded, 'other', 'repo'), { minClusterSize: 1, sortMode: 'size', + memberSortMode: 'kind', wideLayout: 'columns', }); }); diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index 71deed5..4ab3c91 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -6,6 +6,7 @@ import dotenv from 'dotenv'; export type ConfigValueSource = 'env' | 'config' | 'dotenv' | 'default' | 'none'; export type TuiSortPreference = 'recent' | 'size'; +export type TuiMemberSortPreference = 'kind' | 'recent' | 'number' | 'state' | 'title'; export type TuiMinClusterSize = 0 | 1 | 2 | 10 | 20 | 50; export type TuiWideLayoutPreference = 'columns' | 'right-stack'; export type EmbeddingBasis = 'title_original' | 'title_summary' | 'llm_key_summary'; @@ -14,6 +15,7 @@ export type VectorBackend = 'vectorlite'; export type TuiRepositoryPreference = { minClusterSize: TuiMinClusterSize; sortMode: TuiSortPreference; + memberSortMode: TuiMemberSortPreference; wideLayout: TuiWideLayoutPreference; }; @@ -161,6 +163,10 @@ function getTuiSortPreference(value: unknown): TuiSortPreference | undefined { return value === 'recent' || value === 'size' ? value : undefined; } +function getTuiMemberSortPreference(value: unknown): TuiMemberSortPreference | undefined { + return value === 'kind' || value === 'recent' || value === 'number' || value === 'state' || value === 'title' ? value : undefined; +} + function getTuiMinClusterSize(value: unknown): TuiMinClusterSize | undefined { return value === 0 || value === 1 || value === 2 || value === 10 || value === 20 || value === 50 ? value : undefined; } @@ -190,11 +196,12 @@ function getTuiPreferences(value: unknown): Record; const minClusterSize = getTuiMinClusterSize(record.minClusterSize); const sortMode = getTuiSortPreference(record.sortMode); + const memberSortMode = getTuiMemberSortPreference(record.memberSortMode) ?? 'kind'; const wideLayout = getTuiWideLayoutPreference(record.wideLayout) ?? 'columns'; if (minClusterSize === undefined || sortMode === undefined) { continue; } - preferences[fullName] = { minClusterSize, sortMode, wideLayout }; + preferences[fullName] = { minClusterSize, sortMode, memberSortMode, wideLayout }; } return preferences; @@ -396,19 +403,28 @@ export function ensureRuntimeDirs(config: GitcrawlConfig): void { } export function getTuiRepositoryPreference(config: GitcrawlConfig, owner: string, repo: string): TuiRepositoryPreference { - return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'size', wideLayout: 'columns' }; + return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'size', memberSortMode: 'kind', wideLayout: 'columns' }; } export function writeTuiRepositoryPreference( config: GitcrawlConfig, - params: { owner: string; repo: string; minClusterSize: TuiMinClusterSize; sortMode: TuiSortPreference; wideLayout: TuiWideLayoutPreference }, + params: { + owner: string; + repo: string; + minClusterSize: TuiMinClusterSize; + sortMode: TuiSortPreference; + memberSortMode?: TuiMemberSortPreference; + wideLayout: TuiWideLayoutPreference; + }, ): { configPath: string } { const fullName = `${params.owner}/${params.repo}`; + const previousPreference = config.tuiPreferences[fullName]; const nextPreferences = { ...config.tuiPreferences, [fullName]: { minClusterSize: params.minClusterSize, sortMode: params.sortMode, + memberSortMode: params.memberSortMode ?? previousPreference?.memberSortMode ?? 'kind', wideLayout: params.wideLayout, }, }; From 2640963669d4fe1acd162ae916c5a1d7d3ec0a9b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:15:36 -0700 Subject: [PATCH 100/215] feat(tui): add referenced link picker --- apps/cli/src/tui/app.test.ts | 19 +++++++++- apps/cli/src/tui/app.ts | 72 ++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 0103d08..ac8fba6 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -11,6 +11,7 @@ import { formatClusterListHeader, formatClusterListLabel, formatClusterShortName, + formatLinkChoiceLabel, getThreadReferenceLinks, getRepositoryChoices, parseOwnerRepoValue, @@ -215,7 +216,18 @@ test('buildThreadContextMenuItems exposes thread actions for right-click menus', assert.deepEqual( items.map((item) => item.action), - ['open', 'copy-url', 'copy-title', 'copy-markdown-link', 'open-first-link', 'copy-first-link', 'load-neighbors', 'close'], + [ + 'open', + 'copy-url', + 'copy-title', + 'copy-markdown-link', + 'open-first-link', + 'copy-first-link', + 'open-link-picker', + 'copy-link-picker', + 'load-neighbors', + 'close', + ], ); }); @@ -252,6 +264,11 @@ test('getThreadReferenceLinks extracts unique body and summary links', () => { assert.deepEqual(links, ['https://example.com/run', 'https://example.com/raw', 'https://example.com/summary']); }); +test('formatLinkChoiceLabel numbers picker rows', () => { + assert.equal(formatLinkChoiceLabel('https://example.com/run', 0), ' 1 https://example.com/run'); + assert.equal(formatLinkChoiceLabel('https://example.com/run', 10), '11 https://example.com/run'); +}); + test('getRepositoryChoices sorts by most recent update and includes the new-repo action', () => { const service = { listRepositories() { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 361854d..19037cc 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -77,6 +77,8 @@ export type ThreadContextAction = | 'copy-markdown-link' | 'open-first-link' | 'copy-first-link' + | 'open-link-picker' + | 'copy-link-picker' | 'load-neighbors' | 'close'; @@ -713,6 +715,10 @@ export async function startTui(params: StartTuiParams): Promise { } else if (item.action === 'copy-first-link') { const url = getThreadReferenceLinks(threadDetail).at(0); status = url ? (copyTextToClipboard(url) ? 'Copied referenced link' : 'Clipboard copy failed') : 'No referenced links found'; + } else if (item.action === 'open-link-picker') { + openLinkPicker('open'); + } else if (item.action === 'copy-link-picker') { + openLinkPicker('copy'); } else if (item.action === 'load-neighbors') { loadSelectedThreadDetail(true); status = `Loaded neighbors for #${threadDetail?.thread.number ?? selectedThread.number}`; @@ -722,6 +728,62 @@ export async function startTui(params: StartTuiParams): Promise { })); }; + const openLinkPicker = (mode: 'open' | 'copy'): void => { + const links = getThreadReferenceLinks(threadDetail); + if (links.length === 0 || modalOpen) { + status = 'No referenced links found'; + render(); + return; + } + modalOpen = true; + const width = Math.min(92, Math.max(48, Math.max(...links.map((url) => url.length)) + 8)); + const height = Math.min(Number(widgets.screen.height) - 4, Math.max(5, links.length + 2)); + const picker = blessed.list({ + parent: widgets.screen, + border: 'line', + label: mode === 'open' ? ' Open Link ' : ' Copy Link ', + top: 'center', + left: 'center', + width, + height, + tags: false, + keys: true, + vi: true, + mouse: true, + items: links.map((url, index) => formatLinkChoiceLabel(url, index)), + scrollbar: { ch: ' ' }, + style: { + border: { fg: '#fde74c' }, + selected: { bg: '#f7f7ff', fg: 'black', bold: true }, + item: { fg: 'white' }, + bg: '#101522', + }, + }); + + const closePicker = (): void => { + picker.destroy(); + modalOpen = false; + render(); + }; + picker.key(['escape', 'q'], closePicker); + picker.on('select', (_item, index) => { + const url = links[Number(index)]; + if (!url) { + closePicker(); + return; + } + if (mode === 'open') { + openUrl(url); + status = `Opened ${url}`; + } else { + status = copyTextToClipboard(url) ? 'Copied referenced link' : 'Clipboard copy failed'; + } + closePicker(); + }); + picker.focus(); + widgets.screen.render(); + }; + const clusterContextItems = (): ContextMenuItem[] => { const selectedCluster = clusterDetail; const title = selectedCluster ? splitClusterDisplayTitle(selectedCluster.displayTitle) : null; @@ -1383,6 +1445,10 @@ export function getThreadReferenceLinks(threadDetail: TuiThreadDetail | null): s ]).filter((url) => url !== threadDetail.thread.htmlUrl); } +export function formatLinkChoiceLabel(url: string, index: number): string { + return `${String(index + 1).padStart(2)} ${url}`; +} + function extractMarkdownLinks(markdown: string): string[] { const urls: string[] = []; for (const match of markdown.matchAll(/\[[^\]]+\]\((https?:\/\/[^)\s]+)\)/g)) { @@ -1494,6 +1560,12 @@ export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null ? [ { label: 'Open first body link', action: 'open-first-link' as const }, { label: 'Copy first body link', action: 'copy-first-link' as const }, + ...(referenceLinks.length > 1 + ? [ + { label: 'Open body link...', action: 'open-link-picker' as const }, + { label: 'Copy body link...', action: 'copy-link-picker' as const }, + ] + : []), ] : []), { label: 'Load neighbors', action: 'load-neighbors' }, From 1c35c83c8d8c06206fff1ccb16ccee0a888dd5ed Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:17:19 -0700 Subject: [PATCH 101/215] feat(tui): add compact detail actions --- apps/cli/src/tui/app.test.ts | 62 +++++++++++++++++++++++++++++++++ apps/cli/src/tui/app.ts | 67 +++++++++++++++++++++++++++++++++--- 2 files changed, 125 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index ac8fba6..ffb8b6e 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -12,7 +12,9 @@ import { formatClusterListLabel, formatClusterShortName, formatLinkChoiceLabel, + formatSummariesForClipboard, getThreadReferenceLinks, + limitRenderedLines, getRepositoryChoices, parseOwnerRepoValue, renderMarkdownForTerminal, @@ -90,6 +92,51 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main{/bold}')); }); +test('renderDetailPane can compact very long bodies', () => { + const cluster: TuiClusterDetail = { + clusterId: 1, + displayTitle: 'Cluster 1', + isClosed: false, + closedAtLocal: null, + closeReasonLocal: null, + totalCount: 1, + issueCount: 1, + pullRequestCount: 0, + latestUpdatedAt: '2026-03-09T00:00:00Z', + representativeThreadId: 1, + representativeNumber: 42, + representativeKind: 'issue', + members: [], + }; + const detail: TuiThreadDetail = { + thread: { + id: 1, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Long body', + body: Array.from({ length: 24 }, (_value, index) => `line ${index + 1}`).join('\n'), + authorLogin: 'dev', + htmlUrl: 'https://example.com/42', + labels: [], + updatedAtGh: '2026-03-09T00:00:00Z', + clusterId: 1, + }, + summaries: {}, + neighbors: [], + }; + + const rendered = renderDetailPane(detail, cluster, 'detail', null, 'compact'); + assert.match(rendered, /line 18/); + assert.doesNotMatch(rendered, /line 24/); + assert.match(rendered, /6 more line/); +}); + test('renderDetailPane gives useful empty detail content before a cluster is selected', () => { const rendered = renderDetailPane(null, null, 'clusters'); @@ -190,6 +237,21 @@ test('renderSummarySections orders and labels LLM summaries for scanning', () => assert.match(rendered, /\{bold\}cron\{\/bold\} timeout/); }); +test('formatSummariesForClipboard preserves ordered raw summary text', () => { + assert.equal( + formatSummariesForClipboard({ + dedupe_summary: 'cluster', + problem_summary: 'purpose', + }), + 'Purpose:\npurpose\n\nCluster signal:\ncluster', + ); +}); + +test('limitRenderedLines truncates long rendered sections with an affordance', () => { + assert.equal(limitRenderedLines('a\nb\nc', 2), 'a\nb\n{gray-fg}... 1 more line(s). Use full detail or copy body to inspect all content.{/gray-fg}'); + assert.equal(limitRenderedLines('a\nb', 2), 'a\nb'); +}); + test('buildThreadContextMenuItems exposes thread actions for right-click menus', () => { const items = buildThreadContextMenuItems({ thread: { diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 19037cc..5b2044f 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -92,6 +92,8 @@ type ContextMenuItem = { run: () => boolean | void; }; +type DetailMode = 'full' | 'compact'; + export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { const term = env.TERM; if (!term) { @@ -142,6 +144,7 @@ export async function startTui(params: StartTuiParams): Promise { }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; let memberSortMode: TuiMemberSortMode = initialPreference.memberSortMode; + let detailMode: DetailMode = 'full'; let minSize: TuiMinSizeFilter = initialPreference.minClusterSize; let wideLayout: TuiWideLayoutPreference = initialPreference.wideLayout; let showClosed = true; @@ -379,7 +382,7 @@ export async function startTui(params: StartTuiParams): Promise { isRendering = false; } - widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane, snapshot)); + widgets.detail.setContent(renderDetailPane(threadDetail, clusterDetail, focusPane, snapshot, detailMode)); updatePaneStyles(widgets, focusPane); const footerLines = [ activityLines.at(-1) ?? status, @@ -637,6 +640,12 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; + const toggleDetailMode = (): void => { + detailMode = detailMode === 'full' ? 'compact' : 'full'; + status = `Detail mode: ${detailMode}`; + render(); + }; + const openContextMenu = (label: string, items: ContextMenuItem[], event?: MouseEventArg): void => { if (modalOpen || items.length === 0) { return; @@ -691,7 +700,8 @@ export async function startTui(params: StartTuiParams): Promise { if (!selectedThread) { return [{ label: 'Close', run: () => undefined }]; } - return buildThreadContextMenuItems(threadDetail).map((item) => ({ + return [ + ...buildThreadContextMenuItems(threadDetail).map((item) => ({ label: item.label, run: () => { if (item.action === 'open') { @@ -725,7 +735,38 @@ export async function startTui(params: StartTuiParams): Promise { focusPane = 'detail'; } }, - })); + })), + ...detailCopyContextItems(), + ]; + }; + + const detailCopyContextItems = (): ContextMenuItem[] => { + if (!threadDetail) return []; + return [ + { + label: detailMode === 'full' ? 'Use compact detail' : 'Use full detail', + run: toggleDetailMode, + }, + { + label: 'Copy body', + run: () => { + status = copyTextToClipboard(threadDetail?.thread.body ?? '') ? 'Copied body' : 'Clipboard copy failed'; + }, + }, + { + label: 'Copy summaries', + run: () => { + status = copyTextToClipboard(formatSummariesForClipboard(threadDetail?.summaries ?? {})) ? 'Copied summaries' : 'Clipboard copy failed'; + }, + }, + { + label: 'Copy links', + run: () => { + const links = getThreadReferenceLinks(threadDetail); + status = links.length > 0 ? (copyTextToClipboard(links.join('\n')) ? 'Copied links' : 'Clipboard copy failed') : 'No referenced links found'; + }, + }, + ]; }; const openLinkPicker = (mode: 'open' | 'copy'): void => { @@ -1327,6 +1368,7 @@ export function renderDetailPane( clusterDetail: TuiClusterDetail | null, focusPane: TuiFocusPane, snapshot?: TuiSnapshot | null, + detailMode: DetailMode = 'full', ): string { if (!clusterDetail) { const repoLabel = snapshot?.repository.fullName ?? 'No repository selected'; @@ -1374,7 +1416,7 @@ export function renderDetailPane( : focusPane === 'detail' ? 'No neighbors available.' : 'Neighbors load when the detail pane is focused.'; - const body = renderMarkdownForTerminal(thread.body ?? '(no body)'); + const body = limitRenderedLines(renderMarkdownForTerminal(thread.body ?? '(no body)'), detailMode === 'compact' ? 18 : 240); const referenceLinks = getThreadReferenceLinks(threadDetail); const linksSection = referenceLinks.length > 0 ? `\n\n{bold}Links{/bold}\n${referenceLinks.map((url, index) => `${index + 1}. ${escapeBlessedText(url)}`).join('\n')}` : ''; @@ -1437,6 +1479,15 @@ export function renderMarkdownForTerminal(markdown: string): string { return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); } +export function limitRenderedLines(value: string, maxLines: number): string { + const lines = value.split('\n'); + if (lines.length <= maxLines) { + return value; + } + const omitted = lines.length - maxLines; + return `${lines.slice(0, maxLines).join('\n')}\n{gray-fg}... ${omitted} more line(s). Use full detail or copy body to inspect all content.{/gray-fg}`; +} + export function getThreadReferenceLinks(threadDetail: TuiThreadDetail | null): string[] { if (!threadDetail) return []; return uniqueStrings([ @@ -1487,6 +1538,14 @@ function formatSummaryLabel(key: SummaryKey): string { return 'Cluster signal'; } +export function formatSummariesForClipboard(summaries: TuiThreadDetail['summaries']): string { + return SUMMARY_SECTION_ORDER.flatMap((key) => { + const value = summaries[key]; + if (!value) return []; + return [`${formatSummaryLabel(key)}:\n${value}`]; + }).join('\n\n'); +} + type InlineMarkdownSegment = | { kind: 'text'; value: string } | { kind: 'link'; label: string; url: string }; From f1ca3b1ffa7200dddb61d7c756884902733c0ce3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:19:38 -0700 Subject: [PATCH 102/215] feat(tui): add local cluster actions --- apps/cli/src/tui/app.ts | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 5b2044f..669384d 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -646,6 +646,18 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; + const runLocalMutation = (action: () => { message?: string }): void => { + try { + const result = action(); + status = result.message ?? 'Updated local state'; + clearCaches(); + refreshAll(true); + } catch (error) { + status = formatTuiError(error); + render(); + } + }; + const openContextMenu = (label: string, items: ContextMenuItem[], event?: MouseEventArg): void => { if (modalOpen || items.length === 0) { return; @@ -737,6 +749,43 @@ export async function startTui(params: StartTuiParams): Promise { }, })), ...detailCopyContextItems(), + { + label: 'Close thread locally', + run: () => + runLocalMutation(() => + params.service.closeThreadLocally({ + owner: currentRepository.owner, + repo: currentRepository.repo, + threadNumber: selectedThread.number, + }), + ), + }, + { + label: 'Remove from durable cluster', + run: () => + runLocalMutation(() => + params.service.excludeThreadFromCluster({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, + threadNumber: selectedThread.number, + reason: 'TUI manual remove', + }), + ), + }, + { + label: 'Set as durable canonical', + run: () => + runLocalMutation(() => + params.service.setClusterCanonicalThread({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, + threadNumber: selectedThread.number, + reason: 'TUI manual canonical', + }), + ), + }, ]; }; @@ -846,6 +895,21 @@ export async function startTui(params: StartTuiParams): Promise { }, ] : []), + ...(selectedCluster + ? [ + { + label: 'Close cluster locally', + run: () => + runLocalMutation(() => + params.service.closeClusterLocally({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: selectedCluster.clusterId, + }), + ), + }, + ] + : []), { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, { label: 'Member sort grouped', run: () => setMemberSortMode('kind') }, From 3144a4d46cf1345d278090a374a39b826dae60b8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:28:59 -0700 Subject: [PATCH 103/215] fix(tui): harden modal dismissal and confirmations --- apps/cli/src/tui/app.ts | 250 +++++++++++++++++++++++++++++++++------- 1 file changed, 211 insertions(+), 39 deletions(-) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 669384d..3e3b5b5 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -163,6 +163,7 @@ export async function startTui(params: StartTuiParams): Promise { const clusterDetailCache = new Map(); const threadDetailCache = new Map(); let modalOpen = false; + let dismissModal: (() => void) | null = null; let suppressNextClusterSelect = false; let suppressNextMemberSelect = false; @@ -173,6 +174,25 @@ export async function startTui(params: StartTuiParams): Promise { const formatTuiError = (error: unknown): string => (error instanceof Error ? error.message : String(error)); + const clearModal = (): void => { + modalOpen = false; + dismissModal = null; + }; + + const dismissActiveModal = (): boolean => { + if (!modalOpen) { + return false; + } + const dismiss = dismissModal; + if (dismiss) { + dismiss(); + return true; + } + clearModal(); + render(); + return true; + }; + const rebuildClusterItems = (): void => { if (!snapshot) { clusterItems = ['Pick a repository with p']; @@ -569,12 +589,26 @@ export async function startTui(params: StartTuiParams): Promise { bg: '#101522', }, }); + let closed = false; + const closePrompt = (): boolean => { + if (closed) return false; + closed = true; + prompt.destroy(); + clearModal(); + render(); + return true; + }; + dismissModal = () => { + closePrompt(); + }; prompt.input('Filter clusters', search, (_error, value) => { + if (closed) return; + closed = true; search = (value ?? '').trim(); status = search ? `Filter: ${search}` : 'Filter cleared'; refreshAll(false); prompt.destroy(); - modalOpen = false; + clearModal(); updateFocus('clusters'); }); }; @@ -598,9 +632,23 @@ export async function startTui(params: StartTuiParams): Promise { bg: '#101522', }, }); + let closed = false; + const closePrompt = (): boolean => { + if (closed) return false; + closed = true; + prompt.destroy(); + clearModal(); + render(); + return true; + }; + dismissModal = () => { + closePrompt(); + }; prompt.input('Issue or PR number', '', (_error, value) => { + if (closed) return; + closed = true; prompt.destroy(); - modalOpen = false; + clearModal(); const parsed = Number((value ?? '').trim()); if (!Number.isInteger(parsed) || parsed <= 0) { status = 'Enter a positive issue or PR number'; @@ -658,6 +706,60 @@ export async function startTui(params: StartTuiParams): Promise { } }; + const confirmMutation = (message: string, action: () => { message?: string }): void => { + if (modalOpen) return; + modalOpen = true; + const box = blessed.box({ + parent: widgets.screen, + border: 'line', + label: ' Confirm ', + top: 'center', + left: 'center', + width: '62%', + height: 7, + tags: true, + mouse: true, + content: `${message}\n\nDefault: no. Press y to confirm, Enter/Esc/n/q to cancel.`, + style: { + border: { fg: '#fde74c' }, + fg: 'white', + bg: '#101522', + }, + }); + let closed = false; + const closeConfirm = (confirmed: boolean): void => { + if (closed) return; + closed = true; + widgets.screen.off('keypress', handleKeypress); + box.destroy(); + clearModal(); + if (confirmed) { + runLocalMutation(action); + return; + } + status = 'Cancelled'; + render(); + }; + const handleKeypress = (_char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { + if (key.name === 'y') { + closeConfirm(true); + return; + } + if (key.name === 'enter' || key.name === 'escape' || key.name === 'n' || key.name === 'q') { + closeConfirm(false); + } + }; + box.on('mousedown', (event: MouseEventArg) => { + if (event.button === 'right') { + closeConfirm(false); + } + }); + dismissModal = () => closeConfirm(false); + widgets.screen.on('keypress', handleKeypress); + box.focus(); + widgets.screen.render(); + }; + const openContextMenu = (label: string, items: ContextMenuItem[], event?: MouseEventArg): void => { if (modalOpen || items.length === 0) { return; @@ -689,12 +791,21 @@ export async function startTui(params: StartTuiParams): Promise { }, }); + let closed = false; const closeMenu = (): void => { + if (closed) return; + closed = true; menu.destroy(); - modalOpen = false; + clearModal(); render(); }; + dismissModal = closeMenu; menu.key(['escape', 'q'], closeMenu); + menu.on('mousedown', (mouseEvent: MouseEventArg) => { + if (mouseEvent.button === 'right') { + closeMenu(); + } + }); menu.on('select', (_item, index) => { const item = items[Number(index)]; closeMenu(); @@ -752,38 +863,44 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Close thread locally', run: () => - runLocalMutation(() => - params.service.closeThreadLocally({ - owner: currentRepository.owner, - repo: currentRepository.repo, - threadNumber: selectedThread.number, - }), + confirmMutation( + `Close ${selectedThread.kind === 'pull_request' ? 'PR' : 'issue'} #${selectedThread.number} locally? This hides it from active sync/cluster views.`, + () => + params.service.closeThreadLocally({ + owner: currentRepository.owner, + repo: currentRepository.repo, + threadNumber: selectedThread.number, + }), ), }, { label: 'Remove from durable cluster', run: () => - runLocalMutation(() => - params.service.excludeThreadFromCluster({ - owner: currentRepository.owner, - repo: currentRepository.repo, - clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, - threadNumber: selectedThread.number, - reason: 'TUI manual remove', - }), + confirmMutation( + `Remove #${selectedThread.number} from durable cluster ${clusterDetail?.clusterId ?? selectedThread.clusterId ?? '?'}? Future clustering should respect this override.`, + () => + params.service.excludeThreadFromCluster({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, + threadNumber: selectedThread.number, + reason: 'TUI manual remove', + }), ), }, { label: 'Set as durable canonical', run: () => - runLocalMutation(() => - params.service.setClusterCanonicalThread({ - owner: currentRepository.owner, - repo: currentRepository.repo, - clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, - threadNumber: selectedThread.number, - reason: 'TUI manual canonical', - }), + confirmMutation( + `Set #${selectedThread.number} as canonical for durable cluster ${clusterDetail?.clusterId ?? selectedThread.clusterId ?? '?'}?`, + () => + params.service.setClusterCanonicalThread({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: clusterDetail?.clusterId ?? selectedThread.clusterId ?? 0, + threadNumber: selectedThread.number, + reason: 'TUI manual canonical', + }), ), }, ]; @@ -850,12 +967,21 @@ export async function startTui(params: StartTuiParams): Promise { }, }); + let closed = false; const closePicker = (): void => { + if (closed) return; + closed = true; picker.destroy(); - modalOpen = false; + clearModal(); render(); }; + dismissModal = closePicker; picker.key(['escape', 'q'], closePicker); + picker.on('mousedown', (mouseEvent: MouseEventArg) => { + if (mouseEvent.button === 'right') { + closePicker(); + } + }); picker.on('select', (_item, index) => { const url = links[Number(index)]; if (!url) { @@ -900,12 +1026,14 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Close cluster locally', run: () => - runLocalMutation(() => - params.service.closeClusterLocally({ - owner: currentRepository.owner, - repo: currentRepository.repo, - clusterId: selectedCluster.clusterId, - }), + confirmMutation( + `Close cluster ${selectedCluster.clusterId} locally? Default is no.`, + () => + params.service.closeClusterLocally({ + owner: currentRepository.owner, + repo: currentRepository.repo, + clusterId: selectedCluster.clusterId, + }), ), }, ] @@ -955,13 +1083,13 @@ export async function startTui(params: StartTuiParams): Promise { await promptHelp(widgets.screen); render(); } finally { - modalOpen = false; + clearModal(); } })(); }; const requestQuit = (): void => { - if (modalOpen) return; + if (dismissActiveModal()) return; widgets.screen.destroy(); }; @@ -1081,7 +1209,7 @@ export async function startTui(params: StartTuiParams): Promise { status = 'Repository action failed'; pushActivity(`[repo] action failed: ${formatTuiError(error)}`); } finally { - modalOpen = false; + clearModal(); } })(); }; @@ -1123,7 +1251,7 @@ export async function startTui(params: StartTuiParams): Promise { pushActivity(`[repo] selection failed: ${formatTuiError(error)}`); return false; } finally { - modalOpen = false; + clearModal(); } }; @@ -1131,7 +1259,10 @@ export async function startTui(params: StartTuiParams): Promise { requestQuit(); }); widgets.screen.key(['C-c'], () => { - requestQuit(); + widgets.screen.destroy(); + }); + widgets.screen.key(['escape'], () => { + dismissActiveModal(); }); widgets.screen.key(['tab', 'right'], () => { if (modalOpen) return; @@ -1323,6 +1454,11 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen || event.button !== 'right') return; openContextMenu('ghcrawl', globalContextItems(), event); }); + widgets.screen.on('mousedown', (event: MouseEventArg) => { + if (event.button === 'right' && modalOpen && dismissModal) { + dismissActiveModal(); + } + }); widgets.screen.on('resize', () => render()); widgets.screen.on('destroy', () => { @@ -1828,8 +1964,12 @@ async function promptHelp(screen: blessed.Widgets.Screen): Promise { screen.render(); return await new Promise((resolve) => { + let closed = false; const finish = (): void => { + if (closed) return; + closed = true; screen.off('keypress', handleKeypress); + screen.off('mousedown', handleMouse); box.destroy(); help.destroy(); screen.render(); @@ -1860,8 +2000,14 @@ async function promptHelp(screen: blessed.Widgets.Screen): Promise { screen.render(); } }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(); + } + }; screen.on('keypress', handleKeypress); + screen.on('mousedown', handleMouse); }); } @@ -1918,8 +2064,12 @@ async function promptRepositoryChoice( screen.render(); return await new Promise((resolve) => { + let closed = false; const teardown = (): void => { + if (closed) return; + closed = true; screen.off('keypress', handleKeypress); + screen.off('mousedown', handleMouse); box.destroy(); help.destroy(); screen.render(); @@ -1941,8 +2091,14 @@ async function promptRepositoryChoice( } } }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(null); + } + }; screen.on('keypress', handleKeypress); + screen.on('mousedown', handleMouse); box.on('select', (_item, index) => finish(choices[index] ?? null)); }); } @@ -1966,10 +2122,26 @@ async function promptRepositoryInput(screen: blessed.Widgets.Screen): Promise((resolve) => { - prompt.input('Repository to open (owner/repo)', '', (_error, value) => { + let closed = false; + const finish = (value: RepositoryTarget | null): void => { + if (closed) return; + closed = true; + screen.off('mousedown', handleMouse); prompt.destroy(); + screen.render(); + resolve(value); + }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(null); + } + }; + + screen.on('mousedown', handleMouse); + prompt.key(['escape'], () => finish(null)); + prompt.input('Repository to open (owner/repo)', '', (_error, value) => { const parsed = parseOwnerRepoValue((value ?? '').trim()); - resolve(parsed); + finish(parsed); }); }); } From 07f45aeb93e15ce1cb40405917df5e9ae14a2969 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:31:07 -0700 Subject: [PATCH 104/215] perf(db): bound sqlite wal growth and busy waits --- packages/api-core/src/db/migrate.test.ts | 12 +++++++++ packages/api-core/src/db/migrate.ts | 7 ++++++ packages/api-core/src/db/sqlite.ts | 31 ++++++++++++++++++++++-- packages/api-core/src/service.ts | 3 ++- 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/packages/api-core/src/db/migrate.test.ts b/packages/api-core/src/db/migrate.test.ts index cc71748..afbff34 100644 --- a/packages/api-core/src/db/migrate.test.ts +++ b/packages/api-core/src/db/migrate.test.ts @@ -47,3 +47,15 @@ test('migrate creates core tables', () => { db.close(); } }); + +test('openDb applies bounded WAL and concurrency pragmas', () => { + const db = openDb(':memory:'); + try { + assert.equal(db.pragma('foreign_keys', { simple: true }), 1); + assert.equal(db.pragma('busy_timeout', { simple: true }), 5000); + assert.equal(db.pragma('temp_store', { simple: true }), 2); + assert.equal(db.pragma('cache_size', { simple: true }), -65536); + } finally { + db.close(); + } +}); diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index 1fb744d..614c6eb 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -505,6 +505,8 @@ export function migrate(db: SqliteDatabase): void { } db.exec('create index if not exists idx_threads_repo_number on threads(repo_id, number)'); + db.exec('create index if not exists idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local)'); + db.exec('create index if not exists idx_threads_repo_updated on threads(repo_id, updated_at)'); db.exec('create index if not exists idx_blobs_sha256 on blobs(sha256)'); db.exec('create index if not exists idx_thread_revisions_thread_created on thread_revisions(thread_id, created_at)'); db.exec('create index if not exists idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash)'); @@ -516,14 +518,19 @@ export function migrate(db: SqliteDatabase): void { db.exec('create index if not exists idx_document_summaries_thread_model on document_summaries(thread_id, model)'); db.exec('create index if not exists idx_thread_vectors_basis_model on thread_vectors(basis, model)'); db.exec('create index if not exists idx_pipeline_runs_repo_kind_id on pipeline_runs(repo_id, run_kind, id)'); + db.exec('create index if not exists idx_sync_runs_repo_status_id on sync_runs(repo_id, status, id)'); + db.exec('create index if not exists idx_embedding_runs_repo_status_id on embedding_runs(repo_id, status, id)'); db.exec('create index if not exists idx_cluster_runs_repo_status_id on cluster_runs(repo_id, status, id)'); db.exec('create index if not exists idx_clusters_repo_run_id on clusters(repo_id, cluster_run_id, id)'); + db.exec('create index if not exists idx_clusters_repo_closed on clusters(repo_id, closed_at_local)'); db.exec('create index if not exists idx_cluster_members_thread_cluster on cluster_members(thread_id, cluster_id)'); db.exec('create index if not exists idx_similarity_edge_evidence_repo_pair on similarity_edge_evidence(repo_id, left_thread_id, right_thread_id)'); db.exec('create index if not exists idx_similarity_edge_evidence_repo_state_score on similarity_edge_evidence(repo_id, state, tier, score)'); db.exec('create index if not exists idx_cluster_groups_repo_status on cluster_groups(repo_id, status)'); + db.exec('create index if not exists idx_cluster_groups_repo_updated on cluster_groups(repo_id, updated_at)'); db.exec('create index if not exists idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state)'); db.exec('create index if not exists idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state)'); + db.exec('create index if not exists idx_cluster_memberships_cluster_updated on cluster_memberships(cluster_id, updated_at)'); db.exec('create index if not exists idx_cluster_overrides_repo_target on cluster_overrides(repo_id, cluster_id, thread_id, action)'); db.exec('create index if not exists idx_cluster_events_cluster_created on cluster_events(cluster_id, created_at)'); } diff --git a/packages/api-core/src/db/sqlite.ts b/packages/api-core/src/db/sqlite.ts index 5fd6596..120bea0 100644 --- a/packages/api-core/src/db/sqlite.ts +++ b/packages/api-core/src/db/sqlite.ts @@ -5,10 +5,37 @@ import BetterSqlite3 from 'better-sqlite3'; export type SqliteDatabase = InstanceType; +const BUSY_TIMEOUT_MS = 5_000; +const CACHE_SIZE_KIB = 64 * 1024; +const WAL_AUTOCHECKPOINT_PAGES = 1_000; +const JOURNAL_SIZE_LIMIT_BYTES = 64 * 1024 * 1024; +const MMAP_SIZE_BYTES = 256 * 1024 * 1024; + export function openDb(dbPath: string): SqliteDatabase { fs.mkdirSync(path.dirname(dbPath), { recursive: true }); const db = new BetterSqlite3(dbPath); - db.pragma('journal_mode = WAL'); - db.pragma('foreign_keys = ON'); + configureDb(db, { persistent: dbPath !== ':memory:' }); return db; } + +export function configureDb(db: SqliteDatabase, options: { persistent: boolean }): void { + db.pragma(`busy_timeout = ${BUSY_TIMEOUT_MS}`); + if (options.persistent) { + db.pragma('journal_mode = WAL'); + db.pragma('synchronous = NORMAL'); + db.pragma(`wal_autocheckpoint = ${WAL_AUTOCHECKPOINT_PAGES}`); + db.pragma(`journal_size_limit = ${JOURNAL_SIZE_LIMIT_BYTES}`); + db.pragma(`mmap_size = ${MMAP_SIZE_BYTES}`); + } + db.pragma('foreign_keys = ON'); + db.pragma('temp_store = MEMORY'); + db.pragma(`cache_size = -${CACHE_SIZE_KIB}`); +} + +export function checkpointWal(db: SqliteDatabase): void { + try { + db.pragma('wal_checkpoint(PASSIVE)'); + } catch { + // Other processes may hold the WAL; SQLite will checkpoint on a later connection. + } +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 863a3dc..51077ea 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -98,7 +98,7 @@ import { type GitcrawlConfig, } from './config.js'; import { migrate } from './db/migrate.js'; -import { openDb, type SqliteDatabase } from './db/sqlite.js'; +import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; import { readTextBlob, storeTextBlob } from './db/blob-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; @@ -651,6 +651,7 @@ export class GHCrawlService { close(): void { this.vectorStore.close(); + checkpointWal(this.db); this.db.close(); } From adb45e4db15478af2b1910b96325bd2ef08d25b9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:33:23 -0700 Subject: [PATCH 105/215] feat(tui): auto-refresh external database updates --- apps/cli/src/tui/app.ts | 47 +++++++++++++++++++++++ packages/api-core/src/service.ts | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 3e3b5b5..189424f 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -7,6 +7,7 @@ import type { TuiClusterDetail, TuiClusterSummary, TuiClusterSortMode, + TuiRefreshState, TuiSnapshot, TuiThreadDetail, TuiWideLayoutPreference, @@ -126,6 +127,7 @@ const CLUSTER_NAME_START = CLUSTER_COUNT_WIDTH + CLUSTER_COLUMN_GAP; const CLUSTER_TITLE_START = CLUSTER_NAME_START + CLUSTER_NAME_WIDTH + CLUSTER_COLUMN_GAP; const CLUSTER_MIX_START = CLUSTER_TITLE_START + CLUSTER_TITLE_WIDTH + CLUSTER_COLUMN_GAP; const CLUSTER_UPDATED_START = CLUSTER_MIX_START + CLUSTER_MIX_WIDTH + CLUSTER_COLUMN_GAP; +const TUI_AUTO_REFRESH_INTERVAL_MS = 15_000; export async function startTui(params: StartTuiParams): Promise { const selectedRepository = params.owner && params.repo ? { owner: params.owner, repo: params.repo } : null; @@ -166,6 +168,7 @@ export async function startTui(params: StartTuiParams): Promise { let dismissModal: (() => void) | null = null; let suppressNextClusterSelect = false; let suppressNextMemberSelect = false; + let lastRefreshState: TuiRefreshState | null = null; const clearCaches = (): void => { clusterDetailCache.clear(); @@ -310,6 +313,10 @@ export async function startTui(params: StartTuiParams): Promise { search, includeClosedClusters: showClosed, }); + lastRefreshState = params.service.getTuiRefreshState({ + owner: currentRepository.owner, + repo: currentRepository.repo, + }); selectedClusterId = preserveSelectedId(snapshot.clusters.map((cluster) => cluster.clusterId), previousClusterId); rebuildClusterItems(); @@ -351,6 +358,28 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; + const autoRefreshIfChanged = (): void => { + if (!currentRepository.owner || !currentRepository.repo || modalOpen || isRendering) { + return; + } + try { + const nextState = params.service.getTuiRefreshState({ + owner: currentRepository.owner, + repo: currentRepository.repo, + }); + if (lastRefreshState && formatTuiRefreshStateKey(nextState) !== formatTuiRefreshStateKey(lastRefreshState)) { + refreshAll(true); + status = 'External DB update detected; refreshed'; + render(); + return; + } + lastRefreshState = nextState; + } catch (error) { + status = `Auto-refresh failed: ${formatTuiError(error)}`; + render(); + } + }; + const updateFocus = (nextFocus: TuiFocusPane): void => { focusPane = nextFocus; if (focusPane === 'detail' && selectedMemberThreadId !== null) { @@ -1461,7 +1490,11 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.screen.on('resize', () => render()); + const autoRefreshTimer = setInterval(autoRefreshIfChanged, TUI_AUTO_REFRESH_INTERVAL_MS); + autoRefreshTimer.unref?.(); + widgets.screen.on('destroy', () => { + clearInterval(autoRefreshTimer); widgets.screen.program.showCursor(); }); @@ -1643,6 +1676,20 @@ export function escapeBlessedText(value: string): string { return value.replace(/\\/g, '\\\\').replace(/\{/g, '\\{').replace(/\}/g, '\\}'); } +function formatTuiRefreshStateKey(state: TuiRefreshState): string { + return [ + state.repositoryUpdatedAt ?? '', + state.threadUpdatedAt ?? '', + state.threadClosedAt ?? '', + state.clusterClosedAt ?? '', + state.durableClusterUpdatedAt ?? '', + state.durableMembershipUpdatedAt ?? '', + state.latestSyncRunId ?? '', + state.latestEmbeddingRunId ?? '', + state.latestClusterRunId ?? '', + ].join('|'); +} + export function splitClusterDisplayTitle(displayTitle: string): { name: string; title: string } { const match = displayTitle.match(/^([a-z]+(?:-[a-z]+){2})\s{2,}(.+)$/); if (match) { diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 51077ea..2611e63 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -370,6 +370,18 @@ export type TuiSnapshot = { clusters: TuiClusterSummary[]; }; +export type TuiRefreshState = { + repositoryUpdatedAt: string | null; + threadUpdatedAt: string | null; + threadClosedAt: string | null; + clusterClosedAt: string | null; + durableClusterUpdatedAt: string | null; + durableMembershipUpdatedAt: string | null; + latestSyncRunId: number | null; + latestEmbeddingRunId: number | null; + latestClusterRunId: number | null; +}; + export type DoctorResult = { health: HealthResponse; github: { @@ -3319,6 +3331,60 @@ export class GHCrawlService { }; } + getTuiRefreshState(params: { owner: string; repo: string }): TuiRefreshState { + const repository = this.requireRepository(params.owner, params.repo); + const threadState = this.db + .prepare( + `select + max(updated_at) as thread_updated_at, + max(closed_at_local) as thread_closed_at + from threads + where repo_id = ?`, + ) + .get(repository.id) as { thread_updated_at: string | null; thread_closed_at: string | null }; + const clusterState = this.db + .prepare( + `select max(closed_at_local) as cluster_closed_at + from clusters + where repo_id = ?`, + ) + .get(repository.id) as { cluster_closed_at: string | null }; + const durableClusterState = this.db + .prepare( + `select max(updated_at) as durable_cluster_updated_at + from cluster_groups + where repo_id = ?`, + ) + .get(repository.id) as { durable_cluster_updated_at: string | null }; + const durableMembershipState = this.db + .prepare( + `select max(cm.updated_at) as durable_membership_updated_at + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + where cg.repo_id = ?`, + ) + .get(repository.id) as { durable_membership_updated_at: string | null }; + const latestSync = this.db + .prepare("select id from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(repository.id) as { id: number } | undefined; + const latestEmbedding = this.db + .prepare("select id from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(repository.id) as { id: number } | undefined; + const latestClusterRun = this.getLatestClusterRun(repository.id); + + return { + repositoryUpdatedAt: repository.updatedAt, + threadUpdatedAt: threadState.thread_updated_at, + threadClosedAt: threadState.thread_closed_at, + clusterClosedAt: clusterState.cluster_closed_at, + durableClusterUpdatedAt: durableClusterState.durable_cluster_updated_at, + durableMembershipUpdatedAt: durableMembershipState.durable_membership_updated_at, + latestSyncRunId: latestSync?.id ?? null, + latestEmbeddingRunId: latestEmbedding?.id ?? null, + latestClusterRunId: latestClusterRun?.id ?? null, + }; + } + getTuiClusterDetail(params: { owner: string; repo: string; clusterId: number; clusterRunId?: number }): TuiClusterDetail { const repository = this.requireRepository(params.owner, params.repo); const clusterRunId = From 8443107e8eb61bb217dcc95c3c339c36d3a489a4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 15:44:07 -0700 Subject: [PATCH 106/215] fix(cli): pin launcher node for native sqlite deps --- .node-version | 1 + apps/cli/bin/ghcrawl.js | 45 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 .node-version diff --git a/.node-version b/.node-version new file mode 100644 index 0000000..442c758 --- /dev/null +++ b/.node-version @@ -0,0 +1 @@ +22.20.0 diff --git a/apps/cli/bin/ghcrawl.js b/apps/cli/bin/ghcrawl.js index e34afd0..fb3f8a5 100755 --- a/apps/cli/bin/ghcrawl.js +++ b/apps/cli/bin/ghcrawl.js @@ -1,13 +1,54 @@ #!/usr/bin/env node -import { existsSync } from 'node:fs'; -import { spawn } from 'node:child_process'; +import { existsSync, readFileSync } from 'node:fs'; +import { spawn, spawnSync } from 'node:child_process'; import { createRequire } from 'node:module'; import path from 'node:path'; import { fileURLToPath, pathToFileURL } from 'node:url'; const binDir = path.dirname(fileURLToPath(import.meta.url)); +const repoRoot = path.resolve(binDir, '..', '..', '..'); const distEntrypoint = path.join(binDir, '..', 'dist', 'main.js'); const sourceEntrypoint = path.join(binDir, '..', 'src', 'main.ts'); +const nodeVersionPath = path.join(repoRoot, '.node-version'); + +if (!process.env.GHCRAWL_NODE_REEXEC && existsSync(nodeVersionPath)) { + const desiredNodeVersion = readFileSync(nodeVersionPath, 'utf8').trim(); + if (desiredNodeVersion) { + const nodenvResult = spawnSync('nodenv', ['which', 'node'], { + encoding: 'utf8', + env: { + ...process.env, + NODENV_VERSION: desiredNodeVersion, + }, + }); + const nodenvNode = nodenvResult.status === 0 ? nodenvResult.stdout.trim() : ''; + if (nodenvNode && path.resolve(nodenvNode) !== path.resolve(process.execPath)) { + const child = spawn(nodenvNode, process.argv.slice(1), { + stdio: 'inherit', + env: { + ...process.env, + GHCRAWL_NODE_REEXEC: '1', + NODENV_VERSION: desiredNodeVersion, + }, + }); + + child.on('exit', (code, signal) => { + if (signal) { + process.kill(process.pid, signal); + return; + } + process.exit(code ?? 0); + }); + + child.on('error', (error) => { + process.stderr.write(`${error instanceof Error ? error.message : String(error)}\n`); + process.exit(1); + }); + + await new Promise(() => undefined); + } + } +} if (!existsSync(sourceEntrypoint) && existsSync(distEntrypoint)) { const entrypoint = await import(pathToFileURL(distEntrypoint).href); From 5c83dc45448863206e337437b32042dd1341f064 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 16:45:10 -0700 Subject: [PATCH 107/215] feat(tui): restore mouse menus and richer previews --- apps/cli/src/tui/app.test.ts | 17 ++++- apps/cli/src/tui/app.ts | 103 +++++++++++++++++++++----- apps/cli/src/tui/state.test.ts | 4 +- apps/cli/src/tui/state.ts | 7 +- packages/api-core/src/service.test.ts | 30 ++++++++ packages/api-core/src/service.ts | 32 ++++++++ 6 files changed, 168 insertions(+), 25 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index ffb8b6e..b30eeab 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -68,6 +68,14 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e summaries: { dedupe_summary: 'Summary {yellow-fg}text{/yellow-fg}', }, + topFiles: [ + { + path: 'apps/cli/src/tui/app.ts', + status: 'modified', + additions: 10, + deletions: 2, + }, + ], neighbors: [ { threadId: 2, @@ -83,13 +91,15 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.match(rendered, /C1 \(#42 representative issue\)/); assert.match(rendered, /Bad \\{bold\\}title\\{\/bold\\}/); assert.match(rendered, /Cluster signal:/); - assert.match(rendered, /Main/); + assert.match(rendered, /Top files/); + assert.match(rendered, /apps\/cli\/src\/tui\/app\.ts/); + assert.match(rendered, /Main Preview/); assert.match(rendered, /Body with \\{red-fg\\}tags\\{\/red-fg\\}/); assert.match(rendered, /Links/); assert.match(rendered, /1\. https:\/\/example\.com\/body-link/); assert.match(rendered, /Summary \\{yellow-fg\\}text\\{\/yellow-fg\\}/); assert.match(rendered, /Neighbor \\{blue-fg\\}title\\{\/blue-fg\\}/); - assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main{/bold}')); + assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main Preview{/bold}')); }); test('renderDetailPane can compact very long bodies', () => { @@ -128,6 +138,7 @@ test('renderDetailPane can compact very long bodies', () => { clusterId: 1, }, summaries: {}, + topFiles: [], neighbors: [], }; @@ -273,6 +284,7 @@ test('buildThreadContextMenuItems exposes thread actions for right-click menus', clusterId: 1, }, summaries: {}, + topFiles: [], neighbors: [], }); @@ -320,6 +332,7 @@ test('getThreadReferenceLinks extracts unique body and summary links', () => { summaries: { dedupe_summary: 'same as https://example.com/raw and https://example.com/summary', }, + topFiles: [], neighbors: [], }); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 189424f..6d6faaa 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -209,7 +209,7 @@ export async function startTui(params: StartTuiParams): Promise { clusterItems.push(...snapshot.clusters.map((cluster, index) => { clusterIndexById.set(cluster.clusterId, index + CLUSTER_LIST_FIRST_ITEM_INDEX); const label = formatClusterListLabel(cluster); - return cluster.isClosed ? `{gray-fg}${escapeBlessedText(label)}{/gray-fg}` : escapeBlessedText(label); + return cluster.isClosed ? `{gray-fg}${escapeBlessedText(label)}{/gray-fg}` : `{green-fg}${escapeBlessedText(label)}{/green-fg}`; })); widgets.clusters.setItems(clusterItems); }; @@ -499,6 +499,34 @@ export async function startTui(params: StartTuiParams): Promise { moveSelection(delta, { steps: getFocusedListPageSize(), wrap: false }); }; + const jumpFocusedPaneToEdge = (edge: 'start' | 'end'): void => { + if (focusPane === 'detail') { + if (edge === 'start') { + widgets.detail.setScroll(0); + } else { + widgets.detail.setScrollPerc(100); + } + widgets.screen.render(); + return; + } + + if (focusPane === 'clusters') { + if (!snapshot || snapshot.clusters.length === 0) return; + selectClusterIndex(edge === 'start' ? CLUSTER_LIST_FIRST_ITEM_INDEX : snapshot.clusters.length); + return; + } + + if (focusPane === 'members') { + const selectable = memberRows + .map((row, index) => ({ row, index })) + .filter((item) => item.row.selectable); + const target = edge === 'start' ? selectable.at(0) : selectable.at(-1); + if (target) { + selectMemberIndex(target.index); + } + } + }; + const setSortMode = (nextSortMode: TuiClusterSortMode): void => { if (sortMode === nextSortMode) { return; @@ -1327,15 +1355,11 @@ export async function startTui(params: StartTuiParams): Promise { }); widgets.screen.key(['home'], () => { if (modalOpen) return; - if (focusPane !== 'detail') return; - widgets.detail.setScroll(0); - widgets.screen.render(); + jumpFocusedPaneToEdge('start'); }); widgets.screen.key(['end'], () => { if (modalOpen) return; - if (focusPane !== 'detail') return; - widgets.detail.setScrollPerc(100); - widgets.screen.render(); + jumpFocusedPaneToEdge('end'); }); widgets.screen.key(['enter'], () => { if (modalOpen) return; @@ -1428,6 +1452,18 @@ export async function startTui(params: StartTuiParams): Promise { } openContextMenu('Cluster', clusterContextItems(), event); }); + widgets.clusters.on('wheelup', () => { + if (isRendering || modalOpen) return; + focusPane = 'clusters'; + widgets.clusters.focus(); + moveSelection(-1, { wrap: false }); + }); + widgets.clusters.on('wheeldown', () => { + if (isRendering || modalOpen) return; + focusPane = 'clusters'; + widgets.clusters.focus(); + moveSelection(1, { wrap: false }); + }); widgets.members.on('select item', (_item, index) => { if (isRendering || modalOpen) return; if (suppressNextMemberSelect) { @@ -1466,6 +1502,18 @@ export async function startTui(params: StartTuiParams): Promise { } openContextMenu('Thread', threadContextItems(), event); }); + widgets.members.on('wheelup', () => { + if (isRendering || modalOpen) return; + focusPane = 'members'; + widgets.members.focus(); + moveSelection(-1, { wrap: false }); + }); + widgets.members.on('wheeldown', () => { + if (isRendering || modalOpen) return; + focusPane = 'members'; + widgets.members.focus(); + moveSelection(1, { wrap: false }); + }); widgets.detail.on('click', () => { if (modalOpen) return; updateFocus('detail'); @@ -1475,6 +1523,18 @@ export async function startTui(params: StartTuiParams): Promise { updateFocus('detail'); openContextMenu(threadDetail ? 'Thread' : clusterDetail ? 'Cluster' : 'ghcrawl', threadDetail ? threadContextItems() : clusterDetail ? clusterContextItems() : globalContextItems(), event); }); + widgets.detail.on('wheelup', () => { + if (modalOpen) return; + focusPane = 'detail'; + widgets.detail.focus(); + scrollDetail(-3); + }); + widgets.detail.on('wheeldown', () => { + if (modalOpen) return; + focusPane = 'detail'; + widgets.detail.focus(); + scrollDetail(3); + }); widgets.header.on('mousedown', (event: MouseEventArg) => { if (modalOpen || event.button !== 'right') return; openContextMenu('ghcrawl', globalContextItems(), event); @@ -1483,11 +1543,6 @@ export async function startTui(params: StartTuiParams): Promise { if (modalOpen || event.button !== 'right') return; openContextMenu('ghcrawl', globalContextItems(), event); }); - widgets.screen.on('mousedown', (event: MouseEventArg) => { - if (event.button === 'right' && modalOpen && dismissModal) { - dismissActiveModal(); - } - }); widgets.screen.on('resize', () => render()); const autoRefreshTimer = setInterval(autoRefreshIfChanged, TUI_AUTO_REFRESH_INTERVAL_MS); @@ -1556,7 +1611,6 @@ function createWidgets(owner: string, repo: string): Widgets { item: { fg: 'white' }, selected: { bg: '#9bc53d', fg: 'black', bold: true }, }, - scrollbar: { ch: ' ' }, }); const detail = blessed.box({ parent: screen, @@ -1567,7 +1621,6 @@ function createWidgets(owner: string, repo: string): Widgets { alwaysScroll: true, keys: false, mouse: true, - scrollbar: { ch: ' ' }, style: { border: { fg: '#fde74c' }, fg: 'white', @@ -1641,6 +1694,7 @@ export function renderDetailPane( ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() : '{bold}Closed:{/bold} no'; const summaries = renderSummarySections(threadDetail.summaries); + const topFiles = renderTopFiles(threadDetail.topFiles); const neighbors = threadDetail.neighbors.length > 0 ? threadDetail.neighbors @@ -1656,14 +1710,15 @@ export function renderDetailPane( return [ `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, - '', + '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, `{bold}Labels:{/bold} ${labels}`, `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, + topFiles ? `\n{bold}Top files{/bold}\n${topFiles}` : '', + summaries ? `\n{bold}LLM Summary{/bold}\n${summaries}` : '', '', - summaries ? `\n\n${summaries}` : '', - '', - `{bold}Main{/bold}`, + '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', + `{bold}Main Preview{/bold}`, body, linksSection, `\n\n{bold}Neighbors{/bold}\n${neighbors}`, @@ -1778,6 +1833,18 @@ export function renderSummarySections(summaries: TuiThreadDetail['summaries']): }).join('\n\n'); } +export function renderTopFiles(files: TuiThreadDetail['topFiles']): string { + if (files.length === 0) return ''; + return files + .slice(0, 5) + .map((file) => { + const churn = file.additions + file.deletions; + const status = file.status ? `${file.status} ` : ''; + return `- ${escapeBlessedText(file.path)} {gray-fg}${escapeBlessedText(status)}+${file.additions}/-${file.deletions} (${churn}){/gray-fg}`; + }) + .join('\n'); +} + function formatSummaryLabel(key: SummaryKey): string { if (key === 'problem_summary') return 'Purpose'; if (key === 'solution_summary') return 'Solution'; diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index ccc7362..83d0ad1 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -154,8 +154,8 @@ test('buildMemberRows groups issues and pull requests and selection skips header const rows = buildMemberRows(detail); assert.equal(rows[0]?.selectable, false); assert.match(rows[0]?.label ?? '', /number\s+state\s+updated\s+title/); - assert.match(rows[2]?.label ?? '', /#42\s+open\s+\d+d ago\s+Issue one/); - assert.match(rows[4]?.label ?? '', /closed\s+\d+d ago\s+Bug: PR one/); + assert.match(rows[2]?.label ?? '', /#42\s+\{green-fg\}open\{\/green-fg\}\s+\d+d ago\s+Issue one/); + assert.match(rows[4]?.label ?? '', /\{gray-fg\}closed\{\/gray-fg\}\s+\d+d ago\s+Bug: PR one/); assert.equal(findSelectableIndex(rows, 10), 2); assert.equal(moveSelectableIndex(rows, 2, 1), 4); }); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index dfd2897..95e81ca 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -153,10 +153,11 @@ function compareClusters(left: TuiClusterSummary, right: TuiClusterSummary, sort function formatMemberLabel(number: number, title: string, updatedAtGh: string | null, isClosed: boolean): string { const updated = formatRelativeTime(updatedAtGh); const numberLabel = `#${number}`.padEnd(MEMBER_NUMBER_WIDTH).slice(0, MEMBER_NUMBER_WIDTH); - const status = (isClosed ? 'closed' : 'open').padEnd(MEMBER_STATE_WIDTH); + const status = isClosed ? '{gray-fg}closed{/gray-fg} ' : '{green-fg}open{/green-fg} '; const age = updated.padEnd(MEMBER_UPDATED_WIDTH).slice(0, MEMBER_UPDATED_WIDTH); - const label = escapeBlessedInline(`${numberLabel}${status}${age}${normalizeMemberTitle(title)}`); - return isClosed ? `{gray-fg}${label}{/gray-fg}` : label; + const titleLabel = escapeBlessedInline(normalizeMemberTitle(title)); + const prefix = `${escapeBlessedInline(numberLabel)}${status}${escapeBlessedInline(age)}`; + return isClosed ? `{gray-fg}${prefix}${titleLabel}{/gray-fg}` : `${prefix}${titleLabel}`; } export function formatMemberListHeader(sortMode: TuiMemberSortMode = 'kind'): string { diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index c8d9570..3926f90 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3338,6 +3338,30 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb values (?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(11, 'title', 'text-embedding-3-large', 2, 'hash-title-43', '[0.95,0.05]', now, now); + service.db + .prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + values (?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1000, 10, now, 'hash-42', 'title-hash', 'body-hash', 'labels-hash', now); + service.db + .prepare( + `insert into thread_code_snapshots (id, thread_revision_id, base_sha, head_sha, files_changed, additions, deletions, patch_digest, created_at) + values (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(2000, 1000, 'base', 'head', 2, 14, 4, 'patch-digest', now); + service.db + .prepare( + `insert into thread_changed_files (snapshot_id, path, status, additions, deletions, previous_path, patch_hash) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(2000, 'apps/cli/src/tui/app.ts', 'modified', 10, 2, null, 'patch-1'); + service.db + .prepare( + `insert into thread_changed_files (snapshot_id, path, status, additions, deletions, previous_path, patch_hash) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(2000, 'README.md', 'modified', 4, 2, null, 'patch-2'); const detail = service.getTuiClusterDetail({ owner: 'openclaw', repo: 'openclaw', clusterId: 100 }); assert.equal(detail.totalCount, 2); @@ -3353,6 +3377,12 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb assert.equal(threadDetail.thread.htmlUrl, 'https://github.com/openclaw/openclaw/issues/42'); assert.equal(threadDetail.summaries.problem_summary, 'Downloads hang before completion.'); assert.equal(threadDetail.summaries.dedupe_summary, 'Transfer stalls near completion.'); + assert.deepEqual(threadDetail.topFiles[0], { + path: 'apps/cli/src/tui/app.ts', + status: 'modified', + additions: 10, + deletions: 2, + }); assert.equal(threadDetail.neighbors[0]?.number, 43); } finally { service.close(); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2611e63..3a54edb 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -360,6 +360,12 @@ export type TuiClusterDetail = { export type TuiThreadDetail = { thread: ThreadDto; summaries: Partial>; + topFiles: Array<{ + path: string; + status: string | null; + additions: number; + deletions: number; + }>; neighbors: SearchHitDto['neighbors']; }; @@ -3504,6 +3510,7 @@ export class GHCrawlService { summaries[summary.summary_kind] = summary.summary_text; } } + const topFiles = this.getTopChangedFiles(row.id, 5); let neighbors: SearchHitDto['neighbors'] = []; if (params.includeNeighbors !== false) { @@ -3526,10 +3533,35 @@ export class GHCrawlService { return { thread: threadToDto(row, clusterMembership?.cluster_id ?? null), summaries, + topFiles, neighbors, }; } + private getTopChangedFiles(threadId: number, limit: number): TuiThreadDetail['topFiles'] { + const latestRevision = this.db + .prepare( + `select id + from thread_revisions + where thread_id = ? + order by id desc + limit 1`, + ) + .get(threadId) as { id: number } | undefined; + if (!latestRevision) return []; + + return this.db + .prepare( + `select cf.path, cf.status, cf.additions, cf.deletions + from thread_code_snapshots cs + join thread_changed_files cf on cf.snapshot_id = cs.id + where cs.thread_revision_id = ? + order by (cf.additions + cf.deletions) desc, cf.path asc + limit ?`, + ) + .all(latestRevision.id, limit) as TuiThreadDetail['topFiles']; + } + async rerunAction(request: ActionRequest): Promise { switch (request.action) { case 'summarize': { From 889e8e6c3f826dcb84807ad83331852216a421e5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 16:45:16 -0700 Subject: [PATCH 108/215] docs(skill): add ghcrawl cluster operator guide --- .../skills/ghcrawl-cluster-operator/SKILL.md | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 .agents/skills/ghcrawl-cluster-operator/SKILL.md diff --git a/.agents/skills/ghcrawl-cluster-operator/SKILL.md b/.agents/skills/ghcrawl-cluster-operator/SKILL.md new file mode 100644 index 0000000..7606eb9 --- /dev/null +++ b/.agents/skills/ghcrawl-cluster-operator/SKILL.md @@ -0,0 +1,120 @@ +--- +name: ghcrawl-cluster-operator +description: "Use when inspecting a ghcrawl SQLite store, pulling GitHub issue/PR data, refreshing summaries, embeddings, and clusters, or extracting one cluster and its evidence through the ghcrawl CLI." +--- + +# ghcrawl Cluster Operator + +Use this skill when operating this repo's local-first GitHub crawler and cluster browser. + +## Ground Rules + +- Prefer read-only inspection commands first: `doctor`, `runs`, `clusters`, `cluster-explain`, `threads`. +- Treat `refresh`, `sync`, `summarize`, `key-summaries`, and `embed` as remote/API-spend commands. +- `cluster` is local-only but can be CPU-heavy on huge repos. +- Always pass `--json` for agent-readable output. +- Use `--include-code` only when file overlap matters; it hydrates PR file metadata and can increase DB size. + +## Setup Check + +```bash +ghcrawl doctor --json +ghcrawl configure --json +ghcrawl runs owner/repo --limit 10 --json +``` + +If the local store is empty or stale, pull current open GitHub data: + +```bash +ghcrawl sync owner/repo --limit 200 --json +ghcrawl sync owner/repo --include-code --limit 200 --json +``` + +For a normal end-to-end update: + +```bash +ghcrawl refresh owner/repo --json +``` + +Use code hydration when file evidence should affect clustering: + +```bash +ghcrawl refresh owner/repo --include-code --json +``` + +## LLM And Embedding Pipeline + +Default clustering can run without LLM summaries. LLM summaries and embeddings enrich the cluster graph. + +Useful configurations: + +```bash +ghcrawl configure --summary-model gpt-5.4 --embedding-basis title_original --json +ghcrawl configure --summary-model gpt-5.4 --embedding-basis llm_key_summary --json +``` + +For structured key summaries: + +```bash +ghcrawl key-summaries owner/repo --limit 200 --json +ghcrawl key-summaries owner/repo --number 12345 --json +``` + +Then refresh vectors and clusters: + +```bash +ghcrawl embed owner/repo --json +ghcrawl cluster owner/repo --json +``` + +## Pull A Cluster And Its Info + +List clusters: + +```bash +ghcrawl clusters owner/repo --min-size 2 --limit 20 --sort size --json +ghcrawl clusters owner/repo --search "cron timeout" --limit 10 --json +``` + +Explain one durable cluster: + +```bash +ghcrawl cluster-explain owner/repo --id 123 --member-limit 50 --event-limit 50 --json +``` + +Inspect current durable clusters with members: + +```bash +ghcrawl durable-clusters owner/repo --member-limit 25 --json +ghcrawl durable-clusters owner/repo --include-inactive --member-limit 25 --json +``` + +Pull specific issues/PRs from the local store: + +```bash +ghcrawl threads owner/repo --numbers 123,456,789 --json +``` + +Open the TUI: + +```bash +ghcrawl tui owner/repo +``` + +## Local Maintainer Actions + +Use these only when the operator asks for durable cluster edits: + +```bash +ghcrawl exclude-cluster-member owner/repo --id 123 --number 456 --reason "not same root cause" --json +ghcrawl include-cluster-member owner/repo --id 123 --number 456 --reason "same root cause" --json +ghcrawl set-cluster-canonical owner/repo --id 123 --number 456 --reason "clearest report" --json +ghcrawl merge-clusters owner/repo --source 123 --target 456 --reason "same issue family" --json +``` + +After edits, re-run: + +```bash +ghcrawl cluster owner/repo --json +ghcrawl cluster-explain owner/repo --id 123 --member-limit 50 --event-limit 50 --json +``` From cccbdd6db5fe63cac67ee32ce57a900632dc1958 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 16:54:38 -0700 Subject: [PATCH 109/215] fix(tui): expand copy menus and quiet preview markdown --- apps/cli/src/tui/app.test.ts | 67 +++++++++++++++- apps/cli/src/tui/app.ts | 149 ++++++++++++++++++++++++++++++----- 2 files changed, 196 insertions(+), 20 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index b30eeab..ada5f3a 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -8,11 +8,15 @@ import { buildHelpContent, escapeBlessedText, formatClusterDateColumn, + formatClusterForClipboard, formatClusterListHeader, formatClusterListLabel, + formatClusterMembersForClipboard, formatClusterShortName, formatLinkChoiceLabel, formatSummariesForClipboard, + formatThreadDetailForClipboard, + formatVisibleClustersForClipboard, getThreadReferenceLinks, limitRenderedLines, getRepositoryChoices, @@ -100,6 +104,7 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.match(rendered, /Summary \\{yellow-fg\\}text\\{\/yellow-fg\\}/); assert.match(rendered, /Neighbor \\{blue-fg\\}title\\{\/blue-fg\\}/); assert.ok(rendered.indexOf('Cluster signal:') < rendered.indexOf('{bold}Main Preview{/bold}')); + assert.ok(rendered.indexOf('{bold}LLM Summary{/bold}') < rendered.indexOf('{bold}Top files{/bold}')); }); test('renderDetailPane can compact very long bodies', () => { @@ -228,12 +233,72 @@ test('renderMarkdownForTerminal formats common markdown without exposing blessed ); assert.match(rendered, /\{bold\}Heading \\{boom\\}\{\/bold\}/); - assert.match(rendered, /- \{bold\}bold\{\/bold\} and \{yellow-fg\}code\{\/yellow-fg\}/); + assert.match(rendered, /- \{bold\}bold\{\/bold\} and code/); + assert.doesNotMatch(rendered, /yellow-fg/); assert.match(rendered, /site /); assert.match(rendered, /https:\/\/example\.com\/raw/); assert.doesNotMatch(rendered, /\x1B\]8;;/); }); +test('clipboard formatters expose cluster and thread context without blessed tags', () => { + const cluster: TuiClusterDetail = { + clusterId: 7, + displayTitle: 'alpha-bravo-charlie Fix retries', + isClosed: false, + closedAtLocal: null, + closeReasonLocal: null, + totalCount: 1, + issueCount: 1, + pullRequestCount: 0, + latestUpdatedAt: '2026-03-09T00:00:00Z', + representativeThreadId: 1, + representativeNumber: 42, + representativeKind: 'issue', + members: [ + { + id: 1, + number: 42, + kind: 'issue', + isClosed: false, + title: 'Fix retries', + updatedAtGh: '2026-03-09T00:00:00Z', + htmlUrl: 'https://example.com/42', + labels: [], + clusterScore: null, + }, + ], + }; + const detail: TuiThreadDetail = { + thread: { + id: 1, + repoId: 1, + number: 42, + kind: 'issue', + state: 'open', + isClosed: false, + closedAtGh: null, + closedAtLocal: null, + closeReasonLocal: null, + title: 'Fix retries', + body: 'Body', + authorLogin: 'dev', + htmlUrl: 'https://example.com/42', + labels: ['bug'], + updatedAtGh: '2026-03-09T00:00:00Z', + clusterId: 7, + }, + summaries: { problem_summary: 'Retries fail' }, + topFiles: [{ path: 'src/retry.ts', status: 'modified', additions: 3, deletions: 1 }], + neighbors: [], + }; + + assert.match(formatClusterForClipboard(cluster), /Name: alpha-bravo-charlie/); + assert.match(formatClusterMembersForClipboard(cluster), /Issue #42 \[open\] Fix retries/); + assert.match(formatThreadDetailForClipboard(detail, cluster), /LLM Summary:\nPurpose:\nRetries fail/); + assert.match(formatThreadDetailForClipboard(detail, cluster), /Top files:\nsrc\/retry\.ts modified \+3\/-1/); + assert.match(formatVisibleClustersForClipboard([{ ...cluster, searchText: '' }]), /C7 \[open\] 1 items alpha-bravo-charlie/); +}); + test('renderSummarySections orders and labels LLM summaries for scanning', () => { const rendered = renderSummarySections({ dedupe_summary: 'same failure mode', diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 6d6faaa..f299226 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -745,6 +745,14 @@ export async function startTui(params: StartTuiParams): Promise { render(); }; + const copyToStatus = (label: string, value: string): void => { + if (!value.trim()) { + status = `No ${label} to copy`; + return; + } + status = copyTextToClipboard(value) ? `Copied ${label}` : 'Clipboard copy failed'; + }; + const toggleDetailMode = (): void => { detailMode = detailMode === 'full' ? 'compact' : 'full'; status = `Detail mode: ${detailMode}`; @@ -917,6 +925,7 @@ export async function startTui(params: StartTuiParams): Promise { }, })), ...detailCopyContextItems(), + ...clusterCopyContextItems({ includeVisibleClusters: false }), { label: 'Close thread locally', run: () => @@ -965,28 +974,35 @@ export async function startTui(params: StartTuiParams): Promise { const detailCopyContextItems = (): ContextMenuItem[] => { if (!threadDetail) return []; + const selectedThreadDetail = threadDetail; return [ { label: detailMode === 'full' ? 'Use compact detail' : 'Use full detail', run: toggleDetailMode, }, + { + label: 'Copy all detail', + run: () => { + copyToStatus('detail', formatThreadDetailForClipboard(selectedThreadDetail, clusterDetail)); + }, + }, { label: 'Copy body', run: () => { - status = copyTextToClipboard(threadDetail?.thread.body ?? '') ? 'Copied body' : 'Clipboard copy failed'; + copyToStatus('body', selectedThreadDetail.thread.body ?? ''); }, }, { label: 'Copy summaries', run: () => { - status = copyTextToClipboard(formatSummariesForClipboard(threadDetail?.summaries ?? {})) ? 'Copied summaries' : 'Clipboard copy failed'; + copyToStatus('summaries', formatSummariesForClipboard(selectedThreadDetail.summaries)); }, }, { label: 'Copy links', run: () => { - const links = getThreadReferenceLinks(threadDetail); - status = links.length > 0 ? (copyTextToClipboard(links.join('\n')) ? 'Copied links' : 'Clipboard copy failed') : 'No referenced links found'; + const links = getThreadReferenceLinks(selectedThreadDetail); + copyToStatus('links', links.join('\n')); }, }, ]; @@ -1059,25 +1075,13 @@ export async function startTui(params: StartTuiParams): Promise { const clusterContextItems = (): ContextMenuItem[] => { const selectedCluster = clusterDetail; - const title = selectedCluster ? splitClusterDisplayTitle(selectedCluster.displayTitle) : null; return [ ...(selectedCluster ? [ { label: 'Focus members', run: () => updateFocus('members') }, - { - label: 'Copy cluster id', - run: () => { - status = copyTextToClipboard(String(selectedCluster.clusterId)) ? `Copied cluster ${selectedCluster.clusterId}` : 'Clipboard copy failed'; - }, - }, - { - label: 'Copy cluster title', - run: () => { - status = copyTextToClipboard(title?.title ?? selectedCluster.displayTitle) ? 'Copied cluster title' : 'Clipboard copy failed'; - }, - }, ] : []), + ...clusterCopyContextItems({ includeVisibleClusters: true }), ...(selectedCluster ? [ { @@ -1114,6 +1118,12 @@ export async function startTui(params: StartTuiParams): Promise { const globalContextItems = (): ContextMenuItem[] => [ { label: 'Refresh', run: () => refreshAll(true) }, { label: 'Repository browser', run: browseRepositories }, + { + label: 'Copy visible clusters', + run: () => { + copyToStatus('visible clusters', formatVisibleClustersForClipboard(snapshot?.clusters ?? [])); + }, + }, { label: 'Sort by size', run: () => setSortMode('size') }, { label: 'Sort by recent', run: () => setSortMode('recent') }, { label: 'Member sort grouped', run: () => setMemberSortMode('kind') }, @@ -1132,6 +1142,45 @@ export async function startTui(params: StartTuiParams): Promise { }, ]; + const clusterCopyContextItems = (options: { includeVisibleClusters: boolean }): ContextMenuItem[] => { + const selectedCluster = clusterDetail; + const title = selectedCluster ? splitClusterDisplayTitle(selectedCluster.displayTitle) : null; + return [ + ...(selectedCluster && title + ? [ + { + label: 'Copy cluster name', + run: () => copyToStatus('cluster name', title.name), + }, + { + label: 'Copy cluster title', + run: () => copyToStatus('cluster title', title.title), + }, + { + label: 'Copy cluster id', + run: () => copyToStatus('cluster id', String(selectedCluster.clusterId)), + }, + { + label: 'Copy cluster details', + run: () => copyToStatus('cluster details', formatClusterForClipboard(selectedCluster)), + }, + { + label: 'Copy member list', + run: () => copyToStatus('member list', formatClusterMembersForClipboard(selectedCluster)), + }, + ] + : []), + ...(options.includeVisibleClusters + ? [ + { + label: 'Copy visible clusters', + run: () => copyToStatus('visible clusters', formatVisibleClustersForClipboard(snapshot?.clusters ?? [])), + }, + ] + : []), + ]; + }; + const openHelp = (): void => { if (modalOpen) return; void (async () => { @@ -1711,11 +1760,12 @@ export function renderDetailPane( `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', + summaries ? `{bold}LLM Summary{/bold}\n${summaries}` : '', + summaries ? '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}' : '', `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, `{bold}Labels:{/bold} ${labels}`, `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, topFiles ? `\n{bold}Top files{/bold}\n${topFiles}` : '', - summaries ? `\n{bold}LLM Summary{/bold}\n${summaries}` : '', '', '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', `{bold}Main Preview{/bold}`, @@ -1860,6 +1910,67 @@ export function formatSummariesForClipboard(summaries: TuiThreadDetail['summarie }).join('\n\n'); } +export function formatThreadDetailForClipboard(threadDetail: TuiThreadDetail, clusterDetail: TuiClusterDetail | null): string { + const thread = threadDetail.thread; + const clusterTitle = clusterDetail ? splitClusterDisplayTitle(clusterDetail.displayTitle) : null; + const sections = [ + `${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}: ${thread.title}`, + clusterDetail && clusterTitle ? `Cluster ${clusterDetail.clusterId}: ${clusterTitle.name} | ${clusterTitle.title}` : '', + `State: ${thread.isClosed ? 'closed' : 'open'}`, + `Updated: ${thread.updatedAtGh ?? 'unknown'}`, + `Author: ${thread.authorLogin ?? 'unknown'}`, + `Labels: ${thread.labels.join(', ') || 'none'}`, + `URL: ${thread.htmlUrl}`, + formatSummariesForClipboard(threadDetail.summaries) ? `LLM Summary:\n${formatSummariesForClipboard(threadDetail.summaries)}` : '', + threadDetail.topFiles.length > 0 ? `Top files:\n${formatTopFilesForClipboard(threadDetail.topFiles)}` : '', + `Body:\n${thread.body ?? ''}`, + getThreadReferenceLinks(threadDetail).length > 0 ? `Links:\n${getThreadReferenceLinks(threadDetail).join('\n')}` : '', + ]; + return sections.filter((section) => section.trim()).join('\n\n'); +} + +export function formatClusterForClipboard(cluster: TuiClusterDetail): string { + const title = splitClusterDisplayTitle(cluster.displayTitle); + return [ + `Cluster ${cluster.clusterId}`, + `Name: ${title.name}`, + `Title: ${title.title}`, + `State: ${cluster.isClosed ? 'closed' : 'open'}`, + `Members: ${cluster.totalCount} (${cluster.issueCount} issues, ${cluster.pullRequestCount} PRs)`, + `Updated: ${cluster.latestUpdatedAt ?? 'unknown'}`, + cluster.representativeNumber !== null ? `Representative: #${cluster.representativeNumber} ${cluster.representativeKind ?? ''}`.trimEnd() : '', + ] + .filter(Boolean) + .join('\n'); +} + +export function formatClusterMembersForClipboard(cluster: TuiClusterDetail): string { + return cluster.members + .map((member) => { + const state = member.isClosed ? 'closed' : 'open'; + const kind = member.kind === 'pull_request' ? 'PR' : 'Issue'; + return `${kind} #${member.number} [${state}] ${member.title} ${member.htmlUrl}`; + }) + .join('\n'); +} + +export function formatVisibleClustersForClipboard(clusters: TuiClusterSummary[]): string { + return clusters + .map((cluster) => { + const title = splitClusterDisplayTitle(cluster.displayTitle); + const state = cluster.isClosed ? 'closed' : 'open'; + return `C${cluster.clusterId} [${state}] ${cluster.totalCount} items ${title.name} | ${title.title}`; + }) + .join('\n'); +} + +function formatTopFilesForClipboard(files: TuiThreadDetail['topFiles']): string { + return files + .slice(0, 5) + .map((file) => `${file.path} ${file.status ? `${file.status} ` : ''}+${file.additions}/-${file.deletions}`) + .join('\n'); +} + type InlineMarkdownSegment = | { kind: 'text'; value: string } | { kind: 'link'; label: string; url: string }; @@ -1904,7 +2015,7 @@ function pushBareLinkSegments(value: string, segments: InlineMarkdownSegment[]): function renderInlineText(value: string): string { return escapeBlessedText(value) - .replace(/`([^`]+)`/g, '{yellow-fg}$1{/yellow-fg}') + .replace(/`([^`]+)`/g, '$1') .replace(/\*\*([^*]+)\*\*/g, '{bold}$1{/bold}'); } From 89e645f45f489bdc7856435c7611a3f7e5f092c8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 17:06:02 -0700 Subject: [PATCH 110/215] fix(tui): surface structured key summaries --- apps/cli/src/tui/app.test.ts | 18 ++++++++++ apps/cli/src/tui/app.ts | 17 ++++++++-- packages/api-core/src/service.test.ts | 19 +++++++++++ packages/api-core/src/service.ts | 49 +++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index ada5f3a..b6d4d9a 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -72,6 +72,12 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e summaries: { dedupe_summary: 'Summary {yellow-fg}text{/yellow-fg}', }, + keySummary: { + summaryKind: 'llm_key_3line', + promptVersion: 'v1', + model: 'gpt-5-mini', + text: 'intent: Escape preview text\nsurface: TUI detail pane\nmechanism: Render existing summary data', + }, topFiles: [ { path: 'apps/cli/src/tui/app.ts', @@ -95,6 +101,8 @@ test('renderDetailPane escapes user-provided text before rendering into a tags-e assert.match(rendered, /C1 \(#42 representative issue\)/); assert.match(rendered, /Bad \\{bold\\}title\\{\/bold\\}/); assert.match(rendered, /Cluster signal:/); + assert.match(rendered, /Key summary/); + assert.match(rendered, /intent: Escape preview text/); assert.match(rendered, /Top files/); assert.match(rendered, /apps\/cli\/src\/tui\/app\.ts/); assert.match(rendered, /Main Preview/); @@ -143,6 +151,7 @@ test('renderDetailPane can compact very long bodies', () => { clusterId: 1, }, summaries: {}, + keySummary: null, topFiles: [], neighbors: [], }; @@ -288,6 +297,12 @@ test('clipboard formatters expose cluster and thread context without blessed tag clusterId: 7, }, summaries: { problem_summary: 'Retries fail' }, + keySummary: { + summaryKind: 'llm_key_3line', + promptVersion: 'v1', + model: 'gpt-5-mini', + text: 'intent: Fix retries\nsurface: retry worker\nmechanism: update retry path', + }, topFiles: [{ path: 'src/retry.ts', status: 'modified', additions: 3, deletions: 1 }], neighbors: [], }; @@ -295,6 +310,7 @@ test('clipboard formatters expose cluster and thread context without blessed tag assert.match(formatClusterForClipboard(cluster), /Name: alpha-bravo-charlie/); assert.match(formatClusterMembersForClipboard(cluster), /Issue #42 \[open\] Fix retries/); assert.match(formatThreadDetailForClipboard(detail, cluster), /LLM Summary:\nPurpose:\nRetries fail/); + assert.match(formatThreadDetailForClipboard(detail, cluster), /Key summary \(gpt-5-mini\):\nintent: Fix retries/); assert.match(formatThreadDetailForClipboard(detail, cluster), /Top files:\nsrc\/retry\.ts modified \+3\/-1/); assert.match(formatVisibleClustersForClipboard([{ ...cluster, searchText: '' }]), /C7 \[open\] 1 items alpha-bravo-charlie/); }); @@ -349,6 +365,7 @@ test('buildThreadContextMenuItems exposes thread actions for right-click menus', clusterId: 1, }, summaries: {}, + keySummary: null, topFiles: [], neighbors: [], }); @@ -397,6 +414,7 @@ test('getThreadReferenceLinks extracts unique body and summary links', () => { summaries: { dedupe_summary: 'same as https://example.com/raw and https://example.com/summary', }, + keySummary: null, topFiles: [], neighbors: [], }); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index f299226..cae8903 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1742,7 +1742,7 @@ export function renderDetailPane( const closedLabel = thread.isClosed ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() : '{bold}Closed:{/bold} no'; - const summaries = renderSummarySections(threadDetail.summaries); + const summaryBlock = renderThreadSummaryBlock(threadDetail); const topFiles = renderTopFiles(threadDetail.topFiles); const neighbors = threadDetail.neighbors.length > 0 @@ -1760,8 +1760,8 @@ export function renderDetailPane( `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', - summaries ? `{bold}LLM Summary{/bold}\n${summaries}` : '', - summaries ? '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}' : '', + summaryBlock ? `{bold}LLM Summary{/bold}\n${summaryBlock}` : '', + summaryBlock ? '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}' : '', `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, `{bold}Labels:{/bold} ${labels}`, `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, @@ -1883,6 +1883,16 @@ export function renderSummarySections(summaries: TuiThreadDetail['summaries']): }).join('\n\n'); } +export function renderThreadSummaryBlock(threadDetail: TuiThreadDetail): string { + const sections = [ + threadDetail.keySummary + ? `{bold}Key summary{/bold} {gray-fg}${escapeBlessedText(threadDetail.keySummary.model)}{/gray-fg}\n${renderMarkdownForTerminal(threadDetail.keySummary.text)}` + : '', + renderSummarySections(threadDetail.summaries), + ]; + return sections.filter((section) => section.trim()).join('\n\n'); +} + export function renderTopFiles(files: TuiThreadDetail['topFiles']): string { if (files.length === 0) return ''; return files @@ -1921,6 +1931,7 @@ export function formatThreadDetailForClipboard(threadDetail: TuiThreadDetail, cl `Author: ${thread.authorLogin ?? 'unknown'}`, `Labels: ${thread.labels.join(', ') || 'none'}`, `URL: ${thread.htmlUrl}`, + threadDetail.keySummary ? `Key summary (${threadDetail.keySummary.model}):\n${threadDetail.keySummary.text}` : '', formatSummariesForClipboard(threadDetail.summaries) ? `LLM Summary:\n${formatSummariesForClipboard(threadDetail.summaries)}` : '', threadDetail.topFiles.length > 0 ? `Top files:\n${formatTopFilesForClipboard(threadDetail.topFiles)}` : '', `Body:\n${thread.body ?? ''}`, diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 3926f90..ded615a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3362,6 +3362,23 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb values (?, ?, ?, ?, ?, ?, ?)`, ) .run(2000, 'README.md', 'modified', 4, 2, null, 'patch-2'); + service.db + .prepare( + `insert into thread_key_summaries ( + thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 1000, + 'llm_key_3line', + 'v1', + 'openai', + 'gpt-5-mini', + 'input-hash', + 'output-hash', + 'intent: Fix downloader hangs\nsurface: download progress\nmechanism: align timeout handling', + now, + ); const detail = service.getTuiClusterDetail({ owner: 'openclaw', repo: 'openclaw', clusterId: 100 }); assert.equal(detail.totalCount, 2); @@ -3377,6 +3394,8 @@ test('tui cluster detail and thread detail expose members, summaries, and neighb assert.equal(threadDetail.thread.htmlUrl, 'https://github.com/openclaw/openclaw/issues/42'); assert.equal(threadDetail.summaries.problem_summary, 'Downloads hang before completion.'); assert.equal(threadDetail.summaries.dedupe_summary, 'Transfer stalls near completion.'); + assert.equal(threadDetail.keySummary?.text, 'intent: Fix downloader hangs\nsurface: download progress\nmechanism: align timeout handling'); + assert.equal(threadDetail.keySummary?.model, 'gpt-5-mini'); assert.deepEqual(threadDetail.topFiles[0], { path: 'apps/cli/src/tui/app.ts', status: 'modified', diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 3a54edb..5ac5dda 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -360,6 +360,12 @@ export type TuiClusterDetail = { export type TuiThreadDetail = { thread: ThreadDto; summaries: Partial>; + keySummary: { + summaryKind: string; + promptVersion: string; + model: string; + text: string; + } | null; topFiles: Array<{ path: string; status: string | null; @@ -604,6 +610,14 @@ function normalizeSummaryText(value: string): string { return value.replace(/\r/g, '\n').replace(/\s+/g, ' ').trim(); } +function normalizeKeySummaryDisplayText(value: string): string { + return value + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean) + .join('\n'); +} + function snippetText(value: string | null | undefined, maxChars: number): string | null { if (!value) return null; const normalized = value.replace(/\s+/g, ' ').trim(); @@ -3511,6 +3525,7 @@ export class GHCrawlService { } } const topFiles = this.getTopChangedFiles(row.id, 5); + const keySummary = this.getLatestKeySummary(row.id); let neighbors: SearchHitDto['neighbors'] = []; if (params.includeNeighbors !== false) { @@ -3533,11 +3548,45 @@ export class GHCrawlService { return { thread: threadToDto(row, clusterMembership?.cluster_id ?? null), summaries, + keySummary, topFiles, neighbors, }; } + private getLatestKeySummary(threadId: number): TuiThreadDetail['keySummary'] { + const row = this.db + .prepare( + `select ks.summary_kind, ks.prompt_version, ks.model, ks.key_text + from thread_key_summaries ks + join thread_revisions tr on tr.id = ks.thread_revision_id + where tr.thread_id = ? + and ks.summary_kind = 'llm_key_3line' + order by + case when ks.model = ? then 0 else 1 end, + tr.id desc, + ks.created_at desc + limit 1`, + ) + .get(threadId, this.config.summaryModel) as + | { + summary_kind: string; + prompt_version: string; + model: string; + key_text: string; + } + | undefined; + if (!row) return null; + const text = normalizeKeySummaryDisplayText(row.key_text); + if (!text) return null; + return { + summaryKind: row.summary_kind, + promptVersion: row.prompt_version, + model: row.model, + text, + }; + } + private getTopChangedFiles(threadId: number, limit: number): TuiThreadDetail['topFiles'] { const latestRevision = this.db .prepare( From 247c777250d68612046ec98eee14326356d600aa Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 17:14:47 -0700 Subject: [PATCH 111/215] fix(tui): default cluster filter to five members --- apps/cli/src/tui/app.ts | 8 +++++--- apps/cli/src/tui/state.test.ts | 5 +++-- apps/cli/src/tui/state.ts | 4 ++-- packages/api-core/src/config.test.ts | 2 +- packages/api-core/src/config.ts | 6 +++--- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index cae8903..ccd4f2a 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -141,7 +141,7 @@ export async function startTui(params: StartTuiParams): Promise { : { sortMode: 'size' as TuiClusterSortMode, memberSortMode: 'kind' as TuiMemberSortMode, - minClusterSize: 1 as TuiMinSizeFilter, + minClusterSize: 5 as TuiMinSizeFilter, wideLayout: 'columns' as TuiWideLayoutPreference, }; let sortMode: TuiClusterSortMode = initialPreference.sortMode; @@ -1106,6 +1106,7 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Member sort number', run: () => setMemberSortMode('number') }, { label: 'Member sort state', run: () => setMemberSortMode('state') }, { label: 'Min size 1+', run: () => setMinSize(1) }, + { label: 'Min size 5+', run: () => setMinSize(5) }, { label: 'Min size 10+', run: () => setMinSize(10) }, { label: 'Min size all', run: () => setMinSize(0) }, { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, @@ -1129,6 +1130,7 @@ export async function startTui(params: StartTuiParams): Promise { { label: 'Member sort grouped', run: () => setMemberSortMode('kind') }, { label: 'Member sort recent', run: () => setMemberSortMode('recent') }, { label: 'Min size 1+', run: () => setMinSize(1) }, + { label: 'Min size 5+', run: () => setMinSize(5) }, { label: 'Min size 10+', run: () => setMinSize(10) }, { label: 'Min size all', run: () => setMinSize(0) }, { label: showClosed ? 'Hide closed' : 'Show closed', run: () => toggleClosedVisibility() }, @@ -1306,7 +1308,7 @@ export async function startTui(params: StartTuiParams): Promise { return; } setRepositoryPending(target, { - minClusterSize: 1, + minClusterSize: 5, status: `No local data for ${target.owner}/${target.repo}; run sync/embed/cluster in the CLI, then press r`, }); pushActivity(`[repo] selected ${target.owner}/${target.repo}; run ghcrawl sync/embed/cluster from the shell`); @@ -1346,7 +1348,7 @@ export async function startTui(params: StartTuiParams): Promise { return false; } setRepositoryPending(target, { - minClusterSize: 1, + minClusterSize: 5, status: `No local data for ${target.owner}/${target.repo}; run sync/embed/cluster in the CLI, then press r`, }); pushActivity(`[repo] selected ${target.owner}/${target.repo}; run ghcrawl sync/embed/cluster from the shell`); diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 83d0ad1..49df505 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -23,12 +23,13 @@ test('cycleSortMode toggles size and recent', () => { }); test('cycleMinSizeFilter rotates through presets', () => { - assert.equal(cycleMinSizeFilter(1), 2); - assert.equal(cycleMinSizeFilter(2), 10); + assert.equal(cycleMinSizeFilter(5), 10); assert.equal(cycleMinSizeFilter(10), 20); assert.equal(cycleMinSizeFilter(20), 50); assert.equal(cycleMinSizeFilter(50), 0); assert.equal(cycleMinSizeFilter(0), 1); + assert.equal(cycleMinSizeFilter(1), 2); + assert.equal(cycleMinSizeFilter(2), 5); }); test('cycleMemberSortMode rotates through member sort modes', () => { diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index 95e81ca..a698fa0 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -1,7 +1,7 @@ import type { TuiClusterDetail, TuiClusterSortMode, TuiClusterSummary, TuiMemberSortPreference } from '@ghcrawl/api-core'; export type TuiFocusPane = 'clusters' | 'members' | 'detail'; -export type TuiMinSizeFilter = 0 | 1 | 2 | 10 | 20 | 50; +export type TuiMinSizeFilter = 0 | 1 | 2 | 5 | 10 | 20 | 50; export type TuiMemberSortMode = TuiMemberSortPreference; export type MemberListRow = @@ -10,7 +10,7 @@ export type MemberListRow = export const SORT_MODE_ORDER: TuiClusterSortMode[] = ['size', 'recent']; export const MEMBER_SORT_MODE_ORDER: TuiMemberSortMode[] = ['kind', 'recent', 'number', 'state', 'title']; -export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [1, 2, 10, 20, 50, 0]; +export const MIN_SIZE_FILTER_ORDER: TuiMinSizeFilter[] = [5, 10, 20, 50, 0, 1, 2]; export const FOCUS_PANE_ORDER: TuiFocusPane[] = ['clusters', 'members', 'detail']; const MEMBER_NUMBER_WIDTH = 8; diff --git a/packages/api-core/src/config.test.ts b/packages/api-core/src/config.test.ts index 7581991..cd02bc5 100644 --- a/packages/api-core/src/config.test.ts +++ b/packages/api-core/src/config.test.ts @@ -279,7 +279,7 @@ test('writeTuiRepositoryPreference persists sort and min cluster size by reposit wideLayout: 'right-stack', }); assert.deepEqual(getTuiRepositoryPreference(reloaded, 'other', 'repo'), { - minClusterSize: 1, + minClusterSize: 5, sortMode: 'size', memberSortMode: 'kind', wideLayout: 'columns', diff --git a/packages/api-core/src/config.ts b/packages/api-core/src/config.ts index 4ab3c91..665e42d 100644 --- a/packages/api-core/src/config.ts +++ b/packages/api-core/src/config.ts @@ -7,7 +7,7 @@ import dotenv from 'dotenv'; export type ConfigValueSource = 'env' | 'config' | 'dotenv' | 'default' | 'none'; export type TuiSortPreference = 'recent' | 'size'; export type TuiMemberSortPreference = 'kind' | 'recent' | 'number' | 'state' | 'title'; -export type TuiMinClusterSize = 0 | 1 | 2 | 10 | 20 | 50; +export type TuiMinClusterSize = 0 | 1 | 2 | 5 | 10 | 20 | 50; export type TuiWideLayoutPreference = 'columns' | 'right-stack'; export type EmbeddingBasis = 'title_original' | 'title_summary' | 'llm_key_summary'; export type VectorBackend = 'vectorlite'; @@ -168,7 +168,7 @@ function getTuiMemberSortPreference(value: unknown): TuiMemberSortPreference | u } function getTuiMinClusterSize(value: unknown): TuiMinClusterSize | undefined { - return value === 0 || value === 1 || value === 2 || value === 10 || value === 20 || value === 50 ? value : undefined; + return value === 0 || value === 1 || value === 2 || value === 5 || value === 10 || value === 20 || value === 50 ? value : undefined; } function getTuiWideLayoutPreference(value: unknown): TuiWideLayoutPreference | undefined { @@ -403,7 +403,7 @@ export function ensureRuntimeDirs(config: GitcrawlConfig): void { } export function getTuiRepositoryPreference(config: GitcrawlConfig, owner: string, repo: string): TuiRepositoryPreference { - return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 1, sortMode: 'size', memberSortMode: 'kind', wideLayout: 'columns' }; + return config.tuiPreferences[`${owner}/${repo}`] ?? { minClusterSize: 5, sortMode: 'size', memberSortMode: 'kind', wideLayout: 'columns' }; } export function writeTuiRepositoryPreference( From 7379647c4db8e53d1f5835ccc6878c5e4f888bc1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 22:46:03 -0700 Subject: [PATCH 112/215] fix: support gpt-5.4 key summaries --- packages/api-core/src/openai/provider.ts | 4 ++-- packages/api-core/src/service.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/api-core/src/openai/provider.ts b/packages/api-core/src/openai/provider.ts index daf9904..fab11ac 100644 --- a/packages/api-core/src/openai/provider.ts +++ b/packages/api-core/src/openai/provider.ts @@ -89,7 +89,7 @@ export class OpenAiProvider implements AiProvider { verbosity: 'low', }, reasoning: { - effort: 'minimal', + effort: 'low', }, max_output_tokens: maxOutputTokens, }); @@ -151,7 +151,7 @@ export class OpenAiProvider implements AiProvider { verbosity: 'low', }, reasoning: { - effort: 'minimal', + effort: 'low', }, max_output_tokens: maxOutputTokens, }); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 5ac5dda..651e49f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1765,7 +1765,7 @@ export class GHCrawlService { sql += ' and number = ?'; args.push(params.threadNumber); } - sql += ' order by number asc'; + sql += ' order by datetime(coalesce(updated_at_gh, updated_at)) desc, number desc'; if (params.limit) { sql += ' limit ?'; args.push(params.limit); From b93f11b0cb2b18432d471c2520f00b0a3db9b5a0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 23:39:45 -0700 Subject: [PATCH 113/215] fix: allow incremental embedding with existing vectors --- packages/api-core/src/service.ts | 43 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 651e49f..f730938 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -5645,26 +5645,6 @@ export class GHCrawlService { title: string; body: string | null; }>; - const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber); - const keySummaryTexts = this.loadKeySummaryTextMap(repoId, threadNumber); - const missingSummaryThreadNumbers: number[] = []; - const tasks = rows.flatMap((row) => { - const task = this.buildActiveVectorTask({ - threadId: row.id, - threadNumber: row.number, - title: row.title, - body: row.body, - dedupeSummary: summaryTexts.get(row.id) ?? null, - keySummary: keySummaryTexts.get(row.id) ?? null, - }); - if (task) { - return [task]; - } - if (this.config.embeddingBasis === 'title_summary' || this.config.embeddingBasis === 'llm_key_summary') { - missingSummaryThreadNumbers.push(row.number); - } - return []; - }); const pipelineCurrent = this.isRepoVectorStateCurrent(repoId); const existingRows = this.db .prepare( @@ -5684,6 +5664,29 @@ export class GHCrawlService { for (const row of existingRows) { existing.set(String(row.thread_id), row.content_hash); } + const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber); + const keySummaryTexts = this.loadKeySummaryTextMap(repoId, threadNumber); + const missingSummaryThreadNumbers: number[] = []; + const tasks = rows.flatMap((row) => { + const task = this.buildActiveVectorTask({ + threadId: row.id, + threadNumber: row.number, + title: row.title, + body: row.body, + dedupeSummary: summaryTexts.get(row.id) ?? null, + keySummary: keySummaryTexts.get(row.id) ?? null, + }); + if (task) { + return [task]; + } + if ( + (this.config.embeddingBasis === 'title_summary' || this.config.embeddingBasis === 'llm_key_summary') && + (!pipelineCurrent || !existing.has(String(row.id))) + ) { + missingSummaryThreadNumbers.push(row.number); + } + return []; + }); const pending = pipelineCurrent ? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash) : tasks; From 92d6ee81447834a2fc46a9c6ef7e2360d0c55d64 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Fri, 24 Apr 2026 23:57:24 -0700 Subject: [PATCH 114/215] feat: show closed clusters by default --- README.md | 9 +- apps/cli/src/main.test.ts | 37 ++ apps/cli/src/main.ts | 13 +- packages/api-contract/src/client.ts | 6 +- packages/api-core/src/api/server.ts | 6 +- .../api-core/src/cluster/persistent-store.ts | 5 +- packages/api-core/src/service.test.ts | 88 +++- packages/api-core/src/service.ts | 386 +++++++++++++++--- 8 files changed, 476 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 2b5b090..686d9e3 100644 --- a/README.md +++ b/README.md @@ -207,17 +207,18 @@ ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --include-closed --json +ghcrawl clusters owner/repo --min-size 10 --limit 20 --json +ghcrawl clusters owner/repo --min-size 10 --hide-closed --json ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json -ghcrawl cluster-detail owner/repo --id 123 --include-closed --json +ghcrawl cluster-detail owner/repo --id 123 --hide-closed --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl search owner/repo --query "download stalls" --json ``` Use `threads --numbers ...` when you want several specific issue or PR records in one CLI call instead of paying process startup overhead repeatedly. -By default, JSON list commands filter out locally closed issues/PRs and completely closed clusters. Use `--include-closed` when you need to inspect those records too. +By default, cluster JSON commands show locally closed clusters. Use `--hide-closed` when you only want active clusters. Thread list commands still hide locally closed issues/PRs unless `--include-closed` is passed. Use `close-thread` when you know a local issue/PR should be treated as closed before the next GitHub sync catches up. If that was the last open item in its cluster, `ghcrawl` automatically marks the cluster closed too. @@ -278,7 +279,7 @@ The skill is built around the stable JSON CLI surface and is intentionally conse - default mode assumes no valid API keys and stays read-only - API-backed operations only need the relevant bare token in env, `.env.local`, or config JSON - even then, `refresh`, `sync`, `embed`, and `cluster` should only run when the user explicitly asks for them -- JSON list commands hide locally closed issues/PRs and closed clusters by default unless `--include-closed` is passed +- cluster JSON commands show closed clusters by default; use `--hide-closed` for active-only cluster views ```bash ghcrawl doctor --json diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 2462940..674a2b6 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -607,6 +607,36 @@ test('cluster command forwards neighborhood refresh inputs', async () => { assert.match(stdout.read(), /"edges": 3/); }); +test('clusters command shows closed clusters by default and forwards hide-closed', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.listClusterSummaries; + const received: unknown[] = []; + + GHCrawlService.prototype.listClusterSummaries = function listClusterSummariesStub(params: unknown) { + received.push(params); + return { repository: { fullName: 'openclaw/openclaw' }, stats: {}, clusters: [] } as never; + }; + + try { + await run(['clusters', 'openclaw/openclaw', '--min-size', '5'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + await run(['clusters', 'openclaw/openclaw', '--hide-closed'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.listClusterSummaries = original; + context.cleanup(); + } + + assert.equal((received[0] as { includeClosed?: boolean }).includeClosed, true); + assert.equal((received[0] as { minSize?: number }).minSize, 5); + assert.equal((received[1] as { includeClosed?: boolean }).includeClosed, false); +}); + test('durable-clusters command forwards stable cluster list options', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); @@ -839,6 +869,13 @@ test('parseRepoFlags accepts include-closed boolean flag', () => { assert.equal(parsed.values['include-closed'], true); }); +test('parseRepoFlags accepts hide-closed cluster flag', () => { + const parsed = parseRepoFlags('clusters', ['openclaw/openclaw', '--hide-closed']); + assert.equal(parsed.owner, 'openclaw'); + assert.equal(parsed.repo, 'openclaw'); + assert.equal(parsed.values['hide-closed'], true); +}); + test('parseRepoFlags accepts include-inactive durable cluster flag', () => { const parsed = parseRepoFlags('durable-clusters', ['openclaw/openclaw', '--include-inactive']); assert.equal(parsed.owner, 'openclaw'); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 0ed5e6c..5c4e0f7 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -296,14 +296,14 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'clusters', - synopsis: 'clusters [--min-size ] [--limit ] [--sort recent|size] [--search ] [--include-closed] [--json]', + synopsis: 'clusters [--min-size ] [--limit ] [--sort recent|size] [--search ] [--hide-closed] [--json]', description: 'List local cluster summaries for one repository.', options: [ '--min-size Minimum cluster size to return', '--limit Maximum number of clusters to return', '--sort recent|size Sort by recency or cluster size', '--search Filter clusters by text', - '--include-closed Include locally closed clusters', + '--hide-closed Hide locally closed clusters', '--json Emit machine-readable JSON output explicitly', ], examples: ['ghcrawl clusters openclaw/openclaw --min-size 10 --limit 20 --sort recent --json'], @@ -311,13 +311,13 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'cluster-detail', - synopsis: 'cluster-detail --id [--member-limit ] [--body-chars ] [--include-closed] [--json]', + synopsis: 'cluster-detail --id [--member-limit ] [--body-chars ] [--hide-closed] [--json]', description: 'Dump one local cluster and its members.', options: [ '--id Cluster id to inspect', '--member-limit Limit member rows in the response', '--body-chars Limit body snippet size', - '--include-closed Include locally closed clusters', + '--hide-closed Hide locally closed clusters', '--json Emit machine-readable JSON output explicitly', ], examples: ['ghcrawl cluster-detail openclaw/openclaw --id 123 --member-limit 20 --body-chars 280 --json'], @@ -565,6 +565,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo 'include-code': { type: 'boolean' }, 'full-reconcile': { type: 'boolean' }, 'include-closed': { type: 'boolean' }, + 'hide-closed': { type: 'boolean' }, 'include-inactive': { type: 'boolean' }, kind: { type: 'string' }, number: { type: 'string' }, @@ -1297,7 +1298,7 @@ export async function run( limit: typeof values.limit === 'string' ? parsePositiveInteger('limit', values.limit, 'clusters') : undefined, sort, search: typeof values.search === 'string' ? values.search : undefined, - includeClosed: values['include-closed'] === true, + includeClosed: values['hide-closed'] === true ? false : true, }); writeJson(stdout, result); return; @@ -1333,7 +1334,7 @@ export async function run( typeof values['body-chars'] === 'string' ? parsePositiveInteger('body-chars', values['body-chars'], 'cluster-detail') : undefined, - includeClosed: values['include-closed'] === true, + includeClosed: values['hide-closed'] === true ? false : true, }); writeJson(stdout, result); return; diff --git a/packages/api-contract/src/client.ts b/packages/api-contract/src/client.ts index 64bed6d..e0440b2 100644 --- a/packages/api-contract/src/client.ts +++ b/packages/api-contract/src/client.ts @@ -119,7 +119,7 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }, async listClusters(params) { const search = new URLSearchParams({ owner: params.owner, repo: params.repo }); - if (params.includeClosed) search.set('includeClosed', 'true'); + if (params.includeClosed !== undefined) search.set('includeClosed', String(params.includeClosed)); const res = await fetchImpl(`${normalized}/clusters?${search.toString()}`); return readJson(res, clustersResponseSchema); }, @@ -129,7 +129,7 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet if (params.limit !== undefined) search.set('limit', String(params.limit)); if (params.sort) search.set('sort', params.sort); if (params.search) search.set('search', params.search); - if (params.includeClosed) search.set('includeClosed', 'true'); + if (params.includeClosed !== undefined) search.set('includeClosed', String(params.includeClosed)); const res = await fetchImpl(`${normalized}/cluster-summaries?${search.toString()}`); return readJson(res, clusterSummariesResponseSchema); }, @@ -141,7 +141,7 @@ export function createGitcrawlClient(baseUrl: string, fetchImpl: FetchLike = fet }); if (params.memberLimit !== undefined) search.set('memberLimit', String(params.memberLimit)); if (params.bodyChars !== undefined) search.set('bodyChars', String(params.bodyChars)); - if (params.includeClosed) search.set('includeClosed', 'true'); + if (params.includeClosed !== undefined) search.set('includeClosed', String(params.includeClosed)); const res = await fetchImpl(`${normalized}/cluster-detail?${search.toString()}`); return readJson(res, clusterDetailResponseSchema); }, diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index 0f82d06..ef88d8a 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -126,7 +126,7 @@ export function createApiServer(service: GHCrawlService): http.Server { if (req.method === 'GET' && url.pathname === '/clusters') { const params = parseRepoParams(url); - const includeClosed = url.searchParams.get('includeClosed') === 'true'; + const includeClosed = url.searchParams.get('includeClosed') !== 'false'; sendJson(res, 200, service.listClusters({ ...params, includeClosed })); return; } @@ -154,7 +154,7 @@ export function createApiServer(service: GHCrawlService): http.Server { const minSizeValue = url.searchParams.get('minSize'); const limitValue = url.searchParams.get('limit'); const search = url.searchParams.get('search') ?? undefined; - const includeClosed = url.searchParams.get('includeClosed') === 'true'; + const includeClosed = url.searchParams.get('includeClosed') !== 'false'; sendJson( res, 200, @@ -184,7 +184,7 @@ export function createApiServer(service: GHCrawlService): http.Server { } const memberLimitValue = url.searchParams.get('memberLimit'); const bodyCharsValue = url.searchParams.get('bodyChars'); - const includeClosed = url.searchParams.get('includeClosed') === 'true'; + const includeClosed = url.searchParams.get('includeClosed') !== 'false'; sendJson( res, 200, diff --git a/packages/api-core/src/cluster/persistent-store.ts b/packages/api-core/src/cluster/persistent-store.ts index 0144dd3..964c478 100644 --- a/packages/api-core/src/cluster/persistent-store.ts +++ b/packages/api-core/src/cluster/persistent-store.ts @@ -397,7 +397,10 @@ export function upsertClusterGroup( ) values (?, ?, ?, ?, ?, ?, ?, ?, ?) on conflict(repo_id, stable_key) do update set stable_slug = excluded.stable_slug, - status = excluded.status, + status = case + when cluster_groups.status <> 'active' and excluded.status = 'active' then cluster_groups.status + else excluded.status + end, cluster_type = excluded.cluster_type, representative_thread_id = excluded.representative_thread_id, title = excluded.title, diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index ded615a..fb614f7 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3872,18 +3872,17 @@ test('local thread closure updates default thread filters and auto-closes fully assert.equal(secondClose.ok, true); assert.equal(secondClose.clusterClosed, true); - const summaries = service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); - assert.equal(summaries.clusters.length, 0); + const hiddenSummaries = service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosed: false }); + assert.equal(hiddenSummaries.clusters.length, 0); - const summariesWithClosed = service.listClusterSummaries({ + const summaries = service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0, - includeClosed: true, }); - assert.equal(summariesWithClosed.clusters.length, 1); - assert.equal(summariesWithClosed.clusters[0]?.isClosed, true); - assert.equal(summariesWithClosed.clusters[0]?.closeReasonLocal, 'all_members_closed'); + assert.equal(summaries.clusters.length, 1); + assert.equal(summaries.clusters[0]?.isClosed, true); + assert.equal(summaries.clusters[0]?.closeReasonLocal, 'all_members_closed'); const snapshot = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); assert.equal(snapshot.clusters.length, 1); @@ -3893,7 +3892,7 @@ test('local thread closure updates default thread filters and auto-closes fully } }); -test('manual cluster closure is hidden from JSON summaries by default but remains visible in the tui snapshot', () => { +test('manual cluster closure is shown by default and can be hidden from JSON summaries', () => { const service = makeTestService({ getRepo: async () => ({}), listRepositoryIssues: async () => [], @@ -3939,7 +3938,8 @@ test('manual cluster closure is hidden from JSON summaries by default but remain assert.equal(response.ok, true); assert.equal(response.clusterClosed, true); - assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }).clusters.length, 0); + assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosed: false }).clusters.length, 0); + assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }).clusters.length, 1); const detail = service.getClusterDetailDump({ owner: 'openclaw', repo: 'openclaw', @@ -3958,6 +3958,76 @@ test('manual cluster closure is hidden from JSON summaries by default but remain } }); +test('tui snapshot includes durable closed clusters missing from the latest run', () => { + const service = makeTestService({ + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + const now = '2026-03-10T12:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 1, '100', 42, 'issue', 'closed', 'Closed durable issue', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, now, null, now, now, now); + service.db + .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) + .run(1, 1, 'openclaw/openclaw', 'completed', now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, 'stable-key', 'trace-alpha-river', 'closed', 'duplicate_candidate', 10, 'Closed durable cluster', now, now, now); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 10, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + + const hidden = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosedClusters: false }); + assert.equal(hidden.clusters.length, 0); + + const snapshot = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); + assert.equal(snapshot.clusters.length, 1); + assert.equal(snapshot.clusters[0]?.clusterId, 7); + assert.equal(snapshot.clusters[0]?.isClosed, true); + assert.equal(snapshot.clusters[0]?.closeReasonLocal, 'closed'); + + const detail = service.getTuiClusterDetail({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 7, + clusterRunId: snapshot.clusterRunId ?? undefined, + }); + assert.equal(detail.members.length, 1); + assert.equal(detail.members[0]?.number, 42); + } finally { + service.close(); + } +}); + test('excludeThreadFromCluster records a durable manual exclusion', () => { const service = makeTestService({ getRepo: async () => ({}), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f730938..e57dca2 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -191,6 +191,12 @@ type ActiveVectorRow = ThreadRow & { vector_backend: string; }; +type DurableTuiClosure = { + clusterId: number; + status: 'active' | 'closed' | 'merged' | 'split'; + closedAt: string | null; +}; + type RepoPipelineStateRow = { repo_id: number; summary_model: string; @@ -878,8 +884,8 @@ export class GHCrawlService { } const row = this.db - .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') - .get(repository.id, latestRun.id, params.clusterId) as { id: number } | undefined; + .prepare('select id, representative_thread_id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') + .get(repository.id, latestRun.id, params.clusterId) as { id: number; representative_thread_id: number | null } | undefined; if (!row) { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); } @@ -893,6 +899,7 @@ export class GHCrawlService { where id = ?`, ) .run(closedAt, row.id); + this.markDurableClusterClosedByRepresentative(repository.id, row.representative_thread_id ?? null, closedAt, 'manual'); return closeResponseSchema.parse({ ok: true, @@ -2882,7 +2889,7 @@ export class GHCrawlService { return clustersResponseSchema.parse({ repository, - clusters: clusterValues.filter((cluster) => (params.includeClosed ? true : !cluster.isClosed)), + clusters: clusterValues.filter((cluster) => (params.includeClosed ?? true ? true : !cluster.isClosed)), }); } @@ -3231,7 +3238,7 @@ export class GHCrawlService { minSize: params.minSize, sort: params.sort, search: params.search, - includeClosedClusters: params.includeClosed === true, + includeClosedClusters: params.includeClosed ?? true, }); const clusters = params.limit ? snapshot.clusters.slice(0, params.limit) : snapshot.clusters; return clusterSummariesResponseSchema.parse({ @@ -3266,7 +3273,7 @@ export class GHCrawlService { owner: params.owner, repo: params.repo, minSize: 0, - includeClosedClusters: params.includeClosed === true, + includeClosedClusters: params.includeClosed ?? true, }); const cluster = snapshot.clusters.find((item) => item.clusterId === params.clusterId); if (!cluster) { @@ -3328,12 +3335,17 @@ export class GHCrawlService { const repository = this.requireRepository(params.owner, params.repo); const stats = this.getTuiRepoStats(repository.id); const latestRun = this.getLatestClusterRun(repository.id); - if (!latestRun) { - return { repository, stats, clusterRunId: null, clusters: [] }; - } - const includeClosedClusters = params.includeClosedClusters ?? true; - const clusters = this.listRawTuiClusters(repository.id, latestRun.id) + const rawClusters = latestRun ? this.listRawTuiClusters(repository.id, latestRun.id) : []; + const representedThreadIds = new Set( + rawClusters + .map((cluster) => cluster.representativeThreadId) + .filter((threadId): threadId is number => threadId !== null), + ); + const durableClosedClusters = includeClosedClusters + ? this.listClosedDurableTuiClusters(repository.id, representedThreadIds) + : []; + const clusters = [...rawClusters, ...durableClosedClusters] .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed)) .filter((cluster) => cluster.totalCount >= (params.minSize ?? 1)) .filter((cluster) => { @@ -3346,7 +3358,7 @@ export class GHCrawlService { return { repository, stats, - clusterRunId: latestRun.id, + clusterRunId: latestRun?.id ?? null, clusters, }; } @@ -3410,27 +3422,52 @@ export class GHCrawlService { const clusterRunId = params.clusterRunId ?? (this.getLatestClusterRun(repository.id)?.id ?? null); - if (!clusterRunId) { - throw new Error(`No completed cluster run found for ${repository.fullName}. Run cluster first.`); - } - const summary = this.getRawTuiClusterSummary(repository.id, clusterRunId, params.clusterId); - if (!summary) { + const summary = clusterRunId ? this.getRawTuiClusterSummary(repository.id, clusterRunId, params.clusterId) : null; + const durableSummary = summary ? null : this.getDurableTuiClusterSummary(repository.id, params.clusterId); + const resolvedSummary = summary ?? durableSummary; + if (!resolvedSummary) { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); } - const rows = this.db - .prepare( - `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative - from cluster_members cm - join threads t on t.id = cm.thread_id - where cm.cluster_id = ? - order by - case t.kind when 'issue' then 0 else 1 end asc, - coalesce(t.updated_at_gh, t.updated_at) desc, - t.number desc`, - ) - .all(params.clusterId) as Array<{ + const rows = summary + ? (this.db + .prepare( + `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative + from cluster_members cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + order by + case t.kind when 'issue' then 0 else 1 end asc, + coalesce(t.updated_at_gh, t.updated_at) desc, + t.number desc`, + ) + .all(params.clusterId) as Array<{ + id: number; + number: number; + kind: 'issue' | 'pull_request'; + state: string; + closed_at_local: string | null; + title: string; + updated_at_gh: string | null; + html_url: string; + labels_json: string; + score_to_representative: number | null; + }>) + : (this.db + .prepare( + `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + and cm.state <> 'removed_by_user' + order by + case cm.role when 'canonical' then 0 else 1 end asc, + case t.kind when 'issue' then 0 else 1 end asc, + coalesce(t.updated_at_gh, t.updated_at) desc, + t.number desc`, + ) + .all(params.clusterId) as Array<{ id: number; number: number; kind: 'issue' | 'pull_request'; @@ -3441,21 +3478,21 @@ export class GHCrawlService { html_url: string; labels_json: string; score_to_representative: number | null; - }>; + }>); return { - clusterId: summary.clusterId, - displayTitle: summary.displayTitle, - isClosed: summary.isClosed, - closedAtLocal: summary.closedAtLocal, - closeReasonLocal: summary.closeReasonLocal, - totalCount: summary.totalCount, - issueCount: summary.issueCount, - pullRequestCount: summary.pullRequestCount, - latestUpdatedAt: summary.latestUpdatedAt, - representativeThreadId: summary.representativeThreadId, - representativeNumber: summary.representativeNumber, - representativeKind: summary.representativeKind, + clusterId: resolvedSummary.clusterId, + displayTitle: resolvedSummary.displayTitle, + isClosed: resolvedSummary.isClosed, + closedAtLocal: resolvedSummary.closedAtLocal, + closeReasonLocal: resolvedSummary.closeReasonLocal, + totalCount: resolvedSummary.totalCount, + issueCount: resolvedSummary.issueCount, + pullRequestCount: resolvedSummary.pullRequestCount, + latestUpdatedAt: resolvedSummary.latestUpdatedAt, + representativeThreadId: resolvedSummary.representativeThreadId, + representativeNumber: resolvedSummary.representativeNumber, + representativeKind: resolvedSummary.representativeKind, members: rows.map((row) => ({ id: row.id, number: row.number, @@ -4120,7 +4157,12 @@ export class GHCrawlService { continue; } if (row.member_count > 0 && row.closed_member_count >= row.member_count) { - const result = markClosed.run(nowIso(), clusterId); + const closedAt = nowIso(); + const result = markClosed.run(closedAt, clusterId); + const cluster = this.db.prepare('select representative_thread_id from clusters where id = ? limit 1').get(clusterId) as + | { representative_thread_id: number | null } + | undefined; + this.markDurableClusterClosedByRepresentative(repoId, cluster?.representative_thread_id ?? null, closedAt, 'all_members_closed'); changed += result.changes; continue; } @@ -4131,6 +4173,242 @@ export class GHCrawlService { return changed; } + private markDurableClusterClosedByRepresentative( + repoId: number, + representativeThreadId: number | null, + closedAt: string, + reason: string, + ): void { + if (representativeThreadId === null) return; + const identity = humanKeyForValue(`repo:${repoId}:cluster-representative:${representativeThreadId}`); + const durable = this.db + .prepare('select id from cluster_groups where repo_id = ? and stable_key = ? limit 1') + .get(repoId, identity.hash) as { id: number } | undefined; + if (!durable) return; + + this.db + .prepare( + `update cluster_groups + set status = 'closed', + closed_at = coalesce(closed_at, ?), + updated_at = ? + where id = ?`, + ) + .run(closedAt, closedAt, durable.id); + recordClusterEvent(this.db, { + clusterId: durable.id, + eventType: 'close_cluster', + actorKind: reason === 'manual' ? 'user' : 'algo', + payload: { + representativeThreadId, + reason, + }, + }); + } + + private durableClosureReason(closure: DurableTuiClosure): string { + return closure.status === 'active' ? 'closed' : closure.status; + } + + private getDurableClosuresByRepresentative(repoId: number, representativeThreadIds: number[]): Map { + const uniqueThreadIds = Array.from(new Set(representativeThreadIds)); + if (uniqueThreadIds.length === 0) { + return new Map(); + } + + const identities = uniqueThreadIds.map((threadId) => ({ + threadId, + stableKey: humanKeyForValue(`repo:${repoId}:cluster-representative:${threadId}`).hash, + })); + const placeholders = identities.map(() => '?').join(','); + const rows = this.db + .prepare( + `select id, stable_key, status, closed_at + from cluster_groups + where repo_id = ? + and stable_key in (${placeholders}) + and (status <> 'active' or closed_at is not null)`, + ) + .all(repoId, ...identities.map((identity) => identity.stableKey)) as Array<{ + id: number; + stable_key: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + }>; + const threadIdByStableKey = new Map(identities.map((identity) => [identity.stableKey, identity.threadId])); + const closures = new Map(); + for (const row of rows) { + const threadId = threadIdByStableKey.get(row.stable_key); + if (threadId === undefined) continue; + closures.set(threadId, { + clusterId: row.id, + status: row.status, + closedAt: row.closed_at, + }); + } + return closures; + } + + private listClosedDurableTuiClusters(repoId: number, representedThreadIds: Set): TuiClusterSummary[] { + const rows = this.db + .prepare( + `select + cg.id as cluster_id, + cg.stable_slug, + cg.status, + cg.closed_at, + cg.representative_thread_id, + cg.title, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + count(*) as member_count, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from cluster_groups cg + left join threads rt on rt.id = cg.representative_thread_id + join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + and (cg.status <> 'active' or cg.closed_at is not null) + group by + cg.id, + cg.stable_slug, + cg.status, + cg.closed_at, + cg.representative_thread_id, + cg.title, + rt.number, + rt.kind, + rt.title`, + ) + .all(repoId) as Array<{ + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + representative_thread_id: number | null; + title: string | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + search_text: string | null; + }>; + + return rows + .filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)) + .map((row) => + this.durableTuiSummaryFromRow({ + ...row, + representative_title: row.representative_title ?? row.title, + }), + ); + } + + private getDurableTuiClusterSummary(repoId: number, clusterId: number): TuiClusterSummary | null { + const row = this.db + .prepare( + `select + cg.id as cluster_id, + cg.stable_slug, + cg.status, + cg.closed_at, + cg.representative_thread_id, + cg.title, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + count(*) as member_count, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from cluster_groups cg + left join threads rt on rt.id = cg.representative_thread_id + join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + and cg.id = ? + group by + cg.id, + cg.stable_slug, + cg.status, + cg.closed_at, + cg.representative_thread_id, + cg.title, + rt.number, + rt.kind, + rt.title`, + ) + .get(repoId, clusterId) as + | { + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + representative_thread_id: number | null; + title: string | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + search_text: string | null; + } + | undefined; + if (!row) return null; + return this.durableTuiSummaryFromRow({ + ...row, + representative_title: row.representative_title ?? row.title, + }); + } + + private durableTuiSummaryFromRow(row: { + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + representative_thread_id: number | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + search_text: string | null; + }): TuiClusterSummary { + const closure: DurableTuiClosure = { + clusterId: row.cluster_id, + status: row.status, + closedAt: row.closed_at, + }; + const isClosed = row.status !== 'active' || row.closed_at !== null; + return { + clusterId: row.cluster_id, + displayTitle: this.clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), + isClosed, + closedAtLocal: row.closed_at, + closeReasonLocal: isClosed ? this.durableClosureReason(closure) : null, + totalCount: row.member_count, + issueCount: row.issue_count, + pullRequestCount: row.pull_request_count, + latestUpdatedAt: row.latest_updated_at, + representativeThreadId: row.representative_thread_id, + representativeNumber: row.representative_number, + representativeKind: row.representative_kind, + searchText: `${row.stable_slug} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), + }; + } + private listRawTuiClusters(repoId: number, clusterRunId: number): TuiClusterSummary[] { const rows = this.db .prepare( @@ -4178,15 +4456,23 @@ export class GHCrawlService { closed_member_count: number; search_text: string | null; }>; + const durableClosures = this.getDurableClosuresByRepresentative( + repoId, + rows + .map((row) => row.representative_thread_id) + .filter((threadId): threadId is number => threadId !== null), + ); return rows.map((row) => { const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + const durableClosure = + row.representative_thread_id === null ? null : (durableClosures.get(row.representative_thread_id) ?? null); return { clusterId: row.cluster_id, displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), - isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count, - closedAtLocal: row.closed_at_local, - closeReasonLocal: row.close_reason_local, + isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, + closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, + closeReasonLocal: row.close_reason_local ?? (durableClosure ? this.durableClosureReason(durableClosure) : null), totalCount: row.member_count, issueCount: row.issue_count, pullRequestCount: row.pull_request_count, @@ -4254,12 +4540,16 @@ export class GHCrawlService { } const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + const durableClosure = + row.representative_thread_id === null + ? null + : (this.getDurableClosuresByRepresentative(repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); return { clusterId: row.cluster_id, displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), - isClosed: row.close_reason_local !== null || row.closed_member_count >= row.member_count, - closedAtLocal: row.closed_at_local, - closeReasonLocal: row.close_reason_local, + isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, + closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, + closeReasonLocal: row.close_reason_local ?? (durableClosure ? this.durableClosureReason(durableClosure) : null), totalCount: row.member_count, issueCount: row.issue_count, pullRequestCount: row.pull_request_count, From bf0a697c72f3dd295767e922b9df29e6731d1c47 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 00:04:07 -0700 Subject: [PATCH 115/215] fix: show archived closed clusters --- packages/api-core/src/service.test.ts | 37 ++++++++++++++++++++++++++- packages/api-core/src/service.ts | 21 ++++++++++++--- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index fb614f7..3945e2f 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -3987,6 +3987,15 @@ test('tui snapshot includes durable closed clusters missing from the latest run' ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(10, 1, '100', 42, 'issue', 'closed', 'Closed durable issue', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, now, null, now, now, now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(11, 1, '101', 43, 'pull_request', 'closed', 'Archived durable PR', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/pull/43', '[]', '[]', '{}', 'hash-43', 0, now, now, now, now, now, now, now); service.db .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) .run(1, 1, 'openclaw/openclaw', 'completed', now, now); @@ -3997,6 +4006,13 @@ test('tui snapshot includes durable closed clusters missing from the latest run' ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(7, 1, 'stable-key', 'trace-alpha-river', 'closed', 'duplicate_candidate', 10, 'Closed durable cluster', now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(8, 1, 'stable-key-archived', 'archive-blue-harbor', 'active', 'duplicate_candidate', 11, 'Archived durable cluster', now, now, null); service.db .prepare( `insert into cluster_memberships ( @@ -4005,15 +4021,26 @@ test('tui snapshot includes durable closed clusters missing from the latest run' ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(7, 10, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(8, 11, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); const hidden = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosedClusters: false }); assert.equal(hidden.clusters.length, 0); const snapshot = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); - assert.equal(snapshot.clusters.length, 1); + assert.equal(snapshot.clusters.length, 2); assert.equal(snapshot.clusters[0]?.clusterId, 7); assert.equal(snapshot.clusters[0]?.isClosed, true); assert.equal(snapshot.clusters[0]?.closeReasonLocal, 'closed'); + assert.equal(snapshot.clusters[1]?.clusterId, 8); + assert.equal(snapshot.clusters[1]?.isClosed, true); + assert.equal(snapshot.clusters[1]?.closeReasonLocal, 'all_members_closed'); const detail = service.getTuiClusterDetail({ owner: 'openclaw', @@ -4023,6 +4050,14 @@ test('tui snapshot includes durable closed clusters missing from the latest run' }); assert.equal(detail.members.length, 1); assert.equal(detail.members[0]?.number, 42); + const archivedDetail = service.getTuiClusterDetail({ + owner: 'openclaw', + repo: 'openclaw', + clusterId: 8, + clusterRunId: snapshot.clusterRunId ?? undefined, + }); + assert.equal(archivedDetail.members.length, 1); + assert.equal(archivedDetail.members[0]?.number, 43); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e57dca2..a12c9c9 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -4266,13 +4266,13 @@ export class GHCrawlService { max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, group_concat(lower(coalesce(t.title, '')), ' ') as search_text from cluster_groups cg left join threads rt on rt.id = cg.representative_thread_id join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' join threads t on t.id = cm.thread_id where cg.repo_id = ? - and (cg.status <> 'active' or cg.closed_at is not null) group by cg.id, cg.stable_slug, @@ -4282,7 +4282,10 @@ export class GHCrawlService { cg.title, rt.number, rt.kind, - rt.title`, + rt.title + having cg.status <> 'active' + or cg.closed_at is not null + or closed_member_count >= member_count`, ) .all(repoId) as Array<{ cluster_id: number; @@ -4298,6 +4301,7 @@ export class GHCrawlService { latest_updated_at: string | null; issue_count: number; pull_request_count: number; + closed_member_count: number; search_text: string | null; }>; @@ -4328,6 +4332,7 @@ export class GHCrawlService { max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, group_concat(lower(coalesce(t.title, '')), ' ') as search_text from cluster_groups cg left join threads rt on rt.id = cg.representative_thread_id @@ -4361,6 +4366,7 @@ export class GHCrawlService { latest_updated_at: string | null; issue_count: number; pull_request_count: number; + closed_member_count: number; search_text: string | null; } | undefined; @@ -4384,6 +4390,7 @@ export class GHCrawlService { latest_updated_at: string | null; issue_count: number; pull_request_count: number; + closed_member_count: number; search_text: string | null; }): TuiClusterSummary { const closure: DurableTuiClosure = { @@ -4391,13 +4398,19 @@ export class GHCrawlService { status: row.status, closedAt: row.closed_at, }; - const isClosed = row.status !== 'active' || row.closed_at !== null; + const isClosed = row.status !== 'active' || row.closed_at !== null || row.closed_member_count >= row.member_count; + const closeReasonLocal = + row.status !== 'active' || row.closed_at !== null + ? this.durableClosureReason(closure) + : row.closed_member_count >= row.member_count + ? 'all_members_closed' + : null; return { clusterId: row.cluster_id, displayTitle: this.clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), isClosed, closedAtLocal: row.closed_at, - closeReasonLocal: isClosed ? this.durableClosureReason(closure) : null, + closeReasonLocal, totalCount: row.member_count, issueCount: row.issue_count, pullRequestCount: row.pull_request_count, From b9c952ea88a2f5e2a5260d2aac33ead6c985a6ea Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 00:13:25 -0700 Subject: [PATCH 116/215] feat: add sqlite optimize command --- .../skills/ghcrawl-cluster-operator/SKILL.md | 12 ++ README.md | 5 +- apps/cli/src/main.test.ts | 39 ++++ apps/cli/src/main.ts | 40 ++++ packages/api-contract/src/contracts.ts | 34 ++++ packages/api-core/src/service.test.ts | 49 +++++ packages/api-core/src/service.ts | 180 ++++++++++++++++++ 7 files changed, 358 insertions(+), 1 deletion(-) diff --git a/.agents/skills/ghcrawl-cluster-operator/SKILL.md b/.agents/skills/ghcrawl-cluster-operator/SKILL.md index 7606eb9..4050c12 100644 --- a/.agents/skills/ghcrawl-cluster-operator/SKILL.md +++ b/.agents/skills/ghcrawl-cluster-operator/SKILL.md @@ -12,6 +12,7 @@ Use this skill when operating this repo's local-first GitHub crawler and cluster - Prefer read-only inspection commands first: `doctor`, `runs`, `clusters`, `cluster-explain`, `threads`. - Treat `refresh`, `sync`, `summarize`, `key-summaries`, and `embed` as remote/API-spend commands. - `cluster` is local-only but can be CPU-heavy on huge repos. +- `optimize` is local-only SQLite maintenance; run it after heavy sync, embedding, clustering, or close/archive sessions. - Always pass `--json` for agent-readable output. - Use `--include-code` only when file overlap matters; it hydrates PR file metadata and can increase DB size. @@ -21,6 +22,7 @@ Use this skill when operating this repo's local-first GitHub crawler and cluster ghcrawl doctor --json ghcrawl configure --json ghcrawl runs owner/repo --limit 10 --json +ghcrawl optimize owner/repo --json ``` If the local store is empty or stale, pull current open GitHub data: @@ -118,3 +120,13 @@ After edits, re-run: ghcrawl cluster owner/repo --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 50 --event-limit 50 --json ``` + +## Local Store Maintenance + +Run maintenance after large data changes: + +```bash +ghcrawl optimize owner/repo --json +``` + +Without `owner/repo`, `optimize` only checkpoints, analyzes, optimizes, and vacuums the main ghcrawl SQLite database. With `owner/repo`, it also optimizes that repo's vector SQLite store and reports the vector `.hnsw` sidecar size without rebuilding it. diff --git a/README.md b/README.md index 686d9e3..0859279 100644 --- a/README.md +++ b/README.md @@ -207,13 +207,13 @@ ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --json ghcrawl clusters owner/repo --min-size 10 --hide-closed --json ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json ghcrawl cluster-detail owner/repo --id 123 --hide-closed --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ghcrawl search owner/repo --query "download stalls" --json +ghcrawl optimize owner/repo --json ``` Use `threads --numbers ...` when you want several specific issue or PR records in one CLI call instead of paying process startup overhead repeatedly. @@ -224,6 +224,8 @@ Use `close-thread` when you know a local issue/PR should be treated as closed be Use `close-cluster` when you want to locally suppress a whole cluster from default JSON exploration without waiting for a rebuild. +Use `optimize` after heavy sync, embedding, clustering, or close/archive sessions. It checkpoints WAL files, refreshes planner stats, runs SQLite optimize, and vacuums the main database. When passed `owner/repo`, it also optimizes that repo's vector SQLite store and reports the `.hnsw` sidecar size without rebuilding it. + ## Durable Cluster Governance The durable cluster commands operate on stable cluster identities, not one-off run snapshots: @@ -284,6 +286,7 @@ The skill is built around the stable JSON CLI surface and is intentionally conse ```bash ghcrawl doctor --json ghcrawl refresh owner/repo +ghcrawl optimize owner/repo --json ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 674a2b6..0045851 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -46,6 +46,7 @@ const publicCommands = [ 'version', 'sync', 'refresh', + 'optimize', 'runs', 'threads', 'close-thread', @@ -607,6 +608,44 @@ test('cluster command forwards neighborhood refresh inputs', async () => { assert.match(stdout.read(), /"edges": 3/); }); +test('optimize command forwards optional repository target', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.optimizeStorage; + const received: unknown[] = []; + + GHCrawlService.prototype.optimizeStorage = function optimizeStorageStub(params: unknown) { + received.push(params); + return { + ok: true, + repository: params ? { fullName: 'openclaw/openclaw' } : null, + startedAt: '2026-03-10T12:00:00Z', + finishedAt: '2026-03-10T12:00:01Z', + targets: [], + bytesReclaimed: 42, + message: 'optimized', + } as never; + }; + + try { + await run(['optimize'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + await run(['optimize', 'openclaw/openclaw'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.optimizeStorage = original; + context.cleanup(); + } + + assert.deepEqual(received[0], undefined); + assert.deepEqual(received[1], { owner: 'openclaw', repo: 'openclaw' }); + assert.match(stdout.read(), /"bytesReclaimed": 42/); +}); + test('clusters command shows closed clusters by default and forwards hide-closed', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 5c4e0f7..6eb381c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -15,6 +15,7 @@ type CommandName = | 'version' | 'sync' | 'refresh' + | 'optimize' | 'runs' | 'threads' | 'close-thread' @@ -152,6 +153,17 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl refresh openclaw/openclaw', 'ghcrawl refresh openclaw/openclaw --no-sync --json'], agentJson: true, }, + { + name: 'optimize', + synopsis: 'optimize [owner/repo] [--json]', + description: 'Checkpoint, analyze, optimize, and vacuum local SQLite stores.', + options: [ + 'owner/repo Also optimize this repository vector store when present', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl optimize --json', 'ghcrawl optimize openclaw/openclaw --json'], + agentJson: true, + }, { name: 'runs', synopsis: 'runs [--kind sync|summary|embedding|cluster] [--limit ] [--json]', @@ -1058,6 +1070,34 @@ export async function run( heapDiagnostics?.dispose(); } } + case 'optimize': { + const parsed = parseArgsForCommand( + 'optimize', + rest, + { + owner: { type: 'string' }, + repo: { type: 'string' }, + json: { type: 'boolean' }, + }, + true, + ); + const values = parsed.values as RepoCommandValues; + if (parsed.positionals.length > 1) { + throw new CliUsageError('Too many positional arguments for optimize', 'optimize'); + } + let target: { owner: string; repo: string } | undefined; + if (parsed.positionals.length === 1) { + target = parseOwnerRepo(parsed.positionals[0]); + } else if (typeof values.owner === 'string' || typeof values.repo === 'string') { + if (typeof values.owner !== 'string' || typeof values.repo !== 'string') { + throw new CliUsageError('Both --owner and --repo are required when either is set', 'optimize'); + } + target = { owner: values.owner, repo: values.repo }; + } + const result = getService().optimizeStorage(target); + writeJson(stdout, result); + return; + } case 'runs': { const { owner, repo, values } = parseRepoFlags('runs', rest); const kind = parseEnum('runs', 'kind', values.kind, ['sync', 'summary', 'embedding', 'cluster']); diff --git a/packages/api-contract/src/contracts.ts b/packages/api-contract/src/contracts.ts index 4ec7e76..a74d239 100644 --- a/packages/api-contract/src/contracts.ts +++ b/packages/api-contract/src/contracts.ts @@ -319,6 +319,40 @@ export const refreshResponseSchema = z.object({ }); export type RefreshResponse = z.infer; +export const optimizeTargetSchema = z.object({ + name: z.enum(['main', 'vector']), + path: z.string(), + existed: z.boolean(), + pageSize: z.number().int().nonnegative(), + pageCountBefore: z.number().int().nonnegative(), + pageCountAfter: z.number().int().nonnegative(), + freelistPagesBefore: z.number().int().nonnegative(), + freelistPagesAfter: z.number().int().nonnegative(), + bytesBefore: z.number().int().nonnegative(), + bytesAfter: z.number().int().nonnegative(), + walBytesBefore: z.number().int().nonnegative(), + walBytesAfter: z.number().int().nonnegative(), + shmBytesBefore: z.number().int().nonnegative(), + shmBytesAfter: z.number().int().nonnegative(), + sidecarBytesBefore: z.number().int().nonnegative(), + sidecarBytesAfter: z.number().int().nonnegative(), + bytesReclaimed: z.number().int().nonnegative(), + operations: z.array(z.string()), + durationMs: z.number().int().nonnegative(), +}); +export type OptimizeTargetDto = z.infer; + +export const optimizeResponseSchema = z.object({ + ok: z.boolean(), + repository: repositorySchema.nullable(), + startedAt: z.string(), + finishedAt: z.string(), + targets: z.array(optimizeTargetSchema), + bytesReclaimed: z.number().int().nonnegative(), + message: z.string(), +}); +export type OptimizeResponse = z.infer; + export const closeThreadRequestSchema = z.object({ owner: z.string(), repo: z.string(), diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 3945e2f..e12410a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -116,6 +116,55 @@ test('doctor reports missing GitHub token without attempting network auth', asyn } }); +test('optimizeStorage runs SQLite maintenance and reports missing vector store', () => { + const config = makeTestConfig(); + const service = new GHCrawlService({ + config: { + ...config, + dbPath: path.join(config.configDir, 'optimize.db'), + }, + github: { + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-10T12:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db.exec('create table optimize_scratch (value text)'); + const insert = service.db.prepare('insert into optimize_scratch (value) values (?)'); + for (let index = 0; index < 200; index += 1) { + insert.run(`payload-${index}`); + } + service.db.exec('delete from optimize_scratch'); + + const response = service.optimizeStorage({ owner: 'openclaw', repo: 'openclaw' }); + + assert.equal(response.ok, true); + assert.equal(response.repository?.fullName, 'openclaw/openclaw'); + assert.equal(response.targets[0]?.name, 'main'); + assert.equal(response.targets[0]?.existed, true); + assert.ok(response.targets[0]?.operations.includes('vacuum')); + assert.equal(response.targets[1]?.name, 'vector'); + assert.equal(response.targets[1]?.existed, false); + assert.deepEqual(response.targets[1]?.operations, ['skipped_missing_vector_store']); + } finally { + service.close(); + } +}); + test('listRunHistory returns recent runs across pipeline tables', () => { const service = makeTestService({ getRepo: async () => ({}), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index a12c9c9..591bc69 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -24,6 +24,7 @@ import { embedResultSchema, healthResponseSchema, neighborsResponseSchema, + optimizeResponseSchema, refreshResponseSchema, repositoriesResponseSchema, runHistoryResponseSchema, @@ -49,6 +50,7 @@ import { type IncludeClusterMemberRequest, type MergeClustersRequest, type NeighborsResponse, + type OptimizeResponse, type RefreshResponse, type RepositoriesResponse, type RepositoryDto, @@ -191,6 +193,16 @@ type ActiveVectorRow = ThreadRow & { vector_backend: string; }; +type SqliteMaintenanceStats = { + pageSize: number; + pageCount: number; + freelistPages: number; + bytes: number; + walBytes: number; + shmBytes: number; + sidecarBytes: number; +}; + type DurableTuiClosure = { clusterId: number; status: 'active' | 'closed' | 'merged' | 'split'; @@ -3223,6 +3235,170 @@ export class GHCrawlService { }); } + optimizeStorage(params: { owner?: string; repo?: string } = {}): OptimizeResponse { + const startedAt = nowIso(); + const repository = + params.owner && params.repo + ? this.requireRepository(params.owner, params.repo) + : null; + + const targets = [ + this.optimizeSqliteTarget({ + name: 'main', + db: this.db, + dbPath: this.config.dbPath, + }), + ]; + + if (repository) { + const storePath = this.repoVectorStorePath(repository.fullName); + const sidecarPath = this.vectorStoreSidecarPath(storePath); + if (existsSync(storePath)) { + this.vectorStore.close(); + const vectorDb = openDb(storePath) as SqliteDatabase & { loadExtension: (extensionPath: string) => void }; + try { + const vectorlite = requireFromHere('vectorlite') as { vectorlitePath: () => string }; + vectorDb.loadExtension(vectorlite.vectorlitePath()); + targets.push( + this.optimizeSqliteTarget({ + name: 'vector', + db: vectorDb, + dbPath: storePath, + sidecarPath, + }), + ); + } finally { + vectorDb.close(); + } + } else { + targets.push({ + name: 'vector' as const, + path: storePath, + existed: false, + pageSize: 0, + pageCountBefore: 0, + pageCountAfter: 0, + freelistPagesBefore: 0, + freelistPagesAfter: 0, + bytesBefore: 0, + bytesAfter: 0, + walBytesBefore: 0, + walBytesAfter: 0, + shmBytesBefore: 0, + shmBytesAfter: 0, + sidecarBytesBefore: this.fileSize(sidecarPath), + sidecarBytesAfter: this.fileSize(sidecarPath), + bytesReclaimed: 0, + operations: ['skipped_missing_vector_store'], + durationMs: 0, + }); + } + } + + const bytesReclaimed = targets.reduce((sum, target) => sum + target.bytesReclaimed, 0); + return optimizeResponseSchema.parse({ + ok: true, + repository, + startedAt, + finishedAt: nowIso(), + targets, + bytesReclaimed, + message: `Optimized ${targets.filter((target) => target.existed).length} SQLite store(s); reclaimed ${bytesReclaimed} byte(s).`, + }); + } + + private optimizeSqliteTarget(params: { + name: 'main' | 'vector'; + db: SqliteDatabase; + dbPath: string; + sidecarPath?: string; + }): OptimizeResponse['targets'][number] { + const startedAt = Date.now(); + const before = this.sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); + const operations: string[] = []; + + this.runMaintenanceStep(params.db, 'wal_checkpoint_truncate_before', operations, () => { + params.db.pragma('wal_checkpoint(TRUNCATE)'); + }); + this.runMaintenanceStep(params.db, 'analyze', operations, () => { + params.db.exec('analyze'); + }); + this.runMaintenanceStep(params.db, 'pragma_optimize', operations, () => { + params.db.pragma('optimize'); + }); + this.runMaintenanceStep(params.db, 'vacuum', operations, () => { + params.db.exec('vacuum'); + }); + this.runMaintenanceStep(params.db, 'wal_checkpoint_truncate_after', operations, () => { + params.db.pragma('wal_checkpoint(TRUNCATE)'); + }); + + const after = this.sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); + const bytesBefore = before.bytes + before.walBytes + before.shmBytes; + const bytesAfter = after.bytes + after.walBytes + after.shmBytes; + + return { + name: params.name, + path: params.dbPath, + existed: params.dbPath === ':memory:' || existsSync(params.dbPath), + pageSize: after.pageSize || before.pageSize, + pageCountBefore: before.pageCount, + pageCountAfter: after.pageCount, + freelistPagesBefore: before.freelistPages, + freelistPagesAfter: after.freelistPages, + bytesBefore: before.bytes, + bytesAfter: after.bytes, + walBytesBefore: before.walBytes, + walBytesAfter: after.walBytes, + shmBytesBefore: before.shmBytes, + shmBytesAfter: after.shmBytes, + sidecarBytesBefore: before.sidecarBytes, + sidecarBytesAfter: after.sidecarBytes, + bytesReclaimed: Math.max(0, bytesBefore - bytesAfter), + operations, + durationMs: Date.now() - startedAt, + }; + } + + private runMaintenanceStep(db: SqliteDatabase, label: string, operations: string[], step: () => void): void { + try { + step(); + operations.push(label); + } catch (error) { + operations.push(`${label}_skipped:${error instanceof Error ? error.message : String(error)}`); + } + } + + private sqliteMaintenanceStats(db: SqliteDatabase, dbPath: string, sidecarPath?: string): SqliteMaintenanceStats { + return { + pageSize: this.safePragmaNumber(db, 'page_size'), + pageCount: this.safePragmaNumber(db, 'page_count'), + freelistPages: this.safePragmaNumber(db, 'freelist_count'), + bytes: this.fileSize(dbPath), + walBytes: this.fileSize(`${dbPath}-wal`), + shmBytes: this.fileSize(`${dbPath}-shm`), + sidecarBytes: sidecarPath ? this.fileSize(sidecarPath) : 0, + }; + } + + private safePragmaNumber(db: SqliteDatabase, name: string): number { + try { + const value = db.pragma(name, { simple: true }) as unknown; + return typeof value === 'number' && Number.isFinite(value) ? value : 0; + } catch { + return 0; + } + } + + private fileSize(filePath: string): number { + if (filePath === ':memory:') return 0; + try { + return fs.statSync(filePath).size; + } catch { + return 0; + } + } + listClusterSummaries(params: { owner: string; repo: string; @@ -3929,6 +4105,10 @@ export class GHCrawlService { return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`); } + private vectorStoreSidecarPath(storePath: string): string { + return path.join(path.dirname(storePath), `${path.basename(storePath, path.extname(storePath))}.hnsw`); + } + private queryNearestWithRecovery( repoId: number, repoFullName: string, From 00c8c246e9cdd36a05cb4e524aeb757d6cb4a78c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 00:27:49 -0700 Subject: [PATCH 117/215] fix: keep durable clusters stable --- packages/api-core/src/service.test.ts | 101 +++++++++++++- packages/api-core/src/service.ts | 190 +++++++++++++++++++------- 2 files changed, 238 insertions(+), 53 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index e12410a..51ae5ab 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4,6 +4,7 @@ import fs from 'node:fs'; import os from 'node:os'; import path from 'node:path'; +import { humanKeyForValue } from './cluster/human-key.js'; import { GHCrawlService } from './service.js'; import type { VectorStore } from './vector/store.js'; @@ -2227,6 +2228,76 @@ test('clusterRepository rebuilds a corrupted active vector store and retries', a } }); +test('durable cluster identity survives representative changes by member overlap', () => { + const service = makeTestService({ + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + const now = '2026-03-09T00:00:00Z'; + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', '{}', now); + service.db + .prepare("insert into pipeline_runs (id, repo_id, run_kind, status, started_at) values (?, ?, 'cluster', 'completed', ?)") + .run(1, 1, now); + service.db + .prepare("insert into pipeline_runs (id, repo_id, run_kind, status, started_at) values (?, ?, 'cluster', 'completed', ?)") + .run(2, 1, now); + const insertThread = service.db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + insertThread.run(10, 1, '100', 42, 'issue', 'open', 'Gateway crash', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + insertThread.run(11, 1, '101', 43, 'issue', 'open', 'Gateway crash duplicate', 'body', 'bob', 'User', 'https://github.com/openclaw/openclaw/issues/43', '[]', '[]', '{}', 'hash-43', 0, now, now, null, null, now, now, now); + insertThread.run(12, 1, '102', 44, 'issue', 'open', 'Gateway crash follow-up', 'body', 'carol', 'User', 'https://github.com/openclaw/openclaw/issues/44', '[]', '[]', '{}', 'hash-44', 0, now, now, null, null, now, now, now); + + const durable = service as unknown as { + persistDurableClusterState( + repoId: number, + pipelineRunId: number, + aggregatedEdges: Map }>, + clusters: Array<{ representativeThreadId: number; members: number[] }>, + ): void; + }; + const noEdges = new Map }>(); + + durable.persistDurableClusterState(1, 1, noEdges, [{ representativeThreadId: 10, members: [10, 11] }]); + const first = service.db.prepare('select id, stable_slug from cluster_groups limit 1').get() as { id: number; stable_slug: string }; + + durable.persistDurableClusterState(1, 2, noEdges, [{ representativeThreadId: 11, members: [10, 11, 12] }]); + const groups = service.db.prepare('select id, stable_slug, representative_thread_id from cluster_groups order by id asc').all() as Array<{ + id: number; + stable_slug: string; + representative_thread_id: number; + }>; + const members = service.db + .prepare('select thread_id from cluster_memberships where cluster_id = ? order by thread_id asc') + .all(first.id) as Array<{ thread_id: number }>; + + assert.deepEqual(groups, [{ id: first.id, stable_slug: first.stable_slug, representative_thread_id: 11 }]); + assert.deepEqual( + members.map((member) => member.thread_id), + [10, 11, 12], + ); + } finally { + service.close(); + } +}); + test('clusterRepository falls back to deterministic fingerprints when vectors are missing', async () => { const service = new GHCrawlService({ config: makeTestConfig(), @@ -3970,6 +4041,14 @@ test('manual cluster closure is shown by default and can be hidden from JSON sum ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(10, 1, '100', 42, 'issue', 'open', 'Issue one', 'body', 'alice', 'User', 'https://github.com/openclaw/openclaw/issues/42', '[]', '[]', '{}', 'hash-42', 0, now, now, null, null, now, now, now); + const durableIdentity = humanKeyForValue('repo:1:cluster-representative:10'); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(7, 1, durableIdentity.hash, durableIdentity.slug, 'active', 'duplicate_candidate', 10, 'Durable cluster', now, now, null); service.db .prepare(`insert into cluster_runs (id, repo_id, scope, status, started_at, finished_at) values (?, ?, ?, ?, ?, ?)`) .run(1, 1, 'openclaw/openclaw', 'completed', now, now); @@ -3986,6 +4065,11 @@ test('manual cluster closure is shown by default and can be hidden from JSON sum const response = service.closeClusterLocally({ owner: 'openclaw', repo: 'openclaw', clusterId: 100 }); assert.equal(response.ok, true); assert.equal(response.clusterClosed, true); + const durable = service.db.prepare('select status, closed_at from cluster_groups where id = ?').get(7) as { + status: string; + closed_at: string | null; + }; + assert.deepEqual(durable, { status: 'active', closed_at: null }); assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosed: false }).clusters.length, 0); assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }).clusters.length, 1); @@ -4062,6 +4146,13 @@ test('tui snapshot includes durable closed clusters missing from the latest run' ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(8, 1, 'stable-key-archived', 'archive-blue-harbor', 'active', 'duplicate_candidate', 11, 'Archived durable cluster', now, now, null); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(9, 1, 'stable-key-duplicate-archived', 'archive-blue-duplicate', 'active', 'duplicate_candidate', 11, 'Duplicate archived durable cluster', now, now, null); service.db .prepare( `insert into cluster_memberships ( @@ -4078,6 +4169,14 @@ test('tui snapshot includes durable closed clusters missing from the latest run' ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ) .run(8, 11, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(9, 11, 'canonical', 'active', 1, null, null, 'algo', null, '{}', null, now, now, null); const hidden = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosedClusters: false }); assert.equal(hidden.clusters.length, 0); @@ -4086,7 +4185,7 @@ test('tui snapshot includes durable closed clusters missing from the latest run' assert.equal(snapshot.clusters.length, 2); assert.equal(snapshot.clusters[0]?.clusterId, 7); assert.equal(snapshot.clusters[0]?.isClosed, true); - assert.equal(snapshot.clusters[0]?.closeReasonLocal, 'closed'); + assert.equal(snapshot.clusters[0]?.closeReasonLocal, 'all_members_closed'); assert.equal(snapshot.clusters[1]?.clusterId, 8); assert.equal(snapshot.clusters[1]?.isClosed, true); assert.equal(snapshot.clusters[1]?.closeReasonLocal, 'all_members_closed'); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 591bc69..4d3bbec 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -452,6 +452,7 @@ const SYNC_BATCH_SIZE = 100; const SYNC_BATCH_DELAY_MS = 5000; const STALE_CLOSED_SWEEP_LIMIT = 1000; const CLUSTER_PROGRESS_INTERVAL_MS = 5000; +const DURABLE_CLUSTER_REUSE_MIN_OVERLAP = 0.8; const RAW_JSON_INLINE_THRESHOLD_BYTES = 4096; const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000; const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3; @@ -896,8 +897,8 @@ export class GHCrawlService { } const row = this.db - .prepare('select id, representative_thread_id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') - .get(repository.id, latestRun.id, params.clusterId) as { id: number; representative_thread_id: number | null } | undefined; + .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') + .get(repository.id, latestRun.id, params.clusterId) as { id: number } | undefined; if (!row) { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); } @@ -911,7 +912,6 @@ export class GHCrawlService { where id = ?`, ) .run(closedAt, row.id); - this.markDurableClusterClosedByRepresentative(repository.id, row.representative_thread_id ?? null, closedAt, 'manual'); return closeResponseSchema.parse({ ok: true, @@ -4339,10 +4339,6 @@ export class GHCrawlService { if (row.member_count > 0 && row.closed_member_count >= row.member_count) { const closedAt = nowIso(); const result = markClosed.run(closedAt, clusterId); - const cluster = this.db.prepare('select representative_thread_id from clusters where id = ? limit 1').get(clusterId) as - | { representative_thread_id: number | null } - | undefined; - this.markDurableClusterClosedByRepresentative(repoId, cluster?.representative_thread_id ?? null, closedAt, 'all_members_closed'); changed += result.changes; continue; } @@ -4353,41 +4349,8 @@ export class GHCrawlService { return changed; } - private markDurableClusterClosedByRepresentative( - repoId: number, - representativeThreadId: number | null, - closedAt: string, - reason: string, - ): void { - if (representativeThreadId === null) return; - const identity = humanKeyForValue(`repo:${repoId}:cluster-representative:${representativeThreadId}`); - const durable = this.db - .prepare('select id from cluster_groups where repo_id = ? and stable_key = ? limit 1') - .get(repoId, identity.hash) as { id: number } | undefined; - if (!durable) return; - - this.db - .prepare( - `update cluster_groups - set status = 'closed', - closed_at = coalesce(closed_at, ?), - updated_at = ? - where id = ?`, - ) - .run(closedAt, closedAt, durable.id); - recordClusterEvent(this.db, { - clusterId: durable.id, - eventType: 'close_cluster', - actorKind: reason === 'manual' ? 'user' : 'algo', - payload: { - representativeThreadId, - reason, - }, - }); - } - - private durableClosureReason(closure: DurableTuiClosure): string { - return closure.status === 'active' ? 'closed' : closure.status; + private durableClosureReason(closure: DurableTuiClosure): string | null { + return closure.status === 'merged' || closure.status === 'split' ? closure.status : null; } private getDurableClosuresByRepresentative(repoId: number, representativeThreadIds: number[]): Map { @@ -4407,7 +4370,7 @@ export class GHCrawlService { from cluster_groups where repo_id = ? and stable_key in (${placeholders}) - and (status <> 'active' or closed_at is not null)`, + and status in ('merged', 'split')`, ) .all(repoId, ...identities.map((identity) => identity.stableKey)) as Array<{ id: number; @@ -4447,6 +4410,7 @@ export class GHCrawlService { sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, + group_concat(t.id, ',') as member_thread_ids, group_concat(lower(coalesce(t.title, '')), ' ') as search_text from cluster_groups cg left join threads rt on rt.id = cg.representative_thread_id @@ -4463,8 +4427,7 @@ export class GHCrawlService { rt.number, rt.kind, rt.title - having cg.status <> 'active' - or cg.closed_at is not null + having cg.status in ('merged', 'split') or closed_member_count >= member_count`, ) .all(repoId) as Array<{ @@ -4482,11 +4445,13 @@ export class GHCrawlService { issue_count: number; pull_request_count: number; closed_member_count: number; + member_thread_ids: string | null; search_text: string | null; }>; - return rows - .filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)) + return this.collapseOverlappingClosedDurableRows( + rows.filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)), + ) .map((row) => this.durableTuiSummaryFromRow({ ...row, @@ -4495,6 +4460,50 @@ export class GHCrawlService { ); } + private collapseOverlappingClosedDurableRows< + T extends { + cluster_id: number; + member_count: number; + latest_updated_at: string | null; + member_thread_ids: string | null; + }, + >(rows: T[]): T[] { + const sortedRows = [...rows].sort((left, right) => { + const leftTime = left.latest_updated_at ? Date.parse(left.latest_updated_at) : 0; + const rightTime = right.latest_updated_at ? Date.parse(right.latest_updated_at) : 0; + return right.member_count - left.member_count || rightTime - leftTime || left.cluster_id - right.cluster_id; + }); + const selected: Array<{ row: T; memberIds: Set }> = []; + + for (const row of sortedRows) { + const memberIds = this.parseMemberThreadIdSet(row.member_thread_ids); + const duplicate = selected.some((entry) => { + const smallerSize = Math.min(memberIds.size, entry.memberIds.size); + if (smallerSize === 0) return false; + let overlap = 0; + for (const memberId of memberIds) { + if (entry.memberIds.has(memberId)) overlap += 1; + } + return overlap / smallerSize >= 0.8; + }); + if (!duplicate) { + selected.push({ row, memberIds }); + } + } + + return selected.map((entry) => entry.row); + } + + private parseMemberThreadIdSet(value: string | null): Set { + if (!value) return new Set(); + return new Set( + value + .split(',') + .map((part) => Number(part)) + .filter((memberId) => Number.isSafeInteger(memberId) && memberId > 0), + ); + } + private getDurableTuiClusterSummary(repoId: number, clusterId: number): TuiClusterSummary | null { const row = this.db .prepare( @@ -4578,9 +4587,10 @@ export class GHCrawlService { status: row.status, closedAt: row.closed_at, }; - const isClosed = row.status !== 'active' || row.closed_at !== null || row.closed_member_count >= row.member_count; + const lifecycleClosed = row.status === 'merged' || row.status === 'split'; + const isClosed = lifecycleClosed || row.closed_member_count >= row.member_count; const closeReasonLocal = - row.status !== 'active' || row.closed_at !== null + lifecycleClosed ? this.durableClosureReason(closure) : row.closed_member_count >= row.member_count ? 'all_members_closed' @@ -4589,7 +4599,7 @@ export class GHCrawlService { clusterId: row.cluster_id, displayTitle: this.clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), isClosed, - closedAtLocal: row.closed_at, + closedAtLocal: lifecycleClosed ? row.closed_at : null, closeReasonLocal, totalCount: row.member_count, issueCount: row.issue_count, @@ -6559,6 +6569,7 @@ export class GHCrawlService { clusters: Array<{ representativeThreadId: number; members: number[] }>, ): void { this.db.transaction(() => { + const claimedDurableClusterIds = new Set(); for (const edge of aggregatedEdges.values()) { upsertSimilarityEdgeEvidence(this.db, { repoId, @@ -6579,15 +6590,17 @@ export class GHCrawlService { for (const cluster of clusters) { const identity = humanKeyForValue(`repo:${repoId}:cluster-representative:${cluster.representativeThreadId}`); + const durableIdentity = this.resolveDurableClusterIdentity(repoId, identity.hash, cluster.members, claimedDurableClusterIds); const clusterId = upsertClusterGroup(this.db, { repoId, - stableKey: identity.hash, - stableSlug: humanKeyStableSlug(identity), + stableKey: durableIdentity?.stable_key ?? identity.hash, + stableSlug: durableIdentity?.stable_slug ?? humanKeyStableSlug(identity), status: 'active', clusterType: cluster.members.length > 1 ? 'duplicate_candidate' : 'singleton_orphan', representativeThreadId: cluster.representativeThreadId, title: `Cluster ${identity.slug}`, }); + claimedDurableClusterIds.add(clusterId); const forcedCanonical = this.db .prepare( `select thread_id @@ -6731,6 +6744,79 @@ export class GHCrawlService { })(); } + private resolveDurableClusterIdentity( + repoId: number, + representativeStableKey: string, + memberIds: number[], + claimedClusterIds: Set, + ): { id: number; stable_key: string; stable_slug: string } | null { + const exact = this.db + .prepare( + `select id, stable_key, stable_slug + from cluster_groups + where repo_id = ? + and stable_key = ? + and status <> 'merged' + limit 1`, + ) + .get(repoId, representativeStableKey) as { id: number; stable_key: string; stable_slug: string } | undefined; + if (exact && !claimedClusterIds.has(exact.id)) { + return exact; + } + + const uniqueMemberIds = Array.from(new Set(memberIds)); + if (uniqueMemberIds.length === 0) { + return null; + } + + const placeholders = uniqueMemberIds.map(() => '?').join(','); + const rows = this.db + .prepare( + `select + cg.id, + cg.stable_key, + cg.stable_slug, + count(*) as member_count, + sum(case when cm.thread_id in (${placeholders}) then 1 else 0 end) as overlap_count, + max(cm.updated_at) as latest_membership_updated_at + from cluster_groups cg + join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' + where cg.repo_id = ? + and cg.status <> 'merged' + group by cg.id, cg.stable_key, cg.stable_slug + having overlap_count > 0`, + ) + .all(...uniqueMemberIds, repoId) as Array<{ + id: number; + stable_key: string; + stable_slug: string; + member_count: number; + overlap_count: number; + latest_membership_updated_at: string | null; + }>; + + return ( + rows + .filter((row) => !claimedClusterIds.has(row.id)) + .map((row) => { + const overlapBase = Math.min(uniqueMemberIds.length, row.member_count); + return { + row, + overlapScore: overlapBase > 0 ? row.overlap_count / overlapBase : 0, + latestMembershipTime: row.latest_membership_updated_at ? Date.parse(row.latest_membership_updated_at) : 0, + }; + }) + .filter((entry) => entry.overlapScore >= DURABLE_CLUSTER_REUSE_MIN_OVERLAP) + .sort( + (left, right) => + right.overlapScore - left.overlapScore || + right.row.overlap_count - left.row.overlap_count || + right.latestMembershipTime - left.latestMembershipTime || + left.row.id - right.row.id, + )[0]?.row ?? null + ); + } + private pruneOldClusterRuns(repoId: number, keepRunId: number): void { this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId); } From 0656144718445f58bf155cbcd18be63ed3cdb6fe Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 01:07:00 -0700 Subject: [PATCH 118/215] fix: persist cluster closures and speed cluster boot --- apps/cli/bin/ghcrawl.js | 14 +- apps/cli/src/bin.test.ts | 23 ++- packages/api-core/src/db/migrate.ts | 10 ++ packages/api-core/src/service.test.ts | 13 ++ packages/api-core/src/service.ts | 192 +++++++++++++++++++++----- 5 files changed, 213 insertions(+), 39 deletions(-) diff --git a/apps/cli/bin/ghcrawl.js b/apps/cli/bin/ghcrawl.js index fb3f8a5..c90f92e 100755 --- a/apps/cli/bin/ghcrawl.js +++ b/apps/cli/bin/ghcrawl.js @@ -1,5 +1,5 @@ #!/usr/bin/env node -import { existsSync, readFileSync } from 'node:fs'; +import { existsSync, readFileSync, statSync } from 'node:fs'; import { spawn, spawnSync } from 'node:child_process'; import { createRequire } from 'node:module'; import path from 'node:path'; @@ -50,7 +50,17 @@ if (!process.env.GHCRAWL_NODE_REEXEC && existsSync(nodeVersionPath)) { } } -if (!existsSync(sourceEntrypoint) && existsSync(distEntrypoint)) { +function isDistFresh() { + if (!existsSync(distEntrypoint)) return false; + if (!existsSync(sourceEntrypoint)) return true; + try { + return statSync(distEntrypoint).mtimeMs >= statSync(sourceEntrypoint).mtimeMs; + } catch { + return false; + } +} + +if (process.env.GHCRAWL_DEV_SOURCE !== '1' && isDistFresh()) { const entrypoint = await import(pathToFileURL(distEntrypoint).href); const exitCode = typeof entrypoint.runCli === 'function' diff --git a/apps/cli/src/bin.test.ts b/apps/cli/src/bin.test.ts index 958e661..df6036a 100644 --- a/apps/cli/src/bin.test.ts +++ b/apps/cli/src/bin.test.ts @@ -25,7 +25,7 @@ function runFixture(binDir: string): string { }).trim(); } -test('bin launcher prefers source when source and dist are both present', () => { +test('bin launcher prefers fresh dist when source and dist are both present', () => { const fixtureDir = createFixture(); try { mkdirSync(path.join(fixtureDir, 'src')); @@ -33,7 +33,26 @@ test('bin launcher prefers source when source and dist are both present', () => writeFileSync(path.join(fixtureDir, 'src', 'main.ts'), "process.stdout.write('source');\n", 'utf8'); writeFileSync(path.join(fixtureDir, 'dist', 'main.js'), "export async function run() { process.stdout.write('dist'); }\n", 'utf8'); - assert.equal(runFixture(fixtureDir), 'source'); + assert.equal(runFixture(fixtureDir), 'dist'); + } finally { + rmSync(fixtureDir, { recursive: true, force: true }); + } +}); + +test('bin launcher uses source when explicitly requested', () => { + const fixtureDir = createFixture(); + try { + mkdirSync(path.join(fixtureDir, 'src')); + mkdirSync(path.join(fixtureDir, 'dist')); + writeFileSync(path.join(fixtureDir, 'src', 'main.ts'), "process.stdout.write('source');\n", 'utf8'); + writeFileSync(path.join(fixtureDir, 'dist', 'main.js'), "export async function run() { process.stdout.write('dist'); }\n", 'utf8'); + + const output = execFileSync(process.execPath, [path.join(fixtureDir, 'bin', 'ghcrawl.js')], { + cwd: fixtureDir, + encoding: 'utf8', + env: { ...process.env, GHCRAWL_DEV_SOURCE: '1' }, + }).trim(); + assert.equal(output, 'source'); } finally { rmSync(fixtureDir, { recursive: true, force: true }); } diff --git a/packages/api-core/src/db/migrate.ts b/packages/api-core/src/db/migrate.ts index 614c6eb..87ac4e1 100644 --- a/packages/api-core/src/db/migrate.ts +++ b/packages/api-core/src/db/migrate.ts @@ -448,6 +448,15 @@ const migrationStatements = [ created_at text not null, primary key (cluster_id, alias_slug) ) + `, + ` + create table if not exists cluster_closures ( + cluster_id integer primary key references cluster_groups(id) on delete cascade, + reason text not null, + actor_kind text not null, + created_at text not null, + updated_at text not null + ) ` ]; @@ -533,4 +542,5 @@ export function migrate(db: SqliteDatabase): void { db.exec('create index if not exists idx_cluster_memberships_cluster_updated on cluster_memberships(cluster_id, updated_at)'); db.exec('create index if not exists idx_cluster_overrides_repo_target on cluster_overrides(repo_id, cluster_id, thread_id, action)'); db.exec('create index if not exists idx_cluster_events_cluster_created on cluster_events(cluster_id, created_at)'); + db.exec('create index if not exists idx_cluster_closures_updated on cluster_closures(updated_at)'); } diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 51ae5ab..f878a11 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4070,6 +4070,11 @@ test('manual cluster closure is shown by default and can be hidden from JSON sum closed_at: string | null; }; assert.deepEqual(durable, { status: 'active', closed_at: null }); + const closure = service.db.prepare('select reason, actor_kind from cluster_closures where cluster_id = ?').get(7) as { + reason: string; + actor_kind: string; + }; + assert.deepEqual(closure, { reason: 'manual', actor_kind: 'user' }); assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0, includeClosed: false }).clusters.length, 0); assert.equal(service.listClusterSummaries({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }).clusters.length, 1); @@ -4086,6 +4091,14 @@ test('manual cluster closure is shown by default and can be hidden from JSON sum assert.equal(snapshot.clusters.length, 1); assert.equal(snapshot.clusters[0]?.isClosed, true); assert.equal(snapshot.clusters[0]?.closeReasonLocal, 'manual'); + + service.db.prepare('delete from cluster_members where cluster_id = ?').run(100); + service.db.prepare('delete from clusters where id = ?').run(100); + const prunedSnapshot = service.getTuiSnapshot({ owner: 'openclaw', repo: 'openclaw', minSize: 0 }); + assert.equal(prunedSnapshot.clusters.length, 1); + assert.equal(prunedSnapshot.clusters[0]?.clusterId, 7); + assert.equal(prunedSnapshot.clusters[0]?.isClosed, true); + assert.equal(prunedSnapshot.clusters[0]?.closeReasonLocal, 'manual'); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 4d3bbec..e58576d 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -207,6 +207,7 @@ type DurableTuiClosure = { clusterId: number; status: 'active' | 'closed' | 'merged' | 'split'; closedAt: string | null; + reason: string | null; }; type RepoPipelineStateRow = { @@ -897,21 +898,44 @@ export class GHCrawlService { } const row = this.db - .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') - .get(repository.id, latestRun.id, params.clusterId) as { id: number } | undefined; + .prepare('select id, representative_thread_id from clusters where repo_id = ? and cluster_run_id = ? and id = ? limit 1') + .get(repository.id, latestRun.id, params.clusterId) as { id: number; representative_thread_id: number | null } | undefined; if (!row) { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); } const closedAt = nowIso(); - this.db - .prepare( - `update clusters - set closed_at_local = ?, - close_reason_local = 'manual' - where id = ?`, - ) - .run(closedAt, row.id); + let durableClusterId = 0; + this.db.transaction(() => { + durableClusterId = this.ensureDurableClusterForRunCluster(repository.id, row.id, row.representative_thread_id); + this.db + .prepare( + `update clusters + set closed_at_local = ?, + close_reason_local = 'manual' + where id = ?`, + ) + .run(closedAt, row.id); + this.db + .prepare( + `insert into cluster_closures (cluster_id, reason, actor_kind, created_at, updated_at) + values (?, 'manual', 'user', ?, ?) + on conflict(cluster_id) do update set + reason = excluded.reason, + actor_kind = excluded.actor_kind, + updated_at = excluded.updated_at`, + ) + .run(durableClusterId, closedAt, closedAt); + recordClusterEvent(this.db, { + clusterId: durableClusterId, + eventType: 'manual_close_cluster', + actorKind: 'user', + payload: { + runClusterId: row.id, + reason: 'manual', + }, + }); + })(); return closeResponseSchema.parse({ ok: true, @@ -1156,7 +1180,7 @@ export class GHCrawlService { const clusters = this.db .prepare( `select id, stable_slug - from cluster_groups + from cluster_groups cg where repo_id = ? and id in (?, ?)`, ) @@ -1270,7 +1294,7 @@ export class GHCrawlService { const source = this.db .prepare( `select id, stable_slug - from cluster_groups + from cluster_groups cg where repo_id = ? and id = ? limit 1`, @@ -3512,18 +3536,18 @@ export class GHCrawlService { const stats = this.getTuiRepoStats(repository.id); const latestRun = this.getLatestClusterRun(repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; - const rawClusters = latestRun ? this.listRawTuiClusters(repository.id, latestRun.id) : []; + const minSize = params.minSize ?? 1; + const rawClusters = latestRun ? this.listRawTuiClusters(repository.id, latestRun.id, minSize) : []; const representedThreadIds = new Set( rawClusters .map((cluster) => cluster.representativeThreadId) .filter((threadId): threadId is number => threadId !== null), ); const durableClosedClusters = includeClosedClusters - ? this.listClosedDurableTuiClusters(repository.id, representedThreadIds) + ? this.listClosedDurableTuiClusters(repository.id, representedThreadIds, minSize) : []; const clusters = [...rawClusters, ...durableClosedClusters] .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed)) - .filter((cluster) => cluster.totalCount >= (params.minSize ?? 1)) .filter((cluster) => { const search = params.search?.trim().toLowerCase(); if (!search) return true; @@ -4349,7 +4373,58 @@ export class GHCrawlService { return changed; } + private ensureDurableClusterForRunCluster(repoId: number, runClusterId: number, representativeThreadId: number | null): number { + const members = this.db + .prepare( + `select thread_id, score_to_representative + from cluster_members + where cluster_id = ? + order by thread_id asc`, + ) + .all(runClusterId) as Array<{ thread_id: number; score_to_representative: number | null }>; + if (members.length === 0) { + throw new Error(`Cluster ${runClusterId} has no members.`); + } + + const resolvedRepresentativeThreadId = representativeThreadId ?? members[0]?.thread_id; + if (resolvedRepresentativeThreadId === undefined) { + throw new Error(`Cluster ${runClusterId} has no representative.`); + } + + const identity = humanKeyForValue(`repo:${repoId}:cluster-representative:${resolvedRepresentativeThreadId}`); + const memberIds = members.map((member) => member.thread_id); + const durableIdentity = this.resolveDurableClusterIdentity(repoId, identity.hash, memberIds, new Set()); + const durableClusterId = upsertClusterGroup(this.db, { + repoId, + stableKey: durableIdentity?.stable_key ?? identity.hash, + stableSlug: durableIdentity?.stable_slug ?? humanKeyStableSlug(identity), + status: 'active', + clusterType: members.length > 1 ? 'duplicate_candidate' : 'singleton_orphan', + representativeThreadId: resolvedRepresentativeThreadId, + title: `Cluster ${identity.slug}`, + }); + + for (const member of members) { + upsertClusterMembership(this.db, { + clusterId: durableClusterId, + threadId: member.thread_id, + role: member.thread_id === resolvedRepresentativeThreadId ? 'canonical' : 'related', + state: 'active', + scoreToRepresentative: member.thread_id === resolvedRepresentativeThreadId ? 1 : member.score_to_representative, + addedBy: 'algo', + addedReason: { + source: 'closeClusterLocally', + runClusterId, + representativeThreadId: resolvedRepresentativeThreadId, + }, + }); + } + + return durableClusterId; + } + private durableClosureReason(closure: DurableTuiClosure): string | null { + if (closure.reason) return closure.reason; return closure.status === 'merged' || closure.status === 'split' ? closure.status : null; } @@ -4366,17 +4441,19 @@ export class GHCrawlService { const placeholders = identities.map(() => '?').join(','); const rows = this.db .prepare( - `select id, stable_key, status, closed_at - from cluster_groups - where repo_id = ? - and stable_key in (${placeholders}) - and status in ('merged', 'split')`, + `select cg.id, cg.stable_key, cg.status, coalesce(cc.updated_at, cg.closed_at) as closed_at, cc.reason + from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id + where cg.repo_id = ? + and cg.stable_key in (${placeholders}) + and (cc.cluster_id is not null or cg.status in ('merged', 'split'))`, ) .all(repoId, ...identities.map((identity) => identity.stableKey)) as Array<{ id: number; stable_key: string; status: 'active' | 'closed' | 'merged' | 'split'; closed_at: string | null; + reason: string | null; }>; const threadIdByStableKey = new Map(identities.map((identity) => [identity.stableKey, identity.threadId])); const closures = new Map(); @@ -4387,19 +4464,21 @@ export class GHCrawlService { clusterId: row.id, status: row.status, closedAt: row.closed_at, + reason: row.reason, }); } return closures; } - private listClosedDurableTuiClusters(repoId: number, representedThreadIds: Set): TuiClusterSummary[] { + private listClosedDurableTuiClusters(repoId: number, representedThreadIds: Set, minSize: number): TuiClusterSummary[] { const rows = this.db .prepare( `select cg.id as cluster_id, cg.stable_slug, cg.status, - cg.closed_at, + coalesce(cc.updated_at, cg.closed_at) as closed_at, + cc.reason as closure_reason, cg.representative_thread_id, cg.title, rt.number as representative_number, @@ -4413,6 +4492,7 @@ export class GHCrawlService { group_concat(t.id, ',') as member_thread_ids, group_concat(lower(coalesce(t.title, '')), ' ') as search_text from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id left join threads rt on rt.id = cg.representative_thread_id join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' join threads t on t.id = cm.thread_id @@ -4422,19 +4502,24 @@ export class GHCrawlService { cg.stable_slug, cg.status, cg.closed_at, + cc.updated_at, + cc.reason, cg.representative_thread_id, cg.title, rt.number, rt.kind, rt.title - having cg.status in ('merged', 'split') - or closed_member_count >= member_count`, + having member_count >= ? + and (cc.cluster_id is not null + or cg.status in ('merged', 'split') + or closed_member_count >= member_count)`, ) - .all(repoId) as Array<{ + .all(repoId, minSize) as Array<{ cluster_id: number; stable_slug: string; status: 'active' | 'closed' | 'merged' | 'split'; closed_at: string | null; + closure_reason: string | null; representative_thread_id: number | null; title: string | null; representative_number: number | null; @@ -4511,7 +4596,8 @@ export class GHCrawlService { cg.id as cluster_id, cg.stable_slug, cg.status, - cg.closed_at, + coalesce(cc.updated_at, cg.closed_at) as closed_at, + cc.reason as closure_reason, cg.representative_thread_id, cg.title, rt.number as representative_number, @@ -4524,6 +4610,7 @@ export class GHCrawlService { sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, group_concat(lower(coalesce(t.title, '')), ' ') as search_text from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id left join threads rt on rt.id = cg.representative_thread_id join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' join threads t on t.id = cm.thread_id @@ -4534,6 +4621,8 @@ export class GHCrawlService { cg.stable_slug, cg.status, cg.closed_at, + cc.updated_at, + cc.reason, cg.representative_thread_id, cg.title, rt.number, @@ -4546,6 +4635,7 @@ export class GHCrawlService { stable_slug: string; status: 'active' | 'closed' | 'merged' | 'split'; closed_at: string | null; + closure_reason: string | null; representative_thread_id: number | null; title: string | null; representative_number: number | null; @@ -4571,6 +4661,7 @@ export class GHCrawlService { stable_slug: string; status: 'active' | 'closed' | 'merged' | 'split'; closed_at: string | null; + closure_reason?: string | null; representative_thread_id: number | null; representative_number: number | null; representative_kind: 'issue' | 'pull_request' | null; @@ -4586,11 +4677,13 @@ export class GHCrawlService { clusterId: row.cluster_id, status: row.status, closedAt: row.closed_at, + reason: row.closure_reason ?? null, }; const lifecycleClosed = row.status === 'merged' || row.status === 'split'; - const isClosed = lifecycleClosed || row.closed_member_count >= row.member_count; + const manuallyClosed = row.closure_reason !== undefined && row.closure_reason !== null; + const isClosed = manuallyClosed || lifecycleClosed || row.closed_member_count >= row.member_count; const closeReasonLocal = - lifecycleClosed + manuallyClosed || lifecycleClosed ? this.durableClosureReason(closure) : row.closed_member_count >= row.member_count ? 'all_members_closed' @@ -4599,7 +4692,7 @@ export class GHCrawlService { clusterId: row.cluster_id, displayTitle: this.clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), isClosed, - closedAtLocal: lifecycleClosed ? row.closed_at : null, + closedAtLocal: manuallyClosed || lifecycleClosed ? row.closed_at : null, closeReasonLocal, totalCount: row.member_count, issueCount: row.issue_count, @@ -4612,7 +4705,7 @@ export class GHCrawlService { }; } - private listRawTuiClusters(repoId: number, clusterRunId: number): TuiClusterSummary[] { + private listRawTuiClusters(repoId: number, clusterRunId: number, minSize: number): TuiClusterSummary[] { const rows = this.db .prepare( `select @@ -4642,9 +4735,10 @@ export class GHCrawlService { c.representative_thread_id, rt.number, rt.kind, - rt.title`, + rt.title + having c.member_count >= ?`, ) - .all(repoId, clusterRunId) as Array<{ + .all(repoId, clusterRunId, minSize) as Array<{ cluster_id: number; member_count: number; closed_at_local: string | null; @@ -5740,7 +5834,14 @@ export class GHCrawlService { join document_embeddings e on e.thread_id = t.id where t.repo_id = ? and t.state = 'open' - and t.closed_at_local is null`, + and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + )`, ) .all(repoId) as Array<{ id: number; number: number; title: string; source_kind: EmbeddingSourceKind }>; @@ -5766,6 +5867,13 @@ export class GHCrawlService { where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + ) and tv.model = ? and tv.basis = ? and tv.dimensions = ? @@ -5803,7 +5911,14 @@ export class GHCrawlService { from threads where repo_id = ? and state = 'open' - and closed_at_local is null`; + and closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = threads.id + and cm.state <> 'removed_by_user' + )`; const args: Array = [repoId]; if (threadIds && threadIds.length > 0) { sql += ` and id in (${threadIds.map(() => '?').join(',')})`; @@ -6125,7 +6240,14 @@ export class GHCrawlService { let sql = `select t.id, t.number, t.title, t.body from threads t - where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null`; + where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + )`; const args: Array = [repoId]; if (threadNumber) { sql += ' and t.number = ?'; From 2e604a723659e074df174e4d7420c8769c7a18b9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 01:13:38 -0700 Subject: [PATCH 119/215] fix: skip closed sync hydration --- packages/api-core/src/service.test.ts | 100 ++++++++++++++++++++++++++ packages/api-core/src/service.ts | 25 +++++-- 2 files changed, 121 insertions(+), 4 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index f878a11..4d225ab 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -533,6 +533,106 @@ test('syncRepository hydrates pull request code snapshots when includeCode is en } }); +test('syncRepository skips comment/code hydration and fingerprint refresh when a PR closes during sync', async () => { + let getPullCalls = 0; + let listIssueCommentCalls = 0; + let listPullReviewCalls = 0; + let listPullReviewCommentCalls = 0; + let listPullFileCalls = 0; + const messages: string[] = []; + const service = makeTestService({ + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async () => [ + { + id: 101, + number: 43, + state: 'open', + title: 'Downloader PR', + body: 'Implements a fix.', + html_url: 'https://github.com/openclaw/openclaw/pull/43', + labels: [{ name: 'bug' }], + assignees: [], + pull_request: { url: 'https://api.github.com/repos/openclaw/openclaw/pulls/43' }, + user: { login: 'alice', type: 'User' }, + }, + ], + getIssue: async () => { + throw new Error('not expected'); + }, + getPull: async (_owner, _repo, number) => { + getPullCalls += 1; + return { + id: 101, + number, + state: 'closed', + title: 'Downloader PR', + body: 'Implements a fix.', + html_url: `https://github.com/openclaw/openclaw/pull/${number}`, + labels: [{ name: 'bug' }], + assignees: [], + pull_request: { url: `https://api.github.com/repos/openclaw/openclaw/pulls/${number}` }, + user: { login: 'alice', type: 'User' }, + draft: false, + closed_at: '2026-03-10T00:00:00Z', + merged_at: '2026-03-10T00:00:00Z', + updated_at: '2026-03-10T00:00:00Z', + }; + }, + listIssueComments: async () => { + listIssueCommentCalls += 1; + return []; + }, + listPullReviews: async () => { + listPullReviewCalls += 1; + return []; + }, + listPullReviewComments: async () => { + listPullReviewCommentCalls += 1; + return []; + }, + listPullFiles: async () => { + listPullFileCalls += 1; + return []; + }, + }); + + try { + const result = await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + includeComments: true, + includeCode: true, + onProgress: (message) => messages.push(message), + }); + + assert.equal(result.threadsSynced, 1); + assert.equal(result.commentsSynced, 0); + assert.equal(result.codeFilesSynced, 0); + assert.equal(getPullCalls, 1); + assert.equal(listIssueCommentCalls, 0); + assert.equal(listPullReviewCalls, 0); + assert.equal(listPullReviewCommentCalls, 0); + assert.equal(listPullFileCalls, 0); + assert.match(messages.join('\n'), /metadata-only update, skipping comment\/code hydration and fingerprint refresh/); + + const thread = service.db + .prepare("select state, closed_at_gh, merged_at_gh from threads where number = 43 and kind = 'pull_request'") + .get() as { state: string; closed_at_gh: string | null; merged_at_gh: string | null }; + assert.deepEqual(thread, { + state: 'closed', + closed_at_gh: '2026-03-10T00:00:00Z', + merged_at_gh: '2026-03-10T00:00:00Z', + }); + + const snapshotCount = service.db.prepare('select count(*) as count from thread_code_snapshots').get() as { count: number }; + const fingerprintCount = service.db.prepare('select count(*) as count from thread_fingerprints').get() as { count: number }; + assert.equal(snapshotCount.count, 0); + assert.equal(fingerprintCount.count, 0); + } finally { + service.close(); + } +}); + test('summarizeRepository excludes hydrated comments by default and reports token usage', async () => { const summaryInputs: string[] = []; const service = makeTestService( diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e58576d..f554136 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -505,6 +505,14 @@ function isEffectivelyClosed(row: { state: string; closed_at_local: string | nul return row.state !== 'open' || row.closed_at_local !== null; } +function isClosedGitHubPayload(payload: Record): boolean { + const state = typeof payload.state === 'string' ? payload.state.toLowerCase() : null; + if (state !== null && state !== 'open') return true; + if (typeof payload.closed_at === 'string' && payload.closed_at.length > 0) return true; + if (typeof payload.merged_at === 'string' && payload.merged_at.length > 0) return true; + return false; +} + function isMissingGitHubResourceError(error: unknown): boolean { const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null; if (status === 404 || status === 410) { @@ -1523,21 +1531,30 @@ export class GHCrawlService { const kind = isPr ? 'pull_request' : 'issue'; params.onProgress?.(`[sync] ${index + 1}/${items.length} ${kind} #${number}`); try { - const shouldFetchPullPayload = isPr && includeCode; + const itemIsClosed = isClosedGitHubPayload(item); + const shouldFetchPullPayload = isPr && includeCode && !itemIsClosed; const threadPayload = shouldFetchPullPayload ? await github.getPull(params.owner, params.repo, number, reporter) : item; + const threadIsClosed = isClosedGitHubPayload(threadPayload); const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt); - if (includeCode && isPr) { + if (threadIsClosed && (includeComments || includeCode)) { + params.onProgress?.( + `[sync] ${kind} #${number} is closed; metadata-only update, skipping comment/code hydration and fingerprint refresh`, + ); + } + if (includeCode && isPr && !threadIsClosed) { const files = await github.listPullFiles(params.owner, params.repo, number, reporter); this.persistThreadCodeSnapshot(threadId, threadPayload, files); codeFilesSynced += files.length; } - if (includeComments) { + if (includeComments && !threadIsClosed) { const comments = await this.fetchThreadComments(params.owner, params.repo, number, isPr, reporter); this.replaceComments(threadId, comments); commentsSynced += comments.length; } this.refreshDocument(threadId); - fingerprintThreadIds.push(threadId); + if (!threadIsClosed) { + fingerprintThreadIds.push(threadId); + } threadsSynced += 1; } catch (error) { const message = error instanceof Error ? error.message : String(error); From 8ab7cfeb8f596aee2d98db3822a1b9744ff4faab Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 01:46:25 -0700 Subject: [PATCH 120/215] fix: backfill closed state during full reconcile --- packages/api-core/src/service.test.ts | 89 +++++++++++++++++++++++++++ packages/api-core/src/service.ts | 69 +++++++++++++++++---- 2 files changed, 146 insertions(+), 12 deletions(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 4d225ab..e8ec133 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -4977,6 +4977,95 @@ test('syncRepository performs direct stale-open reconciliation when fullReconcil } }); +test('syncRepository fullReconcile backfills stale closed items from closed pages before direct checks', async () => { + let getIssueCalls = 0; + let openListCalls = 0; + const closedSinceValues: Array = []; + + const service = makeTestService({ + getRepo: async () => ({ id: 1, full_name: 'openclaw/openclaw' }), + listRepositoryIssues: async (_owner, _repo, since, _limit, _reporter, state = 'open') => { + if (state === 'closed') { + closedSinceValues.push(since); + return since === undefined + ? [ + { + id: 100, + number: 42, + state: 'closed', + title: 'Downloader hangs', + body: 'The transfer never finishes.', + html_url: 'https://github.com/openclaw/openclaw/issues/42', + labels: [{ name: 'bug' }], + assignees: [], + user: { login: 'alice', type: 'User' }, + updated_at: '2026-03-10T00:00:00Z', + closed_at: '2026-03-10T00:00:00Z', + }, + ] + : []; + } + openListCalls += 1; + return openListCalls === 1 + ? [ + { + id: 100, + number: 42, + state: 'open', + title: 'Downloader hangs', + body: 'The transfer never finishes.', + html_url: 'https://github.com/openclaw/openclaw/issues/42', + labels: [{ name: 'bug' }], + assignees: [], + user: { login: 'alice', type: 'User' }, + updated_at: '2026-03-09T00:00:00Z', + }, + ] + : []; + }, + getIssue: async () => { + getIssueCalls += 1; + throw new Error('not expected'); + }, + getPull: async () => { + throw new Error('not expected'); + }, + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }); + + try { + await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + startedAt: '2026-03-09T13:13:00.000Z', + }); + const result = await service.syncRepository({ + owner: 'openclaw', + repo: 'openclaw', + fullReconcile: true, + startedAt: '2026-03-09T14:13:01.000Z', + }); + const after = service.db + .prepare("select state from threads where number = 42 and kind = 'issue'") + .get() as { state: string }; + const statsRow = service.db + .prepare("select stats_json from sync_runs where status = 'completed' order by id desc limit 1") + .get() as { stats_json: string }; + const stats = JSON.parse(statsRow.stats_json) as { threadsClosedFromClosedBackfill?: number }; + + assert.equal(result.threadsClosed, 1); + assert.equal(getIssueCalls, 0); + assert.deepEqual(closedSinceValues, ['2026-03-09T12:13:01.000Z', undefined]); + assert.equal(after.state, 'closed'); + assert.equal(stats.threadsClosedFromClosedBackfill, 1); + } finally { + service.close(); + } +}); + test('syncRepository derives the default overlapping since window from the last completed full scan', async () => { const openSinceValues: Array = []; const closedSinceValues: Array = []; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f554136..c36e04b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -305,7 +305,9 @@ type SyncRunStats = { codeFilesSynced: number; threadsClosed: number; threadsClosedFromClosedSweep?: number; + threadsClosedFromClosedBackfill?: number; threadsClosedFromDirectReconcile?: number; + directReconcileSkippedStaleThreadCount?: number; crawlStartedAt: string; requestedSince: string | null; effectiveSince: string | null; @@ -452,6 +454,8 @@ type NeighborsResultInternal = NeighborsResponse; const SYNC_BATCH_SIZE = 100; const SYNC_BATCH_DELAY_MS = 5000; const STALE_CLOSED_SWEEP_LIMIT = 1000; +const STALE_CLOSED_BACKFILL_LIMIT = 5000; +const MAX_DIRECT_RECONCILE_THREADS = 500; const CLUSTER_PROGRESS_INTERVAL_MS = 5000; const DURABLE_CLUSTER_REUSE_MIN_OVERLAP = 0.8; const RAW_JSON_INLINE_THRESHOLD_BYTES = 4096; @@ -1577,12 +1581,33 @@ export class GHCrawlService { onProgress: params.onProgress, }) : 0; + const canFullReconcile = params.fullReconcile === true && params.limit === undefined && (isFullOpenScan || isOverlappingOpenScan); + const threadsClosedFromClosedBackfill = canFullReconcile + ? await this.applyClosedOverlapSweep({ + repoId, + owner: params.owner, + repo: params.repo, + crawlStartedAt, + closedSweepSince: undefined, + closedSweepLimit: STALE_CLOSED_BACKFILL_LIMIT, + sweepLabel: 'closed backfill', + reporter, + onProgress: params.onProgress, + }) + : 0; + const staleOpenThreadCountForDirectReconcile = canFullReconcile + ? this.countStaleOpenThreads(repoId, crawlStartedAt) + : 0; const shouldReconcileMissingOpenThreads = - params.fullReconcile === true && params.limit === undefined && (isFullOpenScan || isOverlappingOpenScan); - if (!shouldReconcileMissingOpenThreads && params.fullReconcile !== true) { + canFullReconcile && staleOpenThreadCountForDirectReconcile <= MAX_DIRECT_RECONCILE_THREADS; + if (!canFullReconcile && params.fullReconcile !== true) { params.onProgress?.('[sync] skipping full stale-open reconciliation by default; use --full-reconcile to force direct checks of all unseen open items'); - } else if (!shouldReconcileMissingOpenThreads) { + } else if (!canFullReconcile) { params.onProgress?.('[sync] skipping full stale-open reconciliation because this scan did not overlap a confirmed full/overlap cursor'); + } else if (!shouldReconcileMissingOpenThreads) { + params.onProgress?.( + `[sync] skipping direct stale-open reconciliation because ${staleOpenThreadCountForDirectReconcile} thread(s) remain; closed backfill already checked the latest ${STALE_CLOSED_BACKFILL_LIMIT} closed items`, + ); } const threadsClosedFromDirectReconcile = shouldReconcileMissingOpenThreads ? await this.reconcileMissingOpenThreads({ @@ -1594,7 +1619,7 @@ export class GHCrawlService { onProgress: params.onProgress, }) : 0; - const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromDirectReconcile; + const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromClosedBackfill + threadsClosedFromDirectReconcile; if (threadsClosed > 0) { this.reconcileClusterCloseState(repoId); } @@ -1631,7 +1656,11 @@ export class GHCrawlService { isOverlappingOpenScan, overlapReferenceAt, threadsClosedFromClosedSweep, + threadsClosedFromClosedBackfill, threadsClosedFromDirectReconcile, + directReconcileSkippedStaleThreadCount: canFullReconcile && !shouldReconcileMissingOpenThreads + ? staleOpenThreadCountForDirectReconcile + : 0, reconciledOpenCloseAt, } satisfies SyncRunStats, undefined, finishedAt); return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, codeFilesSynced, threadsClosed }); @@ -5117,7 +5146,9 @@ export class GHCrawlService { owner: string; repo: string; crawlStartedAt: string; - closedSweepSince: string; + closedSweepSince?: string; + closedSweepLimit?: number; + sweepLabel?: string; reporter?: (message: string) => void; onProgress?: (message: string) => void; }): Promise { @@ -5137,9 +5168,11 @@ export class GHCrawlService { return 0; } - params.onProgress?.( - `[sync] scanning ${staleRows.length} unseen previously-open thread(s) against recently-updated closed items since ${params.closedSweepSince}`, - ); + const sweepLabel = params.sweepLabel ?? 'recent closed sweep'; + const sweepWindow = params.closedSweepSince + ? `since ${params.closedSweepSince}` + : `from the latest ${params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT} closed items`; + params.onProgress?.(`[sync] ${sweepLabel}: scanning ${staleRows.length} unseen previously-open thread(s) against closed items ${sweepWindow}`); const github = this.requireGithub(); const staleByNumber = new Map( @@ -5149,7 +5182,7 @@ export class GHCrawlService { params.owner, params.repo, params.closedSweepSince, - STALE_CLOSED_SWEEP_LIMIT, + params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT, params.reporter, 'closed', ); @@ -5188,13 +5221,25 @@ export class GHCrawlService { threadsClosed += 1; } - params.onProgress?.( - `[sync] recent closed sweep matched ${threadsClosed} stale thread(s); ${staleByNumber.size} remain open locally`, - ); + params.onProgress?.(`[sync] ${sweepLabel} matched ${threadsClosed} stale thread(s); ${staleByNumber.size} remain open locally`); return threadsClosed; } + private countStaleOpenThreads(repoId: number, crawlStartedAt: string): number { + const row = this.db + .prepare( + `select count(*) as count + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + and (last_pulled_at is null or last_pulled_at < ?)`, + ) + .get(repoId, crawlStartedAt) as { count: number }; + return row.count; + } + private async reconcileMissingOpenThreads(params: { repoId: number; owner: string; From bf2ed0962867c92334e5b7d43d2cec48a14fe7e2 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:12:37 -0700 Subject: [PATCH 121/215] feat: export portable sync database --- apps/cli/src/main.ts | 28 ++ packages/api-core/src/service.test.ts | 182 +++++++++++ packages/api-core/src/service.ts | 416 ++++++++++++++++++++++++++ 3 files changed, 626 insertions(+) diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 6eb381c..7f8e6fb 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -14,6 +14,7 @@ type CommandName = | 'configure' | 'version' | 'sync' + | 'export-sync' | 'refresh' | 'optimize' | 'runs' @@ -137,6 +138,18 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl sync openclaw/openclaw --limit 1', 'ghcrawl sync openclaw/openclaw --since 7d --json'], agentJson: true, }, + { + name: 'export-sync', + synopsis: 'export-sync [--output ] [--body-chars ] [--json]', + description: 'Export a compact portable SQLite core for git-style file sync.', + options: [ + '--output Output SQLite path; defaults to the ghcrawl config exports directory', + '--body-chars Maximum body excerpt characters per thread; default 2048', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl export-sync openclaw/openclaw --output ./openclaw.sync.db --json'], + agentJson: true, + }, { name: 'refresh', synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', @@ -601,6 +614,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo 'member-limit': { type: 'string' }, 'event-limit': { type: 'string' }, 'body-chars': { type: 'string' }, + output: { type: 'string' }, 'no-sync': { type: 'boolean' }, 'no-embed': { type: 'boolean' }, 'no-cluster': { type: 'boolean' }, @@ -1045,6 +1059,20 @@ export async function run( writeJson(stdout, result); return; } + case 'export-sync': { + const { owner, repo, values } = parseRepoFlags('export-sync', rest); + const result = getService().exportPortableSync({ + owner, + repo, + outputPath: typeof values.output === 'string' ? values.output : undefined, + bodyChars: + typeof values['body-chars'] === 'string' + ? parsePositiveInteger('body-chars', values['body-chars'], 'export-sync') + : undefined, + }); + writeJson(stdout, result); + return; + } case 'refresh': { const { owner, repo, values } = parseRepoFlags('refresh', rest); const heapDiagnostics = createOptionalHeapDiagnostics(values, stderr, 'refresh'); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index e8ec133..76e42df 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -5,6 +5,7 @@ import os from 'node:os'; import path from 'node:path'; import { humanKeyForValue } from './cluster/human-key.js'; +import { openDb } from './db/sqlite.js'; import { GHCrawlService } from './service.js'; import type { VectorStore } from './vector/store.js'; @@ -166,6 +167,187 @@ test('optimizeStorage runs SQLite maintenance and reports missing vector store', } }); +test('exportPortableSync writes a compact sync database without bulky cache tables', () => { + const config = makeTestConfig(); + const sourcePath = path.join(config.configDir, 'source.db'); + const outputPath = path.join(config.configDir, 'openclaw.sync.db'); + const service = new GHCrawlService({ + config: { + ...config, + dbPath: sourcePath, + }, + github: { + getRepo: async () => ({}), + listRepositoryIssues: async () => [], + getIssue: async () => ({}), + getPull: async () => ({}), + listIssueComments: async () => [], + listPullReviews: async () => [], + listPullReviewComments: async () => [], + listPullFiles: async () => [], + }, + }); + + try { + const now = '2026-03-10T12:00:00Z'; + const longBody = 'body '.repeat(2000); + const hugeRaw = JSON.stringify({ payload: 'x'.repeat(200_000) }); + service.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', hugeRaw, now); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 10, + 1, + '100', + 42, + 'issue', + 'open', + 'Gateway crash', + longBody, + 'alice', + 'User', + 'https://github.com/openclaw/openclaw/issues/42', + '["bug"]', + '[]', + hugeRaw, + 'content-hash', + 0, + now, + now, + null, + null, + now, + now, + now, + ); + service.db + .prepare( + `insert into documents (thread_id, title, body, raw_text, dedupe_text, updated_at) + values (?, ?, ?, ?, ?, ?)`, + ) + .run(10, 'Gateway crash', longBody, 'raw '.repeat(50_000), 'dedupe '.repeat(50_000), now); + service.db + .prepare( + `insert into comments (thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, '200', 'issue_comment', 'bob', 'User', 'comment '.repeat(5000), 0, hugeRaw, now, now); + service.db + .prepare( + `insert into thread_vectors (thread_id, basis, model, dimensions, content_hash, vector_json, vector_backend, created_at, updated_at) + values (?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(10, 'title_original', 'text-embedding-3-large', 1024, 'vector-hash', `[${Array.from({ length: 1024 }, () => 0.1).join(',')}]`, 'vectorlite', now, now); + service.db + .prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + values (?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(20, 10, now, 'content-hash', 'title-hash', 'body-hash', 'labels-hash', now); + service.db + .prepare( + `insert into thread_fingerprints ( + id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(30, 20, 'v1', 'fingerprint-hash', 'amber-river-slate-abc', '["gateway","crash"]', 'body-token-hash', '[]', 'file-set-hash', '[]', '1234', '{"signals":["gateway"]}', now); + service.db + .prepare( + `insert into thread_key_summaries ( + id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(40, 20, 'key_summary', 'v1', 'openai', 'gpt-5-mini', 'input-hash', 'output-hash', 'intent: fix gateway crash\nsurface: startup\nmechanism: guard config', now); + service.db + .prepare( + `insert into repo_sync_state ( + repo_id, last_full_open_scan_started_at, last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at, last_open_close_reconciled_at, updated_at + ) values (?, ?, ?, ?, ?, ?)`, + ) + .run(1, now, now, null, now, now); + service.db + .prepare( + `insert into repo_pipeline_state ( + repo_id, summary_model, summary_prompt_version, embedding_basis, embed_model, embed_dimensions, + embed_pipeline_version, vector_backend, vectors_current_at, clusters_current_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'gpt-5-mini', 'v1', 'title_original', 'text-embedding-3-large', 1024, 'pipeline-v1', 'vectorlite', now, now, now); + service.db + .prepare( + `insert into cluster_groups ( + id, repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(50, 1, 'stable-key', 'amber-river-slate-abc', 'active', 'dedupe', 10, 'Gateway crash cluster', now, now, null); + service.db + .prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run(50, 10, 'canonical', 'active', 1, null, null, 'system', null, '{}', null, now, now, null); + + const response = service.exportPortableSync({ + owner: 'openclaw', + repo: 'openclaw', + outputPath, + bodyChars: 64, + }); + + assert.equal(response.ok, true); + assert.equal(response.repository.fullName, 'openclaw/openclaw'); + assert.equal(response.outputPath, outputPath); + assert.ok(response.outputBytes < response.sourceBytes); + assert.ok(response.excluded.includes('documents')); + assert.ok(response.excluded.includes('thread_vectors')); + assert.equal(response.tables.find((table) => table.name === 'threads')?.rows, 1); + + const portable = openDb(outputPath); + try { + const thread = portable.prepare('select body_excerpt, body_length from threads where number = 42').get() as { + body_excerpt: string; + body_length: number; + }; + assert.equal(thread.body_excerpt.length, 64); + assert.equal(thread.body_length, longBody.length); + const bulkyTables = portable + .prepare("select name from sqlite_master where type = 'table' and name in ('documents', 'comments', 'blobs', 'thread_vectors', 'cluster_events')") + .all() as Array<{ name: string }>; + assert.deepEqual(bulkyTables, []); + const summaryCount = portable.prepare('select count(*) as count from thread_key_summaries').get() as { count: number }; + const membershipCount = portable.prepare('select count(*) as count from cluster_memberships').get() as { count: number }; + assert.equal(summaryCount.count, 1); + assert.equal(membershipCount.count, 1); + } finally { + portable.close(); + } + + const sourceThread = service.db.prepare('select raw_json, body from threads where id = 10').get() as { + raw_json: string; + body: string; + }; + assert.equal(sourceThread.raw_json, hugeRaw); + assert.equal(sourceThread.body, longBody); + } finally { + service.close(); + } +}); + test('listRunHistory returns recent runs across pipeline tables', () => { const service = makeTestService({ getRepo: async () => ({}), diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c36e04b..b8e952a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -448,6 +448,31 @@ type SyncOptions = { startedAt?: string; }; +type PortableSyncExportOptions = { + owner: string; + repo: string; + outputPath?: string; + bodyChars?: number; +}; + +type PortableSyncExportResponse = { + ok: true; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + outputPath: string; + sourcePath: string; + sourceBytes: number; + outputBytes: number; + compressionRatio: number; + bodyChars: number; + tables: Array<{ name: string; rows: number }>; + excluded: string[]; +}; + type SearchResultInternal = SearchResponse; type NeighborsResultInternal = NeighborsResponse; @@ -517,6 +542,10 @@ function isClosedGitHubPayload(payload: Record): boolean { return false; } +function sqlStringLiteral(value: string): string { + return `'${value.replace(/'/g, "''")}'`; +} + function isMissingGitHubResourceError(error: unknown): boolean { const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null; if (status === 404 || status === 410) { @@ -3377,6 +3406,393 @@ export class GHCrawlService { }); } + exportPortableSync(params: PortableSyncExportOptions): PortableSyncExportResponse { + if (this.config.dbPath === ':memory:') { + throw new Error('Portable sync export requires a file-backed source database'); + } + + const repository = this.requireRepository(params.owner, params.repo); + const bodyChars = params.bodyChars ?? 2048; + if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { + throw new Error('bodyChars must be a non-negative integer'); + } + + const sourcePath = path.resolve(this.config.dbPath); + const outputPath = path.resolve( + params.outputPath ?? path.join(this.config.configDir, 'exports', `${repository.owner}__${repository.name}.sync.db`), + ); + if (outputPath === sourcePath) { + throw new Error('Refusing to export portable sync database over the source database'); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + const tmpPath = `${outputPath}.tmp-${process.pid}-${Date.now()}`; + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + checkpointWal(this.db); + const out = openDb(tmpPath); + try { + out.pragma('journal_mode = DELETE'); + out.exec('pragma foreign_keys = OFF'); + this.createPortableSyncSchema(out); + out.exec(`attach database ${sqlStringLiteral(sourcePath)} as source`); + this.populatePortableSyncDb(out, { repoId: repository.id, sourcePath, bodyChars }); + out.exec('detach database source'); + out.exec('pragma foreign_keys = ON'); + out.exec('analyze'); + out.exec('pragma optimize'); + out.exec('vacuum'); + } catch (error) { + try { + out.close(); + } catch { + // Ignore cleanup close errors after an export failure. + } + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + throw error; + } + out.close(); + + fs.renameSync(tmpPath, outputPath); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + const portableTables = [ + 'repositories', + 'threads', + 'thread_revisions', + 'thread_fingerprints', + 'thread_key_summaries', + 'repo_sync_state', + 'repo_pipeline_state', + 'cluster_groups', + 'cluster_memberships', + 'cluster_overrides', + 'cluster_aliases', + 'cluster_closures', + ]; + const outputBytes = fs.statSync(outputPath).size; + const sourceBytes = fs.statSync(sourcePath).size + this.fileSize(`${sourcePath}-wal`) + this.fileSize(`${sourcePath}-shm`); + const verify = openDb(outputPath); + try { + verify.pragma('journal_mode = DELETE'); + const tables = portableTables.map((name) => ({ name, rows: this.countPortableRows(verify, name) })); + return { + ok: true, + repository: { + id: repository.id, + owner: repository.owner, + name: repository.name, + fullName: repository.fullName, + }, + outputPath, + sourcePath, + sourceBytes, + outputBytes, + compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, + bodyChars, + tables, + excluded: [ + 'blobs', + 'comments', + 'documents', + 'documents_fts', + 'document_embeddings', + 'thread_vectors', + 'thread_code_snapshots', + 'thread_changed_files', + 'thread_hunk_signatures', + 'cluster_events', + 'pipeline_runs', + 'sync_runs', + 'summary_runs', + 'embedding_runs', + 'cluster_runs', + 'similarity_edges', + 'similarity_edge_evidence', + ], + }; + } finally { + verify.close(); + } + } + + private createPortableSyncSchema(db: SqliteDatabase): void { + db.exec(` + create table portable_metadata (key text primary key, value text not null); + create table repositories ( + id integer primary key, + owner text not null, + name text not null, + full_name text not null unique, + github_repo_id text, + updated_at text not null + ); + create table threads ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + github_id text not null, + number integer not null, + kind text not null, + state text not null, + title text not null, + body_excerpt text, + body_length integer not null default 0, + author_login text, + author_type text, + html_url text not null, + labels_json text not null, + assignees_json text not null, + content_hash text not null, + is_draft integer not null default 0, + created_at_gh text, + updated_at_gh text, + closed_at_gh text, + merged_at_gh text, + first_pulled_at text, + last_pulled_at text, + updated_at text not null, + closed_at_local text, + close_reason_local text, + unique(repo_id, kind, number) + ); + create table thread_revisions ( + id integer primary key, + thread_id integer not null references threads(id) on delete cascade, + source_updated_at text, + content_hash text not null, + title_hash text not null, + body_hash text not null, + labels_hash text not null, + created_at text not null, + unique(thread_id, content_hash) + ); + create table thread_fingerprints ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + algorithm_version text not null, + fingerprint_hash text not null, + fingerprint_slug text not null, + title_tokens_json text not null, + body_token_hash text not null, + linked_refs_json text not null, + file_set_hash text not null, + module_buckets_json text not null, + simhash64 text not null, + feature_json text not null, + created_at text not null, + unique(thread_revision_id, algorithm_version) + ); + create table thread_key_summaries ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + summary_kind text not null, + prompt_version text not null, + provider text not null, + model text not null, + input_hash text not null, + output_hash text not null, + key_text text not null, + created_at text not null, + unique(thread_revision_id, summary_kind, prompt_version, provider, model) + ); + create table repo_sync_state ( + repo_id integer primary key references repositories(id) on delete cascade, + last_full_open_scan_started_at text, + last_overlapping_open_scan_completed_at text, + last_non_overlapping_scan_completed_at text, + last_open_close_reconciled_at text, + updated_at text not null + ); + create table repo_pipeline_state ( + repo_id integer primary key references repositories(id) on delete cascade, + summary_model text not null, + summary_prompt_version text not null, + embedding_basis text not null, + embed_model text not null, + embed_dimensions integer not null, + embed_pipeline_version text not null, + vector_backend text not null, + vectors_current_at text, + clusters_current_at text, + updated_at text not null + ); + create table cluster_groups ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + stable_key text not null, + stable_slug text not null, + status text not null, + cluster_type text, + representative_thread_id integer references threads(id) on delete set null, + title text, + created_at text not null, + updated_at text not null, + closed_at text, + unique(repo_id, stable_key), + unique(repo_id, stable_slug) + ); + create table cluster_memberships ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + role text not null, + state text not null, + score_to_representative real, + first_seen_run_id integer, + last_seen_run_id integer, + added_by text not null, + removed_by text, + added_reason_json text not null, + removed_reason_json text, + created_at text not null, + updated_at text not null, + removed_at text, + primary key (cluster_id, thread_id) + ); + create table cluster_overrides ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + action text not null, + reason text, + created_at text not null, + expires_at text, + unique(cluster_id, thread_id, action) + ); + create table cluster_aliases ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + alias_slug text not null, + reason text not null, + created_at text not null, + primary key (cluster_id, alias_slug) + ); + create table cluster_closures ( + cluster_id integer primary key references cluster_groups(id) on delete cascade, + reason text not null, + actor_kind text not null, + created_at text not null, + updated_at text not null + ); + create index idx_threads_repo_number on threads(repo_id, number); + create index idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local); + create index idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash); + create index idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug); + create index idx_cluster_groups_repo_status on cluster_groups(repo_id, status); + create index idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state); + create index idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state); + `); + } + + private populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { + const exportedAt = nowIso(); + const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); + insertMetadata.run('schema', 'ghcrawl-portable-sync-v1'); + insertMetadata.run('exported_at', exportedAt); + insertMetadata.run('source_path', params.sourcePath); + insertMetadata.run('body_chars', String(params.bodyChars)); + insertMetadata.run('excluded', 'raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs'); + + db.prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, updated_at) + select id, owner, name, full_name, github_repo_id, updated_at + from source.repositories + where id = ?`, + ).run(params.repoId); + + db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, html_url, + labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) + select + id, repo_id, github_id, number, kind, state, title, + case + when body is null then null + when ? = 0 then '' + when length(body) <= ? then body + else substr(body, 1, ?) + end, + case when body is null then 0 else length(body) end, + author_login, author_type, html_url, labels_json, assignees_json, content_hash, is_draft, + created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, + updated_at, closed_at_local, close_reason_local + from source.threads + where repo_id = ?`, + ).run(params.bodyChars, params.bodyChars, params.bodyChars, params.repoId); + + db.prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + select tr.id, tr.thread_id, tr.source_updated_at, tr.content_hash, tr.title_hash, tr.body_hash, tr.labels_hash, tr.created_at + from source.thread_revisions tr + join threads t on t.id = tr.thread_id`, + ).run(); + + db.prepare( + `insert into thread_fingerprints ( + id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) + select + tf.id, tf.thread_revision_id, tf.algorithm_version, tf.fingerprint_hash, tf.fingerprint_slug, tf.title_tokens_json, + tf.body_token_hash, tf.linked_refs_json, tf.file_set_hash, tf.module_buckets_json, tf.simhash64, tf.feature_json, tf.created_at + from source.thread_fingerprints tf + join thread_revisions tr on tr.id = tf.thread_revision_id`, + ).run(); + + db.prepare( + `insert into thread_key_summaries ( + id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) + select + tks.id, tks.thread_revision_id, tks.summary_kind, tks.prompt_version, tks.provider, tks.model, + tks.input_hash, tks.output_hash, tks.key_text, tks.created_at + from source.thread_key_summaries tks + join thread_revisions tr on tr.id = tks.thread_revision_id`, + ).run(); + + db.prepare('insert into repo_sync_state select * from source.repo_sync_state where repo_id = ?').run(params.repoId); + db.prepare('insert into repo_pipeline_state select * from source.repo_pipeline_state where repo_id = ?').run(params.repoId); + db.prepare('insert into cluster_groups select * from source.cluster_groups where repo_id = ?').run(params.repoId); + db.prepare( + `insert into cluster_memberships + select cm.* + from source.cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + join threads t on t.id = cm.thread_id`, + ).run(); + db.prepare( + `insert into cluster_overrides + select co.* + from source.cluster_overrides co + join cluster_groups cg on cg.id = co.cluster_id + join threads t on t.id = co.thread_id + where co.repo_id = ?`, + ).run(params.repoId); + db.prepare( + `insert into cluster_aliases + select ca.* + from source.cluster_aliases ca + join cluster_groups cg on cg.id = ca.cluster_id`, + ).run(); + db.prepare( + `insert into cluster_closures + select cc.* + from source.cluster_closures cc + join cluster_groups cg on cg.id = cc.cluster_id`, + ).run(); + } + + private countPortableRows(db: SqliteDatabase, tableName: string): number { + const row = db.prepare(`select count(*) as count from "${tableName}"`).get() as { count: number }; + return row.count; + } + private optimizeSqliteTarget(params: { name: 'main' | 'vector'; db: SqliteDatabase; From 576f155ff3c725f2b9758485cd0f445c1202b36b Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:14:35 -0700 Subject: [PATCH 122/215] fix: tolerate portable export schema drift --- packages/api-core/src/service.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b8e952a..0cac4b5 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3659,6 +3659,7 @@ export class GHCrawlService { cluster_id integer not null references cluster_groups(id) on delete cascade, thread_id integer not null references threads(id) on delete cascade, action text not null, + actor_id integer, reason text, created_at text not null, expires_at text, @@ -3766,9 +3767,12 @@ export class GHCrawlService { join cluster_groups cg on cg.id = cm.cluster_id join threads t on t.id = cm.thread_id`, ).run(); + const overrideActorExpr = this.attachedTableHasColumn(db, 'source', 'cluster_overrides', 'actor_id') ? 'co.actor_id' : 'null'; db.prepare( - `insert into cluster_overrides - select co.* + `insert into cluster_overrides ( + id, repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at + ) + select co.id, co.repo_id, co.cluster_id, co.thread_id, co.action, ${overrideActorExpr}, co.reason, co.created_at, co.expires_at from source.cluster_overrides co join cluster_groups cg on cg.id = co.cluster_id join threads t on t.id = co.thread_id @@ -3793,6 +3797,11 @@ export class GHCrawlService { return row.count; } + private attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { + const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; + return rows.some((row) => row.name === columnName); + } + private optimizeSqliteTarget(params: { name: 'main' | 'vector'; db: SqliteDatabase; From e7de8753e2cad2dacf0ae8632a32b6195c7ecf20 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:16:06 -0700 Subject: [PATCH 123/215] fix: shrink portable sync default bodies --- apps/cli/src/main.ts | 2 +- packages/api-core/src/service.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 7f8e6fb..66ecb63 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -144,7 +144,7 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ description: 'Export a compact portable SQLite core for git-style file sync.', options: [ '--output Output SQLite path; defaults to the ghcrawl config exports directory', - '--body-chars Maximum body excerpt characters per thread; default 2048', + '--body-chars Maximum body excerpt characters per thread; default 512', '--json Emit machine-readable JSON output explicitly', ], examples: ['ghcrawl export-sync openclaw/openclaw --output ./openclaw.sync.db --json'], diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 0cac4b5..00e389f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -3412,7 +3412,7 @@ export class GHCrawlService { } const repository = this.requireRepository(params.owner, params.repo); - const bodyChars = params.bodyChars ?? 2048; + const bodyChars = params.bodyChars ?? 512; if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { throw new Error('bodyChars must be a non-negative integer'); } From da2ba077f10301e60dcd3ad557250713e3b316e7 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:17:14 -0700 Subject: [PATCH 124/215] docs: document portable sync export --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 0859279..1bec272 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,20 @@ ghcrawl optimize owner/repo --json Use `threads --numbers ...` when you want several specific issue or PR records in one CLI call instead of paying process startup overhead repeatedly. +## Portable Git Sync Export + +The main SQLite database is a local cache and can grow large because it stores raw GitHub payloads, documents, FTS data, vectors, comments, run history, and other rebuildable evidence. Do not put `~/.config/ghcrawl/ghcrawl.db` directly into a git file sync workflow. + +Use `export-sync` to write a compact portable core DB: + +```bash +ghcrawl export-sync owner/repo --output ./owner__repo.sync.db --json +``` + +The export keeps the syncable state: repository metadata, issue/PR metadata, bounded body excerpts, latest revisions, deterministic fingerprints, LLM key summaries, sync/pipeline state, and durable cluster identities/memberships/overrides. It intentionally excludes bulky or rebuildable caches such as raw JSON blobs, comments, documents/FTS, vectors, code snapshots, cluster event history, run logs, and similarity edge evidence. + +Default body excerpts are capped at `512` characters per thread. Raise or lower that with `--body-chars ` depending on how much preview text you want in the portable file. + By default, cluster JSON commands show locally closed clusters. Use `--hide-closed` when you only want active clusters. Thread list commands still hide locally closed issues/PRs unless `--include-closed` is passed. Use `close-thread` when you know a local issue/PR should be treated as closed before the next GitHub sync catches up. If that was the last open item in its cluster, `ghcrawl` automatically marks the cluster closed too. From f3e0ac585a18fcf912115e7271e7e15d9b14ed34 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:33:57 -0700 Subject: [PATCH 125/215] refactor: extract portable sync store --- packages/api-core/src/portable/sync-store.ts | 447 +++++++++++++++++++ packages/api-core/src/service.ts | 417 +---------------- 2 files changed, 459 insertions(+), 405 deletions(-) create mode 100644 packages/api-core/src/portable/sync-store.ts diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts new file mode 100644 index 0000000..7a39b64 --- /dev/null +++ b/packages/api-core/src/portable/sync-store.ts @@ -0,0 +1,447 @@ +import fs from 'node:fs'; +import path from 'node:path'; + +import type { RepositoryDto } from '@ghcrawl/api-contract'; + +import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; + +export const PORTABLE_SYNC_SCHEMA_VERSION = 'ghcrawl-portable-sync-v1'; +export const DEFAULT_PORTABLE_BODY_CHARS = 512; + +export const PORTABLE_SYNC_TABLES = [ + 'repositories', + 'threads', + 'thread_revisions', + 'thread_fingerprints', + 'thread_key_summaries', + 'repo_sync_state', + 'repo_pipeline_state', + 'cluster_groups', + 'cluster_memberships', + 'cluster_overrides', + 'cluster_aliases', + 'cluster_closures', +] as const; + +export const PORTABLE_SYNC_EXCLUDED_TABLES = [ + 'blobs', + 'comments', + 'documents', + 'documents_fts', + 'document_embeddings', + 'thread_vectors', + 'thread_code_snapshots', + 'thread_changed_files', + 'thread_hunk_signatures', + 'cluster_events', + 'pipeline_runs', + 'sync_runs', + 'summary_runs', + 'embedding_runs', + 'cluster_runs', + 'similarity_edges', + 'similarity_edge_evidence', +] as const; + +export type PortableSyncExportOptions = { + repository: RepositoryDto; + sourceDb: SqliteDatabase; + sourcePath: string; + outputPath: string; + bodyChars?: number; +}; + +export type PortableSyncExportResponse = { + ok: true; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + outputPath: string; + sourcePath: string; + sourceBytes: number; + outputBytes: number; + compressionRatio: number; + bodyChars: number; + tables: Array<{ name: string; rows: number }>; + excluded: string[]; +}; + +export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { + const bodyChars = params.bodyChars ?? DEFAULT_PORTABLE_BODY_CHARS; + if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { + throw new Error('bodyChars must be a non-negative integer'); + } + + const sourcePath = path.resolve(params.sourcePath); + const outputPath = path.resolve(params.outputPath); + if (outputPath === sourcePath) { + throw new Error('Refusing to export portable sync database over the source database'); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + const tmpPath = `${outputPath}.tmp-${process.pid}-${Date.now()}`; + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + checkpointWal(params.sourceDb); + const out = openDb(tmpPath); + try { + out.pragma('journal_mode = DELETE'); + out.exec('pragma foreign_keys = OFF'); + createPortableSyncSchema(out); + out.exec(`attach database ${sqlStringLiteral(sourcePath)} as source`); + populatePortableSyncDb(out, { + repoId: params.repository.id, + sourcePath, + bodyChars, + }); + out.exec('detach database source'); + out.exec('pragma foreign_keys = ON'); + out.exec('analyze'); + out.exec('pragma optimize'); + out.exec('vacuum'); + } catch (error) { + try { + out.close(); + } catch { + // Ignore cleanup close errors after an export failure. + } + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + throw error; + } + out.close(); + + fs.renameSync(tmpPath, outputPath); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + const outputBytes = fs.statSync(outputPath).size; + const sourceBytes = fs.statSync(sourcePath).size + fileSize(`${sourcePath}-wal`) + fileSize(`${sourcePath}-shm`); + const verify = openDb(outputPath); + try { + verify.pragma('journal_mode = DELETE'); + const tables = PORTABLE_SYNC_TABLES.map((name) => ({ name, rows: countRows(verify, name) })); + return { + ok: true, + repository: { + id: params.repository.id, + owner: params.repository.owner, + name: params.repository.name, + fullName: params.repository.fullName, + }, + outputPath, + sourcePath, + sourceBytes, + outputBytes, + compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, + bodyChars, + tables, + excluded: [...PORTABLE_SYNC_EXCLUDED_TABLES], + }; + } finally { + verify.close(); + } +} + +export function createPortableSyncSchema(db: SqliteDatabase): void { + db.exec(` + create table portable_metadata (key text primary key, value text not null); + create table repositories ( + id integer primary key, + owner text not null, + name text not null, + full_name text not null unique, + github_repo_id text, + updated_at text not null + ); + create table threads ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + github_id text not null, + number integer not null, + kind text not null, + state text not null, + title text not null, + body_excerpt text, + body_length integer not null default 0, + author_login text, + author_type text, + html_url text not null, + labels_json text not null, + assignees_json text not null, + content_hash text not null, + is_draft integer not null default 0, + created_at_gh text, + updated_at_gh text, + closed_at_gh text, + merged_at_gh text, + first_pulled_at text, + last_pulled_at text, + updated_at text not null, + closed_at_local text, + close_reason_local text, + unique(repo_id, kind, number) + ); + create table thread_revisions ( + id integer primary key, + thread_id integer not null references threads(id) on delete cascade, + source_updated_at text, + content_hash text not null, + title_hash text not null, + body_hash text not null, + labels_hash text not null, + created_at text not null, + unique(thread_id, content_hash) + ); + create table thread_fingerprints ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + algorithm_version text not null, + fingerprint_hash text not null, + fingerprint_slug text not null, + title_tokens_json text not null, + body_token_hash text not null, + linked_refs_json text not null, + file_set_hash text not null, + module_buckets_json text not null, + simhash64 text not null, + feature_json text not null, + created_at text not null, + unique(thread_revision_id, algorithm_version) + ); + create table thread_key_summaries ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + summary_kind text not null, + prompt_version text not null, + provider text not null, + model text not null, + input_hash text not null, + output_hash text not null, + key_text text not null, + created_at text not null, + unique(thread_revision_id, summary_kind, prompt_version, provider, model) + ); + create table repo_sync_state ( + repo_id integer primary key references repositories(id) on delete cascade, + last_full_open_scan_started_at text, + last_overlapping_open_scan_completed_at text, + last_non_overlapping_scan_completed_at text, + last_open_close_reconciled_at text, + updated_at text not null + ); + create table repo_pipeline_state ( + repo_id integer primary key references repositories(id) on delete cascade, + summary_model text not null, + summary_prompt_version text not null, + embedding_basis text not null, + embed_model text not null, + embed_dimensions integer not null, + embed_pipeline_version text not null, + vector_backend text not null, + vectors_current_at text, + clusters_current_at text, + updated_at text not null + ); + create table cluster_groups ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + stable_key text not null, + stable_slug text not null, + status text not null, + cluster_type text, + representative_thread_id integer references threads(id) on delete set null, + title text, + created_at text not null, + updated_at text not null, + closed_at text, + unique(repo_id, stable_key), + unique(repo_id, stable_slug) + ); + create table cluster_memberships ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + role text not null, + state text not null, + score_to_representative real, + first_seen_run_id integer, + last_seen_run_id integer, + added_by text not null, + removed_by text, + added_reason_json text not null, + removed_reason_json text, + created_at text not null, + updated_at text not null, + removed_at text, + primary key (cluster_id, thread_id) + ); + create table cluster_overrides ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + action text not null, + actor_id integer, + reason text, + created_at text not null, + expires_at text, + unique(cluster_id, thread_id, action) + ); + create table cluster_aliases ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + alias_slug text not null, + reason text not null, + created_at text not null, + primary key (cluster_id, alias_slug) + ); + create table cluster_closures ( + cluster_id integer primary key references cluster_groups(id) on delete cascade, + reason text not null, + actor_kind text not null, + created_at text not null, + updated_at text not null + ); + create index idx_threads_repo_number on threads(repo_id, number); + create index idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local); + create index idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash); + create index idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug); + create index idx_cluster_groups_repo_status on cluster_groups(repo_id, status); + create index idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state); + create index idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state); + `); +} + +export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { + const exportedAt = nowIso(); + const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); + insertMetadata.run('schema', PORTABLE_SYNC_SCHEMA_VERSION); + insertMetadata.run('exported_at', exportedAt); + insertMetadata.run('source_path', params.sourcePath); + insertMetadata.run('body_chars', String(params.bodyChars)); + insertMetadata.run('excluded', 'raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs'); + + db.prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, updated_at) + select id, owner, name, full_name, github_repo_id, updated_at + from source.repositories + where id = ?`, + ).run(params.repoId); + + db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, html_url, + labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) + select + id, repo_id, github_id, number, kind, state, title, + case + when body is null then null + when ? = 0 then '' + when length(body) <= ? then body + else substr(body, 1, ?) + end, + case when body is null then 0 else length(body) end, + author_login, author_type, html_url, labels_json, assignees_json, content_hash, is_draft, + created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, + updated_at, closed_at_local, close_reason_local + from source.threads + where repo_id = ?`, + ).run(params.bodyChars, params.bodyChars, params.bodyChars, params.repoId); + + db.prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + select tr.id, tr.thread_id, tr.source_updated_at, tr.content_hash, tr.title_hash, tr.body_hash, tr.labels_hash, tr.created_at + from source.thread_revisions tr + join threads t on t.id = tr.thread_id`, + ).run(); + + db.prepare( + `insert into thread_fingerprints ( + id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) + select + tf.id, tf.thread_revision_id, tf.algorithm_version, tf.fingerprint_hash, tf.fingerprint_slug, tf.title_tokens_json, + tf.body_token_hash, tf.linked_refs_json, tf.file_set_hash, tf.module_buckets_json, tf.simhash64, tf.feature_json, tf.created_at + from source.thread_fingerprints tf + join thread_revisions tr on tr.id = tf.thread_revision_id`, + ).run(); + + db.prepare( + `insert into thread_key_summaries ( + id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) + select + tks.id, tks.thread_revision_id, tks.summary_kind, tks.prompt_version, tks.provider, tks.model, + tks.input_hash, tks.output_hash, tks.key_text, tks.created_at + from source.thread_key_summaries tks + join thread_revisions tr on tr.id = tks.thread_revision_id`, + ).run(); + + db.prepare('insert into repo_sync_state select * from source.repo_sync_state where repo_id = ?').run(params.repoId); + db.prepare('insert into repo_pipeline_state select * from source.repo_pipeline_state where repo_id = ?').run(params.repoId); + db.prepare('insert into cluster_groups select * from source.cluster_groups where repo_id = ?').run(params.repoId); + db.prepare( + `insert into cluster_memberships + select cm.* + from source.cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + join threads t on t.id = cm.thread_id`, + ).run(); + const overrideActorExpr = attachedTableHasColumn(db, 'source', 'cluster_overrides', 'actor_id') ? 'co.actor_id' : 'null'; + db.prepare( + `insert into cluster_overrides ( + id, repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at + ) + select co.id, co.repo_id, co.cluster_id, co.thread_id, co.action, ${overrideActorExpr}, co.reason, co.created_at, co.expires_at + from source.cluster_overrides co + join cluster_groups cg on cg.id = co.cluster_id + join threads t on t.id = co.thread_id + where co.repo_id = ?`, + ).run(params.repoId); + db.prepare( + `insert into cluster_aliases + select ca.* + from source.cluster_aliases ca + join cluster_groups cg on cg.id = ca.cluster_id`, + ).run(); + db.prepare( + `insert into cluster_closures + select cc.* + from source.cluster_closures cc + join cluster_groups cg on cg.id = cc.cluster_id`, + ).run(); +} + +function countRows(db: SqliteDatabase, tableName: string): number { + const row = db.prepare(`select count(*) as count from "${tableName}"`).get() as { count: number }; + return row.count; +} + +function attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { + const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; + return rows.some((row) => row.name === columnName); +} + +function fileSize(filePath: string): number { + try { + return fs.statSync(filePath).size; + } catch { + return 0; + } +} + +function nowIso(): string { + return new Date().toISOString(); +} + +function sqlStringLiteral(value: string): string { + return `'${value.replaceAll("'", "''")}'`; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 00e389f..ce72198 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -105,6 +105,11 @@ import { readTextBlob, storeTextBlob } from './db/blob-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; +import { + DEFAULT_PORTABLE_BODY_CHARS, + exportPortableSyncDatabase, + type PortableSyncExportResponse, +} from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; @@ -455,24 +460,6 @@ type PortableSyncExportOptions = { bodyChars?: number; }; -type PortableSyncExportResponse = { - ok: true; - repository: { - id: number; - owner: string; - name: string; - fullName: string; - }; - outputPath: string; - sourcePath: string; - sourceBytes: number; - outputBytes: number; - compressionRatio: number; - bodyChars: number; - tables: Array<{ name: string; rows: number }>; - excluded: string[]; -}; - type SearchResultInternal = SearchResponse; type NeighborsResultInternal = NeighborsResponse; @@ -542,10 +529,6 @@ function isClosedGitHubPayload(payload: Record): boolean { return false; } -function sqlStringLiteral(value: string): string { - return `'${value.replace(/'/g, "''")}'`; -} - function isMissingGitHubResourceError(error: unknown): boolean { const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null; if (status === 404 || status === 410) { @@ -3412,394 +3395,18 @@ export class GHCrawlService { } const repository = this.requireRepository(params.owner, params.repo); - const bodyChars = params.bodyChars ?? 512; - if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { - throw new Error('bodyChars must be a non-negative integer'); - } - const sourcePath = path.resolve(this.config.dbPath); const outputPath = path.resolve( params.outputPath ?? path.join(this.config.configDir, 'exports', `${repository.owner}__${repository.name}.sync.db`), ); - if (outputPath === sourcePath) { - throw new Error('Refusing to export portable sync database over the source database'); - } - - fs.mkdirSync(path.dirname(outputPath), { recursive: true }); - const tmpPath = `${outputPath}.tmp-${process.pid}-${Date.now()}`; - fs.rmSync(tmpPath, { force: true }); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - checkpointWal(this.db); - const out = openDb(tmpPath); - try { - out.pragma('journal_mode = DELETE'); - out.exec('pragma foreign_keys = OFF'); - this.createPortableSyncSchema(out); - out.exec(`attach database ${sqlStringLiteral(sourcePath)} as source`); - this.populatePortableSyncDb(out, { repoId: repository.id, sourcePath, bodyChars }); - out.exec('detach database source'); - out.exec('pragma foreign_keys = ON'); - out.exec('analyze'); - out.exec('pragma optimize'); - out.exec('vacuum'); - } catch (error) { - try { - out.close(); - } catch { - // Ignore cleanup close errors after an export failure. - } - fs.rmSync(tmpPath, { force: true }); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - throw error; - } - out.close(); - - fs.renameSync(tmpPath, outputPath); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - - const portableTables = [ - 'repositories', - 'threads', - 'thread_revisions', - 'thread_fingerprints', - 'thread_key_summaries', - 'repo_sync_state', - 'repo_pipeline_state', - 'cluster_groups', - 'cluster_memberships', - 'cluster_overrides', - 'cluster_aliases', - 'cluster_closures', - ]; - const outputBytes = fs.statSync(outputPath).size; - const sourceBytes = fs.statSync(sourcePath).size + this.fileSize(`${sourcePath}-wal`) + this.fileSize(`${sourcePath}-shm`); - const verify = openDb(outputPath); - try { - verify.pragma('journal_mode = DELETE'); - const tables = portableTables.map((name) => ({ name, rows: this.countPortableRows(verify, name) })); - return { - ok: true, - repository: { - id: repository.id, - owner: repository.owner, - name: repository.name, - fullName: repository.fullName, - }, - outputPath, - sourcePath, - sourceBytes, - outputBytes, - compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, - bodyChars, - tables, - excluded: [ - 'blobs', - 'comments', - 'documents', - 'documents_fts', - 'document_embeddings', - 'thread_vectors', - 'thread_code_snapshots', - 'thread_changed_files', - 'thread_hunk_signatures', - 'cluster_events', - 'pipeline_runs', - 'sync_runs', - 'summary_runs', - 'embedding_runs', - 'cluster_runs', - 'similarity_edges', - 'similarity_edge_evidence', - ], - }; - } finally { - verify.close(); - } - } - - private createPortableSyncSchema(db: SqliteDatabase): void { - db.exec(` - create table portable_metadata (key text primary key, value text not null); - create table repositories ( - id integer primary key, - owner text not null, - name text not null, - full_name text not null unique, - github_repo_id text, - updated_at text not null - ); - create table threads ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - github_id text not null, - number integer not null, - kind text not null, - state text not null, - title text not null, - body_excerpt text, - body_length integer not null default 0, - author_login text, - author_type text, - html_url text not null, - labels_json text not null, - assignees_json text not null, - content_hash text not null, - is_draft integer not null default 0, - created_at_gh text, - updated_at_gh text, - closed_at_gh text, - merged_at_gh text, - first_pulled_at text, - last_pulled_at text, - updated_at text not null, - closed_at_local text, - close_reason_local text, - unique(repo_id, kind, number) - ); - create table thread_revisions ( - id integer primary key, - thread_id integer not null references threads(id) on delete cascade, - source_updated_at text, - content_hash text not null, - title_hash text not null, - body_hash text not null, - labels_hash text not null, - created_at text not null, - unique(thread_id, content_hash) - ); - create table thread_fingerprints ( - id integer primary key, - thread_revision_id integer not null references thread_revisions(id) on delete cascade, - algorithm_version text not null, - fingerprint_hash text not null, - fingerprint_slug text not null, - title_tokens_json text not null, - body_token_hash text not null, - linked_refs_json text not null, - file_set_hash text not null, - module_buckets_json text not null, - simhash64 text not null, - feature_json text not null, - created_at text not null, - unique(thread_revision_id, algorithm_version) - ); - create table thread_key_summaries ( - id integer primary key, - thread_revision_id integer not null references thread_revisions(id) on delete cascade, - summary_kind text not null, - prompt_version text not null, - provider text not null, - model text not null, - input_hash text not null, - output_hash text not null, - key_text text not null, - created_at text not null, - unique(thread_revision_id, summary_kind, prompt_version, provider, model) - ); - create table repo_sync_state ( - repo_id integer primary key references repositories(id) on delete cascade, - last_full_open_scan_started_at text, - last_overlapping_open_scan_completed_at text, - last_non_overlapping_scan_completed_at text, - last_open_close_reconciled_at text, - updated_at text not null - ); - create table repo_pipeline_state ( - repo_id integer primary key references repositories(id) on delete cascade, - summary_model text not null, - summary_prompt_version text not null, - embedding_basis text not null, - embed_model text not null, - embed_dimensions integer not null, - embed_pipeline_version text not null, - vector_backend text not null, - vectors_current_at text, - clusters_current_at text, - updated_at text not null - ); - create table cluster_groups ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - stable_key text not null, - stable_slug text not null, - status text not null, - cluster_type text, - representative_thread_id integer references threads(id) on delete set null, - title text, - created_at text not null, - updated_at text not null, - closed_at text, - unique(repo_id, stable_key), - unique(repo_id, stable_slug) - ); - create table cluster_memberships ( - cluster_id integer not null references cluster_groups(id) on delete cascade, - thread_id integer not null references threads(id) on delete cascade, - role text not null, - state text not null, - score_to_representative real, - first_seen_run_id integer, - last_seen_run_id integer, - added_by text not null, - removed_by text, - added_reason_json text not null, - removed_reason_json text, - created_at text not null, - updated_at text not null, - removed_at text, - primary key (cluster_id, thread_id) - ); - create table cluster_overrides ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - cluster_id integer not null references cluster_groups(id) on delete cascade, - thread_id integer not null references threads(id) on delete cascade, - action text not null, - actor_id integer, - reason text, - created_at text not null, - expires_at text, - unique(cluster_id, thread_id, action) - ); - create table cluster_aliases ( - cluster_id integer not null references cluster_groups(id) on delete cascade, - alias_slug text not null, - reason text not null, - created_at text not null, - primary key (cluster_id, alias_slug) - ); - create table cluster_closures ( - cluster_id integer primary key references cluster_groups(id) on delete cascade, - reason text not null, - actor_kind text not null, - created_at text not null, - updated_at text not null - ); - create index idx_threads_repo_number on threads(repo_id, number); - create index idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local); - create index idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash); - create index idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug); - create index idx_cluster_groups_repo_status on cluster_groups(repo_id, status); - create index idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state); - create index idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state); - `); - } - - private populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { - const exportedAt = nowIso(); - const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); - insertMetadata.run('schema', 'ghcrawl-portable-sync-v1'); - insertMetadata.run('exported_at', exportedAt); - insertMetadata.run('source_path', params.sourcePath); - insertMetadata.run('body_chars', String(params.bodyChars)); - insertMetadata.run('excluded', 'raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs'); - - db.prepare( - `insert into repositories (id, owner, name, full_name, github_repo_id, updated_at) - select id, owner, name, full_name, github_repo_id, updated_at - from source.repositories - where id = ?`, - ).run(params.repoId); - - db.prepare( - `insert into threads ( - id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, html_url, - labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, - merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local - ) - select - id, repo_id, github_id, number, kind, state, title, - case - when body is null then null - when ? = 0 then '' - when length(body) <= ? then body - else substr(body, 1, ?) - end, - case when body is null then 0 else length(body) end, - author_login, author_type, html_url, labels_json, assignees_json, content_hash, is_draft, - created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, - updated_at, closed_at_local, close_reason_local - from source.threads - where repo_id = ?`, - ).run(params.bodyChars, params.bodyChars, params.bodyChars, params.repoId); - - db.prepare( - `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) - select tr.id, tr.thread_id, tr.source_updated_at, tr.content_hash, tr.title_hash, tr.body_hash, tr.labels_hash, tr.created_at - from source.thread_revisions tr - join threads t on t.id = tr.thread_id`, - ).run(); - - db.prepare( - `insert into thread_fingerprints ( - id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, - linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at - ) - select - tf.id, tf.thread_revision_id, tf.algorithm_version, tf.fingerprint_hash, tf.fingerprint_slug, tf.title_tokens_json, - tf.body_token_hash, tf.linked_refs_json, tf.file_set_hash, tf.module_buckets_json, tf.simhash64, tf.feature_json, tf.created_at - from source.thread_fingerprints tf - join thread_revisions tr on tr.id = tf.thread_revision_id`, - ).run(); - - db.prepare( - `insert into thread_key_summaries ( - id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at - ) - select - tks.id, tks.thread_revision_id, tks.summary_kind, tks.prompt_version, tks.provider, tks.model, - tks.input_hash, tks.output_hash, tks.key_text, tks.created_at - from source.thread_key_summaries tks - join thread_revisions tr on tr.id = tks.thread_revision_id`, - ).run(); - - db.prepare('insert into repo_sync_state select * from source.repo_sync_state where repo_id = ?').run(params.repoId); - db.prepare('insert into repo_pipeline_state select * from source.repo_pipeline_state where repo_id = ?').run(params.repoId); - db.prepare('insert into cluster_groups select * from source.cluster_groups where repo_id = ?').run(params.repoId); - db.prepare( - `insert into cluster_memberships - select cm.* - from source.cluster_memberships cm - join cluster_groups cg on cg.id = cm.cluster_id - join threads t on t.id = cm.thread_id`, - ).run(); - const overrideActorExpr = this.attachedTableHasColumn(db, 'source', 'cluster_overrides', 'actor_id') ? 'co.actor_id' : 'null'; - db.prepare( - `insert into cluster_overrides ( - id, repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at - ) - select co.id, co.repo_id, co.cluster_id, co.thread_id, co.action, ${overrideActorExpr}, co.reason, co.created_at, co.expires_at - from source.cluster_overrides co - join cluster_groups cg on cg.id = co.cluster_id - join threads t on t.id = co.thread_id - where co.repo_id = ?`, - ).run(params.repoId); - db.prepare( - `insert into cluster_aliases - select ca.* - from source.cluster_aliases ca - join cluster_groups cg on cg.id = ca.cluster_id`, - ).run(); - db.prepare( - `insert into cluster_closures - select cc.* - from source.cluster_closures cc - join cluster_groups cg on cg.id = cc.cluster_id`, - ).run(); - } - - private countPortableRows(db: SqliteDatabase, tableName: string): number { - const row = db.prepare(`select count(*) as count from "${tableName}"`).get() as { count: number }; - return row.count; - } - - private attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { - const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; - return rows.some((row) => row.name === columnName); + return exportPortableSyncDatabase({ + repository, + sourceDb: this.db, + sourcePath, + outputPath, + bodyChars: params.bodyChars ?? DEFAULT_PORTABLE_BODY_CHARS, + }); } private optimizeSqliteTarget(params: { From 7c91b2c0713cf255da084d40437c89a6c1961090 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:36:58 -0700 Subject: [PATCH 126/215] feat: validate portable sync stores --- apps/cli/src/main.test.ts | 3 + apps/cli/src/main.ts | 47 ++++++- packages/api-core/src/index.ts | 1 + packages/api-core/src/portable/sync-store.ts | 139 +++++++++++++++++++ packages/api-core/src/service.test.ts | 12 ++ packages/api-core/src/service.ts | 12 ++ 6 files changed, 213 insertions(+), 1 deletion(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 0045851..ddecd49 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -45,6 +45,9 @@ const publicCommands = [ 'configure', 'version', 'sync', + 'export-sync', + 'validate-sync', + 'portable-size', 'refresh', 'optimize', 'runs', diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 66ecb63..6136a4c 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -5,7 +5,16 @@ import path from 'node:path'; import { parseArgs } from 'node:util'; import { fileURLToPath } from 'node:url'; -import { createApiServer, GHCrawlService, loadConfig, readPersistedConfig, writePersistedConfig, type LoadConfigOptions } from '@ghcrawl/api-core'; +import { + createApiServer, + GHCrawlService, + loadConfig, + portableSyncSizeReport, + readPersistedConfig, + validatePortableSyncDatabase, + writePersistedConfig, + type LoadConfigOptions, +} from '@ghcrawl/api-core'; import { createHeapDiagnostics, type HeapDiagnostics } from './heap-diagnostics.js'; import { startTui } from './tui/app.js'; @@ -15,6 +24,8 @@ type CommandName = | 'version' | 'sync' | 'export-sync' + | 'validate-sync' + | 'portable-size' | 'refresh' | 'optimize' | 'runs' @@ -150,6 +161,22 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl export-sync openclaw/openclaw --output ./openclaw.sync.db --json'], agentJson: true, }, + { + name: 'validate-sync', + synopsis: 'validate-sync [--json]', + description: 'Validate a portable git-sync SQLite database without mutating it.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl validate-sync ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'portable-size', + synopsis: 'portable-size [--json]', + description: 'Report portable git-sync SQLite table sizes.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl portable-size ./openclaw.sync.db --json'], + agentJson: true, + }, { name: 'refresh', synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', @@ -1073,6 +1100,24 @@ export async function run( writeJson(stdout, result); return; } + case 'validate-sync': { + const parsed = parseArgsForCommand('validate-sync', rest, { json: { type: 'boolean' } }, true); + if (parsed.positionals.length !== 1) { + throw new CliUsageError('validate-sync requires exactly one portable database path', 'validate-sync'); + } + const result = validatePortableSyncDatabase(parsed.positionals[0]); + writeJson(stdout, result); + return; + } + case 'portable-size': { + const parsed = parseArgsForCommand('portable-size', rest, { json: { type: 'boolean' } }, true); + if (parsed.positionals.length !== 1) { + throw new CliUsageError('portable-size requires exactly one portable database path', 'portable-size'); + } + const result = portableSyncSizeReport(parsed.positionals[0]); + writeJson(stdout, result); + return; + } case 'refresh': { const { owner, repo, values } = parseRepoFlags('refresh', rest); const heapDiagnostics = createOptionalHeapDiagnostics(values, stderr, 'refresh'); diff --git a/packages/api-core/src/index.ts b/packages/api-core/src/index.ts index 0119310..fd6b627 100644 --- a/packages/api-core/src/index.ts +++ b/packages/api-core/src/index.ts @@ -4,5 +4,6 @@ export * from './documents/normalize.js'; export * from './search/exact.js'; export * from './cluster/build.js'; export * from './service.js'; +export * from './portable/sync-store.js'; export * from './vector/store.js'; export * from './vector/vectorlite-store.js'; diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 7a39b64..62d0486 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,6 +1,7 @@ import fs from 'node:fs'; import path from 'node:path'; +import BetterSqlite3 from 'better-sqlite3'; import type { RepositoryDto } from '@ghcrawl/api-contract'; import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; @@ -69,6 +70,28 @@ export type PortableSyncExportResponse = { excluded: string[]; }; +export type PortableSyncValidationResponse = { + ok: boolean; + path: string; + schema: string | null; + metadata: Record; + integrity: string[]; + foreignKeyViolations: Array>; + missingTables: string[]; + unexpectedExcludedTables: string[]; + tables: Array<{ name: string; rows: number }>; + errors: string[]; +}; + +export type PortableSyncSizeResponse = { + ok: true; + path: string; + totalBytes: number; + walBytes: number; + shmBytes: number; + tables: Array<{ name: string; bytes: number | null; rows: number | null }>; +}; + export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { const bodyChars = params.bodyChars ?? DEFAULT_PORTABLE_BODY_CHARS; if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { @@ -317,6 +340,60 @@ export function createPortableSyncSchema(db: SqliteDatabase): void { `); } +export function validatePortableSyncDatabase(dbPath: string): PortableSyncValidationResponse { + const resolvedPath = path.resolve(dbPath); + const db = openReadonlyDb(resolvedPath); + try { + const tableNames = listTables(db); + const missingTables = PORTABLE_SYNC_TABLES.filter((name) => !tableNames.has(name)); + const unexpectedExcludedTables = PORTABLE_SYNC_EXCLUDED_TABLES.filter((name) => tableNames.has(name)); + const metadata = tableNames.has('portable_metadata') ? readPortableMetadata(db) : {}; + const integrity = readIntegrityCheck(db); + const foreignKeyViolations = readForeignKeyViolations(db); + const schema = metadata.schema ?? null; + const errors = [ + ...missingTables.map((name) => `missing required table: ${name}`), + ...unexpectedExcludedTables.map((name) => `excluded cache table is present: ${name}`), + ...(schema === PORTABLE_SYNC_SCHEMA_VERSION ? [] : [`unexpected schema: ${schema ?? 'missing'}`]), + ...integrity.filter((message) => message !== 'ok').map((message) => `integrity_check: ${message}`), + ...foreignKeyViolations.map((violation) => `foreign_key_check: ${JSON.stringify(violation)}`), + ]; + + return { + ok: errors.length === 0, + path: resolvedPath, + schema, + metadata, + integrity, + foreignKeyViolations, + missingTables, + unexpectedExcludedTables, + tables: PORTABLE_SYNC_TABLES.filter((name) => tableNames.has(name)).map((name) => ({ name, rows: countRows(db, name) })), + errors, + }; + } finally { + db.close(); + } +} + +export function portableSyncSizeReport(dbPath: string): PortableSyncSizeResponse { + const resolvedPath = path.resolve(dbPath); + const db = openReadonlyDb(resolvedPath); + try { + const tables = readDbstatSizes(db); + return { + ok: true, + path: resolvedPath, + totalBytes: fileSize(resolvedPath), + walBytes: fileSize(`${resolvedPath}-wal`), + shmBytes: fileSize(`${resolvedPath}-shm`), + tables, + }; + } finally { + db.close(); + } +} + export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { const exportedAt = nowIso(); const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); @@ -425,6 +502,68 @@ function countRows(db: SqliteDatabase, tableName: string): number { return row.count; } +function openReadonlyDb(dbPath: string): SqliteDatabase { + return new BetterSqlite3(dbPath, { readonly: true, fileMustExist: true }); +} + +function listTables(db: SqliteDatabase): Set { + const rows = db + .prepare("select name from sqlite_master where type in ('table', 'view') and name not like 'sqlite_%'") + .all() as Array<{ name: string }>; + return new Set(rows.map((row) => row.name)); +} + +function readPortableMetadata(db: SqliteDatabase): Record { + const rows = db.prepare('select key, value from portable_metadata order by key').all() as Array<{ key: string; value: string }>; + return Object.fromEntries(rows.map((row) => [row.key, row.value])); +} + +function readIntegrityCheck(db: SqliteDatabase): string[] { + const rows = db.prepare('pragma integrity_check').all() as Array<{ integrity_check: string }>; + return rows.map((row) => row.integrity_check); +} + +function readForeignKeyViolations(db: SqliteDatabase): Array> { + return db.prepare('pragma foreign_key_check').all() as Array>; +} + +function readDbstatSizes(db: SqliteDatabase): Array<{ name: string; bytes: number | null; rows: number | null }> { + try { + const rows = db + .prepare( + `select + s.name as name, + s.bytes as bytes, + coalesce(t.row_count, 0) as rows + from ( + select name, sum(pgsize) as bytes + from dbstat + where name not like 'sqlite_%' + group by name + ) s + left join ( + select name, null as row_count + from sqlite_master + where 0 + ) t on t.name = s.name + order by s.bytes desc, s.name asc`, + ) + .all() as Array<{ name: string; bytes: number; rows: number | null }>; + return rows.map((row) => ({ name: row.name, bytes: row.bytes, rows: safeCountRows(db, row.name) })); + } catch { + const tableNames = [...listTables(db)].sort(); + return tableNames.map((name) => ({ name, bytes: null, rows: safeCountRows(db, name) })); + } +} + +function safeCountRows(db: SqliteDatabase, tableName: string): number | null { + try { + return countRows(db, tableName); + } catch { + return null; + } +} + function attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; return rows.some((row) => row.name === columnName); diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 76e42df..8cef642 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -317,6 +317,18 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.ok(response.excluded.includes('thread_vectors')); assert.equal(response.tables.find((table) => table.name === 'threads')?.rows, 1); + const validation = service.validatePortableSync(outputPath); + assert.equal(validation.ok, true); + assert.equal(validation.schema, 'ghcrawl-portable-sync-v1'); + assert.deepEqual(validation.missingTables, []); + assert.deepEqual(validation.unexpectedExcludedTables, []); + + const size = service.portableSyncSize(outputPath); + assert.equal(size.ok, true); + assert.equal(size.path, outputPath); + assert.ok(size.totalBytes > 0); + assert.ok((size.tables.find((table) => table.name === 'threads')?.bytes ?? 0) > 0); + const portable = openDb(outputPath); try { const thread = portable.prepare('select body_excerpt, body_length from threads where number = 42').get() as { diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index ce72198..67235a2 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -108,7 +108,11 @@ import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { DEFAULT_PORTABLE_BODY_CHARS, exportPortableSyncDatabase, + portableSyncSizeReport, + validatePortableSyncDatabase, type PortableSyncExportResponse, + type PortableSyncSizeResponse, + type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; @@ -3409,6 +3413,14 @@ export class GHCrawlService { }); } + validatePortableSync(dbPath: string): PortableSyncValidationResponse { + return validatePortableSyncDatabase(dbPath); + } + + portableSyncSize(dbPath: string): PortableSyncSizeResponse { + return portableSyncSizeReport(dbPath); + } + private optimizeSqliteTarget(params: { name: 'main' | 'vector'; db: SqliteDatabase; From 012dfc900b49dd697feb29babaa2bcaba154a9e4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:40:01 -0700 Subject: [PATCH 127/215] feat: emit portable sync manifests --- apps/cli/src/main.test.ts | 36 +++++++++ apps/cli/src/main.ts | 10 ++- packages/api-core/src/portable/sync-store.ts | 77 +++++++++++++++++++- packages/api-core/src/service.test.ts | 17 +++++ packages/api-core/src/service.ts | 8 +- 5 files changed, 142 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index ddecd49..622da79 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -803,6 +803,42 @@ test('sync command forwards include-code hydration flag', async () => { assert.match(stdout.read(), /"codeFilesSynced": 1/); }); +test('export-sync command forwards profile and manifest options', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.exportPortableSync; + let received: unknown; + + GHCrawlService.prototype.exportPortableSync = function exportPortableSyncStub(params: unknown) { + received = params; + return { + ok: true, + profile: 'lean', + manifestPath: '/tmp/openclaw.sync.db.manifest.json', + } as never; + }; + + try { + await run(['export-sync', 'openclaw/openclaw', '--profile', 'lean', '--manifest', '--output', '/tmp/openclaw.sync.db'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.exportPortableSync = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + outputPath: '/tmp/openclaw.sync.db', + profile: 'lean', + writeManifest: true, + bodyChars: undefined, + }); + assert.match(stdout.read(), /"profile": "lean"/); +}); + test('refresh command forwards include-code hydration flag', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 6136a4c..cdd594d 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -151,14 +151,16 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ }, { name: 'export-sync', - synopsis: 'export-sync [--output ] [--body-chars ] [--json]', + synopsis: 'export-sync [--output ] [--profile lean|review] [--manifest] [--body-chars ] [--json]', description: 'Export a compact portable SQLite core for git-style file sync.', options: [ '--output Output SQLite path; defaults to the ghcrawl config exports directory', + '--profile lean|review Use a preset body excerpt budget for git sync', + '--manifest Write a JSON sidecar with counts, SHA256, and validation status', '--body-chars Maximum body excerpt characters per thread; default 512', '--json Emit machine-readable JSON output explicitly', ], - examples: ['ghcrawl export-sync openclaw/openclaw --output ./openclaw.sync.db --json'], + examples: ['ghcrawl export-sync openclaw/openclaw --profile lean --manifest --output ./openclaw.sync.db --json'], agentJson: true, }, { @@ -642,6 +644,8 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo 'event-limit': { type: 'string' }, 'body-chars': { type: 'string' }, output: { type: 'string' }, + profile: { type: 'string' }, + manifest: { type: 'boolean' }, 'no-sync': { type: 'boolean' }, 'no-embed': { type: 'boolean' }, 'no-cluster': { type: 'boolean' }, @@ -1092,6 +1096,8 @@ export async function run( owner, repo, outputPath: typeof values.output === 'string' ? values.output : undefined, + profile: parseEnum('export-sync', 'profile', values.profile, ['lean', 'review']), + writeManifest: values.manifest === true, bodyChars: typeof values['body-chars'] === 'string' ? parsePositiveInteger('body-chars', values['body-chars'], 'export-sync') diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 62d0486..f7c1fca 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,4 +1,5 @@ import fs from 'node:fs'; +import crypto from 'node:crypto'; import path from 'node:path'; import BetterSqlite3 from 'better-sqlite3'; @@ -8,6 +9,7 @@ import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; export const PORTABLE_SYNC_SCHEMA_VERSION = 'ghcrawl-portable-sync-v1'; export const DEFAULT_PORTABLE_BODY_CHARS = 512; +export type PortableSyncProfile = 'lean' | 'review'; export const PORTABLE_SYNC_TABLES = [ 'repositories', @@ -50,6 +52,27 @@ export type PortableSyncExportOptions = { sourcePath: string; outputPath: string; bodyChars?: number; + profile?: PortableSyncProfile; + writeManifest?: boolean; +}; + +export type PortableSyncManifest = { + schema: string; + profile: PortableSyncProfile | 'default'; + exportedAt: string; + outputPath: string; + outputBytes: number; + sha256: string; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + bodyChars: number; + tables: Array<{ name: string; rows: number }>; + excluded: string[]; + validationOk: boolean; }; export type PortableSyncExportResponse = { @@ -66,8 +89,11 @@ export type PortableSyncExportResponse = { outputBytes: number; compressionRatio: number; bodyChars: number; + profile: PortableSyncProfile | 'default'; tables: Array<{ name: string; rows: number }>; excluded: string[]; + manifestPath: string | null; + manifest: PortableSyncManifest; }; export type PortableSyncValidationResponse = { @@ -93,7 +119,8 @@ export type PortableSyncSizeResponse = { }; export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { - const bodyChars = params.bodyChars ?? DEFAULT_PORTABLE_BODY_CHARS; + const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; + const bodyChars = params.bodyChars ?? bodyCharsForProfile(params.profile); if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { throw new Error('bodyChars must be a non-negative integer'); } @@ -150,7 +177,7 @@ export function exportPortableSyncDatabase(params: PortableSyncExportOptions): P try { verify.pragma('journal_mode = DELETE'); const tables = PORTABLE_SYNC_TABLES.map((name) => ({ name, rows: countRows(verify, name) })); - return { + const responseBase: Omit = { ok: true, repository: { id: params.repository.id, @@ -164,14 +191,54 @@ export function exportPortableSyncDatabase(params: PortableSyncExportOptions): P outputBytes, compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, bodyChars, + profile, tables, excluded: [...PORTABLE_SYNC_EXCLUDED_TABLES], }; + const validation = validatePortableSyncDatabase(outputPath); + const manifest = buildPortableSyncManifest(responseBase, validation.ok); + const manifestPath = params.writeManifest ? writePortableSyncManifest(outputPath, manifest) : null; + return { + ...responseBase, + manifestPath, + manifest, + }; } finally { verify.close(); } } +function bodyCharsForProfile(profile: PortableSyncProfile | undefined): number { + if (profile === 'lean') return 256; + if (profile === 'review') return 1024; + return DEFAULT_PORTABLE_BODY_CHARS; +} + +function buildPortableSyncManifest( + response: Omit, + validationOk: boolean, +): PortableSyncManifest { + return { + schema: PORTABLE_SYNC_SCHEMA_VERSION, + profile: response.profile, + exportedAt: nowIso(), + outputPath: response.outputPath, + outputBytes: response.outputBytes, + sha256: sha256File(response.outputPath), + repository: response.repository, + bodyChars: response.bodyChars, + tables: response.tables, + excluded: response.excluded, + validationOk, + }; +} + +function writePortableSyncManifest(outputPath: string, manifest: PortableSyncManifest): string { + const manifestPath = `${outputPath}.manifest.json`; + fs.writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); + return manifestPath; +} + export function createPortableSyncSchema(db: SqliteDatabase): void { db.exec(` create table portable_metadata (key text primary key, value text not null); @@ -577,6 +644,12 @@ function fileSize(filePath: string): number { } } +function sha256File(filePath: string): string { + const hash = crypto.createHash('sha256'); + hash.update(fs.readFileSync(filePath)); + return hash.digest('hex'); +} + function nowIso(): string { return new Date().toISOString(); } diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 8cef642..8422f85 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -313,6 +313,10 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.equal(response.repository.fullName, 'openclaw/openclaw'); assert.equal(response.outputPath, outputPath); assert.ok(response.outputBytes < response.sourceBytes); + assert.equal(response.profile, 'default'); + assert.equal(response.manifestPath, null); + assert.equal(response.manifest.validationOk, true); + assert.equal(response.manifest.sha256.length, 64); assert.ok(response.excluded.includes('documents')); assert.ok(response.excluded.includes('thread_vectors')); assert.equal(response.tables.find((table) => table.name === 'threads')?.rows, 1); @@ -355,6 +359,19 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl }; assert.equal(sourceThread.raw_json, hugeRaw); assert.equal(sourceThread.body, longBody); + + const leanOutputPath = path.join(config.configDir, 'portable-lean.sync.db'); + const leanResponse = service.exportPortableSync({ + owner: 'openclaw', + repo: 'openclaw', + outputPath: leanOutputPath, + profile: 'lean', + writeManifest: true, + }); + assert.equal(leanResponse.bodyChars, 256); + assert.equal(leanResponse.profile, 'lean'); + assert.equal(leanResponse.manifestPath, `${leanOutputPath}.manifest.json`); + assert.equal(fs.existsSync(leanResponse.manifestPath), true); } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 67235a2..43ddea8 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -106,11 +106,11 @@ import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.j import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { - DEFAULT_PORTABLE_BODY_CHARS, exportPortableSyncDatabase, portableSyncSizeReport, validatePortableSyncDatabase, type PortableSyncExportResponse, + type PortableSyncProfile, type PortableSyncSizeResponse, type PortableSyncValidationResponse, } from './portable/sync-store.js'; @@ -462,6 +462,8 @@ type PortableSyncExportOptions = { repo: string; outputPath?: string; bodyChars?: number; + profile?: PortableSyncProfile; + writeManifest?: boolean; }; type SearchResultInternal = SearchResponse; @@ -3409,7 +3411,9 @@ export class GHCrawlService { sourceDb: this.db, sourcePath, outputPath, - bodyChars: params.bodyChars ?? DEFAULT_PORTABLE_BODY_CHARS, + bodyChars: params.bodyChars, + profile: params.profile, + writeManifest: params.writeManifest, }); } From ff63616b319b5ea9ab2c6b7c564a69e1b80ab3ef Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:43:55 -0700 Subject: [PATCH 128/215] feat: compare portable sync drift --- apps/cli/src/main.test.ts | 34 +++ apps/cli/src/main.ts | 23 ++ packages/api-core/src/portable/sync-store.ts | 280 +++++++++++++++++++ packages/api-core/src/service.test.ts | 12 + packages/api-core/src/service.ts | 11 + 5 files changed, 360 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 622da79..04f26a8 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -48,6 +48,7 @@ const publicCommands = [ 'export-sync', 'validate-sync', 'portable-size', + 'sync-status', 'refresh', 'optimize', 'runs', @@ -839,6 +840,39 @@ test('export-sync command forwards profile and manifest options', async () => { assert.match(stdout.read(), /"profile": "lean"/); }); +test('sync-status command forwards portable path', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.portableSyncStatus; + let received: unknown; + + GHCrawlService.prototype.portableSyncStatus = function portableSyncStatusStub(params: unknown) { + received = params; + return { + ok: true, + portableRepositoryFound: true, + drift: { changedThreads: 0 }, + } as never; + }; + + try { + await run(['sync-status', 'openclaw/openclaw', '--portable', '/tmp/openclaw.sync.db'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.portableSyncStatus = original; + context.cleanup(); + } + + assert.deepEqual(received, { + owner: 'openclaw', + repo: 'openclaw', + portablePath: '/tmp/openclaw.sync.db', + }); + assert.match(stdout.read(), /"portableRepositoryFound": true/); +}); + test('refresh command forwards include-code hydration flag', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index cdd594d..9298e18 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -26,6 +26,7 @@ type CommandName = | 'export-sync' | 'validate-sync' | 'portable-size' + | 'sync-status' | 'refresh' | 'optimize' | 'runs' @@ -179,6 +180,14 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl portable-size ./openclaw.sync.db --json'], agentJson: true, }, + { + name: 'sync-status', + synopsis: 'sync-status --portable [--json]', + description: 'Compare the live repository store against a portable git-sync SQLite database.', + options: ['--portable Portable SQLite path to compare', '--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl sync-status openclaw/openclaw --portable ./openclaw.sync.db --json'], + agentJson: true, + }, { name: 'refresh', synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', @@ -646,6 +655,7 @@ export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepo output: { type: 'string' }, profile: { type: 'string' }, manifest: { type: 'boolean' }, + portable: { type: 'string' }, 'no-sync': { type: 'boolean' }, 'no-embed': { type: 'boolean' }, 'no-cluster': { type: 'boolean' }, @@ -1124,6 +1134,19 @@ export async function run( writeJson(stdout, result); return; } + case 'sync-status': { + const { owner, repo, values } = parseRepoFlags('sync-status', rest); + if (typeof values.portable !== 'string') { + throw new CliUsageError('Missing --portable', 'sync-status'); + } + const result = getService().portableSyncStatus({ + owner, + repo, + portablePath: values.portable, + }); + writeJson(stdout, result); + return; + } case 'refresh': { const { owner, repo, values } = parseRepoFlags('refresh', rest); const heapDiagnostics = createOptionalHeapDiagnostics(values, stderr, 'refresh'); diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index f7c1fca..808fe7c 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -118,6 +118,49 @@ export type PortableSyncSizeResponse = { tables: Array<{ name: string; bytes: number | null; rows: number | null }>; }; +export type PortableSyncStatusResponse = { + ok: true; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + portablePath: string; + portableRepositoryFound: boolean; + live: PortableRepoSnapshot; + portable: PortableRepoSnapshot; + drift: { + liveOnlyThreads: number; + portableOnlyThreads: number; + changedThreads: number; + liveOnlyClusters: number; + portableOnlyClusters: number; + changedClusters: number; + liveOnlyMemberships: number; + portableOnlyMemberships: number; + changedMemberships: number; + }; +}; + +type PortableRepoSnapshot = { + threads: { + total: number; + open: number; + closed: number; + issues: number; + pullRequests: number; + latestUpdatedAt: string | null; + }; + clusters: { + groups: number; + memberships: number; + overrides: number; + aliases: number; + closures: number; + }; +}; + export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; const bodyChars = params.bodyChars ?? bodyCharsForProfile(params.profile); @@ -198,6 +241,7 @@ export function exportPortableSyncDatabase(params: PortableSyncExportOptions): P const validation = validatePortableSyncDatabase(outputPath); const manifest = buildPortableSyncManifest(responseBase, validation.ok); const manifestPath = params.writeManifest ? writePortableSyncManifest(outputPath, manifest) : null; + return { ...responseBase, manifestPath, @@ -461,6 +505,60 @@ export function portableSyncSizeReport(dbPath: string): PortableSyncSizeResponse } } +export function portableSyncStatusReport(params: { + liveDb: SqliteDatabase; + repository: RepositoryDto; + portablePath: string; +}): PortableSyncStatusResponse { + const resolvedPath = path.resolve(params.portablePath); + const portableDb = openReadonlyDb(resolvedPath); + try { + const portableRepo = portableDb + .prepare('select id from repositories where full_name = ?') + .get(params.repository.fullName) as { id: number } | undefined; + const portableRepoId = portableRepo?.id ?? null; + const liveSnapshot = readRepoSnapshot(params.liveDb, params.repository.id); + const portableSnapshot = portableRepoId === null ? emptyRepoSnapshot() : readRepoSnapshot(portableDb, portableRepoId); + + const liveThreads = readThreadComparableRows(params.liveDb, params.repository.id); + const portableThreads = portableRepoId === null ? [] : readThreadComparableRows(portableDb, portableRepoId); + const liveClusters = readClusterComparableRows(params.liveDb, params.repository.id); + const portableClusters = portableRepoId === null ? [] : readClusterComparableRows(portableDb, portableRepoId); + const liveMemberships = readMembershipComparableRows(params.liveDb, params.repository.id); + const portableMemberships = portableRepoId === null ? [] : readMembershipComparableRows(portableDb, portableRepoId); + const threadDrift = compareComparableRows(liveThreads, portableThreads); + const clusterDrift = compareComparableRows(liveClusters, portableClusters); + const membershipDrift = compareComparableRows(liveMemberships, portableMemberships); + + return { + ok: true, + repository: { + id: params.repository.id, + owner: params.repository.owner, + name: params.repository.name, + fullName: params.repository.fullName, + }, + portablePath: resolvedPath, + portableRepositoryFound: portableRepoId !== null, + live: liveSnapshot, + portable: portableSnapshot, + drift: { + liveOnlyThreads: threadDrift.liveOnly, + portableOnlyThreads: threadDrift.portableOnly, + changedThreads: threadDrift.changed, + liveOnlyClusters: clusterDrift.liveOnly, + portableOnlyClusters: clusterDrift.portableOnly, + changedClusters: clusterDrift.changed, + liveOnlyMemberships: membershipDrift.liveOnly, + portableOnlyMemberships: membershipDrift.portableOnly, + changedMemberships: membershipDrift.changed, + }, + }; + } finally { + portableDb.close(); + } +} + export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { const exportedAt = nowIso(); const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); @@ -569,6 +667,188 @@ function countRows(db: SqliteDatabase, tableName: string): number { return row.count; } +function emptyRepoSnapshot(): PortableRepoSnapshot { + return { + threads: { + total: 0, + open: 0, + closed: 0, + issues: 0, + pullRequests: 0, + latestUpdatedAt: null, + }, + clusters: { + groups: 0, + memberships: 0, + overrides: 0, + aliases: 0, + closures: 0, + }, + }; +} + +function readRepoSnapshot(db: SqliteDatabase, repoId: number): PortableRepoSnapshot { + const threads = db + .prepare( + `select + count(*) as total, + sum(case when state = 'open' and closed_at_local is null then 1 else 0 end) as open, + sum(case when state <> 'open' or closed_at_local is not null then 1 else 0 end) as closed, + sum(case when kind = 'issue' then 1 else 0 end) as issues, + sum(case when kind = 'pull_request' then 1 else 0 end) as pull_requests, + max(coalesce(updated_at_gh, updated_at)) as latest_updated_at + from threads + where repo_id = ?`, + ) + .get(repoId) as { + total: number; + open: number | null; + closed: number | null; + issues: number | null; + pull_requests: number | null; + latest_updated_at: string | null; + }; + const clusters = db + .prepare( + `select + (select count(*) from cluster_groups where repo_id = ?) as groups_count, + (select count(*) + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + where cg.repo_id = ?) as memberships_count, + (select count(*) from cluster_overrides where repo_id = ?) as overrides_count, + (select count(*) + from cluster_aliases ca + join cluster_groups cg on cg.id = ca.cluster_id + where cg.repo_id = ?) as aliases_count, + (select count(*) + from cluster_closures cc + join cluster_groups cg on cg.id = cc.cluster_id + where cg.repo_id = ?) as closures_count`, + ) + .get(repoId, repoId, repoId, repoId, repoId) as { + groups_count: number; + memberships_count: number; + overrides_count: number; + aliases_count: number; + closures_count: number; + }; + + return { + threads: { + total: threads.total, + open: threads.open ?? 0, + closed: threads.closed ?? 0, + issues: threads.issues ?? 0, + pullRequests: threads.pull_requests ?? 0, + latestUpdatedAt: threads.latest_updated_at, + }, + clusters: { + groups: clusters.groups_count, + memberships: clusters.memberships_count, + overrides: clusters.overrides_count, + aliases: clusters.aliases_count, + closures: clusters.closures_count, + }, + }; +} + +type ComparableRow = { key: string; value: string }; + +function readThreadComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select kind, number, state, title, content_hash, updated_at_gh, closed_at_gh, closed_at_local + from threads + where repo_id = ? + order by kind, number`, + ) + .all(repoId) as Array<{ + kind: string; + number: number; + state: string; + title: string; + content_hash: string; + updated_at_gh: string | null; + closed_at_gh: string | null; + closed_at_local: string | null; + }>; + return rows.map((row) => ({ + key: `${row.kind}:${row.number}`, + value: JSON.stringify([row.state, row.title, row.content_hash, row.updated_at_gh, row.closed_at_gh, row.closed_at_local]), + })); +} + +function readClusterComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select stable_key, stable_slug, status, cluster_type, title, closed_at + from cluster_groups + where repo_id = ? + order by stable_key`, + ) + .all(repoId) as Array<{ + stable_key: string; + stable_slug: string; + status: string; + cluster_type: string | null; + title: string | null; + closed_at: string | null; + }>; + return rows.map((row) => ({ + key: row.stable_key, + value: JSON.stringify([row.stable_slug, row.status, row.cluster_type, row.title, row.closed_at]), + })); +} + +function readMembershipComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select cg.stable_key, t.kind, t.number, cm.role, cm.state, cm.score_to_representative, cm.added_by, cm.removed_by, cm.removed_at + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + order by cg.stable_key, t.kind, t.number`, + ) + .all(repoId) as Array<{ + stable_key: string; + kind: string; + number: number; + role: string; + state: string; + score_to_representative: number | null; + added_by: string; + removed_by: string | null; + removed_at: string | null; + }>; + return rows.map((row) => ({ + key: `${row.stable_key}:${row.kind}:${row.number}`, + value: JSON.stringify([row.role, row.state, row.score_to_representative, row.added_by, row.removed_by, row.removed_at]), + })); +} + +function compareComparableRows(liveRows: ComparableRow[], portableRows: ComparableRow[]): { liveOnly: number; portableOnly: number; changed: number } { + const live = new Map(liveRows.map((row) => [row.key, row.value])); + const portable = new Map(portableRows.map((row) => [row.key, row.value])); + let liveOnly = 0; + let portableOnly = 0; + let changed = 0; + + for (const [key, value] of live) { + if (!portable.has(key)) { + liveOnly += 1; + } else if (portable.get(key) !== value) { + changed += 1; + } + } + for (const key of portable.keys()) { + if (!live.has(key)) portableOnly += 1; + } + + return { liveOnly, portableOnly, changed }; +} + function openReadonlyDb(dbPath: string): SqliteDatabase { return new BetterSqlite3(dbPath, { readonly: true, fileMustExist: true }); } diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 8422f85..bf7860a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -333,6 +333,18 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.ok(size.totalBytes > 0); assert.ok((size.tables.find((table) => table.name === 'threads')?.bytes ?? 0) > 0); + const status = service.portableSyncStatus({ + owner: 'openclaw', + repo: 'openclaw', + portablePath: outputPath, + }); + assert.equal(status.portableRepositoryFound, true); + assert.equal(status.live.threads.total, 1); + assert.equal(status.portable.threads.total, 1); + assert.equal(status.drift.liveOnlyThreads, 0); + assert.equal(status.drift.portableOnlyThreads, 0); + assert.equal(status.drift.changedThreads, 0); + const portable = openDb(outputPath); try { const thread = portable.prepare('select body_excerpt, body_length from threads where number = 42').get() as { diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 43ddea8..8e5dfdc 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -108,10 +108,12 @@ import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { exportPortableSyncDatabase, portableSyncSizeReport, + portableSyncStatusReport, validatePortableSyncDatabase, type PortableSyncExportResponse, type PortableSyncProfile, type PortableSyncSizeResponse, + type PortableSyncStatusResponse, type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; @@ -3425,6 +3427,15 @@ export class GHCrawlService { return portableSyncSizeReport(dbPath); } + portableSyncStatus(params: { owner: string; repo: string; portablePath: string }): PortableSyncStatusResponse { + const repository = this.requireRepository(params.owner, params.repo); + return portableSyncStatusReport({ + liveDb: this.db, + repository, + portablePath: params.portablePath, + }); + } + private optimizeSqliteTarget(params: { name: 'main' | 'vector'; db: SqliteDatabase; From cd92ea4ace9ead42f929ea46c967666703613209 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:47:05 -0700 Subject: [PATCH 129/215] feat: import portable sync stores --- apps/cli/src/main.test.ts | 30 + apps/cli/src/main.ts | 18 + packages/api-core/src/portable/sync-store.ts | 584 +++++++++++++++++++ packages/api-core/src/service.test.ts | 24 + packages/api-core/src/service.ts | 9 + 5 files changed, 665 insertions(+) diff --git a/apps/cli/src/main.test.ts b/apps/cli/src/main.test.ts index 04f26a8..0e0b0e9 100644 --- a/apps/cli/src/main.test.ts +++ b/apps/cli/src/main.test.ts @@ -49,6 +49,7 @@ const publicCommands = [ 'validate-sync', 'portable-size', 'sync-status', + 'import-sync', 'refresh', 'optimize', 'runs', @@ -873,6 +874,35 @@ test('sync-status command forwards portable path', async () => { assert.match(stdout.read(), /"portableRepositoryFound": true/); }); +test('import-sync command forwards portable path', async () => { + const stdout = createWritableCapture(); + const context = makeRunContext(); + const original = GHCrawlService.prototype.importPortableSync; + let received: unknown; + + GHCrawlService.prototype.importPortableSync = function importPortableSyncStub(dbPath: string) { + received = dbPath; + return { + ok: true, + repository: { fullName: 'openclaw/openclaw' }, + imported: { threads: 1 }, + } as never; + }; + + try { + await run(['import-sync', '/tmp/openclaw.sync.db'], stdout.stream, { + env: context.env, + cwd: context.cwd, + }); + } finally { + GHCrawlService.prototype.importPortableSync = original; + context.cleanup(); + } + + assert.equal(received, '/tmp/openclaw.sync.db'); + assert.match(stdout.read(), /"threads": 1/); +}); + test('refresh command forwards include-code hydration flag', async () => { const stdout = createWritableCapture(); const context = makeRunContext(); diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 9298e18..5f017b0 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -27,6 +27,7 @@ type CommandName = | 'validate-sync' | 'portable-size' | 'sync-status' + | 'import-sync' | 'refresh' | 'optimize' | 'runs' @@ -188,6 +189,14 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ examples: ['ghcrawl sync-status openclaw/openclaw --portable ./openclaw.sync.db --json'], agentJson: true, }, + { + name: 'import-sync', + synopsis: 'import-sync [--json]', + description: 'Import a portable git-sync SQLite database into the configured live store.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl import-sync ./openclaw.sync.db --json'], + agentJson: true, + }, { name: 'refresh', synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', @@ -1147,6 +1156,15 @@ export async function run( writeJson(stdout, result); return; } + case 'import-sync': { + const parsed = parseArgsForCommand('import-sync', rest, { json: { type: 'boolean' } }, true); + if (parsed.positionals.length !== 1) { + throw new CliUsageError('import-sync requires exactly one portable database path', 'import-sync'); + } + const result = getService().importPortableSync(parsed.positionals[0]); + writeJson(stdout, result); + return; + } case 'refresh': { const { owner, repo, values } = parseRepoFlags('refresh', rest); const heapDiagnostics = createOptionalHeapDiagnostics(values, stderr, 'refresh'); diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 808fe7c..8b6818a 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -143,6 +143,32 @@ export type PortableSyncStatusResponse = { }; }; +export type PortableSyncImportResponse = { + ok: true; + path: string; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + validationOk: boolean; + imported: { + repositories: number; + threads: number; + threadRevisions: number; + threadFingerprints: number; + threadKeySummaries: number; + repoSyncState: number; + repoPipelineState: number; + clusterGroups: number; + clusterMemberships: number; + clusterOverrides: number; + clusterAliases: number; + clusterClosures: number; + }; +}; + type PortableRepoSnapshot = { threads: { total: number; @@ -559,6 +585,115 @@ export function portableSyncStatusReport(params: { } } +export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; portablePath: string }): PortableSyncImportResponse { + const resolvedPath = path.resolve(params.portablePath); + const validation = validatePortableSyncDatabase(resolvedPath); + if (!validation.ok) { + throw new Error(`Portable sync validation failed: ${validation.errors.join('; ')}`); + } + + const portableDb = openReadonlyDb(resolvedPath); + try { + const portableRepo = portableDb.prepare('select * from repositories order by id limit 1').get() as PortableRepositoryRow | undefined; + if (!portableRepo) { + throw new Error('Portable sync database has no repository row'); + } + + const imported = emptyImportCounts(); + const threadIdMap = new Map(); + const revisionIdMap = new Map(); + const clusterIdMap = new Map(); + + const runImport = params.liveDb.transaction(() => { + const repoId = upsertImportedRepository(params.liveDb, portableRepo); + imported.repositories = 1; + + for (const thread of readPortableThreads(portableDb, portableRepo.id)) { + threadIdMap.set(thread.id, upsertImportedThread(params.liveDb, repoId, thread)); + imported.threads += 1; + } + + for (const revision of readPortableThreadRevisions(portableDb)) { + const liveThreadId = threadIdMap.get(revision.thread_id); + if (!liveThreadId) continue; + revisionIdMap.set(revision.id, upsertImportedThreadRevision(params.liveDb, liveThreadId, revision)); + imported.threadRevisions += 1; + } + + for (const fingerprint of readPortableThreadFingerprints(portableDb)) { + const liveRevisionId = revisionIdMap.get(fingerprint.thread_revision_id); + if (!liveRevisionId) continue; + upsertImportedThreadFingerprint(params.liveDb, liveRevisionId, fingerprint); + imported.threadFingerprints += 1; + } + + for (const summary of readPortableThreadKeySummaries(portableDb)) { + const liveRevisionId = revisionIdMap.get(summary.thread_revision_id); + if (!liveRevisionId) continue; + upsertImportedThreadKeySummary(params.liveDb, liveRevisionId, summary); + imported.threadKeySummaries += 1; + } + + if (upsertImportedRepoSyncState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoSyncState = 1; + if (upsertImportedRepoPipelineState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoPipelineState = 1; + + for (const cluster of readPortableClusterGroups(portableDb, portableRepo.id)) { + const representativeThreadId = cluster.representative_thread_id ? (threadIdMap.get(cluster.representative_thread_id) ?? null) : null; + clusterIdMap.set(cluster.id, upsertImportedClusterGroup(params.liveDb, repoId, representativeThreadId, cluster)); + imported.clusterGroups += 1; + } + + for (const membership of readPortableClusterMemberships(portableDb)) { + const liveClusterId = clusterIdMap.get(membership.cluster_id); + const liveThreadId = threadIdMap.get(membership.thread_id); + if (!liveClusterId || !liveThreadId) continue; + upsertImportedClusterMembership(params.liveDb, liveClusterId, liveThreadId, membership); + imported.clusterMemberships += 1; + } + + for (const override of readPortableClusterOverrides(portableDb, portableRepo.id)) { + const liveClusterId = clusterIdMap.get(override.cluster_id); + const liveThreadId = threadIdMap.get(override.thread_id); + if (!liveClusterId || !liveThreadId) continue; + upsertImportedClusterOverride(params.liveDb, repoId, liveClusterId, liveThreadId, override); + imported.clusterOverrides += 1; + } + + for (const alias of readPortableClusterAliases(portableDb)) { + const liveClusterId = clusterIdMap.get(alias.cluster_id); + if (!liveClusterId) continue; + upsertImportedClusterAlias(params.liveDb, liveClusterId, alias); + imported.clusterAliases += 1; + } + + for (const closure of readPortableClusterClosures(portableDb)) { + const liveClusterId = clusterIdMap.get(closure.cluster_id); + if (!liveClusterId) continue; + upsertImportedClusterClosure(params.liveDb, liveClusterId, closure); + imported.clusterClosures += 1; + } + + return repoId; + }); + + const repoId = runImport(); + return { + ok: true, + path: resolvedPath, + repository: { + id: repoId, + owner: portableRepo.owner, + name: portableRepo.name, + fullName: portableRepo.full_name, + }, + validationOk: validation.ok, + imported, + }; + } finally { + portableDb.close(); + } +} + export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { const exportedAt = nowIso(); const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); @@ -849,6 +984,455 @@ function compareComparableRows(liveRows: ComparableRow[], portableRows: Comparab return { liveOnly, portableOnly, changed }; } +type PortableRepositoryRow = { + id: number; + owner: string; + name: string; + full_name: string; + github_repo_id: string | null; + updated_at: string; +}; + +type PortableThreadRow = { + id: number; + github_id: string; + number: number; + kind: string; + state: string; + title: string; + body_excerpt: string | null; + author_login: string | null; + author_type: string | null; + html_url: string; + labels_json: string; + assignees_json: string; + content_hash: string; + is_draft: number; + created_at_gh: string | null; + updated_at_gh: string | null; + closed_at_gh: string | null; + merged_at_gh: string | null; + first_pulled_at: string | null; + last_pulled_at: string | null; + updated_at: string; + closed_at_local: string | null; + close_reason_local: string | null; +}; + +type PortableThreadRevisionRow = { + id: number; + thread_id: number; + source_updated_at: string | null; + content_hash: string; + title_hash: string; + body_hash: string; + labels_hash: string; + created_at: string; +}; + +type PortableThreadFingerprintRow = Record & { + thread_revision_id: number; +}; + +type PortableThreadKeySummaryRow = Record & { + thread_revision_id: number; +}; + +type PortableClusterGroupRow = Record & { + id: number; + representative_thread_id: number | null; +}; + +type PortableClusterMembershipRow = Record & { + cluster_id: number; + thread_id: number; +}; + +type PortableClusterOverrideRow = Record & { + cluster_id: number; + thread_id: number; +}; + +type PortableClusterAliasRow = Record & { + cluster_id: number; +}; + +type PortableClusterClosureRow = Record & { + cluster_id: number; +}; + +function emptyImportCounts(): PortableSyncImportResponse['imported'] { + return { + repositories: 0, + threads: 0, + threadRevisions: 0, + threadFingerprints: 0, + threadKeySummaries: 0, + repoSyncState: 0, + repoPipelineState: 0, + clusterGroups: 0, + clusterMemberships: 0, + clusterOverrides: 0, + clusterAliases: 0, + clusterClosures: 0, + }; +} + +function readPortableThreads(db: SqliteDatabase, repoId: number): PortableThreadRow[] { + return db.prepare('select * from threads where repo_id = ? order by id').all(repoId) as PortableThreadRow[]; +} + +function readPortableThreadRevisions(db: SqliteDatabase): PortableThreadRevisionRow[] { + return db.prepare('select * from thread_revisions order by id').all() as PortableThreadRevisionRow[]; +} + +function readPortableThreadFingerprints(db: SqliteDatabase): PortableThreadFingerprintRow[] { + return db.prepare('select * from thread_fingerprints order by id').all() as PortableThreadFingerprintRow[]; +} + +function readPortableThreadKeySummaries(db: SqliteDatabase): PortableThreadKeySummaryRow[] { + return db.prepare('select * from thread_key_summaries order by id').all() as PortableThreadKeySummaryRow[]; +} + +function readPortableClusterGroups(db: SqliteDatabase, repoId: number): PortableClusterGroupRow[] { + return db.prepare('select * from cluster_groups where repo_id = ? order by id').all(repoId) as PortableClusterGroupRow[]; +} + +function readPortableClusterMemberships(db: SqliteDatabase): PortableClusterMembershipRow[] { + return db.prepare('select * from cluster_memberships order by cluster_id, thread_id').all() as PortableClusterMembershipRow[]; +} + +function readPortableClusterOverrides(db: SqliteDatabase, repoId: number): PortableClusterOverrideRow[] { + return db.prepare('select * from cluster_overrides where repo_id = ? order by id').all(repoId) as PortableClusterOverrideRow[]; +} + +function readPortableClusterAliases(db: SqliteDatabase): PortableClusterAliasRow[] { + return db.prepare('select * from cluster_aliases order by cluster_id, alias_slug').all() as PortableClusterAliasRow[]; +} + +function readPortableClusterClosures(db: SqliteDatabase): PortableClusterClosureRow[] { + return db.prepare('select * from cluster_closures order by cluster_id').all() as PortableClusterClosureRow[]; +} + +function upsertImportedRepository(db: SqliteDatabase, row: PortableRepositoryRow): number { + db.prepare( + `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, '{}', ?) + on conflict(full_name) do update set + owner = excluded.owner, + name = excluded.name, + github_repo_id = excluded.github_repo_id, + updated_at = excluded.updated_at`, + ).run(row.owner, row.name, row.full_name, row.github_repo_id, row.updated_at); + const live = db.prepare('select id from repositories where full_name = ?').get(row.full_name) as { id: number }; + return live.id; +} + +function upsertImportedThread(db: SqliteDatabase, repoId: number, row: PortableThreadRow): number { + db.prepare( + `insert into threads ( + repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '{}', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, kind, number) do update set + github_id = excluded.github_id, + state = excluded.state, + title = excluded.title, + body = coalesce(threads.body, excluded.body), + author_login = excluded.author_login, + author_type = excluded.author_type, + html_url = excluded.html_url, + labels_json = excluded.labels_json, + assignees_json = excluded.assignees_json, + content_hash = excluded.content_hash, + is_draft = excluded.is_draft, + created_at_gh = excluded.created_at_gh, + updated_at_gh = excluded.updated_at_gh, + closed_at_gh = excluded.closed_at_gh, + merged_at_gh = excluded.merged_at_gh, + first_pulled_at = coalesce(threads.first_pulled_at, excluded.first_pulled_at), + last_pulled_at = excluded.last_pulled_at, + updated_at = excluded.updated_at, + closed_at_local = excluded.closed_at_local, + close_reason_local = excluded.close_reason_local`, + ).run( + repoId, + row.github_id, + row.number, + row.kind, + row.state, + row.title, + row.body_excerpt, + row.author_login, + row.author_type, + row.html_url, + row.labels_json, + row.assignees_json, + row.content_hash, + row.is_draft, + row.created_at_gh, + row.updated_at_gh, + row.closed_at_gh, + row.merged_at_gh, + row.first_pulled_at, + row.last_pulled_at, + row.updated_at, + row.closed_at_local, + row.close_reason_local, + ); + const live = db.prepare('select id from threads where repo_id = ? and kind = ? and number = ?').get(repoId, row.kind, row.number) as { id: number }; + return live.id; +} + +function upsertImportedThreadRevision(db: SqliteDatabase, liveThreadId: number, row: PortableThreadRevisionRow): number { + db.prepare( + `insert into thread_revisions (thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + values (?, ?, ?, ?, ?, ?, ?) + on conflict(thread_id, content_hash) do update set + source_updated_at = excluded.source_updated_at, + title_hash = excluded.title_hash, + body_hash = excluded.body_hash, + labels_hash = excluded.labels_hash`, + ).run(liveThreadId, row.source_updated_at, row.content_hash, row.title_hash, row.body_hash, row.labels_hash, row.created_at); + const live = db.prepare('select id from thread_revisions where thread_id = ? and content_hash = ?').get(liveThreadId, row.content_hash) as { + id: number; + }; + return live.id; +} + +function upsertImportedThreadFingerprint(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadFingerprintRow): void { + db.prepare( + `insert into thread_fingerprints ( + thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, algorithm_version) do update set + fingerprint_hash = excluded.fingerprint_hash, + fingerprint_slug = excluded.fingerprint_slug, + title_tokens_json = excluded.title_tokens_json, + body_token_hash = excluded.body_token_hash, + linked_refs_json = excluded.linked_refs_json, + file_set_hash = excluded.file_set_hash, + module_buckets_json = excluded.module_buckets_json, + simhash64 = excluded.simhash64, + feature_json = excluded.feature_json`, + ).run( + liveRevisionId, + row.algorithm_version, + row.fingerprint_hash, + row.fingerprint_slug, + row.title_tokens_json, + row.body_token_hash, + row.linked_refs_json, + row.file_set_hash, + row.module_buckets_json, + row.simhash64, + row.feature_json, + row.created_at, + ); +} + +function upsertImportedThreadKeySummary(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadKeySummaryRow): void { + db.prepare( + `insert into thread_key_summaries ( + thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, summary_kind, prompt_version, provider, model) do update set + input_hash = excluded.input_hash, + output_hash = excluded.output_hash, + key_text = excluded.key_text, + created_at = excluded.created_at`, + ).run( + liveRevisionId, + row.summary_kind, + row.prompt_version, + row.provider, + row.model, + row.input_hash, + row.output_hash, + row.key_text, + row.created_at, + ); +} + +function upsertImportedRepoSyncState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { + const row = portableDb.prepare('select * from repo_sync_state where repo_id = ?').get(portableRepoId) as Record | undefined; + if (!row) return false; + db.prepare( + `insert into repo_sync_state ( + repo_id, last_full_open_scan_started_at, last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at, last_open_close_reconciled_at, updated_at + ) + values (?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + last_full_open_scan_started_at = excluded.last_full_open_scan_started_at, + last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at, + last_open_close_reconciled_at = excluded.last_open_close_reconciled_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + row.last_full_open_scan_started_at, + row.last_overlapping_open_scan_completed_at, + row.last_non_overlapping_scan_completed_at, + row.last_open_close_reconciled_at, + row.updated_at, + ); + return true; +} + +function upsertImportedRepoPipelineState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { + const row = portableDb.prepare('select * from repo_pipeline_state where repo_id = ?').get(portableRepoId) as Record | undefined; + if (!row) return false; + db.prepare( + `insert into repo_pipeline_state ( + repo_id, summary_model, summary_prompt_version, embedding_basis, embed_model, embed_dimensions, + embed_pipeline_version, vector_backend, vectors_current_at, clusters_current_at, updated_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + summary_model = excluded.summary_model, + summary_prompt_version = excluded.summary_prompt_version, + embedding_basis = excluded.embedding_basis, + embed_model = excluded.embed_model, + embed_dimensions = excluded.embed_dimensions, + embed_pipeline_version = excluded.embed_pipeline_version, + vector_backend = excluded.vector_backend, + vectors_current_at = excluded.vectors_current_at, + clusters_current_at = excluded.clusters_current_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + row.summary_model, + row.summary_prompt_version, + row.embedding_basis, + row.embed_model, + row.embed_dimensions, + row.embed_pipeline_version, + row.vector_backend, + row.vectors_current_at, + row.clusters_current_at, + row.updated_at, + ); + return true; +} + +function upsertImportedClusterGroup( + db: SqliteDatabase, + repoId: number, + representativeThreadId: number | null, + row: PortableClusterGroupRow, +): number { + db.prepare( + `insert into cluster_groups ( + repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, stable_key) do update set + stable_slug = excluded.stable_slug, + status = excluded.status, + cluster_type = excluded.cluster_type, + representative_thread_id = excluded.representative_thread_id, + title = excluded.title, + updated_at = excluded.updated_at, + closed_at = excluded.closed_at`, + ).run( + repoId, + row.stable_key, + row.stable_slug, + row.status, + row.cluster_type, + representativeThreadId, + row.title, + row.created_at, + row.updated_at, + row.closed_at, + ); + const live = db.prepare('select id from cluster_groups where repo_id = ? and stable_key = ?').get(repoId, row.stable_key) as { id: number }; + return live.id; +} + +function upsertImportedClusterMembership( + db: SqliteDatabase, + liveClusterId: number, + liveThreadId: number, + row: PortableClusterMembershipRow, +): void { + db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(cluster_id, thread_id) do update set + role = excluded.role, + state = excluded.state, + score_to_representative = excluded.score_to_representative, + last_seen_run_id = excluded.last_seen_run_id, + added_by = excluded.added_by, + removed_by = excluded.removed_by, + added_reason_json = excluded.added_reason_json, + removed_reason_json = excluded.removed_reason_json, + updated_at = excluded.updated_at, + removed_at = excluded.removed_at`, + ).run( + liveClusterId, + liveThreadId, + row.role, + row.state, + row.score_to_representative, + row.first_seen_run_id, + row.last_seen_run_id, + row.added_by, + row.removed_by, + row.added_reason_json, + row.removed_reason_json, + row.created_at, + row.updated_at, + row.removed_at, + ); +} + +function upsertImportedClusterOverride( + db: SqliteDatabase, + repoId: number, + liveClusterId: number, + liveThreadId: number, + row: PortableClusterOverrideRow, +): void { + db.prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at) + values (?, ?, ?, ?, ?, ?, ?, ?) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + actor_id = excluded.actor_id, + expires_at = excluded.expires_at`, + ).run(repoId, liveClusterId, liveThreadId, row.action, row.actor_id, row.reason, row.created_at, row.expires_at); +} + +function upsertImportedClusterAlias(db: SqliteDatabase, liveClusterId: number, row: PortableClusterAliasRow): void { + db.prepare( + `insert into cluster_aliases (cluster_id, alias_slug, reason, created_at) + values (?, ?, ?, ?) + on conflict(cluster_id, alias_slug) do update set reason = excluded.reason`, + ).run(liveClusterId, row.alias_slug, row.reason, row.created_at); +} + +function upsertImportedClusterClosure(db: SqliteDatabase, liveClusterId: number, row: PortableClusterClosureRow): void { + db.prepare( + `insert into cluster_closures (cluster_id, reason, actor_kind, created_at, updated_at) + values (?, ?, ?, ?, ?) + on conflict(cluster_id) do update set + reason = excluded.reason, + actor_kind = excluded.actor_kind, + updated_at = excluded.updated_at`, + ).run(liveClusterId, row.reason, row.actor_kind, row.created_at, row.updated_at); +} + function openReadonlyDb(dbPath: string): SqliteDatabase { return new BetterSqlite3(dbPath, { readonly: true, fileMustExist: true }); } diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index bf7860a..5f1e947 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -384,6 +384,30 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.equal(leanResponse.profile, 'lean'); assert.equal(leanResponse.manifestPath, `${leanOutputPath}.manifest.json`); assert.equal(fs.existsSync(leanResponse.manifestPath), true); + + const importService = new GHCrawlService({ + config: { + ...config, + dbPath: path.join(config.configDir, 'import-target.db'), + }, + github: service.github, + }); + try { + const importResult = importService.importPortableSync(outputPath); + assert.equal(importResult.ok, true); + assert.equal(importResult.repository.fullName, 'openclaw/openclaw'); + assert.equal(importResult.imported.threads, 1); + assert.equal(importResult.imported.clusterGroups, 1); + assert.equal(importResult.imported.clusterMemberships, 1); + const importedThread = importService.db.prepare('select body, raw_json from threads where number = 42').get() as { + body: string; + raw_json: string; + }; + assert.equal(importedThread.body.length, 64); + assert.equal(importedThread.raw_json, '{}'); + } finally { + importService.close(); + } } finally { service.close(); } diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 8e5dfdc..32cd29a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -107,10 +107,12 @@ import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { exportPortableSyncDatabase, + importPortableSyncDatabase, portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase, type PortableSyncExportResponse, + type PortableSyncImportResponse, type PortableSyncProfile, type PortableSyncSizeResponse, type PortableSyncStatusResponse, @@ -3436,6 +3438,13 @@ export class GHCrawlService { }); } + importPortableSync(dbPath: string): PortableSyncImportResponse { + return importPortableSyncDatabase({ + liveDb: this.db, + portablePath: dbPath, + }); + } + private optimizeSqliteTarget(params: { name: 'main' | 'vector'; db: SqliteDatabase; From b90ddf5ec2bea4da11c574b7cb97c70e793ac129 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:48:06 -0700 Subject: [PATCH 130/215] docs: describe portable sync workflow --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1bec272..3d581c6 100644 --- a/README.md +++ b/README.md @@ -225,12 +225,25 @@ The main SQLite database is a local cache and can grow large because it stores r Use `export-sync` to write a compact portable core DB: ```bash -ghcrawl export-sync owner/repo --output ./owner__repo.sync.db --json +ghcrawl export-sync owner/repo --profile lean --manifest --output ./owner__repo.sync.db --json +ghcrawl validate-sync ./owner__repo.sync.db --json +ghcrawl portable-size ./owner__repo.sync.db --json +ghcrawl sync-status owner/repo --portable ./owner__repo.sync.db --json ``` The export keeps the syncable state: repository metadata, issue/PR metadata, bounded body excerpts, latest revisions, deterministic fingerprints, LLM key summaries, sync/pipeline state, and durable cluster identities/memberships/overrides. It intentionally excludes bulky or rebuildable caches such as raw JSON blobs, comments, documents/FTS, vectors, code snapshots, cluster event history, run logs, and similarity edge evidence. -Default body excerpts are capped at `512` characters per thread. Raise or lower that with `--body-chars ` depending on how much preview text you want in the portable file. +Default body excerpts are capped at `512` characters per thread. Use `--profile lean` for a smaller `256` character excerpt budget, `--profile review` for `1024`, or `--body-chars ` when you need an explicit value. `--manifest` writes a JSON sidecar with the export SHA256, table counts, validation status, profile, and repository identity. + +Use `import-sync` to hydrate a configured local store from a portable DB: + +```bash +ghcrawl import-sync ./owner__repo.sync.db --json +``` + +Import preserves richer existing live-cache data where possible. For example, an existing full thread body is not replaced by a portable excerpt, and raw GitHub JSON is not invented beyond a minimal placeholder for newly imported rows. + +External CI or worker systems should call these commands from outside this repository. This repo intentionally does not include a scheduled sync workflow. By default, cluster JSON commands show locally closed clusters. Use `--hide-closed` when you only want active clusters. Thread list commands still hide locally closed issues/PRs unless `--include-closed` is passed. From 56cea734a676b68b4f5d87b1ad48e9c379cb66ec Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:52:26 -0700 Subject: [PATCH 131/215] refactor: split portable sync contract --- packages/api-core/src/portable/sync-store.ts | 197 ++----------------- packages/api-core/src/portable/types.ts | 183 +++++++++++++++++ 2 files changed, 200 insertions(+), 180 deletions(-) create mode 100644 packages/api-core/src/portable/types.ts diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 8b6818a..97ee97b 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -6,186 +6,23 @@ import BetterSqlite3 from 'better-sqlite3'; import type { RepositoryDto } from '@ghcrawl/api-contract'; import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; - -export const PORTABLE_SYNC_SCHEMA_VERSION = 'ghcrawl-portable-sync-v1'; -export const DEFAULT_PORTABLE_BODY_CHARS = 512; -export type PortableSyncProfile = 'lean' | 'review'; - -export const PORTABLE_SYNC_TABLES = [ - 'repositories', - 'threads', - 'thread_revisions', - 'thread_fingerprints', - 'thread_key_summaries', - 'repo_sync_state', - 'repo_pipeline_state', - 'cluster_groups', - 'cluster_memberships', - 'cluster_overrides', - 'cluster_aliases', - 'cluster_closures', -] as const; - -export const PORTABLE_SYNC_EXCLUDED_TABLES = [ - 'blobs', - 'comments', - 'documents', - 'documents_fts', - 'document_embeddings', - 'thread_vectors', - 'thread_code_snapshots', - 'thread_changed_files', - 'thread_hunk_signatures', - 'cluster_events', - 'pipeline_runs', - 'sync_runs', - 'summary_runs', - 'embedding_runs', - 'cluster_runs', - 'similarity_edges', - 'similarity_edge_evidence', -] as const; - -export type PortableSyncExportOptions = { - repository: RepositoryDto; - sourceDb: SqliteDatabase; - sourcePath: string; - outputPath: string; - bodyChars?: number; - profile?: PortableSyncProfile; - writeManifest?: boolean; -}; - -export type PortableSyncManifest = { - schema: string; - profile: PortableSyncProfile | 'default'; - exportedAt: string; - outputPath: string; - outputBytes: number; - sha256: string; - repository: { - id: number; - owner: string; - name: string; - fullName: string; - }; - bodyChars: number; - tables: Array<{ name: string; rows: number }>; - excluded: string[]; - validationOk: boolean; -}; - -export type PortableSyncExportResponse = { - ok: true; - repository: { - id: number; - owner: string; - name: string; - fullName: string; - }; - outputPath: string; - sourcePath: string; - sourceBytes: number; - outputBytes: number; - compressionRatio: number; - bodyChars: number; - profile: PortableSyncProfile | 'default'; - tables: Array<{ name: string; rows: number }>; - excluded: string[]; - manifestPath: string | null; - manifest: PortableSyncManifest; -}; - -export type PortableSyncValidationResponse = { - ok: boolean; - path: string; - schema: string | null; - metadata: Record; - integrity: string[]; - foreignKeyViolations: Array>; - missingTables: string[]; - unexpectedExcludedTables: string[]; - tables: Array<{ name: string; rows: number }>; - errors: string[]; -}; - -export type PortableSyncSizeResponse = { - ok: true; - path: string; - totalBytes: number; - walBytes: number; - shmBytes: number; - tables: Array<{ name: string; bytes: number | null; rows: number | null }>; -}; - -export type PortableSyncStatusResponse = { - ok: true; - repository: { - id: number; - owner: string; - name: string; - fullName: string; - }; - portablePath: string; - portableRepositoryFound: boolean; - live: PortableRepoSnapshot; - portable: PortableRepoSnapshot; - drift: { - liveOnlyThreads: number; - portableOnlyThreads: number; - changedThreads: number; - liveOnlyClusters: number; - portableOnlyClusters: number; - changedClusters: number; - liveOnlyMemberships: number; - portableOnlyMemberships: number; - changedMemberships: number; - }; -}; - -export type PortableSyncImportResponse = { - ok: true; - path: string; - repository: { - id: number; - owner: string; - name: string; - fullName: string; - }; - validationOk: boolean; - imported: { - repositories: number; - threads: number; - threadRevisions: number; - threadFingerprints: number; - threadKeySummaries: number; - repoSyncState: number; - repoPipelineState: number; - clusterGroups: number; - clusterMemberships: number; - clusterOverrides: number; - clusterAliases: number; - clusterClosures: number; - }; -}; - -type PortableRepoSnapshot = { - threads: { - total: number; - open: number; - closed: number; - issues: number; - pullRequests: number; - latestUpdatedAt: string | null; - }; - clusters: { - groups: number; - memberships: number; - overrides: number; - aliases: number; - closures: number; - }; -}; +import { + DEFAULT_PORTABLE_BODY_CHARS, + PORTABLE_SYNC_EXCLUDED_TABLES, + PORTABLE_SYNC_SCHEMA_VERSION, + PORTABLE_SYNC_TABLES, + type PortableRepoSnapshot, + type PortableSyncExportOptions, + type PortableSyncExportResponse, + type PortableSyncImportResponse, + type PortableSyncManifest, + type PortableSyncProfile, + type PortableSyncSizeResponse, + type PortableSyncStatusResponse, + type PortableSyncValidationResponse, +} from './types.js'; + +export * from './types.js'; export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; diff --git a/packages/api-core/src/portable/types.ts b/packages/api-core/src/portable/types.ts new file mode 100644 index 0000000..e44333f --- /dev/null +++ b/packages/api-core/src/portable/types.ts @@ -0,0 +1,183 @@ +import type { RepositoryDto } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; + +export const PORTABLE_SYNC_SCHEMA_VERSION = 'ghcrawl-portable-sync-v1'; +export const DEFAULT_PORTABLE_BODY_CHARS = 512; +export type PortableSyncProfile = 'lean' | 'review'; + +export const PORTABLE_SYNC_TABLES = [ + 'repositories', + 'threads', + 'thread_revisions', + 'thread_fingerprints', + 'thread_key_summaries', + 'repo_sync_state', + 'repo_pipeline_state', + 'cluster_groups', + 'cluster_memberships', + 'cluster_overrides', + 'cluster_aliases', + 'cluster_closures', +] as const; + +export const PORTABLE_SYNC_EXCLUDED_TABLES = [ + 'blobs', + 'comments', + 'documents', + 'documents_fts', + 'document_embeddings', + 'thread_vectors', + 'thread_code_snapshots', + 'thread_changed_files', + 'thread_hunk_signatures', + 'cluster_events', + 'pipeline_runs', + 'sync_runs', + 'summary_runs', + 'embedding_runs', + 'cluster_runs', + 'similarity_edges', + 'similarity_edge_evidence', +] as const; + +export type PortableSyncExportOptions = { + repository: RepositoryDto; + sourceDb: SqliteDatabase; + sourcePath: string; + outputPath: string; + bodyChars?: number; + profile?: PortableSyncProfile; + writeManifest?: boolean; +}; + +export type PortableSyncManifest = { + schema: string; + profile: PortableSyncProfile | 'default'; + exportedAt: string; + outputPath: string; + outputBytes: number; + sha256: string; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + bodyChars: number; + tables: Array<{ name: string; rows: number }>; + excluded: string[]; + validationOk: boolean; +}; + +export type PortableSyncExportResponse = { + ok: true; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + outputPath: string; + sourcePath: string; + sourceBytes: number; + outputBytes: number; + compressionRatio: number; + bodyChars: number; + profile: PortableSyncProfile | 'default'; + tables: Array<{ name: string; rows: number }>; + excluded: string[]; + manifestPath: string | null; + manifest: PortableSyncManifest; +}; + +export type PortableSyncValidationResponse = { + ok: boolean; + path: string; + schema: string | null; + metadata: Record; + integrity: string[]; + foreignKeyViolations: Array>; + missingTables: string[]; + unexpectedExcludedTables: string[]; + tables: Array<{ name: string; rows: number }>; + errors: string[]; +}; + +export type PortableSyncSizeResponse = { + ok: true; + path: string; + totalBytes: number; + walBytes: number; + shmBytes: number; + tables: Array<{ name: string; bytes: number | null; rows: number | null }>; +}; + +export type PortableRepoSnapshot = { + threads: { + total: number; + open: number; + closed: number; + issues: number; + pullRequests: number; + latestUpdatedAt: string | null; + }; + clusters: { + groups: number; + memberships: number; + overrides: number; + aliases: number; + closures: number; + }; +}; + +export type PortableSyncStatusResponse = { + ok: true; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + portablePath: string; + portableRepositoryFound: boolean; + live: PortableRepoSnapshot; + portable: PortableRepoSnapshot; + drift: { + liveOnlyThreads: number; + portableOnlyThreads: number; + changedThreads: number; + liveOnlyClusters: number; + portableOnlyClusters: number; + changedClusters: number; + liveOnlyMemberships: number; + portableOnlyMemberships: number; + changedMemberships: number; + }; +}; + +export type PortableSyncImportResponse = { + ok: true; + path: string; + repository: { + id: number; + owner: string; + name: string; + fullName: string; + }; + validationOk: boolean; + imported: { + repositories: number; + threads: number; + threadRevisions: number; + threadFingerprints: number; + threadKeySummaries: number; + repoSyncState: number; + repoPipelineState: number; + clusterGroups: number; + clusterMemberships: number; + clusterOverrides: number; + clusterAliases: number; + clusterClosures: number; + }; +}; From 8c3397bfcaf57a47f55723fc5f1cd1b802f9bf15 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:53:24 -0700 Subject: [PATCH 132/215] refactor: share portable sqlite helpers --- .../api-core/src/portable/sqlite-utils.ts | 100 ++++++++++++++++ packages/api-core/src/portable/sync-store.ts | 110 +++--------------- 2 files changed, 114 insertions(+), 96 deletions(-) create mode 100644 packages/api-core/src/portable/sqlite-utils.ts diff --git a/packages/api-core/src/portable/sqlite-utils.ts b/packages/api-core/src/portable/sqlite-utils.ts new file mode 100644 index 0000000..b9fa8b5 --- /dev/null +++ b/packages/api-core/src/portable/sqlite-utils.ts @@ -0,0 +1,100 @@ +import crypto from 'node:crypto'; +import fs from 'node:fs'; + +import BetterSqlite3 from 'better-sqlite3'; + +import type { SqliteDatabase } from '../db/sqlite.js'; + +export function openReadonlyDb(dbPath: string): SqliteDatabase { + return new BetterSqlite3(dbPath, { readonly: true, fileMustExist: true }); +} + +export function listTables(db: SqliteDatabase): Set { + const rows = db + .prepare("select name from sqlite_master where type in ('table', 'view') and name not like 'sqlite_%'") + .all() as Array<{ name: string }>; + return new Set(rows.map((row) => row.name)); +} + +export function readPortableMetadata(db: SqliteDatabase): Record { + const rows = db.prepare('select key, value from portable_metadata order by key').all() as Array<{ key: string; value: string }>; + return Object.fromEntries(rows.map((row) => [row.key, row.value])); +} + +export function readIntegrityCheck(db: SqliteDatabase): string[] { + const rows = db.prepare('pragma integrity_check').all() as Array<{ integrity_check: string }>; + return rows.map((row) => row.integrity_check); +} + +export function readForeignKeyViolations(db: SqliteDatabase): Array> { + return db.prepare('pragma foreign_key_check').all() as Array>; +} + +export function readDbstatSizes(db: SqliteDatabase): Array<{ name: string; bytes: number | null; rows: number | null }> { + try { + const rows = db + .prepare( + `select + s.name as name, + s.bytes as bytes, + coalesce(t.row_count, 0) as rows + from ( + select name, sum(pgsize) as bytes + from dbstat + where name not like 'sqlite_%' + group by name + ) s + left join ( + select name, null as row_count + from sqlite_master + where 0 + ) t on t.name = s.name + order by s.bytes desc, s.name asc`, + ) + .all() as Array<{ name: string; bytes: number; rows: number | null }>; + return rows.map((row) => ({ name: row.name, bytes: row.bytes, rows: safeCountRows(db, row.name) })); + } catch { + const tableNames = [...listTables(db)].sort(); + return tableNames.map((name) => ({ name, bytes: null, rows: safeCountRows(db, name) })); + } +} + +export function countRows(db: SqliteDatabase, tableName: string): number { + const row = db.prepare(`select count(*) as count from "${tableName}"`).get() as { count: number }; + return row.count; +} + +export function safeCountRows(db: SqliteDatabase, tableName: string): number | null { + try { + return countRows(db, tableName); + } catch { + return null; + } +} + +export function attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { + const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; + return rows.some((row) => row.name === columnName); +} + +export function fileSize(filePath: string): number { + try { + return fs.statSync(filePath).size; + } catch { + return 0; + } +} + +export function sha256File(filePath: string): string { + const hash = crypto.createHash('sha256'); + hash.update(fs.readFileSync(filePath)); + return hash.digest('hex'); +} + +export function nowIso(): string { + return new Date().toISOString(); +} + +export function sqlStringLiteral(value: string): string { + return `'${value.replaceAll("'", "''")}'`; +} diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 97ee97b..99cfcac 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,11 +1,23 @@ import fs from 'node:fs'; -import crypto from 'node:crypto'; import path from 'node:path'; -import BetterSqlite3 from 'better-sqlite3'; import type { RepositoryDto } from '@ghcrawl/api-contract'; import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; +import { + attachedTableHasColumn, + countRows, + fileSize, + listTables, + nowIso, + openReadonlyDb, + readDbstatSizes, + readForeignKeyViolations, + readIntegrityCheck, + readPortableMetadata, + sha256File, + sqlStringLiteral, +} from './sqlite-utils.js'; import { DEFAULT_PORTABLE_BODY_CHARS, PORTABLE_SYNC_EXCLUDED_TABLES, @@ -634,11 +646,6 @@ export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: num ).run(); } -function countRows(db: SqliteDatabase, tableName: string): number { - const row = db.prepare(`select count(*) as count from "${tableName}"`).get() as { count: number }; - return row.count; -} - function emptyRepoSnapshot(): PortableRepoSnapshot { return { threads: { @@ -1269,92 +1276,3 @@ function upsertImportedClusterClosure(db: SqliteDatabase, liveClusterId: number, updated_at = excluded.updated_at`, ).run(liveClusterId, row.reason, row.actor_kind, row.created_at, row.updated_at); } - -function openReadonlyDb(dbPath: string): SqliteDatabase { - return new BetterSqlite3(dbPath, { readonly: true, fileMustExist: true }); -} - -function listTables(db: SqliteDatabase): Set { - const rows = db - .prepare("select name from sqlite_master where type in ('table', 'view') and name not like 'sqlite_%'") - .all() as Array<{ name: string }>; - return new Set(rows.map((row) => row.name)); -} - -function readPortableMetadata(db: SqliteDatabase): Record { - const rows = db.prepare('select key, value from portable_metadata order by key').all() as Array<{ key: string; value: string }>; - return Object.fromEntries(rows.map((row) => [row.key, row.value])); -} - -function readIntegrityCheck(db: SqliteDatabase): string[] { - const rows = db.prepare('pragma integrity_check').all() as Array<{ integrity_check: string }>; - return rows.map((row) => row.integrity_check); -} - -function readForeignKeyViolations(db: SqliteDatabase): Array> { - return db.prepare('pragma foreign_key_check').all() as Array>; -} - -function readDbstatSizes(db: SqliteDatabase): Array<{ name: string; bytes: number | null; rows: number | null }> { - try { - const rows = db - .prepare( - `select - s.name as name, - s.bytes as bytes, - coalesce(t.row_count, 0) as rows - from ( - select name, sum(pgsize) as bytes - from dbstat - where name not like 'sqlite_%' - group by name - ) s - left join ( - select name, null as row_count - from sqlite_master - where 0 - ) t on t.name = s.name - order by s.bytes desc, s.name asc`, - ) - .all() as Array<{ name: string; bytes: number; rows: number | null }>; - return rows.map((row) => ({ name: row.name, bytes: row.bytes, rows: safeCountRows(db, row.name) })); - } catch { - const tableNames = [...listTables(db)].sort(); - return tableNames.map((name) => ({ name, bytes: null, rows: safeCountRows(db, name) })); - } -} - -function safeCountRows(db: SqliteDatabase, tableName: string): number | null { - try { - return countRows(db, tableName); - } catch { - return null; - } -} - -function attachedTableHasColumn(db: SqliteDatabase, schemaName: string, tableName: string, columnName: string): boolean { - const rows = db.prepare(`pragma ${schemaName}.table_info("${tableName}")`).all() as Array<{ name: string }>; - return rows.some((row) => row.name === columnName); -} - -function fileSize(filePath: string): number { - try { - return fs.statSync(filePath).size; - } catch { - return 0; - } -} - -function sha256File(filePath: string): string { - const hash = crypto.createHash('sha256'); - hash.update(fs.readFileSync(filePath)); - return hash.digest('hex'); -} - -function nowIso(): string { - return new Date().toISOString(); -} - -function sqlStringLiteral(value: string): string { - return `'${value.replaceAll("'", "''")}'`; -} From 4ec3025a540fb8ca54dae0bcec3269efb5939339 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:54:37 -0700 Subject: [PATCH 133/215] refactor: isolate portable sync schema --- packages/api-core/src/portable/schema.ts | 169 ++++++++++++++++++ packages/api-core/src/portable/sync-store.ts | 170 +------------------ 2 files changed, 171 insertions(+), 168 deletions(-) create mode 100644 packages/api-core/src/portable/schema.ts diff --git a/packages/api-core/src/portable/schema.ts b/packages/api-core/src/portable/schema.ts new file mode 100644 index 0000000..d81d437 --- /dev/null +++ b/packages/api-core/src/portable/schema.ts @@ -0,0 +1,169 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; + +export function createPortableSyncSchema(db: SqliteDatabase): void { + db.exec(` + create table portable_metadata (key text primary key, value text not null); + create table repositories ( + id integer primary key, + owner text not null, + name text not null, + full_name text not null unique, + github_repo_id text, + updated_at text not null + ); + create table threads ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + github_id text not null, + number integer not null, + kind text not null, + state text not null, + title text not null, + body_excerpt text, + body_length integer not null default 0, + author_login text, + author_type text, + html_url text not null, + labels_json text not null, + assignees_json text not null, + content_hash text not null, + is_draft integer not null default 0, + created_at_gh text, + updated_at_gh text, + closed_at_gh text, + merged_at_gh text, + first_pulled_at text, + last_pulled_at text, + updated_at text not null, + closed_at_local text, + close_reason_local text, + unique(repo_id, kind, number) + ); + create table thread_revisions ( + id integer primary key, + thread_id integer not null references threads(id) on delete cascade, + source_updated_at text, + content_hash text not null, + title_hash text not null, + body_hash text not null, + labels_hash text not null, + created_at text not null, + unique(thread_id, content_hash) + ); + create table thread_fingerprints ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + algorithm_version text not null, + fingerprint_hash text not null, + fingerprint_slug text not null, + title_tokens_json text not null, + body_token_hash text not null, + linked_refs_json text not null, + file_set_hash text not null, + module_buckets_json text not null, + simhash64 text not null, + feature_json text not null, + created_at text not null, + unique(thread_revision_id, algorithm_version) + ); + create table thread_key_summaries ( + id integer primary key, + thread_revision_id integer not null references thread_revisions(id) on delete cascade, + summary_kind text not null, + prompt_version text not null, + provider text not null, + model text not null, + input_hash text not null, + output_hash text not null, + key_text text not null, + created_at text not null, + unique(thread_revision_id, summary_kind, prompt_version, provider, model) + ); + create table repo_sync_state ( + repo_id integer primary key references repositories(id) on delete cascade, + last_full_open_scan_started_at text, + last_overlapping_open_scan_completed_at text, + last_non_overlapping_scan_completed_at text, + last_open_close_reconciled_at text, + updated_at text not null + ); + create table repo_pipeline_state ( + repo_id integer primary key references repositories(id) on delete cascade, + summary_model text not null, + summary_prompt_version text not null, + embedding_basis text not null, + embed_model text not null, + embed_dimensions integer not null, + embed_pipeline_version text not null, + vector_backend text not null, + vectors_current_at text, + clusters_current_at text, + updated_at text not null + ); + create table cluster_groups ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + stable_key text not null, + stable_slug text not null, + status text not null, + cluster_type text, + representative_thread_id integer references threads(id) on delete set null, + title text, + created_at text not null, + updated_at text not null, + closed_at text, + unique(repo_id, stable_key), + unique(repo_id, stable_slug) + ); + create table cluster_memberships ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + role text not null, + state text not null, + score_to_representative real, + first_seen_run_id integer, + last_seen_run_id integer, + added_by text not null, + removed_by text, + added_reason_json text not null, + removed_reason_json text, + created_at text not null, + updated_at text not null, + removed_at text, + primary key (cluster_id, thread_id) + ); + create table cluster_overrides ( + id integer primary key, + repo_id integer not null references repositories(id) on delete cascade, + cluster_id integer not null references cluster_groups(id) on delete cascade, + thread_id integer not null references threads(id) on delete cascade, + action text not null, + actor_id integer, + reason text, + created_at text not null, + expires_at text, + unique(cluster_id, thread_id, action) + ); + create table cluster_aliases ( + cluster_id integer not null references cluster_groups(id) on delete cascade, + alias_slug text not null, + reason text not null, + created_at text not null, + primary key (cluster_id, alias_slug) + ); + create table cluster_closures ( + cluster_id integer primary key references cluster_groups(id) on delete cascade, + reason text not null, + actor_kind text not null, + created_at text not null, + updated_at text not null + ); + create index idx_threads_repo_number on threads(repo_id, number); + create index idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local); + create index idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash); + create index idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug); + create index idx_cluster_groups_repo_status on cluster_groups(repo_id, status); + create index idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state); + create index idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state); + `); +} diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 99cfcac..a58b8c6 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -4,6 +4,7 @@ import path from 'node:path'; import type { RepositoryDto } from '@ghcrawl/api-contract'; import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; +import { createPortableSyncSchema } from './schema.js'; import { attachedTableHasColumn, countRows, @@ -35,6 +36,7 @@ import { } from './types.js'; export * from './types.js'; +export { createPortableSyncSchema } from './schema.js'; export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; @@ -158,174 +160,6 @@ function writePortableSyncManifest(outputPath: string, manifest: PortableSyncMan return manifestPath; } -export function createPortableSyncSchema(db: SqliteDatabase): void { - db.exec(` - create table portable_metadata (key text primary key, value text not null); - create table repositories ( - id integer primary key, - owner text not null, - name text not null, - full_name text not null unique, - github_repo_id text, - updated_at text not null - ); - create table threads ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - github_id text not null, - number integer not null, - kind text not null, - state text not null, - title text not null, - body_excerpt text, - body_length integer not null default 0, - author_login text, - author_type text, - html_url text not null, - labels_json text not null, - assignees_json text not null, - content_hash text not null, - is_draft integer not null default 0, - created_at_gh text, - updated_at_gh text, - closed_at_gh text, - merged_at_gh text, - first_pulled_at text, - last_pulled_at text, - updated_at text not null, - closed_at_local text, - close_reason_local text, - unique(repo_id, kind, number) - ); - create table thread_revisions ( - id integer primary key, - thread_id integer not null references threads(id) on delete cascade, - source_updated_at text, - content_hash text not null, - title_hash text not null, - body_hash text not null, - labels_hash text not null, - created_at text not null, - unique(thread_id, content_hash) - ); - create table thread_fingerprints ( - id integer primary key, - thread_revision_id integer not null references thread_revisions(id) on delete cascade, - algorithm_version text not null, - fingerprint_hash text not null, - fingerprint_slug text not null, - title_tokens_json text not null, - body_token_hash text not null, - linked_refs_json text not null, - file_set_hash text not null, - module_buckets_json text not null, - simhash64 text not null, - feature_json text not null, - created_at text not null, - unique(thread_revision_id, algorithm_version) - ); - create table thread_key_summaries ( - id integer primary key, - thread_revision_id integer not null references thread_revisions(id) on delete cascade, - summary_kind text not null, - prompt_version text not null, - provider text not null, - model text not null, - input_hash text not null, - output_hash text not null, - key_text text not null, - created_at text not null, - unique(thread_revision_id, summary_kind, prompt_version, provider, model) - ); - create table repo_sync_state ( - repo_id integer primary key references repositories(id) on delete cascade, - last_full_open_scan_started_at text, - last_overlapping_open_scan_completed_at text, - last_non_overlapping_scan_completed_at text, - last_open_close_reconciled_at text, - updated_at text not null - ); - create table repo_pipeline_state ( - repo_id integer primary key references repositories(id) on delete cascade, - summary_model text not null, - summary_prompt_version text not null, - embedding_basis text not null, - embed_model text not null, - embed_dimensions integer not null, - embed_pipeline_version text not null, - vector_backend text not null, - vectors_current_at text, - clusters_current_at text, - updated_at text not null - ); - create table cluster_groups ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - stable_key text not null, - stable_slug text not null, - status text not null, - cluster_type text, - representative_thread_id integer references threads(id) on delete set null, - title text, - created_at text not null, - updated_at text not null, - closed_at text, - unique(repo_id, stable_key), - unique(repo_id, stable_slug) - ); - create table cluster_memberships ( - cluster_id integer not null references cluster_groups(id) on delete cascade, - thread_id integer not null references threads(id) on delete cascade, - role text not null, - state text not null, - score_to_representative real, - first_seen_run_id integer, - last_seen_run_id integer, - added_by text not null, - removed_by text, - added_reason_json text not null, - removed_reason_json text, - created_at text not null, - updated_at text not null, - removed_at text, - primary key (cluster_id, thread_id) - ); - create table cluster_overrides ( - id integer primary key, - repo_id integer not null references repositories(id) on delete cascade, - cluster_id integer not null references cluster_groups(id) on delete cascade, - thread_id integer not null references threads(id) on delete cascade, - action text not null, - actor_id integer, - reason text, - created_at text not null, - expires_at text, - unique(cluster_id, thread_id, action) - ); - create table cluster_aliases ( - cluster_id integer not null references cluster_groups(id) on delete cascade, - alias_slug text not null, - reason text not null, - created_at text not null, - primary key (cluster_id, alias_slug) - ); - create table cluster_closures ( - cluster_id integer primary key references cluster_groups(id) on delete cascade, - reason text not null, - actor_kind text not null, - created_at text not null, - updated_at text not null - ); - create index idx_threads_repo_number on threads(repo_id, number); - create index idx_threads_repo_state_closed on threads(repo_id, state, closed_at_local); - create index idx_thread_fingerprints_hash on thread_fingerprints(fingerprint_hash); - create index idx_thread_fingerprints_slug on thread_fingerprints(fingerprint_slug); - create index idx_cluster_groups_repo_status on cluster_groups(repo_id, status); - create index idx_cluster_memberships_thread_state on cluster_memberships(thread_id, state); - create index idx_cluster_memberships_cluster_state on cluster_memberships(cluster_id, state); - `); -} - export function validatePortableSyncDatabase(dbPath: string): PortableSyncValidationResponse { const resolvedPath = path.resolve(dbPath); const db = openReadonlyDb(resolvedPath); From c5031d03577b03ef8f724520807dd900060f391d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:56:16 -0700 Subject: [PATCH 134/215] refactor: split portable sync inspection --- packages/api-core/src/portable/inspect.ts | 314 +++++++++++++++++++ packages/api-core/src/portable/sync-store.ts | 303 +----------------- 2 files changed, 316 insertions(+), 301 deletions(-) create mode 100644 packages/api-core/src/portable/inspect.ts diff --git a/packages/api-core/src/portable/inspect.ts b/packages/api-core/src/portable/inspect.ts new file mode 100644 index 0000000..e00fa58 --- /dev/null +++ b/packages/api-core/src/portable/inspect.ts @@ -0,0 +1,314 @@ +import path from 'node:path'; + +import type { RepositoryDto } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import { + countRows, + fileSize, + listTables, + openReadonlyDb, + readDbstatSizes, + readForeignKeyViolations, + readIntegrityCheck, + readPortableMetadata, +} from './sqlite-utils.js'; +import { + PORTABLE_SYNC_EXCLUDED_TABLES, + PORTABLE_SYNC_SCHEMA_VERSION, + PORTABLE_SYNC_TABLES, + type PortableRepoSnapshot, + type PortableSyncSizeResponse, + type PortableSyncStatusResponse, + type PortableSyncValidationResponse, +} from './types.js'; + +export function validatePortableSyncDatabase(dbPath: string): PortableSyncValidationResponse { + const resolvedPath = path.resolve(dbPath); + const db = openReadonlyDb(resolvedPath); + try { + const tableNames = listTables(db); + const missingTables = PORTABLE_SYNC_TABLES.filter((name) => !tableNames.has(name)); + const unexpectedExcludedTables = PORTABLE_SYNC_EXCLUDED_TABLES.filter((name) => tableNames.has(name)); + const metadata = tableNames.has('portable_metadata') ? readPortableMetadata(db) : {}; + const integrity = readIntegrityCheck(db); + const foreignKeyViolations = readForeignKeyViolations(db); + const schema = metadata.schema ?? null; + const errors = [ + ...missingTables.map((name) => `missing required table: ${name}`), + ...unexpectedExcludedTables.map((name) => `excluded cache table is present: ${name}`), + ...(schema === PORTABLE_SYNC_SCHEMA_VERSION ? [] : [`unexpected schema: ${schema ?? 'missing'}`]), + ...integrity.filter((message) => message !== 'ok').map((message) => `integrity_check: ${message}`), + ...foreignKeyViolations.map((violation) => `foreign_key_check: ${JSON.stringify(violation)}`), + ]; + + return { + ok: errors.length === 0, + path: resolvedPath, + schema, + metadata, + integrity, + foreignKeyViolations, + missingTables, + unexpectedExcludedTables, + tables: PORTABLE_SYNC_TABLES.filter((name) => tableNames.has(name)).map((name) => ({ name, rows: countRows(db, name) })), + errors, + }; + } finally { + db.close(); + } +} + +export function portableSyncSizeReport(dbPath: string): PortableSyncSizeResponse { + const resolvedPath = path.resolve(dbPath); + const db = openReadonlyDb(resolvedPath); + try { + const tables = readDbstatSizes(db); + return { + ok: true, + path: resolvedPath, + totalBytes: fileSize(resolvedPath), + walBytes: fileSize(`${resolvedPath}-wal`), + shmBytes: fileSize(`${resolvedPath}-shm`), + tables, + }; + } finally { + db.close(); + } +} + +export function portableSyncStatusReport(params: { + liveDb: SqliteDatabase; + repository: RepositoryDto; + portablePath: string; +}): PortableSyncStatusResponse { + const resolvedPath = path.resolve(params.portablePath); + const portableDb = openReadonlyDb(resolvedPath); + try { + const portableRepo = portableDb + .prepare('select id from repositories where full_name = ?') + .get(params.repository.fullName) as { id: number } | undefined; + const portableRepoId = portableRepo?.id ?? null; + const liveSnapshot = readRepoSnapshot(params.liveDb, params.repository.id); + const portableSnapshot = portableRepoId === null ? emptyRepoSnapshot() : readRepoSnapshot(portableDb, portableRepoId); + + const liveThreads = readThreadComparableRows(params.liveDb, params.repository.id); + const portableThreads = portableRepoId === null ? [] : readThreadComparableRows(portableDb, portableRepoId); + const liveClusters = readClusterComparableRows(params.liveDb, params.repository.id); + const portableClusters = portableRepoId === null ? [] : readClusterComparableRows(portableDb, portableRepoId); + const liveMemberships = readMembershipComparableRows(params.liveDb, params.repository.id); + const portableMemberships = portableRepoId === null ? [] : readMembershipComparableRows(portableDb, portableRepoId); + const threadDrift = compareComparableRows(liveThreads, portableThreads); + const clusterDrift = compareComparableRows(liveClusters, portableClusters); + const membershipDrift = compareComparableRows(liveMemberships, portableMemberships); + + return { + ok: true, + repository: { + id: params.repository.id, + owner: params.repository.owner, + name: params.repository.name, + fullName: params.repository.fullName, + }, + portablePath: resolvedPath, + portableRepositoryFound: portableRepoId !== null, + live: liveSnapshot, + portable: portableSnapshot, + drift: { + liveOnlyThreads: threadDrift.liveOnly, + portableOnlyThreads: threadDrift.portableOnly, + changedThreads: threadDrift.changed, + liveOnlyClusters: clusterDrift.liveOnly, + portableOnlyClusters: clusterDrift.portableOnly, + changedClusters: clusterDrift.changed, + liveOnlyMemberships: membershipDrift.liveOnly, + portableOnlyMemberships: membershipDrift.portableOnly, + changedMemberships: membershipDrift.changed, + }, + }; + } finally { + portableDb.close(); + } +} + +function emptyRepoSnapshot(): PortableRepoSnapshot { + return { + threads: { + total: 0, + open: 0, + closed: 0, + issues: 0, + pullRequests: 0, + latestUpdatedAt: null, + }, + clusters: { + groups: 0, + memberships: 0, + overrides: 0, + aliases: 0, + closures: 0, + }, + }; +} + +function readRepoSnapshot(db: SqliteDatabase, repoId: number): PortableRepoSnapshot { + const threads = db + .prepare( + `select + count(*) as total, + sum(case when state = 'open' and closed_at_local is null then 1 else 0 end) as open, + sum(case when state <> 'open' or closed_at_local is not null then 1 else 0 end) as closed, + sum(case when kind = 'issue' then 1 else 0 end) as issues, + sum(case when kind = 'pull_request' then 1 else 0 end) as pull_requests, + max(coalesce(updated_at_gh, updated_at)) as latest_updated_at + from threads + where repo_id = ?`, + ) + .get(repoId) as { + total: number; + open: number | null; + closed: number | null; + issues: number | null; + pull_requests: number | null; + latest_updated_at: string | null; + }; + const clusters = db + .prepare( + `select + (select count(*) from cluster_groups where repo_id = ?) as groups_count, + (select count(*) + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + where cg.repo_id = ?) as memberships_count, + (select count(*) from cluster_overrides where repo_id = ?) as overrides_count, + (select count(*) + from cluster_aliases ca + join cluster_groups cg on cg.id = ca.cluster_id + where cg.repo_id = ?) as aliases_count, + (select count(*) + from cluster_closures cc + join cluster_groups cg on cg.id = cc.cluster_id + where cg.repo_id = ?) as closures_count`, + ) + .get(repoId, repoId, repoId, repoId, repoId) as { + groups_count: number; + memberships_count: number; + overrides_count: number; + aliases_count: number; + closures_count: number; + }; + + return { + threads: { + total: threads.total, + open: threads.open ?? 0, + closed: threads.closed ?? 0, + issues: threads.issues ?? 0, + pullRequests: threads.pull_requests ?? 0, + latestUpdatedAt: threads.latest_updated_at, + }, + clusters: { + groups: clusters.groups_count, + memberships: clusters.memberships_count, + overrides: clusters.overrides_count, + aliases: clusters.aliases_count, + closures: clusters.closures_count, + }, + }; +} + +type ComparableRow = { key: string; value: string }; + +function readThreadComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select kind, number, state, title, content_hash, updated_at_gh, closed_at_gh, closed_at_local + from threads + where repo_id = ? + order by kind, number`, + ) + .all(repoId) as Array<{ + kind: string; + number: number; + state: string; + title: string; + content_hash: string; + updated_at_gh: string | null; + closed_at_gh: string | null; + closed_at_local: string | null; + }>; + return rows.map((row) => ({ + key: `${row.kind}:${row.number}`, + value: JSON.stringify([row.state, row.title, row.content_hash, row.updated_at_gh, row.closed_at_gh, row.closed_at_local]), + })); +} + +function readClusterComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select stable_key, stable_slug, status, cluster_type, title, closed_at + from cluster_groups + where repo_id = ? + order by stable_key`, + ) + .all(repoId) as Array<{ + stable_key: string; + stable_slug: string; + status: string; + cluster_type: string | null; + title: string | null; + closed_at: string | null; + }>; + return rows.map((row) => ({ + key: row.stable_key, + value: JSON.stringify([row.stable_slug, row.status, row.cluster_type, row.title, row.closed_at]), + })); +} + +function readMembershipComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { + const rows = db + .prepare( + `select cg.stable_key, t.kind, t.number, cm.role, cm.state, cm.score_to_representative, cm.added_by, cm.removed_by, cm.removed_at + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + order by cg.stable_key, t.kind, t.number`, + ) + .all(repoId) as Array<{ + stable_key: string; + kind: string; + number: number; + role: string; + state: string; + score_to_representative: number | null; + added_by: string; + removed_by: string | null; + removed_at: string | null; + }>; + return rows.map((row) => ({ + key: `${row.stable_key}:${row.kind}:${row.number}`, + value: JSON.stringify([row.role, row.state, row.score_to_representative, row.added_by, row.removed_by, row.removed_at]), + })); +} + +function compareComparableRows(liveRows: ComparableRow[], portableRows: ComparableRow[]): { liveOnly: number; portableOnly: number; changed: number } { + const live = new Map(liveRows.map((row) => [row.key, row.value])); + const portable = new Map(portableRows.map((row) => [row.key, row.value])); + let liveOnly = 0; + let portableOnly = 0; + let changed = 0; + + for (const [key, value] of live) { + if (!portable.has(key)) { + liveOnly += 1; + } else if (portable.get(key) !== value) { + changed += 1; + } + } + for (const key of portable.keys()) { + if (!live.has(key)) portableOnly += 1; + } + + return { liveOnly, portableOnly, changed }; +} diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index a58b8c6..03b3efe 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,21 +1,15 @@ import fs from 'node:fs'; import path from 'node:path'; -import type { RepositoryDto } from '@ghcrawl/api-contract'; - import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; +import { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; import { createPortableSyncSchema } from './schema.js'; import { attachedTableHasColumn, countRows, fileSize, - listTables, nowIso, openReadonlyDb, - readDbstatSizes, - readForeignKeyViolations, - readIntegrityCheck, - readPortableMetadata, sha256File, sqlStringLiteral, } from './sqlite-utils.js'; @@ -24,18 +18,15 @@ import { PORTABLE_SYNC_EXCLUDED_TABLES, PORTABLE_SYNC_SCHEMA_VERSION, PORTABLE_SYNC_TABLES, - type PortableRepoSnapshot, type PortableSyncExportOptions, type PortableSyncExportResponse, type PortableSyncImportResponse, type PortableSyncManifest, type PortableSyncProfile, - type PortableSyncSizeResponse, - type PortableSyncStatusResponse, - type PortableSyncValidationResponse, } from './types.js'; export * from './types.js'; +export { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; export { createPortableSyncSchema } from './schema.js'; export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { @@ -160,114 +151,6 @@ function writePortableSyncManifest(outputPath: string, manifest: PortableSyncMan return manifestPath; } -export function validatePortableSyncDatabase(dbPath: string): PortableSyncValidationResponse { - const resolvedPath = path.resolve(dbPath); - const db = openReadonlyDb(resolvedPath); - try { - const tableNames = listTables(db); - const missingTables = PORTABLE_SYNC_TABLES.filter((name) => !tableNames.has(name)); - const unexpectedExcludedTables = PORTABLE_SYNC_EXCLUDED_TABLES.filter((name) => tableNames.has(name)); - const metadata = tableNames.has('portable_metadata') ? readPortableMetadata(db) : {}; - const integrity = readIntegrityCheck(db); - const foreignKeyViolations = readForeignKeyViolations(db); - const schema = metadata.schema ?? null; - const errors = [ - ...missingTables.map((name) => `missing required table: ${name}`), - ...unexpectedExcludedTables.map((name) => `excluded cache table is present: ${name}`), - ...(schema === PORTABLE_SYNC_SCHEMA_VERSION ? [] : [`unexpected schema: ${schema ?? 'missing'}`]), - ...integrity.filter((message) => message !== 'ok').map((message) => `integrity_check: ${message}`), - ...foreignKeyViolations.map((violation) => `foreign_key_check: ${JSON.stringify(violation)}`), - ]; - - return { - ok: errors.length === 0, - path: resolvedPath, - schema, - metadata, - integrity, - foreignKeyViolations, - missingTables, - unexpectedExcludedTables, - tables: PORTABLE_SYNC_TABLES.filter((name) => tableNames.has(name)).map((name) => ({ name, rows: countRows(db, name) })), - errors, - }; - } finally { - db.close(); - } -} - -export function portableSyncSizeReport(dbPath: string): PortableSyncSizeResponse { - const resolvedPath = path.resolve(dbPath); - const db = openReadonlyDb(resolvedPath); - try { - const tables = readDbstatSizes(db); - return { - ok: true, - path: resolvedPath, - totalBytes: fileSize(resolvedPath), - walBytes: fileSize(`${resolvedPath}-wal`), - shmBytes: fileSize(`${resolvedPath}-shm`), - tables, - }; - } finally { - db.close(); - } -} - -export function portableSyncStatusReport(params: { - liveDb: SqliteDatabase; - repository: RepositoryDto; - portablePath: string; -}): PortableSyncStatusResponse { - const resolvedPath = path.resolve(params.portablePath); - const portableDb = openReadonlyDb(resolvedPath); - try { - const portableRepo = portableDb - .prepare('select id from repositories where full_name = ?') - .get(params.repository.fullName) as { id: number } | undefined; - const portableRepoId = portableRepo?.id ?? null; - const liveSnapshot = readRepoSnapshot(params.liveDb, params.repository.id); - const portableSnapshot = portableRepoId === null ? emptyRepoSnapshot() : readRepoSnapshot(portableDb, portableRepoId); - - const liveThreads = readThreadComparableRows(params.liveDb, params.repository.id); - const portableThreads = portableRepoId === null ? [] : readThreadComparableRows(portableDb, portableRepoId); - const liveClusters = readClusterComparableRows(params.liveDb, params.repository.id); - const portableClusters = portableRepoId === null ? [] : readClusterComparableRows(portableDb, portableRepoId); - const liveMemberships = readMembershipComparableRows(params.liveDb, params.repository.id); - const portableMemberships = portableRepoId === null ? [] : readMembershipComparableRows(portableDb, portableRepoId); - const threadDrift = compareComparableRows(liveThreads, portableThreads); - const clusterDrift = compareComparableRows(liveClusters, portableClusters); - const membershipDrift = compareComparableRows(liveMemberships, portableMemberships); - - return { - ok: true, - repository: { - id: params.repository.id, - owner: params.repository.owner, - name: params.repository.name, - fullName: params.repository.fullName, - }, - portablePath: resolvedPath, - portableRepositoryFound: portableRepoId !== null, - live: liveSnapshot, - portable: portableSnapshot, - drift: { - liveOnlyThreads: threadDrift.liveOnly, - portableOnlyThreads: threadDrift.portableOnly, - changedThreads: threadDrift.changed, - liveOnlyClusters: clusterDrift.liveOnly, - portableOnlyClusters: clusterDrift.portableOnly, - changedClusters: clusterDrift.changed, - liveOnlyMemberships: membershipDrift.liveOnly, - portableOnlyMemberships: membershipDrift.portableOnly, - changedMemberships: membershipDrift.changed, - }, - }; - } finally { - portableDb.close(); - } -} - export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; portablePath: string }): PortableSyncImportResponse { const resolvedPath = path.resolve(params.portablePath); const validation = validatePortableSyncDatabase(resolvedPath); @@ -480,188 +363,6 @@ export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: num ).run(); } -function emptyRepoSnapshot(): PortableRepoSnapshot { - return { - threads: { - total: 0, - open: 0, - closed: 0, - issues: 0, - pullRequests: 0, - latestUpdatedAt: null, - }, - clusters: { - groups: 0, - memberships: 0, - overrides: 0, - aliases: 0, - closures: 0, - }, - }; -} - -function readRepoSnapshot(db: SqliteDatabase, repoId: number): PortableRepoSnapshot { - const threads = db - .prepare( - `select - count(*) as total, - sum(case when state = 'open' and closed_at_local is null then 1 else 0 end) as open, - sum(case when state <> 'open' or closed_at_local is not null then 1 else 0 end) as closed, - sum(case when kind = 'issue' then 1 else 0 end) as issues, - sum(case when kind = 'pull_request' then 1 else 0 end) as pull_requests, - max(coalesce(updated_at_gh, updated_at)) as latest_updated_at - from threads - where repo_id = ?`, - ) - .get(repoId) as { - total: number; - open: number | null; - closed: number | null; - issues: number | null; - pull_requests: number | null; - latest_updated_at: string | null; - }; - const clusters = db - .prepare( - `select - (select count(*) from cluster_groups where repo_id = ?) as groups_count, - (select count(*) - from cluster_memberships cm - join cluster_groups cg on cg.id = cm.cluster_id - where cg.repo_id = ?) as memberships_count, - (select count(*) from cluster_overrides where repo_id = ?) as overrides_count, - (select count(*) - from cluster_aliases ca - join cluster_groups cg on cg.id = ca.cluster_id - where cg.repo_id = ?) as aliases_count, - (select count(*) - from cluster_closures cc - join cluster_groups cg on cg.id = cc.cluster_id - where cg.repo_id = ?) as closures_count`, - ) - .get(repoId, repoId, repoId, repoId, repoId) as { - groups_count: number; - memberships_count: number; - overrides_count: number; - aliases_count: number; - closures_count: number; - }; - - return { - threads: { - total: threads.total, - open: threads.open ?? 0, - closed: threads.closed ?? 0, - issues: threads.issues ?? 0, - pullRequests: threads.pull_requests ?? 0, - latestUpdatedAt: threads.latest_updated_at, - }, - clusters: { - groups: clusters.groups_count, - memberships: clusters.memberships_count, - overrides: clusters.overrides_count, - aliases: clusters.aliases_count, - closures: clusters.closures_count, - }, - }; -} - -type ComparableRow = { key: string; value: string }; - -function readThreadComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { - const rows = db - .prepare( - `select kind, number, state, title, content_hash, updated_at_gh, closed_at_gh, closed_at_local - from threads - where repo_id = ? - order by kind, number`, - ) - .all(repoId) as Array<{ - kind: string; - number: number; - state: string; - title: string; - content_hash: string; - updated_at_gh: string | null; - closed_at_gh: string | null; - closed_at_local: string | null; - }>; - return rows.map((row) => ({ - key: `${row.kind}:${row.number}`, - value: JSON.stringify([row.state, row.title, row.content_hash, row.updated_at_gh, row.closed_at_gh, row.closed_at_local]), - })); -} - -function readClusterComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { - const rows = db - .prepare( - `select stable_key, stable_slug, status, cluster_type, title, closed_at - from cluster_groups - where repo_id = ? - order by stable_key`, - ) - .all(repoId) as Array<{ - stable_key: string; - stable_slug: string; - status: string; - cluster_type: string | null; - title: string | null; - closed_at: string | null; - }>; - return rows.map((row) => ({ - key: row.stable_key, - value: JSON.stringify([row.stable_slug, row.status, row.cluster_type, row.title, row.closed_at]), - })); -} - -function readMembershipComparableRows(db: SqliteDatabase, repoId: number): ComparableRow[] { - const rows = db - .prepare( - `select cg.stable_key, t.kind, t.number, cm.role, cm.state, cm.score_to_representative, cm.added_by, cm.removed_by, cm.removed_at - from cluster_memberships cm - join cluster_groups cg on cg.id = cm.cluster_id - join threads t on t.id = cm.thread_id - where cg.repo_id = ? - order by cg.stable_key, t.kind, t.number`, - ) - .all(repoId) as Array<{ - stable_key: string; - kind: string; - number: number; - role: string; - state: string; - score_to_representative: number | null; - added_by: string; - removed_by: string | null; - removed_at: string | null; - }>; - return rows.map((row) => ({ - key: `${row.stable_key}:${row.kind}:${row.number}`, - value: JSON.stringify([row.role, row.state, row.score_to_representative, row.added_by, row.removed_by, row.removed_at]), - })); -} - -function compareComparableRows(liveRows: ComparableRow[], portableRows: ComparableRow[]): { liveOnly: number; portableOnly: number; changed: number } { - const live = new Map(liveRows.map((row) => [row.key, row.value])); - const portable = new Map(portableRows.map((row) => [row.key, row.value])); - let liveOnly = 0; - let portableOnly = 0; - let changed = 0; - - for (const [key, value] of live) { - if (!portable.has(key)) { - liveOnly += 1; - } else if (portable.get(key) !== value) { - changed += 1; - } - } - for (const key of portable.keys()) { - if (!live.has(key)) portableOnly += 1; - } - - return { liveOnly, portableOnly, changed }; -} - type PortableRepositoryRow = { id: number; owner: string; From 8078a8efefc64d2ad5a75f2bf2eb759eeef26d3c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:58:10 -0700 Subject: [PATCH 135/215] refactor: split portable sync export --- packages/api-core/src/portable/export.ts | 242 ++++++++++++++++++ packages/api-core/src/portable/sync-store.ts | 253 +------------------ 2 files changed, 247 insertions(+), 248 deletions(-) create mode 100644 packages/api-core/src/portable/export.ts diff --git a/packages/api-core/src/portable/export.ts b/packages/api-core/src/portable/export.ts new file mode 100644 index 0000000..884e9e3 --- /dev/null +++ b/packages/api-core/src/portable/export.ts @@ -0,0 +1,242 @@ +import fs from 'node:fs'; +import path from 'node:path'; + +import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; +import { validatePortableSyncDatabase } from './inspect.js'; +import { createPortableSyncSchema } from './schema.js'; +import { attachedTableHasColumn, countRows, fileSize, nowIso, sha256File, sqlStringLiteral } from './sqlite-utils.js'; +import { + DEFAULT_PORTABLE_BODY_CHARS, + PORTABLE_SYNC_EXCLUDED_TABLES, + PORTABLE_SYNC_SCHEMA_VERSION, + PORTABLE_SYNC_TABLES, + type PortableSyncExportOptions, + type PortableSyncExportResponse, + type PortableSyncManifest, + type PortableSyncProfile, +} from './types.js'; + +export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { + const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; + const bodyChars = params.bodyChars ?? bodyCharsForProfile(params.profile); + if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { + throw new Error('bodyChars must be a non-negative integer'); + } + + const sourcePath = path.resolve(params.sourcePath); + const outputPath = path.resolve(params.outputPath); + if (outputPath === sourcePath) { + throw new Error('Refusing to export portable sync database over the source database'); + } + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + const tmpPath = `${outputPath}.tmp-${process.pid}-${Date.now()}`; + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + checkpointWal(params.sourceDb); + const out = openDb(tmpPath); + try { + out.pragma('journal_mode = DELETE'); + out.exec('pragma foreign_keys = OFF'); + createPortableSyncSchema(out); + out.exec(`attach database ${sqlStringLiteral(sourcePath)} as source`); + populatePortableSyncDb(out, { + repoId: params.repository.id, + sourcePath, + bodyChars, + }); + out.exec('detach database source'); + out.exec('pragma foreign_keys = ON'); + out.exec('analyze'); + out.exec('pragma optimize'); + out.exec('vacuum'); + } catch (error) { + try { + out.close(); + } catch { + // Ignore cleanup close errors after an export failure. + } + fs.rmSync(tmpPath, { force: true }); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + throw error; + } + out.close(); + + fs.renameSync(tmpPath, outputPath); + fs.rmSync(`${tmpPath}-wal`, { force: true }); + fs.rmSync(`${tmpPath}-shm`, { force: true }); + + const outputBytes = fs.statSync(outputPath).size; + const sourceBytes = fs.statSync(sourcePath).size + fileSize(`${sourcePath}-wal`) + fileSize(`${sourcePath}-shm`); + const verify = openDb(outputPath); + try { + verify.pragma('journal_mode = DELETE'); + const tables = PORTABLE_SYNC_TABLES.map((name) => ({ name, rows: countRows(verify, name) })); + const responseBase: Omit = { + ok: true, + repository: { + id: params.repository.id, + owner: params.repository.owner, + name: params.repository.name, + fullName: params.repository.fullName, + }, + outputPath, + sourcePath, + sourceBytes, + outputBytes, + compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, + bodyChars, + profile, + tables, + excluded: [...PORTABLE_SYNC_EXCLUDED_TABLES], + }; + const validation = validatePortableSyncDatabase(outputPath); + const manifest = buildPortableSyncManifest(responseBase, validation.ok); + const manifestPath = params.writeManifest ? writePortableSyncManifest(outputPath, manifest) : null; + + return { + ...responseBase, + manifestPath, + manifest, + }; + } finally { + verify.close(); + } +} + +export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { + const exportedAt = nowIso(); + const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); + insertMetadata.run('schema', PORTABLE_SYNC_SCHEMA_VERSION); + insertMetadata.run('exported_at', exportedAt); + insertMetadata.run('source_path', params.sourcePath); + insertMetadata.run('body_chars', String(params.bodyChars)); + insertMetadata.run('excluded', 'raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs'); + + db.prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, updated_at) + select id, owner, name, full_name, github_repo_id, updated_at + from source.repositories + where id = ?`, + ).run(params.repoId); + + db.prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, html_url, + labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, + merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) + select + id, repo_id, github_id, number, kind, state, title, + case + when body is null then null + when ? = 0 then '' + when length(body) <= ? then body + else substr(body, 1, ?) + end, + case when body is null then 0 else length(body) end, + author_login, author_type, html_url, labels_json, assignees_json, content_hash, is_draft, + created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, + updated_at, closed_at_local, close_reason_local + from source.threads + where repo_id = ?`, + ).run(params.bodyChars, params.bodyChars, params.bodyChars, params.repoId); + + db.prepare( + `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + select tr.id, tr.thread_id, tr.source_updated_at, tr.content_hash, tr.title_hash, tr.body_hash, tr.labels_hash, tr.created_at + from source.thread_revisions tr + join threads t on t.id = tr.thread_id`, + ).run(); + + db.prepare( + `insert into thread_fingerprints ( + id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) + select + tf.id, tf.thread_revision_id, tf.algorithm_version, tf.fingerprint_hash, tf.fingerprint_slug, tf.title_tokens_json, + tf.body_token_hash, tf.linked_refs_json, tf.file_set_hash, tf.module_buckets_json, tf.simhash64, tf.feature_json, tf.created_at + from source.thread_fingerprints tf + join thread_revisions tr on tr.id = tf.thread_revision_id`, + ).run(); + + db.prepare( + `insert into thread_key_summaries ( + id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) + select + tks.id, tks.thread_revision_id, tks.summary_kind, tks.prompt_version, tks.provider, tks.model, + tks.input_hash, tks.output_hash, tks.key_text, tks.created_at + from source.thread_key_summaries tks + join thread_revisions tr on tr.id = tks.thread_revision_id`, + ).run(); + + db.prepare('insert into repo_sync_state select * from source.repo_sync_state where repo_id = ?').run(params.repoId); + db.prepare('insert into repo_pipeline_state select * from source.repo_pipeline_state where repo_id = ?').run(params.repoId); + db.prepare('insert into cluster_groups select * from source.cluster_groups where repo_id = ?').run(params.repoId); + db.prepare( + `insert into cluster_memberships + select cm.* + from source.cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + join threads t on t.id = cm.thread_id`, + ).run(); + const overrideActorExpr = attachedTableHasColumn(db, 'source', 'cluster_overrides', 'actor_id') ? 'co.actor_id' : 'null'; + db.prepare( + `insert into cluster_overrides ( + id, repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at + ) + select co.id, co.repo_id, co.cluster_id, co.thread_id, co.action, ${overrideActorExpr}, co.reason, co.created_at, co.expires_at + from source.cluster_overrides co + join cluster_groups cg on cg.id = co.cluster_id + join threads t on t.id = co.thread_id + where co.repo_id = ?`, + ).run(params.repoId); + db.prepare( + `insert into cluster_aliases + select ca.* + from source.cluster_aliases ca + join cluster_groups cg on cg.id = ca.cluster_id`, + ).run(); + db.prepare( + `insert into cluster_closures + select cc.* + from source.cluster_closures cc + join cluster_groups cg on cg.id = cc.cluster_id`, + ).run(); +} + +function bodyCharsForProfile(profile: PortableSyncProfile | undefined): number { + if (profile === 'lean') return 256; + if (profile === 'review') return 1024; + return DEFAULT_PORTABLE_BODY_CHARS; +} + +function buildPortableSyncManifest( + response: Omit, + validationOk: boolean, +): PortableSyncManifest { + return { + schema: PORTABLE_SYNC_SCHEMA_VERSION, + profile: response.profile, + exportedAt: nowIso(), + outputPath: response.outputPath, + outputBytes: response.outputBytes, + sha256: sha256File(response.outputPath), + repository: response.repository, + bodyChars: response.bodyChars, + tables: response.tables, + excluded: response.excluded, + validationOk, + }; +} + +function writePortableSyncManifest(outputPath: string, manifest: PortableSyncManifest): string { + const manifestPath = `${outputPath}.manifest.json`; + fs.writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); + return manifestPath; +} diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 03b3efe..5f3a9aa 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,156 +1,16 @@ -import fs from 'node:fs'; import path from 'node:path'; -import { checkpointWal, openDb, type SqliteDatabase } from '../db/sqlite.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { exportPortableSyncDatabase, populatePortableSyncDb } from './export.js'; import { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; -import { createPortableSyncSchema } from './schema.js'; -import { - attachedTableHasColumn, - countRows, - fileSize, - nowIso, - openReadonlyDb, - sha256File, - sqlStringLiteral, -} from './sqlite-utils.js'; -import { - DEFAULT_PORTABLE_BODY_CHARS, - PORTABLE_SYNC_EXCLUDED_TABLES, - PORTABLE_SYNC_SCHEMA_VERSION, - PORTABLE_SYNC_TABLES, - type PortableSyncExportOptions, - type PortableSyncExportResponse, - type PortableSyncImportResponse, - type PortableSyncManifest, - type PortableSyncProfile, -} from './types.js'; +import { openReadonlyDb } from './sqlite-utils.js'; +import type { PortableSyncImportResponse } from './types.js'; +export { exportPortableSyncDatabase, populatePortableSyncDb } from './export.js'; export * from './types.js'; export { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; export { createPortableSyncSchema } from './schema.js'; -export function exportPortableSyncDatabase(params: PortableSyncExportOptions): PortableSyncExportResponse { - const profile: PortableSyncProfile | 'default' = params.profile ?? 'default'; - const bodyChars = params.bodyChars ?? bodyCharsForProfile(params.profile); - if (!Number.isSafeInteger(bodyChars) || bodyChars < 0) { - throw new Error('bodyChars must be a non-negative integer'); - } - - const sourcePath = path.resolve(params.sourcePath); - const outputPath = path.resolve(params.outputPath); - if (outputPath === sourcePath) { - throw new Error('Refusing to export portable sync database over the source database'); - } - - fs.mkdirSync(path.dirname(outputPath), { recursive: true }); - const tmpPath = `${outputPath}.tmp-${process.pid}-${Date.now()}`; - fs.rmSync(tmpPath, { force: true }); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - - checkpointWal(params.sourceDb); - const out = openDb(tmpPath); - try { - out.pragma('journal_mode = DELETE'); - out.exec('pragma foreign_keys = OFF'); - createPortableSyncSchema(out); - out.exec(`attach database ${sqlStringLiteral(sourcePath)} as source`); - populatePortableSyncDb(out, { - repoId: params.repository.id, - sourcePath, - bodyChars, - }); - out.exec('detach database source'); - out.exec('pragma foreign_keys = ON'); - out.exec('analyze'); - out.exec('pragma optimize'); - out.exec('vacuum'); - } catch (error) { - try { - out.close(); - } catch { - // Ignore cleanup close errors after an export failure. - } - fs.rmSync(tmpPath, { force: true }); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - throw error; - } - out.close(); - - fs.renameSync(tmpPath, outputPath); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); - - const outputBytes = fs.statSync(outputPath).size; - const sourceBytes = fs.statSync(sourcePath).size + fileSize(`${sourcePath}-wal`) + fileSize(`${sourcePath}-shm`); - const verify = openDb(outputPath); - try { - verify.pragma('journal_mode = DELETE'); - const tables = PORTABLE_SYNC_TABLES.map((name) => ({ name, rows: countRows(verify, name) })); - const responseBase: Omit = { - ok: true, - repository: { - id: params.repository.id, - owner: params.repository.owner, - name: params.repository.name, - fullName: params.repository.fullName, - }, - outputPath, - sourcePath, - sourceBytes, - outputBytes, - compressionRatio: sourceBytes > 0 ? outputBytes / sourceBytes : 0, - bodyChars, - profile, - tables, - excluded: [...PORTABLE_SYNC_EXCLUDED_TABLES], - }; - const validation = validatePortableSyncDatabase(outputPath); - const manifest = buildPortableSyncManifest(responseBase, validation.ok); - const manifestPath = params.writeManifest ? writePortableSyncManifest(outputPath, manifest) : null; - - return { - ...responseBase, - manifestPath, - manifest, - }; - } finally { - verify.close(); - } -} - -function bodyCharsForProfile(profile: PortableSyncProfile | undefined): number { - if (profile === 'lean') return 256; - if (profile === 'review') return 1024; - return DEFAULT_PORTABLE_BODY_CHARS; -} - -function buildPortableSyncManifest( - response: Omit, - validationOk: boolean, -): PortableSyncManifest { - return { - schema: PORTABLE_SYNC_SCHEMA_VERSION, - profile: response.profile, - exportedAt: nowIso(), - outputPath: response.outputPath, - outputBytes: response.outputBytes, - sha256: sha256File(response.outputPath), - repository: response.repository, - bodyChars: response.bodyChars, - tables: response.tables, - excluded: response.excluded, - validationOk, - }; -} - -function writePortableSyncManifest(outputPath: string, manifest: PortableSyncManifest): string { - const manifestPath = `${outputPath}.manifest.json`; - fs.writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); - return manifestPath; -} - export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; portablePath: string }): PortableSyncImportResponse { const resolvedPath = path.resolve(params.portablePath); const validation = validatePortableSyncDatabase(resolvedPath); @@ -260,109 +120,6 @@ export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; por } } -export function populatePortableSyncDb(db: SqliteDatabase, params: { repoId: number; sourcePath: string; bodyChars: number }): void { - const exportedAt = nowIso(); - const insertMetadata = db.prepare('insert into portable_metadata (key, value) values (?, ?)'); - insertMetadata.run('schema', PORTABLE_SYNC_SCHEMA_VERSION); - insertMetadata.run('exported_at', exportedAt); - insertMetadata.run('source_path', params.sourcePath); - insertMetadata.run('body_chars', String(params.bodyChars)); - insertMetadata.run('excluded', 'raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs'); - - db.prepare( - `insert into repositories (id, owner, name, full_name, github_repo_id, updated_at) - select id, owner, name, full_name, github_repo_id, updated_at - from source.repositories - where id = ?`, - ).run(params.repoId); - - db.prepare( - `insert into threads ( - id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, html_url, - labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, closed_at_gh, - merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local - ) - select - id, repo_id, github_id, number, kind, state, title, - case - when body is null then null - when ? = 0 then '' - when length(body) <= ? then body - else substr(body, 1, ?) - end, - case when body is null then 0 else length(body) end, - author_login, author_type, html_url, labels_json, assignees_json, content_hash, is_draft, - created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, - updated_at, closed_at_local, close_reason_local - from source.threads - where repo_id = ?`, - ).run(params.bodyChars, params.bodyChars, params.bodyChars, params.repoId); - - db.prepare( - `insert into thread_revisions (id, thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) - select tr.id, tr.thread_id, tr.source_updated_at, tr.content_hash, tr.title_hash, tr.body_hash, tr.labels_hash, tr.created_at - from source.thread_revisions tr - join threads t on t.id = tr.thread_id`, - ).run(); - - db.prepare( - `insert into thread_fingerprints ( - id, thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, - linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at - ) - select - tf.id, tf.thread_revision_id, tf.algorithm_version, tf.fingerprint_hash, tf.fingerprint_slug, tf.title_tokens_json, - tf.body_token_hash, tf.linked_refs_json, tf.file_set_hash, tf.module_buckets_json, tf.simhash64, tf.feature_json, tf.created_at - from source.thread_fingerprints tf - join thread_revisions tr on tr.id = tf.thread_revision_id`, - ).run(); - - db.prepare( - `insert into thread_key_summaries ( - id, thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at - ) - select - tks.id, tks.thread_revision_id, tks.summary_kind, tks.prompt_version, tks.provider, tks.model, - tks.input_hash, tks.output_hash, tks.key_text, tks.created_at - from source.thread_key_summaries tks - join thread_revisions tr on tr.id = tks.thread_revision_id`, - ).run(); - - db.prepare('insert into repo_sync_state select * from source.repo_sync_state where repo_id = ?').run(params.repoId); - db.prepare('insert into repo_pipeline_state select * from source.repo_pipeline_state where repo_id = ?').run(params.repoId); - db.prepare('insert into cluster_groups select * from source.cluster_groups where repo_id = ?').run(params.repoId); - db.prepare( - `insert into cluster_memberships - select cm.* - from source.cluster_memberships cm - join cluster_groups cg on cg.id = cm.cluster_id - join threads t on t.id = cm.thread_id`, - ).run(); - const overrideActorExpr = attachedTableHasColumn(db, 'source', 'cluster_overrides', 'actor_id') ? 'co.actor_id' : 'null'; - db.prepare( - `insert into cluster_overrides ( - id, repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at - ) - select co.id, co.repo_id, co.cluster_id, co.thread_id, co.action, ${overrideActorExpr}, co.reason, co.created_at, co.expires_at - from source.cluster_overrides co - join cluster_groups cg on cg.id = co.cluster_id - join threads t on t.id = co.thread_id - where co.repo_id = ?`, - ).run(params.repoId); - db.prepare( - `insert into cluster_aliases - select ca.* - from source.cluster_aliases ca - join cluster_groups cg on cg.id = ca.cluster_id`, - ).run(); - db.prepare( - `insert into cluster_closures - select cc.* - from source.cluster_closures cc - join cluster_groups cg on cg.id = cc.cluster_id`, - ).run(); -} - type PortableRepositoryRow = { id: number; owner: string; From 030b2fdf9e3746dcd2486c56d57414596d2e6ae9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 12:59:20 -0700 Subject: [PATCH 136/215] refactor: isolate portable sync import --- packages/api-core/src/portable/import.ts | 564 ++++++++++++++++++ packages/api-core/src/portable/sync-store.ts | 569 +------------------ 2 files changed, 566 insertions(+), 567 deletions(-) create mode 100644 packages/api-core/src/portable/import.ts diff --git a/packages/api-core/src/portable/import.ts b/packages/api-core/src/portable/import.ts new file mode 100644 index 0000000..4f52016 --- /dev/null +++ b/packages/api-core/src/portable/import.ts @@ -0,0 +1,564 @@ +import path from 'node:path'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import { validatePortableSyncDatabase } from './inspect.js'; +import { openReadonlyDb } from './sqlite-utils.js'; +import type { PortableSyncImportResponse } from './types.js'; + +export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; portablePath: string }): PortableSyncImportResponse { + const resolvedPath = path.resolve(params.portablePath); + const validation = validatePortableSyncDatabase(resolvedPath); + if (!validation.ok) { + throw new Error(`Portable sync validation failed: ${validation.errors.join('; ')}`); + } + + const portableDb = openReadonlyDb(resolvedPath); + try { + const portableRepo = portableDb.prepare('select * from repositories order by id limit 1').get() as PortableRepositoryRow | undefined; + if (!portableRepo) { + throw new Error('Portable sync database has no repository row'); + } + + const imported = emptyImportCounts(); + const threadIdMap = new Map(); + const revisionIdMap = new Map(); + const clusterIdMap = new Map(); + + const runImport = params.liveDb.transaction(() => { + const repoId = upsertImportedRepository(params.liveDb, portableRepo); + imported.repositories = 1; + + for (const thread of readPortableThreads(portableDb, portableRepo.id)) { + threadIdMap.set(thread.id, upsertImportedThread(params.liveDb, repoId, thread)); + imported.threads += 1; + } + + for (const revision of readPortableThreadRevisions(portableDb)) { + const liveThreadId = threadIdMap.get(revision.thread_id); + if (!liveThreadId) continue; + revisionIdMap.set(revision.id, upsertImportedThreadRevision(params.liveDb, liveThreadId, revision)); + imported.threadRevisions += 1; + } + + for (const fingerprint of readPortableThreadFingerprints(portableDb)) { + const liveRevisionId = revisionIdMap.get(fingerprint.thread_revision_id); + if (!liveRevisionId) continue; + upsertImportedThreadFingerprint(params.liveDb, liveRevisionId, fingerprint); + imported.threadFingerprints += 1; + } + + for (const summary of readPortableThreadKeySummaries(portableDb)) { + const liveRevisionId = revisionIdMap.get(summary.thread_revision_id); + if (!liveRevisionId) continue; + upsertImportedThreadKeySummary(params.liveDb, liveRevisionId, summary); + imported.threadKeySummaries += 1; + } + + if (upsertImportedRepoSyncState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoSyncState = 1; + if (upsertImportedRepoPipelineState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoPipelineState = 1; + + for (const cluster of readPortableClusterGroups(portableDb, portableRepo.id)) { + const representativeThreadId = cluster.representative_thread_id ? (threadIdMap.get(cluster.representative_thread_id) ?? null) : null; + clusterIdMap.set(cluster.id, upsertImportedClusterGroup(params.liveDb, repoId, representativeThreadId, cluster)); + imported.clusterGroups += 1; + } + + for (const membership of readPortableClusterMemberships(portableDb)) { + const liveClusterId = clusterIdMap.get(membership.cluster_id); + const liveThreadId = threadIdMap.get(membership.thread_id); + if (!liveClusterId || !liveThreadId) continue; + upsertImportedClusterMembership(params.liveDb, liveClusterId, liveThreadId, membership); + imported.clusterMemberships += 1; + } + + for (const override of readPortableClusterOverrides(portableDb, portableRepo.id)) { + const liveClusterId = clusterIdMap.get(override.cluster_id); + const liveThreadId = threadIdMap.get(override.thread_id); + if (!liveClusterId || !liveThreadId) continue; + upsertImportedClusterOverride(params.liveDb, repoId, liveClusterId, liveThreadId, override); + imported.clusterOverrides += 1; + } + + for (const alias of readPortableClusterAliases(portableDb)) { + const liveClusterId = clusterIdMap.get(alias.cluster_id); + if (!liveClusterId) continue; + upsertImportedClusterAlias(params.liveDb, liveClusterId, alias); + imported.clusterAliases += 1; + } + + for (const closure of readPortableClusterClosures(portableDb)) { + const liveClusterId = clusterIdMap.get(closure.cluster_id); + if (!liveClusterId) continue; + upsertImportedClusterClosure(params.liveDb, liveClusterId, closure); + imported.clusterClosures += 1; + } + + return repoId; + }); + + const repoId = runImport(); + return { + ok: true, + path: resolvedPath, + repository: { + id: repoId, + owner: portableRepo.owner, + name: portableRepo.name, + fullName: portableRepo.full_name, + }, + validationOk: validation.ok, + imported, + }; + } finally { + portableDb.close(); + } +} + +type PortableRepositoryRow = { + id: number; + owner: string; + name: string; + full_name: string; + github_repo_id: string | null; + updated_at: string; +}; + +type PortableThreadRow = { + id: number; + github_id: string; + number: number; + kind: string; + state: string; + title: string; + body_excerpt: string | null; + author_login: string | null; + author_type: string | null; + html_url: string; + labels_json: string; + assignees_json: string; + content_hash: string; + is_draft: number; + created_at_gh: string | null; + updated_at_gh: string | null; + closed_at_gh: string | null; + merged_at_gh: string | null; + first_pulled_at: string | null; + last_pulled_at: string | null; + updated_at: string; + closed_at_local: string | null; + close_reason_local: string | null; +}; + +type PortableThreadRevisionRow = { + id: number; + thread_id: number; + source_updated_at: string | null; + content_hash: string; + title_hash: string; + body_hash: string; + labels_hash: string; + created_at: string; +}; + +type PortableThreadFingerprintRow = Record & { + thread_revision_id: number; +}; + +type PortableThreadKeySummaryRow = Record & { + thread_revision_id: number; +}; + +type PortableClusterGroupRow = Record & { + id: number; + representative_thread_id: number | null; +}; + +type PortableClusterMembershipRow = Record & { + cluster_id: number; + thread_id: number; +}; + +type PortableClusterOverrideRow = Record & { + cluster_id: number; + thread_id: number; +}; + +type PortableClusterAliasRow = Record & { + cluster_id: number; +}; + +type PortableClusterClosureRow = Record & { + cluster_id: number; +}; + +function emptyImportCounts(): PortableSyncImportResponse['imported'] { + return { + repositories: 0, + threads: 0, + threadRevisions: 0, + threadFingerprints: 0, + threadKeySummaries: 0, + repoSyncState: 0, + repoPipelineState: 0, + clusterGroups: 0, + clusterMemberships: 0, + clusterOverrides: 0, + clusterAliases: 0, + clusterClosures: 0, + }; +} + +function readPortableThreads(db: SqliteDatabase, repoId: number): PortableThreadRow[] { + return db.prepare('select * from threads where repo_id = ? order by id').all(repoId) as PortableThreadRow[]; +} + +function readPortableThreadRevisions(db: SqliteDatabase): PortableThreadRevisionRow[] { + return db.prepare('select * from thread_revisions order by id').all() as PortableThreadRevisionRow[]; +} + +function readPortableThreadFingerprints(db: SqliteDatabase): PortableThreadFingerprintRow[] { + return db.prepare('select * from thread_fingerprints order by id').all() as PortableThreadFingerprintRow[]; +} + +function readPortableThreadKeySummaries(db: SqliteDatabase): PortableThreadKeySummaryRow[] { + return db.prepare('select * from thread_key_summaries order by id').all() as PortableThreadKeySummaryRow[]; +} + +function readPortableClusterGroups(db: SqliteDatabase, repoId: number): PortableClusterGroupRow[] { + return db.prepare('select * from cluster_groups where repo_id = ? order by id').all(repoId) as PortableClusterGroupRow[]; +} + +function readPortableClusterMemberships(db: SqliteDatabase): PortableClusterMembershipRow[] { + return db.prepare('select * from cluster_memberships order by cluster_id, thread_id').all() as PortableClusterMembershipRow[]; +} + +function readPortableClusterOverrides(db: SqliteDatabase, repoId: number): PortableClusterOverrideRow[] { + return db.prepare('select * from cluster_overrides where repo_id = ? order by id').all(repoId) as PortableClusterOverrideRow[]; +} + +function readPortableClusterAliases(db: SqliteDatabase): PortableClusterAliasRow[] { + return db.prepare('select * from cluster_aliases order by cluster_id, alias_slug').all() as PortableClusterAliasRow[]; +} + +function readPortableClusterClosures(db: SqliteDatabase): PortableClusterClosureRow[] { + return db.prepare('select * from cluster_closures order by cluster_id').all() as PortableClusterClosureRow[]; +} + +function upsertImportedRepository(db: SqliteDatabase, row: PortableRepositoryRow): number { + db.prepare( + `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, '{}', ?) + on conflict(full_name) do update set + owner = excluded.owner, + name = excluded.name, + github_repo_id = excluded.github_repo_id, + updated_at = excluded.updated_at`, + ).run(row.owner, row.name, row.full_name, row.github_repo_id, row.updated_at); + const live = db.prepare('select id from repositories where full_name = ?').get(row.full_name) as { id: number }; + return live.id; +} + +function upsertImportedThread(db: SqliteDatabase, repoId: number, row: PortableThreadRow): number { + db.prepare( + `insert into threads ( + repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '{}', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, kind, number) do update set + github_id = excluded.github_id, + state = excluded.state, + title = excluded.title, + body = coalesce(threads.body, excluded.body), + author_login = excluded.author_login, + author_type = excluded.author_type, + html_url = excluded.html_url, + labels_json = excluded.labels_json, + assignees_json = excluded.assignees_json, + content_hash = excluded.content_hash, + is_draft = excluded.is_draft, + created_at_gh = excluded.created_at_gh, + updated_at_gh = excluded.updated_at_gh, + closed_at_gh = excluded.closed_at_gh, + merged_at_gh = excluded.merged_at_gh, + first_pulled_at = coalesce(threads.first_pulled_at, excluded.first_pulled_at), + last_pulled_at = excluded.last_pulled_at, + updated_at = excluded.updated_at, + closed_at_local = excluded.closed_at_local, + close_reason_local = excluded.close_reason_local`, + ).run( + repoId, + row.github_id, + row.number, + row.kind, + row.state, + row.title, + row.body_excerpt, + row.author_login, + row.author_type, + row.html_url, + row.labels_json, + row.assignees_json, + row.content_hash, + row.is_draft, + row.created_at_gh, + row.updated_at_gh, + row.closed_at_gh, + row.merged_at_gh, + row.first_pulled_at, + row.last_pulled_at, + row.updated_at, + row.closed_at_local, + row.close_reason_local, + ); + const live = db.prepare('select id from threads where repo_id = ? and kind = ? and number = ?').get(repoId, row.kind, row.number) as { id: number }; + return live.id; +} + +function upsertImportedThreadRevision(db: SqliteDatabase, liveThreadId: number, row: PortableThreadRevisionRow): number { + db.prepare( + `insert into thread_revisions (thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) + values (?, ?, ?, ?, ?, ?, ?) + on conflict(thread_id, content_hash) do update set + source_updated_at = excluded.source_updated_at, + title_hash = excluded.title_hash, + body_hash = excluded.body_hash, + labels_hash = excluded.labels_hash`, + ).run(liveThreadId, row.source_updated_at, row.content_hash, row.title_hash, row.body_hash, row.labels_hash, row.created_at); + const live = db.prepare('select id from thread_revisions where thread_id = ? and content_hash = ?').get(liveThreadId, row.content_hash) as { + id: number; + }; + return live.id; +} + +function upsertImportedThreadFingerprint(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadFingerprintRow): void { + db.prepare( + `insert into thread_fingerprints ( + thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, + linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, algorithm_version) do update set + fingerprint_hash = excluded.fingerprint_hash, + fingerprint_slug = excluded.fingerprint_slug, + title_tokens_json = excluded.title_tokens_json, + body_token_hash = excluded.body_token_hash, + linked_refs_json = excluded.linked_refs_json, + file_set_hash = excluded.file_set_hash, + module_buckets_json = excluded.module_buckets_json, + simhash64 = excluded.simhash64, + feature_json = excluded.feature_json`, + ).run( + liveRevisionId, + row.algorithm_version, + row.fingerprint_hash, + row.fingerprint_slug, + row.title_tokens_json, + row.body_token_hash, + row.linked_refs_json, + row.file_set_hash, + row.module_buckets_json, + row.simhash64, + row.feature_json, + row.created_at, + ); +} + +function upsertImportedThreadKeySummary(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadKeySummaryRow): void { + db.prepare( + `insert into thread_key_summaries ( + thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(thread_revision_id, summary_kind, prompt_version, provider, model) do update set + input_hash = excluded.input_hash, + output_hash = excluded.output_hash, + key_text = excluded.key_text, + created_at = excluded.created_at`, + ).run( + liveRevisionId, + row.summary_kind, + row.prompt_version, + row.provider, + row.model, + row.input_hash, + row.output_hash, + row.key_text, + row.created_at, + ); +} + +function upsertImportedRepoSyncState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { + const row = portableDb.prepare('select * from repo_sync_state where repo_id = ?').get(portableRepoId) as Record | undefined; + if (!row) return false; + db.prepare( + `insert into repo_sync_state ( + repo_id, last_full_open_scan_started_at, last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at, last_open_close_reconciled_at, updated_at + ) + values (?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + last_full_open_scan_started_at = excluded.last_full_open_scan_started_at, + last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at, + last_open_close_reconciled_at = excluded.last_open_close_reconciled_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + row.last_full_open_scan_started_at, + row.last_overlapping_open_scan_completed_at, + row.last_non_overlapping_scan_completed_at, + row.last_open_close_reconciled_at, + row.updated_at, + ); + return true; +} + +function upsertImportedRepoPipelineState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { + const row = portableDb.prepare('select * from repo_pipeline_state where repo_id = ?').get(portableRepoId) as Record | undefined; + if (!row) return false; + db.prepare( + `insert into repo_pipeline_state ( + repo_id, summary_model, summary_prompt_version, embedding_basis, embed_model, embed_dimensions, + embed_pipeline_version, vector_backend, vectors_current_at, clusters_current_at, updated_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + summary_model = excluded.summary_model, + summary_prompt_version = excluded.summary_prompt_version, + embedding_basis = excluded.embedding_basis, + embed_model = excluded.embed_model, + embed_dimensions = excluded.embed_dimensions, + embed_pipeline_version = excluded.embed_pipeline_version, + vector_backend = excluded.vector_backend, + vectors_current_at = excluded.vectors_current_at, + clusters_current_at = excluded.clusters_current_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + row.summary_model, + row.summary_prompt_version, + row.embedding_basis, + row.embed_model, + row.embed_dimensions, + row.embed_pipeline_version, + row.vector_backend, + row.vectors_current_at, + row.clusters_current_at, + row.updated_at, + ); + return true; +} + +function upsertImportedClusterGroup( + db: SqliteDatabase, + repoId: number, + representativeThreadId: number | null, + row: PortableClusterGroupRow, +): number { + db.prepare( + `insert into cluster_groups ( + repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, stable_key) do update set + stable_slug = excluded.stable_slug, + status = excluded.status, + cluster_type = excluded.cluster_type, + representative_thread_id = excluded.representative_thread_id, + title = excluded.title, + updated_at = excluded.updated_at, + closed_at = excluded.closed_at`, + ).run( + repoId, + row.stable_key, + row.stable_slug, + row.status, + row.cluster_type, + representativeThreadId, + row.title, + row.created_at, + row.updated_at, + row.closed_at, + ); + const live = db.prepare('select id from cluster_groups where repo_id = ? and stable_key = ?').get(repoId, row.stable_key) as { id: number }; + return live.id; +} + +function upsertImportedClusterMembership( + db: SqliteDatabase, + liveClusterId: number, + liveThreadId: number, + row: PortableClusterMembershipRow, +): void { + db.prepare( + `insert into cluster_memberships ( + cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, + added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at + ) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(cluster_id, thread_id) do update set + role = excluded.role, + state = excluded.state, + score_to_representative = excluded.score_to_representative, + last_seen_run_id = excluded.last_seen_run_id, + added_by = excluded.added_by, + removed_by = excluded.removed_by, + added_reason_json = excluded.added_reason_json, + removed_reason_json = excluded.removed_reason_json, + updated_at = excluded.updated_at, + removed_at = excluded.removed_at`, + ).run( + liveClusterId, + liveThreadId, + row.role, + row.state, + row.score_to_representative, + row.first_seen_run_id, + row.last_seen_run_id, + row.added_by, + row.removed_by, + row.added_reason_json, + row.removed_reason_json, + row.created_at, + row.updated_at, + row.removed_at, + ); +} + +function upsertImportedClusterOverride( + db: SqliteDatabase, + repoId: number, + liveClusterId: number, + liveThreadId: number, + row: PortableClusterOverrideRow, +): void { + db.prepare( + `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at) + values (?, ?, ?, ?, ?, ?, ?, ?) + on conflict(cluster_id, thread_id, action) do update set + reason = excluded.reason, + actor_id = excluded.actor_id, + expires_at = excluded.expires_at`, + ).run(repoId, liveClusterId, liveThreadId, row.action, row.actor_id, row.reason, row.created_at, row.expires_at); +} + +function upsertImportedClusterAlias(db: SqliteDatabase, liveClusterId: number, row: PortableClusterAliasRow): void { + db.prepare( + `insert into cluster_aliases (cluster_id, alias_slug, reason, created_at) + values (?, ?, ?, ?) + on conflict(cluster_id, alias_slug) do update set reason = excluded.reason`, + ).run(liveClusterId, row.alias_slug, row.reason, row.created_at); +} + +function upsertImportedClusterClosure(db: SqliteDatabase, liveClusterId: number, row: PortableClusterClosureRow): void { + db.prepare( + `insert into cluster_closures (cluster_id, reason, actor_kind, created_at, updated_at) + values (?, ?, ?, ?, ?) + on conflict(cluster_id) do update set + reason = excluded.reason, + actor_kind = excluded.actor_kind, + updated_at = excluded.updated_at`, + ).run(liveClusterId, row.reason, row.actor_kind, row.created_at, row.updated_at); +} diff --git a/packages/api-core/src/portable/sync-store.ts b/packages/api-core/src/portable/sync-store.ts index 5f3a9aa..e167300 100644 --- a/packages/api-core/src/portable/sync-store.ts +++ b/packages/api-core/src/portable/sync-store.ts @@ -1,570 +1,5 @@ -import path from 'node:path'; - -import type { SqliteDatabase } from '../db/sqlite.js'; -import { exportPortableSyncDatabase, populatePortableSyncDb } from './export.js'; -import { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; -import { openReadonlyDb } from './sqlite-utils.js'; -import type { PortableSyncImportResponse } from './types.js'; - export { exportPortableSyncDatabase, populatePortableSyncDb } from './export.js'; -export * from './types.js'; +export { importPortableSyncDatabase } from './import.js'; export { portableSyncSizeReport, portableSyncStatusReport, validatePortableSyncDatabase } from './inspect.js'; export { createPortableSyncSchema } from './schema.js'; - -export function importPortableSyncDatabase(params: { liveDb: SqliteDatabase; portablePath: string }): PortableSyncImportResponse { - const resolvedPath = path.resolve(params.portablePath); - const validation = validatePortableSyncDatabase(resolvedPath); - if (!validation.ok) { - throw new Error(`Portable sync validation failed: ${validation.errors.join('; ')}`); - } - - const portableDb = openReadonlyDb(resolvedPath); - try { - const portableRepo = portableDb.prepare('select * from repositories order by id limit 1').get() as PortableRepositoryRow | undefined; - if (!portableRepo) { - throw new Error('Portable sync database has no repository row'); - } - - const imported = emptyImportCounts(); - const threadIdMap = new Map(); - const revisionIdMap = new Map(); - const clusterIdMap = new Map(); - - const runImport = params.liveDb.transaction(() => { - const repoId = upsertImportedRepository(params.liveDb, portableRepo); - imported.repositories = 1; - - for (const thread of readPortableThreads(portableDb, portableRepo.id)) { - threadIdMap.set(thread.id, upsertImportedThread(params.liveDb, repoId, thread)); - imported.threads += 1; - } - - for (const revision of readPortableThreadRevisions(portableDb)) { - const liveThreadId = threadIdMap.get(revision.thread_id); - if (!liveThreadId) continue; - revisionIdMap.set(revision.id, upsertImportedThreadRevision(params.liveDb, liveThreadId, revision)); - imported.threadRevisions += 1; - } - - for (const fingerprint of readPortableThreadFingerprints(portableDb)) { - const liveRevisionId = revisionIdMap.get(fingerprint.thread_revision_id); - if (!liveRevisionId) continue; - upsertImportedThreadFingerprint(params.liveDb, liveRevisionId, fingerprint); - imported.threadFingerprints += 1; - } - - for (const summary of readPortableThreadKeySummaries(portableDb)) { - const liveRevisionId = revisionIdMap.get(summary.thread_revision_id); - if (!liveRevisionId) continue; - upsertImportedThreadKeySummary(params.liveDb, liveRevisionId, summary); - imported.threadKeySummaries += 1; - } - - if (upsertImportedRepoSyncState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoSyncState = 1; - if (upsertImportedRepoPipelineState(params.liveDb, repoId, portableDb, portableRepo.id)) imported.repoPipelineState = 1; - - for (const cluster of readPortableClusterGroups(portableDb, portableRepo.id)) { - const representativeThreadId = cluster.representative_thread_id ? (threadIdMap.get(cluster.representative_thread_id) ?? null) : null; - clusterIdMap.set(cluster.id, upsertImportedClusterGroup(params.liveDb, repoId, representativeThreadId, cluster)); - imported.clusterGroups += 1; - } - - for (const membership of readPortableClusterMemberships(portableDb)) { - const liveClusterId = clusterIdMap.get(membership.cluster_id); - const liveThreadId = threadIdMap.get(membership.thread_id); - if (!liveClusterId || !liveThreadId) continue; - upsertImportedClusterMembership(params.liveDb, liveClusterId, liveThreadId, membership); - imported.clusterMemberships += 1; - } - - for (const override of readPortableClusterOverrides(portableDb, portableRepo.id)) { - const liveClusterId = clusterIdMap.get(override.cluster_id); - const liveThreadId = threadIdMap.get(override.thread_id); - if (!liveClusterId || !liveThreadId) continue; - upsertImportedClusterOverride(params.liveDb, repoId, liveClusterId, liveThreadId, override); - imported.clusterOverrides += 1; - } - - for (const alias of readPortableClusterAliases(portableDb)) { - const liveClusterId = clusterIdMap.get(alias.cluster_id); - if (!liveClusterId) continue; - upsertImportedClusterAlias(params.liveDb, liveClusterId, alias); - imported.clusterAliases += 1; - } - - for (const closure of readPortableClusterClosures(portableDb)) { - const liveClusterId = clusterIdMap.get(closure.cluster_id); - if (!liveClusterId) continue; - upsertImportedClusterClosure(params.liveDb, liveClusterId, closure); - imported.clusterClosures += 1; - } - - return repoId; - }); - - const repoId = runImport(); - return { - ok: true, - path: resolvedPath, - repository: { - id: repoId, - owner: portableRepo.owner, - name: portableRepo.name, - fullName: portableRepo.full_name, - }, - validationOk: validation.ok, - imported, - }; - } finally { - portableDb.close(); - } -} - -type PortableRepositoryRow = { - id: number; - owner: string; - name: string; - full_name: string; - github_repo_id: string | null; - updated_at: string; -}; - -type PortableThreadRow = { - id: number; - github_id: string; - number: number; - kind: string; - state: string; - title: string; - body_excerpt: string | null; - author_login: string | null; - author_type: string | null; - html_url: string; - labels_json: string; - assignees_json: string; - content_hash: string; - is_draft: number; - created_at_gh: string | null; - updated_at_gh: string | null; - closed_at_gh: string | null; - merged_at_gh: string | null; - first_pulled_at: string | null; - last_pulled_at: string | null; - updated_at: string; - closed_at_local: string | null; - close_reason_local: string | null; -}; - -type PortableThreadRevisionRow = { - id: number; - thread_id: number; - source_updated_at: string | null; - content_hash: string; - title_hash: string; - body_hash: string; - labels_hash: string; - created_at: string; -}; - -type PortableThreadFingerprintRow = Record & { - thread_revision_id: number; -}; - -type PortableThreadKeySummaryRow = Record & { - thread_revision_id: number; -}; - -type PortableClusterGroupRow = Record & { - id: number; - representative_thread_id: number | null; -}; - -type PortableClusterMembershipRow = Record & { - cluster_id: number; - thread_id: number; -}; - -type PortableClusterOverrideRow = Record & { - cluster_id: number; - thread_id: number; -}; - -type PortableClusterAliasRow = Record & { - cluster_id: number; -}; - -type PortableClusterClosureRow = Record & { - cluster_id: number; -}; - -function emptyImportCounts(): PortableSyncImportResponse['imported'] { - return { - repositories: 0, - threads: 0, - threadRevisions: 0, - threadFingerprints: 0, - threadKeySummaries: 0, - repoSyncState: 0, - repoPipelineState: 0, - clusterGroups: 0, - clusterMemberships: 0, - clusterOverrides: 0, - clusterAliases: 0, - clusterClosures: 0, - }; -} - -function readPortableThreads(db: SqliteDatabase, repoId: number): PortableThreadRow[] { - return db.prepare('select * from threads where repo_id = ? order by id').all(repoId) as PortableThreadRow[]; -} - -function readPortableThreadRevisions(db: SqliteDatabase): PortableThreadRevisionRow[] { - return db.prepare('select * from thread_revisions order by id').all() as PortableThreadRevisionRow[]; -} - -function readPortableThreadFingerprints(db: SqliteDatabase): PortableThreadFingerprintRow[] { - return db.prepare('select * from thread_fingerprints order by id').all() as PortableThreadFingerprintRow[]; -} - -function readPortableThreadKeySummaries(db: SqliteDatabase): PortableThreadKeySummaryRow[] { - return db.prepare('select * from thread_key_summaries order by id').all() as PortableThreadKeySummaryRow[]; -} - -function readPortableClusterGroups(db: SqliteDatabase, repoId: number): PortableClusterGroupRow[] { - return db.prepare('select * from cluster_groups where repo_id = ? order by id').all(repoId) as PortableClusterGroupRow[]; -} - -function readPortableClusterMemberships(db: SqliteDatabase): PortableClusterMembershipRow[] { - return db.prepare('select * from cluster_memberships order by cluster_id, thread_id').all() as PortableClusterMembershipRow[]; -} - -function readPortableClusterOverrides(db: SqliteDatabase, repoId: number): PortableClusterOverrideRow[] { - return db.prepare('select * from cluster_overrides where repo_id = ? order by id').all(repoId) as PortableClusterOverrideRow[]; -} - -function readPortableClusterAliases(db: SqliteDatabase): PortableClusterAliasRow[] { - return db.prepare('select * from cluster_aliases order by cluster_id, alias_slug').all() as PortableClusterAliasRow[]; -} - -function readPortableClusterClosures(db: SqliteDatabase): PortableClusterClosureRow[] { - return db.prepare('select * from cluster_closures order by cluster_id').all() as PortableClusterClosureRow[]; -} - -function upsertImportedRepository(db: SqliteDatabase, row: PortableRepositoryRow): number { - db.prepare( - `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at) - values (?, ?, ?, ?, '{}', ?) - on conflict(full_name) do update set - owner = excluded.owner, - name = excluded.name, - github_repo_id = excluded.github_repo_id, - updated_at = excluded.updated_at`, - ).run(row.owner, row.name, row.full_name, row.github_repo_id, row.updated_at); - const live = db.prepare('select id from repositories where full_name = ?').get(row.full_name) as { id: number }; - return live.id; -} - -function upsertImportedThread(db: SqliteDatabase, repoId: number, row: PortableThreadRow): number { - db.prepare( - `insert into threads ( - repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, - labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, - closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '{}', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(repo_id, kind, number) do update set - github_id = excluded.github_id, - state = excluded.state, - title = excluded.title, - body = coalesce(threads.body, excluded.body), - author_login = excluded.author_login, - author_type = excluded.author_type, - html_url = excluded.html_url, - labels_json = excluded.labels_json, - assignees_json = excluded.assignees_json, - content_hash = excluded.content_hash, - is_draft = excluded.is_draft, - created_at_gh = excluded.created_at_gh, - updated_at_gh = excluded.updated_at_gh, - closed_at_gh = excluded.closed_at_gh, - merged_at_gh = excluded.merged_at_gh, - first_pulled_at = coalesce(threads.first_pulled_at, excluded.first_pulled_at), - last_pulled_at = excluded.last_pulled_at, - updated_at = excluded.updated_at, - closed_at_local = excluded.closed_at_local, - close_reason_local = excluded.close_reason_local`, - ).run( - repoId, - row.github_id, - row.number, - row.kind, - row.state, - row.title, - row.body_excerpt, - row.author_login, - row.author_type, - row.html_url, - row.labels_json, - row.assignees_json, - row.content_hash, - row.is_draft, - row.created_at_gh, - row.updated_at_gh, - row.closed_at_gh, - row.merged_at_gh, - row.first_pulled_at, - row.last_pulled_at, - row.updated_at, - row.closed_at_local, - row.close_reason_local, - ); - const live = db.prepare('select id from threads where repo_id = ? and kind = ? and number = ?').get(repoId, row.kind, row.number) as { id: number }; - return live.id; -} - -function upsertImportedThreadRevision(db: SqliteDatabase, liveThreadId: number, row: PortableThreadRevisionRow): number { - db.prepare( - `insert into thread_revisions (thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) - values (?, ?, ?, ?, ?, ?, ?) - on conflict(thread_id, content_hash) do update set - source_updated_at = excluded.source_updated_at, - title_hash = excluded.title_hash, - body_hash = excluded.body_hash, - labels_hash = excluded.labels_hash`, - ).run(liveThreadId, row.source_updated_at, row.content_hash, row.title_hash, row.body_hash, row.labels_hash, row.created_at); - const live = db.prepare('select id from thread_revisions where thread_id = ? and content_hash = ?').get(liveThreadId, row.content_hash) as { - id: number; - }; - return live.id; -} - -function upsertImportedThreadFingerprint(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadFingerprintRow): void { - db.prepare( - `insert into thread_fingerprints ( - thread_revision_id, algorithm_version, fingerprint_hash, fingerprint_slug, title_tokens_json, body_token_hash, - linked_refs_json, file_set_hash, module_buckets_json, simhash64, feature_json, created_at - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(thread_revision_id, algorithm_version) do update set - fingerprint_hash = excluded.fingerprint_hash, - fingerprint_slug = excluded.fingerprint_slug, - title_tokens_json = excluded.title_tokens_json, - body_token_hash = excluded.body_token_hash, - linked_refs_json = excluded.linked_refs_json, - file_set_hash = excluded.file_set_hash, - module_buckets_json = excluded.module_buckets_json, - simhash64 = excluded.simhash64, - feature_json = excluded.feature_json`, - ).run( - liveRevisionId, - row.algorithm_version, - row.fingerprint_hash, - row.fingerprint_slug, - row.title_tokens_json, - row.body_token_hash, - row.linked_refs_json, - row.file_set_hash, - row.module_buckets_json, - row.simhash64, - row.feature_json, - row.created_at, - ); -} - -function upsertImportedThreadKeySummary(db: SqliteDatabase, liveRevisionId: number, row: PortableThreadKeySummaryRow): void { - db.prepare( - `insert into thread_key_summaries ( - thread_revision_id, summary_kind, prompt_version, provider, model, input_hash, output_hash, key_text, created_at - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(thread_revision_id, summary_kind, prompt_version, provider, model) do update set - input_hash = excluded.input_hash, - output_hash = excluded.output_hash, - key_text = excluded.key_text, - created_at = excluded.created_at`, - ).run( - liveRevisionId, - row.summary_kind, - row.prompt_version, - row.provider, - row.model, - row.input_hash, - row.output_hash, - row.key_text, - row.created_at, - ); -} - -function upsertImportedRepoSyncState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { - const row = portableDb.prepare('select * from repo_sync_state where repo_id = ?').get(portableRepoId) as Record | undefined; - if (!row) return false; - db.prepare( - `insert into repo_sync_state ( - repo_id, last_full_open_scan_started_at, last_overlapping_open_scan_completed_at, - last_non_overlapping_scan_completed_at, last_open_close_reconciled_at, updated_at - ) - values (?, ?, ?, ?, ?, ?) - on conflict(repo_id) do update set - last_full_open_scan_started_at = excluded.last_full_open_scan_started_at, - last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at, - last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at, - last_open_close_reconciled_at = excluded.last_open_close_reconciled_at, - updated_at = excluded.updated_at`, - ).run( - repoId, - row.last_full_open_scan_started_at, - row.last_overlapping_open_scan_completed_at, - row.last_non_overlapping_scan_completed_at, - row.last_open_close_reconciled_at, - row.updated_at, - ); - return true; -} - -function upsertImportedRepoPipelineState(db: SqliteDatabase, repoId: number, portableDb: SqliteDatabase, portableRepoId: number): boolean { - const row = portableDb.prepare('select * from repo_pipeline_state where repo_id = ?').get(portableRepoId) as Record | undefined; - if (!row) return false; - db.prepare( - `insert into repo_pipeline_state ( - repo_id, summary_model, summary_prompt_version, embedding_basis, embed_model, embed_dimensions, - embed_pipeline_version, vector_backend, vectors_current_at, clusters_current_at, updated_at - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(repo_id) do update set - summary_model = excluded.summary_model, - summary_prompt_version = excluded.summary_prompt_version, - embedding_basis = excluded.embedding_basis, - embed_model = excluded.embed_model, - embed_dimensions = excluded.embed_dimensions, - embed_pipeline_version = excluded.embed_pipeline_version, - vector_backend = excluded.vector_backend, - vectors_current_at = excluded.vectors_current_at, - clusters_current_at = excluded.clusters_current_at, - updated_at = excluded.updated_at`, - ).run( - repoId, - row.summary_model, - row.summary_prompt_version, - row.embedding_basis, - row.embed_model, - row.embed_dimensions, - row.embed_pipeline_version, - row.vector_backend, - row.vectors_current_at, - row.clusters_current_at, - row.updated_at, - ); - return true; -} - -function upsertImportedClusterGroup( - db: SqliteDatabase, - repoId: number, - representativeThreadId: number | null, - row: PortableClusterGroupRow, -): number { - db.prepare( - `insert into cluster_groups ( - repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at, closed_at - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(repo_id, stable_key) do update set - stable_slug = excluded.stable_slug, - status = excluded.status, - cluster_type = excluded.cluster_type, - representative_thread_id = excluded.representative_thread_id, - title = excluded.title, - updated_at = excluded.updated_at, - closed_at = excluded.closed_at`, - ).run( - repoId, - row.stable_key, - row.stable_slug, - row.status, - row.cluster_type, - representativeThreadId, - row.title, - row.created_at, - row.updated_at, - row.closed_at, - ); - const live = db.prepare('select id from cluster_groups where repo_id = ? and stable_key = ?').get(repoId, row.stable_key) as { id: number }; - return live.id; -} - -function upsertImportedClusterMembership( - db: SqliteDatabase, - liveClusterId: number, - liveThreadId: number, - row: PortableClusterMembershipRow, -): void { - db.prepare( - `insert into cluster_memberships ( - cluster_id, thread_id, role, state, score_to_representative, first_seen_run_id, last_seen_run_id, - added_by, removed_by, added_reason_json, removed_reason_json, created_at, updated_at, removed_at - ) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(cluster_id, thread_id) do update set - role = excluded.role, - state = excluded.state, - score_to_representative = excluded.score_to_representative, - last_seen_run_id = excluded.last_seen_run_id, - added_by = excluded.added_by, - removed_by = excluded.removed_by, - added_reason_json = excluded.added_reason_json, - removed_reason_json = excluded.removed_reason_json, - updated_at = excluded.updated_at, - removed_at = excluded.removed_at`, - ).run( - liveClusterId, - liveThreadId, - row.role, - row.state, - row.score_to_representative, - row.first_seen_run_id, - row.last_seen_run_id, - row.added_by, - row.removed_by, - row.added_reason_json, - row.removed_reason_json, - row.created_at, - row.updated_at, - row.removed_at, - ); -} - -function upsertImportedClusterOverride( - db: SqliteDatabase, - repoId: number, - liveClusterId: number, - liveThreadId: number, - row: PortableClusterOverrideRow, -): void { - db.prepare( - `insert into cluster_overrides (repo_id, cluster_id, thread_id, action, actor_id, reason, created_at, expires_at) - values (?, ?, ?, ?, ?, ?, ?, ?) - on conflict(cluster_id, thread_id, action) do update set - reason = excluded.reason, - actor_id = excluded.actor_id, - expires_at = excluded.expires_at`, - ).run(repoId, liveClusterId, liveThreadId, row.action, row.actor_id, row.reason, row.created_at, row.expires_at); -} - -function upsertImportedClusterAlias(db: SqliteDatabase, liveClusterId: number, row: PortableClusterAliasRow): void { - db.prepare( - `insert into cluster_aliases (cluster_id, alias_slug, reason, created_at) - values (?, ?, ?, ?) - on conflict(cluster_id, alias_slug) do update set reason = excluded.reason`, - ).run(liveClusterId, row.alias_slug, row.reason, row.created_at); -} - -function upsertImportedClusterClosure(db: SqliteDatabase, liveClusterId: number, row: PortableClusterClosureRow): void { - db.prepare( - `insert into cluster_closures (cluster_id, reason, actor_kind, created_at, updated_at) - values (?, ?, ?, ?, ?) - on conflict(cluster_id) do update set - reason = excluded.reason, - actor_kind = excluded.actor_kind, - updated_at = excluded.updated_at`, - ).run(liveClusterId, row.reason, row.actor_kind, row.created_at, row.updated_at); -} +export * from './types.js'; From 9296ce8a1b57d83d4fe57fbc8cd0c00832930741 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:03:07 -0700 Subject: [PATCH 137/215] refactor: extract service types --- packages/api-core/src/service-types.ts | 362 +++++++++++++++++++++++ packages/api-core/src/service.ts | 386 +++---------------------- 2 files changed, 397 insertions(+), 351 deletions(-) create mode 100644 packages/api-core/src/service-types.ts diff --git a/packages/api-core/src/service-types.ts b/packages/api-core/src/service-types.ts new file mode 100644 index 0000000..9717b66 --- /dev/null +++ b/packages/api-core/src/service-types.ts @@ -0,0 +1,362 @@ +import type { + HealthResponse, + NeighborsResponse, + RepositoryDto, + SearchHitDto, + SearchResponse, + ThreadDto, +} from '@ghcrawl/api-contract'; + +import type { ConfigValueSource, EmbeddingBasis } from './config.js'; +import type { PortableSyncProfile } from './portable/sync-store.js'; + +export type RunTable = 'sync_runs' | 'summary_runs' | 'embedding_runs' | 'cluster_runs'; + +export type ThreadRow = { + id: number; + repo_id: number; + number: number; + kind: 'issue' | 'pull_request'; + state: string; + closed_at_gh: string | null; + closed_at_local: string | null; + close_reason_local: string | null; + title: string; + body: string | null; + author_login: string | null; + html_url: string; + labels_json: string; + updated_at_gh: string | null; + first_pulled_at: string | null; + last_pulled_at: string | null; +}; + +export type CommentSeed = { + githubId: string; + commentType: string; + authorLogin: string | null; + authorType: string | null; + body: string; + isBot: boolean; + rawJson: string; + createdAtGh: string | null; + updatedAtGh: string | null; +}; + +export type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary' | 'llm_key_summary'; +export type SimilaritySourceKind = EmbeddingSourceKind | 'deterministic_fingerprint'; +export type AggregatedClusterEdge = { + leftThreadId: number; + rightThreadId: number; + score: number; + sourceKinds: Set; +}; + +export type EmbeddingTask = { + threadId: number; + threadNumber: number; + sourceKind: EmbeddingSourceKind; + text: string; + contentHash: string; + estimatedTokens: number; + wasTruncated: boolean; +}; + +export type StoredEmbeddingRow = ThreadRow & { + source_kind: EmbeddingSourceKind; + embedding_json: string; +}; + +export type ActiveVectorTask = { + threadId: number; + threadNumber: number; + basis: EmbeddingBasis; + text: string; + contentHash: string; + estimatedTokens: number; + wasTruncated: boolean; +}; + +export type KeySummaryTask = { + threadId: number; + threadNumber: number; + revisionId: number; + inputHash: string; + text: string; +}; + +export type ActiveVectorRow = ThreadRow & { + basis: EmbeddingBasis; + model: string; + dimensions: number; + content_hash: string; + vector_json: Buffer | string; + vector_backend: string; +}; + +export type SqliteMaintenanceStats = { + pageSize: number; + pageCount: number; + freelistPages: number; + bytes: number; + walBytes: number; + shmBytes: number; + sidecarBytes: number; +}; + +export type DurableTuiClosure = { + clusterId: number; + status: 'active' | 'closed' | 'merged' | 'split'; + closedAt: string | null; + reason: string | null; +}; + +export type RepoPipelineStateRow = { + repo_id: number; + summary_model: string; + summary_prompt_version: string; + embedding_basis: EmbeddingBasis; + embed_model: string; + embed_dimensions: number; + embed_pipeline_version: string; + vector_backend: string; + vectors_current_at: string | null; + clusters_current_at: string | null; + updated_at: string; +}; + +export type ClusterExperimentMemoryStats = { + rssBeforeBytes: number; + rssAfterBytes: number; + peakRssBytes: number; + heapUsedBeforeBytes: number; + heapUsedAfterBytes: number; + peakHeapUsedBytes: number; +}; + +export type ClusterExperimentSizeBucket = { + size: number; + count: number; +}; + +export type ClusterExperimentClusterSizeStats = { + soloClusters: number; + maxClusterSize: number; + topClusterSizes: number[]; + histogram: ClusterExperimentSizeBucket[]; +}; + +export type ClusterExperimentCluster = { + representativeThreadId: number; + memberThreadIds: number[]; +}; + +export type ClusterExperimentResult = { + backend: 'exact' | 'vectorlite'; + repository: RepositoryDto; + tempDbPath: string | null; + threads: number; + sourceKinds: number; + edges: number; + clusters: number; + timingBasis: 'cluster-only'; + durationMs: number; + totalDurationMs: number; + loadMs: number; + setupMs: number; + edgeBuildMs: number; + indexBuildMs: number; + queryMs: number; + clusterBuildMs: number; + candidateK: number; + memory: ClusterExperimentMemoryStats; + clusterSizes: ClusterExperimentClusterSizeStats; + clustersDetail: ClusterExperimentCluster[] | null; +}; + +export type SummaryModelPricing = { + inputCostPerM: number; + cachedInputCostPerM: number; + outputCostPerM: number; +}; + +export type EmbeddingWorkset = { + rows: Array<{ + id: number; + number: number; + title: string; + body: string | null; + }>; + tasks: ActiveVectorTask[]; + existing: Map; + pending: ActiveVectorTask[]; + missingSummaryThreadNumbers: number[]; +}; + +export type SyncCursorState = { + lastFullOpenScanStartedAt: string | null; + lastOverlappingOpenScanCompletedAt: string | null; + lastNonOverlappingScanCompletedAt: string | null; + lastReconciledOpenCloseAt: string | null; +}; + +export type SyncRunStats = { + threadsSynced: number; + commentsSynced: number; + codeFilesSynced: number; + threadsClosed: number; + threadsClosedFromClosedSweep?: number; + threadsClosedFromClosedBackfill?: number; + threadsClosedFromDirectReconcile?: number; + directReconcileSkippedStaleThreadCount?: number; + crawlStartedAt: string; + requestedSince: string | null; + effectiveSince: string | null; + limit: number | null; + includeComments: boolean; + includeCode?: boolean; + fullReconcile?: boolean; + isFullOpenScan: boolean; + isOverlappingOpenScan: boolean; + overlapReferenceAt: string | null; + reconciledOpenCloseAt: string | null; +}; + +export type TuiClusterSortMode = 'recent' | 'size'; + +export type TuiRepoStats = { + openIssueCount: number; + openPullRequestCount: number; + lastGithubReconciliationAt: string | null; + lastEmbedRefreshAt: string | null; + staleEmbedThreadCount: number; + staleEmbedSourceCount: number; + latestClusterRunId: number | null; + latestClusterRunFinishedAt: string | null; +}; + +export type TuiClusterSummary = { + clusterId: number; + displayTitle: string; + isClosed: boolean; + closedAtLocal: string | null; + closeReasonLocal: string | null; + totalCount: number; + issueCount: number; + pullRequestCount: number; + latestUpdatedAt: string | null; + representativeThreadId: number | null; + representativeNumber: number | null; + representativeKind: 'issue' | 'pull_request' | null; + searchText: string; +}; + +export type TuiClusterMember = { + id: number; + number: number; + kind: 'issue' | 'pull_request'; + isClosed: boolean; + title: string; + updatedAtGh: string | null; + htmlUrl: string; + labels: string[]; + clusterScore: number | null; +}; + +export type TuiClusterDetail = { + clusterId: number; + displayTitle: string; + isClosed: boolean; + closedAtLocal: string | null; + closeReasonLocal: string | null; + totalCount: number; + issueCount: number; + pullRequestCount: number; + latestUpdatedAt: string | null; + representativeThreadId: number | null; + representativeNumber: number | null; + representativeKind: 'issue' | 'pull_request' | null; + members: TuiClusterMember[]; +}; + +export type TuiThreadDetail = { + thread: ThreadDto; + summaries: Partial>; + keySummary: { + summaryKind: string; + promptVersion: string; + model: string; + text: string; + } | null; + topFiles: Array<{ + path: string; + status: string | null; + additions: number; + deletions: number; + }>; + neighbors: SearchHitDto['neighbors']; +}; + +export type TuiSnapshot = { + repository: RepositoryDto; + stats: TuiRepoStats; + clusterRunId: number | null; + clusters: TuiClusterSummary[]; +}; + +export type TuiRefreshState = { + repositoryUpdatedAt: string | null; + threadUpdatedAt: string | null; + threadClosedAt: string | null; + clusterClosedAt: string | null; + durableClusterUpdatedAt: string | null; + durableMembershipUpdatedAt: string | null; + latestSyncRunId: number | null; + latestEmbeddingRunId: number | null; + latestClusterRunId: number | null; +}; + +export type DoctorResult = { + health: HealthResponse; + github: { + configured: boolean; + source: ConfigValueSource; + tokenPresent: boolean; + error: string | null; + }; + openai: { + configured: boolean; + source: ConfigValueSource; + tokenPresent: boolean; + error: string | null; + }; + vectorlite: { + configured: boolean; + runtimeOk: boolean; + error: string | null; + }; +}; + +export type SyncOptions = { + owner: string; + repo: string; + since?: string; + limit?: number; + includeComments?: boolean; + includeCode?: boolean; + fullReconcile?: boolean; + onProgress?: (message: string) => void; + startedAt?: string; +}; + +export type PortableSyncExportOptions = { + owner: string; + repo: string; + outputPath?: string; + bodyChars?: number; + profile?: PortableSyncProfile; + writeManifest?: boolean; +}; + +export type SearchResultInternal = SearchResponse; +export type NeighborsResultInternal = NeighborsResponse; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 32cd29a..f6482d0 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -113,365 +113,49 @@ import { validatePortableSyncDatabase, type PortableSyncExportResponse, type PortableSyncImportResponse, - type PortableSyncProfile, type PortableSyncSizeResponse, type PortableSyncStatusResponse, type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; +import type { + ActiveVectorRow, + ActiveVectorTask, + AggregatedClusterEdge, + ClusterExperimentResult, + ClusterExperimentClusterSizeStats, + CommentSeed, + DoctorResult, + DurableTuiClosure, + EmbeddingSourceKind, + EmbeddingTask, + EmbeddingWorkset, + KeySummaryTask, + NeighborsResultInternal, + PortableSyncExportOptions, + RepoPipelineStateRow, + RunTable, + SearchResultInternal, + SimilaritySourceKind, + SqliteMaintenanceStats, + StoredEmbeddingRow, + SummaryModelPricing, + SyncCursorState, + SyncOptions, + SyncRunStats, + ThreadRow, + TuiClusterDetail, + TuiClusterSortMode, + TuiClusterSummary, + TuiRefreshState, + TuiRepoStats, + TuiSnapshot, + TuiThreadDetail, +} from './service-types.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; -type RunTable = 'sync_runs' | 'summary_runs' | 'embedding_runs' | 'cluster_runs'; - -type ThreadRow = { - id: number; - repo_id: number; - number: number; - kind: 'issue' | 'pull_request'; - state: string; - closed_at_gh: string | null; - closed_at_local: string | null; - close_reason_local: string | null; - title: string; - body: string | null; - author_login: string | null; - html_url: string; - labels_json: string; - updated_at_gh: string | null; - first_pulled_at: string | null; - last_pulled_at: string | null; -}; - -type CommentSeed = { - githubId: string; - commentType: string; - authorLogin: string | null; - authorType: string | null; - body: string; - isBot: boolean; - rawJson: string; - createdAtGh: string | null; - updatedAtGh: string | null; -}; - -type EmbeddingSourceKind = 'title' | 'body' | 'dedupe_summary' | 'llm_key_summary'; -type SimilaritySourceKind = EmbeddingSourceKind | 'deterministic_fingerprint'; -type AggregatedClusterEdge = { - leftThreadId: number; - rightThreadId: number; - score: number; - sourceKinds: Set; -}; - -type EmbeddingTask = { - threadId: number; - threadNumber: number; - sourceKind: EmbeddingSourceKind; - text: string; - contentHash: string; - estimatedTokens: number; - wasTruncated: boolean; -}; - -type StoredEmbeddingRow = ThreadRow & { - source_kind: EmbeddingSourceKind; - embedding_json: string; -}; - -type ActiveVectorTask = { - threadId: number; - threadNumber: number; - basis: EmbeddingBasis; - text: string; - contentHash: string; - estimatedTokens: number; - wasTruncated: boolean; -}; - -type KeySummaryTask = { - threadId: number; - threadNumber: number; - revisionId: number; - inputHash: string; - text: string; -}; - -type ActiveVectorRow = ThreadRow & { - basis: EmbeddingBasis; - model: string; - dimensions: number; - content_hash: string; - vector_json: Buffer | string; - vector_backend: string; -}; - -type SqliteMaintenanceStats = { - pageSize: number; - pageCount: number; - freelistPages: number; - bytes: number; - walBytes: number; - shmBytes: number; - sidecarBytes: number; -}; - -type DurableTuiClosure = { - clusterId: number; - status: 'active' | 'closed' | 'merged' | 'split'; - closedAt: string | null; - reason: string | null; -}; - -type RepoPipelineStateRow = { - repo_id: number; - summary_model: string; - summary_prompt_version: string; - embedding_basis: EmbeddingBasis; - embed_model: string; - embed_dimensions: number; - embed_pipeline_version: string; - vector_backend: string; - vectors_current_at: string | null; - clusters_current_at: string | null; - updated_at: string; -}; - -type ClusterExperimentMemoryStats = { - rssBeforeBytes: number; - rssAfterBytes: number; - peakRssBytes: number; - heapUsedBeforeBytes: number; - heapUsedAfterBytes: number; - peakHeapUsedBytes: number; -}; - -type ClusterExperimentSizeBucket = { - size: number; - count: number; -}; - -type ClusterExperimentClusterSizeStats = { - soloClusters: number; - maxClusterSize: number; - topClusterSizes: number[]; - histogram: ClusterExperimentSizeBucket[]; -}; - -type ClusterExperimentCluster = { - representativeThreadId: number; - memberThreadIds: number[]; -}; - -type ClusterExperimentResult = { - backend: 'exact' | 'vectorlite'; - repository: RepositoryDto; - tempDbPath: string | null; - threads: number; - sourceKinds: number; - edges: number; - clusters: number; - timingBasis: 'cluster-only'; - durationMs: number; - totalDurationMs: number; - loadMs: number; - setupMs: number; - edgeBuildMs: number; - indexBuildMs: number; - queryMs: number; - clusterBuildMs: number; - candidateK: number; - memory: ClusterExperimentMemoryStats; - clusterSizes: ClusterExperimentClusterSizeStats; - clustersDetail: ClusterExperimentCluster[] | null; -}; - -type SummaryModelPricing = { - inputCostPerM: number; - cachedInputCostPerM: number; - outputCostPerM: number; -}; - -type EmbeddingWorkset = { - rows: Array<{ - id: number; - number: number; - title: string; - body: string | null; - }>; - tasks: ActiveVectorTask[]; - existing: Map; - pending: ActiveVectorTask[]; - missingSummaryThreadNumbers: number[]; -}; - -type SyncCursorState = { - lastFullOpenScanStartedAt: string | null; - lastOverlappingOpenScanCompletedAt: string | null; - lastNonOverlappingScanCompletedAt: string | null; - lastReconciledOpenCloseAt: string | null; -}; - -type SyncRunStats = { - threadsSynced: number; - commentsSynced: number; - codeFilesSynced: number; - threadsClosed: number; - threadsClosedFromClosedSweep?: number; - threadsClosedFromClosedBackfill?: number; - threadsClosedFromDirectReconcile?: number; - directReconcileSkippedStaleThreadCount?: number; - crawlStartedAt: string; - requestedSince: string | null; - effectiveSince: string | null; - limit: number | null; - includeComments: boolean; - includeCode?: boolean; - fullReconcile?: boolean; - isFullOpenScan: boolean; - isOverlappingOpenScan: boolean; - overlapReferenceAt: string | null; - reconciledOpenCloseAt: string | null; -}; - -export type TuiClusterSortMode = 'recent' | 'size'; - -export type TuiRepoStats = { - openIssueCount: number; - openPullRequestCount: number; - lastGithubReconciliationAt: string | null; - lastEmbedRefreshAt: string | null; - staleEmbedThreadCount: number; - staleEmbedSourceCount: number; - latestClusterRunId: number | null; - latestClusterRunFinishedAt: string | null; -}; - -export type TuiClusterSummary = { - clusterId: number; - displayTitle: string; - isClosed: boolean; - closedAtLocal: string | null; - closeReasonLocal: string | null; - totalCount: number; - issueCount: number; - pullRequestCount: number; - latestUpdatedAt: string | null; - representativeThreadId: number | null; - representativeNumber: number | null; - representativeKind: 'issue' | 'pull_request' | null; - searchText: string; -}; - -export type TuiClusterMember = { - id: number; - number: number; - kind: 'issue' | 'pull_request'; - isClosed: boolean; - title: string; - updatedAtGh: string | null; - htmlUrl: string; - labels: string[]; - clusterScore: number | null; -}; - -export type TuiClusterDetail = { - clusterId: number; - displayTitle: string; - isClosed: boolean; - closedAtLocal: string | null; - closeReasonLocal: string | null; - totalCount: number; - issueCount: number; - pullRequestCount: number; - latestUpdatedAt: string | null; - representativeThreadId: number | null; - representativeNumber: number | null; - representativeKind: 'issue' | 'pull_request' | null; - members: TuiClusterMember[]; -}; - -export type TuiThreadDetail = { - thread: ThreadDto; - summaries: Partial>; - keySummary: { - summaryKind: string; - promptVersion: string; - model: string; - text: string; - } | null; - topFiles: Array<{ - path: string; - status: string | null; - additions: number; - deletions: number; - }>; - neighbors: SearchHitDto['neighbors']; -}; - -export type TuiSnapshot = { - repository: RepositoryDto; - stats: TuiRepoStats; - clusterRunId: number | null; - clusters: TuiClusterSummary[]; -}; - -export type TuiRefreshState = { - repositoryUpdatedAt: string | null; - threadUpdatedAt: string | null; - threadClosedAt: string | null; - clusterClosedAt: string | null; - durableClusterUpdatedAt: string | null; - durableMembershipUpdatedAt: string | null; - latestSyncRunId: number | null; - latestEmbeddingRunId: number | null; - latestClusterRunId: number | null; -}; - -export type DoctorResult = { - health: HealthResponse; - github: { - configured: boolean; - source: ConfigValueSource; - tokenPresent: boolean; - error: string | null; - }; - openai: { - configured: boolean; - source: ConfigValueSource; - tokenPresent: boolean; - error: string | null; - }; - vectorlite: { - configured: boolean; - runtimeOk: boolean; - error: string | null; - }; -}; - -type SyncOptions = { - owner: string; - repo: string; - since?: string; - limit?: number; - includeComments?: boolean; - includeCode?: boolean; - fullReconcile?: boolean; - onProgress?: (message: string) => void; - startedAt?: string; -}; - -type PortableSyncExportOptions = { - owner: string; - repo: string; - outputPath?: string; - bodyChars?: number; - profile?: PortableSyncProfile; - writeManifest?: boolean; -}; - -type SearchResultInternal = SearchResponse; -type NeighborsResultInternal = NeighborsResponse; +export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; const SYNC_BATCH_SIZE = 100; const SYNC_BATCH_DELAY_MS = 5000; From 203b644244fc5baf9222598b9fa22d47e5af5056 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:04:31 -0700 Subject: [PATCH 138/215] refactor: extract service utilities --- packages/api-core/src/service-utils.ts | 200 ++++++++++++++++++++++ packages/api-core/src/service.ts | 220 +++---------------------- 2 files changed, 224 insertions(+), 196 deletions(-) create mode 100644 packages/api-core/src/service-utils.ts diff --git a/packages/api-core/src/service-utils.ts b/packages/api-core/src/service-utils.ts new file mode 100644 index 0000000..8d5200a --- /dev/null +++ b/packages/api-core/src/service-utils.ts @@ -0,0 +1,200 @@ +import crypto from 'node:crypto'; + +import type { RepositoryDto, ThreadDto } from '@ghcrawl/api-contract'; + +import type { SyncRunStats, ThreadRow } from './service-types.js'; + +export function nowIso(): string { + return new Date().toISOString(); +} + +export function parseIso(value: string | null | undefined): number | null { + if (!value) return null; + const parsed = Date.parse(value); + return Number.isNaN(parsed) ? null : parsed; +} + +export function isEffectivelyClosed(row: { state: string; closed_at_local: string | null }): boolean { + return row.state !== 'open' || row.closed_at_local !== null; +} + +export function isClosedGitHubPayload(payload: Record): boolean { + const state = typeof payload.state === 'string' ? payload.state.toLowerCase() : null; + if (state !== null && state !== 'open') return true; + if (typeof payload.closed_at === 'string' && payload.closed_at.length > 0) return true; + if (typeof payload.merged_at === 'string' && payload.merged_at.length > 0) return true; + return false; +} + +export function isMissingGitHubResourceError(error: unknown): boolean { + const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null; + if (status === 404 || status === 410) { + return true; + } + const message = error instanceof Error ? error.message : String(error); + return /\b(404|410)\b/.test(message) || /Not Found|Gone/i.test(message); +} + +export function deriveIncrementalSince(referenceAt: string, crawlStartedAt: string): string { + const referenceMs = parseIso(referenceAt) ?? Date.now(); + const crawlMs = parseIso(crawlStartedAt) ?? Date.now(); + const gapMs = Math.max(0, crawlMs - referenceMs); + const hourMs = 60 * 60 * 1000; + const roundedHours = Math.max(2, Math.ceil(gapMs / hourMs)); + return new Date(crawlMs - roundedHours * hourMs).toISOString(); +} + +export function parseSyncRunStats(statsJson: string | null): SyncRunStats | null { + if (!statsJson) return null; + try { + const parsed = JSON.parse(statsJson) as Partial; + if (typeof parsed.crawlStartedAt !== 'string') { + return null; + } + return { + threadsSynced: typeof parsed.threadsSynced === 'number' ? parsed.threadsSynced : 0, + commentsSynced: typeof parsed.commentsSynced === 'number' ? parsed.commentsSynced : 0, + threadsClosed: typeof parsed.threadsClosed === 'number' ? parsed.threadsClosed : 0, + crawlStartedAt: parsed.crawlStartedAt, + requestedSince: typeof parsed.requestedSince === 'string' ? parsed.requestedSince : null, + effectiveSince: typeof parsed.effectiveSince === 'string' ? parsed.effectiveSince : null, + limit: typeof parsed.limit === 'number' ? parsed.limit : null, + includeComments: parsed.includeComments === true, + codeFilesSynced: typeof parsed.codeFilesSynced === 'number' ? parsed.codeFilesSynced : 0, + includeCode: parsed.includeCode === true, + isFullOpenScan: parsed.isFullOpenScan === true, + isOverlappingOpenScan: parsed.isOverlappingOpenScan === true, + overlapReferenceAt: typeof parsed.overlapReferenceAt === 'string' ? parsed.overlapReferenceAt : null, + reconciledOpenCloseAt: typeof parsed.reconciledOpenCloseAt === 'string' ? parsed.reconciledOpenCloseAt : null, + }; + } catch { + return null; + } +} + +export function asJson(value: unknown): string { + return JSON.stringify(value ?? null); +} + +export function parseArray(value: string): string[] { + return JSON.parse(value) as string[]; +} + +export function parseStringArrayJson(value: string | null | undefined): string[] { + if (!value) return []; + try { + const parsed = JSON.parse(value) as unknown; + return Array.isArray(parsed) ? parsed.filter((entry): entry is string => typeof entry === 'string') : []; + } catch { + return []; + } +} + +export function parseObjectJson(value: string | null | undefined): Record | null { + if (!value) return null; + try { + const parsed = JSON.parse(value) as unknown; + return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? (parsed as Record) : null; + } catch { + return null; + } +} + +export function userLogin(payload: Record): string | null { + const user = payload.user as Record | undefined; + const login = user?.login; + return typeof login === 'string' ? login : null; +} + +export function userType(payload: Record): string | null { + const user = payload.user as Record | undefined; + const type = user?.type; + return typeof type === 'string' ? type : null; +} + +export function isPullRequestPayload(payload: Record): boolean { + return Boolean(payload.pull_request); +} + +export function parseLabels(payload: Record): string[] { + const labels = payload.labels; + if (!Array.isArray(labels)) return []; + return labels + .map((label) => { + if (typeof label === 'string') return label; + if (label && typeof label === 'object' && typeof (label as Record).name === 'string') { + return String((label as Record).name); + } + return null; + }) + .filter((value): value is string => Boolean(value)); +} + +export function parseAssignees(payload: Record): string[] { + const assignees = payload.assignees; + if (!Array.isArray(assignees)) return []; + return assignees + .map((assignee) => { + if (assignee && typeof assignee === 'object' && typeof (assignee as Record).login === 'string') { + return String((assignee as Record).login); + } + return null; + }) + .filter((value): value is string => Boolean(value)); +} + +export function stableContentHash(input: string): string { + return crypto.createHash('sha256').update(input).digest('hex'); +} + +export function normalizeSummaryText(value: string): string { + return value.replace(/\r/g, '\n').replace(/\s+/g, ' ').trim(); +} + +export function normalizeKeySummaryDisplayText(value: string): string { + return value + .split(/\r?\n/) + .map((line) => line.trim()) + .filter(Boolean) + .join('\n'); +} + +export function snippetText(value: string | null | undefined, maxChars: number): string | null { + if (!value) return null; + const normalized = value.replace(/\s+/g, ' ').trim(); + if (!normalized) return null; + if (normalized.length <= maxChars) return normalized; + return `${normalized.slice(0, Math.max(0, maxChars - 1)).trimEnd()}…`; +} + +export function repositoryToDto(row: Record): RepositoryDto { + return { + id: Number(row.id), + owner: String(row.owner), + name: String(row.name), + fullName: String(row.full_name), + githubRepoId: row.github_repo_id === null ? null : String(row.github_repo_id), + updatedAt: String(row.updated_at), + }; +} + +export function threadToDto(row: ThreadRow, clusterId?: number | null): ThreadDto { + return { + id: row.id, + repoId: row.repo_id, + number: row.number, + kind: row.kind, + state: row.state, + isClosed: isEffectivelyClosed(row), + closedAtGh: row.closed_at_gh ?? null, + closedAtLocal: row.closed_at_local ?? null, + closeReasonLocal: row.close_reason_local ?? null, + title: row.title, + body: row.body, + authorLogin: row.author_login, + htmlUrl: row.html_url, + labels: parseArray(row.labels_json), + updatedAtGh: row.updated_at_gh, + clusterId: clusterId ?? null, + }; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f6482d0..f8de4e9 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1,5 +1,4 @@ import http from 'node:http'; -import crypto from 'node:crypto'; import fs from 'node:fs'; import { existsSync } from 'node:fs'; import { createRequire } from 'node:module'; @@ -152,6 +151,30 @@ import type { TuiSnapshot, TuiThreadDetail, } from './service-types.js'; +import { + asJson, + deriveIncrementalSince, + isClosedGitHubPayload, + isEffectivelyClosed, + isMissingGitHubResourceError, + isPullRequestPayload, + normalizeKeySummaryDisplayText, + normalizeSummaryText, + nowIso, + parseArray, + parseAssignees, + parseIso, + parseLabels, + parseObjectJson, + parseStringArrayJson, + parseSyncRunStats, + repositoryToDto, + snippetText, + stableContentHash, + threadToDto, + userLogin, + userType, +} from './service-utils.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; @@ -201,201 +224,6 @@ const SUMMARY_MODEL_PRICING: Record = { }, }; -function nowIso(): string { - return new Date().toISOString(); -} - -function parseIso(value: string | null | undefined): number | null { - if (!value) return null; - const parsed = Date.parse(value); - return Number.isNaN(parsed) ? null : parsed; -} - -function isEffectivelyClosed(row: { state: string; closed_at_local: string | null }): boolean { - return row.state !== 'open' || row.closed_at_local !== null; -} - -function isClosedGitHubPayload(payload: Record): boolean { - const state = typeof payload.state === 'string' ? payload.state.toLowerCase() : null; - if (state !== null && state !== 'open') return true; - if (typeof payload.closed_at === 'string' && payload.closed_at.length > 0) return true; - if (typeof payload.merged_at === 'string' && payload.merged_at.length > 0) return true; - return false; -} - -function isMissingGitHubResourceError(error: unknown): boolean { - const status = typeof (error as { status?: unknown })?.status === 'number' ? Number((error as { status?: unknown }).status) : null; - if (status === 404 || status === 410) { - return true; - } - const message = error instanceof Error ? error.message : String(error); - return /\b(404|410)\b/.test(message) || /Not Found|Gone/i.test(message); -} - -function deriveIncrementalSince(referenceAt: string, crawlStartedAt: string): string { - const referenceMs = parseIso(referenceAt) ?? Date.now(); - const crawlMs = parseIso(crawlStartedAt) ?? Date.now(); - const gapMs = Math.max(0, crawlMs - referenceMs); - const hourMs = 60 * 60 * 1000; - const roundedHours = Math.max(2, Math.ceil(gapMs / hourMs)); - return new Date(crawlMs - roundedHours * hourMs).toISOString(); -} - -function parseSyncRunStats(statsJson: string | null): SyncRunStats | null { - if (!statsJson) return null; - try { - const parsed = JSON.parse(statsJson) as Partial; - if (typeof parsed.crawlStartedAt !== 'string') { - return null; - } - return { - threadsSynced: typeof parsed.threadsSynced === 'number' ? parsed.threadsSynced : 0, - commentsSynced: typeof parsed.commentsSynced === 'number' ? parsed.commentsSynced : 0, - threadsClosed: typeof parsed.threadsClosed === 'number' ? parsed.threadsClosed : 0, - crawlStartedAt: parsed.crawlStartedAt, - requestedSince: typeof parsed.requestedSince === 'string' ? parsed.requestedSince : null, - effectiveSince: typeof parsed.effectiveSince === 'string' ? parsed.effectiveSince : null, - limit: typeof parsed.limit === 'number' ? parsed.limit : null, - includeComments: parsed.includeComments === true, - codeFilesSynced: typeof parsed.codeFilesSynced === 'number' ? parsed.codeFilesSynced : 0, - includeCode: parsed.includeCode === true, - isFullOpenScan: parsed.isFullOpenScan === true, - isOverlappingOpenScan: parsed.isOverlappingOpenScan === true, - overlapReferenceAt: typeof parsed.overlapReferenceAt === 'string' ? parsed.overlapReferenceAt : null, - reconciledOpenCloseAt: typeof parsed.reconciledOpenCloseAt === 'string' ? parsed.reconciledOpenCloseAt : null, - }; - } catch { - return null; - } -} - -function asJson(value: unknown): string { - return JSON.stringify(value ?? null); -} - -function parseArray(value: string): string[] { - return JSON.parse(value) as string[]; -} - -function parseStringArrayJson(value: string | null | undefined): string[] { - if (!value) return []; - try { - const parsed = JSON.parse(value) as unknown; - return Array.isArray(parsed) ? parsed.filter((entry): entry is string => typeof entry === 'string') : []; - } catch { - return []; - } -} - -function parseObjectJson(value: string | null | undefined): Record | null { - if (!value) return null; - try { - const parsed = JSON.parse(value) as unknown; - return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? (parsed as Record) : null; - } catch { - return null; - } -} - -function userLogin(payload: Record): string | null { - const user = payload.user as Record | undefined; - const login = user?.login; - return typeof login === 'string' ? login : null; -} - -function userType(payload: Record): string | null { - const user = payload.user as Record | undefined; - const type = user?.type; - return typeof type === 'string' ? type : null; -} - -function isPullRequestPayload(payload: Record): boolean { - return Boolean(payload.pull_request); -} - -function parseLabels(payload: Record): string[] { - const labels = payload.labels; - if (!Array.isArray(labels)) return []; - return labels - .map((label) => { - if (typeof label === 'string') return label; - if (label && typeof label === 'object' && typeof (label as Record).name === 'string') { - return String((label as Record).name); - } - return null; - }) - .filter((value): value is string => Boolean(value)); -} - -function parseAssignees(payload: Record): string[] { - const assignees = payload.assignees; - if (!Array.isArray(assignees)) return []; - return assignees - .map((assignee) => { - if (assignee && typeof assignee === 'object' && typeof (assignee as Record).login === 'string') { - return String((assignee as Record).login); - } - return null; - }) - .filter((value): value is string => Boolean(value)); -} - -function stableContentHash(input: string): string { - return crypto.createHash('sha256').update(input).digest('hex'); -} - -function normalizeSummaryText(value: string): string { - return value.replace(/\r/g, '\n').replace(/\s+/g, ' ').trim(); -} - -function normalizeKeySummaryDisplayText(value: string): string { - return value - .split(/\r?\n/) - .map((line) => line.trim()) - .filter(Boolean) - .join('\n'); -} - -function snippetText(value: string | null | undefined, maxChars: number): string | null { - if (!value) return null; - const normalized = value.replace(/\s+/g, ' ').trim(); - if (!normalized) return null; - if (normalized.length <= maxChars) return normalized; - return `${normalized.slice(0, Math.max(0, maxChars - 1)).trimEnd()}…`; -} - -function repositoryToDto(row: Record): RepositoryDto { - return { - id: Number(row.id), - owner: String(row.owner), - name: String(row.name), - fullName: String(row.full_name), - githubRepoId: row.github_repo_id === null ? null : String(row.github_repo_id), - updatedAt: String(row.updated_at), - }; -} - -function threadToDto(row: ThreadRow, clusterId?: number | null): ThreadDto { - return { - id: row.id, - repoId: row.repo_id, - number: row.number, - kind: row.kind, - state: row.state, - isClosed: isEffectivelyClosed(row), - closedAtGh: row.closed_at_gh ?? null, - closedAtLocal: row.closed_at_local ?? null, - closeReasonLocal: row.close_reason_local ?? null, - title: row.title, - body: row.body, - authorLogin: row.author_login, - htmlUrl: row.html_url, - labels: parseArray(row.labels_json), - updatedAtGh: row.updated_at_gh, - clusterId: clusterId ?? null, - }; -} - export class GHCrawlService { readonly config: GitcrawlConfig; readonly db: SqliteDatabase; From 5d2567b956efc4e6e16bfa9964fe2fc3b782355d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:05:41 -0700 Subject: [PATCH 139/215] refactor: extract service constants --- packages/api-core/src/service-constants.ts | 48 +++++++++++++ packages/api-core/src/service.ts | 80 +++++++++------------- 2 files changed, 82 insertions(+), 46 deletions(-) create mode 100644 packages/api-core/src/service-constants.ts diff --git a/packages/api-core/src/service-constants.ts b/packages/api-core/src/service-constants.ts new file mode 100644 index 0000000..c7e4dc1 --- /dev/null +++ b/packages/api-core/src/service-constants.ts @@ -0,0 +1,48 @@ +import { createRequire } from 'node:module'; + +import type { SummaryModelPricing } from './service-types.js'; + +export const SYNC_BATCH_SIZE = 100; +export const SYNC_BATCH_DELAY_MS = 5000; +export const STALE_CLOSED_SWEEP_LIMIT = 1000; +export const STALE_CLOSED_BACKFILL_LIMIT = 5000; +export const MAX_DIRECT_RECONCILE_THREADS = 500; +export const CLUSTER_PROGRESS_INTERVAL_MS = 5000; +export const DURABLE_CLUSTER_REUSE_MIN_OVERLAP = 0.8; +export const RAW_JSON_INLINE_THRESHOLD_BYTES = 4096; +export const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000; +export const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3; +export const EMBED_MAX_ITEM_TOKENS = 7000; +export const EMBED_MAX_BATCH_TOKENS = 250000; +export const requireFromHere = createRequire(import.meta.url); +export const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]'; +export const EMBED_CONTEXT_RETRY_ATTEMPTS = 5; +export const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9; +export const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95; +export const KEY_SUMMARY_MAX_BODY_CHARS = 6000; +export const KEY_SUMMARY_CONCURRENCY = 24; +export const KEY_SUMMARY_MAX_UNREAD = 48; +export const SUMMARY_PROMPT_VERSION = 'v1'; +export const ACTIVE_EMBED_DIMENSIONS = 1024; +export const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; +export const DEFAULT_CLUSTER_MIN_SCORE = 0.8; +export const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.36; +export const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.93; +export const DEFAULT_CLUSTER_MAX_SIZE = 40; +export const VECTORLITE_CLUSTER_EXPANDED_K = 24; +export const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; +export const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; +export const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024; + +export const SUMMARY_MODEL_PRICING: Record = { + 'gpt-5-mini': { + inputCostPerM: 0.25, + cachedInputCostPerM: 0.025, + outputCostPerM: 2.0, + }, + 'gpt-5.4-mini': { + inputCostPerM: 0.75, + cachedInputCostPerM: 0.075, + outputCostPerM: 4.5, + }, +}; diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f8de4e9..b8caf5a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1,7 +1,6 @@ import http from 'node:http'; import fs from 'node:fs'; import { existsSync } from 'node:fs'; -import { createRequire } from 'node:module'; import os from 'node:os'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; @@ -117,6 +116,40 @@ import { type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; +import { + ACTIVE_EMBED_DIMENSIONS, + ACTIVE_EMBED_PIPELINE_VERSION, + CLUSTER_PARALLEL_MIN_EMBEDDINGS, + CLUSTER_PROGRESS_INTERVAL_MS, + DEFAULT_CLUSTER_MAX_SIZE, + DEFAULT_CLUSTER_MIN_SCORE, + DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE, + DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE, + DURABLE_CLUSTER_REUSE_MIN_OVERLAP, + EMBED_CONTEXT_RETRY_ATTEMPTS, + EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO, + EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO, + EMBED_ESTIMATED_CHARS_PER_TOKEN, + EMBED_MAX_BATCH_TOKENS, + EMBED_MAX_ITEM_TOKENS, + EMBED_TRUNCATION_MARKER, + KEY_SUMMARY_CONCURRENCY, + KEY_SUMMARY_MAX_BODY_CHARS, + KEY_SUMMARY_MAX_UNREAD, + MAX_DIRECT_RECONCILE_THREADS, + RAW_JSON_INLINE_THRESHOLD_BYTES, + requireFromHere, + STALE_CLOSED_BACKFILL_LIMIT, + STALE_CLOSED_SWEEP_LIMIT, + SUMMARY_MODEL_PRICING, + SUMMARY_PROMPT_VERSION, + SYNC_BATCH_DELAY_MS, + SYNC_BATCH_SIZE, + VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K, + VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH, + VECTORLITE_CLUSTER_EXPANDED_K, + VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, +} from './service-constants.js'; import type { ActiveVectorRow, ActiveVectorTask, @@ -138,7 +171,6 @@ import type { SimilaritySourceKind, SqliteMaintenanceStats, StoredEmbeddingRow, - SummaryModelPricing, SyncCursorState, SyncOptions, SyncRunStats, @@ -180,50 +212,6 @@ import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; -const SYNC_BATCH_SIZE = 100; -const SYNC_BATCH_DELAY_MS = 5000; -const STALE_CLOSED_SWEEP_LIMIT = 1000; -const STALE_CLOSED_BACKFILL_LIMIT = 5000; -const MAX_DIRECT_RECONCILE_THREADS = 500; -const CLUSTER_PROGRESS_INTERVAL_MS = 5000; -const DURABLE_CLUSTER_REUSE_MIN_OVERLAP = 0.8; -const RAW_JSON_INLINE_THRESHOLD_BYTES = 4096; -const CLUSTER_PARALLEL_MIN_EMBEDDINGS = 5000; -const EMBED_ESTIMATED_CHARS_PER_TOKEN = 3; -const EMBED_MAX_ITEM_TOKENS = 7000; -const EMBED_MAX_BATCH_TOKENS = 250000; -const requireFromHere = createRequire(import.meta.url); -const EMBED_TRUNCATION_MARKER = '\n\n[truncated for embedding]'; -const EMBED_CONTEXT_RETRY_ATTEMPTS = 5; -const EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO = 0.9; -const EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO = 0.95; -const KEY_SUMMARY_MAX_BODY_CHARS = 6000; -const KEY_SUMMARY_CONCURRENCY = 24; -const KEY_SUMMARY_MAX_UNREAD = 48; -const SUMMARY_PROMPT_VERSION = 'v1'; -const ACTIVE_EMBED_DIMENSIONS = 1024; -const ACTIVE_EMBED_PIPELINE_VERSION = 'vectorlite-1024-v1'; -const DEFAULT_CLUSTER_MIN_SCORE = 0.8; -const DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE = 0.36; -const DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE = 0.93; -const DEFAULT_CLUSTER_MAX_SIZE = 40; -const VECTORLITE_CLUSTER_EXPANDED_K = 24; -const VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER = 4; -const VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K = 512; -const VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH = 1024; -const SUMMARY_MODEL_PRICING: Record = { - 'gpt-5-mini': { - inputCostPerM: 0.25, - cachedInputCostPerM: 0.025, - outputCostPerM: 2.0, - }, - 'gpt-5.4-mini': { - inputCostPerM: 0.75, - cachedInputCostPerM: 0.075, - outputCostPerM: 4.5, - }, -}; - export class GHCrawlService { readonly config: GitcrawlConfig; readonly db: SqliteDatabase; From a88443eff679543dc8c3b4ab8754fe5a379b71fb Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:08:33 -0700 Subject: [PATCH 140/215] refactor: move repo param parsing --- packages/api-core/src/api/params.ts | 8 ++++++++ packages/api-core/src/api/server.ts | 3 ++- packages/api-core/src/service.ts | 11 +---------- 3 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 packages/api-core/src/api/params.ts diff --git a/packages/api-core/src/api/params.ts b/packages/api-core/src/api/params.ts new file mode 100644 index 0000000..2458997 --- /dev/null +++ b/packages/api-core/src/api/params.ts @@ -0,0 +1,8 @@ +export function parseRepoParams(url: URL): { owner: string; repo: string } { + const owner = url.searchParams.get('owner'); + const repo = url.searchParams.get('repo'); + if (!owner || !repo) { + throw new Error('Missing owner or repo query parameter'); + } + return { owner, repo }; +} diff --git a/packages/api-core/src/api/server.ts b/packages/api-core/src/api/server.ts index ef88d8a..a27f29f 100644 --- a/packages/api-core/src/api/server.ts +++ b/packages/api-core/src/api/server.ts @@ -13,7 +13,8 @@ import { } from '@ghcrawl/api-contract'; import { ZodError } from 'zod'; -import { GHCrawlService, parseRepoParams } from '../service.js'; +import { GHCrawlService } from '../service.js'; +import { parseRepoParams } from './params.js'; function sendJson(res: http.ServerResponse, status: number, payload: unknown): void { res.writeHead(status, { 'content-type': 'application/json; charset=utf-8' }); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b8caf5a..525f97f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1,4 +1,3 @@ -import http from 'node:http'; import fs from 'node:fs'; import { existsSync } from 'node:fs'; import os from 'node:os'; @@ -211,6 +210,7 @@ import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/st import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; +export { parseRepoParams } from './api/params.js'; export class GHCrawlService { readonly config: GitcrawlConfig; @@ -6820,12 +6820,3 @@ export class GHCrawlService { ); } } - -export function parseRepoParams(url: URL): { owner: string; repo: string } { - const owner = url.searchParams.get('owner'); - const repo = url.searchParams.get('repo'); - if (!owner || !repo) { - throw new Error('Missing owner or repo query parameter'); - } - return { owner, repo }; -} From 4675bb0e3b7c55a9bf99f742e0111ea1b5aaad73 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:11:15 -0700 Subject: [PATCH 141/215] refactor: extract storage maintenance --- packages/api-core/src/service.ts | 120 +----------------- packages/api-core/src/storage-maintenance.ts | 126 +++++++++++++++++++ 2 files changed, 130 insertions(+), 116 deletions(-) create mode 100644 packages/api-core/src/storage-maintenance.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 525f97f..5aaebc2 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -115,6 +115,7 @@ import { type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; +import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { ACTIVE_EMBED_DIMENSIONS, ACTIVE_EMBED_PIPELINE_VERSION, @@ -168,7 +169,6 @@ import type { RunTable, SearchResultInternal, SimilaritySourceKind, - SqliteMaintenanceStats, StoredEmbeddingRow, SyncCursorState, SyncOptions, @@ -2835,7 +2835,7 @@ export class GHCrawlService { : null; const targets = [ - this.optimizeSqliteTarget({ + optimizeSqliteTarget({ name: 'main', db: this.db, dbPath: this.config.dbPath, @@ -2852,7 +2852,7 @@ export class GHCrawlService { const vectorlite = requireFromHere('vectorlite') as { vectorlitePath: () => string }; vectorDb.loadExtension(vectorlite.vectorlitePath()); targets.push( - this.optimizeSqliteTarget({ + optimizeSqliteTarget({ name: 'vector', db: vectorDb, dbPath: storePath, @@ -2863,27 +2863,7 @@ export class GHCrawlService { vectorDb.close(); } } else { - targets.push({ - name: 'vector' as const, - path: storePath, - existed: false, - pageSize: 0, - pageCountBefore: 0, - pageCountAfter: 0, - freelistPagesBefore: 0, - freelistPagesAfter: 0, - bytesBefore: 0, - bytesAfter: 0, - walBytesBefore: 0, - walBytesAfter: 0, - shmBytesBefore: 0, - shmBytesAfter: 0, - sidecarBytesBefore: this.fileSize(sidecarPath), - sidecarBytesAfter: this.fileSize(sidecarPath), - bytesReclaimed: 0, - operations: ['skipped_missing_vector_store'], - durationMs: 0, - }); + targets.push(missingVectorStoreTarget(storePath, sidecarPath)); } } @@ -2945,98 +2925,6 @@ export class GHCrawlService { }); } - private optimizeSqliteTarget(params: { - name: 'main' | 'vector'; - db: SqliteDatabase; - dbPath: string; - sidecarPath?: string; - }): OptimizeResponse['targets'][number] { - const startedAt = Date.now(); - const before = this.sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); - const operations: string[] = []; - - this.runMaintenanceStep(params.db, 'wal_checkpoint_truncate_before', operations, () => { - params.db.pragma('wal_checkpoint(TRUNCATE)'); - }); - this.runMaintenanceStep(params.db, 'analyze', operations, () => { - params.db.exec('analyze'); - }); - this.runMaintenanceStep(params.db, 'pragma_optimize', operations, () => { - params.db.pragma('optimize'); - }); - this.runMaintenanceStep(params.db, 'vacuum', operations, () => { - params.db.exec('vacuum'); - }); - this.runMaintenanceStep(params.db, 'wal_checkpoint_truncate_after', operations, () => { - params.db.pragma('wal_checkpoint(TRUNCATE)'); - }); - - const after = this.sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); - const bytesBefore = before.bytes + before.walBytes + before.shmBytes; - const bytesAfter = after.bytes + after.walBytes + after.shmBytes; - - return { - name: params.name, - path: params.dbPath, - existed: params.dbPath === ':memory:' || existsSync(params.dbPath), - pageSize: after.pageSize || before.pageSize, - pageCountBefore: before.pageCount, - pageCountAfter: after.pageCount, - freelistPagesBefore: before.freelistPages, - freelistPagesAfter: after.freelistPages, - bytesBefore: before.bytes, - bytesAfter: after.bytes, - walBytesBefore: before.walBytes, - walBytesAfter: after.walBytes, - shmBytesBefore: before.shmBytes, - shmBytesAfter: after.shmBytes, - sidecarBytesBefore: before.sidecarBytes, - sidecarBytesAfter: after.sidecarBytes, - bytesReclaimed: Math.max(0, bytesBefore - bytesAfter), - operations, - durationMs: Date.now() - startedAt, - }; - } - - private runMaintenanceStep(db: SqliteDatabase, label: string, operations: string[], step: () => void): void { - try { - step(); - operations.push(label); - } catch (error) { - operations.push(`${label}_skipped:${error instanceof Error ? error.message : String(error)}`); - } - } - - private sqliteMaintenanceStats(db: SqliteDatabase, dbPath: string, sidecarPath?: string): SqliteMaintenanceStats { - return { - pageSize: this.safePragmaNumber(db, 'page_size'), - pageCount: this.safePragmaNumber(db, 'page_count'), - freelistPages: this.safePragmaNumber(db, 'freelist_count'), - bytes: this.fileSize(dbPath), - walBytes: this.fileSize(`${dbPath}-wal`), - shmBytes: this.fileSize(`${dbPath}-shm`), - sidecarBytes: sidecarPath ? this.fileSize(sidecarPath) : 0, - }; - } - - private safePragmaNumber(db: SqliteDatabase, name: string): number { - try { - const value = db.pragma(name, { simple: true }) as unknown; - return typeof value === 'number' && Number.isFinite(value) ? value : 0; - } catch { - return 0; - } - } - - private fileSize(filePath: string): number { - if (filePath === ':memory:') return 0; - try { - return fs.statSync(filePath).size; - } catch { - return 0; - } - } - listClusterSummaries(params: { owner: string; repo: string; diff --git a/packages/api-core/src/storage-maintenance.ts b/packages/api-core/src/storage-maintenance.ts new file mode 100644 index 0000000..9a1ad7c --- /dev/null +++ b/packages/api-core/src/storage-maintenance.ts @@ -0,0 +1,126 @@ +import fs from 'node:fs'; +import { existsSync } from 'node:fs'; + +import type { OptimizeResponse } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from './db/sqlite.js'; +import type { SqliteMaintenanceStats } from './service-types.js'; + +type OptimizeTarget = OptimizeResponse['targets'][number]; + +export function missingVectorStoreTarget(storePath: string, sidecarPath: string): OptimizeTarget { + const sidecarBytes = fileSize(sidecarPath); + return { + name: 'vector', + path: storePath, + existed: false, + pageSize: 0, + pageCountBefore: 0, + pageCountAfter: 0, + freelistPagesBefore: 0, + freelistPagesAfter: 0, + bytesBefore: 0, + bytesAfter: 0, + walBytesBefore: 0, + walBytesAfter: 0, + shmBytesBefore: 0, + shmBytesAfter: 0, + sidecarBytesBefore: sidecarBytes, + sidecarBytesAfter: sidecarBytes, + bytesReclaimed: 0, + operations: ['skipped_missing_vector_store'], + durationMs: 0, + }; +} + +export function optimizeSqliteTarget(params: { + name: 'main' | 'vector'; + db: SqliteDatabase; + dbPath: string; + sidecarPath?: string; +}): OptimizeTarget { + const startedAt = Date.now(); + const before = sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); + const operations: string[] = []; + + runMaintenanceStep(params.db, 'wal_checkpoint_truncate_before', operations, () => { + params.db.pragma('wal_checkpoint(TRUNCATE)'); + }); + runMaintenanceStep(params.db, 'analyze', operations, () => { + params.db.exec('analyze'); + }); + runMaintenanceStep(params.db, 'pragma_optimize', operations, () => { + params.db.pragma('optimize'); + }); + runMaintenanceStep(params.db, 'vacuum', operations, () => { + params.db.exec('vacuum'); + }); + runMaintenanceStep(params.db, 'wal_checkpoint_truncate_after', operations, () => { + params.db.pragma('wal_checkpoint(TRUNCATE)'); + }); + + const after = sqliteMaintenanceStats(params.db, params.dbPath, params.sidecarPath); + const bytesBefore = before.bytes + before.walBytes + before.shmBytes; + const bytesAfter = after.bytes + after.walBytes + after.shmBytes; + + return { + name: params.name, + path: params.dbPath, + existed: params.dbPath === ':memory:' || existsSync(params.dbPath), + pageSize: after.pageSize || before.pageSize, + pageCountBefore: before.pageCount, + pageCountAfter: after.pageCount, + freelistPagesBefore: before.freelistPages, + freelistPagesAfter: after.freelistPages, + bytesBefore: before.bytes, + bytesAfter: after.bytes, + walBytesBefore: before.walBytes, + walBytesAfter: after.walBytes, + shmBytesBefore: before.shmBytes, + shmBytesAfter: after.shmBytes, + sidecarBytesBefore: before.sidecarBytes, + sidecarBytesAfter: after.sidecarBytes, + bytesReclaimed: Math.max(0, bytesBefore - bytesAfter), + operations, + durationMs: Date.now() - startedAt, + }; +} + +function runMaintenanceStep(db: SqliteDatabase, label: string, operations: string[], step: () => void): void { + try { + step(); + operations.push(label); + } catch (error) { + operations.push(`${label}_skipped:${error instanceof Error ? error.message : String(error)}`); + } +} + +function sqliteMaintenanceStats(db: SqliteDatabase, dbPath: string, sidecarPath?: string): SqliteMaintenanceStats { + return { + pageSize: safePragmaNumber(db, 'page_size'), + pageCount: safePragmaNumber(db, 'page_count'), + freelistPages: safePragmaNumber(db, 'freelist_count'), + bytes: fileSize(dbPath), + walBytes: fileSize(`${dbPath}-wal`), + shmBytes: fileSize(`${dbPath}-shm`), + sidecarBytes: sidecarPath ? fileSize(sidecarPath) : 0, + }; +} + +function safePragmaNumber(db: SqliteDatabase, name: string): number { + try { + const value = db.pragma(name, { simple: true }) as unknown; + return typeof value === 'number' && Number.isFinite(value) ? value : 0; + } catch { + return 0; + } +} + +function fileSize(filePath: string): number { + if (filePath === ':memory:') return 0; + try { + return fs.statSync(filePath).size; + } catch { + return 0; + } +} From 3902b281458a999a0fa137f1ec2c0d799af8386e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:14:14 -0700 Subject: [PATCH 142/215] refactor: extract run history helpers --- packages/api-core/src/run-history.ts | 77 +++++++++++++++++++++ packages/api-core/src/service.ts | 100 ++++++--------------------- 2 files changed, 97 insertions(+), 80 deletions(-) create mode 100644 packages/api-core/src/run-history.ts diff --git a/packages/api-core/src/run-history.ts b/packages/api-core/src/run-history.ts new file mode 100644 index 0000000..e3afce9 --- /dev/null +++ b/packages/api-core/src/run-history.ts @@ -0,0 +1,77 @@ +import { runHistoryResponseSchema, type RepositoryDto, type RunHistoryResponse, type RunKind } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from './db/sqlite.js'; +import type { RunTable } from './service-types.js'; +import { asJson, nowIso, parseObjectJson } from './service-utils.js'; + +const RUN_TABLES: Array<{ kind: RunKind; table: RunTable }> = [ + { kind: 'sync', table: 'sync_runs' }, + { kind: 'summary', table: 'summary_runs' }, + { kind: 'embedding', table: 'embedding_runs' }, + { kind: 'cluster', table: 'cluster_runs' }, +]; + +export function listRunHistoryForRepository(params: { + db: SqliteDatabase; + repository: RepositoryDto; + kind?: RunKind; + limit?: number; +}): RunHistoryResponse { + const limit = Math.min(Math.max(params.limit ?? 20, 1), 200); + const selectedTables = params.kind ? RUN_TABLES.filter((entry) => entry.kind === params.kind) : RUN_TABLES; + const sql = selectedTables + .map( + (entry) => + `select '${entry.kind}' as run_kind, id, scope, status, started_at, finished_at, stats_json, error_text from ${entry.table} where repo_id = ?`, + ) + .join(' union all '); + const rows = params.db + .prepare(`select * from (${sql}) order by started_at desc, id desc limit ?`) + .all(...selectedTables.map(() => params.repository.id), limit) as Array<{ + run_kind: RunKind; + id: number; + scope: string; + status: string; + started_at: string; + finished_at: string | null; + stats_json: string | null; + error_text: string | null; + }>; + + return runHistoryResponseSchema.parse({ + repository: params.repository, + runs: rows.map((row) => ({ + runId: row.id, + runKind: row.run_kind, + scope: row.scope, + status: row.status, + startedAt: row.started_at, + finishedAt: row.finished_at, + stats: parseObjectJson(row.stats_json), + errorText: row.error_text, + })), + }); +} + +export function startServiceRun(db: SqliteDatabase, table: RunTable, repoId: number, scope: string): number { + const result = db.prepare(`insert into ${table} (repo_id, scope, status, started_at) values (?, ?, 'running', ?)`).run(repoId, scope, nowIso()); + return Number(result.lastInsertRowid); +} + +export function finishServiceRun( + db: SqliteDatabase, + table: RunTable, + runId: number, + status: 'completed' | 'failed', + stats?: unknown, + error?: unknown, + finishedAt = nowIso(), +): void { + db.prepare(`update ${table} set status = ?, finished_at = ?, stats_json = ?, error_text = ? where id = ?`).run( + status, + finishedAt, + stats === undefined ? null : asJson(stats), + error instanceof Error ? error.message : error ? String(error) : null, + runId, + ); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 5aaebc2..2e1e55a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -24,7 +24,6 @@ import { optimizeResponseSchema, refreshResponseSchema, repositoriesResponseSchema, - runHistoryResponseSchema, searchResponseSchema, syncResultSchema, threadsResponseSchema, @@ -114,6 +113,7 @@ import { type PortableSyncStatusResponse, type PortableSyncValidationResponse, } from './portable/sync-store.js'; +import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { @@ -166,7 +166,6 @@ import type { NeighborsResultInternal, PortableSyncExportOptions, RepoPipelineStateRow, - RunTable, SearchResultInternal, SimilaritySourceKind, StoredEmbeddingRow, @@ -298,45 +297,11 @@ export class GHCrawlService { listRunHistory(params: { owner: string; repo: string; kind?: RunKind; limit?: number }): RunHistoryResponse { const repository = this.requireRepository(params.owner, params.repo); - const limit = Math.min(Math.max(params.limit ?? 20, 1), 200); - const tables: Array<{ kind: RunKind; table: RunTable }> = [ - { kind: 'sync', table: 'sync_runs' }, - { kind: 'summary', table: 'summary_runs' }, - { kind: 'embedding', table: 'embedding_runs' }, - { kind: 'cluster', table: 'cluster_runs' }, - ]; - const selectedTables = params.kind ? tables.filter((entry) => entry.kind === params.kind) : tables; - const sql = selectedTables - .map( - (entry) => - `select '${entry.kind}' as run_kind, id, scope, status, started_at, finished_at, stats_json, error_text from ${entry.table} where repo_id = ?`, - ) - .join(' union all '); - const rows = this.db - .prepare(`select * from (${sql}) order by started_at desc, id desc limit ?`) - .all(...selectedTables.map(() => repository.id), limit) as Array<{ - run_kind: RunKind; - id: number; - scope: string; - status: string; - started_at: string; - finished_at: string | null; - stats_json: string | null; - error_text: string | null; - }>; - - return runHistoryResponseSchema.parse({ + return listRunHistoryForRepository({ + db: this.db, repository, - runs: rows.map((row) => ({ - runId: row.id, - runKind: row.run_kind, - scope: row.scope, - status: row.status, - startedAt: row.started_at, - finishedAt: row.finished_at, - stats: parseObjectJson(row.stats_json), - errorText: row.error_text, - })), + kind: params.kind, + limit: params.limit, }); } @@ -1006,7 +971,7 @@ export class GHCrawlService { const reporter = params.onProgress ? (message: string) => params.onProgress?.(message.replace(/^\[github\]/, '[sync/github]')) : undefined; const repoData = await github.getRepo(params.owner, params.repo, reporter); const repoId = this.upsertRepository(params.owner, params.repo, repoData); - const runId = this.startRun('sync_runs', repoId, `${params.owner}/${params.repo}`); + const runId = startServiceRun(this.db, 'sync_runs', repoId, `${params.owner}/${params.repo}`); const syncCursor = this.getSyncCursorState(repoId); const overlapReferenceAt = syncCursor.lastOverlappingOpenScanCompletedAt ?? syncCursor.lastFullOpenScanStartedAt; const effectiveSince = @@ -1162,7 +1127,7 @@ export class GHCrawlService { lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt, }; this.writeSyncCursorState(repoId, nextSyncCursor); - this.finishRun('sync_runs', runId, 'completed', { + finishServiceRun(this.db, 'sync_runs', runId, 'completed', { threadsSynced, commentsSynced, codeFilesSynced, @@ -1187,7 +1152,7 @@ export class GHCrawlService { } satisfies SyncRunStats, undefined, finishedAt); return syncResultSchema.parse({ runId, threadsSynced, commentsSynced, codeFilesSynced, threadsClosed }); } catch (error) { - this.finishRun('sync_runs', runId, 'failed', null, error); + finishServiceRun(this.db, 'sync_runs', runId, 'failed', null, error); throw error; } } @@ -1201,7 +1166,7 @@ export class GHCrawlService { }): Promise<{ runId: number; summarized: number; inputTokens: number; outputTokens: number; totalTokens: number }> { const ai = this.requireAi(); const repository = this.requireRepository(params.owner, params.repo); - const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName); + const runId = startServiceRun(this.db, 'summary_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName); const includeComments = params.includeComments ?? false; try { @@ -1333,10 +1298,10 @@ export class GHCrawlService { summarized += 1; } - this.finishRun('summary_runs', runId, 'completed', { summarized, inputTokens, outputTokens, totalTokens }); + finishServiceRun(this.db, 'summary_runs', runId, 'completed', { summarized, inputTokens, outputTokens, totalTokens }); return { runId, summarized, inputTokens, outputTokens, totalTokens }; } catch (error) { - this.finishRun('summary_runs', runId, 'failed', null, error); + finishServiceRun(this.db, 'summary_runs', runId, 'failed', null, error); throw error; } } @@ -1364,7 +1329,7 @@ export class GHCrawlService { const generateKeySummary = ai.generateKeySummary.bind(ai); const providerName = ai.providerName ?? 'custom'; const repository = this.requireRepository(params.owner, params.repo); - const runId = this.startRun('summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); + const runId = startServiceRun(this.db, 'summary_runs', repository.id, params.threadNumber ? `key-summary:${params.threadNumber}` : `key-summary:${repository.fullName}`); try { let sql = @@ -1521,10 +1486,10 @@ export class GHCrawlService { } const payload = { runId, generated, skipped, failed, inputTokens, outputTokens, totalTokens, errorSamples }; - this.finishRun('summary_runs', runId, 'completed', payload); + finishServiceRun(this.db, 'summary_runs', runId, 'completed', payload); return payload; } catch (error) { - this.finishRun('summary_runs', runId, 'failed', null, error); + finishServiceRun(this.db, 'summary_runs', runId, 'failed', null, error); throw error; } } @@ -1587,7 +1552,7 @@ export class GHCrawlService { }): Promise { const ai = this.requireAi(); const repository = this.requireRepository(params.owner, params.repo); - const runId = this.startRun('embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName); + const runId = startServiceRun(this.db, 'embedding_runs', repository.id, params.threadNumber ? `thread:${params.threadNumber}` : repository.fullName); try { if (params.threadNumber === undefined) { @@ -1646,10 +1611,10 @@ export class GHCrawlService { } this.markRepoVectorsCurrent(repository.id); - this.finishRun('embedding_runs', runId, 'completed', { embedded }); + finishServiceRun(this.db, 'embedding_runs', runId, 'completed', { embedded }); return embedResultSchema.parse({ runId, embedded }); } catch (error) { - this.finishRun('embedding_runs', runId, 'failed', null, error); + finishServiceRun(this.db, 'embedding_runs', runId, 'failed', null, error); throw error; } } @@ -1665,7 +1630,7 @@ export class GHCrawlService { }): Promise { const repository = this.requireRepository(params.owner, params.repo); const runSubject = params.threadNumber ? `${repository.fullName}#${params.threadNumber}` : repository.fullName; - const runId = this.startRun('cluster_runs', repository.id, runSubject); + const runId = startServiceRun(this.db, 'cluster_runs', repository.id, runSubject); const pipelineRunId = createPipelineRun(this.db, { repoId: repository.id, runKind: params.threadNumber ? 'cluster_incremental' : 'cluster', @@ -1856,11 +1821,11 @@ export class GHCrawlService { crossKindMinScore, ...clusterQuality, }; - this.finishRun('cluster_runs', runId, 'completed', stats); + finishServiceRun(this.db, 'cluster_runs', runId, 'completed', stats); finishPipelineRun(this.db, pipelineRunId, { status: 'completed', stats }); return clusterResultSchema.parse({ runId, edges: edges.length, clusters: clusters.length }); } catch (error) { - this.finishRun('cluster_runs', runId, 'failed', null, error); + finishServiceRun(this.db, 'cluster_runs', runId, 'failed', null, error); finishPipelineRun(this.db, pipelineRunId, { status: 'failed', errorText: error instanceof Error ? error.message : String(error) }); throw error; } @@ -6682,29 +6647,4 @@ export class GHCrawlService { ); } - private startRun(table: RunTable, repoId: number, scope: string): number { - const result = this.db - .prepare(`insert into ${table} (repo_id, scope, status, started_at) values (?, ?, 'running', ?)`) - .run(repoId, scope, nowIso()); - return Number(result.lastInsertRowid); - } - - private finishRun( - table: RunTable, - runId: number, - status: 'completed' | 'failed', - stats?: unknown, - error?: unknown, - finishedAt = nowIso(), - ): void { - this.db - .prepare(`update ${table} set status = ?, finished_at = ?, stats_json = ?, error_text = ? where id = ?`) - .run( - status, - finishedAt, - stats === undefined ? null : asJson(stats), - error instanceof Error ? error.message : error ? String(error) : null, - runId, - ); - } } From 4e4a5262d257310121b4b8c4cd8ec7c5f8a8aece Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:16:11 -0700 Subject: [PATCH 143/215] refactor: extract doctor diagnostics --- packages/api-core/src/doctor.ts | 39 ++++++++++++++++++++++++++++++++ packages/api-core/src/service.ts | 38 +++++-------------------------- 2 files changed, 45 insertions(+), 32 deletions(-) create mode 100644 packages/api-core/src/doctor.ts diff --git a/packages/api-core/src/doctor.ts b/packages/api-core/src/doctor.ts new file mode 100644 index 0000000..4a482d4 --- /dev/null +++ b/packages/api-core/src/doctor.ts @@ -0,0 +1,39 @@ +import type { HealthResponse } from '@ghcrawl/api-contract'; + +import type { GitcrawlConfig } from './config.js'; +import type { DoctorResult } from './service-types.js'; +import type { VectorStore } from './vector/store.js'; + +export function buildDoctorResult(params: { health: HealthResponse; config: GitcrawlConfig; vectorStore: VectorStore }): DoctorResult { + const github = { + configured: Boolean(params.config.githubToken), + source: params.config.githubTokenSource, + tokenPresent: Boolean(params.config.githubToken), + error: null as string | null, + }; + const openai = { + configured: Boolean(params.config.openaiApiKey), + source: params.config.openaiApiKeySource, + tokenPresent: Boolean(params.config.openaiApiKey), + error: null as string | null, + }; + if (!github.configured) { + github.error = 'Set GITHUB_TOKEN to crawl GitHub data.'; + } + if (!openai.configured) { + openai.error = 'Set OPENAI_API_KEY only for summary or embedding commands.'; + } + + const vectorliteHealth = params.vectorStore.checkRuntime(); + + return { + health: params.health, + github, + openai, + vectorlite: { + configured: params.config.vectorBackend === 'vectorlite', + runtimeOk: vectorliteHealth.ok, + error: vectorliteHealth.error, + }, + }; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2e1e55a..be4dd96 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -99,6 +99,7 @@ import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; import { readTextBlob, storeTextBlob } from './db/blob-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; +import { buildDoctorResult } from './doctor.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { @@ -256,38 +257,11 @@ export class GHCrawlService { } async doctor(): Promise { - const health = this.init(); - const github = { - configured: Boolean(this.config.githubToken), - source: this.config.githubTokenSource, - tokenPresent: Boolean(this.config.githubToken), - error: null as string | null, - }; - const openai = { - configured: Boolean(this.config.openaiApiKey), - source: this.config.openaiApiKeySource, - tokenPresent: Boolean(this.config.openaiApiKey), - error: null as string | null, - }; - if (!github.configured) { - github.error = 'Set GITHUB_TOKEN to crawl GitHub data.'; - } - if (!openai.configured) { - openai.error = 'Set OPENAI_API_KEY only for summary or embedding commands.'; - } - - const vectorliteHealth = this.vectorStore.checkRuntime(); - - return { - health, - github, - openai, - vectorlite: { - configured: this.config.vectorBackend === 'vectorlite', - runtimeOk: vectorliteHealth.ok, - error: vectorliteHealth.error, - }, - }; + return buildDoctorResult({ + health: this.init(), + config: this.config, + vectorStore: this.vectorStore, + }); } listRepositories(): RepositoriesResponse { From 4b08dd954cd6ca75597156549f9adcecbd727e43 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:18:06 -0700 Subject: [PATCH 144/215] refactor: extract vector encoding helpers --- packages/api-core/src/service.ts | 71 +++--------------------- packages/api-core/src/vector/encoding.ts | 60 ++++++++++++++++++++ 2 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 packages/api-core/src/vector/encoding.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index be4dd96..d5dbc51 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -146,10 +146,6 @@ import { SUMMARY_PROMPT_VERSION, SYNC_BATCH_DELAY_MS, SYNC_BATCH_SIZE, - VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K, - VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH, - VECTORLITE_CLUSTER_EXPANDED_K, - VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, } from './service-constants.js'; import type { ActiveVectorRow, @@ -207,6 +203,7 @@ import { userType, } from './service-utils.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; +import { getVectorliteClusterQuery, normalizedDistanceToScore, normalizedEmbeddingBuffer, parseStoredVector, vectorBlob } from './vector/encoding.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; @@ -1680,7 +1677,7 @@ export class GHCrawlService { const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; const activeSourceKind = this.activeVectorSourceKind(); const activeIds = new Set(vectorItems.map((item) => item.id)); - const annQuery = this.getVectorliteClusterQuery(vectorItems.length, k); + const annQuery = getVectorliteClusterQuery(vectorItems.length, k); let processed = 0; let lastProgressAt = Date.now(); @@ -1971,7 +1968,7 @@ export class GHCrawlService { tempDb.transaction(() => { const loadStartedAt = Date.now(); for (const row of source.rows) { - insert.run(row.id, this.normalizedEmbeddingBuffer(row.normalizedEmbedding)); + insert.run(row.id, normalizedEmbeddingBuffer(row.normalizedEmbedding)); } loadMs += Date.now() - loadStartedAt; })(); @@ -1988,7 +1985,7 @@ export class GHCrawlService { let lastProgressAt = Date.now(); const queryLoadStartedAt = Date.now(); for (const row of source.rows) { - const candidates = query.all(this.normalizedEmbeddingBuffer(row.normalizedEmbedding)) as Array<{ + const candidates = query.all(normalizedEmbeddingBuffer(row.normalizedEmbedding)) as Array<{ rowid: number; distance: number; }>; @@ -1999,7 +1996,7 @@ export class GHCrawlService { if (candidate.rowid === row.id) { return -1; } - return this.normalizedDistanceToScore(candidate.distance); + return normalizedDistanceToScore(candidate.distance); }, }); let addedThisRow = 0; @@ -2286,7 +2283,7 @@ export class GHCrawlService { if (targetRow) { responseThread = targetRow; const candidateRows = this.queryNearestWithRecovery(repository.id, repository.fullName, { - vector: this.parseStoredVector(targetRow.vector_json), + vector: parseStoredVector(targetRow.vector_json), limit: limit * 2, candidateK: Math.max(limit * 8, 64), excludeThreadId: targetRow.id, @@ -3705,7 +3702,7 @@ export class GHCrawlService { const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?'); this.db.transaction(() => { for (const row of rows) { - update.run(this.vectorBlob(JSON.parse(row.vector_json) as number[]), nowIso(), row.thread_id); + update.run(vectorBlob(JSON.parse(row.vector_json) as number[]), nowIso(), row.thread_id); } })(); onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`); @@ -5272,14 +5269,6 @@ export class GHCrawlService { })); } - private normalizedEmbeddingBuffer(values: number[]): Buffer { - return Buffer.from(Float32Array.from(values).buffer); - } - - private normalizedDistanceToScore(distance: number): number { - return 1 - distance / 2; - } - private loadClusterableThreadMeta(repoId: number): { items: Array<{ id: number; number: number; title: string }>; sourceKinds: EmbeddingSourceKind[]; @@ -5346,7 +5335,7 @@ export class GHCrawlService { id: row.id, number: row.number, title: row.title, - embedding: this.parseStoredVector(row.vector_json), + embedding: parseStoredVector(row.vector_json), })); } @@ -6512,7 +6501,7 @@ export class GHCrawlService { this.config.embedModel, embedding.length, contentHash, - this.vectorBlob(embedding), + vectorBlob(embedding), this.config.vectorBackend, nowIso(), nowIso(), @@ -6556,48 +6545,6 @@ export class GHCrawlService { return row.count; } - private getVectorliteClusterQuery(totalItems: number, requestedK: number): { - limit: number; - candidateK: number; - efSearch?: number; - } { - if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) { - return { - limit: requestedK, - candidateK: Math.max(requestedK * 16, 64), - }; - } - - const limit = Math.min( - Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), - Math.max(1, totalItems - 1), - ); - const candidateK = Math.min( - Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), - Math.max(limit, totalItems - 1), - ); - return { - limit, - candidateK, - efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH), - }; - } - - private vectorBlob(values: number[]): Buffer { - return Buffer.from(Float32Array.from(values).buffer); - } - - private parseStoredVector(value: Buffer | string): number[] { - if (typeof value === 'string') { - if (!value) { - throw new Error('Stored vector payload is empty. Run refresh or embed first.'); - } - return JSON.parse(value) as number[]; - } - const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT)); - return Array.from(floats); - } - private upsertEmbedding(threadId: number, sourceKind: EmbeddingSourceKind, contentHash: string, embedding: number[]): void { this.db .prepare( diff --git a/packages/api-core/src/vector/encoding.ts b/packages/api-core/src/vector/encoding.ts new file mode 100644 index 0000000..5f5665f --- /dev/null +++ b/packages/api-core/src/vector/encoding.ts @@ -0,0 +1,60 @@ +import { + CLUSTER_PARALLEL_MIN_EMBEDDINGS, + VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K, + VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH, + VECTORLITE_CLUSTER_EXPANDED_K, + VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, +} from '../service-constants.js'; + +export function vectorBlob(values: number[]): Buffer { + return Buffer.from(Float32Array.from(values).buffer); +} + +export function parseStoredVector(value: Buffer | string): number[] { + if (typeof value === 'string') { + if (!value) { + throw new Error('Stored vector payload is empty. Run refresh or embed first.'); + } + return JSON.parse(value) as number[]; + } + const floats = new Float32Array(value.buffer, value.byteOffset, Math.floor(value.byteLength / Float32Array.BYTES_PER_ELEMENT)); + return Array.from(floats); +} + +export function normalizedEmbeddingBuffer(values: number[]): Buffer { + return vectorBlob(values); +} + +export function normalizedDistanceToScore(distance: number): number { + return 1 - distance / 2; +} + +export function getVectorliteClusterQuery( + totalItems: number, + requestedK: number, +): { + limit: number; + candidateK: number; + efSearch?: number; +} { + if (totalItems < CLUSTER_PARALLEL_MIN_EMBEDDINGS) { + return { + limit: requestedK, + candidateK: Math.max(requestedK * 16, 64), + }; + } + + const limit = Math.min( + Math.max(requestedK * VECTORLITE_CLUSTER_EXPANDED_MULTIPLIER, VECTORLITE_CLUSTER_EXPANDED_K), + Math.max(1, totalItems - 1), + ); + const candidateK = Math.min( + Math.max(limit * 16, VECTORLITE_CLUSTER_EXPANDED_CANDIDATE_K), + Math.max(limit, totalItems - 1), + ); + return { + limit, + candidateK, + efSearch: Math.max(candidateK * 2, VECTORLITE_CLUSTER_EXPANDED_EF_SEARCH), + }; +} From 4d42dda8c92a2417e704a347b38f25ed8f6cf1f8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:20:17 -0700 Subject: [PATCH 145/215] refactor: extract pipeline state helpers --- packages/api-core/src/pipeline-state.ts | 123 ++++++++++++++++++++ packages/api-core/src/service.ts | 147 +++--------------------- 2 files changed, 140 insertions(+), 130 deletions(-) create mode 100644 packages/api-core/src/pipeline-state.ts diff --git a/packages/api-core/src/pipeline-state.ts b/packages/api-core/src/pipeline-state.ts new file mode 100644 index 0000000..460a8e4 --- /dev/null +++ b/packages/api-core/src/pipeline-state.ts @@ -0,0 +1,123 @@ +import type { GitcrawlConfig } from './config.js'; +import type { SqliteDatabase } from './db/sqlite.js'; +import { ACTIVE_EMBED_DIMENSIONS, ACTIVE_EMBED_PIPELINE_VERSION, SUMMARY_PROMPT_VERSION } from './service-constants.js'; +import type { RepoPipelineStateRow } from './service-types.js'; +import { nowIso } from './service-utils.js'; + +type DesiredPipelineState = Omit; + +export function getDesiredPipelineState(config: GitcrawlConfig): DesiredPipelineState { + return { + summary_model: config.summaryModel, + summary_prompt_version: SUMMARY_PROMPT_VERSION, + embedding_basis: config.embeddingBasis, + embed_model: config.embedModel, + embed_dimensions: ACTIVE_EMBED_DIMENSIONS, + embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION, + vector_backend: config.vectorBackend, + }; +} + +export function getRepoPipelineState(db: SqliteDatabase, repoId: number): RepoPipelineStateRow | null { + return (db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) as RepoPipelineStateRow | undefined) ?? null; +} + +export function isRepoVectorStateCurrent(db: SqliteDatabase, config: GitcrawlConfig, repoId: number): boolean { + const state = getRepoPipelineState(db, repoId); + if (!state || !state.vectors_current_at) { + return false; + } + const desired = getDesiredPipelineState(config); + return ( + state.summary_model === desired.summary_model && + state.summary_prompt_version === desired.summary_prompt_version && + state.embedding_basis === desired.embedding_basis && + state.embed_model === desired.embed_model && + state.embed_dimensions === desired.embed_dimensions && + state.embed_pipeline_version === desired.embed_pipeline_version && + state.vector_backend === desired.vector_backend + ); +} + +export function isRepoClusterStateCurrent(db: SqliteDatabase, config: GitcrawlConfig, repoId: number): boolean { + const state = getRepoPipelineState(db, repoId); + return isRepoVectorStateCurrent(db, config, repoId) && Boolean(state?.clusters_current_at); +} + +export function hasLegacyEmbeddings(db: SqliteDatabase, embedModel: string, repoId: number): boolean { + const row = db + .prepare( + `select count(*) as count + from document_embeddings e + join threads t on t.id = e.thread_id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and e.model = ?`, + ) + .get(repoId, embedModel) as { count: number }; + return row.count > 0; +} + +export function writeRepoPipelineState( + db: SqliteDatabase, + config: GitcrawlConfig, + repoId: number, + overrides: Partial>, +): void { + const desired = getDesiredPipelineState(config); + const current = getRepoPipelineState(db, repoId); + db.prepare( + `insert into repo_pipeline_state ( + repo_id, + summary_model, + summary_prompt_version, + embedding_basis, + embed_model, + embed_dimensions, + embed_pipeline_version, + vector_backend, + vectors_current_at, + clusters_current_at, + updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + summary_model = excluded.summary_model, + summary_prompt_version = excluded.summary_prompt_version, + embedding_basis = excluded.embedding_basis, + embed_model = excluded.embed_model, + embed_dimensions = excluded.embed_dimensions, + embed_pipeline_version = excluded.embed_pipeline_version, + vector_backend = excluded.vector_backend, + vectors_current_at = excluded.vectors_current_at, + clusters_current_at = excluded.clusters_current_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + desired.summary_model, + desired.summary_prompt_version, + desired.embedding_basis, + desired.embed_model, + desired.embed_dimensions, + desired.embed_pipeline_version, + desired.vector_backend, + overrides.vectors_current_at ?? current?.vectors_current_at ?? null, + overrides.clusters_current_at ?? current?.clusters_current_at ?? null, + nowIso(), + ); +} + +export function markRepoVectorsCurrent(db: SqliteDatabase, config: GitcrawlConfig, repoId: number): void { + writeRepoPipelineState(db, config, repoId, { + vectors_current_at: nowIso(), + clusters_current_at: null, + }); +} + +export function markRepoClustersCurrent(db: SqliteDatabase, config: GitcrawlConfig, repoId: number): void { + const state = getRepoPipelineState(db, repoId); + writeRepoPipelineState(db, config, repoId, { + vectors_current_at: state?.vectors_current_at ?? nowIso(), + clusters_current_at: nowIso(), + }); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d5dbc51..9308b60 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -102,6 +102,13 @@ import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.j import { buildDoctorResult } from './doctor.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; +import { + hasLegacyEmbeddings, + isRepoVectorStateCurrent, + markRepoClustersCurrent, + markRepoVectorsCurrent, + writeRepoPipelineState, +} from './pipeline-state.js'; import { exportPortableSyncDatabase, importPortableSyncDatabase, @@ -162,7 +169,6 @@ import type { KeySummaryTask, NeighborsResultInternal, PortableSyncExportOptions, - RepoPipelineStateRow, SearchResultInternal, SimilaritySourceKind, StoredEmbeddingRow, @@ -1527,7 +1533,7 @@ export class GHCrawlService { try { if (params.threadNumber === undefined) { - if (!this.isRepoVectorStateCurrent(repository.id)) { + if (!isRepoVectorStateCurrent(this.db, this.config, repository.id)) { this.resetRepositoryVectors(repository.id, repository.fullName); } else { const pruned = this.pruneInactiveRepositoryVectors(repository.id, repository.fullName); @@ -1581,7 +1587,7 @@ export class GHCrawlService { } } - this.markRepoVectorsCurrent(repository.id); + markRepoVectorsCurrent(this.db, this.config, repository.id); finishServiceRun(this.db, 'embedding_runs', runId, 'completed', { embedded }); return embedResultSchema.parse({ runId, embedded }); } catch (error) { @@ -1671,7 +1677,7 @@ export class GHCrawlService { `[cluster] built ${aggregatedEdges.size} deterministic similarity edge(s) for ${runSubject}`, ); - const vectorStateCurrent = this.isRepoVectorStateCurrent(repository.id); + const vectorStateCurrent = isRepoVectorStateCurrent(this.db, this.config, repository.id); const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); if (vectorItems.length > 0) { const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; @@ -1714,7 +1720,7 @@ export class GHCrawlService { lastProgressAt = now; } } - } else if (!seedThreadIds && this.hasLegacyEmbeddings(repository.id)) { + } else if (!seedThreadIds && hasLegacyEmbeddings(this.db, this.config.embedModel, repository.id)) { const legacy = this.loadClusterableThreadMeta(repository.id); params.onProgress?.( `[cluster] loaded ${legacy.items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`, @@ -1774,7 +1780,7 @@ export class GHCrawlService { this.pruneOldClusterRuns(repository.id, runId); } if (!seedThreadIds && vectorStateCurrent) { - this.markRepoClustersCurrent(repository.id); + markRepoClustersCurrent(this.db, this.config, repository.id); this.cleanupMigratedRepositoryArtifacts(repository.id, repository.fullName, params.onProgress); } @@ -1822,7 +1828,7 @@ export class GHCrawlService { const backend = params.backend ?? 'vectorlite'; const repository = this.requireRepository(params.owner, params.repo); const loaded = this.loadClusterableThreadMeta(repository.id); - const activeVectors = this.isRepoVectorStateCurrent(repository.id) ? this.loadNormalizedActiveVectors(repository.id) : []; + const activeVectors = isRepoVectorStateCurrent(this.db, this.config, repository.id) ? this.loadNormalizedActiveVectors(repository.id) : []; const activeSourceKind = this.activeVectorSourceKind(); const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0); const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds); @@ -2139,7 +2145,7 @@ export class GHCrawlService { } if (mode !== 'keyword' && this.ai) { - if (this.isRepoVectorStateCurrent(repository.id)) { + if (isRepoVectorStateCurrent(this.db, this.config, repository.id)) { const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query], @@ -2154,7 +2160,7 @@ export class GHCrawlService { if (neighbor.score < 0.2) continue; semanticScores.set(neighbor.threadId, Math.max(semanticScores.get(neighbor.threadId) ?? -1, neighbor.score)); } - } else if (this.hasLegacyEmbeddings(repository.id)) { + } else if (hasLegacyEmbeddings(this.db, this.config.embedModel, repository.id)) { const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] }); for (const row of this.iterateStoredEmbeddings(repository.id)) { const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json) as number[]); @@ -3443,125 +3449,6 @@ export class GHCrawlService { }; } - private getDesiredPipelineState(): Omit { - return { - summary_model: this.config.summaryModel, - summary_prompt_version: SUMMARY_PROMPT_VERSION, - embedding_basis: this.config.embeddingBasis, - embed_model: this.config.embedModel, - embed_dimensions: ACTIVE_EMBED_DIMENSIONS, - embed_pipeline_version: ACTIVE_EMBED_PIPELINE_VERSION, - vector_backend: this.config.vectorBackend, - }; - } - - private getRepoPipelineState(repoId: number): RepoPipelineStateRow | null { - return ( - (this.db.prepare('select * from repo_pipeline_state where repo_id = ? limit 1').get(repoId) as RepoPipelineStateRow | undefined) ?? - null - ); - } - - private isRepoVectorStateCurrent(repoId: number): boolean { - const state = this.getRepoPipelineState(repoId); - if (!state || !state.vectors_current_at) { - return false; - } - const desired = this.getDesiredPipelineState(); - return ( - state.summary_model === desired.summary_model && - state.summary_prompt_version === desired.summary_prompt_version && - state.embedding_basis === desired.embedding_basis && - state.embed_model === desired.embed_model && - state.embed_dimensions === desired.embed_dimensions && - state.embed_pipeline_version === desired.embed_pipeline_version && - state.vector_backend === desired.vector_backend - ); - } - - private isRepoClusterStateCurrent(repoId: number): boolean { - const state = this.getRepoPipelineState(repoId); - return this.isRepoVectorStateCurrent(repoId) && Boolean(state?.clusters_current_at); - } - - private hasLegacyEmbeddings(repoId: number): boolean { - const row = this.db - .prepare( - `select count(*) as count - from document_embeddings e - join threads t on t.id = e.thread_id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and e.model = ?`, - ) - .get(repoId, this.config.embedModel) as { count: number }; - return row.count > 0; - } - - private writeRepoPipelineState( - repoId: number, - overrides: Partial>, - ): void { - const desired = this.getDesiredPipelineState(); - const current = this.getRepoPipelineState(repoId); - this.db - .prepare( - `insert into repo_pipeline_state ( - repo_id, - summary_model, - summary_prompt_version, - embedding_basis, - embed_model, - embed_dimensions, - embed_pipeline_version, - vector_backend, - vectors_current_at, - clusters_current_at, - updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(repo_id) do update set - summary_model = excluded.summary_model, - summary_prompt_version = excluded.summary_prompt_version, - embedding_basis = excluded.embedding_basis, - embed_model = excluded.embed_model, - embed_dimensions = excluded.embed_dimensions, - embed_pipeline_version = excluded.embed_pipeline_version, - vector_backend = excluded.vector_backend, - vectors_current_at = excluded.vectors_current_at, - clusters_current_at = excluded.clusters_current_at, - updated_at = excluded.updated_at`, - ) - .run( - repoId, - desired.summary_model, - desired.summary_prompt_version, - desired.embedding_basis, - desired.embed_model, - desired.embed_dimensions, - desired.embed_pipeline_version, - desired.vector_backend, - overrides.vectors_current_at ?? current?.vectors_current_at ?? null, - overrides.clusters_current_at ?? current?.clusters_current_at ?? null, - nowIso(), - ); - } - - private markRepoVectorsCurrent(repoId: number): void { - this.writeRepoPipelineState(repoId, { - vectors_current_at: nowIso(), - clusters_current_at: null, - }); - } - - private markRepoClustersCurrent(repoId: number): void { - const state = this.getRepoPipelineState(repoId); - this.writeRepoPipelineState(repoId, { - vectors_current_at: state?.vectors_current_at ?? nowIso(), - clusters_current_at: nowIso(), - }); - } - private repoVectorStorePath(repoFullName: string): string { const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__'); return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`); @@ -3626,7 +3513,7 @@ export class GHCrawlService { storePath: this.repoVectorStorePath(repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, }); - this.writeRepoPipelineState(repoId, { + writeRepoPipelineState(this.db, this.config, repoId, { vectors_current_at: null, clusters_current_at: null, }); @@ -5706,7 +5593,7 @@ export class GHCrawlService { title: string; body: string | null; }>; - const pipelineCurrent = this.isRepoVectorStateCurrent(repoId); + const pipelineCurrent = isRepoVectorStateCurrent(this.db, this.config, repoId); const existingRows = this.db .prepare( `select tv.thread_id, tv.content_hash From 4274b27057a5daeaf6d2f39fa0c35f1563183cc6 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:21:40 -0700 Subject: [PATCH 146/215] refactor: extract vector store path helpers --- packages/api-core/src/service.ts | 39 +++++++------------ .../api-core/src/vector/repository-store.ts | 15 +++++++ 2 files changed, 28 insertions(+), 26 deletions(-) create mode 100644 packages/api-core/src/vector/repository-store.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9308b60..0824074 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -210,6 +210,7 @@ import { } from './service-utils.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { getVectorliteClusterQuery, normalizedDistanceToScore, normalizedEmbeddingBuffer, parseStoredVector, vectorBlob } from './vector/encoding.js'; +import { isCorruptedVectorIndexError, repositoryVectorStorePath, vectorStoreSidecarPath } from './vector/repository-store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; @@ -2785,8 +2786,8 @@ export class GHCrawlService { ]; if (repository) { - const storePath = this.repoVectorStorePath(repository.fullName); - const sidecarPath = this.vectorStoreSidecarPath(storePath); + const storePath = repositoryVectorStorePath(this.config.configDir, repository.fullName); + const sidecarPath = vectorStoreSidecarPath(storePath); if (existsSync(storePath)) { this.vectorStore.close(); const vectorDb = openDb(storePath) as SqliteDatabase & { loadExtension: (extensionPath: string) => void }; @@ -3449,15 +3450,6 @@ export class GHCrawlService { }; } - private repoVectorStorePath(repoFullName: string): string { - const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__'); - return path.join(this.config.configDir, 'vectors', `${safeName}.sqlite`); - } - - private vectorStoreSidecarPath(storePath: string): string { - return path.join(path.dirname(storePath), `${path.basename(storePath, path.extname(storePath))}.hnsw`); - } - private queryNearestWithRecovery( repoId: number, repoFullName: string, @@ -3466,17 +3458,17 @@ export class GHCrawlService { try { return this.vectorStore.queryNearest({ ...params, - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, }); } catch (error) { - if (!this.isCorruptedVectorIndexError(error)) { + if (!isCorruptedVectorIndexError(error)) { throw error; } this.rebuildRepositoryVectorStore(repoId, repoFullName); return this.vectorStore.queryNearest({ ...params, - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, }); } @@ -3484,12 +3476,12 @@ export class GHCrawlService { private rebuildRepositoryVectorStore(repoId: number, repoFullName: string): void { this.vectorStore.resetRepository({ - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, }); for (const row of this.loadClusterableActiveVectorMeta(repoId, repoFullName)) { this.vectorStore.upsertVector({ - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, threadId: row.id, vector: row.embedding, @@ -3497,11 +3489,6 @@ export class GHCrawlService { } } - private isCorruptedVectorIndexError(error: unknown): boolean { - const message = error instanceof Error ? error.message : String(error); - return /Failed to load index from file|corrupted or unsupported/i.test(message); - } - private resetRepositoryVectors(repoId: number, repoFullName: string): void { this.db .prepare( @@ -3510,7 +3497,7 @@ export class GHCrawlService { ) .run(repoId); this.vectorStore.resetRepository({ - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, }); writeRepoPipelineState(this.db, this.config, repoId, { @@ -3540,12 +3527,12 @@ export class GHCrawlService { deleteVectorRow.run(row.thread_id); try { this.vectorStore.deleteVector({ - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, threadId: row.thread_id, }); } catch (error) { - if (!this.isCorruptedVectorIndexError(error)) { + if (!isCorruptedVectorIndexError(error)) { throw error; } shouldRebuildVectorStore = true; @@ -6395,13 +6382,13 @@ export class GHCrawlService { ); try { this.vectorStore.upsertVector({ - storePath: this.repoVectorStorePath(repoFullName), + storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), dimensions: ACTIVE_EMBED_DIMENSIONS, threadId, vector: embedding, }); } catch (error) { - if (!this.isCorruptedVectorIndexError(error)) { + if (!isCorruptedVectorIndexError(error)) { throw error; } this.rebuildRepositoryVectorStore(repoId, repoFullName); diff --git a/packages/api-core/src/vector/repository-store.ts b/packages/api-core/src/vector/repository-store.ts new file mode 100644 index 0000000..a4300e7 --- /dev/null +++ b/packages/api-core/src/vector/repository-store.ts @@ -0,0 +1,15 @@ +import path from 'node:path'; + +export function repositoryVectorStorePath(configDir: string, repoFullName: string): string { + const safeName = repoFullName.replace(/[^a-zA-Z0-9._-]+/g, '__'); + return path.join(configDir, 'vectors', `${safeName}.sqlite`); +} + +export function vectorStoreSidecarPath(storePath: string): string { + return path.join(path.dirname(storePath), `${path.basename(storePath, path.extname(storePath))}.hnsw`); +} + +export function isCorruptedVectorIndexError(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return /Failed to load index from file|corrupted or unsupported/i.test(message); +} From 7e9ec87239178e359315e233a9a7788702c392e5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:23:37 -0700 Subject: [PATCH 147/215] refactor: extract sync cursor helpers --- packages/api-core/src/service.ts | 101 +-------------------------- packages/api-core/src/sync/cursor.ts | 97 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 98 deletions(-) create mode 100644 packages/api-core/src/sync/cursor.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 0824074..b1c6f0c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -124,6 +124,7 @@ import { import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; +import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { ACTIVE_EMBED_DIMENSIONS, ACTIVE_EMBED_PIPELINE_VERSION, @@ -200,7 +201,6 @@ import { parseLabels, parseObjectJson, parseStringArrayJson, - parseSyncRunStats, repositoryToDto, snippetText, stableContentHash, @@ -950,7 +950,7 @@ export class GHCrawlService { const repoData = await github.getRepo(params.owner, params.repo, reporter); const repoId = this.upsertRepository(params.owner, params.repo, repoData); const runId = startServiceRun(this.db, 'sync_runs', repoId, `${params.owner}/${params.repo}`); - const syncCursor = this.getSyncCursorState(repoId); + const syncCursor = getSyncCursorState(this.db, repoId); const overlapReferenceAt = syncCursor.lastOverlappingOpenScanCompletedAt ?? syncCursor.lastFullOpenScanStartedAt; const effectiveSince = params.since ?? @@ -1104,7 +1104,7 @@ export class GHCrawlService { !isFullOpenScan && !isOverlappingOpenScan ? finishedAt : syncCursor.lastNonOverlappingScanCompletedAt, lastReconciledOpenCloseAt: reconciledOpenCloseAt ?? syncCursor.lastReconciledOpenCloseAt, }; - this.writeSyncCursorState(repoId, nextSyncCursor); + writeSyncCursorState(this.db, repoId, nextSyncCursor); finishServiceRun(this.db, 'sync_runs', runId, 'completed', { threadsSynced, commentsSynced, @@ -3325,101 +3325,6 @@ export class GHCrawlService { } } - private getSyncCursorState(repoId: number): SyncCursorState { - const persisted = (this.db - .prepare( - `select - last_full_open_scan_started_at, - last_overlapping_open_scan_completed_at, - last_non_overlapping_scan_completed_at, - last_open_close_reconciled_at - from repo_sync_state - where repo_id = ?`, - ) - .get(repoId) as - | { - last_full_open_scan_started_at: string | null; - last_overlapping_open_scan_completed_at: string | null; - last_non_overlapping_scan_completed_at: string | null; - last_open_close_reconciled_at: string | null; - } - | undefined) ?? null; - if (persisted) { - return { - lastFullOpenScanStartedAt: persisted.last_full_open_scan_started_at, - lastOverlappingOpenScanCompletedAt: persisted.last_overlapping_open_scan_completed_at, - lastNonOverlappingScanCompletedAt: persisted.last_non_overlapping_scan_completed_at, - lastReconciledOpenCloseAt: persisted.last_open_close_reconciled_at, - }; - } - - const rows = this.db - .prepare("select finished_at, stats_json from sync_runs where repo_id = ? and status = 'completed' order by id desc") - .all(repoId) as Array<{ finished_at: string | null; stats_json: string | null }>; - const state: SyncCursorState = { - lastFullOpenScanStartedAt: null, - lastOverlappingOpenScanCompletedAt: null, - lastNonOverlappingScanCompletedAt: null, - lastReconciledOpenCloseAt: null, - }; - - for (const row of rows) { - const stats = parseSyncRunStats(row.stats_json); - if (!stats) continue; - if (state.lastFullOpenScanStartedAt === null && stats.isFullOpenScan) { - state.lastFullOpenScanStartedAt = stats.crawlStartedAt; - } - if (state.lastOverlappingOpenScanCompletedAt === null && stats.isOverlappingOpenScan && row.finished_at) { - state.lastOverlappingOpenScanCompletedAt = row.finished_at; - } - if (state.lastNonOverlappingScanCompletedAt === null && !stats.isFullOpenScan && !stats.isOverlappingOpenScan && row.finished_at) { - state.lastNonOverlappingScanCompletedAt = row.finished_at; - } - if (state.lastReconciledOpenCloseAt === null && stats.reconciledOpenCloseAt) { - state.lastReconciledOpenCloseAt = stats.reconciledOpenCloseAt; - } - } - - if ( - state.lastFullOpenScanStartedAt !== null || - state.lastOverlappingOpenScanCompletedAt !== null || - state.lastNonOverlappingScanCompletedAt !== null || - state.lastReconciledOpenCloseAt !== null - ) { - this.writeSyncCursorState(repoId, state); - } - - return state; - } - - private writeSyncCursorState(repoId: number, state: SyncCursorState): void { - this.db - .prepare( - `insert into repo_sync_state ( - repo_id, - last_full_open_scan_started_at, - last_overlapping_open_scan_completed_at, - last_non_overlapping_scan_completed_at, - last_open_close_reconciled_at, - updated_at - ) values (?, ?, ?, ?, ?, ?) - on conflict(repo_id) do update set - last_full_open_scan_started_at = excluded.last_full_open_scan_started_at, - last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at, - last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at, - last_open_close_reconciled_at = excluded.last_open_close_reconciled_at, - updated_at = excluded.updated_at`, - ) - .run( - repoId, - state.lastFullOpenScanStartedAt, - state.lastOverlappingOpenScanCompletedAt, - state.lastNonOverlappingScanCompletedAt, - state.lastReconciledOpenCloseAt, - nowIso(), - ); - } - private getTuiRepoStats(repoId: number): TuiRepoStats { const counts = this.db .prepare( diff --git a/packages/api-core/src/sync/cursor.ts b/packages/api-core/src/sync/cursor.ts new file mode 100644 index 0000000..03b0398 --- /dev/null +++ b/packages/api-core/src/sync/cursor.ts @@ -0,0 +1,97 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { SyncCursorState } from '../service-types.js'; +import { nowIso, parseSyncRunStats } from '../service-utils.js'; + +export function getSyncCursorState(db: SqliteDatabase, repoId: number): SyncCursorState { + const persisted = + (db + .prepare( + `select + last_full_open_scan_started_at, + last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at, + last_open_close_reconciled_at + from repo_sync_state + where repo_id = ?`, + ) + .get(repoId) as + | { + last_full_open_scan_started_at: string | null; + last_overlapping_open_scan_completed_at: string | null; + last_non_overlapping_scan_completed_at: string | null; + last_open_close_reconciled_at: string | null; + } + | undefined) ?? null; + if (persisted) { + return { + lastFullOpenScanStartedAt: persisted.last_full_open_scan_started_at, + lastOverlappingOpenScanCompletedAt: persisted.last_overlapping_open_scan_completed_at, + lastNonOverlappingScanCompletedAt: persisted.last_non_overlapping_scan_completed_at, + lastReconciledOpenCloseAt: persisted.last_open_close_reconciled_at, + }; + } + + const rows = db + .prepare("select finished_at, stats_json from sync_runs where repo_id = ? and status = 'completed' order by id desc") + .all(repoId) as Array<{ finished_at: string | null; stats_json: string | null }>; + const state: SyncCursorState = { + lastFullOpenScanStartedAt: null, + lastOverlappingOpenScanCompletedAt: null, + lastNonOverlappingScanCompletedAt: null, + lastReconciledOpenCloseAt: null, + }; + + for (const row of rows) { + const stats = parseSyncRunStats(row.stats_json); + if (!stats) continue; + if (state.lastFullOpenScanStartedAt === null && stats.isFullOpenScan) { + state.lastFullOpenScanStartedAt = stats.crawlStartedAt; + } + if (state.lastOverlappingOpenScanCompletedAt === null && stats.isOverlappingOpenScan && row.finished_at) { + state.lastOverlappingOpenScanCompletedAt = row.finished_at; + } + if (state.lastNonOverlappingScanCompletedAt === null && !stats.isFullOpenScan && !stats.isOverlappingOpenScan && row.finished_at) { + state.lastNonOverlappingScanCompletedAt = row.finished_at; + } + if (state.lastReconciledOpenCloseAt === null && stats.reconciledOpenCloseAt) { + state.lastReconciledOpenCloseAt = stats.reconciledOpenCloseAt; + } + } + + if ( + state.lastFullOpenScanStartedAt !== null || + state.lastOverlappingOpenScanCompletedAt !== null || + state.lastNonOverlappingScanCompletedAt !== null || + state.lastReconciledOpenCloseAt !== null + ) { + writeSyncCursorState(db, repoId, state); + } + + return state; +} + +export function writeSyncCursorState(db: SqliteDatabase, repoId: number, state: SyncCursorState): void { + db.prepare( + `insert into repo_sync_state ( + repo_id, + last_full_open_scan_started_at, + last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at, + last_open_close_reconciled_at, + updated_at + ) values (?, ?, ?, ?, ?, ?) + on conflict(repo_id) do update set + last_full_open_scan_started_at = excluded.last_full_open_scan_started_at, + last_overlapping_open_scan_completed_at = excluded.last_overlapping_open_scan_completed_at, + last_non_overlapping_scan_completed_at = excluded.last_non_overlapping_scan_completed_at, + last_open_close_reconciled_at = excluded.last_open_close_reconciled_at, + updated_at = excluded.updated_at`, + ).run( + repoId, + state.lastFullOpenScanStartedAt, + state.lastOverlappingOpenScanCompletedAt, + state.lastNonOverlappingScanCompletedAt, + state.lastReconciledOpenCloseAt, + nowIso(), + ); +} From 5d70fa48d7bd6943f33b8740e02c743640adb7e4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:25:52 -0700 Subject: [PATCH 148/215] refactor: extract tui thread detail helpers --- packages/api-core/src/service.ts | 83 ++------------------- packages/api-core/src/tui/thread-detail.ts | 84 ++++++++++++++++++++++ 2 files changed, 88 insertions(+), 79 deletions(-) create mode 100644 packages/api-core/src/tui/thread-detail.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b1c6f0c..f3d9002 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -125,6 +125,7 @@ import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from ' import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; +import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, ACTIVE_EMBED_PIPELINE_VERSION, @@ -192,7 +193,6 @@ import { isEffectivelyClosed, isMissingGitHubResourceError, isPullRequestPayload, - normalizeKeySummaryDisplayText, normalizeSummaryText, nowIso, parseArray, @@ -3187,27 +3187,9 @@ export class GHCrawlService { .get(latestRun.id, row.id) as { cluster_id: number } | undefined) ?? null) : null; - const summaryRows = this.db - .prepare( - `select summary_kind, summary_text - from document_summaries - where thread_id = ? and model = ? and prompt_version = ? - order by summary_kind asc`, - ) - .all(row.id, this.config.summaryModel, SUMMARY_PROMPT_VERSION) as Array<{ summary_kind: string; summary_text: string }>; - const summaries: TuiThreadDetail['summaries'] = {}; - for (const summary of summaryRows) { - if ( - summary.summary_kind === 'problem_summary' || - summary.summary_kind === 'solution_summary' || - summary.summary_kind === 'maintainer_signal_summary' || - summary.summary_kind === 'dedupe_summary' - ) { - summaries[summary.summary_kind] = summary.summary_text; - } - } - const topFiles = this.getTopChangedFiles(row.id, 5); - const keySummary = this.getLatestKeySummary(row.id); + const summaries = getTuiThreadSummaries(this.db, row.id, this.config.summaryModel); + const topFiles = getTopChangedFiles(this.db, row.id, 5); + const keySummary = getLatestTuiKeySummary(this.db, row.id, this.config.summaryModel); let neighbors: SearchHitDto['neighbors'] = []; if (params.includeNeighbors !== false) { @@ -3236,63 +3218,6 @@ export class GHCrawlService { }; } - private getLatestKeySummary(threadId: number): TuiThreadDetail['keySummary'] { - const row = this.db - .prepare( - `select ks.summary_kind, ks.prompt_version, ks.model, ks.key_text - from thread_key_summaries ks - join thread_revisions tr on tr.id = ks.thread_revision_id - where tr.thread_id = ? - and ks.summary_kind = 'llm_key_3line' - order by - case when ks.model = ? then 0 else 1 end, - tr.id desc, - ks.created_at desc - limit 1`, - ) - .get(threadId, this.config.summaryModel) as - | { - summary_kind: string; - prompt_version: string; - model: string; - key_text: string; - } - | undefined; - if (!row) return null; - const text = normalizeKeySummaryDisplayText(row.key_text); - if (!text) return null; - return { - summaryKind: row.summary_kind, - promptVersion: row.prompt_version, - model: row.model, - text, - }; - } - - private getTopChangedFiles(threadId: number, limit: number): TuiThreadDetail['topFiles'] { - const latestRevision = this.db - .prepare( - `select id - from thread_revisions - where thread_id = ? - order by id desc - limit 1`, - ) - .get(threadId) as { id: number } | undefined; - if (!latestRevision) return []; - - return this.db - .prepare( - `select cf.path, cf.status, cf.additions, cf.deletions - from thread_code_snapshots cs - join thread_changed_files cf on cf.snapshot_id = cs.id - where cs.thread_revision_id = ? - order by (cf.additions + cf.deletions) desc, cf.path asc - limit ?`, - ) - .all(latestRevision.id, limit) as TuiThreadDetail['topFiles']; - } - async rerunAction(request: ActionRequest): Promise { switch (request.action) { case 'summarize': { diff --git a/packages/api-core/src/tui/thread-detail.ts b/packages/api-core/src/tui/thread-detail.ts new file mode 100644 index 0000000..5eb13fd --- /dev/null +++ b/packages/api-core/src/tui/thread-detail.ts @@ -0,0 +1,84 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { SUMMARY_PROMPT_VERSION } from '../service-constants.js'; +import type { TuiThreadDetail } from '../service-types.js'; +import { normalizeKeySummaryDisplayText } from '../service-utils.js'; + +export function getTuiThreadSummaries(db: SqliteDatabase, threadId: number, summaryModel: string): TuiThreadDetail['summaries'] { + const rows = db + .prepare( + `select summary_kind, summary_text + from document_summaries + where thread_id = ? and model = ? and prompt_version = ? + order by summary_kind asc`, + ) + .all(threadId, summaryModel, SUMMARY_PROMPT_VERSION) as Array<{ summary_kind: string; summary_text: string }>; + const summaries: TuiThreadDetail['summaries'] = {}; + for (const summary of rows) { + if ( + summary.summary_kind === 'problem_summary' || + summary.summary_kind === 'solution_summary' || + summary.summary_kind === 'maintainer_signal_summary' || + summary.summary_kind === 'dedupe_summary' + ) { + summaries[summary.summary_kind] = summary.summary_text; + } + } + return summaries; +} + +export function getLatestTuiKeySummary(db: SqliteDatabase, threadId: number, summaryModel: string): TuiThreadDetail['keySummary'] { + const row = db + .prepare( + `select ks.summary_kind, ks.prompt_version, ks.model, ks.key_text + from thread_key_summaries ks + join thread_revisions tr on tr.id = ks.thread_revision_id + where tr.thread_id = ? + and ks.summary_kind = 'llm_key_3line' + order by + case when ks.model = ? then 0 else 1 end, + tr.id desc, + ks.created_at desc + limit 1`, + ) + .get(threadId, summaryModel) as + | { + summary_kind: string; + prompt_version: string; + model: string; + key_text: string; + } + | undefined; + if (!row) return null; + const text = normalizeKeySummaryDisplayText(row.key_text); + if (!text) return null; + return { + summaryKind: row.summary_kind, + promptVersion: row.prompt_version, + model: row.model, + text, + }; +} + +export function getTopChangedFiles(db: SqliteDatabase, threadId: number, limit: number): TuiThreadDetail['topFiles'] { + const latestRevision = db + .prepare( + `select id + from thread_revisions + where thread_id = ? + order by id desc + limit 1`, + ) + .get(threadId) as { id: number } | undefined; + if (!latestRevision) return []; + + return db + .prepare( + `select cf.path, cf.status, cf.additions, cf.deletions + from thread_code_snapshots cs + join thread_changed_files cf on cf.snapshot_id = cs.id + where cs.thread_revision_id = ? + order by (cf.additions + cf.deletions) desc, cf.path asc + limit ?`, + ) + .all(latestRevision.id, limit) as TuiThreadDetail['topFiles']; +} From e5d51d9cb95e7ea42b90d6f2b1a8417117a1e2d1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:27:16 -0700 Subject: [PATCH 149/215] refactor: extract raw json blob storage --- packages/api-core/src/db/raw-json-store.ts | 25 ++++++++++++++++++++ packages/api-core/src/service.ts | 27 +++++----------------- 2 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 packages/api-core/src/db/raw-json-store.ts diff --git a/packages/api-core/src/db/raw-json-store.ts b/packages/api-core/src/db/raw-json-store.ts new file mode 100644 index 0000000..153532c --- /dev/null +++ b/packages/api-core/src/db/raw-json-store.ts @@ -0,0 +1,25 @@ +import path from 'node:path'; + +import { RAW_JSON_INLINE_THRESHOLD_BYTES } from '../service-constants.js'; +import type { SqliteDatabase } from './sqlite.js'; +import { storeTextBlob } from './blob-store.js'; + +export function blobStoreRoot(dbPath: string): string { + return path.join(path.dirname(dbPath), '.ghcrawl-store'); +} + +export function rawJsonStorage( + db: SqliteDatabase, + dbPath: string, + rawJson: string, + mediaType: string, +): { inlineJson: string; blobId: number | null } { + if (Buffer.byteLength(rawJson, 'utf8') <= RAW_JSON_INLINE_THRESHOLD_BYTES) { + return { inlineJson: rawJson, blobId: null }; + } + const blob = storeTextBlob(db, blobStoreRoot(dbPath), rawJson, { + mediaType, + inlineThresholdBytes: RAW_JSON_INLINE_THRESHOLD_BYTES, + }); + return { inlineJson: '{}', blobId: blob.id }; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f3d9002..3401514 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -97,7 +97,8 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; -import { readTextBlob, storeTextBlob } from './db/blob-store.js'; +import { readTextBlob } from './db/blob-store.js'; +import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; @@ -147,7 +148,6 @@ import { KEY_SUMMARY_MAX_BODY_CHARS, KEY_SUMMARY_MAX_UNREAD, MAX_DIRECT_RECONCILE_THREADS, - RAW_JSON_INLINE_THRESHOLD_BYTES, requireFromHere, STALE_CLOSED_BACKFILL_LIMIT, STALE_CLOSED_SWEEP_LIMIT, @@ -4218,25 +4218,10 @@ export class GHCrawlService { baseSha: typeof base?.sha === 'string' ? base.sha : null, headSha: typeof head?.sha === 'string' ? head.sha : null, signature: buildCodeSnapshotSignature(files), - storeRoot: this.blobStoreRoot(), + storeRoot: blobStoreRoot(this.config.dbPath), }); } - private blobStoreRoot(): string { - return path.join(path.dirname(this.config.dbPath), '.ghcrawl-store'); - } - - private rawJsonStorage(rawJson: string, mediaType: string): { inlineJson: string; blobId: number | null } { - if (Buffer.byteLength(rawJson, 'utf8') <= RAW_JSON_INLINE_THRESHOLD_BYTES) { - return { inlineJson: rawJson, blobId: null }; - } - const blob = storeTextBlob(this.db, this.blobStoreRoot(), rawJson, { - mediaType, - inlineThresholdBytes: RAW_JSON_INLINE_THRESHOLD_BYTES, - }); - return { inlineJson: '{}', blobId: blob.id }; - } - private async applyClosedOverlapSweep(params: { repoId: number; owner: string; @@ -4448,7 +4433,7 @@ export class GHCrawlService { const tx = this.db.transaction((commentRows: CommentSeed[]) => { this.db.prepare('delete from comments where thread_id = ?').run(threadId); for (const comment of commentRows) { - const raw = this.rawJsonStorage(comment.rawJson, `application/vnd.ghcrawl.${comment.commentType}.raw+json`); + const raw = rawJsonStorage(this.db, this.config.dbPath, comment.rawJson, `application/vnd.ghcrawl.${comment.commentType}.raw+json`); insert.run( threadId, comment.githubId, @@ -5310,11 +5295,11 @@ export class GHCrawlService { patchIds: stringFeature('patchIds'), featureHash: typeof feature.featureHash === 'string' ? feature.featureHash : '', minhashSignature: row.minhash_signature_blob_id - ? parseStringArrayJson(readTextBlob(this.db, this.blobStoreRoot(), row.minhash_signature_blob_id)) + ? parseStringArrayJson(readTextBlob(this.db, blobStoreRoot(this.config.dbPath), row.minhash_signature_blob_id)) : [], simhash64: row.simhash64, winnowHashes: row.winnow_hashes_blob_id - ? parseStringArrayJson(readTextBlob(this.db, this.blobStoreRoot(), row.winnow_hashes_blob_id)) + ? parseStringArrayJson(readTextBlob(this.db, blobStoreRoot(this.config.dbPath), row.winnow_hashes_blob_id)) : [], }); } From 1d82b9ba5f50f5721640ff34b59f79c4c297f42d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:29:22 -0700 Subject: [PATCH 150/215] refactor: extract summary source builder --- packages/api-core/src/service.ts | 68 ++++--------------------- packages/api-core/src/summary/source.ts | 60 ++++++++++++++++++++++ 2 files changed, 71 insertions(+), 57 deletions(-) create mode 100644 packages/api-core/src/summary/source.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 3401514..f77c56b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -126,6 +126,7 @@ import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from ' import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; +import { buildSummarySource } from './summary/source.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -1174,10 +1175,16 @@ export class GHCrawlService { : '[summarize] metadata-only mode; comments are excluded from the summary input', ); - const sources = rows.map((row) => { - const source = this.buildSummarySource(row.id, row.title, row.body, parseArray(row.labels_json), includeComments); - return { ...row, ...source }; - }); + const sources = rows.map((row) => ({ + ...row, + ...buildSummarySource(this.db, { + threadId: row.id, + title: row.title, + body: row.body, + labels: parseArray(row.labels_json), + includeComments, + }), + })); const pending = sources.filter((row) => { const latest = this.db @@ -4488,59 +4495,6 @@ export class GHCrawlService { this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId); } - private buildSummarySource( - threadId: number, - title: string, - body: string | null, - labels: string[], - includeComments: boolean, - ): { summaryInput: string; summaryContentHash: string } { - const parts = [`title: ${normalizeSummaryText(title)}`]; - const normalizedBody = normalizeSummaryText(body ?? ''); - if (normalizedBody) { - parts.push(`body: ${normalizedBody}`); - } - if (labels.length > 0) { - parts.push(`labels: ${labels.join(', ')}`); - } - - if (includeComments) { - const comments = this.db - .prepare( - `select body, author_login, author_type, is_bot - from comments - where thread_id = ? - order by coalesce(created_at_gh, updated_at_gh) asc, id asc`, - ) - .all(threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>; - - const humanComments = comments - .filter((comment) => - !isBotLikeAuthor({ - authorLogin: comment.author_login, - authorType: comment.author_type, - isBot: comment.is_bot === 1, - }), - ) - .map((comment) => { - const author = comment.author_login ? `@${comment.author_login}` : 'unknown'; - const normalized = normalizeSummaryText(comment.body); - return normalized ? `${author}: ${normalized}` : ''; - }) - .filter(Boolean); - - if (humanComments.length > 0) { - parts.push(`discussion:\n${humanComments.join('\n')}`); - } - } - - const summaryInput = parts.join('\n\n'); - const summaryContentHash = stableContentHash( - `summary:${SUMMARY_PROMPT_VERSION}:${includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`, - ); - return { summaryInput, summaryContentHash }; - } - private buildEmbeddingTasks(params: { threadId: number; threadNumber: number; diff --git a/packages/api-core/src/summary/source.ts b/packages/api-core/src/summary/source.ts new file mode 100644 index 0000000..9c2da3d --- /dev/null +++ b/packages/api-core/src/summary/source.ts @@ -0,0 +1,60 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { isBotLikeAuthor } from '../documents/normalize.js'; +import { SUMMARY_PROMPT_VERSION } from '../service-constants.js'; +import { normalizeSummaryText, stableContentHash } from '../service-utils.js'; + +export function buildSummarySource( + db: SqliteDatabase, + params: { + threadId: number; + title: string; + body: string | null; + labels: string[]; + includeComments: boolean; + }, +): { summaryInput: string; summaryContentHash: string } { + const parts = [`title: ${normalizeSummaryText(params.title)}`]; + const normalizedBody = normalizeSummaryText(params.body ?? ''); + if (normalizedBody) { + parts.push(`body: ${normalizedBody}`); + } + if (params.labels.length > 0) { + parts.push(`labels: ${params.labels.join(', ')}`); + } + + if (params.includeComments) { + const comments = db + .prepare( + `select body, author_login, author_type, is_bot + from comments + where thread_id = ? + order by coalesce(created_at_gh, updated_at_gh) asc, id asc`, + ) + .all(params.threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>; + + const humanComments = comments + .filter((comment) => + !isBotLikeAuthor({ + authorLogin: comment.author_login, + authorType: comment.author_type, + isBot: comment.is_bot === 1, + }), + ) + .map((comment) => { + const author = comment.author_login ? `@${comment.author_login}` : 'unknown'; + const normalized = normalizeSummaryText(comment.body); + return normalized ? `${author}: ${normalized}` : ''; + }) + .filter(Boolean); + + if (humanComments.length > 0) { + parts.push(`discussion:\n${humanComments.join('\n')}`); + } + } + + const summaryInput = parts.join('\n\n'); + const summaryContentHash = stableContentHash( + `summary:${SUMMARY_PROMPT_VERSION}:${params.includeComments ? 'with-comments' : 'metadata-only'}\n${summaryInput}`, + ); + return { summaryInput, summaryContentHash }; +} From 0fde350bab3fa995f70cb52442e1ea33526260f3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:30:03 -0700 Subject: [PATCH 151/215] refactor: extract key summary input builder --- packages/api-core/src/service.ts | 13 ++----------- packages/api-core/src/summary/source.ts | 11 ++++++++++- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index f77c56b..5528c1a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -126,7 +126,7 @@ import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from ' import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; -import { buildSummarySource } from './summary/source.js'; +import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -1354,7 +1354,7 @@ export class GHCrawlService { for (const row of rows) { const labels = parseArray(row.labels_json); - const text = this.buildKeySummaryInputText({ + const text = buildKeySummaryInputText({ title: row.title, labels, body: row.body, @@ -1479,15 +1479,6 @@ export class GHCrawlService { } } - private buildKeySummaryInputText(params: { title: string; labels: string[]; body: string | null }): string { - const body = normalizeSummaryText(params.body ?? ''); - const truncatedBody = - body.length > KEY_SUMMARY_MAX_BODY_CHARS - ? `${body.slice(0, KEY_SUMMARY_MAX_BODY_CHARS)}\n\n[truncated for key summary]` - : body; - return [`title: ${params.title}`, `labels: ${params.labels.join(', ')}`, `body: ${truncatedBody}`].join('\n'); - } - purgeComments(params: { owner: string; repo: string; diff --git a/packages/api-core/src/summary/source.ts b/packages/api-core/src/summary/source.ts index 9c2da3d..0b62af4 100644 --- a/packages/api-core/src/summary/source.ts +++ b/packages/api-core/src/summary/source.ts @@ -1,8 +1,17 @@ import type { SqliteDatabase } from '../db/sqlite.js'; import { isBotLikeAuthor } from '../documents/normalize.js'; -import { SUMMARY_PROMPT_VERSION } from '../service-constants.js'; +import { KEY_SUMMARY_MAX_BODY_CHARS, SUMMARY_PROMPT_VERSION } from '../service-constants.js'; import { normalizeSummaryText, stableContentHash } from '../service-utils.js'; +export function buildKeySummaryInputText(params: { title: string; labels: string[]; body: string | null }): string { + const body = normalizeSummaryText(params.body ?? ''); + const truncatedBody = + body.length > KEY_SUMMARY_MAX_BODY_CHARS + ? `${body.slice(0, KEY_SUMMARY_MAX_BODY_CHARS)}\n\n[truncated for key summary]` + : body; + return [`title: ${params.title}`, `labels: ${params.labels.join(', ')}`, `body: ${truncatedBody}`].join('\n'); +} + export function buildSummarySource( db: SqliteDatabase, params: { From 6786e7509324db4a71b9cd73bff89938a7fb7508 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:32:04 -0700 Subject: [PATCH 152/215] refactor: extract active vector task builder --- packages/api-core/src/embedding/tasks.ts | 92 ++++++++++++++ packages/api-core/src/service.ts | 148 ++--------------------- 2 files changed, 103 insertions(+), 137 deletions(-) create mode 100644 packages/api-core/src/embedding/tasks.ts diff --git a/packages/api-core/src/embedding/tasks.ts b/packages/api-core/src/embedding/tasks.ts new file mode 100644 index 0000000..51bbdb4 --- /dev/null +++ b/packages/api-core/src/embedding/tasks.ts @@ -0,0 +1,92 @@ +import type { EmbeddingBasis } from '../config.js'; +import { + ACTIVE_EMBED_DIMENSIONS, + ACTIVE_EMBED_PIPELINE_VERSION, + EMBED_ESTIMATED_CHARS_PER_TOKEN, + EMBED_MAX_ITEM_TOKENS, + EMBED_TRUNCATION_MARKER, +} from '../service-constants.js'; +import type { ActiveVectorTask, EmbeddingSourceKind } from '../service-types.js'; +import { normalizeSummaryText, stableContentHash } from '../service-utils.js'; + +export function activeVectorSourceKind(embeddingBasis: EmbeddingBasis): EmbeddingSourceKind { + if (embeddingBasis === 'title_summary') { + return 'dedupe_summary'; + } + if (embeddingBasis === 'llm_key_summary') { + return 'llm_key_summary'; + } + return 'body'; +} + +export function buildActiveVectorTask(params: { + threadId: number; + threadNumber: number; + title: string; + body: string | null; + dedupeSummary: string | null; + keySummary: string | null; + embeddingBasis: EmbeddingBasis; + embedModel: string; +}): ActiveVectorTask | null { + const sections = [`title: ${normalizeSummaryText(params.title)}`]; + if (params.embeddingBasis === 'title_summary') { + const summary = normalizeSummaryText(params.dedupeSummary ?? ''); + if (!summary) { + return null; + } + sections.push(`summary: ${summary}`); + } else if (params.embeddingBasis === 'llm_key_summary') { + const keySummary = normalizeSummaryText(params.keySummary ?? ''); + if (!keySummary) { + return null; + } + sections.push(`key_summary:\n${keySummary}`); + } else { + const body = normalizeSummaryText(params.body ?? ''); + if (body) { + sections.push(`body: ${body}`); + } + } + + const prepared = prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS); + if (!prepared) { + return null; + } + + return { + threadId: params.threadId, + threadNumber: params.threadNumber, + basis: params.embeddingBasis, + text: prepared.text, + contentHash: stableContentHash( + `embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${params.embeddingBasis}:${params.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`, + ), + estimatedTokens: prepared.estimatedTokens, + wasTruncated: prepared.wasTruncated, + }; +} + +export function prepareEmbeddingText( + text: string, + maxEstimatedTokens: number, +): { text: string; estimatedTokens: number; wasTruncated: boolean } | null { + if (!text) { + return null; + } + + const maxChars = maxEstimatedTokens * EMBED_ESTIMATED_CHARS_PER_TOKEN; + const wasTruncated = text.length > maxChars; + const prepared = wasTruncated + ? `${text.slice(0, Math.max(0, maxChars - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}` + : text; + return { + text: prepared, + estimatedTokens: estimateEmbeddingTokens(prepared), + wasTruncated, + }; +} + +export function estimateEmbeddingTokens(text: string): number { + return Math.max(1, Math.ceil(text.length / EMBED_ESTIMATED_CHARS_PER_TOKEN)); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 5528c1a..cf29c32 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -101,6 +101,11 @@ import { readTextBlob } from './db/blob-store.js'; import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; +import { + activeVectorSourceKind, + buildActiveVectorTask, + estimateEmbeddingTokens, +} from './embedding/tasks.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { @@ -141,9 +146,7 @@ import { EMBED_CONTEXT_RETRY_ATTEMPTS, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO, EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO, - EMBED_ESTIMATED_CHARS_PER_TOKEN, EMBED_MAX_BATCH_TOKENS, - EMBED_MAX_ITEM_TOKENS, EMBED_TRUNCATION_MARKER, KEY_SUMMARY_CONCURRENCY, KEY_SUMMARY_MAX_BODY_CHARS, @@ -167,7 +170,6 @@ import type { DoctorResult, DurableTuiClosure, EmbeddingSourceKind, - EmbeddingTask, EmbeddingWorkset, KeySummaryTask, NeighborsResultInternal, @@ -1680,7 +1682,7 @@ export class GHCrawlService { const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); if (vectorItems.length > 0) { const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; - const activeSourceKind = this.activeVectorSourceKind(); + const activeSourceKind = activeVectorSourceKind(this.config.embeddingBasis); const activeIds = new Set(vectorItems.map((item) => item.id)); const annQuery = getVectorliteClusterQuery(vectorItems.length, k); let processed = 0; @@ -1828,7 +1830,7 @@ export class GHCrawlService { const repository = this.requireRepository(params.owner, params.repo); const loaded = this.loadClusterableThreadMeta(repository.id); const activeVectors = isRepoVectorStateCurrent(this.db, this.config, repository.id) ? this.loadNormalizedActiveVectors(repository.id) : []; - const activeSourceKind = this.activeVectorSourceKind(); + const activeSourceKind = activeVectorSourceKind(this.config.embeddingBasis); const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0); const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds); const items = useActiveVectors @@ -4486,136 +4488,6 @@ export class GHCrawlService { this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId); } - private buildEmbeddingTasks(params: { - threadId: number; - threadNumber: number; - title: string; - body: string | null; - dedupeSummary: string | null; - }): EmbeddingTask[] { - const tasks: EmbeddingTask[] = []; - const titleText = this.prepareEmbeddingText(normalizeSummaryText(params.title), EMBED_MAX_ITEM_TOKENS); - if (titleText) { - tasks.push({ - threadId: params.threadId, - threadNumber: params.threadNumber, - sourceKind: 'title', - text: titleText.text, - contentHash: stableContentHash(`embedding:title\n${titleText.text}`), - estimatedTokens: titleText.estimatedTokens, - wasTruncated: titleText.wasTruncated, - }); - } - - const bodyText = this.prepareEmbeddingText(normalizeSummaryText(params.body ?? ''), EMBED_MAX_ITEM_TOKENS); - if (bodyText) { - tasks.push({ - threadId: params.threadId, - threadNumber: params.threadNumber, - sourceKind: 'body', - text: bodyText.text, - contentHash: stableContentHash(`embedding:body\n${bodyText.text}`), - estimatedTokens: bodyText.estimatedTokens, - wasTruncated: bodyText.wasTruncated, - }); - } - - const summaryText = this.prepareEmbeddingText(normalizeSummaryText(params.dedupeSummary ?? ''), EMBED_MAX_ITEM_TOKENS); - if (summaryText) { - tasks.push({ - threadId: params.threadId, - threadNumber: params.threadNumber, - sourceKind: 'dedupe_summary', - text: summaryText.text, - contentHash: stableContentHash(`embedding:dedupe_summary\n${summaryText.text}`), - estimatedTokens: summaryText.estimatedTokens, - wasTruncated: summaryText.wasTruncated, - }); - } - - return tasks; - } - - private buildActiveVectorTask(params: { - threadId: number; - threadNumber: number; - title: string; - body: string | null; - dedupeSummary: string | null; - keySummary: string | null; - }): ActiveVectorTask | null { - const sections = [`title: ${normalizeSummaryText(params.title)}`]; - if (this.config.embeddingBasis === 'title_summary') { - const summary = normalizeSummaryText(params.dedupeSummary ?? ''); - if (!summary) { - return null; - } - sections.push(`summary: ${summary}`); - } else if (this.config.embeddingBasis === 'llm_key_summary') { - const keySummary = normalizeSummaryText(params.keySummary ?? ''); - if (!keySummary) { - return null; - } - sections.push(`key_summary:\n${keySummary}`); - } else { - const body = normalizeSummaryText(params.body ?? ''); - if (body) { - sections.push(`body: ${body}`); - } - } - - const prepared = this.prepareEmbeddingText(sections.join('\n\n'), EMBED_MAX_ITEM_TOKENS); - if (!prepared) { - return null; - } - - return { - threadId: params.threadId, - threadNumber: params.threadNumber, - basis: this.config.embeddingBasis, - text: prepared.text, - contentHash: stableContentHash( - `embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${this.config.embeddingBasis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${prepared.text}`, - ), - estimatedTokens: prepared.estimatedTokens, - wasTruncated: prepared.wasTruncated, - }; - } - - private activeVectorSourceKind(): EmbeddingSourceKind { - if (this.config.embeddingBasis === 'title_summary') { - return 'dedupe_summary'; - } - if (this.config.embeddingBasis === 'llm_key_summary') { - return 'llm_key_summary'; - } - return 'body'; - } - - private prepareEmbeddingText( - text: string, - maxEstimatedTokens: number, - ): { text: string; estimatedTokens: number; wasTruncated: boolean } | null { - if (!text) { - return null; - } - - const maxChars = maxEstimatedTokens * EMBED_ESTIMATED_CHARS_PER_TOKEN; - const wasTruncated = text.length > maxChars; - const prepared = wasTruncated - ? `${text.slice(0, Math.max(0, maxChars - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}` - : text; - return { - text: prepared, - estimatedTokens: this.estimateEmbeddingTokens(prepared), - wasTruncated, - }; - } - - private estimateEmbeddingTokens(text: string): number { - return Math.max(1, Math.ceil(text.length / EMBED_ESTIMATED_CHARS_PER_TOKEN)); - } - private parseEmbeddingContextError(error: unknown): { limitTokens: number | null; requestedTokens: number | null } | null { const message = error instanceof Error ? error.message : String(error); const requestedMatch = message.match(/requested\s+(\d+)\s+tokens/i); @@ -4731,7 +4603,7 @@ export class GHCrawlService { contentHash: stableContentHash( `embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`, ), - estimatedTokens: this.estimateEmbeddingTokens(nextText), + estimatedTokens: estimateEmbeddingTokens(nextText), wasTruncated: true, }; } @@ -5363,13 +5235,15 @@ export class GHCrawlService { const keySummaryTexts = this.loadKeySummaryTextMap(repoId, threadNumber); const missingSummaryThreadNumbers: number[] = []; const tasks = rows.flatMap((row) => { - const task = this.buildActiveVectorTask({ + const task = buildActiveVectorTask({ threadId: row.id, threadNumber: row.number, title: row.title, body: row.body, dedupeSummary: summaryTexts.get(row.id) ?? null, keySummary: keySummaryTexts.get(row.id) ?? null, + embeddingBasis: this.config.embeddingBasis, + embedModel: this.config.embedModel, }); if (task) { return [task]; From fd82929defb6f7bc35f044dd3516d806e455e436 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:33:01 -0700 Subject: [PATCH 153/215] refactor: extract embedding retry helpers --- packages/api-core/src/embedding/retry.ts | 84 +++++++++++++++++++++++ packages/api-core/src/service.ts | 85 ++---------------------- 2 files changed, 89 insertions(+), 80 deletions(-) create mode 100644 packages/api-core/src/embedding/retry.ts diff --git a/packages/api-core/src/embedding/retry.ts b/packages/api-core/src/embedding/retry.ts new file mode 100644 index 0000000..147a91c --- /dev/null +++ b/packages/api-core/src/embedding/retry.ts @@ -0,0 +1,84 @@ +import { + ACTIVE_EMBED_DIMENSIONS, + ACTIVE_EMBED_PIPELINE_VERSION, + EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO, + EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO, + EMBED_TRUNCATION_MARKER, +} from '../service-constants.js'; +import type { ActiveVectorTask } from '../service-types.js'; +import { stableContentHash } from '../service-utils.js'; +import { estimateEmbeddingTokens } from './tasks.js'; + +export type EmbeddingContextError = { limitTokens: number | null; requestedTokens: number | null }; + +export function parseEmbeddingContextError(error: unknown): EmbeddingContextError | null { + const message = error instanceof Error ? error.message : String(error); + const requestedMatch = message.match(/requested\s+(\d+)\s+tokens/i); + const contextLimitMatch = message.match(/maximum context length is\s+(\d+)\s+tokens/i); + const inputLimitMatch = message.match(/maximum input length is\s+(\d+)\s+tokens/i); + const limitTokens = Number(contextLimitMatch?.[1] ?? inputLimitMatch?.[1] ?? NaN); + const requestedTokens = Number(requestedMatch?.[1] ?? NaN); + + if (!Number.isFinite(limitTokens) && !Number.isFinite(requestedTokens)) { + return null; + } + + return { + limitTokens: Number.isFinite(limitTokens) ? limitTokens : null, + requestedTokens: Number.isFinite(requestedTokens) ? requestedTokens : null, + }; +} + +export function isEmbeddingContextError(error: unknown): boolean { + return parseEmbeddingContextError(error) !== null; +} + +export function shrinkEmbeddingTask( + task: ActiveVectorTask, + params: { embedModel: string; context?: EmbeddingContextError }, +): ActiveVectorTask | null { + const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER) + ? task.text.slice(0, -EMBED_TRUNCATION_MARKER.length) + : task.text; + if (withoutMarker.length < 256) { + return null; + } + + const nextLength = Math.max( + 256, + projectEmbeddingRetryLength(withoutMarker.length, task.estimatedTokens, params.context), + ); + if (nextLength >= withoutMarker.length) { + return null; + } + const nextText = `${withoutMarker.slice(0, Math.max(0, nextLength - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`; + return { + ...task, + text: nextText, + contentHash: stableContentHash( + `embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${params.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`, + ), + estimatedTokens: estimateEmbeddingTokens(nextText), + wasTruncated: true, + }; +} + +function projectEmbeddingRetryLength( + textLength: number, + estimatedTokens: number, + context?: EmbeddingContextError, +): number { + const limitTokens = context?.limitTokens ?? null; + const requestedTokens = context?.requestedTokens ?? null; + if (limitTokens && requestedTokens && requestedTokens > limitTokens) { + const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / requestedTokens; + return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO))); + } + + if (limitTokens && estimatedTokens > limitTokens) { + const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / estimatedTokens; + return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO))); + } + + return Math.floor(textLength * EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index cf29c32..e14fc3c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -101,10 +101,10 @@ import { readTextBlob } from './db/blob-store.js'; import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; +import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './embedding/retry.js'; import { activeVectorSourceKind, buildActiveVectorTask, - estimateEmbeddingTokens, } from './embedding/tasks.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; @@ -144,10 +144,7 @@ import { DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE, DURABLE_CLUSTER_REUSE_MIN_OVERLAP, EMBED_CONTEXT_RETRY_ATTEMPTS, - EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO, - EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO, EMBED_MAX_BATCH_TOKENS, - EMBED_TRUNCATION_MARKER, KEY_SUMMARY_CONCURRENCY, KEY_SUMMARY_MAX_BODY_CHARS, KEY_SUMMARY_MAX_UNREAD, @@ -4488,28 +4485,6 @@ export class GHCrawlService { this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId); } - private parseEmbeddingContextError(error: unknown): { limitTokens: number | null; requestedTokens: number | null } | null { - const message = error instanceof Error ? error.message : String(error); - const requestedMatch = message.match(/requested\s+(\d+)\s+tokens/i); - const contextLimitMatch = message.match(/maximum context length is\s+(\d+)\s+tokens/i); - const inputLimitMatch = message.match(/maximum input length is\s+(\d+)\s+tokens/i); - const limitTokens = Number(contextLimitMatch?.[1] ?? inputLimitMatch?.[1] ?? NaN); - const requestedTokens = Number(requestedMatch?.[1] ?? NaN); - - if (!Number.isFinite(limitTokens) && !Number.isFinite(requestedTokens)) { - return null; - } - - return { - limitTokens: Number.isFinite(limitTokens) ? limitTokens : null, - requestedTokens: Number.isFinite(requestedTokens) ? requestedTokens : null, - }; - } - - private isEmbeddingContextError(error: unknown): boolean { - return this.parseEmbeddingContextError(error) !== null; - } - private async embedBatchWithRecovery( ai: AiProvider, batch: ActiveVectorTask[], @@ -4523,8 +4498,8 @@ export class GHCrawlService { }); return batch.map((task, index) => ({ task, embedding: embeddings[index] })); } catch (error) { - if (!this.isEmbeddingContextError(error) || batch.length === 1) { - if (batch.length === 1 && this.isEmbeddingContextError(error)) { + if (!isEmbeddingContextError(error) || batch.length === 1) { + if (batch.length === 1 && isEmbeddingContextError(error)) { const recovered = await this.embedSingleTaskWithRecovery(ai, batch[0], onProgress); return [recovered]; } @@ -4559,12 +4534,12 @@ export class GHCrawlService { }); return { task: current, embedding }; } catch (error) { - const context = this.parseEmbeddingContextError(error); + const context = parseEmbeddingContextError(error); if (!context) { throw error; } - const next = this.shrinkEmbeddingTask(current, context); + const next = shrinkEmbeddingTask(current, { embedModel: this.config.embedModel, context }); if (!next || next.text === current.text) { throw error; } @@ -4578,56 +4553,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private shrinkEmbeddingTask( - task: ActiveVectorTask, - context?: { limitTokens: number | null; requestedTokens: number | null }, - ): ActiveVectorTask | null { - const withoutMarker = task.text.endsWith(EMBED_TRUNCATION_MARKER) - ? task.text.slice(0, -EMBED_TRUNCATION_MARKER.length) - : task.text; - if (withoutMarker.length < 256) { - return null; - } - - const nextLength = Math.max( - 256, - this.projectEmbeddingRetryLength(withoutMarker.length, task.estimatedTokens, context), - ); - if (nextLength >= withoutMarker.length) { - return null; - } - const nextText = `${withoutMarker.slice(0, Math.max(0, nextLength - EMBED_TRUNCATION_MARKER.length)).trimEnd()}${EMBED_TRUNCATION_MARKER}`; - return { - ...task, - text: nextText, - contentHash: stableContentHash( - `embedding:${ACTIVE_EMBED_PIPELINE_VERSION}:${task.basis}:${this.config.embedModel}:${ACTIVE_EMBED_DIMENSIONS}\n${nextText}`, - ), - estimatedTokens: estimateEmbeddingTokens(nextText), - wasTruncated: true, - }; - } - - private projectEmbeddingRetryLength( - textLength: number, - estimatedTokens: number, - context?: { limitTokens: number | null; requestedTokens: number | null }, - ): number { - const limitTokens = context?.limitTokens ?? null; - const requestedTokens = context?.requestedTokens ?? null; - if (limitTokens && requestedTokens && requestedTokens > limitTokens) { - const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / requestedTokens; - return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO))); - } - - if (limitTokens && estimatedTokens > limitTokens) { - const targetRatio = (limitTokens * EMBED_CONTEXT_RETRY_TARGET_BUFFER_RATIO) / estimatedTokens; - return Math.floor(textLength * Math.max(0.1, Math.min(targetRatio, EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO))); - } - - return Math.floor(textLength * EMBED_CONTEXT_RETRY_FALLBACK_SHRINK_RATIO); - } - private chunkEmbeddingTasks(items: ActiveVectorTask[], maxItems: number, maxEstimatedTokens: number): ActiveVectorTask[][] { const chunks: ActiveVectorTask[][] = []; let current: ActiveVectorTask[] = []; From 4514f401941524e820ad2133d2cbca23ed52f573 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:33:40 -0700 Subject: [PATCH 154/215] refactor: extract embedding batch chunking --- packages/api-core/src/embedding/chunks.ts | 25 +++++++++++++++++++++ packages/api-core/src/service.ts | 27 ++--------------------- 2 files changed, 27 insertions(+), 25 deletions(-) create mode 100644 packages/api-core/src/embedding/chunks.ts diff --git a/packages/api-core/src/embedding/chunks.ts b/packages/api-core/src/embedding/chunks.ts new file mode 100644 index 0000000..5b9ad13 --- /dev/null +++ b/packages/api-core/src/embedding/chunks.ts @@ -0,0 +1,25 @@ +import type { ActiveVectorTask } from '../service-types.js'; + +export function chunkEmbeddingTasks(items: ActiveVectorTask[], maxItems: number, maxEstimatedTokens: number): ActiveVectorTask[][] { + const chunks: ActiveVectorTask[][] = []; + let current: ActiveVectorTask[] = []; + let currentEstimatedTokens = 0; + + for (const item of items) { + const wouldExceedItemCount = current.length >= maxItems; + const wouldExceedTokenBudget = current.length > 0 && currentEstimatedTokens + item.estimatedTokens > maxEstimatedTokens; + if (wouldExceedItemCount || wouldExceedTokenBudget) { + chunks.push(current); + current = []; + currentEstimatedTokens = 0; + } + + current.push(item); + currentEstimatedTokens += item.estimatedTokens; + } + + if (current.length > 0) { + chunks.push(current); + } + return chunks; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e14fc3c..b096902 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -101,6 +101,7 @@ import { readTextBlob } from './db/blob-store.js'; import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; +import { chunkEmbeddingTasks } from './embedding/chunks.js'; import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './embedding/retry.js'; import { activeVectorSourceKind, @@ -1559,7 +1560,7 @@ export class GHCrawlService { ); let embedded = 0; - const batches = this.chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS); + const batches = chunkEmbeddingTasks(pending, this.config.embedBatchSize, EMBED_MAX_BATCH_TOKENS); const mapper = new IterableMapper( batches, async (batch: ActiveVectorTask[]) => { @@ -4553,30 +4554,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private chunkEmbeddingTasks(items: ActiveVectorTask[], maxItems: number, maxEstimatedTokens: number): ActiveVectorTask[][] { - const chunks: ActiveVectorTask[][] = []; - let current: ActiveVectorTask[] = []; - let currentEstimatedTokens = 0; - - for (const item of items) { - const wouldExceedItemCount = current.length >= maxItems; - const wouldExceedTokenBudget = current.length > 0 && currentEstimatedTokens + item.estimatedTokens > maxEstimatedTokens; - if (wouldExceedItemCount || wouldExceedTokenBudget) { - chunks.push(current); - current = []; - currentEstimatedTokens = 0; - } - - current.push(item); - currentEstimatedTokens += item.estimatedTokens; - } - - if (current.length > 0) { - chunks.push(current); - } - return chunks; - } - private loadStoredEmbeddings(repoId: number): StoredEmbeddingRow[] { return this.db .prepare( From 2a0d7e2a3c1ef4a8d1e7c425585a3c90bb5598d7 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:36:18 -0700 Subject: [PATCH 155/215] refactor: extract vector repository maintenance --- packages/api-core/src/service.ts | 110 +++++---------- .../src/vector/repository-maintenance.ts | 132 ++++++++++++++++++ 2 files changed, 166 insertions(+), 76 deletions(-) create mode 100644 packages/api-core/src/vector/repository-maintenance.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b096902..d3fb059 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -114,7 +114,6 @@ import { isRepoVectorStateCurrent, markRepoClustersCurrent, markRepoVectorsCurrent, - writeRepoPipelineState, } from './pipeline-state.js'; import { exportPortableSyncDatabase, @@ -211,6 +210,12 @@ import { } from './service-utils.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { getVectorliteClusterQuery, normalizedDistanceToScore, normalizedEmbeddingBuffer, parseStoredVector, vectorBlob } from './vector/encoding.js'; +import { + pruneInactiveRepositoryVectors, + queryNearestWithRecovery, + rebuildRepositoryVectorStore, + resetRepositoryVectors, +} from './vector/repository-maintenance.js'; import { isCorruptedVectorIndexError, repositoryVectorStorePath, vectorStoreSidecarPath } from './vector/repository-store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; @@ -3283,94 +3288,47 @@ export class GHCrawlService { repoFullName: string, params: Omit, ): VectorNeighbor[] { - try { - return this.vectorStore.queryNearest({ - ...params, - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), - dimensions: ACTIVE_EMBED_DIMENSIONS, - }); - } catch (error) { - if (!isCorruptedVectorIndexError(error)) { - throw error; - } - this.rebuildRepositoryVectorStore(repoId, repoFullName); - return this.vectorStore.queryNearest({ - ...params, - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), - dimensions: ACTIVE_EMBED_DIMENSIONS, - }); - } + return queryNearestWithRecovery({ + vectorStore: this.vectorStore, + configDir: this.config.configDir, + repoFullName, + dimensions: ACTIVE_EMBED_DIMENSIONS, + query: params, + rebuild: () => this.rebuildRepositoryVectorStore(repoId, repoFullName), + }); } private rebuildRepositoryVectorStore(repoId: number, repoFullName: string): void { - this.vectorStore.resetRepository({ - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), + rebuildRepositoryVectorStore({ + vectorStore: this.vectorStore, + configDir: this.config.configDir, + repoFullName, dimensions: ACTIVE_EMBED_DIMENSIONS, + vectors: this.loadClusterableActiveVectorMeta(repoId, repoFullName), }); - for (const row of this.loadClusterableActiveVectorMeta(repoId, repoFullName)) { - this.vectorStore.upsertVector({ - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), - dimensions: ACTIVE_EMBED_DIMENSIONS, - threadId: row.id, - vector: row.embedding, - }); - } } private resetRepositoryVectors(repoId: number, repoFullName: string): void { - this.db - .prepare( - `delete from thread_vectors - where thread_id in (select id from threads where repo_id = ?)`, - ) - .run(repoId); - this.vectorStore.resetRepository({ - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), + resetRepositoryVectors({ + db: this.db, + vectorStore: this.vectorStore, + config: this.config, + repoId, + repoFullName, dimensions: ACTIVE_EMBED_DIMENSIONS, }); - writeRepoPipelineState(this.db, this.config, repoId, { - vectors_current_at: null, - clusters_current_at: null, - }); } private pruneInactiveRepositoryVectors(repoId: number, repoFullName: string): number { - const rows = this.db - .prepare( - `select tv.thread_id - from thread_vectors tv - join threads t on t.id = tv.thread_id - where t.repo_id = ? - and (t.state != 'open' or t.closed_at_local is not null)`, - ) - .all(repoId) as Array<{ thread_id: number }>; - if (rows.length === 0) { - return 0; - } - - const deleteVectorRow = this.db.prepare('delete from thread_vectors where thread_id = ?'); - let shouldRebuildVectorStore = false; - this.db.transaction(() => { - for (const row of rows) { - deleteVectorRow.run(row.thread_id); - try { - this.vectorStore.deleteVector({ - storePath: repositoryVectorStorePath(this.config.configDir, repoFullName), - dimensions: ACTIVE_EMBED_DIMENSIONS, - threadId: row.thread_id, - }); - } catch (error) { - if (!isCorruptedVectorIndexError(error)) { - throw error; - } - shouldRebuildVectorStore = true; - } - } - })(); - if (shouldRebuildVectorStore) { - this.rebuildRepositoryVectorStore(repoId, repoFullName); - } - return rows.length; + return pruneInactiveRepositoryVectors({ + db: this.db, + vectorStore: this.vectorStore, + configDir: this.config.configDir, + repoId, + repoFullName, + dimensions: ACTIVE_EMBED_DIMENSIONS, + rebuild: () => this.rebuildRepositoryVectorStore(repoId, repoFullName), + }); } private cleanupMigratedRepositoryArtifacts(repoId: number, repoFullName: string, onProgress?: (message: string) => void): void { diff --git a/packages/api-core/src/vector/repository-maintenance.ts b/packages/api-core/src/vector/repository-maintenance.ts new file mode 100644 index 0000000..90a1e6f --- /dev/null +++ b/packages/api-core/src/vector/repository-maintenance.ts @@ -0,0 +1,132 @@ +import type { GitcrawlConfig } from '../config.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { writeRepoPipelineState } from '../pipeline-state.js'; +import { isCorruptedVectorIndexError, repositoryVectorStorePath } from './repository-store.js'; +import type { VectorNeighbor, VectorQueryParams, VectorStore } from './store.js'; + +export type ActiveVectorMeta = { + id: number; + embedding: number[]; +}; + +export function queryNearestWithRecovery(params: { + vectorStore: VectorStore; + configDir: string; + repoFullName: string; + dimensions: number; + query: Omit; + rebuild: () => void; +}): VectorNeighbor[] { + const storePath = repositoryVectorStorePath(params.configDir, params.repoFullName); + try { + return params.vectorStore.queryNearest({ + ...params.query, + storePath, + dimensions: params.dimensions, + }); + } catch (error) { + if (!isCorruptedVectorIndexError(error)) { + throw error; + } + params.rebuild(); + return params.vectorStore.queryNearest({ + ...params.query, + storePath, + dimensions: params.dimensions, + }); + } +} + +export function rebuildRepositoryVectorStore(params: { + vectorStore: VectorStore; + configDir: string; + repoFullName: string; + dimensions: number; + vectors: ActiveVectorMeta[]; +}): void { + const storePath = repositoryVectorStorePath(params.configDir, params.repoFullName); + params.vectorStore.resetRepository({ + storePath, + dimensions: params.dimensions, + }); + for (const row of params.vectors) { + params.vectorStore.upsertVector({ + storePath, + dimensions: params.dimensions, + threadId: row.id, + vector: row.embedding, + }); + } +} + +export function resetRepositoryVectors(params: { + db: SqliteDatabase; + vectorStore: VectorStore; + config: GitcrawlConfig; + repoId: number; + repoFullName: string; + dimensions: number; +}): void { + params.db + .prepare( + `delete from thread_vectors + where thread_id in (select id from threads where repo_id = ?)`, + ) + .run(params.repoId); + params.vectorStore.resetRepository({ + storePath: repositoryVectorStorePath(params.config.configDir, params.repoFullName), + dimensions: params.dimensions, + }); + writeRepoPipelineState(params.db, params.config, params.repoId, { + vectors_current_at: null, + clusters_current_at: null, + }); +} + +export function pruneInactiveRepositoryVectors(params: { + db: SqliteDatabase; + vectorStore: VectorStore; + configDir: string; + repoId: number; + repoFullName: string; + dimensions: number; + rebuild: () => void; +}): number { + const rows = params.db + .prepare( + `select tv.thread_id + from thread_vectors tv + join threads t on t.id = tv.thread_id + where t.repo_id = ? + and (t.state != 'open' or t.closed_at_local is not null)`, + ) + .all(params.repoId) as Array<{ thread_id: number }>; + if (rows.length === 0) { + return 0; + } + + const storePath = repositoryVectorStorePath(params.configDir, params.repoFullName); + const deleteVectorRow = params.db.prepare('delete from thread_vectors where thread_id = ?'); + let shouldRebuildVectorStore = false; + params.db.transaction(() => { + for (const row of rows) { + deleteVectorRow.run(row.thread_id); + try { + params.vectorStore.deleteVector({ + storePath, + dimensions: params.dimensions, + threadId: row.thread_id, + }); + } catch (error) { + if (!isCorruptedVectorIndexError(error)) { + throw error; + } + shouldRebuildVectorStore = true; + } + } + })(); + if (shouldRebuildVectorStore) { + params.rebuild(); + } + return rows.length; +} From 729c221ba82d1cfc5b1885576fbd6e180d11cce0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:37:19 -0700 Subject: [PATCH 156/215] refactor: move vector migration cleanup --- packages/api-core/src/service.ts | 74 ++---------------- .../src/vector/repository-maintenance.ts | 77 +++++++++++++++++++ 2 files changed, 85 insertions(+), 66 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d3fb059..2c2620a 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -211,6 +211,7 @@ import { import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { getVectorliteClusterQuery, normalizedDistanceToScore, normalizedEmbeddingBuffer, parseStoredVector, vectorBlob } from './vector/encoding.js'; import { + cleanupMigratedRepositoryArtifacts, pruneInactiveRepositoryVectors, queryNearestWithRecovery, rebuildRepositoryVectorStore, @@ -3332,48 +3333,13 @@ export class GHCrawlService { } private cleanupMigratedRepositoryArtifacts(repoId: number, repoFullName: string, onProgress?: (message: string) => void): void { - const legacyEmbeddingCount = this.countLegacyEmbeddings(repoId); - const inlineJsonVectorCount = this.countInlineJsonThreadVectors(repoId); - if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) { - return; - } - - if (legacyEmbeddingCount > 0) { - this.db - .prepare( - `delete from document_embeddings - where thread_id in (select id from threads where repo_id = ?)`, - ) - .run(repoId); - onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`); - } - - if (inlineJsonVectorCount > 0) { - const rows = this.db - .prepare( - `select tv.thread_id, tv.vector_json - from thread_vectors tv - join threads t on t.id = tv.thread_id - where t.repo_id = ? - and typeof(tv.vector_json) = 'text' - and tv.vector_json != ''`, - ) - .all(repoId) as Array<{ thread_id: number; vector_json: string }>; - const update = this.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?'); - this.db.transaction(() => { - for (const row of rows) { - update.run(vectorBlob(JSON.parse(row.vector_json) as number[]), nowIso(), row.thread_id); - } - })(); - onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`); - } - - if (this.config.dbPath !== ':memory:') { - onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${repoFullName} migration changes`); - this.db.pragma('wal_checkpoint(TRUNCATE)'); - this.db.exec('VACUUM'); - this.db.pragma('wal_checkpoint(TRUNCATE)'); - } + cleanupMigratedRepositoryArtifacts({ + db: this.db, + dbPath: this.config.dbPath, + repoId, + repoFullName, + onProgress, + }); } private getLatestClusterRun(repoId: number): { id: number; finished_at: string | null } | null { @@ -5889,30 +5855,6 @@ export class GHCrawlService { } } - private countLegacyEmbeddings(repoId: number): number { - const row = this.db - .prepare( - `select count(*) as count - from document_embeddings - where thread_id in (select id from threads where repo_id = ?)`, - ) - .get(repoId) as { count: number }; - return row.count; - } - - private countInlineJsonThreadVectors(repoId: number): number { - const row = this.db - .prepare( - `select count(*) as count - from thread_vectors - where thread_id in (select id from threads where repo_id = ?) - and typeof(vector_json) = 'text' - and vector_json != ''`, - ) - .get(repoId) as { count: number }; - return row.count; - } - private upsertEmbedding(threadId: number, sourceKind: EmbeddingSourceKind, contentHash: string, embedding: number[]): void { this.db .prepare( diff --git a/packages/api-core/src/vector/repository-maintenance.ts b/packages/api-core/src/vector/repository-maintenance.ts index 90a1e6f..0cb1972 100644 --- a/packages/api-core/src/vector/repository-maintenance.ts +++ b/packages/api-core/src/vector/repository-maintenance.ts @@ -1,6 +1,8 @@ import type { GitcrawlConfig } from '../config.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import { writeRepoPipelineState } from '../pipeline-state.js'; +import { nowIso } from '../service-utils.js'; +import { vectorBlob } from './encoding.js'; import { isCorruptedVectorIndexError, repositoryVectorStorePath } from './repository-store.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './store.js'; @@ -130,3 +132,78 @@ export function pruneInactiveRepositoryVectors(params: { } return rows.length; } + +export function cleanupMigratedRepositoryArtifacts(params: { + db: SqliteDatabase; + dbPath: string; + repoId: number; + repoFullName: string; + onProgress?: (message: string) => void; +}): void { + const legacyEmbeddingCount = countLegacyEmbeddings(params.db, params.repoId); + const inlineJsonVectorCount = countInlineJsonThreadVectors(params.db, params.repoId); + if (legacyEmbeddingCount === 0 && inlineJsonVectorCount === 0) { + return; + } + + if (legacyEmbeddingCount > 0) { + params.db + .prepare( + `delete from document_embeddings + where thread_id in (select id from threads where repo_id = ?)`, + ) + .run(params.repoId); + params.onProgress?.(`[cleanup] removed ${legacyEmbeddingCount} legacy document embedding row(s) after vector migration`); + } + + if (inlineJsonVectorCount > 0) { + const rows = params.db + .prepare( + `select tv.thread_id, tv.vector_json + from thread_vectors tv + join threads t on t.id = tv.thread_id + where t.repo_id = ? + and typeof(tv.vector_json) = 'text' + and tv.vector_json != ''`, + ) + .all(params.repoId) as Array<{ thread_id: number; vector_json: string }>; + const update = params.db.prepare('update thread_vectors set vector_json = ?, updated_at = ? where thread_id = ?'); + params.db.transaction(() => { + for (const row of rows) { + update.run(vectorBlob(JSON.parse(row.vector_json) as number[]), nowIso(), row.thread_id); + } + })(); + params.onProgress?.(`[cleanup] compacted ${inlineJsonVectorCount} inline SQLite vector payload(s) from JSON to binary blobs`); + } + + if (params.dbPath !== ':memory:') { + params.onProgress?.(`[cleanup] checkpointing WAL and vacuuming ${params.repoFullName} migration changes`); + params.db.pragma('wal_checkpoint(TRUNCATE)'); + params.db.exec('VACUUM'); + params.db.pragma('wal_checkpoint(TRUNCATE)'); + } +} + +function countLegacyEmbeddings(db: SqliteDatabase, repoId: number): number { + const row = db + .prepare( + `select count(*) as count + from document_embeddings + where thread_id in (select id from threads where repo_id = ?)`, + ) + .get(repoId) as { count: number }; + return row.count; +} + +function countInlineJsonThreadVectors(db: SqliteDatabase, repoId: number): number { + const row = db + .prepare( + `select count(*) as count + from thread_vectors + where thread_id in (select id from threads where repo_id = ?) + and typeof(vector_json) = 'text' + and vector_json != ''`, + ) + .get(repoId) as { count: number }; + return row.count; +} From 82fbaf23cdfbb0aef3174ab0aeb094a2ce769cbd Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:38:58 -0700 Subject: [PATCH 157/215] refactor: extract embedding workset loader --- packages/api-core/src/embedding/workset.ts | 162 +++++++++++++++++++++ packages/api-core/src/service.ts | 156 ++------------------ 2 files changed, 171 insertions(+), 147 deletions(-) create mode 100644 packages/api-core/src/embedding/workset.ts diff --git a/packages/api-core/src/embedding/workset.ts b/packages/api-core/src/embedding/workset.ts new file mode 100644 index 0000000..8476450 --- /dev/null +++ b/packages/api-core/src/embedding/workset.ts @@ -0,0 +1,162 @@ +import { LLM_KEY_SUMMARY_PROMPT_VERSION } from '../cluster/llm-key-summary.js'; +import type { GitcrawlConfig } from '../config.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { isRepoVectorStateCurrent } from '../pipeline-state.js'; +import { ACTIVE_EMBED_DIMENSIONS, SUMMARY_PROMPT_VERSION } from '../service-constants.js'; +import type { EmbeddingWorkset } from '../service-types.js'; +import { normalizeSummaryText } from '../service-utils.js'; +import { buildActiveVectorTask } from './tasks.js'; + +export function getEmbeddingWorkset(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; + threadNumber?: number; +}): EmbeddingWorkset { + let sql = + `select t.id, t.number, t.title, t.body + from threads t + where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + )`; + const args: Array = [params.repoId]; + if (params.threadNumber) { + sql += ' and t.number = ?'; + args.push(params.threadNumber); + } + sql += ' order by t.number asc'; + const rows = params.db.prepare(sql).all(...args) as Array<{ + id: number; + number: number; + title: string; + body: string | null; + }>; + const pipelineCurrent = isRepoVectorStateCurrent(params.db, params.config, params.repoId); + const existingRows = params.db + .prepare( + `select tv.thread_id, tv.content_hash + from thread_vectors tv + join threads t on t.id = tv.thread_id + where t.repo_id = ? + and tv.model = ? + and tv.basis = ? + and tv.dimensions = ?`, + ) + .all(params.repoId, params.config.embedModel, params.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS) as Array<{ + thread_id: number; + content_hash: string; + }>; + const existing = new Map(); + for (const row of existingRows) { + existing.set(String(row.thread_id), row.content_hash); + } + const summaryTexts = loadDedupeSummaryTextMap(params); + const keySummaryTexts = loadKeySummaryTextMap(params); + const missingSummaryThreadNumbers: number[] = []; + const tasks = rows.flatMap((row) => { + const task = buildActiveVectorTask({ + threadId: row.id, + threadNumber: row.number, + title: row.title, + body: row.body, + dedupeSummary: summaryTexts.get(row.id) ?? null, + keySummary: keySummaryTexts.get(row.id) ?? null, + embeddingBasis: params.config.embeddingBasis, + embedModel: params.config.embedModel, + }); + if (task) { + return [task]; + } + if ( + (params.config.embeddingBasis === 'title_summary' || params.config.embeddingBasis === 'llm_key_summary') && + (!pipelineCurrent || !existing.has(String(row.id))) + ) { + missingSummaryThreadNumbers.push(row.number); + } + return []; + }); + const pending = pipelineCurrent + ? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash) + : tasks; + return { rows, tasks, existing, pending, missingSummaryThreadNumbers }; +} + +function loadDedupeSummaryTextMap(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; + threadNumber?: number; +}): Map { + let sql = + `select s.thread_id, s.summary_text + from document_summaries s + join threads t on t.id = s.thread_id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and s.model = ? + and s.summary_kind = 'dedupe_summary' + and s.prompt_version = ?`; + const args: Array = [params.repoId, params.config.summaryModel, SUMMARY_PROMPT_VERSION]; + if (params.threadNumber) { + sql += ' and t.number = ?'; + args.push(params.threadNumber); + } + sql += ' order by t.number asc'; + + const rows = params.db.prepare(sql).all(...args) as Array<{ + thread_id: number; + summary_text: string; + }>; + const combined = new Map(); + for (const row of rows) { + const text = normalizeSummaryText(row.summary_text); + if (text) { + combined.set(row.thread_id, text); + } + } + return combined; +} + +function loadKeySummaryTextMap(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; + threadNumber?: number; +}): Map { + let sql = + `select tr.thread_id, ks.key_text + from thread_key_summaries ks + join thread_revisions tr on tr.id = ks.thread_revision_id + join threads t on t.id = tr.thread_id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and ks.summary_kind = 'llm_key_3line' + and ks.prompt_version = ? + and ks.model = ?`; + const args: Array = [params.repoId, LLM_KEY_SUMMARY_PROMPT_VERSION, params.config.summaryModel]; + if (params.threadNumber) { + sql += ' and t.number = ?'; + args.push(params.threadNumber); + } + sql += ' order by tr.id asc'; + + const rows = params.db.prepare(sql).all(...args) as Array<{ + thread_id: number; + key_text: string; + }>; + const combined = new Map(); + for (const row of rows) { + const text = normalizeSummaryText(row.key_text); + if (text) { + combined.set(row.thread_id, text); + } + } + return combined; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 2c2620a..86f0344 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -103,10 +103,8 @@ import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.j import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './embedding/retry.js'; -import { - activeVectorSourceKind, - buildActiveVectorTask, -} from './embedding/tasks.js'; +import { activeVectorSourceKind } from './embedding/tasks.js'; +import { getEmbeddingWorkset } from './embedding/workset.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; import { OpenAiProvider, type AiProvider } from './openai/provider.js'; import { @@ -167,7 +165,6 @@ import type { DoctorResult, DurableTuiClosure, EmbeddingSourceKind, - EmbeddingWorkset, KeySummaryTask, NeighborsResultInternal, PortableSyncExportOptions, @@ -193,7 +190,6 @@ import { isEffectivelyClosed, isMissingGitHubResourceError, isPullRequestPayload, - normalizeSummaryText, nowIso, parseArray, parseAssignees, @@ -1548,7 +1544,12 @@ export class GHCrawlService { } } - const { rows, tasks, pending, missingSummaryThreadNumbers } = this.getEmbeddingWorkset(repository.id, params.threadNumber); + const { rows, tasks, pending, missingSummaryThreadNumbers } = getEmbeddingWorkset({ + db: this.db, + config: this.config, + repoId: repository.id, + threadNumber: params.threadNumber, + }); const skipped = tasks.length - pending.length; const truncated = tasks.filter((task) => task.wasTruncated).length; @@ -3270,7 +3271,7 @@ export class GHCrawlService { const latestEmbed = (this.db .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") .get(repoId) as { finished_at: string | null } | undefined) ?? null; - const embeddingWorkset = this.getEmbeddingWorkset(repoId); + const embeddingWorkset = getEmbeddingWorkset({ db: this.db, config: this.config, repoId }); const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId)); return { openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0, @@ -5014,145 +5015,6 @@ export class GHCrawlService { })); } - private getEmbeddingWorkset(repoId: number, threadNumber?: number): EmbeddingWorkset { - let sql = - `select t.id, t.number, t.title, t.body - from threads t - where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null - and not exists ( - select 1 - from cluster_closures cc - join cluster_memberships cm on cm.cluster_id = cc.cluster_id - where cm.thread_id = t.id - and cm.state <> 'removed_by_user' - )`; - const args: Array = [repoId]; - if (threadNumber) { - sql += ' and t.number = ?'; - args.push(threadNumber); - } - sql += ' order by t.number asc'; - const rows = this.db.prepare(sql).all(...args) as Array<{ - id: number; - number: number; - title: string; - body: string | null; - }>; - const pipelineCurrent = isRepoVectorStateCurrent(this.db, this.config, repoId); - const existingRows = this.db - .prepare( - `select tv.thread_id, tv.content_hash - from thread_vectors tv - join threads t on t.id = tv.thread_id - where t.repo_id = ? - and tv.model = ? - and tv.basis = ? - and tv.dimensions = ?`, - ) - .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS) as Array<{ - thread_id: number; - content_hash: string; - }>; - const existing = new Map(); - for (const row of existingRows) { - existing.set(String(row.thread_id), row.content_hash); - } - const summaryTexts = this.loadDedupeSummaryTextMap(repoId, threadNumber); - const keySummaryTexts = this.loadKeySummaryTextMap(repoId, threadNumber); - const missingSummaryThreadNumbers: number[] = []; - const tasks = rows.flatMap((row) => { - const task = buildActiveVectorTask({ - threadId: row.id, - threadNumber: row.number, - title: row.title, - body: row.body, - dedupeSummary: summaryTexts.get(row.id) ?? null, - keySummary: keySummaryTexts.get(row.id) ?? null, - embeddingBasis: this.config.embeddingBasis, - embedModel: this.config.embedModel, - }); - if (task) { - return [task]; - } - if ( - (this.config.embeddingBasis === 'title_summary' || this.config.embeddingBasis === 'llm_key_summary') && - (!pipelineCurrent || !existing.has(String(row.id))) - ) { - missingSummaryThreadNumbers.push(row.number); - } - return []; - }); - const pending = pipelineCurrent - ? tasks.filter((task) => existing.get(String(task.threadId)) !== task.contentHash) - : tasks; - return { rows, tasks, existing, pending, missingSummaryThreadNumbers }; - } - - private loadDedupeSummaryTextMap(repoId: number, threadNumber?: number): Map { - let sql = - `select s.thread_id, s.summary_text - from document_summaries s - join threads t on t.id = s.thread_id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and s.model = ? - and s.summary_kind = 'dedupe_summary' - and s.prompt_version = ?`; - const args: Array = [repoId, this.config.summaryModel, SUMMARY_PROMPT_VERSION]; - if (threadNumber) { - sql += ' and t.number = ?'; - args.push(threadNumber); - } - sql += ' order by t.number asc'; - - const rows = this.db.prepare(sql).all(...args) as Array<{ - thread_id: number; - summary_text: string; - }>; - const combined = new Map(); - for (const row of rows) { - const text = normalizeSummaryText(row.summary_text); - if (text) { - combined.set(row.thread_id, text); - } - } - return combined; - } - - private loadKeySummaryTextMap(repoId: number, threadNumber?: number): Map { - let sql = - `select tr.thread_id, ks.key_text - from thread_key_summaries ks - join thread_revisions tr on tr.id = ks.thread_revision_id - join threads t on t.id = tr.thread_id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and ks.summary_kind = 'llm_key_3line' - and ks.prompt_version = ? - and ks.model = ?`; - const args: Array = [repoId, LLM_KEY_SUMMARY_PROMPT_VERSION, this.config.summaryModel]; - if (threadNumber) { - sql += ' and t.number = ?'; - args.push(threadNumber); - } - sql += ' order by tr.id asc'; - - const rows = this.db.prepare(sql).all(...args) as Array<{ - thread_id: number; - key_text: string; - }>; - const combined = new Map(); - for (const row of rows) { - const text = normalizeSummaryText(row.key_text); - if (text) { - combined.set(row.thread_id, text); - } - } - return combined; - } - private edgeKey(leftThreadId: number, rightThreadId: number): string { const left = Math.min(leftThreadId, rightThreadId); const right = Math.max(leftThreadId, rightThreadId); From 737799d05456e3c3b00529073f692699db92f4e0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:40:42 -0700 Subject: [PATCH 158/215] refactor: extract cluster edge aggregation --- .../api-core/src/cluster/edge-aggregation.ts | 140 +++++++++++++++ packages/api-core/src/service.ts | 170 +++--------------- 2 files changed, 162 insertions(+), 148 deletions(-) create mode 100644 packages/api-core/src/cluster/edge-aggregation.ts diff --git a/packages/api-core/src/cluster/edge-aggregation.ts b/packages/api-core/src/cluster/edge-aggregation.ts new file mode 100644 index 0000000..cd084db --- /dev/null +++ b/packages/api-core/src/cluster/edge-aggregation.ts @@ -0,0 +1,140 @@ +import type { AggregatedClusterEdge, EmbeddingSourceKind, SimilaritySourceKind } from '../service-types.js'; + +export type PerSourceScoreEntry = { + leftThreadId: number; + rightThreadId: number; + scores: Map; +}; + +export type EdgeAggregationMode = 'max' | 'mean' | 'weighted' | 'min-of-2' | 'boost'; + +export function edgeKey(leftThreadId: number, rightThreadId: number): string { + const left = Math.min(leftThreadId, rightThreadId); + const right = Math.max(leftThreadId, rightThreadId); + return `${left}:${right}`; +} + +export function mergeSourceKindEdges( + aggregated: Map, + edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, + sourceKind: SimilaritySourceKind, +): void { + for (const edge of edges) { + const key = edgeKey(edge.leftThreadId, edge.rightThreadId); + const existing = aggregated.get(key); + if (existing) { + existing.score = Math.max(existing.score, edge.score); + existing.sourceKinds.add(sourceKind); + continue; + } + aggregated.set(key, { + leftThreadId: edge.leftThreadId, + rightThreadId: edge.rightThreadId, + score: edge.score, + sourceKinds: new Set([sourceKind]), + }); + } +} + +export function pruneWeakCrossKindEdges( + aggregated: Map, + threadKinds: Map, + crossKindMinScore: number, +): number { + let dropped = 0; + for (const [key, edge] of aggregated) { + const leftKind = threadKinds.get(edge.leftThreadId); + const rightKind = threadKinds.get(edge.rightThreadId); + if (!leftKind || !rightKind || leftKind === rightKind) { + continue; + } + if (edge.sourceKinds.has('deterministic_fingerprint') || edge.score >= crossKindMinScore) { + continue; + } + aggregated.delete(key); + dropped += 1; + } + return dropped; +} + +export function collectSourceKindScores( + perSourceScores: Map, + edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, + sourceKind: EmbeddingSourceKind, +): void { + for (const edge of edges) { + const key = edgeKey(edge.leftThreadId, edge.rightThreadId); + const existing = perSourceScores.get(key); + if (existing) { + existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score)); + continue; + } + const scores = new Map(); + scores.set(sourceKind, edge.score); + perSourceScores.set(key, { + leftThreadId: edge.leftThreadId, + rightThreadId: edge.rightThreadId, + scores, + }); + } +} + +export function finalizeEdgeScores( + perSourceScores: Map, + aggregation: EdgeAggregationMode, + weights: Record, + minScore: number, +): Array<{ leftThreadId: number; rightThreadId: number; score: number }> { + const result: Array<{ leftThreadId: number; rightThreadId: number; score: number }> = []; + + for (const entry of perSourceScores.values()) { + const scoreValues = Array.from(entry.scores.values()); + let finalScore: number; + + switch (aggregation) { + case 'max': + finalScore = Math.max(...scoreValues); + break; + + case 'mean': + finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length; + break; + + case 'weighted': { + let weightedSum = 0; + let weightSum = 0; + for (const [kind, score] of entry.scores) { + const weight = weights[kind] ?? 0.1; + weightedSum += score * weight; + weightSum += weight; + } + finalScore = weightSum > 0 ? weightedSum / weightSum : 0; + break; + } + + case 'min-of-2': + if (scoreValues.length < 2) { + continue; + } + finalScore = Math.max(...scoreValues); + break; + + case 'boost': { + const best = Math.max(...scoreValues); + const bonusSources = scoreValues.length - 1; + finalScore = Math.min(1.0, best + bonusSources * 0.05); + break; + } + } + + if (finalScore >= minScore) { + result.push({ + leftThreadId: entry.leftThreadId, + rightThreadId: entry.rightThreadId, + score: finalScore, + }); + } + } + + return result; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 86f0344..89fd60c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -65,6 +65,14 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; +import { + collectSourceKindScores, + edgeKey, + finalizeEdgeScores, + mergeSourceKindEdges, + pruneWeakCrossKindEdges, + type PerSourceScoreEntry, +} from './cluster/edge-aggregation.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; @@ -169,7 +177,6 @@ import type { NeighborsResultInternal, PortableSyncExportOptions, SearchResultInternal, - SimilaritySourceKind, StoredEmbeddingRow, SyncCursorState, SyncOptions, @@ -1669,7 +1676,7 @@ export class GHCrawlService { }, ); const aggregatedEdges = new Map(); - this.mergeSourceKindEdges( + mergeSourceKindEdges( aggregatedEdges, deterministic.edges .filter((edge) => edge.tier === 'strong' || edge.score >= deterministicMinScore) @@ -1707,7 +1714,7 @@ export class GHCrawlService { for (const neighbor of neighbors) { if (!activeIds.has(neighbor.threadId)) continue; if (neighbor.score < minScore) continue; - this.mergeSourceKindEdges( + mergeSourceKindEdges( aggregatedEdges, [ { @@ -1738,7 +1745,7 @@ export class GHCrawlService { }); for (const legacyEdge of legacyEdges.values()) { for (const sourceKind of legacyEdge.sourceKinds) { - this.mergeSourceKindEdges( + mergeSourceKindEdges( aggregatedEdges, [{ leftThreadId: legacyEdge.leftThreadId, rightThreadId: legacyEdge.rightThreadId, score: legacyEdge.score }], sourceKind, @@ -1748,7 +1755,7 @@ export class GHCrawlService { } const threadKinds = new Map(deterministicItems.map((item) => [item.id, item.kind])); - const droppedCrossKindEdges = this.pruneWeakCrossKindEdges(aggregatedEdges, threadKinds, crossKindMinScore); + const droppedCrossKindEdges = pruneWeakCrossKindEdges(aggregatedEdges, threadKinds, crossKindMinScore); if (droppedCrossKindEdges > 0) { params.onProgress?.( `[cluster] dropped ${droppedCrossKindEdges} weak issue/pr edge(s) below cross_kind_min_score=${crossKindMinScore}`, @@ -1867,7 +1874,7 @@ export class GHCrawlService { `[cluster-experiment] loaded ${items.length} embedded thread(s) across ${sourceKinds.length} source kind(s) for ${repository.fullName} backend=${backend} k=${k} candidateK=${candidateK} minScore=${minScore} aggregation=${aggregation}`, ); - const perSourceScores = new Map }>(); + const perSourceScores = new Map(); let loadMs = 0; let setupMs = 0; let edgeBuildMs = 0; @@ -1900,7 +1907,7 @@ export class GHCrawlService { }, }); edgeBuildMs += Date.now() - edgesStartedAt; - this.collectSourceKindScores(perSourceScores, edges, activeSourceKind); + collectSourceKindScores(perSourceScores, edges, activeSourceKind); recordMemory(); } else { const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0); @@ -1927,7 +1934,7 @@ export class GHCrawlService { }); edgeBuildMs += Date.now() - edgesStartedAt; processedItems += normalizedRows.length; - this.collectSourceKindScores(perSourceScores, edges, sourceKind); + collectSourceKindScores(perSourceScores, edges, sourceKind); recordMemory(); } } @@ -2014,7 +2021,7 @@ export class GHCrawlService { let addedThisRow = 0; for (const candidate of ranked) { const score = candidate.score; - const key = this.edgeKey(row.id, candidate.item.rowid); + const key = edgeKey(row.id, candidate.item.rowid); const existing = perSourceScores.get(key); if (existing) { existing.scores.set(source.sourceKind, Math.max(existing.scores.get(source.sourceKind) ?? -1, score)); @@ -2049,7 +2056,7 @@ export class GHCrawlService { // Finalize edge scores using the configured aggregation method const defaultWeights: Record = { dedupe_summary: 0.5, llm_key_summary: 0.5, title: 0.3, body: 0.2 }; const weights = { ...defaultWeights, ...(params.aggregationWeights ?? {}) }; - const aggregated = this.finalizeEdgeScores(perSourceScores, aggregation, weights, minScore); + const aggregated = finalizeEdgeScores(perSourceScores, aggregation, weights, minScore); params.onProgress?.( `[cluster-experiment] finalized ${aggregated.length} edges from ${perSourceScores.size} candidate pairs using ${aggregation} aggregation`, @@ -5015,12 +5022,6 @@ export class GHCrawlService { })); } - private edgeKey(leftThreadId: number, rightThreadId: number): string { - const left = Math.min(leftThreadId, rightThreadId); - const right = Math.max(leftThreadId, rightThreadId); - return `${left}:${right}`; - } - private async aggregateRepositoryEdges( repoId: number, sourceKinds: EmbeddingSourceKind[], @@ -5051,7 +5052,7 @@ export class GHCrawlService { }, }); processedItems += items.length; - this.mergeSourceKindEdges(aggregated, edges, sourceKind); + mergeSourceKindEdges(aggregated, edges, sourceKind); } return aggregated; @@ -5115,139 +5116,12 @@ export class GHCrawlService { ); for (const [index, edges] of edgeSets.entries()) { - this.mergeSourceKindEdges(aggregated, edges, sourceKinds[index] as EmbeddingSourceKind); + mergeSourceKindEdges(aggregated, edges, sourceKinds[index] as EmbeddingSourceKind); } return aggregated; } - private mergeSourceKindEdges( - aggregated: Map, - edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, - sourceKind: SimilaritySourceKind, - ): void { - for (const edge of edges) { - const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId); - const existing = aggregated.get(key); - if (existing) { - existing.score = Math.max(existing.score, edge.score); - existing.sourceKinds.add(sourceKind); - continue; - } - aggregated.set(key, { - leftThreadId: edge.leftThreadId, - rightThreadId: edge.rightThreadId, - score: edge.score, - sourceKinds: new Set([sourceKind]), - }); - } - } - - private pruneWeakCrossKindEdges( - aggregated: Map, - threadKinds: Map, - crossKindMinScore: number, - ): number { - let dropped = 0; - for (const [key, edge] of aggregated) { - const leftKind = threadKinds.get(edge.leftThreadId); - const rightKind = threadKinds.get(edge.rightThreadId); - if (!leftKind || !rightKind || leftKind === rightKind) { - continue; - } - if (edge.sourceKinds.has('deterministic_fingerprint') || edge.score >= crossKindMinScore) { - continue; - } - aggregated.delete(key); - dropped += 1; - } - return dropped; - } - - private collectSourceKindScores( - perSourceScores: Map }>, - edges: Array<{ leftThreadId: number; rightThreadId: number; score: number }>, - sourceKind: EmbeddingSourceKind, - ): void { - for (const edge of edges) { - const key = this.edgeKey(edge.leftThreadId, edge.rightThreadId); - const existing = perSourceScores.get(key); - if (existing) { - existing.scores.set(sourceKind, Math.max(existing.scores.get(sourceKind) ?? -1, edge.score)); - continue; - } - const scores = new Map(); - scores.set(sourceKind, edge.score); - perSourceScores.set(key, { - leftThreadId: edge.leftThreadId, - rightThreadId: edge.rightThreadId, - scores, - }); - } - } - - private finalizeEdgeScores( - perSourceScores: Map }>, - aggregation: 'max' | 'mean' | 'weighted' | 'min-of-2' | 'boost', - weights: Record, - minScore: number, - ): Array<{ leftThreadId: number; rightThreadId: number; score: number }> { - const result: Array<{ leftThreadId: number; rightThreadId: number; score: number }> = []; - - for (const entry of perSourceScores.values()) { - const scoreValues = Array.from(entry.scores.values()); - let finalScore: number; - - switch (aggregation) { - case 'max': - finalScore = Math.max(...scoreValues); - break; - - case 'mean': - finalScore = scoreValues.reduce((a, b) => a + b, 0) / scoreValues.length; - break; - - case 'weighted': { - let weightedSum = 0; - let weightSum = 0; - for (const [kind, score] of entry.scores) { - const w = weights[kind] ?? 0.1; - weightedSum += score * w; - weightSum += w; - } - finalScore = weightSum > 0 ? weightedSum / weightSum : 0; - break; - } - - case 'min-of-2': - // Require at least 2 source kinds to agree (both above minScore) - if (scoreValues.length < 2) { - continue; // Skip edges with only 1 source kind - } - finalScore = Math.max(...scoreValues); - break; - - case 'boost': { - // Best score + bonus per additional agreeing source - const best = Math.max(...scoreValues); - const bonusSources = scoreValues.length - 1; - finalScore = Math.min(1.0, best + bonusSources * 0.05); - break; - } - } - - if (finalScore >= minScore) { - result.push({ - leftThreadId: entry.leftThreadId, - rightThreadId: entry.rightThreadId, - score: finalScore, - }); - } - } - - return result; - } - private countEmbeddingsForSourceKind(repoId: number, sourceKind: EmbeddingSourceKind): number { const row = this.db .prepare( @@ -5318,7 +5192,7 @@ export class GHCrawlService { ); const clusterId = Number(clusterResult.lastInsertRowid); for (const memberId of cluster.members) { - const key = this.edgeKey(cluster.representativeThreadId, memberId); + const key = edgeKey(cluster.representativeThreadId, memberId); const score = memberId === cluster.representativeThreadId ? null : (aggregatedEdges.get(key)?.score ?? null); insertMember.run(clusterId, memberId, score, createdAt); } @@ -5386,7 +5260,7 @@ export class GHCrawlService { .run(representativeThreadId, nowIso(), clusterId); } for (const memberId of cluster.members) { - const scoreKey = this.edgeKey(representativeThreadId, memberId); + const scoreKey = edgeKey(representativeThreadId, memberId); const score = memberId === representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); const excluded = this.db .prepare( @@ -5472,7 +5346,7 @@ export class GHCrawlService { if (cluster.members.includes(forced.thread_id)) { continue; } - const scoreKey = this.edgeKey(representativeThreadId, forced.thread_id); + const scoreKey = edgeKey(representativeThreadId, forced.thread_id); const score = forced.thread_id === representativeThreadId ? 1 : (aggregatedEdges.get(scoreKey)?.score ?? null); upsertClusterMembership(this.db, { clusterId, From fcd5daf9a44dd193a311a65d33ad34996c13eab0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:41:42 -0700 Subject: [PATCH 159/215] refactor: extract cluster quality summaries --- packages/api-core/src/cluster/quality.ts | 72 ++++++++++++++++++++++ packages/api-core/src/service.ts | 77 +----------------------- 2 files changed, 75 insertions(+), 74 deletions(-) create mode 100644 packages/api-core/src/cluster/quality.ts diff --git a/packages/api-core/src/cluster/quality.ts b/packages/api-core/src/cluster/quality.ts new file mode 100644 index 0000000..8ec76e8 --- /dev/null +++ b/packages/api-core/src/cluster/quality.ts @@ -0,0 +1,72 @@ +import type { ClusterExperimentClusterSizeStats } from '../service-types.js'; + +export function summarizeClusterSizes( + clusters: Array<{ representativeThreadId: number; members: number[] }>, +): ClusterExperimentClusterSizeStats { + const histogramCounts = new Map(); + const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left); + let soloClusters = 0; + + for (const cluster of clusters) { + const size = cluster.members.length; + histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1); + if (size === 1) { + soloClusters += 1; + } + } + + return { + soloClusters, + maxClusterSize: topClusterSizes[0] ?? 0, + topClusterSizes: topClusterSizes.slice(0, 50), + histogram: Array.from(histogramCounts.entries()) + .map(([size, count]) => ({ size, count })) + .sort((left, right) => left.size - right.size), + }; +} + +export function summarizeClusterQuality( + clusters: Array<{ representativeThreadId: number; members: number[] }>, + threadKinds: Map, + maxClusterSize: number, +): { + maxClusterSize: number; + maxObservedClusterSize: number; + maxedClusterCount: number; + mixedKindClusterCount: number; + singletonClusterCount: number; + nonSingletonClusterCount: number; +} { + let maxObservedClusterSize = 0; + let maxedClusterCount = 0; + let mixedKindClusterCount = 0; + let singletonClusterCount = 0; + + for (const cluster of clusters) { + const size = cluster.members.length; + maxObservedClusterSize = Math.max(maxObservedClusterSize, size); + if (size >= maxClusterSize) maxedClusterCount += 1; + if (size === 1) singletonClusterCount += 1; + + let hasIssue = false; + let hasPullRequest = false; + for (const memberId of cluster.members) { + const kind = threadKinds.get(memberId); + hasIssue ||= kind === 'issue'; + hasPullRequest ||= kind === 'pull_request'; + if (hasIssue && hasPullRequest) { + mixedKindClusterCount += 1; + break; + } + } + } + + return { + maxClusterSize, + maxObservedClusterSize, + maxedClusterCount, + mixedKindClusterCount, + singletonClusterCount, + nonSingletonClusterCount: clusters.length - singletonClusterCount, + }; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 89fd60c..c9bfe17 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -76,6 +76,7 @@ import { import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; +import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; import { createPipelineRun, finishPipelineRun, @@ -168,7 +169,6 @@ import type { ActiveVectorTask, AggregatedClusterEdge, ClusterExperimentResult, - ClusterExperimentClusterSizeStats, CommentSeed, DoctorResult, DurableTuiClosure, @@ -1784,7 +1784,7 @@ export class GHCrawlService { edges, { maxClusterSize }, ); - const clusterQuality = this.summarizeClusterQuality(clusters, threadKinds, maxClusterSize); + const clusterQuality = summarizeClusterQuality(clusters, threadKinds, maxClusterSize); if (!seedThreadIds) { this.persistClusterRun(repository.id, runId, aggregatedEdges, clusters); } @@ -2111,7 +2111,7 @@ export class GHCrawlService { heapUsedAfterBytes: memoryAfter.heapUsed, peakHeapUsedBytes, }, - clusterSizes: this.summarizeClusterSizes(clusters), + clusterSizes: summarizeClusterSizes(clusters), clustersDetail: params.includeClusters ? clusters.map((cluster) => ({ representativeThreadId: cluster.representativeThreadId, @@ -5459,77 +5459,6 @@ export class GHCrawlService { this.db.prepare('delete from cluster_runs where repo_id = ? and id <> ?').run(repoId, keepRunId); } - private summarizeClusterSizes( - clusters: Array<{ representativeThreadId: number; members: number[] }>, - ): ClusterExperimentClusterSizeStats { - const histogramCounts = new Map(); - const topClusterSizes = clusters.map((cluster) => cluster.members.length).sort((left, right) => right - left); - let soloClusters = 0; - - for (const cluster of clusters) { - const size = cluster.members.length; - histogramCounts.set(size, (histogramCounts.get(size) ?? 0) + 1); - if (size === 1) { - soloClusters += 1; - } - } - - return { - soloClusters, - maxClusterSize: topClusterSizes[0] ?? 0, - topClusterSizes: topClusterSizes.slice(0, 50), - histogram: Array.from(histogramCounts.entries()) - .map(([size, count]) => ({ size, count })) - .sort((left, right) => left.size - right.size), - }; - } - - private summarizeClusterQuality( - clusters: Array<{ representativeThreadId: number; members: number[] }>, - threadKinds: Map, - maxClusterSize: number, - ): { - maxClusterSize: number; - maxObservedClusterSize: number; - maxedClusterCount: number; - mixedKindClusterCount: number; - singletonClusterCount: number; - nonSingletonClusterCount: number; - } { - let maxObservedClusterSize = 0; - let maxedClusterCount = 0; - let mixedKindClusterCount = 0; - let singletonClusterCount = 0; - - for (const cluster of clusters) { - const size = cluster.members.length; - maxObservedClusterSize = Math.max(maxObservedClusterSize, size); - if (size >= maxClusterSize) maxedClusterCount += 1; - if (size === 1) singletonClusterCount += 1; - - let hasIssue = false; - let hasPullRequest = false; - for (const memberId of cluster.members) { - const kind = threadKinds.get(memberId); - hasIssue ||= kind === 'issue'; - hasPullRequest ||= kind === 'pull_request'; - if (hasIssue && hasPullRequest) { - mixedKindClusterCount += 1; - break; - } - } - } - - return { - maxClusterSize, - maxObservedClusterSize, - maxedClusterCount, - mixedKindClusterCount, - singletonClusterCount, - nonSingletonClusterCount: clusters.length - singletonClusterCount, - }; - } - private upsertSummary(threadId: number, contentHash: string, summaryKind: string, summaryText: string): void { this.db .prepare( From cbc778caf126fe92787bbdefb496dca722756849 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:44:10 -0700 Subject: [PATCH 160/215] refactor: extract tui cluster format helpers --- packages/api-core/src/service.ts | 45 +++++---------------- packages/api-core/src/tui/cluster-format.ts | 29 +++++++++++++ 2 files changed, 38 insertions(+), 36 deletions(-) create mode 100644 packages/api-core/src/tui/cluster-format.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c9bfe17..8815f1d 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -139,6 +139,7 @@ import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; +import { clusterDisplayTitle, compareTuiClusterSummary, durableClosureReason, parseMemberThreadIdSet } from './tui/cluster-format.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -3010,7 +3011,7 @@ export class GHCrawlService { if (!search) return true; return cluster.searchText.includes(search); }) - .sort((left, right) => this.compareTuiClusterSummary(left, right, params.sort ?? 'size')); + .sort((left, right) => compareTuiClusterSummary(left, right, params.sort ?? 'size')); return { repository, @@ -3495,11 +3496,6 @@ export class GHCrawlService { return durableClusterId; } - private durableClosureReason(closure: DurableTuiClosure): string | null { - if (closure.reason) return closure.reason; - return closure.status === 'merged' || closure.status === 'split' ? closure.status : null; - } - private getDurableClosuresByRepresentative(repoId: number, representativeThreadIds: number[]): Map { const uniqueThreadIds = Array.from(new Set(representativeThreadIds)); if (uniqueThreadIds.length === 0) { @@ -3633,7 +3629,7 @@ export class GHCrawlService { const selected: Array<{ row: T; memberIds: Set }> = []; for (const row of sortedRows) { - const memberIds = this.parseMemberThreadIdSet(row.member_thread_ids); + const memberIds = parseMemberThreadIdSet(row.member_thread_ids); const duplicate = selected.some((entry) => { const smallerSize = Math.min(memberIds.size, entry.memberIds.size); if (smallerSize === 0) return false; @@ -3651,16 +3647,6 @@ export class GHCrawlService { return selected.map((entry) => entry.row); } - private parseMemberThreadIdSet(value: string | null): Set { - if (!value) return new Set(); - return new Set( - value - .split(',') - .map((part) => Number(part)) - .filter((memberId) => Number.isSafeInteger(memberId) && memberId > 0), - ); - } - private getDurableTuiClusterSummary(repoId: number, clusterId: number): TuiClusterSummary | null { const row = this.db .prepare( @@ -3756,13 +3742,13 @@ export class GHCrawlService { const isClosed = manuallyClosed || lifecycleClosed || row.closed_member_count >= row.member_count; const closeReasonLocal = manuallyClosed || lifecycleClosed - ? this.durableClosureReason(closure) + ? durableClosureReason(closure) : row.closed_member_count >= row.member_count ? 'all_members_closed' : null; return { clusterId: row.cluster_id, - displayTitle: this.clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), + displayTitle: clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), isClosed, closedAtLocal: manuallyClosed || lifecycleClosed ? row.closed_at : null, closeReasonLocal, @@ -3838,10 +3824,10 @@ export class GHCrawlService { row.representative_thread_id === null ? null : (durableClosures.get(row.representative_thread_id) ?? null); return { clusterId: row.cluster_id, - displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), + displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, - closeReasonLocal: row.close_reason_local ?? (durableClosure ? this.durableClosureReason(durableClosure) : null), + closeReasonLocal: row.close_reason_local ?? (durableClosure ? durableClosureReason(durableClosure) : null), totalCount: row.member_count, issueCount: row.issue_count, pullRequestCount: row.pull_request_count, @@ -3915,10 +3901,10 @@ export class GHCrawlService { : (this.getDurableClosuresByRepresentative(repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); return { clusterId: row.cluster_id, - displayTitle: this.clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), + displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, - closeReasonLocal: row.close_reason_local ?? (durableClosure ? this.durableClosureReason(durableClosure) : null), + closeReasonLocal: row.close_reason_local ?? (durableClosure ? durableClosureReason(durableClosure) : null), totalCount: row.member_count, issueCount: row.issue_count, pullRequestCount: row.pull_request_count, @@ -3938,19 +3924,6 @@ export class GHCrawlService { ).slug; } - private clusterDisplayTitle(clusterName: string, representativeTitle: string | null, clusterId: number): string { - return `${clusterName} ${representativeTitle ?? `Cluster ${clusterId}`}`; - } - - private compareTuiClusterSummary(left: TuiClusterSummary, right: TuiClusterSummary, sort: TuiClusterSortMode): number { - const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0; - const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0; - if (sort === 'size') { - return right.totalCount - left.totalCount || rightTime - leftTime || left.clusterId - right.clusterId; - } - return rightTime - leftTime || right.totalCount - left.totalCount || left.clusterId - right.clusterId; - } - private async fetchThreadComments( owner: string, repo: string, diff --git a/packages/api-core/src/tui/cluster-format.ts b/packages/api-core/src/tui/cluster-format.ts new file mode 100644 index 0000000..2cf4eed --- /dev/null +++ b/packages/api-core/src/tui/cluster-format.ts @@ -0,0 +1,29 @@ +import type { DurableTuiClosure, TuiClusterSortMode, TuiClusterSummary } from '../service-types.js'; + +export function durableClosureReason(closure: DurableTuiClosure): string | null { + if (closure.reason) return closure.reason; + return closure.status === 'merged' || closure.status === 'split' ? closure.status : null; +} + +export function parseMemberThreadIdSet(value: string | null): Set { + if (!value) return new Set(); + return new Set( + value + .split(',') + .map((part) => Number(part)) + .filter((memberId) => Number.isSafeInteger(memberId) && memberId > 0), + ); +} + +export function clusterDisplayTitle(clusterName: string, representativeTitle: string | null, clusterId: number): string { + return `${clusterName} ${representativeTitle ?? `Cluster ${clusterId}`}`; +} + +export function compareTuiClusterSummary(left: TuiClusterSummary, right: TuiClusterSummary, sort: TuiClusterSortMode): number { + const leftTime = left.latestUpdatedAt ? Date.parse(left.latestUpdatedAt) : 0; + const rightTime = right.latestUpdatedAt ? Date.parse(right.latestUpdatedAt) : 0; + if (sort === 'size') { + return right.totalCount - left.totalCount || rightTime - leftTime || left.clusterId - right.clusterId; + } + return rightTime - leftTime || right.totalCount - left.totalCount || left.clusterId - right.clusterId; +} From 85d6bec7d400fccf48dd9aff295e7555cb67f27e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:45:08 -0700 Subject: [PATCH 161/215] refactor: extract edge worker runtime probe --- .../api-core/src/cluster/edge-worker-runtime.ts | 11 +++++++++++ packages/api-core/src/service.ts | 13 ++----------- 2 files changed, 13 insertions(+), 11 deletions(-) create mode 100644 packages/api-core/src/cluster/edge-worker-runtime.ts diff --git a/packages/api-core/src/cluster/edge-worker-runtime.ts b/packages/api-core/src/cluster/edge-worker-runtime.ts new file mode 100644 index 0000000..f4c6024 --- /dev/null +++ b/packages/api-core/src/cluster/edge-worker-runtime.ts @@ -0,0 +1,11 @@ +import { existsSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; + +export function resolveEdgeWorkerRuntime(): { url: URL } | null { + const jsUrl = new URL('./edge-worker.js', import.meta.url); + if (existsSync(fileURLToPath(jsUrl))) { + return { url: jsUrl }; + } + // Source-mode runs do not have a compiled worker entrypoint, so keep clustering in-process. + return null; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 8815f1d..e5f76e7 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -2,7 +2,6 @@ import fs from 'node:fs'; import { existsSync } from 'node:fs'; import os from 'node:os'; import path from 'node:path'; -import { fileURLToPath } from 'node:url'; import { Worker } from 'node:worker_threads'; import { IterableMapper } from '@shutterstock/p-map-iterable'; @@ -73,6 +72,7 @@ import { pruneWeakCrossKindEdges, type PerSourceScoreEntry, } from './cluster/edge-aggregation.js'; +import { resolveEdgeWorkerRuntime } from './cluster/edge-worker-runtime.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; @@ -5007,7 +5007,7 @@ export class GHCrawlService { return aggregated; } - const workerRuntime = this.resolveEdgeWorkerRuntime(); + const workerRuntime = resolveEdgeWorkerRuntime(); const shouldParallelize = workerRuntime !== null && sourceKinds.length > 1 && totalItems >= CLUSTER_PARALLEL_MIN_EMBEDDINGS && os.availableParallelism() > 1; if (!shouldParallelize) { let processedItems = 0; @@ -5110,15 +5110,6 @@ export class GHCrawlService { return row.count; } - private resolveEdgeWorkerRuntime(): { url: URL } | null { - const jsUrl = new URL('./cluster/edge-worker.js', import.meta.url); - if (existsSync(fileURLToPath(jsUrl))) { - return { url: jsUrl }; - } - // Source-mode runs do not have a compiled worker entrypoint, so keep clustering in-process. - return null; - } - private persistClusterRun( repoId: number, runId: number, From 77c64b70be306798c9f0b3c7ac6a2acbd3bf9cb7 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:46:56 -0700 Subject: [PATCH 162/215] refactor: extract cluster run queries --- packages/api-core/src/cluster/run-queries.ts | 29 +++++++++++++ packages/api-core/src/service.ts | 45 +++++--------------- 2 files changed, 39 insertions(+), 35 deletions(-) create mode 100644 packages/api-core/src/cluster/run-queries.ts diff --git a/packages/api-core/src/cluster/run-queries.ts b/packages/api-core/src/cluster/run-queries.ts new file mode 100644 index 0000000..074ccb9 --- /dev/null +++ b/packages/api-core/src/cluster/run-queries.ts @@ -0,0 +1,29 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; + +export type LatestClusterRunRow = { id: number; finished_at: string | null }; + +export function getLatestClusterRun(db: SqliteDatabase, repoId: number): LatestClusterRunRow | null { + return ( + (db + .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(repoId) as LatestClusterRunRow | undefined) ?? null + ); +} + +export function getLatestRunClusterIdsForThread(db: SqliteDatabase, repoId: number, threadId: number): number[] { + const latestRun = getLatestClusterRun(db, repoId); + if (!latestRun) { + return []; + } + return ( + db + .prepare( + `select cm.cluster_id + from cluster_members cm + join clusters c on c.id = cm.cluster_id + where c.repo_id = ? and c.cluster_run_id = ? and cm.thread_id = ? + order by cm.cluster_id asc`, + ) + .all(repoId, latestRun.id, threadId) as Array<{ cluster_id: number }> + ).map((row) => row.cluster_id); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e5f76e7..16626dc 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -77,6 +77,7 @@ import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; +import { getLatestClusterRun, getLatestRunClusterIdsForThread } from './cluster/run-queries.js'; import { createPipelineRun, finishPipelineRun, @@ -364,7 +365,7 @@ export class GHCrawlService { where id = ?`, ) .run(closedAt, closedAt, row.id); - const clusterIds = this.getLatestRunClusterIdsForThread(repository.id, row.id); + const clusterIds = getLatestRunClusterIdsForThread(this.db, repository.id, row.id); const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0; const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id) as ThreadRow; @@ -380,7 +381,7 @@ export class GHCrawlService { closeClusterLocally(params: { owner: string; repo: string; clusterId: number }): CloseResponse { const repository = this.requireRepository(params.owner, params.repo); - const latestRun = this.getLatestClusterRun(repository.id); + const latestRun = getLatestClusterRun(this.db, repository.id); if (!latestRun) { throw new Error(`No completed cluster run found for ${repository.fullName}.`); } @@ -2992,7 +2993,7 @@ export class GHCrawlService { }): TuiSnapshot { const repository = this.requireRepository(params.owner, params.repo); const stats = this.getTuiRepoStats(repository.id); - const latestRun = this.getLatestClusterRun(repository.id); + const latestRun = getLatestClusterRun(this.db, repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; const minSize = params.minSize ?? 1; const rawClusters = latestRun ? this.listRawTuiClusters(repository.id, latestRun.id, minSize) : []; @@ -3060,7 +3061,7 @@ export class GHCrawlService { const latestEmbedding = this.db .prepare("select id from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") .get(repository.id) as { id: number } | undefined; - const latestClusterRun = this.getLatestClusterRun(repository.id); + const latestClusterRun = getLatestClusterRun(this.db, repository.id); return { repositoryUpdatedAt: repository.updatedAt, @@ -3079,7 +3080,7 @@ export class GHCrawlService { const repository = this.requireRepository(params.owner, params.repo); const clusterRunId = params.clusterRunId ?? - (this.getLatestClusterRun(repository.id)?.id ?? null); + (getLatestClusterRun(this.db, repository.id)?.id ?? null); const summary = clusterRunId ? this.getRawTuiClusterSummary(repository.id, clusterRunId, params.clusterId) : null; const durableSummary = summary ? null : this.getDurableTuiClusterSummary(repository.id, params.clusterId); @@ -3187,7 +3188,7 @@ export class GHCrawlService { throw new Error(`Thread was not found for ${repository.fullName}.`); } - const latestRun = this.getLatestClusterRun(repository.id); + const latestRun = getLatestClusterRun(this.db, repository.id); const clusterMembership = latestRun ? ((this.db .prepare( @@ -3272,7 +3273,7 @@ export class GHCrawlService { group by kind`, ) .all(repoId) as Array<{ kind: 'issue' | 'pull_request'; count: number }>; - const latestRun = this.getLatestClusterRun(repoId); + const latestRun = getLatestClusterRun(this.db, repoId); const latestSync = (this.db .prepare("select finished_at from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") .get(repoId) as { finished_at: string | null } | undefined) ?? null; @@ -3351,34 +3352,8 @@ export class GHCrawlService { }); } - private getLatestClusterRun(repoId: number): { id: number; finished_at: string | null } | null { - return ( - (this.db - .prepare("select id, finished_at from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repoId) as { id: number; finished_at: string | null } | undefined) ?? null - ); - } - - private getLatestRunClusterIdsForThread(repoId: number, threadId: number): number[] { - const latestRun = this.getLatestClusterRun(repoId); - if (!latestRun) { - return []; - } - return ( - this.db - .prepare( - `select cm.cluster_id - from cluster_members cm - join clusters c on c.id = cm.cluster_id - where c.repo_id = ? and c.cluster_run_id = ? and cm.thread_id = ? - order by cm.cluster_id asc`, - ) - .all(repoId, latestRun.id, threadId) as Array<{ cluster_id: number }> - ).map((row) => row.cluster_id); - } - private reconcileClusterCloseState(repoId: number, clusterIds?: number[]): number { - const latestRun = this.getLatestClusterRun(repoId); + const latestRun = getLatestClusterRun(this.db, repoId); if (!latestRun) { return 0; } @@ -4940,7 +4915,7 @@ export class GHCrawlService { } private listStoredClusterNeighbors(repoId: number, threadId: number, limit: number): SearchHitDto['neighbors'] { - const latestRun = this.getLatestClusterRun(repoId); + const latestRun = getLatestClusterRun(this.db, repoId); if (!latestRun) { return []; } From ca2c9427fe119b55386915a3220896b32255244c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:48:04 -0700 Subject: [PATCH 163/215] refactor: extract tui repository stats --- packages/api-core/src/service.ts | 34 ++--------------------- packages/api-core/src/tui/repo-stats.ts | 37 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 32 deletions(-) create mode 100644 packages/api-core/src/tui/repo-stats.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 16626dc..b591c22 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -141,6 +141,7 @@ import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-mainte import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { clusterDisplayTitle, compareTuiClusterSummary, durableClosureReason, parseMemberThreadIdSet } from './tui/cluster-format.js'; +import { getTuiRepoStats } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -188,7 +189,6 @@ import type { TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, - TuiRepoStats, TuiSnapshot, TuiThreadDetail, } from './service-types.js'; @@ -2992,7 +2992,7 @@ export class GHCrawlService { includeClosedClusters?: boolean; }): TuiSnapshot { const repository = this.requireRepository(params.owner, params.repo); - const stats = this.getTuiRepoStats(repository.id); + const stats = getTuiRepoStats({ db: this.db, config: this.config, repoId: repository.id }); const latestRun = getLatestClusterRun(this.db, repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; const minSize = params.minSize ?? 1; @@ -3264,36 +3264,6 @@ export class GHCrawlService { } } - private getTuiRepoStats(repoId: number): TuiRepoStats { - const counts = this.db - .prepare( - `select kind, count(*) as count - from threads - where repo_id = ? and state = 'open' and closed_at_local is null - group by kind`, - ) - .all(repoId) as Array<{ kind: 'issue' | 'pull_request'; count: number }>; - const latestRun = getLatestClusterRun(this.db, repoId); - const latestSync = (this.db - .prepare("select finished_at from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repoId) as { finished_at: string | null } | undefined) ?? null; - const latestEmbed = (this.db - .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repoId) as { finished_at: string | null } | undefined) ?? null; - const embeddingWorkset = getEmbeddingWorkset({ db: this.db, config: this.config, repoId }); - const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId)); - return { - openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0, - openPullRequestCount: counts.find((row) => row.kind === 'pull_request')?.count ?? 0, - lastGithubReconciliationAt: latestSync?.finished_at ?? null, - lastEmbedRefreshAt: latestEmbed?.finished_at ?? null, - staleEmbedThreadCount: staleThreadIds.size, - staleEmbedSourceCount: embeddingWorkset.pending.length, - latestClusterRunId: latestRun?.id ?? null, - latestClusterRunFinishedAt: latestRun?.finished_at ?? null, - }; - } - private queryNearestWithRecovery( repoId: number, repoFullName: string, diff --git a/packages/api-core/src/tui/repo-stats.ts b/packages/api-core/src/tui/repo-stats.ts new file mode 100644 index 0000000..aa58e3f --- /dev/null +++ b/packages/api-core/src/tui/repo-stats.ts @@ -0,0 +1,37 @@ +import { getLatestClusterRun } from '../cluster/run-queries.js'; +import type { GitcrawlConfig } from '../config.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { getEmbeddingWorkset } from '../embedding/workset.js'; +import type { TuiRepoStats } from '../service-types.js'; + +export function getTuiRepoStats(params: { db: SqliteDatabase; config: GitcrawlConfig; repoId: number }): TuiRepoStats { + const counts = params.db + .prepare( + `select kind, count(*) as count + from threads + where repo_id = ? and state = 'open' and closed_at_local is null + group by kind`, + ) + .all(params.repoId) as Array<{ kind: 'issue' | 'pull_request'; count: number }>; + const latestRun = getLatestClusterRun(params.db, params.repoId); + const latestSync = + (params.db + .prepare("select finished_at from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(params.repoId) as { finished_at: string | null } | undefined) ?? null; + const latestEmbed = + (params.db + .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(params.repoId) as { finished_at: string | null } | undefined) ?? null; + const embeddingWorkset = getEmbeddingWorkset({ db: params.db, config: params.config, repoId: params.repoId }); + const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId)); + return { + openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0, + openPullRequestCount: counts.find((row) => row.kind === 'pull_request')?.count ?? 0, + lastGithubReconciliationAt: latestSync?.finished_at ?? null, + lastEmbedRefreshAt: latestEmbed?.finished_at ?? null, + staleEmbedThreadCount: staleThreadIds.size, + staleEmbedSourceCount: embeddingWorkset.pending.length, + latestClusterRunId: latestRun?.id ?? null, + latestClusterRunFinishedAt: latestRun?.finished_at ?? null, + }; +} From 162a697ecb78aa926f9fc19ee15e5d6b4ee0e63d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:49:32 -0700 Subject: [PATCH 164/215] refactor: remove unused embedding iterator --- packages/api-core/src/service.ts | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index b591c22..c2d433b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -4477,32 +4477,6 @@ export class GHCrawlService { }; } - private *iterateNormalizedEmbeddingsForSourceKind( - repoId: number, - sourceKind: EmbeddingSourceKind, - ): IterableIterator<{ id: number; normalizedEmbedding: number[] }> { - const rows = this.db - .prepare( - `select t.id, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and e.model = ? - and e.source_kind = ? - order by t.number asc`, - ) - .iterate(repoId, this.config.embedModel, sourceKind) as IterableIterator<{ id: number; embedding_json: string }>; - - for (const row of rows) { - yield { - id: row.id, - normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json) as number[]).normalized, - }; - } - } - private loadNormalizedEmbeddingsForSourceKind( repoId: number, sourceKind: EmbeddingSourceKind, From 50a2d529561b950238bd7f1200f805c53bc3a97d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:50:46 -0700 Subject: [PATCH 165/215] refactor: extract stored embedding queries --- packages/api-core/src/embedding/queries.ts | 89 ++++++++++++ packages/api-core/src/service.ts | 158 +++++---------------- 2 files changed, 126 insertions(+), 121 deletions(-) create mode 100644 packages/api-core/src/embedding/queries.ts diff --git a/packages/api-core/src/embedding/queries.ts b/packages/api-core/src/embedding/queries.ts new file mode 100644 index 0000000..e7cdc8b --- /dev/null +++ b/packages/api-core/src/embedding/queries.ts @@ -0,0 +1,89 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { normalizeEmbedding } from '../search/exact.js'; +import type { EmbeddingSourceKind, StoredEmbeddingRow } from '../service-types.js'; + +export function loadStoredEmbeddingsForThreadNumber(params: { + db: SqliteDatabase; + repoId: number; + threadNumber: number; + embedModel: string; +}): StoredEmbeddingRow[] { + return params.db + .prepare( + `select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local, + t.title, t.body, t.author_login, t.html_url, t.labels_json, + t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json + from threads t + join document_embeddings e on e.thread_id = t.id + where t.repo_id = ? + and t.number = ? + and t.state = 'open' + and t.closed_at_local is null + and e.model = ? + order by e.source_kind asc`, + ) + .all(params.repoId, params.threadNumber, params.embedModel) as StoredEmbeddingRow[]; +} + +export function iterateStoredEmbeddings(params: { + db: SqliteDatabase; + repoId: number; + embedModel: string; +}): IterableIterator { + return params.db + .prepare( + `select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local, + t.title, t.body, t.author_login, t.html_url, t.labels_json, + t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json + from threads t + join document_embeddings e on e.thread_id = t.id + where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ? + order by t.number asc, e.source_kind asc`, + ) + .iterate(params.repoId, params.embedModel) as IterableIterator; +} + +export function loadNormalizedEmbeddingsForSourceKind(params: { + db: SqliteDatabase; + repoId: number; + embedModel: string; + sourceKind: EmbeddingSourceKind; +}): Array<{ id: number; normalizedEmbedding: number[] }> { + const rows = params.db + .prepare( + `select t.id, e.embedding_json + from threads t + join document_embeddings e on e.thread_id = t.id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and e.model = ? + and e.source_kind = ? + order by t.number asc`, + ) + .all(params.repoId, params.embedModel, params.sourceKind) as Array<{ id: number; embedding_json: string }>; + + return rows.map((row) => ({ + id: row.id, + normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json) as number[]).normalized, + })); +} + +export function countEmbeddingsForSourceKind(params: { + db: SqliteDatabase; + repoId: number; + sourceKind: EmbeddingSourceKind; +}): number { + const row = params.db + .prepare( + `select count(*) as count + from document_embeddings e + join threads t on t.id = e.thread_id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and e.source_kind = ?`, + ) + .get(params.repoId, params.sourceKind) as { count: number }; + return row.count; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index c2d433b..1e6c3d9 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -112,6 +112,12 @@ import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; +import { + countEmbeddingsForSourceKind, + iterateStoredEmbeddings, + loadNormalizedEmbeddingsForSourceKind, + loadStoredEmbeddingsForThreadNumber, +} from './embedding/queries.js'; import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './embedding/retry.js'; import { activeVectorSourceKind } from './embedding/tasks.js'; import { getEmbeddingWorkset } from './embedding/workset.js'; @@ -180,7 +186,6 @@ import type { NeighborsResultInternal, PortableSyncExportOptions, SearchResultInternal, - StoredEmbeddingRow, SyncCursorState, SyncOptions, SyncRunStats, @@ -1912,12 +1917,20 @@ export class GHCrawlService { collectSourceKindScores(perSourceScores, edges, activeSourceKind); recordMemory(); } else { - const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repository.id, sourceKind), 0); + const totalItems = sourceKinds.reduce( + (sum, sourceKind) => sum + countEmbeddingsForSourceKind({ db: this.db, repoId: repository.id, sourceKind }), + 0, + ); let processedItems = 0; for (const sourceKind of sourceKinds) { const loadStartedAt = Date.now(); - const normalizedRows = this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind); + const normalizedRows = loadNormalizedEmbeddingsForSourceKind({ + db: this.db, + repoId: repository.id, + embedModel: this.config.embedModel, + sourceKind, + }); loadMs += Date.now() - loadStartedAt; recordMemory(); @@ -1962,7 +1975,12 @@ export class GHCrawlService { ] : sourceKinds.map((sourceKind) => ({ sourceKind, - rows: this.loadNormalizedEmbeddingsForSourceKind(repository.id, sourceKind).map((row) => ({ + rows: loadNormalizedEmbeddingsForSourceKind({ + db: this.db, + repoId: repository.id, + embedModel: this.config.embedModel, + sourceKind, + }).map((row) => ({ id: row.id, normalizedEmbedding: row.normalizedEmbedding, })), @@ -2177,7 +2195,7 @@ export class GHCrawlService { } } else if (hasLegacyEmbeddings(this.db, this.config.embedModel, repository.id)) { const [queryEmbedding] = await this.ai.embedTexts({ model: this.config.embedModel, texts: [params.query] }); - for (const row of this.iterateStoredEmbeddings(repository.id)) { + for (const row of iterateStoredEmbeddings({ db: this.db, repoId: repository.id, embedModel: this.config.embedModel })) { const score = cosineSimilarity(queryEmbedding, JSON.parse(row.embedding_json) as number[]); if (score < 0.2) continue; semanticScores.set(row.id, Math.max(semanticScores.get(row.id) ?? -1, score)); @@ -2337,7 +2355,12 @@ export class GHCrawlService { .filter((row): row is NonNullable => row !== null) .slice(0, limit); } else { - const targetRows = this.loadStoredEmbeddingsForThreadNumber(repository.id, params.threadNumber); + const targetRows = loadStoredEmbeddingsForThreadNumber({ + db: this.db, + repoId: repository.id, + threadNumber: params.threadNumber, + embedModel: this.config.embedModel, + }); if (targetRows.length === 0) { throw new Error( `Thread #${params.threadNumber} for ${repository.fullName} was not found with an embedding. Run embed first.`, @@ -2350,7 +2373,7 @@ export class GHCrawlService { } const aggregated = new Map(); - for (const row of this.iterateStoredEmbeddings(repository.id)) { + for (const row of iterateStoredEmbeddings({ db: this.db, repoId: repository.id, embedModel: this.config.embedModel })) { if (row.id === responseThread.id) continue; const targetEmbedding = targetBySource.get(row.source_kind); if (!targetEmbedding) continue; @@ -4404,103 +4427,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private loadStoredEmbeddings(repoId: number): StoredEmbeddingRow[] { - return this.db - .prepare( - `select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local, - t.title, t.body, t.author_login, t.html_url, t.labels_json, - t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ? - order by t.number asc, e.source_kind asc`, - ) - .all(repoId, this.config.embedModel) as StoredEmbeddingRow[]; - } - - private loadStoredEmbeddingsForThreadNumber(repoId: number, threadNumber: number): StoredEmbeddingRow[] { - return this.db - .prepare( - `select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local, - t.title, t.body, t.author_login, t.html_url, t.labels_json, - t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? - and t.number = ? - and t.state = 'open' - and t.closed_at_local is null - and e.model = ? - order by e.source_kind asc`, - ) - .all(repoId, threadNumber, this.config.embedModel) as StoredEmbeddingRow[]; - } - - private iterateStoredEmbeddings(repoId: number): IterableIterator { - return this.db - .prepare( - `select t.id, t.repo_id, t.number, t.kind, t.state, t.closed_at_gh, t.closed_at_local, t.close_reason_local, - t.title, t.body, t.author_login, t.html_url, t.labels_json, - t.updated_at_gh, t.first_pulled_at, t.last_pulled_at, e.source_kind, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? and t.state = 'open' and t.closed_at_local is null and e.model = ? - order by t.number asc, e.source_kind asc`, - ) - .iterate(repoId, this.config.embedModel) as IterableIterator; - } - - private loadNormalizedEmbeddingForSourceKindHead( - repoId: number, - sourceKind: EmbeddingSourceKind, - ): { id: number; normalizedEmbedding: number[] } | null { - const row = this.db - .prepare( - `select t.id, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and e.model = ? - and e.source_kind = ? - order by t.number asc - limit 1`, - ) - .get(repoId, this.config.embedModel, sourceKind) as { id: number; embedding_json: string } | undefined; - if (!row) { - return null; - } - return { - id: row.id, - normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json) as number[]).normalized, - }; - } - - private loadNormalizedEmbeddingsForSourceKind( - repoId: number, - sourceKind: EmbeddingSourceKind, - ): Array<{ id: number; normalizedEmbedding: number[] }> { - const rows = this.db - .prepare( - `select t.id, e.embedding_json - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and e.model = ? - and e.source_kind = ? - order by t.number asc`, - ) - .all(repoId, this.config.embedModel, sourceKind) as Array<{ id: number; embedding_json: string }>; - - return rows.map((row) => ({ - id: row.id, - normalizedEmbedding: normalizeEmbedding(JSON.parse(row.embedding_json) as number[]).normalized, - })); - } - private loadClusterableThreadMeta(repoId: number): { items: Array<{ id: number; number: number; title: string }>; sourceKinds: EmbeddingSourceKind[]; @@ -4920,7 +4846,7 @@ export class GHCrawlService { params: { limit: number; minScore: number; onProgress?: (message: string) => void }, ): Promise> { const aggregated = new Map(); - const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + this.countEmbeddingsForSourceKind(repoId, sourceKind), 0); + const totalItems = sourceKinds.reduce((sum, sourceKind) => sum + countEmbeddingsForSourceKind({ db: this.db, repoId, sourceKind }), 0); if (sourceKinds.length === 0 || totalItems === 0) { return aggregated; @@ -4931,7 +4857,12 @@ export class GHCrawlService { if (!shouldParallelize) { let processedItems = 0; for (const sourceKind of sourceKinds) { - const items = this.loadNormalizedEmbeddingsForSourceKind(repoId, sourceKind); + const items = loadNormalizedEmbeddingsForSourceKind({ + db: this.db, + repoId, + embedModel: this.config.embedModel, + sourceKind, + }); const edges = buildSourceKindEdges(items, { limit: params.limit, minScore: params.minScore, @@ -5014,21 +4945,6 @@ export class GHCrawlService { return aggregated; } - private countEmbeddingsForSourceKind(repoId: number, sourceKind: EmbeddingSourceKind): number { - const row = this.db - .prepare( - `select count(*) as count - from document_embeddings e - join threads t on t.id = e.thread_id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and e.source_kind = ?`, - ) - .get(repoId, sourceKind) as { count: number }; - return row.count; - } - private persistClusterRun( repoId: number, runId: number, From 3956bdcd01da3b8cf6358d3556520083bddaeed7 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:52:16 -0700 Subject: [PATCH 166/215] refactor: extract clusterable vector loaders --- .../api-core/src/embedding/clusterable.ts | 96 +++++++++++++++++++ packages/api-core/src/service.ts | 94 ++---------------- 2 files changed, 105 insertions(+), 85 deletions(-) create mode 100644 packages/api-core/src/embedding/clusterable.ts diff --git a/packages/api-core/src/embedding/clusterable.ts b/packages/api-core/src/embedding/clusterable.ts new file mode 100644 index 0000000..4b651a5 --- /dev/null +++ b/packages/api-core/src/embedding/clusterable.ts @@ -0,0 +1,96 @@ +import type { GitcrawlConfig } from '../config.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { normalizeEmbedding } from '../search/exact.js'; +import { ACTIVE_EMBED_DIMENSIONS } from '../service-constants.js'; +import type { EmbeddingSourceKind } from '../service-types.js'; +import { parseStoredVector } from '../vector/encoding.js'; + +export function loadClusterableThreadMeta(params: { + db: SqliteDatabase; + repoId: number; +}): { + items: Array<{ id: number; number: number; title: string }>; + sourceKinds: EmbeddingSourceKind[]; +} { + const rows = params.db + .prepare( + `select t.id, t.number, t.title, e.source_kind + from threads t + join document_embeddings e on e.thread_id = t.id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + )`, + ) + .all(params.repoId) as Array<{ id: number; number: number; title: string; source_kind: EmbeddingSourceKind }>; + + const itemsById = new Map(); + const sourceKinds = new Set(); + for (const row of rows) { + itemsById.set(row.id, { id: row.id, number: row.number, title: row.title }); + sourceKinds.add(row.source_kind); + } + + return { + items: Array.from(itemsById.values()), + sourceKinds: Array.from(sourceKinds.values()), + }; +} + +export function loadClusterableActiveVectorMeta(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; +}): Array<{ id: number; number: number; title: string; embedding: number[] }> { + const rows = params.db + .prepare( + `select t.id, t.number, t.title, tv.vector_json + from threads t + join thread_vectors tv on tv.thread_id = t.id + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + ) + and tv.model = ? + and tv.basis = ? + and tv.dimensions = ? + order by t.number asc`, + ) + .all(params.repoId, params.config.embedModel, params.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS) as Array<{ + id: number; + number: number; + title: string; + vector_json: Buffer | string; + }>; + return rows.map((row) => ({ + id: row.id, + number: row.number, + title: row.title, + embedding: parseStoredVector(row.vector_json), + })); +} + +export function loadNormalizedActiveVectors(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; +}): Array<{ id: number; number: number; title: string; embedding: number[] }> { + return loadClusterableActiveVectorMeta(params).map((row) => ({ + id: row.id, + number: row.number, + title: row.title, + embedding: normalizeEmbedding(row.embedding).normalized, + })); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 1e6c3d9..1711c47 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -112,6 +112,7 @@ import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; +import { loadClusterableActiveVectorMeta, loadClusterableThreadMeta, loadNormalizedActiveVectors } from './embedding/clusterable.js'; import { countEmbeddingsForSourceKind, iterateStoredEmbeddings, @@ -142,7 +143,7 @@ import { type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; -import { cosineSimilarity, dotProduct, normalizeEmbedding, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; +import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; @@ -1698,7 +1699,7 @@ export class GHCrawlService { ); const vectorStateCurrent = isRepoVectorStateCurrent(this.db, this.config, repository.id); - const vectorItems = this.loadClusterableActiveVectorMeta(repository.id, repository.fullName); + const vectorItems = loadClusterableActiveVectorMeta({ db: this.db, config: this.config, repoId: repository.id }); if (vectorItems.length > 0) { const queryVectorItems = seedThreadIds ? vectorItems.filter((item) => seedThreadIds.includes(item.id)) : vectorItems; const activeSourceKind = activeVectorSourceKind(this.config.embeddingBasis); @@ -1741,7 +1742,7 @@ export class GHCrawlService { } } } else if (!seedThreadIds && hasLegacyEmbeddings(this.db, this.config.embedModel, repository.id)) { - const legacy = this.loadClusterableThreadMeta(repository.id); + const legacy = loadClusterableThreadMeta({ db: this.db, repoId: repository.id }); params.onProgress?.( `[cluster] loaded ${legacy.items.length} legacy embedded thread(s) across ${legacy.sourceKinds.length} source kind(s) for ${repository.fullName} k=${k} minScore=${minScore}`, ); @@ -1847,8 +1848,10 @@ export class GHCrawlService { }): ClusterExperimentResult { const backend = params.backend ?? 'vectorlite'; const repository = this.requireRepository(params.owner, params.repo); - const loaded = this.loadClusterableThreadMeta(repository.id); - const activeVectors = isRepoVectorStateCurrent(this.db, this.config, repository.id) ? this.loadNormalizedActiveVectors(repository.id) : []; + const loaded = loadClusterableThreadMeta({ db: this.db, repoId: repository.id }); + const activeVectors = isRepoVectorStateCurrent(this.db, this.config, repository.id) + ? loadNormalizedActiveVectors({ db: this.db, config: this.config, repoId: repository.id }) + : []; const activeSourceKind = activeVectorSourceKind(this.config.embeddingBasis); const useActiveVectors = activeVectors.length > 0 && (params.sourceKinds === undefined || loaded.items.length === 0); const sourceKinds = useActiveVectors ? [activeSourceKind] : (params.sourceKinds ?? loaded.sourceKinds); @@ -3308,7 +3311,7 @@ export class GHCrawlService { configDir: this.config.configDir, repoFullName, dimensions: ACTIVE_EMBED_DIMENSIONS, - vectors: this.loadClusterableActiveVectorMeta(repoId, repoFullName), + vectors: loadClusterableActiveVectorMeta({ db: this.db, config: this.config, repoId }), }); } @@ -4427,76 +4430,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private loadClusterableThreadMeta(repoId: number): { - items: Array<{ id: number; number: number; title: string }>; - sourceKinds: EmbeddingSourceKind[]; - } { - const rows = this.db - .prepare( - `select t.id, t.number, t.title, e.source_kind - from threads t - join document_embeddings e on e.thread_id = t.id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and not exists ( - select 1 - from cluster_closures cc - join cluster_memberships cm on cm.cluster_id = cc.cluster_id - where cm.thread_id = t.id - and cm.state <> 'removed_by_user' - )`, - ) - .all(repoId) as Array<{ id: number; number: number; title: string; source_kind: EmbeddingSourceKind }>; - - const itemsById = new Map(); - const sourceKinds = new Set(); - for (const row of rows) { - itemsById.set(row.id, { id: row.id, number: row.number, title: row.title }); - sourceKinds.add(row.source_kind); - } - - return { - items: Array.from(itemsById.values()), - sourceKinds: Array.from(sourceKinds.values()), - }; - } - - private loadClusterableActiveVectorMeta(repoId: number, _repoFullName: string): Array<{ id: number; number: number; title: string; embedding: number[] }> { - const rows = this.db - .prepare( - `select t.id, t.number, t.title, tv.vector_json - from threads t - join thread_vectors tv on tv.thread_id = t.id - where t.repo_id = ? - and t.state = 'open' - and t.closed_at_local is null - and not exists ( - select 1 - from cluster_closures cc - join cluster_memberships cm on cm.cluster_id = cc.cluster_id - where cm.thread_id = t.id - and cm.state <> 'removed_by_user' - ) - and tv.model = ? - and tv.basis = ? - and tv.dimensions = ? - order by t.number asc`, - ) - .all(repoId, this.config.embedModel, this.config.embeddingBasis, ACTIVE_EMBED_DIMENSIONS) as Array<{ - id: number; - number: number; - title: string; - vector_json: Buffer | string; - }>; - return rows.map((row) => ({ - id: row.id, - number: row.number, - title: row.title, - embedding: parseStoredVector(row.vector_json), - })); - } - private loadDeterministicClusterableThreadMeta(repoId: number, threadIds?: number[]): Array<{ id: number; number: number; @@ -4775,15 +4708,6 @@ export class GHCrawlService { return fingerprints; } - private loadNormalizedActiveVectors(repoId: number): Array<{ id: number; number: number; title: string; embedding: number[] }> { - return this.loadClusterableActiveVectorMeta(repoId, '').map((row) => ({ - id: row.id, - number: row.number, - title: row.title, - embedding: normalizeEmbedding(row.embedding).normalized, - })); - } - private listStoredClusterNeighbors(repoId: number, threadId: number, limit: number): SearchHitDto['neighbors'] { const latestRun = getLatestClusterRun(this.db, repoId); if (!latestRun) { From 1f055a4fa0ccf73a10776bd67acef37a70b2775a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 13:53:36 -0700 Subject: [PATCH 167/215] refactor: extract stored cluster neighbor query --- .../api-core/src/cluster/neighbor-queries.ts | 75 +++++++++++++++++++ packages/api-core/src/service.ts | 59 +-------------- 2 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 packages/api-core/src/cluster/neighbor-queries.ts diff --git a/packages/api-core/src/cluster/neighbor-queries.ts b/packages/api-core/src/cluster/neighbor-queries.ts new file mode 100644 index 0000000..86acc41 --- /dev/null +++ b/packages/api-core/src/cluster/neighbor-queries.ts @@ -0,0 +1,75 @@ +import type { SearchHitDto } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import { getLatestClusterRun } from './run-queries.js'; + +export function listStoredClusterNeighbors(params: { + db: SqliteDatabase; + repoId: number; + threadId: number; + limit: number; +}): SearchHitDto['neighbors'] { + const latestRun = getLatestClusterRun(params.db, params.repoId); + if (!latestRun) { + return []; + } + + const rows = params.db + .prepare( + `select + case + when se.left_thread_id = ? then se.right_thread_id + else se.left_thread_id + end as neighbor_thread_id, + case + when se.left_thread_id = ? then t2.number + else t1.number + end as neighbor_number, + case + when se.left_thread_id = ? then t2.kind + else t1.kind + end as neighbor_kind, + case + when se.left_thread_id = ? then t2.title + else t1.title + end as neighbor_title, + se.score + from similarity_edges se + join threads t1 on t1.id = se.left_thread_id + join threads t2 on t2.id = se.right_thread_id + where se.repo_id = ? + and se.cluster_run_id = ? + and (se.left_thread_id = ? or se.right_thread_id = ?) + and t1.state = 'open' + and t1.closed_at_local is null + and t2.state = 'open' + and t2.closed_at_local is null + order by se.score desc + limit ?`, + ) + .all( + params.threadId, + params.threadId, + params.threadId, + params.threadId, + params.repoId, + latestRun.id, + params.threadId, + params.threadId, + params.limit, + ) as Array<{ + neighbor_thread_id: number; + neighbor_number: number; + neighbor_kind: 'issue' | 'pull_request'; + neighbor_title: string; + score: number; + }>; + + return rows.map((row) => ({ + threadId: row.neighbor_thread_id, + number: row.neighbor_number, + kind: row.neighbor_kind, + title: row.neighbor_title, + score: row.score, + })); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 1711c47..416de0b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -76,6 +76,7 @@ import { resolveEdgeWorkerRuntime } from './cluster/edge-worker-runtime.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; +import { listStoredClusterNeighbors } from './cluster/neighbor-queries.js'; import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; import { getLatestClusterRun, getLatestRunClusterIdsForThread } from './cluster/run-queries.js'; import { @@ -3233,7 +3234,7 @@ export class GHCrawlService { let neighbors: SearchHitDto['neighbors'] = []; if (params.includeNeighbors !== false) { - neighbors = this.listStoredClusterNeighbors(repository.id, row.id, 8); + neighbors = listStoredClusterNeighbors({ db: this.db, repoId: repository.id, threadId: row.id, limit: 8 }); if (neighbors.length === 0) { try { neighbors = this.listNeighbors({ @@ -4708,62 +4709,6 @@ export class GHCrawlService { return fingerprints; } - private listStoredClusterNeighbors(repoId: number, threadId: number, limit: number): SearchHitDto['neighbors'] { - const latestRun = getLatestClusterRun(this.db, repoId); - if (!latestRun) { - return []; - } - - const rows = this.db - .prepare( - `select - case - when se.left_thread_id = ? then se.right_thread_id - else se.left_thread_id - end as neighbor_thread_id, - case - when se.left_thread_id = ? then t2.number - else t1.number - end as neighbor_number, - case - when se.left_thread_id = ? then t2.kind - else t1.kind - end as neighbor_kind, - case - when se.left_thread_id = ? then t2.title - else t1.title - end as neighbor_title, - se.score - from similarity_edges se - join threads t1 on t1.id = se.left_thread_id - join threads t2 on t2.id = se.right_thread_id - where se.repo_id = ? - and se.cluster_run_id = ? - and (se.left_thread_id = ? or se.right_thread_id = ?) - and t1.state = 'open' - and t1.closed_at_local is null - and t2.state = 'open' - and t2.closed_at_local is null - order by se.score desc - limit ?`, - ) - .all(threadId, threadId, threadId, threadId, repoId, latestRun.id, threadId, threadId, limit) as Array<{ - neighbor_thread_id: number; - neighbor_number: number; - neighbor_kind: 'issue' | 'pull_request'; - neighbor_title: string; - score: number; - }>; - - return rows.map((row) => ({ - threadId: row.neighbor_thread_id, - number: row.neighbor_number, - kind: row.neighbor_kind, - title: row.neighbor_title, - score: row.score, - })); - } - private async aggregateRepositoryEdges( repoId: number, sourceKinds: EmbeddingSourceKind[], From affa7367c451378bbe1d038562fa57c616ace4ef Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:37:19 -0700 Subject: [PATCH 168/215] refactor: extract latest code feature loader --- .../api-core/src/cluster/code-features.ts | 68 +++++++++++++++++++ packages/api-core/src/service.ts | 64 +---------------- 2 files changed, 70 insertions(+), 62 deletions(-) create mode 100644 packages/api-core/src/cluster/code-features.ts diff --git a/packages/api-core/src/cluster/code-features.ts b/packages/api-core/src/cluster/code-features.ts new file mode 100644 index 0000000..1ee5a7b --- /dev/null +++ b/packages/api-core/src/cluster/code-features.ts @@ -0,0 +1,68 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; + +export type LatestCodeFeatures = { + changedFiles: string[]; + hunkSignatures: string[]; + patchIds: string[]; +}; + +export function loadLatestCodeFeatures(db: SqliteDatabase, threadIds: number[]): Map { + if (threadIds.length === 0) return new Map(); + const placeholders = threadIds.map(() => '?').join(','); + const latestRevisions = db + .prepare( + `select thread_id, max(id) as revision_id + from thread_revisions + where thread_id in (${placeholders}) + group by thread_id`, + ) + .all(...threadIds) as Array<{ thread_id: number; revision_id: number }>; + if (latestRevisions.length === 0) return new Map(); + + const revisionToThread = new Map(latestRevisions.map((row) => [row.revision_id, row.thread_id])); + const revisionPlaceholders = latestRevisions.map(() => '?').join(','); + const fileRows = db + .prepare( + `select cs.thread_revision_id, cf.path, cf.patch_hash + from thread_code_snapshots cs + join thread_changed_files cf on cf.snapshot_id = cs.id + where cs.thread_revision_id in (${revisionPlaceholders}) + order by cf.path asc`, + ) + .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; path: string; patch_hash: string | null }>; + const hunkRows = db + .prepare( + `select cs.thread_revision_id, hs.hunk_hash + from thread_code_snapshots cs + join thread_hunk_signatures hs on hs.snapshot_id = cs.id + where cs.thread_revision_id in (${revisionPlaceholders}) + order by hs.hunk_hash asc`, + ) + .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; hunk_hash: string }>; + + const out = new Map(); + function entry(threadId: number): LatestCodeFeatures { + const existing = out.get(threadId) ?? { changedFiles: [], hunkSignatures: [], patchIds: [] }; + out.set(threadId, existing); + return existing; + } + for (const row of fileRows) { + const threadId = revisionToThread.get(row.thread_revision_id); + if (threadId === undefined) continue; + const target = entry(threadId); + target.changedFiles.push(row.path); + if (row.patch_hash) target.patchIds.push(row.patch_hash); + } + for (const row of hunkRows) { + const threadId = revisionToThread.get(row.thread_revision_id); + if (threadId === undefined) continue; + entry(threadId).hunkSignatures.push(row.hunk_hash); + } + + for (const target of out.values()) { + target.changedFiles = Array.from(new Set(target.changedFiles)).sort(); + target.hunkSignatures = Array.from(new Set(target.hunkSignatures)).sort(); + target.patchIds = Array.from(new Set(target.patchIds)).sort(); + } + return out; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 416de0b..00d36d8 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -63,6 +63,7 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; +import { loadLatestCodeFeatures } from './cluster/code-features.js'; import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; import { collectSourceKindScores, @@ -4478,7 +4479,7 @@ export class GHCrawlService { raw_json: string; updated_at_gh: string | null; }>; - const codeFeaturesByThread = this.loadLatestCodeFeatures(rows.map((row) => row.id)); + const codeFeaturesByThread = loadLatestCodeFeatures(this.db, rows.map((row) => row.id)); return rows.map((row) => ({ id: row.id, number: row.number, @@ -4494,67 +4495,6 @@ export class GHCrawlService { })); } - private loadLatestCodeFeatures(threadIds: number[]): Map { - if (threadIds.length === 0) return new Map(); - const placeholders = threadIds.map(() => '?').join(','); - const latestRevisions = this.db - .prepare( - `select thread_id, max(id) as revision_id - from thread_revisions - where thread_id in (${placeholders}) - group by thread_id`, - ) - .all(...threadIds) as Array<{ thread_id: number; revision_id: number }>; - if (latestRevisions.length === 0) return new Map(); - - const revisionToThread = new Map(latestRevisions.map((row) => [row.revision_id, row.thread_id])); - const revisionPlaceholders = latestRevisions.map(() => '?').join(','); - const fileRows = this.db - .prepare( - `select cs.thread_revision_id, cf.path, cf.patch_hash - from thread_code_snapshots cs - join thread_changed_files cf on cf.snapshot_id = cs.id - where cs.thread_revision_id in (${revisionPlaceholders}) - order by cf.path asc`, - ) - .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; path: string; patch_hash: string | null }>; - const hunkRows = this.db - .prepare( - `select cs.thread_revision_id, hs.hunk_hash - from thread_code_snapshots cs - join thread_hunk_signatures hs on hs.snapshot_id = cs.id - where cs.thread_revision_id in (${revisionPlaceholders}) - order by hs.hunk_hash asc`, - ) - .all(...latestRevisions.map((row) => row.revision_id)) as Array<{ thread_revision_id: number; hunk_hash: string }>; - - const out = new Map(); - function entry(threadId: number): { changedFiles: string[]; hunkSignatures: string[]; patchIds: string[] } { - const existing = out.get(threadId) ?? { changedFiles: [], hunkSignatures: [], patchIds: [] }; - out.set(threadId, existing); - return existing; - } - for (const row of fileRows) { - const threadId = revisionToThread.get(row.thread_revision_id); - if (threadId === undefined) continue; - const target = entry(threadId); - target.changedFiles.push(row.path); - if (row.patch_hash) target.patchIds.push(row.patch_hash); - } - for (const row of hunkRows) { - const threadId = revisionToThread.get(row.thread_revision_id); - if (threadId === undefined) continue; - entry(threadId).hunkSignatures.push(row.hunk_hash); - } - - for (const target of out.values()) { - target.changedFiles = Array.from(new Set(target.changedFiles)).sort(); - target.hunkSignatures = Array.from(new Set(target.hunkSignatures)).sort(); - target.patchIds = Array.from(new Set(target.patchIds)).sort(); - } - return out; - } - private materializeLatestDeterministicFingerprints( items: Array<{ id: number; From 4d6a6b35747ce36bbe3be65713b941a83bc92b4e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:39:33 -0700 Subject: [PATCH 169/215] refactor: extract deterministic thread loader --- .../cluster/deterministic-thread-loader.ts | 68 +++++++++++++++ packages/api-core/src/service.ts | 87 ++----------------- 2 files changed, 76 insertions(+), 79 deletions(-) create mode 100644 packages/api-core/src/cluster/deterministic-thread-loader.ts diff --git a/packages/api-core/src/cluster/deterministic-thread-loader.ts b/packages/api-core/src/cluster/deterministic-thread-loader.ts new file mode 100644 index 0000000..c349bea --- /dev/null +++ b/packages/api-core/src/cluster/deterministic-thread-loader.ts @@ -0,0 +1,68 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { parseArray } from '../service-utils.js'; +import { loadLatestCodeFeatures } from './code-features.js'; + +export type DeterministicClusterableThreadMeta = { + id: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels: string[]; + rawJson: string; + updatedAtGh: string | null; + changedFiles: string[]; + hunkSignatures: string[]; + patchIds: string[]; +}; + +export function loadDeterministicClusterableThreadMeta( + db: SqliteDatabase, + repoId: number, + threadIds?: number[], +): DeterministicClusterableThreadMeta[] { + let sql = + `select id, number, kind, title, body, labels_json, raw_json, updated_at_gh + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = threads.id + and cm.state <> 'removed_by_user' + )`; + const args: Array = [repoId]; + if (threadIds && threadIds.length > 0) { + sql += ` and id in (${threadIds.map(() => '?').join(',')})`; + args.push(...threadIds); + } + sql += ' order by number asc'; + + const rows = db.prepare(sql).all(...args) as Array<{ + id: number; + number: number; + kind: 'issue' | 'pull_request'; + title: string; + body: string | null; + labels_json: string; + raw_json: string; + updated_at_gh: string | null; + }>; + const codeFeaturesByThread = loadLatestCodeFeatures(db, rows.map((row) => row.id)); + return rows.map((row) => ({ + id: row.id, + number: row.number, + kind: row.kind, + title: row.title, + body: row.body, + labels: parseArray(row.labels_json), + rawJson: row.raw_json, + updatedAtGh: row.updated_at_gh, + changedFiles: codeFeaturesByThread.get(row.id)?.changedFiles ?? [], + hunkSignatures: codeFeaturesByThread.get(row.id)?.hunkSignatures ?? [], + patchIds: codeFeaturesByThread.get(row.id)?.patchIds ?? [], + })); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 00d36d8..9d7468d 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -63,8 +63,11 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; -import { loadLatestCodeFeatures } from './cluster/code-features.js'; import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; +import { + loadDeterministicClusterableThreadMeta, + type DeterministicClusterableThreadMeta, +} from './cluster/deterministic-thread-loader.js'; import { collectSourceKindScores, edgeKey, @@ -1109,7 +1112,8 @@ export class GHCrawlService { this.reconcileClusterCloseState(repoId); } if (fingerprintThreadIds.length > 0) { - const fingerprintItems = this.loadDeterministicClusterableThreadMeta( + const fingerprintItems = loadDeterministicClusterableThreadMeta( + this.db, repoId, Array.from(new Set(fingerprintThreadIds)), ); @@ -1672,7 +1676,7 @@ export class GHCrawlService { throw new Error(`Open thread #${params.threadNumber} was not found for ${repository.fullName}.`); } const seedThreadIds = seedThread ? [seedThread.id] : undefined; - const deterministicItems = this.loadDeterministicClusterableThreadMeta(repository.id); + const deterministicItems = loadDeterministicClusterableThreadMeta(this.db, repository.id); const fingerprintItems = seedThreadIds ? deterministicItems.filter((item) => seedThreadIds.includes(item.id)) : deterministicItems; this.materializeLatestDeterministicFingerprints(fingerprintItems, params.onProgress); const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); @@ -4432,83 +4436,8 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private loadDeterministicClusterableThreadMeta(repoId: number, threadIds?: number[]): Array<{ - id: number; - number: number; - kind: 'issue' | 'pull_request'; - title: string; - body: string | null; - labels: string[]; - rawJson: string; - updatedAtGh: string | null; - changedFiles: string[]; - hunkSignatures: string[]; - patchIds: string[]; - }> { - let sql = - `select id, number, kind, title, body, labels_json, raw_json, updated_at_gh - from threads - where repo_id = ? - and state = 'open' - and closed_at_local is null - and not exists ( - select 1 - from cluster_closures cc - join cluster_memberships cm on cm.cluster_id = cc.cluster_id - where cm.thread_id = threads.id - and cm.state <> 'removed_by_user' - )`; - const args: Array = [repoId]; - if (threadIds && threadIds.length > 0) { - sql += ` and id in (${threadIds.map(() => '?').join(',')})`; - args.push(...threadIds); - } - sql += ' order by number asc'; - - const rows = this.db - .prepare( - sql, - ) - .all(...args) as Array<{ - id: number; - number: number; - kind: 'issue' | 'pull_request'; - title: string; - body: string | null; - labels_json: string; - raw_json: string; - updated_at_gh: string | null; - }>; - const codeFeaturesByThread = loadLatestCodeFeatures(this.db, rows.map((row) => row.id)); - return rows.map((row) => ({ - id: row.id, - number: row.number, - kind: row.kind, - title: row.title, - body: row.body, - labels: parseArray(row.labels_json), - rawJson: row.raw_json, - updatedAtGh: row.updated_at_gh, - changedFiles: codeFeaturesByThread.get(row.id)?.changedFiles ?? [], - hunkSignatures: codeFeaturesByThread.get(row.id)?.hunkSignatures ?? [], - patchIds: codeFeaturesByThread.get(row.id)?.patchIds ?? [], - })); - } - private materializeLatestDeterministicFingerprints( - items: Array<{ - id: number; - number: number; - kind: 'issue' | 'pull_request'; - title: string; - body: string | null; - labels: string[]; - rawJson: string; - updatedAtGh: string | null; - changedFiles: string[]; - hunkSignatures: string[]; - patchIds: string[]; - }>, + items: DeterministicClusterableThreadMeta[], onProgress?: (message: string) => void, ): { computed: number; skipped: number } { let computed = 0; From 4297629f9a0587f8e27a26d838000bf2b2a2a6f1 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:50:21 -0700 Subject: [PATCH 170/215] refactor: extract fingerprint materializer --- .../src/cluster/fingerprint-materializer.ts | 75 +++++++++++++++++ packages/api-core/src/service.ts | 80 ++----------------- 2 files changed, 80 insertions(+), 75 deletions(-) create mode 100644 packages/api-core/src/cluster/fingerprint-materializer.ts diff --git a/packages/api-core/src/cluster/fingerprint-materializer.ts b/packages/api-core/src/cluster/fingerprint-materializer.ts new file mode 100644 index 0000000..2d23d2f --- /dev/null +++ b/packages/api-core/src/cluster/fingerprint-materializer.ts @@ -0,0 +1,75 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { extractDeterministicRefs } from './deterministic-engine.js'; +import type { DeterministicClusterableThreadMeta } from './deterministic-thread-loader.js'; +import { upsertThreadFingerprint, upsertThreadRevision } from './persistent-store.js'; +import { + buildDeterministicThreadFingerprint, + fingerprintFeatureHash, + THREAD_FINGERPRINT_ALGORITHM_VERSION, +} from './thread-fingerprint.js'; + +export function materializeLatestDeterministicFingerprints( + db: SqliteDatabase, + items: DeterministicClusterableThreadMeta[], + onProgress?: (message: string) => void, +): { computed: number; skipped: number } { + let computed = 0; + let skipped = 0; + for (const item of items) { + const revisionId = upsertThreadRevision(db, { + threadId: item.id, + sourceUpdatedAt: item.updatedAtGh, + title: item.title, + body: item.body, + labels: item.labels, + rawJson: item.rawJson, + }); + const inferredRefs = extractDeterministicRefs(`${item.title}\n${item.body ?? ''}`); + const featureHash = fingerprintFeatureHash({ + linkedRefs: inferredRefs, + changedFiles: item.changedFiles, + hunkSignatures: item.hunkSignatures, + patchIds: item.patchIds, + }); + const existing = db + .prepare( + `select id, feature_json + from thread_fingerprints + where thread_revision_id = ? + and algorithm_version = ? + limit 1`, + ) + .get(revisionId, THREAD_FINGERPRINT_ALGORITHM_VERSION) as { id: number; feature_json: string } | undefined; + if (existing) { + const existingFeatureHash = (() => { + try { + const feature = JSON.parse(existing.feature_json) as Record; + return typeof feature.featureHash === 'string' ? feature.featureHash : null; + } catch { + return null; + } + })(); + if (existingFeatureHash === featureHash) { + skipped += 1; + continue; + } + } + + const fingerprint = buildDeterministicThreadFingerprint({ + threadId: item.id, + number: item.number, + kind: item.kind, + title: item.title, + body: item.body, + labels: item.labels, + linkedRefs: inferredRefs, + changedFiles: item.changedFiles, + hunkSignatures: item.hunkSignatures, + patchIds: item.patchIds, + }); + upsertThreadFingerprint(db, { threadRevisionId: revisionId, fingerprint }); + computed += 1; + } + onProgress?.(`[fingerprint] latest revisions computed=${computed} skipped=${skipped}`); + return { computed, skipped }; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9d7468d..3e29e9c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -63,11 +63,8 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; -import { buildDeterministicClusterGraphFromFingerprints, extractDeterministicRefs } from './cluster/deterministic-engine.js'; -import { - loadDeterministicClusterableThreadMeta, - type DeterministicClusterableThreadMeta, -} from './cluster/deterministic-thread-loader.js'; +import { buildDeterministicClusterGraphFromFingerprints } from './cluster/deterministic-engine.js'; +import { loadDeterministicClusterableThreadMeta } from './cluster/deterministic-thread-loader.js'; import { collectSourceKindScores, edgeKey, @@ -78,6 +75,7 @@ import { } from './cluster/edge-aggregation.js'; import { resolveEdgeWorkerRuntime } from './cluster/edge-worker-runtime.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; +import { materializeLatestDeterministicFingerprints } from './cluster/fingerprint-materializer.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { listStoredClusterNeighbors } from './cluster/neighbor-queries.js'; @@ -90,14 +88,11 @@ import { upsertClusterGroup, upsertClusterMembership, upsertSimilarityEdgeEvidence, - upsertThreadFingerprint, upsertThreadRevision, upsertThreadCodeSnapshot, upsertThreadKeySummary, } from './cluster/persistent-store.js'; import { - buildDeterministicThreadFingerprint, - fingerprintFeatureHash, THREAD_FINGERPRINT_ALGORITHM_VERSION, type DeterministicThreadFingerprint, } from './cluster/thread-fingerprint.js'; @@ -1117,7 +1112,7 @@ export class GHCrawlService { repoId, Array.from(new Set(fingerprintThreadIds)), ); - this.materializeLatestDeterministicFingerprints(fingerprintItems, params.onProgress); + materializeLatestDeterministicFingerprints(this.db, fingerprintItems, params.onProgress); } const finishedAt = nowIso(); const reconciledOpenCloseAt = shouldSweepClosedOverlap || shouldReconcileMissingOpenThreads ? finishedAt : null; @@ -1678,7 +1673,7 @@ export class GHCrawlService { const seedThreadIds = seedThread ? [seedThread.id] : undefined; const deterministicItems = loadDeterministicClusterableThreadMeta(this.db, repository.id); const fingerprintItems = seedThreadIds ? deterministicItems.filter((item) => seedThreadIds.includes(item.id)) : deterministicItems; - this.materializeLatestDeterministicFingerprints(fingerprintItems, params.onProgress); + materializeLatestDeterministicFingerprints(this.db, fingerprintItems, params.onProgress); const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); const deterministic = buildDeterministicClusterGraphFromFingerprints( deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), @@ -4436,71 +4431,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private materializeLatestDeterministicFingerprints( - items: DeterministicClusterableThreadMeta[], - onProgress?: (message: string) => void, - ): { computed: number; skipped: number } { - let computed = 0; - let skipped = 0; - for (const item of items) { - const revisionId = upsertThreadRevision(this.db, { - threadId: item.id, - sourceUpdatedAt: item.updatedAtGh, - title: item.title, - body: item.body, - labels: item.labels, - rawJson: item.rawJson, - }); - const inferredRefs = extractDeterministicRefs(`${item.title}\n${item.body ?? ''}`); - const featureHash = fingerprintFeatureHash({ - linkedRefs: inferredRefs, - changedFiles: item.changedFiles, - hunkSignatures: item.hunkSignatures, - patchIds: item.patchIds, - }); - const existing = this.db - .prepare( - `select id, feature_json - from thread_fingerprints - where thread_revision_id = ? - and algorithm_version = ? - limit 1`, - ) - .get(revisionId, THREAD_FINGERPRINT_ALGORITHM_VERSION) as { id: number; feature_json: string } | undefined; - if (existing) { - const existingFeatureHash = (() => { - try { - const feature = JSON.parse(existing.feature_json) as Record; - return typeof feature.featureHash === 'string' ? feature.featureHash : null; - } catch { - return null; - } - })(); - if (existingFeatureHash === featureHash) { - skipped += 1; - continue; - } - } - - const fingerprint = buildDeterministicThreadFingerprint({ - threadId: item.id, - number: item.number, - kind: item.kind, - title: item.title, - body: item.body, - labels: item.labels, - linkedRefs: inferredRefs, - changedFiles: item.changedFiles, - hunkSignatures: item.hunkSignatures, - patchIds: item.patchIds, - }); - upsertThreadFingerprint(this.db, { threadRevisionId: revisionId, fingerprint }); - computed += 1; - } - onProgress?.(`[fingerprint] latest revisions computed=${computed} skipped=${skipped}`); - return { computed, skipped }; - } - private loadLatestDeterministicFingerprints(threadIds: number[]): Map { if (threadIds.length === 0) return new Map(); const placeholders = threadIds.map(() => '?').join(','); From 7e696da6aae7adcb23d1c06c92056dd8cd6d807f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:52:38 -0700 Subject: [PATCH 171/215] refactor: extract fingerprint loader --- .../src/cluster/fingerprint-loader.ts | 87 ++++++++++++++++++ packages/api-core/src/service.ts | 90 ++----------------- 2 files changed, 93 insertions(+), 84 deletions(-) create mode 100644 packages/api-core/src/cluster/fingerprint-loader.ts diff --git a/packages/api-core/src/cluster/fingerprint-loader.ts b/packages/api-core/src/cluster/fingerprint-loader.ts new file mode 100644 index 0000000..4731544 --- /dev/null +++ b/packages/api-core/src/cluster/fingerprint-loader.ts @@ -0,0 +1,87 @@ +import { readTextBlob } from '../db/blob-store.js'; +import { blobStoreRoot } from '../db/raw-json-store.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { parseStringArrayJson } from '../service-utils.js'; +import { THREAD_FINGERPRINT_ALGORITHM_VERSION, type DeterministicThreadFingerprint } from './thread-fingerprint.js'; + +export function loadLatestDeterministicFingerprints(params: { + db: SqliteDatabase; + dbPath: string; + threadIds: number[]; +}): Map { + const { db, dbPath, threadIds } = params; + if (threadIds.length === 0) return new Map(); + + const placeholders = threadIds.map(() => '?').join(','); + const rows = db + .prepare( + `select + tr.thread_id, + tf.fingerprint_hash, + tf.fingerprint_slug, + tf.title_tokens_json, + tf.linked_refs_json, + tf.module_buckets_json, + tf.minhash_signature_blob_id, + tf.simhash64, + tf.winnow_hashes_blob_id, + tf.feature_json + from thread_revisions tr + join ( + select thread_id, max(id) as revision_id + from thread_revisions + where thread_id in (${placeholders}) + group by thread_id + ) latest on latest.revision_id = tr.id + join thread_fingerprints tf on tf.thread_revision_id = tr.id + where tf.algorithm_version = ?`, + ) + .all(...threadIds, THREAD_FINGERPRINT_ALGORITHM_VERSION) as Array<{ + thread_id: number; + fingerprint_hash: string; + fingerprint_slug: string; + title_tokens_json: string; + linked_refs_json: string; + module_buckets_json: string; + minhash_signature_blob_id: number | null; + simhash64: string; + winnow_hashes_blob_id: number | null; + feature_json: string; + }>; + + const storeRoot = blobStoreRoot(dbPath); + const fingerprints = new Map(); + for (const row of rows) { + const feature = parseFingerprintFeature(row.feature_json); + const stringFeature = (key: string): string[] => { + const value = feature[key]; + return Array.isArray(value) ? value.filter((entry): entry is string => typeof entry === 'string') : []; + }; + fingerprints.set(row.thread_id, { + algorithmVersion: THREAD_FINGERPRINT_ALGORITHM_VERSION, + fingerprintHash: row.fingerprint_hash, + fingerprintSlug: row.fingerprint_slug, + titleTokens: parseStringArrayJson(row.title_tokens_json), + salientTitleTokens: stringFeature('salientTitleTokens'), + bodyTokens: [], + linkedRefs: parseStringArrayJson(row.linked_refs_json), + moduleBuckets: parseStringArrayJson(row.module_buckets_json), + changedFiles: stringFeature('changedFiles'), + hunkSignatures: stringFeature('hunkSignatures'), + patchIds: stringFeature('patchIds'), + featureHash: typeof feature.featureHash === 'string' ? feature.featureHash : '', + minhashSignature: row.minhash_signature_blob_id ? parseStringArrayJson(readTextBlob(db, storeRoot, row.minhash_signature_blob_id)) : [], + simhash64: row.simhash64, + winnowHashes: row.winnow_hashes_blob_id ? parseStringArrayJson(readTextBlob(db, storeRoot, row.winnow_hashes_blob_id)) : [], + }); + } + return fingerprints; +} + +function parseFingerprintFeature(featureJson: string): Record { + try { + return JSON.parse(featureJson) as Record; + } catch { + return {}; + } +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 3e29e9c..e0fd685 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -75,6 +75,7 @@ import { } from './cluster/edge-aggregation.js'; import { resolveEdgeWorkerRuntime } from './cluster/edge-worker-runtime.js'; import { buildSourceKindEdges } from './cluster/exact-edges.js'; +import { loadLatestDeterministicFingerprints } from './cluster/fingerprint-loader.js'; import { materializeLatestDeterministicFingerprints } from './cluster/fingerprint-materializer.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; @@ -92,10 +93,6 @@ import { upsertThreadCodeSnapshot, upsertThreadKeySummary, } from './cluster/persistent-store.js'; -import { - THREAD_FINGERPRINT_ALGORITHM_VERSION, - type DeterministicThreadFingerprint, -} from './cluster/thread-fingerprint.js'; import { ensureRuntimeDirs, loadConfig, @@ -107,7 +104,6 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; -import { readTextBlob } from './db/blob-store.js'; import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; @@ -211,7 +207,6 @@ import { parseIso, parseLabels, parseObjectJson, - parseStringArrayJson, repositoryToDto, snippetText, stableContentHash, @@ -1674,7 +1669,11 @@ export class GHCrawlService { const deterministicItems = loadDeterministicClusterableThreadMeta(this.db, repository.id); const fingerprintItems = seedThreadIds ? deterministicItems.filter((item) => seedThreadIds.includes(item.id)) : deterministicItems; materializeLatestDeterministicFingerprints(this.db, fingerprintItems, params.onProgress); - const persistedFingerprints = this.loadLatestDeterministicFingerprints(deterministicItems.map((item) => item.id)); + const persistedFingerprints = loadLatestDeterministicFingerprints({ + db: this.db, + dbPath: this.config.dbPath, + threadIds: deterministicItems.map((item) => item.id), + }); const deterministic = buildDeterministicClusterGraphFromFingerprints( deterministicItems.map((item) => ({ id: item.id, number: item.number, title: item.title })), persistedFingerprints, @@ -4431,83 +4430,6 @@ export class GHCrawlService { throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); } - private loadLatestDeterministicFingerprints(threadIds: number[]): Map { - if (threadIds.length === 0) return new Map(); - const placeholders = threadIds.map(() => '?').join(','); - const rows = this.db - .prepare( - `select - tr.thread_id, - tf.fingerprint_hash, - tf.fingerprint_slug, - tf.title_tokens_json, - tf.linked_refs_json, - tf.module_buckets_json, - tf.minhash_signature_blob_id, - tf.simhash64, - tf.winnow_hashes_blob_id, - tf.feature_json - from thread_revisions tr - join ( - select thread_id, max(id) as revision_id - from thread_revisions - where thread_id in (${placeholders}) - group by thread_id - ) latest on latest.revision_id = tr.id - join thread_fingerprints tf on tf.thread_revision_id = tr.id - where tf.algorithm_version = ?`, - ) - .all(...threadIds, THREAD_FINGERPRINT_ALGORITHM_VERSION) as Array<{ - thread_id: number; - fingerprint_hash: string; - fingerprint_slug: string; - title_tokens_json: string; - linked_refs_json: string; - module_buckets_json: string; - minhash_signature_blob_id: number | null; - simhash64: string; - winnow_hashes_blob_id: number | null; - feature_json: string; - }>; - - const fingerprints = new Map(); - for (const row of rows) { - const feature = (() => { - try { - return JSON.parse(row.feature_json) as Record; - } catch { - return {}; - } - })(); - const stringFeature = (key: string): string[] => { - const value = feature[key]; - return Array.isArray(value) ? value.filter((entry): entry is string => typeof entry === 'string') : []; - }; - fingerprints.set(row.thread_id, { - algorithmVersion: THREAD_FINGERPRINT_ALGORITHM_VERSION, - fingerprintHash: row.fingerprint_hash, - fingerprintSlug: row.fingerprint_slug, - titleTokens: parseStringArrayJson(row.title_tokens_json), - salientTitleTokens: stringFeature('salientTitleTokens'), - bodyTokens: [], - linkedRefs: parseStringArrayJson(row.linked_refs_json), - moduleBuckets: parseStringArrayJson(row.module_buckets_json), - changedFiles: stringFeature('changedFiles'), - hunkSignatures: stringFeature('hunkSignatures'), - patchIds: stringFeature('patchIds'), - featureHash: typeof feature.featureHash === 'string' ? feature.featureHash : '', - minhashSignature: row.minhash_signature_blob_id - ? parseStringArrayJson(readTextBlob(this.db, blobStoreRoot(this.config.dbPath), row.minhash_signature_blob_id)) - : [], - simhash64: row.simhash64, - winnowHashes: row.winnow_hashes_blob_id - ? parseStringArrayJson(readTextBlob(this.db, blobStoreRoot(this.config.dbPath), row.winnow_hashes_blob_id)) - : [], - }); - } - return fingerprints; - } - private async aggregateRepositoryEdges( repoId: number, sourceKinds: EmbeddingSourceKind[], From c8a8815b08b7d2a79734cca6de5ef478c0fedd57 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:54:58 -0700 Subject: [PATCH 172/215] refactor: extract cluster close reconciliation --- packages/api-core/src/cluster/close-state.ts | 71 +++++++++++++++++++ packages/api-core/src/service.ts | 74 +------------------- 2 files changed, 74 insertions(+), 71 deletions(-) create mode 100644 packages/api-core/src/cluster/close-state.ts diff --git a/packages/api-core/src/cluster/close-state.ts b/packages/api-core/src/cluster/close-state.ts new file mode 100644 index 0000000..080394a --- /dev/null +++ b/packages/api-core/src/cluster/close-state.ts @@ -0,0 +1,71 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import { nowIso } from '../service-utils.js'; +import { getLatestClusterRun } from './run-queries.js'; + +export function reconcileClusterCloseState(db: SqliteDatabase, repoId: number, clusterIds?: number[]): number { + const latestRun = getLatestClusterRun(db, repoId); + if (!latestRun) { + return 0; + } + + const resolvedClusterIds = + clusterIds && clusterIds.length > 0 + ? Array.from(new Set(clusterIds)) + : ( + db + .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? order by id asc') + .all(repoId, latestRun.id) as Array<{ id: number }> + ).map((row) => row.id); + if (resolvedClusterIds.length === 0) { + return 0; + } + + const summarize = db.prepare( + `select + c.id, + c.close_reason_local, + count(*) as member_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count + from clusters c + join cluster_members cm on cm.cluster_id = c.id + join threads t on t.id = cm.thread_id + where c.id = ? + group by c.id, c.close_reason_local`, + ); + const markClosed = db.prepare( + `update clusters + set closed_at_local = coalesce(closed_at_local, ?), + close_reason_local = 'all_members_closed' + where id = ?`, + ); + const clearClosed = db.prepare( + `update clusters + set closed_at_local = null, + close_reason_local = null + where id = ? and close_reason_local = 'all_members_closed'`, + ); + + let changed = 0; + for (const clusterId of resolvedClusterIds) { + const row = summarize.get(clusterId) as + | { + id: number; + close_reason_local: string | null; + member_count: number; + closed_member_count: number; + } + | undefined; + if (!row || row.close_reason_local === 'manual') { + continue; + } + if (row.member_count > 0 && row.closed_member_count >= row.member_count) { + const result = markClosed.run(nowIso(), clusterId); + changed += result.changes; + continue; + } + const cleared = clearClosed.run(clusterId); + changed += cleared.changes; + } + + return changed; +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e0fd685..e082693 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -62,6 +62,7 @@ import { } from '@ghcrawl/api-contract'; import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; +import { reconcileClusterCloseState } from './cluster/close-state.js'; import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraphFromFingerprints } from './cluster/deterministic-engine.js'; import { loadDeterministicClusterableThreadMeta } from './cluster/deterministic-thread-loader.js'; @@ -367,7 +368,7 @@ export class GHCrawlService { ) .run(closedAt, closedAt, row.id); const clusterIds = getLatestRunClusterIdsForThread(this.db, repository.id, row.id); - const clusterClosed = this.reconcileClusterCloseState(repository.id, clusterIds) > 0; + const clusterClosed = reconcileClusterCloseState(this.db, repository.id, clusterIds) > 0; const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id) as ThreadRow; return closeResponseSchema.parse({ @@ -1099,7 +1100,7 @@ export class GHCrawlService { : 0; const threadsClosed = threadsClosedFromClosedSweep + threadsClosedFromClosedBackfill + threadsClosedFromDirectReconcile; if (threadsClosed > 0) { - this.reconcileClusterCloseState(repoId); + reconcileClusterCloseState(this.db, repoId); } if (fingerprintThreadIds.length > 0) { const fingerprintItems = loadDeterministicClusterableThreadMeta( @@ -3348,75 +3349,6 @@ export class GHCrawlService { }); } - private reconcileClusterCloseState(repoId: number, clusterIds?: number[]): number { - const latestRun = getLatestClusterRun(this.db, repoId); - if (!latestRun) { - return 0; - } - - const resolvedClusterIds = - clusterIds && clusterIds.length > 0 - ? Array.from(new Set(clusterIds)) - : ( - this.db - .prepare('select id from clusters where repo_id = ? and cluster_run_id = ? order by id asc') - .all(repoId, latestRun.id) as Array<{ id: number }> - ).map((row) => row.id); - if (resolvedClusterIds.length === 0) { - return 0; - } - - const summarize = this.db.prepare( - `select - c.id, - c.close_reason_local, - count(*) as member_count, - sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count - from clusters c - join cluster_members cm on cm.cluster_id = c.id - join threads t on t.id = cm.thread_id - where c.id = ? - group by c.id, c.close_reason_local`, - ); - const markClosed = this.db.prepare( - `update clusters - set closed_at_local = coalesce(closed_at_local, ?), - close_reason_local = 'all_members_closed' - where id = ?`, - ); - const clearClosed = this.db.prepare( - `update clusters - set closed_at_local = null, - close_reason_local = null - where id = ? and close_reason_local = 'all_members_closed'`, - ); - - let changed = 0; - for (const clusterId of resolvedClusterIds) { - const row = summarize.get(clusterId) as - | { - id: number; - close_reason_local: string | null; - member_count: number; - closed_member_count: number; - } - | undefined; - if (!row || row.close_reason_local === 'manual') { - continue; - } - if (row.member_count > 0 && row.closed_member_count >= row.member_count) { - const closedAt = nowIso(); - const result = markClosed.run(closedAt, clusterId); - changed += result.changes; - continue; - } - const cleared = clearClosed.run(clusterId); - changed += cleared.changes; - } - - return changed; - } - private ensureDurableClusterForRunCluster(repoId: number, runClusterId: number, representativeThreadId: number | null): number { const members = this.db .prepare( From 72f2cd0367ea6cec4b5f8c232eb20a5a4e96b9da Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:56:40 -0700 Subject: [PATCH 173/215] refactor: move closed cluster row collapse --- packages/api-core/src/service.ts | 38 ++------------------- packages/api-core/src/tui/cluster-format.ts | 34 ++++++++++++++++++ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index e082693..35d3a51 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -144,7 +144,7 @@ import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighbor import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; -import { clusterDisplayTitle, compareTuiClusterSummary, durableClosureReason, parseMemberThreadIdSet } from './tui/cluster-format.js'; +import { clusterDisplayTitle, collapseOverlappingClosedDurableRows, compareTuiClusterSummary, durableClosureReason } from './tui/cluster-format.js'; import { getTuiRepoStats } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { @@ -3505,7 +3505,7 @@ export class GHCrawlService { search_text: string | null; }>; - return this.collapseOverlappingClosedDurableRows( + return collapseOverlappingClosedDurableRows( rows.filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)), ) .map((row) => @@ -3516,40 +3516,6 @@ export class GHCrawlService { ); } - private collapseOverlappingClosedDurableRows< - T extends { - cluster_id: number; - member_count: number; - latest_updated_at: string | null; - member_thread_ids: string | null; - }, - >(rows: T[]): T[] { - const sortedRows = [...rows].sort((left, right) => { - const leftTime = left.latest_updated_at ? Date.parse(left.latest_updated_at) : 0; - const rightTime = right.latest_updated_at ? Date.parse(right.latest_updated_at) : 0; - return right.member_count - left.member_count || rightTime - leftTime || left.cluster_id - right.cluster_id; - }); - const selected: Array<{ row: T; memberIds: Set }> = []; - - for (const row of sortedRows) { - const memberIds = parseMemberThreadIdSet(row.member_thread_ids); - const duplicate = selected.some((entry) => { - const smallerSize = Math.min(memberIds.size, entry.memberIds.size); - if (smallerSize === 0) return false; - let overlap = 0; - for (const memberId of memberIds) { - if (entry.memberIds.has(memberId)) overlap += 1; - } - return overlap / smallerSize >= 0.8; - }); - if (!duplicate) { - selected.push({ row, memberIds }); - } - } - - return selected.map((entry) => entry.row); - } - private getDurableTuiClusterSummary(repoId: number, clusterId: number): TuiClusterSummary | null { const row = this.db .prepare( diff --git a/packages/api-core/src/tui/cluster-format.ts b/packages/api-core/src/tui/cluster-format.ts index 2cf4eed..a286d14 100644 --- a/packages/api-core/src/tui/cluster-format.ts +++ b/packages/api-core/src/tui/cluster-format.ts @@ -27,3 +27,37 @@ export function compareTuiClusterSummary(left: TuiClusterSummary, right: TuiClus } return rightTime - leftTime || right.totalCount - left.totalCount || left.clusterId - right.clusterId; } + +export function collapseOverlappingClosedDurableRows< + T extends { + cluster_id: number; + member_count: number; + latest_updated_at: string | null; + member_thread_ids: string | null; + }, +>(rows: T[]): T[] { + const sortedRows = [...rows].sort((left, right) => { + const leftTime = left.latest_updated_at ? Date.parse(left.latest_updated_at) : 0; + const rightTime = right.latest_updated_at ? Date.parse(right.latest_updated_at) : 0; + return right.member_count - left.member_count || rightTime - leftTime || left.cluster_id - right.cluster_id; + }); + const selected: Array<{ row: T; memberIds: Set }> = []; + + for (const row of sortedRows) { + const memberIds = parseMemberThreadIdSet(row.member_thread_ids); + const duplicate = selected.some((entry) => { + const smallerSize = Math.min(memberIds.size, entry.memberIds.size); + if (smallerSize === 0) return false; + let overlap = 0; + for (const memberId of memberIds) { + if (entry.memberIds.has(memberId)) overlap += 1; + } + return overlap / smallerSize >= 0.8; + }); + if (!duplicate) { + selected.push({ row, memberIds }); + } + } + + return selected.map((entry) => entry.row); +} From bc5adc36895571558e4b960ca26368f34b13fccc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 16:59:04 -0700 Subject: [PATCH 174/215] refactor: move durable tui summary mapper --- packages/api-core/src/service.ts | 61 +++------------------ packages/api-core/src/tui/cluster-format.ts | 49 +++++++++++++++++ 2 files changed, 58 insertions(+), 52 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 35d3a51..3536f79 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -144,7 +144,13 @@ import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighbor import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; -import { clusterDisplayTitle, collapseOverlappingClosedDurableRows, compareTuiClusterSummary, durableClosureReason } from './tui/cluster-format.js'; +import { + clusterDisplayTitle, + collapseOverlappingClosedDurableRows, + compareTuiClusterSummary, + durableClosureReason, + durableTuiSummaryFromRow, +} from './tui/cluster-format.js'; import { getTuiRepoStats } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { @@ -3509,7 +3515,7 @@ export class GHCrawlService { rows.filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)), ) .map((row) => - this.durableTuiSummaryFromRow({ + durableTuiSummaryFromRow({ ...row, representative_title: row.representative_title ?? row.title, }), @@ -3577,61 +3583,12 @@ export class GHCrawlService { } | undefined; if (!row) return null; - return this.durableTuiSummaryFromRow({ + return durableTuiSummaryFromRow({ ...row, representative_title: row.representative_title ?? row.title, }); } - private durableTuiSummaryFromRow(row: { - cluster_id: number; - stable_slug: string; - status: 'active' | 'closed' | 'merged' | 'split'; - closed_at: string | null; - closure_reason?: string | null; - representative_thread_id: number | null; - representative_number: number | null; - representative_kind: 'issue' | 'pull_request' | null; - representative_title: string | null; - member_count: number; - latest_updated_at: string | null; - issue_count: number; - pull_request_count: number; - closed_member_count: number; - search_text: string | null; - }): TuiClusterSummary { - const closure: DurableTuiClosure = { - clusterId: row.cluster_id, - status: row.status, - closedAt: row.closed_at, - reason: row.closure_reason ?? null, - }; - const lifecycleClosed = row.status === 'merged' || row.status === 'split'; - const manuallyClosed = row.closure_reason !== undefined && row.closure_reason !== null; - const isClosed = manuallyClosed || lifecycleClosed || row.closed_member_count >= row.member_count; - const closeReasonLocal = - manuallyClosed || lifecycleClosed - ? durableClosureReason(closure) - : row.closed_member_count >= row.member_count - ? 'all_members_closed' - : null; - return { - clusterId: row.cluster_id, - displayTitle: clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), - isClosed, - closedAtLocal: manuallyClosed || lifecycleClosed ? row.closed_at : null, - closeReasonLocal, - totalCount: row.member_count, - issueCount: row.issue_count, - pullRequestCount: row.pull_request_count, - latestUpdatedAt: row.latest_updated_at, - representativeThreadId: row.representative_thread_id, - representativeNumber: row.representative_number, - representativeKind: row.representative_kind, - searchText: `${row.stable_slug} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), - }; - } - private listRawTuiClusters(repoId: number, clusterRunId: number, minSize: number): TuiClusterSummary[] { const rows = this.db .prepare( diff --git a/packages/api-core/src/tui/cluster-format.ts b/packages/api-core/src/tui/cluster-format.ts index a286d14..ec64910 100644 --- a/packages/api-core/src/tui/cluster-format.ts +++ b/packages/api-core/src/tui/cluster-format.ts @@ -61,3 +61,52 @@ export function collapseOverlappingClosedDurableRows< return selected.map((entry) => entry.row); } + +export function durableTuiSummaryFromRow(row: { + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + closure_reason?: string | null; + representative_thread_id: number | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + closed_member_count: number; + search_text: string | null; +}): TuiClusterSummary { + const closure: DurableTuiClosure = { + clusterId: row.cluster_id, + status: row.status, + closedAt: row.closed_at, + reason: row.closure_reason ?? null, + }; + const lifecycleClosed = row.status === 'merged' || row.status === 'split'; + const manuallyClosed = row.closure_reason !== undefined && row.closure_reason !== null; + const isClosed = manuallyClosed || lifecycleClosed || row.closed_member_count >= row.member_count; + const closeReasonLocal = + manuallyClosed || lifecycleClosed + ? durableClosureReason(closure) + : row.closed_member_count >= row.member_count + ? 'all_members_closed' + : null; + return { + clusterId: row.cluster_id, + displayTitle: clusterDisplayTitle(row.stable_slug, row.representative_title, row.cluster_id), + isClosed, + closedAtLocal: manuallyClosed || lifecycleClosed ? row.closed_at : null, + closeReasonLocal, + totalCount: row.member_count, + issueCount: row.issue_count, + pullRequestCount: row.pull_request_count, + latestUpdatedAt: row.latest_updated_at, + representativeThreadId: row.representative_thread_id, + representativeNumber: row.representative_number, + representativeKind: row.representative_kind, + searchText: `${row.stable_slug} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), + }; +} From 55e596382c2a5e8b4021b22f265b1e1c650ca427 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 17:01:58 -0700 Subject: [PATCH 175/215] refactor: extract tui cluster query helpers --- packages/api-core/src/service.ts | 61 ++------------------ packages/api-core/src/tui/cluster-queries.ts | 57 ++++++++++++++++++ 2 files changed, 63 insertions(+), 55 deletions(-) create mode 100644 packages/api-core/src/tui/cluster-queries.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 3536f79..58f1f8e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -151,6 +151,7 @@ import { durableClosureReason, durableTuiSummaryFromRow, } from './tui/cluster-format.js'; +import { clusterHumanName, getDurableClosuresByRepresentative } from './tui/cluster-queries.js'; import { getTuiRepoStats } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { @@ -184,7 +185,6 @@ import type { ClusterExperimentResult, CommentSeed, DoctorResult, - DurableTuiClosure, EmbeddingSourceKind, KeySummaryTask, NeighborsResultInternal, @@ -3405,48 +3405,6 @@ export class GHCrawlService { return durableClusterId; } - private getDurableClosuresByRepresentative(repoId: number, representativeThreadIds: number[]): Map { - const uniqueThreadIds = Array.from(new Set(representativeThreadIds)); - if (uniqueThreadIds.length === 0) { - return new Map(); - } - - const identities = uniqueThreadIds.map((threadId) => ({ - threadId, - stableKey: humanKeyForValue(`repo:${repoId}:cluster-representative:${threadId}`).hash, - })); - const placeholders = identities.map(() => '?').join(','); - const rows = this.db - .prepare( - `select cg.id, cg.stable_key, cg.status, coalesce(cc.updated_at, cg.closed_at) as closed_at, cc.reason - from cluster_groups cg - left join cluster_closures cc on cc.cluster_id = cg.id - where cg.repo_id = ? - and cg.stable_key in (${placeholders}) - and (cc.cluster_id is not null or cg.status in ('merged', 'split'))`, - ) - .all(repoId, ...identities.map((identity) => identity.stableKey)) as Array<{ - id: number; - stable_key: string; - status: 'active' | 'closed' | 'merged' | 'split'; - closed_at: string | null; - reason: string | null; - }>; - const threadIdByStableKey = new Map(identities.map((identity) => [identity.stableKey, identity.threadId])); - const closures = new Map(); - for (const row of rows) { - const threadId = threadIdByStableKey.get(row.stable_key); - if (threadId === undefined) continue; - closures.set(threadId, { - clusterId: row.id, - status: row.status, - closedAt: row.closed_at, - reason: row.reason, - }); - } - return closures; - } - private listClosedDurableTuiClusters(repoId: number, representedThreadIds: Set, minSize: number): TuiClusterSummary[] { const rows = this.db .prepare( @@ -3637,7 +3595,8 @@ export class GHCrawlService { closed_member_count: number; search_text: string | null; }>; - const durableClosures = this.getDurableClosuresByRepresentative( + const durableClosures = getDurableClosuresByRepresentative( + this.db, repoId, rows .map((row) => row.representative_thread_id) @@ -3645,7 +3604,7 @@ export class GHCrawlService { ); return rows.map((row) => { - const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); const durableClosure = row.representative_thread_id === null ? null : (durableClosures.get(row.representative_thread_id) ?? null); return { @@ -3720,11 +3679,11 @@ export class GHCrawlService { return null; } - const clusterName = this.clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); const durableClosure = row.representative_thread_id === null ? null - : (this.getDurableClosuresByRepresentative(repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); + : (getDurableClosuresByRepresentative(this.db, repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); return { clusterId: row.cluster_id, displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), @@ -3742,14 +3701,6 @@ export class GHCrawlService { }; } - private clusterHumanName(repoId: number, representativeThreadId: number | null, clusterId: number): string { - return humanKeyForValue( - representativeThreadId === null - ? `repo:${repoId}:cluster:${clusterId}` - : `repo:${repoId}:cluster-representative:${representativeThreadId}`, - ).slug; - } - private async fetchThreadComments( owner: string, repo: string, diff --git a/packages/api-core/src/tui/cluster-queries.ts b/packages/api-core/src/tui/cluster-queries.ts new file mode 100644 index 0000000..a649018 --- /dev/null +++ b/packages/api-core/src/tui/cluster-queries.ts @@ -0,0 +1,57 @@ +import { humanKeyForValue } from '../cluster/human-key.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { DurableTuiClosure } from '../service-types.js'; + +export function clusterHumanName(repoId: number, representativeThreadId: number | null, clusterId: number): string { + return humanKeyForValue( + representativeThreadId === null + ? `repo:${repoId}:cluster:${clusterId}` + : `repo:${repoId}:cluster-representative:${representativeThreadId}`, + ).slug; +} + +export function getDurableClosuresByRepresentative( + db: SqliteDatabase, + repoId: number, + representativeThreadIds: number[], +): Map { + const uniqueThreadIds = Array.from(new Set(representativeThreadIds)); + if (uniqueThreadIds.length === 0) { + return new Map(); + } + + const identities = uniqueThreadIds.map((threadId) => ({ + threadId, + stableKey: humanKeyForValue(`repo:${repoId}:cluster-representative:${threadId}`).hash, + })); + const placeholders = identities.map(() => '?').join(','); + const rows = db + .prepare( + `select cg.id, cg.stable_key, cg.status, coalesce(cc.updated_at, cg.closed_at) as closed_at, cc.reason + from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id + where cg.repo_id = ? + and cg.stable_key in (${placeholders}) + and (cc.cluster_id is not null or cg.status in ('merged', 'split'))`, + ) + .all(repoId, ...identities.map((identity) => identity.stableKey)) as Array<{ + id: number; + stable_key: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + reason: string | null; + }>; + const threadIdByStableKey = new Map(identities.map((identity) => [identity.stableKey, identity.threadId])); + const closures = new Map(); + for (const row of rows) { + const threadId = threadIdByStableKey.get(row.stable_key); + if (threadId === undefined) continue; + closures.set(threadId, { + clusterId: row.id, + status: row.status, + closedAt: row.closed_at, + reason: row.reason, + }); + } + return closures; +} From 7523e8c4b903e7e5106e42839084e817734b66d8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:00:02 -0700 Subject: [PATCH 176/215] refactor: extract tui cluster summary queries --- packages/api-core/src/service.ts | 318 +------------------ packages/api-core/src/tui/cluster-queries.ts | 285 ++++++++++++++++- 2 files changed, 294 insertions(+), 309 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 58f1f8e..40c8637 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -144,14 +144,13 @@ import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighbor import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; +import { compareTuiClusterSummary } from './tui/cluster-format.js'; import { - clusterDisplayTitle, - collapseOverlappingClosedDurableRows, - compareTuiClusterSummary, - durableClosureReason, - durableTuiSummaryFromRow, -} from './tui/cluster-format.js'; -import { clusterHumanName, getDurableClosuresByRepresentative } from './tui/cluster-queries.js'; + getDurableTuiClusterSummary, + getRawTuiClusterSummary, + listClosedDurableTuiClusters, + listRawTuiClusters, +} from './tui/cluster-queries.js'; import { getTuiRepoStats } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { @@ -196,7 +195,6 @@ import type { ThreadRow, TuiClusterDetail, TuiClusterSortMode, - TuiClusterSummary, TuiRefreshState, TuiSnapshot, TuiThreadDetail, @@ -3029,14 +3027,14 @@ export class GHCrawlService { const latestRun = getLatestClusterRun(this.db, repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; const minSize = params.minSize ?? 1; - const rawClusters = latestRun ? this.listRawTuiClusters(repository.id, latestRun.id, minSize) : []; + const rawClusters = latestRun ? listRawTuiClusters(this.db, repository.id, latestRun.id, minSize) : []; const representedThreadIds = new Set( rawClusters .map((cluster) => cluster.representativeThreadId) .filter((threadId): threadId is number => threadId !== null), ); const durableClosedClusters = includeClosedClusters - ? this.listClosedDurableTuiClusters(repository.id, representedThreadIds, minSize) + ? listClosedDurableTuiClusters(this.db, repository.id, representedThreadIds, minSize) : []; const clusters = [...rawClusters, ...durableClosedClusters] .filter((cluster) => (includeClosedClusters ? true : !cluster.isClosed)) @@ -3115,8 +3113,8 @@ export class GHCrawlService { params.clusterRunId ?? (getLatestClusterRun(this.db, repository.id)?.id ?? null); - const summary = clusterRunId ? this.getRawTuiClusterSummary(repository.id, clusterRunId, params.clusterId) : null; - const durableSummary = summary ? null : this.getDurableTuiClusterSummary(repository.id, params.clusterId); + const summary = clusterRunId ? getRawTuiClusterSummary(this.db, repository.id, clusterRunId, params.clusterId) : null; + const durableSummary = summary ? null : getDurableTuiClusterSummary(this.db, repository.id, params.clusterId); const resolvedSummary = summary ?? durableSummary; if (!resolvedSummary) { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); @@ -3405,302 +3403,6 @@ export class GHCrawlService { return durableClusterId; } - private listClosedDurableTuiClusters(repoId: number, representedThreadIds: Set, minSize: number): TuiClusterSummary[] { - const rows = this.db - .prepare( - `select - cg.id as cluster_id, - cg.stable_slug, - cg.status, - coalesce(cc.updated_at, cg.closed_at) as closed_at, - cc.reason as closure_reason, - cg.representative_thread_id, - cg.title, - rt.number as representative_number, - rt.kind as representative_kind, - rt.title as representative_title, - count(*) as member_count, - max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, - sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, - sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, - sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, - group_concat(t.id, ',') as member_thread_ids, - group_concat(lower(coalesce(t.title, '')), ' ') as search_text - from cluster_groups cg - left join cluster_closures cc on cc.cluster_id = cg.id - left join threads rt on rt.id = cg.representative_thread_id - join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' - join threads t on t.id = cm.thread_id - where cg.repo_id = ? - group by - cg.id, - cg.stable_slug, - cg.status, - cg.closed_at, - cc.updated_at, - cc.reason, - cg.representative_thread_id, - cg.title, - rt.number, - rt.kind, - rt.title - having member_count >= ? - and (cc.cluster_id is not null - or cg.status in ('merged', 'split') - or closed_member_count >= member_count)`, - ) - .all(repoId, minSize) as Array<{ - cluster_id: number; - stable_slug: string; - status: 'active' | 'closed' | 'merged' | 'split'; - closed_at: string | null; - closure_reason: string | null; - representative_thread_id: number | null; - title: string | null; - representative_number: number | null; - representative_kind: 'issue' | 'pull_request' | null; - representative_title: string | null; - member_count: number; - latest_updated_at: string | null; - issue_count: number; - pull_request_count: number; - closed_member_count: number; - member_thread_ids: string | null; - search_text: string | null; - }>; - - return collapseOverlappingClosedDurableRows( - rows.filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)), - ) - .map((row) => - durableTuiSummaryFromRow({ - ...row, - representative_title: row.representative_title ?? row.title, - }), - ); - } - - private getDurableTuiClusterSummary(repoId: number, clusterId: number): TuiClusterSummary | null { - const row = this.db - .prepare( - `select - cg.id as cluster_id, - cg.stable_slug, - cg.status, - coalesce(cc.updated_at, cg.closed_at) as closed_at, - cc.reason as closure_reason, - cg.representative_thread_id, - cg.title, - rt.number as representative_number, - rt.kind as representative_kind, - rt.title as representative_title, - count(*) as member_count, - max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, - sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, - sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, - sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, - group_concat(lower(coalesce(t.title, '')), ' ') as search_text - from cluster_groups cg - left join cluster_closures cc on cc.cluster_id = cg.id - left join threads rt on rt.id = cg.representative_thread_id - join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' - join threads t on t.id = cm.thread_id - where cg.repo_id = ? - and cg.id = ? - group by - cg.id, - cg.stable_slug, - cg.status, - cg.closed_at, - cc.updated_at, - cc.reason, - cg.representative_thread_id, - cg.title, - rt.number, - rt.kind, - rt.title`, - ) - .get(repoId, clusterId) as - | { - cluster_id: number; - stable_slug: string; - status: 'active' | 'closed' | 'merged' | 'split'; - closed_at: string | null; - closure_reason: string | null; - representative_thread_id: number | null; - title: string | null; - representative_number: number | null; - representative_kind: 'issue' | 'pull_request' | null; - representative_title: string | null; - member_count: number; - latest_updated_at: string | null; - issue_count: number; - pull_request_count: number; - closed_member_count: number; - search_text: string | null; - } - | undefined; - if (!row) return null; - return durableTuiSummaryFromRow({ - ...row, - representative_title: row.representative_title ?? row.title, - }); - } - - private listRawTuiClusters(repoId: number, clusterRunId: number, minSize: number): TuiClusterSummary[] { - const rows = this.db - .prepare( - `select - c.id as cluster_id, - c.member_count, - c.closed_at_local, - c.close_reason_local, - c.representative_thread_id, - rt.number as representative_number, - rt.kind as representative_kind, - rt.title as representative_title, - max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, - sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, - sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, - sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, - group_concat(lower(coalesce(t.title, '')), ' ') as search_text - from clusters c - left join threads rt on rt.id = c.representative_thread_id - join cluster_members cm on cm.cluster_id = c.id - join threads t on t.id = cm.thread_id - where c.repo_id = ? and c.cluster_run_id = ? - group by - c.id, - c.member_count, - c.closed_at_local, - c.close_reason_local, - c.representative_thread_id, - rt.number, - rt.kind, - rt.title - having c.member_count >= ?`, - ) - .all(repoId, clusterRunId, minSize) as Array<{ - cluster_id: number; - member_count: number; - closed_at_local: string | null; - close_reason_local: string | null; - representative_thread_id: number | null; - representative_number: number | null; - representative_kind: 'issue' | 'pull_request' | null; - representative_title: string | null; - latest_updated_at: string | null; - issue_count: number; - pull_request_count: number; - closed_member_count: number; - search_text: string | null; - }>; - const durableClosures = getDurableClosuresByRepresentative( - this.db, - repoId, - rows - .map((row) => row.representative_thread_id) - .filter((threadId): threadId is number => threadId !== null), - ); - - return rows.map((row) => { - const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); - const durableClosure = - row.representative_thread_id === null ? null : (durableClosures.get(row.representative_thread_id) ?? null); - return { - clusterId: row.cluster_id, - displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), - isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, - closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, - closeReasonLocal: row.close_reason_local ?? (durableClosure ? durableClosureReason(durableClosure) : null), - totalCount: row.member_count, - issueCount: row.issue_count, - pullRequestCount: row.pull_request_count, - latestUpdatedAt: row.latest_updated_at, - representativeThreadId: row.representative_thread_id, - representativeNumber: row.representative_number, - representativeKind: row.representative_kind, - searchText: `${clusterName} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), - }; - }); - } - - private getRawTuiClusterSummary(repoId: number, clusterRunId: number, clusterId: number): TuiClusterSummary | null { - const row = this.db - .prepare( - `select - c.id as cluster_id, - c.member_count, - c.closed_at_local, - c.close_reason_local, - c.representative_thread_id, - rt.number as representative_number, - rt.kind as representative_kind, - rt.title as representative_title, - max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, - sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, - sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, - sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, - group_concat(lower(coalesce(t.title, '')), ' ') as search_text - from clusters c - left join threads rt on rt.id = c.representative_thread_id - join cluster_members cm on cm.cluster_id = c.id - join threads t on t.id = cm.thread_id - where c.repo_id = ? and c.cluster_run_id = ? and c.id = ? - group by - c.id, - c.member_count, - c.closed_at_local, - c.close_reason_local, - c.representative_thread_id, - rt.number, - rt.kind, - rt.title`, - ) - .get(repoId, clusterRunId, clusterId) as - | { - cluster_id: number; - member_count: number; - closed_at_local: string | null; - close_reason_local: string | null; - representative_thread_id: number | null; - representative_number: number | null; - representative_kind: 'issue' | 'pull_request' | null; - representative_title: string | null; - latest_updated_at: string | null; - issue_count: number; - pull_request_count: number; - closed_member_count: number; - search_text: string | null; - } - | undefined; - - if (!row) { - return null; - } - - const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); - const durableClosure = - row.representative_thread_id === null - ? null - : (getDurableClosuresByRepresentative(this.db, repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); - return { - clusterId: row.cluster_id, - displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), - isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, - closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, - closeReasonLocal: row.close_reason_local ?? (durableClosure ? durableClosureReason(durableClosure) : null), - totalCount: row.member_count, - issueCount: row.issue_count, - pullRequestCount: row.pull_request_count, - latestUpdatedAt: row.latest_updated_at, - representativeThreadId: row.representative_thread_id, - representativeNumber: row.representative_number, - representativeKind: row.representative_kind, - searchText: `${clusterName} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), - }; - } - private async fetchThreadComments( owner: string, repo: string, diff --git a/packages/api-core/src/tui/cluster-queries.ts b/packages/api-core/src/tui/cluster-queries.ts index a649018..79ff070 100644 --- a/packages/api-core/src/tui/cluster-queries.ts +++ b/packages/api-core/src/tui/cluster-queries.ts @@ -1,6 +1,12 @@ import { humanKeyForValue } from '../cluster/human-key.js'; import type { SqliteDatabase } from '../db/sqlite.js'; -import type { DurableTuiClosure } from '../service-types.js'; +import type { DurableTuiClosure, TuiClusterSummary } from '../service-types.js'; +import { + clusterDisplayTitle, + collapseOverlappingClosedDurableRows, + durableClosureReason, + durableTuiSummaryFromRow, +} from './cluster-format.js'; export function clusterHumanName(repoId: number, representativeThreadId: number | null, clusterId: number): string { return humanKeyForValue( @@ -55,3 +61,280 @@ export function getDurableClosuresByRepresentative( } return closures; } + +export function listClosedDurableTuiClusters( + db: SqliteDatabase, + repoId: number, + representedThreadIds: Set, + minSize: number, +): TuiClusterSummary[] { + const rows = db + .prepare( + `select + cg.id as cluster_id, + cg.stable_slug, + cg.status, + coalesce(cc.updated_at, cg.closed_at) as closed_at, + cc.reason as closure_reason, + cg.representative_thread_id, + cg.title, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + count(*) as member_count, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, + group_concat(t.id, ',') as member_thread_ids, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id + left join threads rt on rt.id = cg.representative_thread_id + join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + group by + cg.id, + cg.stable_slug, + cg.status, + cg.closed_at, + cc.updated_at, + cc.reason, + cg.representative_thread_id, + cg.title, + rt.number, + rt.kind, + rt.title + having member_count >= ? + and (cc.cluster_id is not null + or cg.status in ('merged', 'split') + or closed_member_count >= member_count)`, + ) + .all(repoId, minSize) as Array<{ + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + closure_reason: string | null; + representative_thread_id: number | null; + title: string | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + closed_member_count: number; + member_thread_ids: string | null; + search_text: string | null; + }>; + + return collapseOverlappingClosedDurableRows( + rows.filter((row) => row.representative_thread_id === null || !representedThreadIds.has(row.representative_thread_id)), + ) + .map((row) => + durableTuiSummaryFromRow({ + ...row, + representative_title: row.representative_title ?? row.title, + }), + ); +} + +export function getDurableTuiClusterSummary(db: SqliteDatabase, repoId: number, clusterId: number): TuiClusterSummary | null { + const row = db + .prepare( + `select + cg.id as cluster_id, + cg.stable_slug, + cg.status, + coalesce(cc.updated_at, cg.closed_at) as closed_at, + cc.reason as closure_reason, + cg.representative_thread_id, + cg.title, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + count(*) as member_count, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from cluster_groups cg + left join cluster_closures cc on cc.cluster_id = cg.id + left join threads rt on rt.id = cg.representative_thread_id + join cluster_memberships cm on cm.cluster_id = cg.id and cm.state <> 'removed_by_user' + join threads t on t.id = cm.thread_id + where cg.repo_id = ? + and cg.id = ? + group by + cg.id, + cg.stable_slug, + cg.status, + cg.closed_at, + cc.updated_at, + cc.reason, + cg.representative_thread_id, + cg.title, + rt.number, + rt.kind, + rt.title`, + ) + .get(repoId, clusterId) as DurableTuiClusterSummaryRow | undefined; + if (!row) return null; + return durableTuiSummaryFromRow({ + ...row, + representative_title: row.representative_title ?? row.title, + }); +} + +export function listRawTuiClusters(db: SqliteDatabase, repoId: number, clusterRunId: number, minSize: number): TuiClusterSummary[] { + const rows = db + .prepare( + `select + c.id as cluster_id, + c.member_count, + c.closed_at_local, + c.close_reason_local, + c.representative_thread_id, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from clusters c + left join threads rt on rt.id = c.representative_thread_id + join cluster_members cm on cm.cluster_id = c.id + join threads t on t.id = cm.thread_id + where c.repo_id = ? and c.cluster_run_id = ? + group by + c.id, + c.member_count, + c.closed_at_local, + c.close_reason_local, + c.representative_thread_id, + rt.number, + rt.kind, + rt.title + having c.member_count >= ?`, + ) + .all(repoId, clusterRunId, minSize) as RawTuiClusterSummaryRow[]; + const durableClosures = getDurableClosuresByRepresentative( + db, + repoId, + rows + .map((row) => row.representative_thread_id) + .filter((threadId): threadId is number => threadId !== null), + ); + + return rows.map((row) => rawTuiSummaryFromRow(repoId, row, durableClosures.get(row.representative_thread_id ?? -1) ?? null)); +} + +export function getRawTuiClusterSummary( + db: SqliteDatabase, + repoId: number, + clusterRunId: number, + clusterId: number, +): TuiClusterSummary | null { + const row = db + .prepare( + `select + c.id as cluster_id, + c.member_count, + c.closed_at_local, + c.close_reason_local, + c.representative_thread_id, + rt.number as representative_number, + rt.kind as representative_kind, + rt.title as representative_title, + max(coalesce(t.updated_at_gh, t.updated_at)) as latest_updated_at, + sum(case when t.kind = 'issue' then 1 else 0 end) as issue_count, + sum(case when t.kind = 'pull_request' then 1 else 0 end) as pull_request_count, + sum(case when t.state != 'open' or t.closed_at_local is not null then 1 else 0 end) as closed_member_count, + group_concat(lower(coalesce(t.title, '')), ' ') as search_text + from clusters c + left join threads rt on rt.id = c.representative_thread_id + join cluster_members cm on cm.cluster_id = c.id + join threads t on t.id = cm.thread_id + where c.repo_id = ? and c.cluster_run_id = ? and c.id = ? + group by + c.id, + c.member_count, + c.closed_at_local, + c.close_reason_local, + c.representative_thread_id, + rt.number, + rt.kind, + rt.title`, + ) + .get(repoId, clusterRunId, clusterId) as RawTuiClusterSummaryRow | undefined; + + if (!row) { + return null; + } + + const durableClosure = + row.representative_thread_id === null + ? null + : (getDurableClosuresByRepresentative(db, repoId, [row.representative_thread_id]).get(row.representative_thread_id) ?? null); + return rawTuiSummaryFromRow(repoId, row, durableClosure); +} + +type DurableTuiClusterSummaryRow = { + cluster_id: number; + stable_slug: string; + status: 'active' | 'closed' | 'merged' | 'split'; + closed_at: string | null; + closure_reason: string | null; + representative_thread_id: number | null; + title: string | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + member_count: number; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + closed_member_count: number; + search_text: string | null; +}; + +type RawTuiClusterSummaryRow = { + cluster_id: number; + member_count: number; + closed_at_local: string | null; + close_reason_local: string | null; + representative_thread_id: number | null; + representative_number: number | null; + representative_kind: 'issue' | 'pull_request' | null; + representative_title: string | null; + latest_updated_at: string | null; + issue_count: number; + pull_request_count: number; + closed_member_count: number; + search_text: string | null; +}; + +function rawTuiSummaryFromRow(repoId: number, row: RawTuiClusterSummaryRow, durableClosure: DurableTuiClosure | null): TuiClusterSummary { + const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); + return { + clusterId: row.cluster_id, + displayTitle: clusterDisplayTitle(clusterName, row.representative_title, row.cluster_id), + isClosed: row.close_reason_local !== null || durableClosure !== null || row.closed_member_count >= row.member_count, + closedAtLocal: row.closed_at_local ?? durableClosure?.closedAt ?? null, + closeReasonLocal: row.close_reason_local ?? (durableClosure ? durableClosureReason(durableClosure) : null), + totalCount: row.member_count, + issueCount: row.issue_count, + pullRequestCount: row.pull_request_count, + latestUpdatedAt: row.latest_updated_at, + representativeThreadId: row.representative_thread_id, + representativeNumber: row.representative_number, + representativeKind: row.representative_kind, + searchText: `${clusterName} ${(row.representative_title ?? '').toLowerCase()} ${row.search_text ?? ''}`.trim(), + }; +} From fd00ec23f87bb8e9ad349850aa3f5780a1fa2eff Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:01:26 -0700 Subject: [PATCH 177/215] refactor: extract tui refresh state query --- packages/api-core/src/service.ts | 53 +--------------------- packages/api-core/src/tui/repo-stats.ts | 58 ++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 40c8637..66b6d34 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -151,7 +151,7 @@ import { listClosedDurableTuiClusters, listRawTuiClusters, } from './tui/cluster-queries.js'; -import { getTuiRepoStats } from './tui/repo-stats.js'; +import { getTuiRepoStats, getTuiRepositoryRefreshState } from './tui/repo-stats.js'; import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -3055,56 +3055,7 @@ export class GHCrawlService { getTuiRefreshState(params: { owner: string; repo: string }): TuiRefreshState { const repository = this.requireRepository(params.owner, params.repo); - const threadState = this.db - .prepare( - `select - max(updated_at) as thread_updated_at, - max(closed_at_local) as thread_closed_at - from threads - where repo_id = ?`, - ) - .get(repository.id) as { thread_updated_at: string | null; thread_closed_at: string | null }; - const clusterState = this.db - .prepare( - `select max(closed_at_local) as cluster_closed_at - from clusters - where repo_id = ?`, - ) - .get(repository.id) as { cluster_closed_at: string | null }; - const durableClusterState = this.db - .prepare( - `select max(updated_at) as durable_cluster_updated_at - from cluster_groups - where repo_id = ?`, - ) - .get(repository.id) as { durable_cluster_updated_at: string | null }; - const durableMembershipState = this.db - .prepare( - `select max(cm.updated_at) as durable_membership_updated_at - from cluster_memberships cm - join cluster_groups cg on cg.id = cm.cluster_id - where cg.repo_id = ?`, - ) - .get(repository.id) as { durable_membership_updated_at: string | null }; - const latestSync = this.db - .prepare("select id from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repository.id) as { id: number } | undefined; - const latestEmbedding = this.db - .prepare("select id from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repository.id) as { id: number } | undefined; - const latestClusterRun = getLatestClusterRun(this.db, repository.id); - - return { - repositoryUpdatedAt: repository.updatedAt, - threadUpdatedAt: threadState.thread_updated_at, - threadClosedAt: threadState.thread_closed_at, - clusterClosedAt: clusterState.cluster_closed_at, - durableClusterUpdatedAt: durableClusterState.durable_cluster_updated_at, - durableMembershipUpdatedAt: durableMembershipState.durable_membership_updated_at, - latestSyncRunId: latestSync?.id ?? null, - latestEmbeddingRunId: latestEmbedding?.id ?? null, - latestClusterRunId: latestClusterRun?.id ?? null, - }; + return getTuiRepositoryRefreshState({ db: this.db, repository }); } getTuiClusterDetail(params: { owner: string; repo: string; clusterId: number; clusterRunId?: number }): TuiClusterDetail { diff --git a/packages/api-core/src/tui/repo-stats.ts b/packages/api-core/src/tui/repo-stats.ts index aa58e3f..802c7bb 100644 --- a/packages/api-core/src/tui/repo-stats.ts +++ b/packages/api-core/src/tui/repo-stats.ts @@ -2,7 +2,7 @@ import { getLatestClusterRun } from '../cluster/run-queries.js'; import type { GitcrawlConfig } from '../config.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import { getEmbeddingWorkset } from '../embedding/workset.js'; -import type { TuiRepoStats } from '../service-types.js'; +import type { TuiRefreshState, TuiRepoStats } from '../service-types.js'; export function getTuiRepoStats(params: { db: SqliteDatabase; config: GitcrawlConfig; repoId: number }): TuiRepoStats { const counts = params.db @@ -35,3 +35,59 @@ export function getTuiRepoStats(params: { db: SqliteDatabase; config: GitcrawlCo latestClusterRunFinishedAt: latestRun?.finished_at ?? null, }; } + +export function getTuiRepositoryRefreshState(params: { + db: SqliteDatabase; + repository: { id: number; updatedAt: string }; +}): TuiRefreshState { + const threadState = params.db + .prepare( + `select + max(updated_at) as thread_updated_at, + max(closed_at_local) as thread_closed_at + from threads + where repo_id = ?`, + ) + .get(params.repository.id) as { thread_updated_at: string | null; thread_closed_at: string | null }; + const clusterState = params.db + .prepare( + `select max(closed_at_local) as cluster_closed_at + from clusters + where repo_id = ?`, + ) + .get(params.repository.id) as { cluster_closed_at: string | null }; + const durableClusterState = params.db + .prepare( + `select max(updated_at) as durable_cluster_updated_at + from cluster_groups + where repo_id = ?`, + ) + .get(params.repository.id) as { durable_cluster_updated_at: string | null }; + const durableMembershipState = params.db + .prepare( + `select max(cm.updated_at) as durable_membership_updated_at + from cluster_memberships cm + join cluster_groups cg on cg.id = cm.cluster_id + where cg.repo_id = ?`, + ) + .get(params.repository.id) as { durable_membership_updated_at: string | null }; + const latestSync = params.db + .prepare("select id from sync_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(params.repository.id) as { id: number } | undefined; + const latestEmbedding = params.db + .prepare("select id from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(params.repository.id) as { id: number } | undefined; + const latestClusterRun = getLatestClusterRun(params.db, params.repository.id); + + return { + repositoryUpdatedAt: params.repository.updatedAt, + threadUpdatedAt: threadState.thread_updated_at, + threadClosedAt: threadState.thread_closed_at, + clusterClosedAt: clusterState.cluster_closed_at, + durableClusterUpdatedAt: durableClusterState.durable_cluster_updated_at, + durableMembershipUpdatedAt: durableMembershipState.durable_membership_updated_at, + latestSyncRunId: latestSync?.id ?? null, + latestEmbeddingRunId: latestEmbedding?.id ?? null, + latestClusterRunId: latestClusterRun?.id ?? null, + }; +} From 192589ed29e6bad7f0920a05f95a9c7c76d6991c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:03:25 -0700 Subject: [PATCH 178/215] refactor: extract tui cluster member query --- packages/api-core/src/service.ts | 63 +------------------- packages/api-core/src/tui/cluster-queries.ts | 63 +++++++++++++++++++- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 66b6d34..012d228 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -148,6 +148,7 @@ import { compareTuiClusterSummary } from './tui/cluster-format.js'; import { getDurableTuiClusterSummary, getRawTuiClusterSummary, + listTuiClusterMembers, listClosedDurableTuiClusters, listRawTuiClusters, } from './tui/cluster-queries.js'; @@ -3071,56 +3072,6 @@ export class GHCrawlService { throw new Error(`Cluster ${params.clusterId} was not found for ${repository.fullName}.`); } - const rows = summary - ? (this.db - .prepare( - `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative - from cluster_members cm - join threads t on t.id = cm.thread_id - where cm.cluster_id = ? - order by - case t.kind when 'issue' then 0 else 1 end asc, - coalesce(t.updated_at_gh, t.updated_at) desc, - t.number desc`, - ) - .all(params.clusterId) as Array<{ - id: number; - number: number; - kind: 'issue' | 'pull_request'; - state: string; - closed_at_local: string | null; - title: string; - updated_at_gh: string | null; - html_url: string; - labels_json: string; - score_to_representative: number | null; - }>) - : (this.db - .prepare( - `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative - from cluster_memberships cm - join threads t on t.id = cm.thread_id - where cm.cluster_id = ? - and cm.state <> 'removed_by_user' - order by - case cm.role when 'canonical' then 0 else 1 end asc, - case t.kind when 'issue' then 0 else 1 end asc, - coalesce(t.updated_at_gh, t.updated_at) desc, - t.number desc`, - ) - .all(params.clusterId) as Array<{ - id: number; - number: number; - kind: 'issue' | 'pull_request'; - state: string; - closed_at_local: string | null; - title: string; - updated_at_gh: string | null; - html_url: string; - labels_json: string; - score_to_representative: number | null; - }>); - return { clusterId: resolvedSummary.clusterId, displayTitle: resolvedSummary.displayTitle, @@ -3134,17 +3085,7 @@ export class GHCrawlService { representativeThreadId: resolvedSummary.representativeThreadId, representativeNumber: resolvedSummary.representativeNumber, representativeKind: resolvedSummary.representativeKind, - members: rows.map((row) => ({ - id: row.id, - number: row.number, - kind: row.kind, - isClosed: isEffectivelyClosed(row), - title: row.title, - updatedAtGh: row.updated_at_gh, - htmlUrl: row.html_url, - labels: parseArray(row.labels_json), - clusterScore: row.score_to_representative, - })), + members: listTuiClusterMembers(this.db, params.clusterId, summary ? 'run_cluster' : 'durable_cluster'), }; } diff --git a/packages/api-core/src/tui/cluster-queries.ts b/packages/api-core/src/tui/cluster-queries.ts index 79ff070..ad6db8d 100644 --- a/packages/api-core/src/tui/cluster-queries.ts +++ b/packages/api-core/src/tui/cluster-queries.ts @@ -1,6 +1,7 @@ import { humanKeyForValue } from '../cluster/human-key.js'; import type { SqliteDatabase } from '../db/sqlite.js'; -import type { DurableTuiClosure, TuiClusterSummary } from '../service-types.js'; +import type { DurableTuiClosure, TuiClusterDetail, TuiClusterSummary } from '../service-types.js'; +import { isEffectivelyClosed, parseArray } from '../service-utils.js'; import { clusterDisplayTitle, collapseOverlappingClosedDurableRows, @@ -285,6 +286,53 @@ export function getRawTuiClusterSummary( return rawTuiSummaryFromRow(repoId, row, durableClosure); } +export function listTuiClusterMembers( + db: SqliteDatabase, + clusterId: number, + source: 'run_cluster' | 'durable_cluster', +): TuiClusterDetail['members'] { + const rows = + source === 'run_cluster' + ? (db + .prepare( + `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative + from cluster_members cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + order by + case t.kind when 'issue' then 0 else 1 end asc, + coalesce(t.updated_at_gh, t.updated_at) desc, + t.number desc`, + ) + .all(clusterId) as TuiClusterMemberRow[]) + : (db + .prepare( + `select t.id, t.number, t.kind, t.state, t.closed_at_local, t.title, t.updated_at_gh, t.html_url, t.labels_json, cm.score_to_representative + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + and cm.state <> 'removed_by_user' + order by + case cm.role when 'canonical' then 0 else 1 end asc, + case t.kind when 'issue' then 0 else 1 end asc, + coalesce(t.updated_at_gh, t.updated_at) desc, + t.number desc`, + ) + .all(clusterId) as TuiClusterMemberRow[]); + + return rows.map((row) => ({ + id: row.id, + number: row.number, + kind: row.kind, + isClosed: isEffectivelyClosed(row), + title: row.title, + updatedAtGh: row.updated_at_gh, + htmlUrl: row.html_url, + labels: parseArray(row.labels_json), + clusterScore: row.score_to_representative, + })); +} + type DurableTuiClusterSummaryRow = { cluster_id: number; stable_slug: string; @@ -320,6 +368,19 @@ type RawTuiClusterSummaryRow = { search_text: string | null; }; +type TuiClusterMemberRow = { + id: number; + number: number; + kind: 'issue' | 'pull_request'; + state: string; + closed_at_local: string | null; + title: string; + updated_at_gh: string | null; + html_url: string; + labels_json: string; + score_to_representative: number | null; +}; + function rawTuiSummaryFromRow(repoId: number, row: RawTuiClusterSummaryRow, durableClosure: DurableTuiClosure | null): TuiClusterSummary { const clusterName = clusterHumanName(repoId, row.representative_thread_id, row.cluster_id); return { From 58f4e4f711d11022c60c4482bb3c2b230d3eadd8 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:05:13 -0700 Subject: [PATCH 179/215] refactor: extract tui thread detail queries --- packages/api-core/src/service.ts | 39 ++++++++------------ packages/api-core/src/tui/thread-detail.ts | 42 +++++++++++++++++++++- 2 files changed, 56 insertions(+), 25 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 012d228..a038d6e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -153,7 +153,13 @@ import { listRawTuiClusters, } from './tui/cluster-queries.js'; import { getTuiRepoStats, getTuiRepositoryRefreshState } from './tui/repo-stats.js'; -import { getLatestTuiKeySummary, getTopChangedFiles, getTuiThreadSummaries } from './tui/thread-detail.js'; +import { + getLatestTuiKeySummary, + getLatestTuiThreadClusterId, + getTopChangedFiles, + getTuiThreadRow, + getTuiThreadSummaries, +} from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, ACTIVE_EMBED_PIPELINE_VERSION, @@ -3097,33 +3103,18 @@ export class GHCrawlService { includeNeighbors?: boolean; }): TuiThreadDetail { const repository = this.requireRepository(params.owner, params.repo); - const row = params.threadId - ? ((this.db - .prepare('select * from threads where repo_id = ? and id = ? limit 1') - .get(repository.id, params.threadId) as ThreadRow | undefined) ?? null) - : params.threadNumber - ? ((this.db - .prepare('select * from threads where repo_id = ? and number = ? limit 1') - .get(repository.id, params.threadNumber) as ThreadRow | undefined) ?? null) - : null; + const row = getTuiThreadRow({ + db: this.db, + repoId: repository.id, + threadId: params.threadId, + threadNumber: params.threadNumber, + }); if (!row) { throw new Error(`Thread was not found for ${repository.fullName}.`); } - const latestRun = getLatestClusterRun(this.db, repository.id); - const clusterMembership = latestRun - ? ((this.db - .prepare( - `select cm.cluster_id - from cluster_members cm - join clusters c on c.id = cm.cluster_id - where c.cluster_run_id = ? and cm.thread_id = ? - limit 1`, - ) - .get(latestRun.id, row.id) as { cluster_id: number } | undefined) ?? null) - : null; - + const clusterId = getLatestTuiThreadClusterId(this.db, repository.id, row.id); const summaries = getTuiThreadSummaries(this.db, row.id, this.config.summaryModel); const topFiles = getTopChangedFiles(this.db, row.id, 5); const keySummary = getLatestTuiKeySummary(this.db, row.id, this.config.summaryModel); @@ -3147,7 +3138,7 @@ export class GHCrawlService { } return { - thread: threadToDto(row, clusterMembership?.cluster_id ?? null), + thread: threadToDto(row, clusterId), summaries, keySummary, topFiles, diff --git a/packages/api-core/src/tui/thread-detail.ts b/packages/api-core/src/tui/thread-detail.ts index 5eb13fd..7f66df4 100644 --- a/packages/api-core/src/tui/thread-detail.ts +++ b/packages/api-core/src/tui/thread-detail.ts @@ -1,8 +1,48 @@ +import { getLatestClusterRun } from '../cluster/run-queries.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import { SUMMARY_PROMPT_VERSION } from '../service-constants.js'; -import type { TuiThreadDetail } from '../service-types.js'; +import type { ThreadRow, TuiThreadDetail } from '../service-types.js'; import { normalizeKeySummaryDisplayText } from '../service-utils.js'; +export function getTuiThreadRow(params: { + db: SqliteDatabase; + repoId: number; + threadId?: number; + threadNumber?: number; +}): ThreadRow | null { + if (params.threadId) { + return ( + (params.db + .prepare('select * from threads where repo_id = ? and id = ? limit 1') + .get(params.repoId, params.threadId) as ThreadRow | undefined) ?? null + ); + } + if (params.threadNumber) { + return ( + (params.db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(params.repoId, params.threadNumber) as ThreadRow | undefined) ?? null + ); + } + return null; +} + +export function getLatestTuiThreadClusterId(db: SqliteDatabase, repoId: number, threadId: number): number | null { + const latestRun = getLatestClusterRun(db, repoId); + const clusterMembership = latestRun + ? ((db + .prepare( + `select cm.cluster_id + from cluster_members cm + join clusters c on c.id = cm.cluster_id + where c.cluster_run_id = ? and cm.thread_id = ? + limit 1`, + ) + .get(latestRun.id, threadId) as { cluster_id: number } | undefined) ?? null) + : null; + return clusterMembership?.cluster_id ?? null; +} + export function getTuiThreadSummaries(db: SqliteDatabase, threadId: number, summaryModel: string): TuiThreadDetail['summaries'] { const rows = db .prepare( From ce7f2e83240e280216734ac8043eb0b465356803 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:08:10 -0700 Subject: [PATCH 180/215] fix: default cluster min size to 5 --- README.md | 8 ++++---- apps/cli/README.md | 8 ++++---- apps/cli/src/main.ts | 2 +- apps/cli/src/tui/state.test.ts | 4 ++++ apps/cli/src/tui/state.ts | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3d581c6..96cc728 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ For agent-facing and script-facing commands, prefer explicit machine mode: ghcrawl configure --json ghcrawl doctor --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --sort recent --json ``` Contract notes: @@ -206,8 +206,8 @@ ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --json -ghcrawl clusters owner/repo --min-size 10 --hide-closed --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --json +ghcrawl clusters owner/repo --min-size 5 --hide-closed --json ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json ghcrawl cluster-detail owner/repo --id 123 --hide-closed --json @@ -316,7 +316,7 @@ ghcrawl refresh owner/repo ghcrawl optimize owner/repo --json ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ``` diff --git a/apps/cli/README.md b/apps/cli/README.md index 15dc0ac..f61c86a 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -96,7 +96,7 @@ For agent-facing and script-facing commands, prefer explicit machine mode: ghcrawl configure --json ghcrawl doctor --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --sort recent --json ``` Contract notes: @@ -208,8 +208,8 @@ ghcrawl threads owner/repo --numbers 42,43,44 --json ghcrawl threads owner/repo --numbers 42,43,44 --include-closed --json ghcrawl close-thread owner/repo --number 42 --json ghcrawl close-cluster owner/repo --id 123 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --include-closed --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --include-closed --json ghcrawl durable-clusters owner/repo --member-limit 10 --json ghcrawl cluster-detail owner/repo --id 123 --json ghcrawl cluster-detail owner/repo --id 123 --include-closed --json @@ -286,7 +286,7 @@ ghcrawl doctor --json ghcrawl refresh owner/repo ghcrawl runs owner/repo --limit 20 --json ghcrawl threads owner/repo --numbers 42,43,44 --json -ghcrawl clusters owner/repo --min-size 10 --limit 20 --sort recent --json +ghcrawl clusters owner/repo --min-size 5 --limit 20 --sort recent --json ghcrawl cluster-detail owner/repo --id 123 --member-limit 20 --body-chars 280 --json ghcrawl cluster-explain owner/repo --id 123 --member-limit 20 --event-limit 50 --json ``` diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 5f017b0..61d8f68 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -378,7 +378,7 @@ const COMMAND_SPECS: readonly CommandSpec[] = [ '--hide-closed Hide locally closed clusters', '--json Emit machine-readable JSON output explicitly', ], - examples: ['ghcrawl clusters openclaw/openclaw --min-size 10 --limit 20 --sort recent --json'], + examples: ['ghcrawl clusters openclaw/openclaw --min-size 5 --limit 20 --sort recent --json'], agentJson: true, }, { diff --git a/apps/cli/src/tui/state.test.ts b/apps/cli/src/tui/state.test.ts index 49df505..e012571 100644 --- a/apps/cli/src/tui/state.test.ts +++ b/apps/cli/src/tui/state.test.ts @@ -32,6 +32,10 @@ test('cycleMinSizeFilter rotates through presets', () => { assert.equal(cycleMinSizeFilter(2), 5); }); +test('cycleMinSizeFilter falls back to the default 5+ view', () => { + assert.equal(cycleMinSizeFilter(99 as never), 5); +}); + test('cycleMemberSortMode rotates through member sort modes', () => { assert.equal(cycleMemberSortMode('kind'), 'recent'); assert.equal(cycleMemberSortMode('recent'), 'number'); diff --git a/apps/cli/src/tui/state.ts b/apps/cli/src/tui/state.ts index a698fa0..f555644 100644 --- a/apps/cli/src/tui/state.ts +++ b/apps/cli/src/tui/state.ts @@ -27,7 +27,7 @@ export function cycleSortMode(current: TuiClusterSortMode): TuiClusterSortMode { export function cycleMinSizeFilter(current: TuiMinSizeFilter): TuiMinSizeFilter { const index = MIN_SIZE_FILTER_ORDER.indexOf(current); - return MIN_SIZE_FILTER_ORDER[(index + 1) % MIN_SIZE_FILTER_ORDER.length] ?? 10; + return MIN_SIZE_FILTER_ORDER[(index + 1) % MIN_SIZE_FILTER_ORDER.length] ?? 5; } export function cycleMemberSortMode(current: TuiMemberSortMode): TuiMemberSortMode { From e0dee18130596c9f520a3fbcf79a97fd0acaefb3 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:08:38 -0700 Subject: [PATCH 181/215] refactor: extract sync comment hydration --- packages/api-core/src/service.ts | 84 +++----------------------- packages/api-core/src/sync/comments.ts | 81 +++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 74 deletions(-) create mode 100644 packages/api-core/src/sync/comments.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index a038d6e..d96fc3f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -106,7 +106,7 @@ import { import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; -import { buildCanonicalDocument, isBotLikeAuthor } from './documents/normalize.js'; +import { buildCanonicalDocument } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; import { loadClusterableActiveVectorMeta, loadClusterableThreadMeta, loadNormalizedActiveVectors } from './embedding/clusterable.js'; @@ -142,6 +142,7 @@ import { import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; +import { fetchThreadComments } from './sync/comments.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { compareTuiClusterSummary } from './tui/cluster-format.js'; @@ -1041,7 +1042,14 @@ export class GHCrawlService { codeFilesSynced += files.length; } if (includeComments && !threadIsClosed) { - const comments = await this.fetchThreadComments(params.owner, params.repo, number, isPr, reporter); + const comments = await fetchThreadComments({ + github, + owner: params.owner, + repo: params.repo, + number, + isPr, + reporter, + }); this.replaceComments(threadId, comments); commentsSynced += comments.length; } @@ -3286,78 +3294,6 @@ export class GHCrawlService { return durableClusterId; } - private async fetchThreadComments( - owner: string, - repo: string, - number: number, - isPr: boolean, - reporter?: (message: string) => void, - ): Promise { - const github = this.requireGithub(); - const comments: CommentSeed[] = []; - - const issueComments = await github.listIssueComments(owner, repo, number, reporter); - comments.push( - ...issueComments.map((comment) => { - const authorLogin = userLogin(comment); - const authorType = userType(comment); - return { - githubId: String(comment.id), - commentType: 'issue_comment', - authorLogin, - authorType, - body: String(comment.body ?? ''), - isBot: isBotLikeAuthor({ authorLogin, authorType }), - rawJson: asJson(comment), - createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, - updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, - }; - }), - ); - - if (isPr) { - const reviews = await github.listPullReviews(owner, repo, number, reporter); - comments.push( - ...reviews.map((review) => { - const authorLogin = userLogin(review); - const authorType = userType(review); - return { - githubId: String(review.id), - commentType: 'review', - authorLogin, - authorType, - body: String(review.body ?? review.state ?? ''), - isBot: isBotLikeAuthor({ authorLogin, authorType }), - rawJson: asJson(review), - createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, - updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, - }; - }), - ); - - const reviewComments = await github.listPullReviewComments(owner, repo, number, reporter); - comments.push( - ...reviewComments.map((comment) => { - const authorLogin = userLogin(comment); - const authorType = userType(comment); - return { - githubId: String(comment.id), - commentType: 'review_comment', - authorLogin, - authorType, - body: String(comment.body ?? ''), - isBot: isBotLikeAuthor({ authorLogin, authorType }), - rawJson: asJson(comment), - createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, - updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, - }; - }), - ); - } - - return comments; - } - private requireAi(): AiProvider { if (!this.ai) { requireOpenAiKey(this.config); diff --git a/packages/api-core/src/sync/comments.ts b/packages/api-core/src/sync/comments.ts new file mode 100644 index 0000000..e9326d0 --- /dev/null +++ b/packages/api-core/src/sync/comments.ts @@ -0,0 +1,81 @@ +import { isBotLikeAuthor } from '../documents/normalize.js'; +import type { GitHubClient, GitHubReporter } from '../github/client.js'; +import type { CommentSeed } from '../service-types.js'; +import { asJson, userLogin, userType } from '../service-utils.js'; + +export async function fetchThreadComments(params: { + github: GitHubClient; + owner: string; + repo: string; + number: number; + isPr: boolean; + reporter?: GitHubReporter; +}): Promise { + const comments: CommentSeed[] = []; + + const issueComments = await params.github.listIssueComments(params.owner, params.repo, params.number, params.reporter); + comments.push( + ...issueComments.map((comment) => { + const authorLogin = userLogin(comment); + const authorType = userType(comment); + return { + githubId: String(comment.id), + commentType: 'issue_comment', + authorLogin, + authorType, + body: String(comment.body ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(comment), + createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, + updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, + }; + }), + ); + + if (params.isPr) { + const reviews = await params.github.listPullReviews(params.owner, params.repo, params.number, params.reporter); + comments.push( + ...reviews.map((review) => { + const authorLogin = userLogin(review); + const authorType = userType(review); + return { + githubId: String(review.id), + commentType: 'review', + authorLogin, + authorType, + body: String(review.body ?? review.state ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(review), + createdAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, + updatedAtGh: typeof review.submitted_at === 'string' ? review.submitted_at : null, + }; + }), + ); + + const reviewComments = await params.github.listPullReviewComments( + params.owner, + params.repo, + params.number, + params.reporter, + ); + comments.push( + ...reviewComments.map((comment) => { + const authorLogin = userLogin(comment); + const authorType = userType(comment); + return { + githubId: String(comment.id), + commentType: 'review_comment', + authorLogin, + authorType, + body: String(comment.body ?? ''), + isBot: isBotLikeAuthor({ authorLogin, authorType }), + rawJson: asJson(comment), + createdAtGh: typeof comment.created_at === 'string' ? comment.created_at : null, + updatedAtGh: typeof comment.updated_at === 'string' ? comment.updated_at : null, + }; + }), + ); + } + + return comments; +} From f8f4bad3153fd8e39134436a9b4f43beb8928e50 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:12:27 -0700 Subject: [PATCH 182/215] refactor: extract sync persistence helpers --- packages/api-core/src/service.ts | 138 ++++------------------ packages/api-core/src/sync/persistence.ts | 131 ++++++++++++++++++++ 2 files changed, 153 insertions(+), 116 deletions(-) create mode 100644 packages/api-core/src/sync/persistence.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index d96fc3f..ed2788f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -63,7 +63,6 @@ import { import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from './cluster/build.js'; import { reconcileClusterCloseState } from './cluster/close-state.js'; -import { buildCodeSnapshotSignature } from './cluster/code-signature.js'; import { buildDeterministicClusterGraphFromFingerprints } from './cluster/deterministic-engine.js'; import { loadDeterministicClusterableThreadMeta } from './cluster/deterministic-thread-loader.js'; import { @@ -91,7 +90,6 @@ import { upsertClusterMembership, upsertSimilarityEdgeEvidence, upsertThreadRevision, - upsertThreadCodeSnapshot, upsertThreadKeySummary, } from './cluster/persistent-store.js'; import { @@ -105,7 +103,7 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; -import { blobStoreRoot, rawJsonStorage } from './db/raw-json-store.js'; +import { rawJsonStorage } from './db/raw-json-store.js'; import { buildCanonicalDocument } from './documents/normalize.js'; import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; @@ -144,6 +142,7 @@ import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighbor import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { fetchThreadComments } from './sync/comments.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; +import { persistThreadCodeSnapshot, upsertRepository, upsertThread } from './sync/persistence.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { compareTuiClusterSummary } from './tui/cluster-format.js'; import { @@ -216,16 +215,12 @@ import { isPullRequestPayload, nowIso, parseArray, - parseAssignees, parseIso, - parseLabels, parseObjectJson, repositoryToDto, snippetText, stableContentHash, threadToDto, - userLogin, - userType, } from './service-utils.js'; import type { VectorNeighbor, VectorQueryParams, VectorStore } from './vector/store.js'; import { getVectorliteClusterQuery, normalizedDistanceToScore, normalizedEmbeddingBuffer, parseStoredVector, vectorBlob } from './vector/encoding.js'; @@ -974,7 +969,12 @@ export class GHCrawlService { params.onProgress?.(`[sync] fetching repository metadata for ${params.owner}/${params.repo}`); const reporter = params.onProgress ? (message: string) => params.onProgress?.(message.replace(/^\[github\]/, '[sync/github]')) : undefined; const repoData = await github.getRepo(params.owner, params.repo, reporter); - const repoId = this.upsertRepository(params.owner, params.repo, repoData); + const repoId = upsertRepository({ + db: this.db, + owner: params.owner, + repo: params.repo, + payload: repoData, + }); const runId = startServiceRun(this.db, 'sync_runs', repoId, `${params.owner}/${params.repo}`); const syncCursor = getSyncCursorState(this.db, repoId); const overlapReferenceAt = syncCursor.lastOverlappingOpenScanCompletedAt ?? syncCursor.lastFullOpenScanStartedAt; @@ -1030,7 +1030,13 @@ export class GHCrawlService { const shouldFetchPullPayload = isPr && includeCode && !itemIsClosed; const threadPayload = shouldFetchPullPayload ? await github.getPull(params.owner, params.repo, number, reporter) : item; const threadIsClosed = isClosedGitHubPayload(threadPayload); - const threadId = this.upsertThread(repoId, kind, threadPayload, crawlStartedAt); + const threadId = upsertThread({ + db: this.db, + repoId, + kind, + payload: threadPayload, + pulledAt: crawlStartedAt, + }); if (threadIsClosed && (includeComments || includeCode)) { params.onProgress?.( `[sync] ${kind} #${number} is closed; metadata-only update, skipping comment/code hydration and fingerprint refresh`, @@ -1038,7 +1044,13 @@ export class GHCrawlService { } if (includeCode && isPr && !threadIsClosed) { const files = await github.listPullFiles(params.owner, params.repo, number, reporter); - this.persistThreadCodeSnapshot(threadId, threadPayload, files); + persistThreadCodeSnapshot({ + db: this.db, + dbPath: this.config.dbPath, + threadId, + threadPayload, + files, + }); codeFilesSynced += files.length; } if (includeComments && !threadIsClosed) { @@ -3317,112 +3329,6 @@ export class GHCrawlService { return repositoryToDto(row); } - private upsertRepository(owner: string, repo: string, payload: Record): number { - const fullName = `${owner}/${repo}`; - this.db - .prepare( - `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at) - values (?, ?, ?, ?, ?, ?) - on conflict(full_name) do update set - github_repo_id = excluded.github_repo_id, - raw_json = excluded.raw_json, - updated_at = excluded.updated_at`, - ) - .run(owner, repo, fullName, payload.id ? String(payload.id) : null, asJson(payload), nowIso()); - const row = this.db.prepare('select id from repositories where full_name = ?').get(fullName) as { id: number }; - return row.id; - } - - private upsertThread( - repoId: number, - kind: 'issue' | 'pull_request', - payload: Record, - pulledAt: string, - ): number { - const title = String(payload.title ?? `#${payload.number}`); - const body = typeof payload.body === 'string' ? payload.body : null; - const labels = parseLabels(payload); - const assignees = parseAssignees(payload); - const contentHash = stableContentHash(`${title}\n${body ?? ''}`); - this.db - .prepare( - `insert into threads ( - repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, - labels_json, assignees_json, raw_json, content_hash, is_draft, - created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - on conflict(repo_id, kind, number) do update set - github_id = excluded.github_id, - state = excluded.state, - title = excluded.title, - body = excluded.body, - author_login = excluded.author_login, - author_type = excluded.author_type, - html_url = excluded.html_url, - labels_json = excluded.labels_json, - assignees_json = excluded.assignees_json, - raw_json = excluded.raw_json, - content_hash = excluded.content_hash, - is_draft = excluded.is_draft, - created_at_gh = excluded.created_at_gh, - updated_at_gh = excluded.updated_at_gh, - closed_at_gh = excluded.closed_at_gh, - merged_at_gh = excluded.merged_at_gh, - last_pulled_at = excluded.last_pulled_at, - updated_at = excluded.updated_at`, - ) - .run( - repoId, - String(payload.id), - Number(payload.number), - kind, - String(payload.state ?? 'open'), - title, - body, - userLogin(payload), - userType(payload), - String(payload.html_url), - asJson(labels), - asJson(assignees), - asJson(payload), - contentHash, - payload.draft ? 1 : 0, - typeof payload.created_at === 'string' ? payload.created_at : null, - typeof payload.updated_at === 'string' ? payload.updated_at : null, - typeof payload.closed_at === 'string' ? payload.closed_at : null, - typeof payload.merged_at === 'string' ? payload.merged_at : null, - pulledAt, - pulledAt, - nowIso(), - ); - const row = this.db - .prepare('select id from threads where repo_id = ? and kind = ? and number = ?') - .get(repoId, kind, Number(payload.number)) as { id: number }; - return row.id; - } - - private persistThreadCodeSnapshot(threadId: number, threadPayload: Record, files: Array>): void { - const title = String(threadPayload.title ?? `#${threadPayload.number}`); - const body = typeof threadPayload.body === 'string' ? threadPayload.body : null; - const revisionId = upsertThreadRevision(this.db, { - threadId, - sourceUpdatedAt: typeof threadPayload.updated_at === 'string' ? threadPayload.updated_at : null, - title, - body, - labels: parseLabels(threadPayload), - rawJson: asJson(threadPayload), - }); - const base = threadPayload.base as Record | undefined; - const head = threadPayload.head as Record | undefined; - upsertThreadCodeSnapshot(this.db, { - threadRevisionId: revisionId, - baseSha: typeof base?.sha === 'string' ? base.sha : null, - headSha: typeof head?.sha === 'string' ? head.sha : null, - signature: buildCodeSnapshotSignature(files), - storeRoot: blobStoreRoot(this.config.dbPath), - }); - } - private async applyClosedOverlapSweep(params: { repoId: number; owner: string; diff --git a/packages/api-core/src/sync/persistence.ts b/packages/api-core/src/sync/persistence.ts new file mode 100644 index 0000000..726919e --- /dev/null +++ b/packages/api-core/src/sync/persistence.ts @@ -0,0 +1,131 @@ +import { buildCodeSnapshotSignature } from '../cluster/code-signature.js'; +import { upsertThreadCodeSnapshot, upsertThreadRevision } from '../cluster/persistent-store.js'; +import { blobStoreRoot } from '../db/raw-json-store.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import { + asJson, + nowIso, + parseAssignees, + parseLabels, + stableContentHash, + userLogin, + userType, +} from '../service-utils.js'; + +export function upsertRepository(params: { + db: SqliteDatabase; + owner: string; + repo: string; + payload: Record; +}): number { + const fullName = `${params.owner}/${params.repo}`; + params.db + .prepare( + `insert into repositories (owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?) + on conflict(full_name) do update set + github_repo_id = excluded.github_repo_id, + raw_json = excluded.raw_json, + updated_at = excluded.updated_at`, + ) + .run(params.owner, params.repo, fullName, params.payload.id ? String(params.payload.id) : null, asJson(params.payload), nowIso()); + const row = params.db.prepare('select id from repositories where full_name = ?').get(fullName) as { id: number }; + return row.id; +} + +export function upsertThread(params: { + db: SqliteDatabase; + repoId: number; + kind: 'issue' | 'pull_request'; + payload: Record; + pulledAt: string; +}): number { + const title = String(params.payload.title ?? `#${params.payload.number}`); + const body = typeof params.payload.body === 'string' ? params.payload.body : null; + const labels = parseLabels(params.payload); + const assignees = parseAssignees(params.payload); + const contentHash = stableContentHash(`${title}\n${body ?? ''}`); + params.db + .prepare( + `insert into threads ( + repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, + created_at_gh, updated_at_gh, closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + on conflict(repo_id, kind, number) do update set + github_id = excluded.github_id, + state = excluded.state, + title = excluded.title, + body = excluded.body, + author_login = excluded.author_login, + author_type = excluded.author_type, + html_url = excluded.html_url, + labels_json = excluded.labels_json, + assignees_json = excluded.assignees_json, + raw_json = excluded.raw_json, + content_hash = excluded.content_hash, + is_draft = excluded.is_draft, + created_at_gh = excluded.created_at_gh, + updated_at_gh = excluded.updated_at_gh, + closed_at_gh = excluded.closed_at_gh, + merged_at_gh = excluded.merged_at_gh, + last_pulled_at = excluded.last_pulled_at, + updated_at = excluded.updated_at`, + ) + .run( + params.repoId, + String(params.payload.id), + Number(params.payload.number), + params.kind, + String(params.payload.state ?? 'open'), + title, + body, + userLogin(params.payload), + userType(params.payload), + String(params.payload.html_url), + asJson(labels), + asJson(assignees), + asJson(params.payload), + contentHash, + params.payload.draft ? 1 : 0, + typeof params.payload.created_at === 'string' ? params.payload.created_at : null, + typeof params.payload.updated_at === 'string' ? params.payload.updated_at : null, + typeof params.payload.closed_at === 'string' ? params.payload.closed_at : null, + typeof params.payload.merged_at === 'string' ? params.payload.merged_at : null, + params.pulledAt, + params.pulledAt, + nowIso(), + ); + const row = params.db + .prepare('select id from threads where repo_id = ? and kind = ? and number = ?') + .get(params.repoId, params.kind, Number(params.payload.number)) as { id: number }; + return row.id; +} + +export function persistThreadCodeSnapshot(params: { + db: SqliteDatabase; + dbPath: string; + threadId: number; + threadPayload: Record; + files: Array>; +}): void { + const title = String(params.threadPayload.title ?? `#${params.threadPayload.number}`); + const body = typeof params.threadPayload.body === 'string' ? params.threadPayload.body : null; + const revisionId = upsertThreadRevision(params.db, { + threadId: params.threadId, + sourceUpdatedAt: typeof params.threadPayload.updated_at === 'string' ? params.threadPayload.updated_at : null, + title, + body, + labels: parseLabels(params.threadPayload), + rawJson: asJson(params.threadPayload), + }); + const base = params.threadPayload.base as Record | undefined; + const head = params.threadPayload.head as Record | undefined; + upsertThreadCodeSnapshot(params.db, { + threadRevisionId: revisionId, + baseSha: typeof base?.sha === 'string' ? base.sha : null, + headSha: typeof head?.sha === 'string' ? head.sha : null, + signature: buildCodeSnapshotSignature(params.files), + storeRoot: blobStoreRoot(params.dbPath), + }); +} From 7fe60a0c1c1bb6a8268830c9f17a5ae4dd8d1d6c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:16:00 -0700 Subject: [PATCH 183/215] refactor: extract sync reconciliation --- packages/api-core/src/service.ts | 219 ++---------------------- packages/api-core/src/sync/reconcile.ts | 212 +++++++++++++++++++++++ 2 files changed, 223 insertions(+), 208 deletions(-) create mode 100644 packages/api-core/src/sync/reconcile.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index ed2788f..90f1aae 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -143,6 +143,7 @@ import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-mainte import { fetchThreadComments } from './sync/comments.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { persistThreadCodeSnapshot, upsertRepository, upsertThread } from './sync/persistence.js'; +import { applyClosedOverlapSweep, countStaleOpenThreads, reconcileMissingOpenThreads } from './sync/reconcile.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { compareTuiClusterSummary } from './tui/cluster-format.js'; import { @@ -178,7 +179,6 @@ import { MAX_DIRECT_RECONCILE_THREADS, requireFromHere, STALE_CLOSED_BACKFILL_LIMIT, - STALE_CLOSED_SWEEP_LIMIT, SUMMARY_MODEL_PRICING, SUMMARY_PROMPT_VERSION, SYNC_BATCH_DELAY_MS, @@ -211,7 +211,6 @@ import { deriveIncrementalSince, isClosedGitHubPayload, isEffectivelyClosed, - isMissingGitHubResourceError, isPullRequestPayload, nowIso, parseArray, @@ -1081,7 +1080,9 @@ export class GHCrawlService { params.onProgress?.('[sync] skipping closed overlap sweep because this scan has no overlap window'); } const threadsClosedFromClosedSweep = shouldSweepClosedOverlap - ? await this.applyClosedOverlapSweep({ + ? await applyClosedOverlapSweep({ + db: this.db, + github, repoId, owner: params.owner, repo: params.repo, @@ -1093,7 +1094,9 @@ export class GHCrawlService { : 0; const canFullReconcile = params.fullReconcile === true && params.limit === undefined && (isFullOpenScan || isOverlappingOpenScan); const threadsClosedFromClosedBackfill = canFullReconcile - ? await this.applyClosedOverlapSweep({ + ? await applyClosedOverlapSweep({ + db: this.db, + github, repoId, owner: params.owner, repo: params.repo, @@ -1106,7 +1109,7 @@ export class GHCrawlService { }) : 0; const staleOpenThreadCountForDirectReconcile = canFullReconcile - ? this.countStaleOpenThreads(repoId, crawlStartedAt) + ? countStaleOpenThreads(this.db, repoId, crawlStartedAt) : 0; const shouldReconcileMissingOpenThreads = canFullReconcile && staleOpenThreadCountForDirectReconcile <= MAX_DIRECT_RECONCILE_THREADS; @@ -1120,7 +1123,9 @@ export class GHCrawlService { ); } const threadsClosedFromDirectReconcile = shouldReconcileMissingOpenThreads - ? await this.reconcileMissingOpenThreads({ + ? await reconcileMissingOpenThreads({ + db: this.db, + github, repoId, owner: params.owner, repo: params.repo, @@ -3329,208 +3334,6 @@ export class GHCrawlService { return repositoryToDto(row); } - private async applyClosedOverlapSweep(params: { - repoId: number; - owner: string; - repo: string; - crawlStartedAt: string; - closedSweepSince?: string; - closedSweepLimit?: number; - sweepLabel?: string; - reporter?: (message: string) => void; - onProgress?: (message: string) => void; - }): Promise { - const staleRows = this.db - .prepare( - `select id, number, kind - from threads - where repo_id = ? - and state = 'open' - and closed_at_local is null - and (last_pulled_at is null or last_pulled_at < ?) - order by number asc`, - ) - .all(params.repoId, params.crawlStartedAt) as Array<{ id: number; number: number; kind: 'issue' | 'pull_request' }>; - - if (staleRows.length === 0) { - return 0; - } - - const sweepLabel = params.sweepLabel ?? 'recent closed sweep'; - const sweepWindow = params.closedSweepSince - ? `since ${params.closedSweepSince}` - : `from the latest ${params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT} closed items`; - params.onProgress?.(`[sync] ${sweepLabel}: scanning ${staleRows.length} unseen previously-open thread(s) against closed items ${sweepWindow}`); - - const github = this.requireGithub(); - const staleByNumber = new Map( - staleRows.map((row) => [row.number, row]), - ); - const recentlyClosed = await github.listRepositoryIssues( - params.owner, - params.repo, - params.closedSweepSince, - params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT, - params.reporter, - 'closed', - ); - - let threadsClosed = 0; - for (const payload of recentlyClosed) { - const number = Number(payload.number); - const staleRow = staleByNumber.get(number); - if (!staleRow) continue; - const state = String(payload.state ?? 'closed'); - if (state === 'open') continue; - const pulledAt = nowIso(); - this.db - .prepare( - `update threads - set state = ?, - raw_json = ?, - updated_at_gh = ?, - closed_at_gh = ?, - merged_at_gh = ?, - last_pulled_at = ?, - updated_at = ? - where id = ?`, - ) - .run( - state, - asJson(payload), - typeof payload.updated_at === 'string' ? payload.updated_at : null, - typeof payload.closed_at === 'string' ? payload.closed_at : null, - typeof payload.merged_at === 'string' ? payload.merged_at : null, - pulledAt, - pulledAt, - staleRow.id, - ); - staleByNumber.delete(number); - threadsClosed += 1; - } - - params.onProgress?.(`[sync] ${sweepLabel} matched ${threadsClosed} stale thread(s); ${staleByNumber.size} remain open locally`); - - return threadsClosed; - } - - private countStaleOpenThreads(repoId: number, crawlStartedAt: string): number { - const row = this.db - .prepare( - `select count(*) as count - from threads - where repo_id = ? - and state = 'open' - and closed_at_local is null - and (last_pulled_at is null or last_pulled_at < ?)`, - ) - .get(repoId, crawlStartedAt) as { count: number }; - return row.count; - } - - private async reconcileMissingOpenThreads(params: { - repoId: number; - owner: string; - repo: string; - crawlStartedAt: string; - reporter?: (message: string) => void; - onProgress?: (message: string) => void; - }): Promise { - const github = this.requireGithub(); - const staleRows = this.db - .prepare( - `select id, number, kind - from threads - where repo_id = ? - and state = 'open' - and closed_at_local is null - and (last_pulled_at is null or last_pulled_at < ?) - order by number asc`, - ) - .all(params.repoId, params.crawlStartedAt) as Array<{ id: number; number: number; kind: 'issue' | 'pull_request' }>; - - if (staleRows.length === 0) { - return 0; - } - - params.onProgress?.( - `[sync] full reconciliation requested; directly checking ${staleRows.length} previously-open thread(s) not seen in the open crawl`, - ); - - let threadsClosed = 0; - for (const [index, row] of staleRows.entries()) { - if (index > 0 && index % SYNC_BATCH_SIZE === 0) { - params.onProgress?.(`[sync] stale reconciliation batch boundary reached at ${index} threads; sleeping 5s before continuing`); - await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS)); - } - params.onProgress?.(`[sync] reconciling stale ${row.kind} #${row.number}`); - const pulledAt = nowIso(); - let payload: Record | null = null; - let state = 'closed'; - - try { - payload = - row.kind === 'pull_request' - ? await github.getPull(params.owner, params.repo, row.number, params.reporter) - : await github.getIssue(params.owner, params.repo, row.number, params.reporter); - state = String(payload.state ?? 'open'); - } catch (error) { - if (!isMissingGitHubResourceError(error)) { - throw error; - } - params.onProgress?.( - `[sync] stale ${row.kind} #${row.number} is missing on GitHub; marking it closed locally and continuing`, - ); - } - - if (payload) { - this.db - .prepare( - `update threads - set state = ?, - raw_json = ?, - updated_at_gh = ?, - closed_at_gh = ?, - merged_at_gh = ?, - last_pulled_at = ?, - updated_at = ? - where id = ?`, - ) - .run( - state, - asJson(payload), - typeof payload.updated_at === 'string' ? payload.updated_at : null, - typeof payload.closed_at === 'string' ? payload.closed_at : null, - typeof payload.merged_at === 'string' ? payload.merged_at : null, - pulledAt, - pulledAt, - row.id, - ); - } else { - this.db - .prepare( - `update threads - set state = 'closed', - closed_at_gh = coalesce(closed_at_gh, ?), - last_pulled_at = ?, - updated_at = ? - where id = ?`, - ) - .run(pulledAt, pulledAt, pulledAt, row.id); - } - - if (state !== 'open') { - threadsClosed += 1; - } - } - - if (threadsClosed > 0) { - params.onProgress?.(`[sync] marked ${threadsClosed} stale thread(s) as closed after GitHub confirmation`); - } - - return threadsClosed; - } - private replaceComments(threadId: number, comments: CommentSeed[]): void { const insert = this.db.prepare( `insert into comments ( diff --git a/packages/api-core/src/sync/reconcile.ts b/packages/api-core/src/sync/reconcile.ts new file mode 100644 index 0000000..53b10d6 --- /dev/null +++ b/packages/api-core/src/sync/reconcile.ts @@ -0,0 +1,212 @@ +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { GitHubClient, GitHubReporter } from '../github/client.js'; +import { STALE_CLOSED_SWEEP_LIMIT, SYNC_BATCH_DELAY_MS, SYNC_BATCH_SIZE } from '../service-constants.js'; +import { asJson, isMissingGitHubResourceError, nowIso } from '../service-utils.js'; + +type StaleThreadRow = { + id: number; + number: number; + kind: 'issue' | 'pull_request'; +}; + +export async function applyClosedOverlapSweep(params: { + db: SqliteDatabase; + github: GitHubClient; + repoId: number; + owner: string; + repo: string; + crawlStartedAt: string; + closedSweepSince?: string; + closedSweepLimit?: number; + sweepLabel?: string; + reporter?: GitHubReporter; + onProgress?: (message: string) => void; +}): Promise { + const staleRows = params.db + .prepare( + `select id, number, kind + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + and (last_pulled_at is null or last_pulled_at < ?) + order by number asc`, + ) + .all(params.repoId, params.crawlStartedAt) as StaleThreadRow[]; + + if (staleRows.length === 0) { + return 0; + } + + const sweepLabel = params.sweepLabel ?? 'recent closed sweep'; + const sweepWindow = params.closedSweepSince + ? `since ${params.closedSweepSince}` + : `from the latest ${params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT} closed items`; + params.onProgress?.(`[sync] ${sweepLabel}: scanning ${staleRows.length} unseen previously-open thread(s) against closed items ${sweepWindow}`); + + const staleByNumber = new Map(staleRows.map((row) => [row.number, row])); + const recentlyClosed = await params.github.listRepositoryIssues( + params.owner, + params.repo, + params.closedSweepSince, + params.closedSweepLimit ?? STALE_CLOSED_SWEEP_LIMIT, + params.reporter, + 'closed', + ); + + let threadsClosed = 0; + for (const payload of recentlyClosed) { + const number = Number(payload.number); + const staleRow = staleByNumber.get(number); + if (!staleRow) continue; + const state = String(payload.state ?? 'closed'); + if (state === 'open') continue; + const pulledAt = nowIso(); + params.db + .prepare( + `update threads + set state = ?, + raw_json = ?, + updated_at_gh = ?, + closed_at_gh = ?, + merged_at_gh = ?, + last_pulled_at = ?, + updated_at = ? + where id = ?`, + ) + .run( + state, + asJson(payload), + typeof payload.updated_at === 'string' ? payload.updated_at : null, + typeof payload.closed_at === 'string' ? payload.closed_at : null, + typeof payload.merged_at === 'string' ? payload.merged_at : null, + pulledAt, + pulledAt, + staleRow.id, + ); + staleByNumber.delete(number); + threadsClosed += 1; + } + + params.onProgress?.(`[sync] ${sweepLabel} matched ${threadsClosed} stale thread(s); ${staleByNumber.size} remain open locally`); + + return threadsClosed; +} + +export function countStaleOpenThreads(db: SqliteDatabase, repoId: number, crawlStartedAt: string): number { + const row = db + .prepare( + `select count(*) as count + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + and (last_pulled_at is null or last_pulled_at < ?)`, + ) + .get(repoId, crawlStartedAt) as { count: number }; + return row.count; +} + +export async function reconcileMissingOpenThreads(params: { + db: SqliteDatabase; + github: GitHubClient; + repoId: number; + owner: string; + repo: string; + crawlStartedAt: string; + reporter?: GitHubReporter; + onProgress?: (message: string) => void; +}): Promise { + const staleRows = params.db + .prepare( + `select id, number, kind + from threads + where repo_id = ? + and state = 'open' + and closed_at_local is null + and (last_pulled_at is null or last_pulled_at < ?) + order by number asc`, + ) + .all(params.repoId, params.crawlStartedAt) as StaleThreadRow[]; + + if (staleRows.length === 0) { + return 0; + } + + params.onProgress?.( + `[sync] full reconciliation requested; directly checking ${staleRows.length} previously-open thread(s) not seen in the open crawl`, + ); + + let threadsClosed = 0; + for (const [index, row] of staleRows.entries()) { + if (index > 0 && index % SYNC_BATCH_SIZE === 0) { + params.onProgress?.(`[sync] stale reconciliation batch boundary reached at ${index} threads; sleeping 5s before continuing`); + await new Promise((resolve) => setTimeout(resolve, SYNC_BATCH_DELAY_MS)); + } + params.onProgress?.(`[sync] reconciling stale ${row.kind} #${row.number}`); + const pulledAt = nowIso(); + let payload: Record | null = null; + let state = 'closed'; + + try { + payload = + row.kind === 'pull_request' + ? await params.github.getPull(params.owner, params.repo, row.number, params.reporter) + : await params.github.getIssue(params.owner, params.repo, row.number, params.reporter); + state = String(payload.state ?? 'open'); + } catch (error) { + if (!isMissingGitHubResourceError(error)) { + throw error; + } + params.onProgress?.( + `[sync] stale ${row.kind} #${row.number} is missing on GitHub; marking it closed locally and continuing`, + ); + } + + if (payload) { + params.db + .prepare( + `update threads + set state = ?, + raw_json = ?, + updated_at_gh = ?, + closed_at_gh = ?, + merged_at_gh = ?, + last_pulled_at = ?, + updated_at = ? + where id = ?`, + ) + .run( + state, + asJson(payload), + typeof payload.updated_at === 'string' ? payload.updated_at : null, + typeof payload.closed_at === 'string' ? payload.closed_at : null, + typeof payload.merged_at === 'string' ? payload.merged_at : null, + pulledAt, + pulledAt, + row.id, + ); + } else { + params.db + .prepare( + `update threads + set state = 'closed', + closed_at_gh = coalesce(closed_at_gh, ?), + last_pulled_at = ?, + updated_at = ? + where id = ?`, + ) + .run(pulledAt, pulledAt, pulledAt, row.id); + } + + if (state !== 'open') { + threadsClosed += 1; + } + } + + if (threadsClosed > 0) { + params.onProgress?.(`[sync] marked ${threadsClosed} stale thread(s) as closed after GitHub confirmation`); + } + + return threadsClosed; +} From 42ec61a103568af47a8dbfb4512226c131c0db09 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:18:31 -0700 Subject: [PATCH 184/215] refactor: extract document storage helpers --- packages/api-core/src/documents/store.ts | 72 +++++++++++++++++++++++ packages/api-core/src/service.ts | 74 ++---------------------- 2 files changed, 76 insertions(+), 70 deletions(-) create mode 100644 packages/api-core/src/documents/store.ts diff --git a/packages/api-core/src/documents/store.ts b/packages/api-core/src/documents/store.ts new file mode 100644 index 0000000..d307348 --- /dev/null +++ b/packages/api-core/src/documents/store.ts @@ -0,0 +1,72 @@ +import { rawJsonStorage } from '../db/raw-json-store.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { CommentSeed, ThreadRow } from '../service-types.js'; +import { nowIso, parseArray } from '../service-utils.js'; +import { buildCanonicalDocument } from './normalize.js'; + +export function replaceComments(params: { + db: SqliteDatabase; + dbPath: string; + threadId: number; + comments: CommentSeed[]; +}): void { + const insert = params.db.prepare( + `insert into comments ( + thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, raw_json_blob_id, created_at_gh, updated_at_gh + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + const tx = params.db.transaction((commentRows: CommentSeed[]) => { + params.db.prepare('delete from comments where thread_id = ?').run(params.threadId); + for (const comment of commentRows) { + const raw = rawJsonStorage(params.db, params.dbPath, comment.rawJson, `application/vnd.ghcrawl.${comment.commentType}.raw+json`); + insert.run( + params.threadId, + comment.githubId, + comment.commentType, + comment.authorLogin, + comment.authorType, + comment.body, + comment.isBot ? 1 : 0, + raw.inlineJson, + raw.blobId, + comment.createdAtGh, + comment.updatedAtGh, + ); + } + }); + tx(params.comments); +} + +export function refreshThreadDocument(db: SqliteDatabase, threadId: number): void { + const thread = db.prepare('select * from threads where id = ?').get(threadId) as ThreadRow; + const comments = db + .prepare( + 'select body, author_login, author_type, is_bot from comments where thread_id = ? order by coalesce(created_at_gh, updated_at_gh) asc, id asc', + ) + .all(threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>; + + const canonical = buildCanonicalDocument({ + title: thread.title, + body: thread.body, + labels: parseArray(thread.labels_json), + comments: comments.map((comment) => ({ + body: comment.body, + authorLogin: comment.author_login, + authorType: comment.author_type, + isBot: comment.is_bot === 1, + })), + }); + + db.prepare( + `insert into documents (thread_id, title, body, raw_text, dedupe_text, updated_at) + values (?, ?, ?, ?, ?, ?) + on conflict(thread_id) do update set + title = excluded.title, + body = excluded.body, + raw_text = excluded.raw_text, + dedupe_text = excluded.dedupe_text, + updated_at = excluded.updated_at`, + ).run(threadId, thread.title, thread.body, canonical.rawText, canonical.dedupeText, nowIso()); + + db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 90f1aae..6aaa2d1 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -103,8 +103,7 @@ import { } from './config.js'; import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; -import { rawJsonStorage } from './db/raw-json-store.js'; -import { buildCanonicalDocument } from './documents/normalize.js'; +import { replaceComments, refreshThreadDocument } from './documents/store.js'; import { buildDoctorResult } from './doctor.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; import { loadClusterableActiveVectorMeta, loadClusterableThreadMeta, loadNormalizedActiveVectors } from './embedding/clusterable.js'; @@ -189,7 +188,6 @@ import type { ActiveVectorTask, AggregatedClusterEdge, ClusterExperimentResult, - CommentSeed, DoctorResult, EmbeddingSourceKind, KeySummaryTask, @@ -1061,10 +1059,10 @@ export class GHCrawlService { isPr, reporter, }); - this.replaceComments(threadId, comments); + replaceComments({ db: this.db, dbPath: this.config.dbPath, threadId, comments }); commentsSynced += comments.length; } - this.refreshDocument(threadId); + refreshThreadDocument(this.db, threadId); if (!threadIsClosed) { fingerprintThreadIds.push(threadId); } @@ -1560,7 +1558,7 @@ export class GHCrawlService { deleteComments.run(thread.id); purgedComments += row.count; } - this.refreshDocument(thread.id); + refreshThreadDocument(this.db, thread.id); } params.onProgress?.( @@ -3334,70 +3332,6 @@ export class GHCrawlService { return repositoryToDto(row); } - private replaceComments(threadId: number, comments: CommentSeed[]): void { - const insert = this.db.prepare( - `insert into comments ( - thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, raw_json_blob_id, created_at_gh, updated_at_gh - ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, - ); - const tx = this.db.transaction((commentRows: CommentSeed[]) => { - this.db.prepare('delete from comments where thread_id = ?').run(threadId); - for (const comment of commentRows) { - const raw = rawJsonStorage(this.db, this.config.dbPath, comment.rawJson, `application/vnd.ghcrawl.${comment.commentType}.raw+json`); - insert.run( - threadId, - comment.githubId, - comment.commentType, - comment.authorLogin, - comment.authorType, - comment.body, - comment.isBot ? 1 : 0, - raw.inlineJson, - raw.blobId, - comment.createdAtGh, - comment.updatedAtGh, - ); - } - }); - tx(comments); - } - - private refreshDocument(threadId: number): void { - const thread = this.db.prepare('select * from threads where id = ?').get(threadId) as ThreadRow; - const comments = this.db - .prepare( - 'select body, author_login, author_type, is_bot from comments where thread_id = ? order by coalesce(created_at_gh, updated_at_gh) asc, id asc', - ) - .all(threadId) as Array<{ body: string; author_login: string | null; author_type: string | null; is_bot: number }>; - - const canonical = buildCanonicalDocument({ - title: thread.title, - body: thread.body, - labels: parseArray(thread.labels_json), - comments: comments.map((comment) => ({ - body: comment.body, - authorLogin: comment.author_login, - authorType: comment.author_type, - isBot: comment.is_bot === 1, - })), - }); - - this.db - .prepare( - `insert into documents (thread_id, title, body, raw_text, dedupe_text, updated_at) - values (?, ?, ?, ?, ?, ?) - on conflict(thread_id) do update set - title = excluded.title, - body = excluded.body, - raw_text = excluded.raw_text, - dedupe_text = excluded.dedupe_text, - updated_at = excluded.updated_at`, - ) - .run(threadId, thread.title, thread.body, canonical.rawText, canonical.dedupeText, nowIso()); - - this.db.prepare('update threads set content_hash = ?, updated_at = ? where id = ?').run(canonical.contentHash, nowIso(), threadId); - } - private async embedBatchWithRecovery( ai: AiProvider, batch: ActiveVectorTask[], From fbf5e475bd3db3723c0e68cf5200c8ca428a3469 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:30:28 -0700 Subject: [PATCH 185/215] refactor: extract tui cluster rendering --- apps/cli/src/tui/app.test.ts | 14 ++-- apps/cli/src/tui/app.ts | 113 ++--------------------------- apps/cli/src/tui/cluster-render.ts | 113 +++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 111 deletions(-) create mode 100644 apps/cli/src/tui/cluster-render.ts diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index b6d4d9a..5681e9c 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -7,12 +7,8 @@ import { buildThreadContextMenuItems, buildHelpContent, escapeBlessedText, - formatClusterDateColumn, formatClusterForClipboard, - formatClusterListHeader, - formatClusterListLabel, formatClusterMembersForClipboard, - formatClusterShortName, formatLinkChoiceLabel, formatSummariesForClipboard, formatThreadDetailForClipboard, @@ -24,10 +20,16 @@ import { renderMarkdownForTerminal, renderDetailPane, resolveBlessedTerminal, - resolveClusterHeaderSortFromClick, renderSummarySections, - splitClusterDisplayTitle, } from './app.js'; +import { + formatClusterDateColumn, + formatClusterListHeader, + formatClusterListLabel, + formatClusterShortName, + resolveClusterHeaderSortFromClick, + splitClusterDisplayTitle, +} from './cluster-render.js'; test('escapeBlessedText escapes blessed tag delimiters', () => { assert.equal(escapeBlessedText('{bold}wow{/bold}'), '\\{bold\\}wow\\{/bold\\}'); diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index ccd4f2a..d6717ef 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -30,6 +30,14 @@ import { type TuiMinSizeFilter, } from './state.js'; import { computeTuiLayout } from './layout.js'; +import { + formatClusterDateColumn, + formatClusterListHeader, + formatClusterListLabel, + formatClusterShortName, + resolveClusterHeaderSortFromClick, + splitClusterDisplayTitle, +} from './cluster-render.js'; type StartTuiParams = { service: GHCrawlService; @@ -117,16 +125,6 @@ const ACTIVITY_LOG_LIMIT = 200; const FOOTER_LOG_LINES = 1; const CLUSTER_LIST_HEADER_INDEX = 0; const CLUSTER_LIST_FIRST_ITEM_INDEX = 1; -const CLUSTER_COUNT_WIDTH = 3; -const CLUSTER_NAME_WIDTH = 22; -const CLUSTER_TITLE_WIDTH = 56; -const CLUSTER_MIX_WIDTH = 7; -const CLUSTER_UPDATED_WIDTH = 8; -const CLUSTER_COLUMN_GAP = 2; -const CLUSTER_NAME_START = CLUSTER_COUNT_WIDTH + CLUSTER_COLUMN_GAP; -const CLUSTER_TITLE_START = CLUSTER_NAME_START + CLUSTER_NAME_WIDTH + CLUSTER_COLUMN_GAP; -const CLUSTER_MIX_START = CLUSTER_TITLE_START + CLUSTER_TITLE_WIDTH + CLUSTER_COLUMN_GAP; -const CLUSTER_UPDATED_START = CLUSTER_MIX_START + CLUSTER_MIX_WIDTH + CLUSTER_COLUMN_GAP; const TUI_AUTO_REFRESH_INTERVAL_MS = 15_000; export async function startTui(params: StartTuiParams): Promise { @@ -1797,14 +1795,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -export function splitClusterDisplayTitle(displayTitle: string): { name: string; title: string } { - const match = displayTitle.match(/^([a-z]+(?:-[a-z]+){2})\s{2,}(.+)$/); - if (match) { - return { name: match[1] ?? 'cluster', title: match[2] ?? displayTitle }; - } - return { name: formatClusterShortName(displayTitle), title: displayTitle || 'Untitled cluster' }; -} - export function renderMarkdownForTerminal(markdown: string): string { let inFence = false; const rendered = markdown.split(/\r?\n/).map((line) => { @@ -2392,93 +2382,6 @@ export function parseOwnerRepoValue(value: string): { owner: string; repo: strin return { owner: parts[0], repo: parts[1] }; } -export function formatClusterListLabel(cluster: TuiClusterSummary): string { - const countLabel = String(cluster.totalCount).padStart(CLUSTER_COUNT_WIDTH); - const mixLabel = `${cluster.issueCount}I/${cluster.pullRequestCount}P`.padStart(CLUSTER_MIX_WIDTH); - const updated = formatRelativeTime(cluster.latestUpdatedAt).padStart(CLUSTER_UPDATED_WIDTH); - const title = splitClusterDisplayTitle(cluster.displayTitle); - return [ - countLabel, - title.name.padEnd(CLUSTER_NAME_WIDTH).slice(0, CLUSTER_NAME_WIDTH), - title.title.padEnd(CLUSTER_TITLE_WIDTH).slice(0, CLUSTER_TITLE_WIDTH), - mixLabel, - updated, - ].join(' '); -} - -export function formatClusterListHeader(sortMode: TuiClusterSortMode): string { - const countLabel = (sortMode === 'size' ? 'cnt*' : 'cnt').padStart(CLUSTER_COUNT_WIDTH); - const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padStart(CLUSTER_UPDATED_WIDTH); - return [ - countLabel, - 'cluster'.padEnd(CLUSTER_NAME_WIDTH), - 'title'.padEnd(CLUSTER_TITLE_WIDTH), - 'mix'.padStart(CLUSTER_MIX_WIDTH), - updated, - ].join(' '); -} - -export function resolveClusterHeaderSortFromClick(relativeX: number, visibleWidth: number, currentSortMode: TuiClusterSortMode): TuiClusterSortMode { - if (relativeX < CLUSTER_NAME_START) { - return 'size'; - } - - const visibleUpdatedStart = Math.min(CLUSTER_UPDATED_START, Math.max(CLUSTER_NAME_START, visibleWidth - CLUSTER_UPDATED_WIDTH - CLUSTER_COLUMN_GAP)); - if (relativeX >= visibleUpdatedStart) { - return 'recent'; - } - - return cycleSortMode(currentSortMode); -} - -export function formatClusterShortName(title: string, maxWords = 3): string { - const words = title - .replace(/[\[\]{}()<>]/g, ' ') - .split(/\s+/) - .map((word) => word.trim()) - .map((word) => word.replace(/^[:/#-]+|[:/#-]+$/g, '')) - .filter((word) => word && !CLUSTER_SHORT_NAME_STOPWORDS.has(word.toLowerCase())) - .slice(0, maxWords); - return words.join(' ') || 'untitled'; -} - -const CLUSTER_SHORT_NAME_STOPWORDS = new Set([ - 'ai', - 'assisted', - 'bug', - 'chore', - 'codex', - 'docs', - 'feat', - 'feature', - 'fix', - 'issue', - 'pr', - 'refactor', - 'test', -]); - function formatActivityTimestamp(now: Date = new Date()): string { return now.toISOString().slice(11, 19); } - -export function formatClusterDateColumn(value: string | null, locales?: Intl.LocalesArgument): string { - if (!value) return 'unknown'; - const parsed = new Date(value); - if (Number.isNaN(parsed.getTime())) return value; - - const month = String(parsed.getMonth() + 1).padStart(2, '0'); - const day = String(parsed.getDate()).padStart(2, '0'); - const hour = String(parsed.getHours()).padStart(2, '0'); - const minute = String(parsed.getMinutes()).padStart(2, '0'); - const ordering = new Intl.DateTimeFormat(locales, { - month: '2-digit', - day: '2-digit', - }) - .formatToParts(parsed) - .filter((part) => part.type === 'month' || part.type === 'day') - .map((part) => part.type); - const date = ordering[0] === 'day' ? `${day}-${month}` : `${month}-${day}`; - - return `${date} ${hour}:${minute}`; -} diff --git a/apps/cli/src/tui/cluster-render.ts b/apps/cli/src/tui/cluster-render.ts new file mode 100644 index 0000000..d5a4175 --- /dev/null +++ b/apps/cli/src/tui/cluster-render.ts @@ -0,0 +1,113 @@ +import type { TuiClusterSortMode, TuiClusterSummary } from '@ghcrawl/api-core'; + +import { cycleSortMode, formatRelativeTime } from './state.js'; + +const CLUSTER_COUNT_WIDTH = 3; +const CLUSTER_NAME_WIDTH = 22; +const CLUSTER_TITLE_WIDTH = 56; +const CLUSTER_MIX_WIDTH = 7; +const CLUSTER_UPDATED_WIDTH = 8; +const CLUSTER_COLUMN_GAP = 2; +const CLUSTER_NAME_START = CLUSTER_COUNT_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_TITLE_START = CLUSTER_NAME_START + CLUSTER_NAME_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_MIX_START = CLUSTER_TITLE_START + CLUSTER_TITLE_WIDTH + CLUSTER_COLUMN_GAP; +const CLUSTER_UPDATED_START = CLUSTER_MIX_START + CLUSTER_MIX_WIDTH + CLUSTER_COLUMN_GAP; + +export function splitClusterDisplayTitle(displayTitle: string): { name: string; title: string } { + const match = displayTitle.match(/^([a-z]+(?:-[a-z]+){2})\s{2,}(.+)$/); + if (match) { + return { name: match[1] ?? 'cluster', title: match[2] ?? displayTitle }; + } + return { name: formatClusterShortName(displayTitle), title: displayTitle || 'Untitled cluster' }; +} + +export function formatClusterListLabel(cluster: TuiClusterSummary): string { + const countLabel = String(cluster.totalCount).padStart(CLUSTER_COUNT_WIDTH); + const mixLabel = `${cluster.issueCount}I/${cluster.pullRequestCount}P`.padStart(CLUSTER_MIX_WIDTH); + const updated = formatRelativeTime(cluster.latestUpdatedAt).padStart(CLUSTER_UPDATED_WIDTH); + const title = splitClusterDisplayTitle(cluster.displayTitle); + return [ + countLabel, + title.name.padEnd(CLUSTER_NAME_WIDTH).slice(0, CLUSTER_NAME_WIDTH), + title.title.padEnd(CLUSTER_TITLE_WIDTH).slice(0, CLUSTER_TITLE_WIDTH), + mixLabel, + updated, + ].join(' '); +} + +export function formatClusterListHeader(sortMode: TuiClusterSortMode): string { + const countLabel = (sortMode === 'size' ? 'cnt*' : 'cnt').padStart(CLUSTER_COUNT_WIDTH); + const updated = (sortMode === 'recent' ? 'updated*' : 'updated').padStart(CLUSTER_UPDATED_WIDTH); + return [ + countLabel, + 'cluster'.padEnd(CLUSTER_NAME_WIDTH), + 'title'.padEnd(CLUSTER_TITLE_WIDTH), + 'mix'.padStart(CLUSTER_MIX_WIDTH), + updated, + ].join(' '); +} + +export function resolveClusterHeaderSortFromClick( + relativeX: number, + visibleWidth: number, + currentSortMode: TuiClusterSortMode, +): TuiClusterSortMode { + if (relativeX < CLUSTER_NAME_START) { + return 'size'; + } + + const visibleUpdatedStart = Math.min(CLUSTER_UPDATED_START, Math.max(CLUSTER_NAME_START, visibleWidth - CLUSTER_UPDATED_WIDTH - CLUSTER_COLUMN_GAP)); + if (relativeX >= visibleUpdatedStart) { + return 'recent'; + } + + return cycleSortMode(currentSortMode); +} + +export function formatClusterShortName(title: string, maxWords = 3): string { + const words = title + .replace(/[\[\]{}()<>]/g, ' ') + .split(/\s+/) + .map((word) => word.trim()) + .map((word) => word.replace(/^[:/#-]+|[:/#-]+$/g, '')) + .filter((word) => word && !CLUSTER_SHORT_NAME_STOPWORDS.has(word.toLowerCase())) + .slice(0, maxWords); + return words.join(' ') || 'untitled'; +} + +export function formatClusterDateColumn(value: string | null, locales?: Intl.LocalesArgument): string { + if (!value) return 'unknown'; + const parsed = new Date(value); + if (Number.isNaN(parsed.getTime())) return value; + + const month = String(parsed.getMonth() + 1).padStart(2, '0'); + const day = String(parsed.getDate()).padStart(2, '0'); + const hour = String(parsed.getHours()).padStart(2, '0'); + const minute = String(parsed.getMinutes()).padStart(2, '0'); + const ordering = new Intl.DateTimeFormat(locales, { + month: '2-digit', + day: '2-digit', + }) + .formatToParts(parsed) + .filter((part) => part.type === 'month' || part.type === 'day') + .map((part) => part.type); + const date = ordering[0] === 'day' ? `${day}-${month}` : `${month}-${day}`; + + return `${date} ${hour}:${minute}`; +} + +const CLUSTER_SHORT_NAME_STOPWORDS = new Set([ + 'ai', + 'assisted', + 'bug', + 'chore', + 'codex', + 'docs', + 'feat', + 'feature', + 'fix', + 'issue', + 'pr', + 'refactor', + 'test', +]); From 38198f0e842770f44c99a0bf43eb9b6bfa2dc721 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:34:37 -0700 Subject: [PATCH 186/215] refactor: extract tui detail rendering --- apps/cli/src/tui/app.test.ts | 14 +- apps/cli/src/tui/app.ts | 381 +----------------------------- apps/cli/src/tui/detail-render.ts | 371 +++++++++++++++++++++++++++++ 3 files changed, 392 insertions(+), 374 deletions(-) create mode 100644 apps/cli/src/tui/detail-render.ts diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 5681e9c..05b429a 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -4,8 +4,13 @@ import assert from 'node:assert/strict'; import type { TuiClusterDetail, TuiThreadDetail } from '@ghcrawl/api-core'; import { - buildThreadContextMenuItems, buildHelpContent, + getRepositoryChoices, + parseOwnerRepoValue, + resolveBlessedTerminal, +} from './app.js'; +import { + buildThreadContextMenuItems, escapeBlessedText, formatClusterForClipboard, formatClusterMembersForClipboard, @@ -15,13 +20,10 @@ import { formatVisibleClustersForClipboard, getThreadReferenceLinks, limitRenderedLines, - getRepositoryChoices, - parseOwnerRepoValue, - renderMarkdownForTerminal, renderDetailPane, - resolveBlessedTerminal, + renderMarkdownForTerminal, renderSummarySections, -} from './app.js'; +} from './detail-render.js'; import { formatClusterDateColumn, formatClusterListHeader, diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index d6717ef..5e0e3f1 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -5,7 +5,6 @@ import blessed from 'neo-blessed'; import type { GHCrawlService, TuiClusterDetail, - TuiClusterSummary, TuiClusterSortMode, TuiRefreshState, TuiSnapshot, @@ -30,6 +29,19 @@ import { type TuiMinSizeFilter, } from './state.js'; import { computeTuiLayout } from './layout.js'; +import { + buildThreadContextMenuItems, + escapeBlessedText, + formatClusterForClipboard, + formatClusterMembersForClipboard, + formatLinkChoiceLabel, + formatSummariesForClipboard, + formatThreadDetailForClipboard, + formatVisibleClustersForClipboard, + getThreadReferenceLinks, + renderDetailPane, + type DetailMode, +} from './detail-render.js'; import { formatClusterDateColumn, formatClusterListHeader, @@ -79,30 +91,11 @@ type MouseEventArg = blessed.Widgets.Events.IMouseEventArg & { button?: 'left' | 'middle' | 'right' | 'unknown'; }; -export type ThreadContextAction = - | 'open' - | 'copy-url' - | 'copy-title' - | 'copy-markdown-link' - | 'open-first-link' - | 'copy-first-link' - | 'open-link-picker' - | 'copy-link-picker' - | 'load-neighbors' - | 'close'; - -export type ThreadContextMenuItem = { - label: string; - action: ThreadContextAction; -}; - type ContextMenuItem = { label: string; run: () => boolean | void; }; -type DetailMode = 'full' | 'compact'; - export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { const term = env.TERM; if (!term) { @@ -1698,89 +1691,6 @@ function updatePaneStyles(widgets: Widgets, focus: TuiFocusPane): void { focus === 'members' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#33521e', fg: 'white', bold: true }; } -export function renderDetailPane( - threadDetail: TuiThreadDetail | null, - clusterDetail: TuiClusterDetail | null, - focusPane: TuiFocusPane, - snapshot?: TuiSnapshot | null, - detailMode: DetailMode = 'full', -): string { - if (!clusterDetail) { - const repoLabel = snapshot?.repository.fullName ?? 'No repository selected'; - const clusterCount = snapshot?.clusters.length ?? 0; - return [ - `{bold}${escapeBlessedText(repoLabel)}{/bold}`, - '', - clusterCount > 0 ? `${clusterCount} clusters loaded. Click a cluster or press Enter to inspect members.` : 'No clusters visible in this view.', - '', - `{bold}Controls{/bold}`, - 's sort f min size / filter x closed r refresh', - 'right-click any pane for actions', - ].join('\n'); - } - const clusterTitle = splitClusterDisplayTitle(clusterDetail.displayTitle); - if (!threadDetail) { - const representativeLabel = - clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null - ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` - : ''; - return [ - `{bold}Cluster ${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}{/bold}`, - `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg}`, - escapeBlessedText(clusterTitle.title), - '', - 'Select a member to inspect thread details.', - ].join('\n'); - } - - const thread = threadDetail.thread; - const representativeLabel = - clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null - ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` - : ''; - const labels = thread.labels.length > 0 ? thread.labels.map((label) => `{cyan-fg}${escapeBlessedText(label)}{/cyan-fg}`).join(' ') : 'none'; - const closedLabel = thread.isClosed - ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() - : '{bold}Closed:{/bold} no'; - const summaryBlock = renderThreadSummaryBlock(threadDetail); - const topFiles = renderTopFiles(threadDetail.topFiles); - const neighbors = - threadDetail.neighbors.length > 0 - ? threadDetail.neighbors - .map((neighbor) => `#${neighbor.number} ${neighbor.kind} ${(neighbor.score * 100).toFixed(1)}% ${escapeBlessedText(neighbor.title)}`) - .join('\n') - : focusPane === 'detail' - ? 'No neighbors available.' - : 'Neighbors load when the detail pane is focused.'; - const body = limitRenderedLines(renderMarkdownForTerminal(thread.body ?? '(no body)'), detailMode === 'compact' ? 18 : 240); - const referenceLinks = getThreadReferenceLinks(threadDetail); - const linksSection = - referenceLinks.length > 0 ? `\n\n{bold}Links{/bold}\n${referenceLinks.map((url, index) => `${index + 1}. ${escapeBlessedText(url)}`).join('\n')}` : ''; - return [ - `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, - `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, - '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', - summaryBlock ? `{bold}LLM Summary{/bold}\n${summaryBlock}` : '', - summaryBlock ? '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}' : '', - `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, - `{bold}Labels:{/bold} ${labels}`, - `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, - topFiles ? `\n{bold}Top files{/bold}\n${topFiles}` : '', - '', - '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', - `{bold}Main Preview{/bold}`, - body, - linksSection, - `\n\n{bold}Neighbors{/bold}\n${neighbors}`, - ] - .filter(Boolean) - .join('\n'); -} - -export function escapeBlessedText(value: string): string { - return value.replace(/\\/g, '\\\\').replace(/\{/g, '\\{').replace(/\}/g, '\\}'); -} - function formatTuiRefreshStateKey(state: TuiRefreshState): string { return [ state.repositoryUpdatedAt ?? '', @@ -1795,271 +1705,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -export function renderMarkdownForTerminal(markdown: string): string { - let inFence = false; - const rendered = markdown.split(/\r?\n/).map((line) => { - if (/^```/.test(line.trim())) { - inFence = !inFence; - return '{gray-fg}--- code ---{/gray-fg}'; - } - if (inFence) { - return `{gray-fg}${escapeBlessedText(line)}{/gray-fg}`; - } - const heading = line.match(/^(#{1,6})\s+(.+)$/); - if (heading) { - return `{bold}${escapeBlessedText(heading[2] ?? '')}{/bold}`; - } - const quote = line.match(/^>\s?(.*)$/); - if (quote) { - return `{gray-fg}> ${renderInlineMarkdown(quote[1] ?? '')}{/gray-fg}`; - } - const listItem = line.match(/^(\s*)([-*+]|\d+[.)])\s+(.+)$/); - if (listItem) { - const indent = listItem[1] ?? ''; - return `${indent}- ${renderInlineMarkdown(listItem[3] ?? '')}`; - } - return renderInlineMarkdown(line); - }); - return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); -} - -export function limitRenderedLines(value: string, maxLines: number): string { - const lines = value.split('\n'); - if (lines.length <= maxLines) { - return value; - } - const omitted = lines.length - maxLines; - return `${lines.slice(0, maxLines).join('\n')}\n{gray-fg}... ${omitted} more line(s). Use full detail or copy body to inspect all content.{/gray-fg}`; -} - -export function getThreadReferenceLinks(threadDetail: TuiThreadDetail | null): string[] { - if (!threadDetail) return []; - return uniqueStrings([ - ...extractMarkdownLinks(threadDetail.thread.body ?? ''), - ...Object.values(threadDetail.summaries).flatMap((summary) => extractMarkdownLinks(summary ?? '')), - ]).filter((url) => url !== threadDetail.thread.htmlUrl); -} - -export function formatLinkChoiceLabel(url: string, index: number): string { - return `${String(index + 1).padStart(2)} ${url}`; -} - -function extractMarkdownLinks(markdown: string): string[] { - const urls: string[] = []; - for (const match of markdown.matchAll(/\[[^\]]+\]\((https?:\/\/[^)\s]+)\)/g)) { - urls.push(stripTrailingUrlPunctuation(match[1] ?? '')); - } - for (const match of markdown.matchAll(/(^|[\s(<])(https?:\/\/[^\s<>)]+)/g)) { - urls.push(stripTrailingUrlPunctuation(match[2] ?? '')); - } - return urls.filter(Boolean); -} - -function stripTrailingUrlPunctuation(url: string): string { - return url.replace(/[.,;:!?]+$/g, ''); -} - -function uniqueStrings(values: string[]): string[] { - return [...new Set(values)]; -} - -type SummaryKey = NonNullable; - -const SUMMARY_SECTION_ORDER: SummaryKey[] = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary']; - -export function renderSummarySections(summaries: TuiThreadDetail['summaries']): string { - return SUMMARY_SECTION_ORDER.flatMap((key) => { - const value = summaries[key]; - if (!value) return []; - return [`{bold}${formatSummaryLabel(key)}:{/bold}\n${renderMarkdownForTerminal(value)}`]; - }).join('\n\n'); -} - -export function renderThreadSummaryBlock(threadDetail: TuiThreadDetail): string { - const sections = [ - threadDetail.keySummary - ? `{bold}Key summary{/bold} {gray-fg}${escapeBlessedText(threadDetail.keySummary.model)}{/gray-fg}\n${renderMarkdownForTerminal(threadDetail.keySummary.text)}` - : '', - renderSummarySections(threadDetail.summaries), - ]; - return sections.filter((section) => section.trim()).join('\n\n'); -} - -export function renderTopFiles(files: TuiThreadDetail['topFiles']): string { - if (files.length === 0) return ''; - return files - .slice(0, 5) - .map((file) => { - const churn = file.additions + file.deletions; - const status = file.status ? `${file.status} ` : ''; - return `- ${escapeBlessedText(file.path)} {gray-fg}${escapeBlessedText(status)}+${file.additions}/-${file.deletions} (${churn}){/gray-fg}`; - }) - .join('\n'); -} - -function formatSummaryLabel(key: SummaryKey): string { - if (key === 'problem_summary') return 'Purpose'; - if (key === 'solution_summary') return 'Solution'; - if (key === 'maintainer_signal_summary') return 'Maintainer signal'; - return 'Cluster signal'; -} - -export function formatSummariesForClipboard(summaries: TuiThreadDetail['summaries']): string { - return SUMMARY_SECTION_ORDER.flatMap((key) => { - const value = summaries[key]; - if (!value) return []; - return [`${formatSummaryLabel(key)}:\n${value}`]; - }).join('\n\n'); -} - -export function formatThreadDetailForClipboard(threadDetail: TuiThreadDetail, clusterDetail: TuiClusterDetail | null): string { - const thread = threadDetail.thread; - const clusterTitle = clusterDetail ? splitClusterDisplayTitle(clusterDetail.displayTitle) : null; - const sections = [ - `${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}: ${thread.title}`, - clusterDetail && clusterTitle ? `Cluster ${clusterDetail.clusterId}: ${clusterTitle.name} | ${clusterTitle.title}` : '', - `State: ${thread.isClosed ? 'closed' : 'open'}`, - `Updated: ${thread.updatedAtGh ?? 'unknown'}`, - `Author: ${thread.authorLogin ?? 'unknown'}`, - `Labels: ${thread.labels.join(', ') || 'none'}`, - `URL: ${thread.htmlUrl}`, - threadDetail.keySummary ? `Key summary (${threadDetail.keySummary.model}):\n${threadDetail.keySummary.text}` : '', - formatSummariesForClipboard(threadDetail.summaries) ? `LLM Summary:\n${formatSummariesForClipboard(threadDetail.summaries)}` : '', - threadDetail.topFiles.length > 0 ? `Top files:\n${formatTopFilesForClipboard(threadDetail.topFiles)}` : '', - `Body:\n${thread.body ?? ''}`, - getThreadReferenceLinks(threadDetail).length > 0 ? `Links:\n${getThreadReferenceLinks(threadDetail).join('\n')}` : '', - ]; - return sections.filter((section) => section.trim()).join('\n\n'); -} - -export function formatClusterForClipboard(cluster: TuiClusterDetail): string { - const title = splitClusterDisplayTitle(cluster.displayTitle); - return [ - `Cluster ${cluster.clusterId}`, - `Name: ${title.name}`, - `Title: ${title.title}`, - `State: ${cluster.isClosed ? 'closed' : 'open'}`, - `Members: ${cluster.totalCount} (${cluster.issueCount} issues, ${cluster.pullRequestCount} PRs)`, - `Updated: ${cluster.latestUpdatedAt ?? 'unknown'}`, - cluster.representativeNumber !== null ? `Representative: #${cluster.representativeNumber} ${cluster.representativeKind ?? ''}`.trimEnd() : '', - ] - .filter(Boolean) - .join('\n'); -} - -export function formatClusterMembersForClipboard(cluster: TuiClusterDetail): string { - return cluster.members - .map((member) => { - const state = member.isClosed ? 'closed' : 'open'; - const kind = member.kind === 'pull_request' ? 'PR' : 'Issue'; - return `${kind} #${member.number} [${state}] ${member.title} ${member.htmlUrl}`; - }) - .join('\n'); -} - -export function formatVisibleClustersForClipboard(clusters: TuiClusterSummary[]): string { - return clusters - .map((cluster) => { - const title = splitClusterDisplayTitle(cluster.displayTitle); - const state = cluster.isClosed ? 'closed' : 'open'; - return `C${cluster.clusterId} [${state}] ${cluster.totalCount} items ${title.name} | ${title.title}`; - }) - .join('\n'); -} - -function formatTopFilesForClipboard(files: TuiThreadDetail['topFiles']): string { - return files - .slice(0, 5) - .map((file) => `${file.path} ${file.status ? `${file.status} ` : ''}+${file.additions}/-${file.deletions}`) - .join('\n'); -} - -type InlineMarkdownSegment = - | { kind: 'text'; value: string } - | { kind: 'link'; label: string; url: string }; - -function renderInlineMarkdown(value: string): string { - const segments: InlineMarkdownSegment[] = []; - const markdownLinkPattern = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g; - let cursor = 0; - - for (const match of value.matchAll(markdownLinkPattern)) { - const index = match.index ?? 0; - if (index > cursor) { - pushBareLinkSegments(value.slice(cursor, index), segments); - } - segments.push({ kind: 'link', label: match[1] ?? '', url: match[2] ?? '' }); - cursor = index + match[0].length; - } - - if (cursor < value.length) { - pushBareLinkSegments(value.slice(cursor), segments); - } - - return segments.map((segment) => (segment.kind === 'link' ? formatTerminalLink(segment.url, segment.label) : renderInlineText(segment.value))).join(''); -} - -function pushBareLinkSegments(value: string, segments: InlineMarkdownSegment[]): void { - const bareLinkPattern = /https?:\/\/[^\s)]+/g; - let cursor = 0; - for (const match of value.matchAll(bareLinkPattern)) { - const index = match.index ?? 0; - if (index > cursor) { - segments.push({ kind: 'text', value: value.slice(cursor, index) }); - } - const url = match[0]; - segments.push({ kind: 'link', label: url, url }); - cursor = index + url.length; - } - if (cursor < value.length) { - segments.push({ kind: 'text', value: value.slice(cursor) }); - } -} - -function renderInlineText(value: string): string { - return escapeBlessedText(value) - .replace(/`([^`]+)`/g, '$1') - .replace(/\*\*([^*]+)\*\*/g, '{bold}$1{/bold}'); -} - -function formatTerminalLink(url: string, label: string): string { - const safeUrl = stripTerminalControls(url); - const safeLabel = stripTerminalControls(label); - const visibleLink = safeLabel && safeLabel !== safeUrl ? `${safeLabel} <${safeUrl}>` : safeUrl; - return escapeBlessedText(visibleLink); -} - -function stripTerminalControls(value: string): string { - return value.replace(/[\u0000-\u001f\u007f]/g, ''); -} - -export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null): ThreadContextMenuItem[] { - if (!threadDetail) { - return [{ label: 'Close', action: 'close' }]; - } - const referenceLinks = getThreadReferenceLinks(threadDetail); - return [ - { label: 'Open in browser', action: 'open' }, - { label: 'Copy URL', action: 'copy-url' }, - { label: 'Copy title', action: 'copy-title' }, - { label: 'Copy Markdown link', action: 'copy-markdown-link' }, - ...(referenceLinks.length > 0 - ? [ - { label: 'Open first body link', action: 'open-first-link' as const }, - { label: 'Copy first body link', action: 'copy-first-link' as const }, - ...(referenceLinks.length > 1 - ? [ - { label: 'Open body link...', action: 'open-link-picker' as const }, - { label: 'Copy body link...', action: 'copy-link-picker' as const }, - ] - : []), - ] - : []), - { label: 'Load neighbors', action: 'load-neighbors' }, - { label: 'Close', action: 'close' }, - ]; -} - function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListElement, rect: { top: number; left: number; width: number; height: number }): void { element.top = rect.top; element.left = rect.left; diff --git a/apps/cli/src/tui/detail-render.ts b/apps/cli/src/tui/detail-render.ts new file mode 100644 index 0000000..e7262dc --- /dev/null +++ b/apps/cli/src/tui/detail-render.ts @@ -0,0 +1,371 @@ +import type { TuiClusterDetail, TuiClusterSummary, TuiSnapshot, TuiThreadDetail } from '@ghcrawl/api-core'; + +import { formatRelativeTime, type TuiFocusPane } from './state.js'; +import { splitClusterDisplayTitle } from './cluster-render.js'; + +export type ThreadContextAction = + | 'open' + | 'copy-url' + | 'copy-title' + | 'copy-markdown-link' + | 'open-first-link' + | 'copy-first-link' + | 'open-link-picker' + | 'copy-link-picker' + | 'load-neighbors' + | 'close'; + +export type ThreadContextMenuItem = { + label: string; + action: ThreadContextAction; +}; + +export type DetailMode = 'full' | 'compact'; + +export function renderDetailPane( + threadDetail: TuiThreadDetail | null, + clusterDetail: TuiClusterDetail | null, + focusPane: TuiFocusPane, + snapshot?: TuiSnapshot | null, + detailMode: DetailMode = 'full', +): string { + if (!clusterDetail) { + const repoLabel = snapshot?.repository.fullName ?? 'No repository selected'; + const clusterCount = snapshot?.clusters.length ?? 0; + return [ + `{bold}${escapeBlessedText(repoLabel)}{/bold}`, + '', + clusterCount > 0 ? `${clusterCount} clusters loaded. Click a cluster or press Enter to inspect members.` : 'No clusters visible in this view.', + '', + `{bold}Controls{/bold}`, + 's sort f min size / filter x closed r refresh', + 'right-click any pane for actions', + ].join('\n'); + } + const clusterTitle = splitClusterDisplayTitle(clusterDetail.displayTitle); + if (!threadDetail) { + const representativeLabel = + clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null + ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` + : ''; + return [ + `{bold}Cluster ${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}{/bold}`, + `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg}`, + escapeBlessedText(clusterTitle.title), + '', + 'Select a member to inspect thread details.', + ].join('\n'); + } + + const thread = threadDetail.thread; + const representativeLabel = + clusterDetail.representativeNumber !== null && clusterDetail.representativeKind !== null + ? ` (#${clusterDetail.representativeNumber} representative ${clusterDetail.representativeKind === 'pull_request' ? 'pr' : 'issue'})` + : ''; + const labels = thread.labels.length > 0 ? thread.labels.map((label) => `{cyan-fg}${escapeBlessedText(label)}{/cyan-fg}`).join(' ') : 'none'; + const closedLabel = thread.isClosed + ? `{bold}Closed:{/bold} ${escapeBlessedText(thread.closedAtLocal ?? thread.closedAtGh ?? 'yes')} ${thread.closeReasonLocal ? `(${escapeBlessedText(thread.closeReasonLocal)})` : ''}`.trimEnd() + : '{bold}Closed:{/bold} no'; + const summaryBlock = renderThreadSummaryBlock(threadDetail); + const topFiles = renderTopFiles(threadDetail.topFiles); + const neighbors = + threadDetail.neighbors.length > 0 + ? threadDetail.neighbors + .map((neighbor) => `#${neighbor.number} ${neighbor.kind} ${(neighbor.score * 100).toFixed(1)}% ${escapeBlessedText(neighbor.title)}`) + .join('\n') + : focusPane === 'detail' + ? 'No neighbors available.' + : 'Neighbors load when the detail pane is focused.'; + const body = limitRenderedLines(renderMarkdownForTerminal(thread.body ?? '(no body)'), detailMode === 'compact' ? 18 : 240); + const referenceLinks = getThreadReferenceLinks(threadDetail); + const linksSection = + referenceLinks.length > 0 ? `\n\n{bold}Links{/bold}\n${referenceLinks.map((url, index) => `${index + 1}. ${escapeBlessedText(url)}`).join('\n')}` : ''; + return [ + `{bold}${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}{/bold} ${escapeBlessedText(thread.title)}`, + `{cyan-fg}${escapeBlessedText(clusterTitle.name)}{/cyan-fg} C${clusterDetail.clusterId}${escapeBlessedText(representativeLabel)}`, + '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', + summaryBlock ? `{bold}LLM Summary{/bold}\n${summaryBlock}` : '', + summaryBlock ? '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}' : '', + `${closedLabel} {bold}Updated:{/bold} ${escapeBlessedText(formatRelativeTime(thread.updatedAtGh))} {bold}Author:{/bold} ${escapeBlessedText(thread.authorLogin ?? 'unknown')}`, + `{bold}Labels:{/bold} ${labels}`, + `{bold}URL:{/bold} ${formatTerminalLink(thread.htmlUrl, thread.htmlUrl)}`, + topFiles ? `\n{bold}Top files{/bold}\n${topFiles}` : '', + '', + '{gray-fg}' + '-'.repeat(72) + '{/gray-fg}', + `{bold}Main Preview{/bold}`, + body, + linksSection, + `\n\n{bold}Neighbors{/bold}\n${neighbors}`, + ] + .filter(Boolean) + .join('\n'); +} + +export function escapeBlessedText(value: string): string { + return value.replace(/\\/g, '\\\\').replace(/\{/g, '\\{').replace(/\}/g, '\\}'); +} + +export function renderMarkdownForTerminal(markdown: string): string { + let inFence = false; + const rendered = markdown.split(/\r?\n/).map((line) => { + if (/^```/.test(line.trim())) { + inFence = !inFence; + return '{gray-fg}--- code ---{/gray-fg}'; + } + if (inFence) { + return `{gray-fg}${escapeBlessedText(line)}{/gray-fg}`; + } + const heading = line.match(/^(#{1,6})\s+(.+)$/); + if (heading) { + return `{bold}${escapeBlessedText(heading[2] ?? '')}{/bold}`; + } + const quote = line.match(/^>\s?(.*)$/); + if (quote) { + return `{gray-fg}> ${renderInlineMarkdown(quote[1] ?? '')}{/gray-fg}`; + } + const listItem = line.match(/^(\s*)([-*+]|\d+[.)])\s+(.+)$/); + if (listItem) { + const indent = listItem[1] ?? ''; + return `${indent}- ${renderInlineMarkdown(listItem[3] ?? '')}`; + } + return renderInlineMarkdown(line); + }); + return rendered.join('\n').replace(/\n{4,}/g, '\n\n\n').trimEnd(); +} + +export function limitRenderedLines(value: string, maxLines: number): string { + const lines = value.split('\n'); + if (lines.length <= maxLines) { + return value; + } + const omitted = lines.length - maxLines; + return `${lines.slice(0, maxLines).join('\n')}\n{gray-fg}... ${omitted} more line(s). Use full detail or copy body to inspect all content.{/gray-fg}`; +} + +export function getThreadReferenceLinks(threadDetail: TuiThreadDetail | null): string[] { + if (!threadDetail) return []; + return uniqueStrings([ + ...extractMarkdownLinks(threadDetail.thread.body ?? ''), + ...Object.values(threadDetail.summaries).flatMap((summary) => extractMarkdownLinks(summary ?? '')), + ]).filter((url) => url !== threadDetail.thread.htmlUrl); +} + +export function formatLinkChoiceLabel(url: string, index: number): string { + return `${String(index + 1).padStart(2)} ${url}`; +} + +export function renderSummarySections(summaries: TuiThreadDetail['summaries']): string { + return SUMMARY_SECTION_ORDER.flatMap((key) => { + const value = summaries[key]; + if (!value) return []; + return [`{bold}${formatSummaryLabel(key)}:{/bold}\n${renderMarkdownForTerminal(value)}`]; + }).join('\n\n'); +} + +export function renderThreadSummaryBlock(threadDetail: TuiThreadDetail): string { + const sections = [ + threadDetail.keySummary + ? `{bold}Key summary{/bold} {gray-fg}${escapeBlessedText(threadDetail.keySummary.model)}{/gray-fg}\n${renderMarkdownForTerminal(threadDetail.keySummary.text)}` + : '', + renderSummarySections(threadDetail.summaries), + ]; + return sections.filter((section) => section.trim()).join('\n\n'); +} + +export function renderTopFiles(files: TuiThreadDetail['topFiles']): string { + if (files.length === 0) return ''; + return files + .slice(0, 5) + .map((file) => { + const churn = file.additions + file.deletions; + const status = file.status ? `${file.status} ` : ''; + return `- ${escapeBlessedText(file.path)} {gray-fg}${escapeBlessedText(status)}+${file.additions}/-${file.deletions} (${churn}){/gray-fg}`; + }) + .join('\n'); +} + +export function formatSummariesForClipboard(summaries: TuiThreadDetail['summaries']): string { + return SUMMARY_SECTION_ORDER.flatMap((key) => { + const value = summaries[key]; + if (!value) return []; + return [`${formatSummaryLabel(key)}:\n${value}`]; + }).join('\n\n'); +} + +export function formatThreadDetailForClipboard(threadDetail: TuiThreadDetail, clusterDetail: TuiClusterDetail | null): string { + const thread = threadDetail.thread; + const clusterTitle = clusterDetail ? splitClusterDisplayTitle(clusterDetail.displayTitle) : null; + const sections = [ + `${thread.kind === 'pull_request' ? 'PR' : 'Issue'} #${thread.number}: ${thread.title}`, + clusterDetail && clusterTitle ? `Cluster ${clusterDetail.clusterId}: ${clusterTitle.name} | ${clusterTitle.title}` : '', + `State: ${thread.isClosed ? 'closed' : 'open'}`, + `Updated: ${thread.updatedAtGh ?? 'unknown'}`, + `Author: ${thread.authorLogin ?? 'unknown'}`, + `Labels: ${thread.labels.join(', ') || 'none'}`, + `URL: ${thread.htmlUrl}`, + threadDetail.keySummary ? `Key summary (${threadDetail.keySummary.model}):\n${threadDetail.keySummary.text}` : '', + formatSummariesForClipboard(threadDetail.summaries) ? `LLM Summary:\n${formatSummariesForClipboard(threadDetail.summaries)}` : '', + threadDetail.topFiles.length > 0 ? `Top files:\n${formatTopFilesForClipboard(threadDetail.topFiles)}` : '', + `Body:\n${thread.body ?? ''}`, + getThreadReferenceLinks(threadDetail).length > 0 ? `Links:\n${getThreadReferenceLinks(threadDetail).join('\n')}` : '', + ]; + return sections.filter((section) => section.trim()).join('\n\n'); +} + +export function formatClusterForClipboard(cluster: TuiClusterDetail): string { + const title = splitClusterDisplayTitle(cluster.displayTitle); + return [ + `Cluster ${cluster.clusterId}`, + `Name: ${title.name}`, + `Title: ${title.title}`, + `State: ${cluster.isClosed ? 'closed' : 'open'}`, + `Members: ${cluster.totalCount} (${cluster.issueCount} issues, ${cluster.pullRequestCount} PRs)`, + `Updated: ${cluster.latestUpdatedAt ?? 'unknown'}`, + cluster.representativeNumber !== null ? `Representative: #${cluster.representativeNumber} ${cluster.representativeKind ?? ''}`.trimEnd() : '', + ] + .filter(Boolean) + .join('\n'); +} + +export function formatClusterMembersForClipboard(cluster: TuiClusterDetail): string { + return cluster.members + .map((member) => { + const state = member.isClosed ? 'closed' : 'open'; + const kind = member.kind === 'pull_request' ? 'PR' : 'Issue'; + return `${kind} #${member.number} [${state}] ${member.title} ${member.htmlUrl}`; + }) + .join('\n'); +} + +export function formatVisibleClustersForClipboard(clusters: TuiClusterSummary[]): string { + return clusters + .map((cluster) => { + const title = splitClusterDisplayTitle(cluster.displayTitle); + const state = cluster.isClosed ? 'closed' : 'open'; + return `C${cluster.clusterId} [${state}] ${cluster.totalCount} items ${title.name} | ${title.title}`; + }) + .join('\n'); +} + +export function buildThreadContextMenuItems(threadDetail: TuiThreadDetail | null): ThreadContextMenuItem[] { + if (!threadDetail) { + return [{ label: 'Close', action: 'close' }]; + } + const referenceLinks = getThreadReferenceLinks(threadDetail); + return [ + { label: 'Open in browser', action: 'open' }, + { label: 'Copy URL', action: 'copy-url' }, + { label: 'Copy title', action: 'copy-title' }, + { label: 'Copy Markdown link', action: 'copy-markdown-link' }, + ...(referenceLinks.length > 0 + ? [ + { label: 'Open first body link', action: 'open-first-link' as const }, + { label: 'Copy first body link', action: 'copy-first-link' as const }, + ...(referenceLinks.length > 1 + ? [ + { label: 'Open body link...', action: 'open-link-picker' as const }, + { label: 'Copy body link...', action: 'copy-link-picker' as const }, + ] + : []), + ] + : []), + { label: 'Load neighbors', action: 'load-neighbors' }, + { label: 'Close', action: 'close' }, + ]; +} + +function extractMarkdownLinks(markdown: string): string[] { + const urls: string[] = []; + for (const match of markdown.matchAll(/\[[^\]]+\]\((https?:\/\/[^)\s]+)\)/g)) { + urls.push(stripTrailingUrlPunctuation(match[1] ?? '')); + } + for (const match of markdown.matchAll(/(^|[\s(<])(https?:\/\/[^\s<>)]+)/g)) { + urls.push(stripTrailingUrlPunctuation(match[2] ?? '')); + } + return urls.filter(Boolean); +} + +function stripTrailingUrlPunctuation(url: string): string { + return url.replace(/[.,;:!?]+$/g, ''); +} + +function uniqueStrings(values: string[]): string[] { + return [...new Set(values)]; +} + +type SummaryKey = NonNullable; + +const SUMMARY_SECTION_ORDER: SummaryKey[] = ['problem_summary', 'solution_summary', 'maintainer_signal_summary', 'dedupe_summary']; + +function formatSummaryLabel(key: SummaryKey): string { + if (key === 'problem_summary') return 'Purpose'; + if (key === 'solution_summary') return 'Solution'; + if (key === 'maintainer_signal_summary') return 'Maintainer signal'; + return 'Cluster signal'; +} + +function formatTopFilesForClipboard(files: TuiThreadDetail['topFiles']): string { + return files + .slice(0, 5) + .map((file) => `${file.path} ${file.status ? `${file.status} ` : ''}+${file.additions}/-${file.deletions}`) + .join('\n'); +} + +type InlineMarkdownSegment = + | { kind: 'text'; value: string } + | { kind: 'link'; label: string; url: string }; + +function renderInlineMarkdown(value: string): string { + const segments: InlineMarkdownSegment[] = []; + const markdownLinkPattern = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g; + let cursor = 0; + + for (const match of value.matchAll(markdownLinkPattern)) { + const index = match.index ?? 0; + if (index > cursor) { + pushBareLinkSegments(value.slice(cursor, index), segments); + } + segments.push({ kind: 'link', label: match[1] ?? '', url: match[2] ?? '' }); + cursor = index + match[0].length; + } + + if (cursor < value.length) { + pushBareLinkSegments(value.slice(cursor), segments); + } + + return segments.map((segment) => (segment.kind === 'link' ? formatTerminalLink(segment.url, segment.label) : renderInlineText(segment.value))).join(''); +} + +function pushBareLinkSegments(value: string, segments: InlineMarkdownSegment[]): void { + const bareLinkPattern = /https?:\/\/[^\s)]+/g; + let cursor = 0; + for (const match of value.matchAll(bareLinkPattern)) { + const index = match.index ?? 0; + if (index > cursor) { + segments.push({ kind: 'text', value: value.slice(cursor, index) }); + } + const url = match[0]; + segments.push({ kind: 'link', label: url, url }); + cursor = index + url.length; + } + if (cursor < value.length) { + segments.push({ kind: 'text', value: value.slice(cursor) }); + } +} + +function renderInlineText(value: string): string { + return escapeBlessedText(value) + .replace(/`([^`]+)`/g, '$1') + .replace(/\*\*([^*]+)\*\*/g, '{bold}$1{/bold}'); +} + +function formatTerminalLink(url: string, label: string): string { + const safeUrl = stripTerminalControls(url); + const safeLabel = stripTerminalControls(label); + const visibleLink = safeLabel && safeLabel !== safeUrl ? `${safeLabel} <${safeUrl}>` : safeUrl; + return escapeBlessedText(visibleLink); +} + +function stripTerminalControls(value: string): string { + return value.replace(/[\u0000-\u001f\u007f]/g, ''); +} From 5010a9f3f9250613b327c572dc7ad9d239abecae Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:37:27 -0700 Subject: [PATCH 187/215] refactor: extract embedding batch runner --- .../api-core/src/embedding/batch-runner.ts | 84 +++++++++++++++++++ packages/api-core/src/service.ts | 78 ++--------------- 2 files changed, 91 insertions(+), 71 deletions(-) create mode 100644 packages/api-core/src/embedding/batch-runner.ts diff --git a/packages/api-core/src/embedding/batch-runner.ts b/packages/api-core/src/embedding/batch-runner.ts new file mode 100644 index 0000000..414c0c0 --- /dev/null +++ b/packages/api-core/src/embedding/batch-runner.ts @@ -0,0 +1,84 @@ +import { ACTIVE_EMBED_DIMENSIONS, EMBED_CONTEXT_RETRY_ATTEMPTS } from '../service-constants.js'; +import type { ActiveVectorTask } from '../service-types.js'; +import type { AiProvider } from '../openai/provider.js'; +import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './retry.js'; + +export async function embedBatchWithRecovery(params: { + ai: AiProvider; + embedModel: string; + batch: ActiveVectorTask[]; + onProgress?: (message: string) => void; +}): Promise> { + try { + const embeddings = await params.ai.embedTexts({ + model: params.embedModel, + texts: params.batch.map((task) => task.text), + dimensions: ACTIVE_EMBED_DIMENSIONS, + }); + return params.batch.map((task, index) => ({ task, embedding: embeddings[index] })); + } catch (error) { + if (!isEmbeddingContextError(error) || params.batch.length === 1) { + if (params.batch.length === 1 && isEmbeddingContextError(error)) { + const recovered = await embedSingleTaskWithRecovery({ + ai: params.ai, + embedModel: params.embedModel, + task: params.batch[0], + onProgress: params.onProgress, + }); + return [recovered]; + } + throw error; + } + + params.onProgress?.(`[embed] batch context error; isolating ${params.batch.length} item(s) to find oversized input(s)`); + + const recovered: Array<{ task: ActiveVectorTask; embedding: number[] }> = []; + for (const task of params.batch) { + recovered.push( + await embedSingleTaskWithRecovery({ + ai: params.ai, + embedModel: params.embedModel, + task, + onProgress: params.onProgress, + }), + ); + } + return recovered; + } +} + +async function embedSingleTaskWithRecovery(params: { + ai: AiProvider; + embedModel: string; + task: ActiveVectorTask; + onProgress?: (message: string) => void; +}): Promise<{ task: ActiveVectorTask; embedding: number[] }> { + let current = params.task; + + for (let attempt = 0; attempt < EMBED_CONTEXT_RETRY_ATTEMPTS; attempt += 1) { + try { + const [embedding] = await params.ai.embedTexts({ + model: params.embedModel, + texts: [current.text], + dimensions: ACTIVE_EMBED_DIMENSIONS, + }); + return { task: current, embedding }; + } catch (error) { + const context = parseEmbeddingContextError(error); + if (!context) { + throw error; + } + + const next = shrinkEmbeddingTask(current, { embedModel: params.embedModel, context }); + if (!next || next.text === current.text) { + throw error; + } + params.onProgress?.( + `[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`, + ); + current = next; + } + } + + throw new Error(`Unable to shrink embedding input for #${params.task.threadNumber}:${params.task.basis} below model limits`); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 6aaa2d1..ac566d6 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -105,6 +105,7 @@ import { migrate } from './db/migrate.js'; import { checkpointWal, openDb, type SqliteDatabase } from './db/sqlite.js'; import { replaceComments, refreshThreadDocument } from './documents/store.js'; import { buildDoctorResult } from './doctor.js'; +import { embedBatchWithRecovery } from './embedding/batch-runner.js'; import { chunkEmbeddingTasks } from './embedding/chunks.js'; import { loadClusterableActiveVectorMeta, loadClusterableThreadMeta, loadNormalizedActiveVectors } from './embedding/clusterable.js'; import { @@ -113,7 +114,6 @@ import { loadNormalizedEmbeddingsForSourceKind, loadStoredEmbeddingsForThreadNumber, } from './embedding/queries.js'; -import { isEmbeddingContextError, parseEmbeddingContextError, shrinkEmbeddingTask } from './embedding/retry.js'; import { activeVectorSourceKind } from './embedding/tasks.js'; import { getEmbeddingWorkset } from './embedding/workset.js'; import { makeGitHubClient, type GitHubClient } from './github/client.js'; @@ -170,7 +170,6 @@ import { DEFAULT_CROSS_KIND_CLUSTER_MIN_SCORE, DEFAULT_DETERMINISTIC_CLUSTER_MIN_SCORE, DURABLE_CLUSTER_REUSE_MIN_OVERLAP, - EMBED_CONTEXT_RETRY_ATTEMPTS, EMBED_MAX_BATCH_TOKENS, KEY_SUMMARY_CONCURRENCY, KEY_SUMMARY_MAX_BODY_CHARS, @@ -1617,7 +1616,12 @@ export class GHCrawlService { const mapper = new IterableMapper( batches, async (batch: ActiveVectorTask[]) => { - return this.embedBatchWithRecovery(ai, batch, params.onProgress); + return embedBatchWithRecovery({ + ai, + embedModel: this.config.embedModel, + batch, + onProgress: params.onProgress, + }); }, { concurrency: this.config.embedConcurrency, @@ -3332,74 +3336,6 @@ export class GHCrawlService { return repositoryToDto(row); } - private async embedBatchWithRecovery( - ai: AiProvider, - batch: ActiveVectorTask[], - onProgress?: (message: string) => void, - ): Promise> { - try { - const embeddings = await ai.embedTexts({ - model: this.config.embedModel, - texts: batch.map((task) => task.text), - dimensions: ACTIVE_EMBED_DIMENSIONS, - }); - return batch.map((task, index) => ({ task, embedding: embeddings[index] })); - } catch (error) { - if (!isEmbeddingContextError(error) || batch.length === 1) { - if (batch.length === 1 && isEmbeddingContextError(error)) { - const recovered = await this.embedSingleTaskWithRecovery(ai, batch[0], onProgress); - return [recovered]; - } - throw error; - } - - onProgress?.( - `[embed] batch context error; isolating ${batch.length} item(s) to find oversized input(s)`, - ); - - const recovered: Array<{ task: ActiveVectorTask; embedding: number[] }> = []; - for (const task of batch) { - recovered.push(await this.embedSingleTaskWithRecovery(ai, task, onProgress)); - } - return recovered; - } - } - - private async embedSingleTaskWithRecovery( - ai: AiProvider, - task: ActiveVectorTask, - onProgress?: (message: string) => void, - ): Promise<{ task: ActiveVectorTask; embedding: number[] }> { - let current = task; - - for (let attempt = 0; attempt < EMBED_CONTEXT_RETRY_ATTEMPTS; attempt += 1) { - try { - const [embedding] = await ai.embedTexts({ - model: this.config.embedModel, - texts: [current.text], - dimensions: ACTIVE_EMBED_DIMENSIONS, - }); - return { task: current, embedding }; - } catch (error) { - const context = parseEmbeddingContextError(error); - if (!context) { - throw error; - } - - const next = shrinkEmbeddingTask(current, { embedModel: this.config.embedModel, context }); - if (!next || next.text === current.text) { - throw error; - } - onProgress?.( - `[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`, - ); - current = next; - } - } - - throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); - } - private async aggregateRepositoryEdges( repoId: number, sourceKinds: EmbeddingSourceKind[], From d9382f4e32879274c28db2e66b19e93238946c5e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:41:06 -0700 Subject: [PATCH 188/215] test: cover embedding batch recovery --- .../src/embedding/batch-runner.test.ts | 49 +++++++++++++++++++ .../api-core/src/embedding/batch-runner.ts | 38 +++++++++----- 2 files changed, 76 insertions(+), 11 deletions(-) create mode 100644 packages/api-core/src/embedding/batch-runner.test.ts diff --git a/packages/api-core/src/embedding/batch-runner.test.ts b/packages/api-core/src/embedding/batch-runner.test.ts new file mode 100644 index 0000000..932c7d4 --- /dev/null +++ b/packages/api-core/src/embedding/batch-runner.test.ts @@ -0,0 +1,49 @@ +import test from 'node:test'; +import assert from 'node:assert/strict'; + +import type { AiProvider } from '../openai/provider.js'; +import type { ActiveVectorTask } from '../service-types.js'; +import { embedBatchWithRecovery } from './batch-runner.js'; + +function task(overrides: Partial = {}): ActiveVectorTask { + return { + threadId: 1, + threadNumber: 42, + basis: 'title_original', + text: 'x'.repeat(4096), + contentHash: 'hash', + estimatedTokens: 2000, + wasTruncated: false, + ...overrides, + }; +} + +test('embedBatchWithRecovery shrinks a single oversized embedding input and retries it', async () => { + const calls: string[][] = []; + const provider: AiProvider = { + async embedTexts(params) { + calls.push(params.texts); + if (calls.length === 1) { + throw new Error("This model's maximum input length is 1000 tokens. However, you requested 2000 tokens."); + } + return [[0.1, 0.2, 0.3]]; + }, + async summarizeThread() { + throw new Error('not used'); + }, + }; + const progress: string[] = []; + + const [result] = await embedBatchWithRecovery({ + ai: provider, + embedModel: 'text-embedding-3-small', + batch: [task()], + onProgress: (message) => progress.push(message), + }); + + assert.equal(calls.length, 2); + assert.ok(calls[1]?.[0]?.length < calls[0]?.[0]?.length); + assert.equal(result?.task.wasTruncated, true); + assert.deepEqual(result?.embedding, [0.1, 0.2, 0.3]); + assert.match(progress.join('\n'), /shortened #42:title_original/); +}); diff --git a/packages/api-core/src/embedding/batch-runner.ts b/packages/api-core/src/embedding/batch-runner.ts index 414c0c0..e8d502c 100644 --- a/packages/api-core/src/embedding/batch-runner.ts +++ b/packages/api-core/src/embedding/batch-runner.ts @@ -17,12 +17,14 @@ export async function embedBatchWithRecovery(params: { }); return params.batch.map((task, index) => ({ task, embedding: embeddings[index] })); } catch (error) { - if (!isEmbeddingContextError(error) || params.batch.length === 1) { - if (params.batch.length === 1 && isEmbeddingContextError(error)) { + const context = parseEmbeddingContextError(error); + if (!context || params.batch.length === 1) { + if (params.batch.length === 1 && context) { const recovered = await embedSingleTaskWithRecovery({ ai: params.ai, embedModel: params.embedModel, task: params.batch[0], + initialContext: context, onProgress: params.onProgress, }); return [recovered]; @@ -51,9 +53,12 @@ async function embedSingleTaskWithRecovery(params: { ai: AiProvider; embedModel: string; task: ActiveVectorTask; + initialContext?: NonNullable>; onProgress?: (message: string) => void; }): Promise<{ task: ActiveVectorTask; embedding: number[] }> { - let current = params.task; + let current = params.initialContext + ? shrinkForRetry(params.task, { embedModel: params.embedModel, context: params.initialContext, onProgress: params.onProgress }) + : params.task; for (let attempt = 0; attempt < EMBED_CONTEXT_RETRY_ATTEMPTS; attempt += 1) { try { @@ -69,16 +74,27 @@ async function embedSingleTaskWithRecovery(params: { throw error; } - const next = shrinkEmbeddingTask(current, { embedModel: params.embedModel, context }); - if (!next || next.text === current.text) { - throw error; - } - params.onProgress?.( - `[embed] shortened #${current.threadNumber}:${current.basis} after context error est_tokens=${current.estimatedTokens}->${next.estimatedTokens}`, - ); - current = next; + current = shrinkForRetry(current, { embedModel: params.embedModel, context, onProgress: params.onProgress }); } } throw new Error(`Unable to shrink embedding input for #${params.task.threadNumber}:${params.task.basis} below model limits`); } + +function shrinkForRetry( + task: ActiveVectorTask, + params: { + embedModel: string; + context: NonNullable>; + onProgress?: (message: string) => void; + }, +): ActiveVectorTask { + const next = shrinkEmbeddingTask(task, { embedModel: params.embedModel, context: params.context }); + if (!next || next.text === task.text) { + throw new Error(`Unable to shrink embedding input for #${task.threadNumber}:${task.basis} below model limits`); + } + params.onProgress?.( + `[embed] shortened #${task.threadNumber}:${task.basis} after context error est_tokens=${task.estimatedTokens}->${next.estimatedTokens}`, + ); + return next; +} From 0b1c0967a2847c2e6269a43a396824df31cc075a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:42:50 -0700 Subject: [PATCH 189/215] refactor: extract tui widget helpers --- apps/cli/src/tui/app.ts | 132 +++--------------------------------- apps/cli/src/tui/widgets.ts | 125 ++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 122 deletions(-) create mode 100644 apps/cli/src/tui/widgets.ts diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 5e0e3f1..8aca443 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -50,6 +50,16 @@ import { resolveClusterHeaderSortFromClick, splitClusterDisplayTitle, } from './cluster-render.js'; +import { + applyRect, + createWidgets, + getListItemIndexFromMouse, + updatePaneStyles, + type MouseEventArg, + type Widgets, +} from './widgets.js'; + +export { resolveBlessedTerminal } from './widgets.js'; type StartTuiParams = { service: GHCrawlService; @@ -73,47 +83,16 @@ type RepositoryChoice = label: string; }; -type Widgets = { - screen: blessed.Widgets.Screen; - header: blessed.Widgets.BoxElement; - clusters: blessed.Widgets.ListElement; - members: blessed.Widgets.ListElement; - detail: blessed.Widgets.BoxElement; - footer: blessed.Widgets.BoxElement; -}; - type ThreadDetailCacheEntry = { detail: TuiThreadDetail; hasNeighbors: boolean; }; -type MouseEventArg = blessed.Widgets.Events.IMouseEventArg & { - button?: 'left' | 'middle' | 'right' | 'unknown'; -}; - type ContextMenuItem = { label: string; run: () => boolean | void; }; -export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { - const term = env.TERM; - if (!term) { - return undefined; - } - if (term === 'xterm-ghostty') { - return 'xterm-256color'; - } - return term; -} - -function createScreen(options: Parameters[0]): blessed.Widgets.Screen { - return blessed.screen({ - ...options, - terminal: resolveBlessedTerminal(), - }); -} - const ACTIVITY_LOG_LIMIT = 200; const FOOTER_LOG_LINES = 1; const CLUSTER_LIST_HEADER_INDEX = 0; @@ -1612,85 +1591,6 @@ export async function startTui(params: StartTuiParams): Promise { await new Promise((resolve) => widgets.screen.once('destroy', () => resolve())); } -function createWidgets(owner: string, repo: string): Widgets { - const screen = createScreen({ - smartCSR: true, - fullUnicode: true, - dockBorders: true, - autoPadding: false, - mouse: true, - title: owner && repo ? `ghcrawl ${owner}/${repo}` : 'ghcrawl', - }); - const header = blessed.box({ - parent: screen, - tags: true, - mouse: true, - style: { fg: 'white', bg: '#0d1321' }, - }); - const clusters = blessed.list({ - parent: screen, - border: 'line', - label: ' Clusters ', - tags: true, - keys: false, - mouse: true, - style: { - border: { fg: '#5bc0eb' }, - item: { fg: 'white' }, - selected: { bg: '#5bc0eb', fg: 'black', bold: true }, - }, - scrollbar: { ch: ' ' }, - }); - const members = blessed.list({ - parent: screen, - border: 'line', - label: ' Members ', - tags: true, - keys: false, - mouse: true, - style: { - border: { fg: '#9bc53d' }, - item: { fg: 'white' }, - selected: { bg: '#9bc53d', fg: 'black', bold: true }, - }, - }); - const detail = blessed.box({ - parent: screen, - border: 'line', - label: ' Detail ', - tags: true, - scrollable: true, - alwaysScroll: true, - keys: false, - mouse: true, - style: { - border: { fg: '#fde74c' }, - fg: 'white', - }, - }); - const footer = blessed.box({ - parent: screen, - tags: false, - mouse: true, - style: { fg: 'black', bg: '#5bc0eb' }, - }); - - return { screen, header, clusters, members, detail, footer }; -} - -function updatePaneStyles(widgets: Widgets, focus: TuiFocusPane): void { - widgets.clusters.setLabel(`${focus === 'clusters' ? '[*]' : '[ ]'} Clusters `); - widgets.members.setLabel(`${focus === 'members' ? '[*]' : '[ ]'} Members `); - widgets.detail.setLabel(`${focus === 'detail' ? '[*]' : '[ ]'} Detail `); - widgets.clusters.style.border = { fg: focus === 'clusters' ? 'white' : '#5bc0eb' }; - widgets.members.style.border = { fg: focus === 'members' ? 'white' : '#9bc53d' }; - widgets.detail.style.border = { fg: focus === 'detail' ? 'white' : '#fde74c' }; - widgets.clusters.style.selected = - focus === 'clusters' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#23445c', fg: 'white', bold: true }; - widgets.members.style.selected = - focus === 'members' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#33521e', fg: 'white', bold: true }; -} - function formatTuiRefreshStateKey(state: TuiRefreshState): string { return [ state.repositoryUpdatedAt ?? '', @@ -1705,18 +1605,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListElement, rect: { top: number; left: number; width: number; height: number }): void { - element.top = rect.top; - element.left = rect.left; - element.width = rect.width; - element.height = rect.height; -} - -function getListItemIndexFromMouse(list: blessed.Widgets.ListElement, event: MouseEventArg): number | null { - const itemIndex = Number(event.y) - Number(list.atop) - 2 + Number(list.getScroll()); - return Number.isInteger(itemIndex) ? itemIndex : null; -} - function openUrl(url: string): void { const launch = process.platform === 'darwin' diff --git a/apps/cli/src/tui/widgets.ts b/apps/cli/src/tui/widgets.ts new file mode 100644 index 0000000..fafd64f --- /dev/null +++ b/apps/cli/src/tui/widgets.ts @@ -0,0 +1,125 @@ +import blessed from 'neo-blessed'; + +import type { TuiFocusPane } from './state.js'; + +export type Widgets = { + screen: blessed.Widgets.Screen; + header: blessed.Widgets.BoxElement; + clusters: blessed.Widgets.ListElement; + members: blessed.Widgets.ListElement; + detail: blessed.Widgets.BoxElement; + footer: blessed.Widgets.BoxElement; +}; + +export type MouseEventArg = blessed.Widgets.Events.IMouseEventArg & { + button?: 'left' | 'middle' | 'right' | 'unknown'; +}; + +export function resolveBlessedTerminal(env: NodeJS.ProcessEnv = process.env): string | undefined { + const term = env.TERM; + if (!term) { + return undefined; + } + if (term === 'xterm-ghostty') { + return 'xterm-256color'; + } + return term; +} + +export function createScreen(options: Parameters[0]): blessed.Widgets.Screen { + return blessed.screen({ + ...options, + terminal: resolveBlessedTerminal(), + }); +} + +export function createWidgets(owner: string, repo: string): Widgets { + const screen = createScreen({ + smartCSR: true, + fullUnicode: true, + dockBorders: true, + autoPadding: false, + mouse: true, + title: owner && repo ? `ghcrawl ${owner}/${repo}` : 'ghcrawl', + }); + const header = blessed.box({ + parent: screen, + tags: true, + mouse: true, + style: { fg: 'white', bg: '#0d1321' }, + }); + const clusters = blessed.list({ + parent: screen, + border: 'line', + label: ' Clusters ', + tags: true, + keys: false, + mouse: true, + style: { + border: { fg: '#5bc0eb' }, + item: { fg: 'white' }, + selected: { bg: '#5bc0eb', fg: 'black', bold: true }, + }, + scrollbar: { ch: ' ' }, + }); + const members = blessed.list({ + parent: screen, + border: 'line', + label: ' Members ', + tags: true, + keys: false, + mouse: true, + style: { + border: { fg: '#9bc53d' }, + item: { fg: 'white' }, + selected: { bg: '#9bc53d', fg: 'black', bold: true }, + }, + }); + const detail = blessed.box({ + parent: screen, + border: 'line', + label: ' Detail ', + tags: true, + scrollable: true, + alwaysScroll: true, + keys: false, + mouse: true, + style: { + border: { fg: '#fde74c' }, + fg: 'white', + }, + }); + const footer = blessed.box({ + parent: screen, + tags: false, + mouse: true, + style: { fg: 'black', bg: '#5bc0eb' }, + }); + + return { screen, header, clusters, members, detail, footer }; +} + +export function updatePaneStyles(widgets: Widgets, focus: TuiFocusPane): void { + widgets.clusters.setLabel(`${focus === 'clusters' ? '[*]' : '[ ]'} Clusters `); + widgets.members.setLabel(`${focus === 'members' ? '[*]' : '[ ]'} Members `); + widgets.detail.setLabel(`${focus === 'detail' ? '[*]' : '[ ]'} Detail `); + widgets.clusters.style.border = { fg: focus === 'clusters' ? 'white' : '#5bc0eb' }; + widgets.members.style.border = { fg: focus === 'members' ? 'white' : '#9bc53d' }; + widgets.detail.style.border = { fg: focus === 'detail' ? 'white' : '#fde74c' }; + widgets.clusters.style.selected = + focus === 'clusters' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#23445c', fg: 'white', bold: true }; + widgets.members.style.selected = + focus === 'members' ? { bg: '#f7f7ff', fg: 'black', bold: true } : { bg: '#33521e', fg: 'white', bold: true }; +} + +export function applyRect(element: blessed.Widgets.BoxElement | blessed.Widgets.ListElement, rect: { top: number; left: number; width: number; height: number }): void { + element.top = rect.top; + element.left = rect.left; + element.width = rect.width; + element.height = rect.height; +} + +export function getListItemIndexFromMouse(list: blessed.Widgets.ListElement, event: MouseEventArg): number | null { + const itemIndex = Number(event.y) - Number(list.atop) - 2 + Number(list.getScroll()); + return Number.isInteger(itemIndex) ? itemIndex : null; +} From 46d15c71d09ce6aeff266294985cbe6724fa5cde Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:43:50 -0700 Subject: [PATCH 190/215] refactor: isolate tui platform actions --- apps/cli/src/tui/app.ts | 32 +------------------------------- apps/cli/src/tui/platform.ts | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) create mode 100644 apps/cli/src/tui/platform.ts diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 8aca443..36f673c 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -1,5 +1,3 @@ -import { spawn, spawnSync } from 'node:child_process'; - import blessed from 'neo-blessed'; import type { @@ -50,6 +48,7 @@ import { resolveClusterHeaderSortFromClick, splitClusterDisplayTitle, } from './cluster-render.js'; +import { copyTextToClipboard, openUrl } from './platform.js'; import { applyRect, createWidgets, @@ -1605,35 +1604,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -function openUrl(url: string): void { - const launch = - process.platform === 'darwin' - ? { command: 'open', args: [url] } - : process.platform === 'win32' - ? { command: 'cmd', args: ['/c', 'start', '', url] } - : { command: 'xdg-open', args: [url] }; - const child = spawn(launch.command, launch.args, { - detached: true, - stdio: 'ignore', - windowsVerbatimArguments: process.platform === 'win32', - }); - child.unref(); -} - -function copyTextToClipboard(value: string): boolean { - const copyCommand = - process.platform === 'darwin' - ? { command: 'pbcopy', args: [] } - : process.platform === 'win32' - ? { command: 'clip', args: [] } - : { command: 'xclip', args: ['-selection', 'clipboard'] }; - const result = spawnSync(copyCommand.command, copyCommand.args, { - input: value, - stdio: ['pipe', 'ignore', 'ignore'], - }); - return result.status === 0; -} - export function buildHelpContent(): string { return [ '{bold}ghcrawl TUI Help{/bold}', diff --git a/apps/cli/src/tui/platform.ts b/apps/cli/src/tui/platform.ts new file mode 100644 index 0000000..8251050 --- /dev/null +++ b/apps/cli/src/tui/platform.ts @@ -0,0 +1,30 @@ +import { spawn, spawnSync } from 'node:child_process'; + +export function openUrl(url: string): void { + const launch = + process.platform === 'darwin' + ? { command: 'open', args: [url] } + : process.platform === 'win32' + ? { command: 'cmd', args: ['/c', 'start', '', url] } + : { command: 'xdg-open', args: [url] }; + const child = spawn(launch.command, launch.args, { + detached: true, + stdio: 'ignore', + windowsVerbatimArguments: process.platform === 'win32', + }); + child.unref(); +} + +export function copyTextToClipboard(value: string): boolean { + const copyCommand = + process.platform === 'darwin' + ? { command: 'pbcopy', args: [] } + : process.platform === 'win32' + ? { command: 'clip', args: [] } + : { command: 'xclip', args: ['-selection', 'clipboard'] }; + const result = spawnSync(copyCommand.command, copyCommand.args, { + input: value, + stdio: ['pipe', 'ignore', 'ignore'], + }); + return result.status === 0; +} From 098a7c01bc484edd3f2c0d0e934af209cc4a9f69 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:45:10 -0700 Subject: [PATCH 191/215] fix: drop stale portable sync sidecars --- packages/api-core/src/portable/export.ts | 10 ++++++++-- packages/api-core/src/service.test.ts | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/portable/export.ts b/packages/api-core/src/portable/export.ts index 884e9e3..aa2ca58 100644 --- a/packages/api-core/src/portable/export.ts +++ b/packages/api-core/src/portable/export.ts @@ -65,9 +65,10 @@ export function exportPortableSyncDatabase(params: PortableSyncExportOptions): P } out.close(); + removeSqliteSidecars(outputPath); fs.renameSync(tmpPath, outputPath); - fs.rmSync(`${tmpPath}-wal`, { force: true }); - fs.rmSync(`${tmpPath}-shm`, { force: true }); + removeSqliteSidecars(tmpPath); + removeSqliteSidecars(outputPath); const outputBytes = fs.statSync(outputPath).size; const sourceBytes = fs.statSync(sourcePath).size + fileSize(`${sourcePath}-wal`) + fileSize(`${sourcePath}-shm`); @@ -240,3 +241,8 @@ function writePortableSyncManifest(outputPath: string, manifest: PortableSyncMan fs.writeFileSync(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`); return manifestPath; } + +function removeSqliteSidecars(dbPath: string): void { + fs.rmSync(`${dbPath}-wal`, { force: true }); + fs.rmSync(`${dbPath}-shm`, { force: true }); +} diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 5f1e947..336333a 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -171,6 +171,8 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl const config = makeTestConfig(); const sourcePath = path.join(config.configDir, 'source.db'); const outputPath = path.join(config.configDir, 'openclaw.sync.db'); + fs.writeFileSync(`${outputPath}-wal`, 'stale wal'); + fs.writeFileSync(`${outputPath}-shm`, 'stale shm'); const service = new GHCrawlService({ config: { ...config, @@ -320,6 +322,8 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.ok(response.excluded.includes('documents')); assert.ok(response.excluded.includes('thread_vectors')); assert.equal(response.tables.find((table) => table.name === 'threads')?.rows, 1); + assert.equal(fs.existsSync(`${outputPath}-wal`), false); + assert.equal(fs.existsSync(`${outputPath}-shm`), false); const validation = service.validatePortableSync(outputPath); assert.equal(validation.ok, true); From 136013458de7bc53e8d198df9e5254e5f9d956e9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:46:10 -0700 Subject: [PATCH 192/215] docs: refresh implementation plan status --- docs/PLAN.md | 96 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/docs/PLAN.md b/docs/PLAN.md index df60ddb..b26d3e7 100644 --- a/docs/PLAN.md +++ b/docs/PLAN.md @@ -14,12 +14,14 @@ - CLI is the only supported runtime host in V1. - Web is deferred and must stay HTTP-only against the local API boundary. - SQLite is the canonical store. -- Exact local cosine similarity is the active kNN plan. -- OpenSearch is explicitly deferred until local exact search proves insufficient. -- Sync is open-only. +- Persistent `vectorlite` sidecar search is the active kNN plan. +- OpenSearch is explicitly deferred; it is not on the supported runtime path. +- Sync is metadata-first and open-focused, with stale-open closure reconciliation on full unfiltered crawls. - Sync is metadata-only by default. - `sync --include-comments` is optional deeper hydration, not the default path. - Filtered crawls like `--limit` and `--since` do not perform stale-open reconciliation. +- Durable cluster identities are canonical. Maintainer overrides are sticky and must survive reclustering. +- Portable git-sync exports are the supported way to share compact state. The live DB is a cache and is intentionally not sync-friendly. ## Phase 0: Bootstrap @@ -37,7 +39,7 @@ - [x] Read `GITHUB_TOKEN` and fail clearly when missing. - [x] Read `OPENAI_API_KEY` and fail clearly when missing for OpenAI-dependent commands. - [x] Define `GHCRAWL_DB_PATH`, `GHCRAWL_API_PORT`, `GHCRAWL_SUMMARY_MODEL`, and `GHCRAWL_EMBED_MODEL`. -- [ ] Decide whether to add a persisted runtime config file now or after first sync works. +- [x] Add a persisted runtime config file for model, embedding, vector, and per-repo TUI preferences. - [x] Implement `doctor` checks for env vars, SQLite path creation, and optional OpenSearch reachability. - [x] Testing goal: config unit tests cover defaults, missing env vars, and override behavior. @@ -54,9 +56,9 @@ - [x] Add positional `owner/repo` CLI syntax. - [x] Add filtered crawls with `--since` and `--limit`. - [x] Make comment, review, and review-comment hydration opt-in with `--include-comments`. -- [ ] Implement durable incremental checkpoints/cursors instead of relying only on `--since`. +- [x] Persist durable sync checkpoints for full scans and overlapping closure sweeps. - [ ] Decide whether to persist GitHub ETags or GraphQL cursors for cheaper refreshes. -- [ ] Add a dedicated `refresh-closed` or equivalent command if full open reconciliation becomes too slow on large repos. +- [ ] Add a dedicated `refresh-closed` or equivalent command only if overlap/direct reconciliation becomes too slow on large repos. - [ ] Testing goal: add fixture-backed sync tests for idempotency, repeated refreshes, and partial-failure resume behavior. ## Phase 3: Document Building And Summaries @@ -77,23 +79,25 @@ ## Phase 4: Embeddings And Similarity Search -- [x] Implement embedding generation with `text-embedding-3-small` by default. -- [x] Persist embeddings in SQLite first. -- [x] Implement exact cosine similarity search in process. +- [x] Implement embedding generation with OpenAI embeddings. +- [x] Move active vectors to one vector per open thread. +- [x] Persist active vectors in a repository-scoped `vectorlite` sidecar instead of the main SQLite DB. +- [x] Keep legacy SQLite embedding rows as migration input only, then purge rebuildable vector payloads. +- [x] Implement vector search and neighbor lookup through `vectorlite`. - [x] Add `embed` and `search` CLI commands. -- [ ] Measure local performance on a realistic fixture corpus and capture the numbers in docs. -- [ ] Add retry/batching observability around embeddings and summaries so long runs are easier to operate. -- [ ] Design a clean backend abstraction if we later want to swap exact local search with OpenSearch-backed ANN. -- [ ] Testing goal: expand embedding job tests to cover retries, batching behavior, and unchanged-row skips more explicitly. +- [x] Add retry/batching recovery around oversized embedding inputs. +- [x] Add tests for batching, unchanged-row skips, closed-vector pruning, corrupted sidecar rebuild, and retry shrink behavior. +- [ ] Capture current large-repo timing numbers in docs from the latest `openclaw/openclaw` run. +- [ ] Keep the vector store interface narrow enough that a future backend can be swapped without leaking raw SQL into service code. Decision note: -- this phase is the primary kNN path for the foreseeable future -- do not block on Docker, OpenSearch, Lucene, or Faiss +- `vectorlite` sidecar search is the primary kNN path for the foreseeable future +- do not block normal operation on Docker, OpenSearch, Lucene, or Faiss ## Phase 5: OpenSearch Evaluation And Optional Backend -- [ ] Add a local recipe for OpenSearch 3.3 only if local exact search is proven inadequate. +- [ ] Add a local recipe for OpenSearch 3.3 only if `vectorlite` search is proven inadequate. - [ ] Implement OpenSearch index creation using `knn_vector`. - [ ] Start with Lucene/HNSW as the default OpenSearch backend. - [ ] Support metadata filters in vector search. @@ -104,32 +108,33 @@ Decision note: Decision note: - this phase is explicitly deferred -- only start it after exact local similarity is measured and shown to be insufficient +- only start it after the supported `vectorlite` sidecar path is measured and shown to be insufficient ## Phase 6: Clustering - [x] Implement a first clustering pass based on nearest-neighbor edges plus connected components. - [x] Persist similarity edges, clusters, and cluster members. - [x] Add `cluster` CLI command. -- [ ] Tune similarity thresholds and metadata boosts using real repo output. -- [ ] Improve representative-thread selection and cluster explanation quality. -- [ ] Decide whether issue-to-PR clustering needs different thresholds than issue-to-issue and PR-to-PR. -- [ ] Test on a real or sanitized fixture corpus to inspect false positives and false negatives. -- [ ] Testing goal: add golden cluster fixtures proving known related threads end up together. +- [x] Add deterministic fingerprints based on normalized text, MinHash/SimHash-style signals, linked refs, files, module buckets, and hunk signatures. +- [x] Make clustering work without embeddings or LLM summaries; model output only enriches the evidence. +- [x] Add durable cluster governance: stable slugs, aliases, manual include/exclude/canonical overrides, merge, split, and close. +- [x] Tune thresholds and metadata/file/LLM weights against real `openclaw/openclaw` output. +- [x] Preserve closed and manually closed clusters in operator views by default. +- [ ] Keep refining representative-thread selection and cluster explanation quality. +- [ ] Add a small golden fixture suite for known true-positive and false-positive clusters. -## Phase 7: API And Future UI +## Phase 7: API, TUI, And Future Web UI - [x] Implement local API endpoints for health, repositories, threads, search, clusters, and rerun actions. - [x] Keep the HTTP API hosted in-process by the CLI rather than as a separate daemon. - [x] Preserve package boundaries so future web code stays HTTP-only and does not import `api-core`. -- [ ] Add any missing read endpoints we want before UI work: - - neighbors - - [x] run history - - thread detail with summaries and optional hydrated comments +- [x] Add read endpoints and service methods for neighbors, run history, thread detail, cluster detail, durable clusters, and cluster evidence. +- [x] Build the local TUI as the primary V1 browsing UI. +- [x] Add TUI support for stable cluster names, closed-member display, markdown-ish detail previews, right-click menus, copy/open actions, pane focus, mouse selection, and per-repo preferences. - [ ] Build the deferred Vite web app only after the API shape settles. - [ ] Use `shadcn/ui` primitives with a custom visual system rather than stock styling. - [ ] Add filters for repo, item type, state, label, and cluster size. -- [ ] Add detail panels that show raw text, summaries, nearest neighbors, and cluster membership. +- [x] Add TUI detail panels that show thread metadata, LLM key summaries, top files, main preview, links, and cluster membership. - [ ] Add a search view with keyword, semantic, and hybrid modes. - [ ] Add status indicators for sync freshness and model/index freshness. - [ ] Testing goal: UI smoke tests prove the main list, detail, and search views render from seeded local data. @@ -137,24 +142,32 @@ Decision note: ## Phase 8: Hardening - [x] Persist run-history tables for sync, summarize, embed, and cluster. -- [ ] Add more structured logs and progress summaries for summarize, embed, and cluster. -- [ ] Add failure recovery for partial enrichment runs. -- [ ] Add export/report helpers for maintainers to share cluster results. +- [x] Add structured progress summaries for sync, embed, cluster, refresh, and storage optimization. +- [x] Add recovery behavior for partial enrichment runs through content hashes, current vector metadata, and sidecar rebuild. +- [x] Add export/report helpers for maintainers to share cluster results and compact portable state. - [ ] Revisit model defaults and prompt budget after real data review. -- [ ] Decide whether per-repo config files are needed. -- [ ] Add database maintenance helpers: +- [x] Add per-repo persisted TUI preferences. +- [x] Add database maintenance helpers: - vacuum/cleanup - - prune stale summaries/embeddings - - optional reset commands scoped by repo + - WAL checkpoints + - planner stats refresh + - vector sidecar maintenance +- [x] Add portable git-sync commands: + - `export-sync` + - `validate-sync` + - `portable-size` + - `sync-status` + - `import-sync` - [ ] Testing goal: end-to-end local workflow test covers `doctor`, `sync`, `summarize`, `embed`, `cluster`, and `serve`. ## Immediate Next Focus -- [ ] Run a real full open-only crawl against `openclaw/openclaw` and inspect what the current metadata-first corpus looks like. -- [ ] Review search quality on real examples before spending more tokens on broad summarization/embedding runs. -- [ ] Decide whether default dedupe quality is good enough from title/body/labels alone, or whether we need selective comment hydration. -- [ ] Add progress output for summarize, embed, and cluster similar to sync. -- [ ] Capture a short operator guide for “full crawl vs filtered crawl vs include-comments crawl”. +- [x] Run real large `openclaw/openclaw` crawls, embeddings, summaries, clustering, closure refreshes, and storage optimization. +- [x] Tune cluster quality on real output and validate in the TUI. +- [x] Capture operator docs for refresh, manual pipeline control, closed clusters, durable overrides, portable git-sync export, and optimize. +- [ ] Finish service decomposition so `service.ts`, `apps/cli/src/main.ts`, and `apps/cli/src/tui/app.ts` stay small enough to maintain. +- [ ] Add focused tests around portable import conflict handling and sync drift reporting. +- [ ] Add a release-readiness pass for packaged `vectorlite` installs across supported Node versions. ## Recommended Execution Order @@ -162,4 +175,5 @@ Decision note: - [x] Prove GitHub sync into SQLite before any UI work. - [x] Prove document building before embeddings. - [x] Prove exact local similarity before OpenSearch. -- [ ] Tune clustering quality before polishing the UI. +- [x] Tune clustering quality before polishing the TUI. +- [ ] Keep refactoring service/TUI/CLI command surfaces in small commits until the core files stop carrying unrelated responsibilities. From 4b441e9026eae442afa29a7298f370145a45cf96 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:47:48 -0700 Subject: [PATCH 193/215] test: preserve rich cache on sync import --- packages/api-core/src/service.test.ts | 61 +++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 336333a..413fd34 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -412,6 +412,67 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl } finally { importService.close(); } + + const richImportService = new GHCrawlService({ + config: { + ...config, + dbPath: path.join(config.configDir, 'rich-import-target.db'), + }, + github: service.github, + }); + try { + const richerBody = `${longBody}\nextra locally hydrated body`; + const richerRaw = JSON.stringify({ payload: 'already-hydrated' }); + richImportService.db + .prepare( + `insert into repositories (id, owner, name, full_name, github_repo_id, raw_json, updated_at) + values (?, ?, ?, ?, ?, ?, ?)`, + ) + .run(1, 'openclaw', 'openclaw', 'openclaw/openclaw', '1', richerRaw, now); + richImportService.db + .prepare( + `insert into threads ( + repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 1, + '100', + 42, + 'issue', + 'open', + 'Gateway crash', + richerBody, + 'alice', + 'User', + 'https://github.com/openclaw/openclaw/issues/42', + '["bug"]', + '[]', + richerRaw, + 'content-hash', + 0, + now, + now, + null, + null, + now, + now, + now, + ); + + const importResult = richImportService.importPortableSync(outputPath); + assert.equal(importResult.ok, true); + const importedThread = richImportService.db.prepare('select body, raw_json from threads where number = 42').get() as { + body: string; + raw_json: string; + }; + assert.equal(importedThread.body, richerBody); + assert.equal(importedThread.raw_json, richerRaw); + } finally { + richImportService.close(); + } } finally { service.close(); } From e176c4da5634def9f74a632ed3e7cbf73b820e6e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:48:51 -0700 Subject: [PATCH 194/215] refactor: extract tui help modal --- apps/cli/src/tui/app.ts | 140 +------------------------------------- apps/cli/src/tui/help.ts | 141 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+), 138 deletions(-) create mode 100644 apps/cli/src/tui/help.ts diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 36f673c..897779e 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -48,6 +48,7 @@ import { resolveClusterHeaderSortFromClick, splitClusterDisplayTitle, } from './cluster-render.js'; +import { promptHelp } from './help.js'; import { copyTextToClipboard, openUrl } from './platform.js'; import { applyRect, @@ -59,6 +60,7 @@ import { } from './widgets.js'; export { resolveBlessedTerminal } from './widgets.js'; +export { buildHelpContent } from './help.js'; type StartTuiParams = { service: GHCrawlService; @@ -1604,144 +1606,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -export function buildHelpContent(): string { - return [ - '{bold}ghcrawl TUI Help{/bold}', - '', - '{bold}Navigation{/bold}', - 'Tab / Shift-Tab cycle focus across clusters, members, and detail', - 'Left / Right cycle focus backward or forward across panes', - 'Up / Down move selection, or scroll detail when detail is focused', - 'Enter clusters -> members, members -> detail', - 'Mouse click to focus/select; click list headers to sort; right-click opens pane actions; wheel scrolls', - 'PgUp / PgDn page through the focused pane or this help popup faster', - 'Home / End jump to the top or bottom of detail or help', - '', - '{bold}Views And Filters{/bold}', - '# jump directly to an issue or PR number', - 's cycle cluster sort mode', - 'm cycle member sort mode', - 'f cycle minimum cluster size filter', - 'l toggle wide layout: columns vs. wide-left stacked-right', - 'x show or hide locally closed clusters and members', - '/ filter clusters by title/member text', - 'r refresh the current local view from SQLite', - '', - '{bold}Actions{/bold}', - 'p open the repository browser / select another local repository', - 'o open the selected thread URL in your browser', - '', - '{bold}Help And Exit{/bold}', - 'h or ? open this help popup', - 'q quit the TUI or close this popup', - 'Esc close this popup', - '', - '{bold}Notes{/bold}', - 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', - 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', - 'The default sort is size. Press s to toggle size and recent.', - 'Member rows default to issue/PR grouping. Press m or click the member header to sort by updated, number, state, or title.', - 'Mouse clicks focus panes; clicking an already selected row advances to the next pane. Right-click works on every pane.', - 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', - 'The footer only shows the short command list. Open help to see the full list.', - 'This popup scrolls. Use arrows, PgUp/PgDn, Home, and End if it does not fit.', - ].join('\n'); -} - -async function promptHelp(screen: blessed.Widgets.Screen): Promise { - const modalWidth = '86%'; - const box = blessed.box({ - parent: screen, - border: 'line', - label: ' Help ', - tags: true, - scrollable: true, - alwaysScroll: true, - keys: true, - vi: true, - mouse: true, - top: 'center', - left: 'center', - width: modalWidth, - height: '80%', - padding: { - left: 1, - right: 1, - }, - scrollbar: { - ch: ' ', - }, - style: { - border: { fg: '#5bc0eb' }, - fg: 'white', - bg: '#101522', - scrollbar: { bg: '#5bc0eb' }, - }, - content: buildHelpContent(), - }); - const help = blessed.box({ - parent: screen, - width: modalWidth, - height: 1, - bottom: 1, - left: 'center', - tags: false, - content: 'Scroll with arrows, PgUp/PgDn, Home, End. Press Esc, q, h, ?, or Enter to close.', - style: { fg: 'black', bg: '#5bc0eb' }, - }); - - box.focus(); - box.setScroll(0); - screen.render(); - - return await new Promise((resolve) => { - let closed = false; - const finish = (): void => { - if (closed) return; - closed = true; - screen.off('keypress', handleKeypress); - screen.off('mousedown', handleMouse); - box.destroy(); - help.destroy(); - screen.render(); - resolve(); - }; - const handleKeypress = (char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { - if (key.name === 'escape' || key.name === 'enter' || key.name === 'q' || key.name === 'h' || char === '?') { - finish(); - return; - } - if (key.name === 'pageup') { - box.scroll(-12); - screen.render(); - return; - } - if (key.name === 'pagedown') { - box.scroll(12); - screen.render(); - return; - } - if (key.name === 'home') { - box.setScroll(0); - screen.render(); - return; - } - if (key.name === 'end') { - box.setScrollPerc(100); - screen.render(); - } - }; - const handleMouse = (event: MouseEventArg): void => { - if (event.button === 'right') { - finish(); - } - }; - - screen.on('keypress', handleKeypress); - screen.on('mousedown', handleMouse); - }); -} - export function getRepositoryChoices(service: Pick, now: Date = new Date()): RepositoryChoice[] { const repositories = service.listRepositories().repositories .slice() diff --git a/apps/cli/src/tui/help.ts b/apps/cli/src/tui/help.ts new file mode 100644 index 0000000..d2ea246 --- /dev/null +++ b/apps/cli/src/tui/help.ts @@ -0,0 +1,141 @@ +import blessed from 'neo-blessed'; + +import type { MouseEventArg } from './widgets.js'; + +export function buildHelpContent(): string { + return [ + '{bold}ghcrawl TUI Help{/bold}', + '', + '{bold}Navigation{/bold}', + 'Tab / Shift-Tab cycle focus across clusters, members, and detail', + 'Left / Right cycle focus backward or forward across panes', + 'Up / Down move selection, or scroll detail when detail is focused', + 'Enter clusters -> members, members -> detail', + 'Mouse click to focus/select; click list headers to sort; right-click opens pane actions; wheel scrolls', + 'PgUp / PgDn page through the focused pane or this help popup faster', + 'Home / End jump to the top or bottom of detail or help', + '', + '{bold}Views And Filters{/bold}', + '# jump directly to an issue or PR number', + 's cycle cluster sort mode', + 'm cycle member sort mode', + 'f cycle minimum cluster size filter', + 'l toggle wide layout: columns vs. wide-left stacked-right', + 'x show or hide locally closed clusters and members', + '/ filter clusters by title/member text', + 'r refresh the current local view from SQLite', + '', + '{bold}Actions{/bold}', + 'p open the repository browser / select another local repository', + 'o open the selected thread URL in your browser', + '', + '{bold}Help And Exit{/bold}', + 'h or ? open this help popup', + 'q quit the TUI or close this popup', + 'Esc close this popup', + '', + '{bold}Notes{/bold}', + 'The TUI only reads local SQLite. Run ghcrawl sync, ghcrawl embed, and ghcrawl cluster from the shell to update data.', + 'The default cluster filter is 1+, so solo clusters are visible unless you raise it with f.', + 'The default sort is size. Press s to toggle size and recent.', + 'Member rows default to issue/PR grouping. Press m or click the member header to sort by updated, number, state, or title.', + 'Mouse clicks focus panes; clicking an already selected row advances to the next pane. Right-click works on every pane.', + 'Clusters show C so the cluster id is easy to copy into CLI or skill flows.', + 'The footer only shows the short command list. Open help to see the full list.', + 'This popup scrolls. Use arrows, PgUp/PgDn, Home, and End if it does not fit.', + ].join('\n'); +} + +export async function promptHelp(screen: blessed.Widgets.Screen): Promise { + const modalWidth = '86%'; + const box = blessed.box({ + parent: screen, + border: 'line', + label: ' Help ', + tags: true, + scrollable: true, + alwaysScroll: true, + keys: true, + vi: true, + mouse: true, + top: 'center', + left: 'center', + width: modalWidth, + height: '80%', + padding: { + left: 1, + right: 1, + }, + scrollbar: { + ch: ' ', + }, + style: { + border: { fg: '#5bc0eb' }, + fg: 'white', + bg: '#101522', + scrollbar: { bg: '#5bc0eb' }, + }, + content: buildHelpContent(), + }); + const help = blessed.box({ + parent: screen, + width: modalWidth, + height: 1, + bottom: 1, + left: 'center', + tags: false, + content: 'Scroll with arrows, PgUp/PgDn, Home, End. Press Esc, q, h, ?, or Enter to close.', + style: { fg: 'black', bg: '#5bc0eb' }, + }); + + box.focus(); + box.setScroll(0); + screen.render(); + + return await new Promise((resolve) => { + let closed = false; + const finish = (): void => { + if (closed) return; + closed = true; + screen.off('keypress', handleKeypress); + screen.off('mousedown', handleMouse); + box.destroy(); + help.destroy(); + screen.render(); + resolve(); + }; + const handleKeypress = (char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { + if (key.name === 'escape' || key.name === 'enter' || key.name === 'q' || key.name === 'h' || char === '?') { + finish(); + return; + } + if (key.name === 'pageup') { + box.scroll(-12); + screen.render(); + return; + } + if (key.name === 'pagedown') { + box.scroll(12); + screen.render(); + return; + } + if (key.name === 'home') { + box.setScroll(0); + screen.render(); + return; + } + if (key.name === 'end') { + box.setScrollPerc(100); + screen.render(); + } + }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(); + } + }; + + screen.on('keypress', handleKeypress); + screen.on('mousedown', handleMouse); + }); +} From 563e361190ff97b9a089318bded0387a3e07098d Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:53:27 -0700 Subject: [PATCH 195/215] refactor: extract cli report formatting --- apps/cli/src/main.ts | 128 +--------------------------------------- apps/cli/src/reports.ts | 126 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+), 125 deletions(-) create mode 100644 apps/cli/src/reports.ts diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 61d8f68..02fbe2d 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -16,6 +16,7 @@ import { type LoadConfigOptions, } from '@ghcrawl/api-core'; import { createHeapDiagnostics, type HeapDiagnostics } from './heap-diagnostics.js'; +import { buildConfigureReport, formatConfigureReport, formatDoctorReport, type DoctorReport } from './reports.js'; import { startTui } from './tui/app.js'; type CommandName = @@ -64,31 +65,6 @@ type CommandSpec = { agentJson?: boolean; }; -type DoctorResult = Awaited>; -type DoctorReport = DoctorResult & { - version: string; - vectorlite?: { - configured: boolean; - runtimeOk: boolean; - error: string | null; - }; -}; - -type ConfigureReport = { - configPath: string; - updated: boolean; - summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; - embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; - vectorBackend: 'vectorlite'; - costEstimateUsd: { - sampleThreads: number; - pricingDate: string; - gpt54: number | null; - gpt5Mini: number; - gpt54Mini: number; - }; -}; - type ParsedGlobalFlags = { argv: string[]; devMode: boolean; @@ -761,10 +737,6 @@ function writeProgress(message: string, stderr: NodeJS.WritableStream): void { stderr.write(`${formatLogLine(message)}\n`); } -function formatBooleanStatus(value: boolean): string { - return value ? 'yes' : 'no'; -} - function parsePositiveInteger(name: string, value: string, command: CommandName): number { const parsed = Number(value); if (!Number.isSafeInteger(parsed) || parsed <= 0) { @@ -802,102 +774,6 @@ function parseEnum(command: CommandName, flagName: string, val throw new CliUsageError(`Invalid --${flagName}: ${value}. Use one of ${allowed.join(', ')}.`, command); } -function buildConfigureReport(options: { - configPath: string; - updated: boolean; - summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; - embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; - vectorBackend: 'vectorlite'; -}): ConfigureReport { - return { - ...options, - costEstimateUsd: { - sampleThreads: 20_000, - pricingDate: 'April 1, 2026', - gpt54: null, - gpt5Mini: 12, - gpt54Mini: 30, - }, - }; -} - -export function formatDoctorReport(result: DoctorReport): string { - const lines = [ - 'ghcrawl doctor', - `version: ${result.version}`, - '', - 'Health', - ` ok: ${formatBooleanStatus(result.health.ok)}`, - ` config path: ${result.health.configPath}`, - ` config file exists: ${formatBooleanStatus(result.health.configFileExists)}`, - ` db path: ${result.health.dbPath}`, - ` api port: ${result.health.apiPort}`, - '', - 'GitHub', - ` configured: ${formatBooleanStatus(result.github.configured)}`, - ` source: ${result.github.source}`, - ` token present: ${formatBooleanStatus(result.github.tokenPresent)}`, - ]; - if (result.github.error) { - lines.push(` note: ${result.github.error}`); - } - lines.push( - '', - 'OpenAI', - ` configured: ${formatBooleanStatus(result.openai.configured)}`, - ` source: ${result.openai.source}`, - ` token present: ${formatBooleanStatus(result.openai.tokenPresent)}`, - ); - if (result.openai.error) { - lines.push(` note: ${result.openai.error}`); - } - lines.push( - '', - 'Vectorlite', - ` configured: ${formatBooleanStatus(result.vectorlite?.configured ?? false)}`, - ` runtime ok: ${formatBooleanStatus(result.vectorlite?.runtimeOk ?? false)}`, - ); - if (result.vectorlite?.error) { - lines.push(` note: ${result.vectorlite.error}`); - } - return `${lines.join('\n')}\n`; -} - -export function formatConfigureReport(result: ConfigureReport): string { - const basisLabel = - result.embeddingBasis === 'title_summary' - ? 'title + dedupe summary' - : result.embeddingBasis === 'llm_key_summary' - ? 'title + structured LLM key summary' - : 'title + original body'; - const summaryModeNote = - result.embeddingBasis === 'title_summary' - ? 'enabled automatically during refresh' - : result.embeddingBasis === 'llm_key_summary' - ? 'requires key-summaries before embedding' - : 'disabled by default; enable title_summary or llm_key_summary before embedding'; - const lines = [ - 'ghcrawl configure', - `config path: ${result.configPath}`, - `updated: ${result.updated ? 'yes' : 'no'}`, - '', - 'Active settings', - ` summary model: ${result.summaryModel}`, - ` embedding basis: ${result.embeddingBasis} (${basisLabel})`, - ` llm summaries: ${summaryModeNote}`, - ` vector backend: ${result.vectorBackend}`, - '', - `Estimated one-time summary cost for ~${result.costEstimateUsd.sampleThreads.toLocaleString()} threads`, - ` pricing date: ${result.costEstimateUsd.pricingDate}`, - ` gpt-5.4: ${result.costEstimateUsd.gpt54 === null ? 'not estimated locally' : `~$${result.costEstimateUsd.gpt54.toFixed(0)} USD`}`, - ` gpt-5-mini: ~$${result.costEstimateUsd.gpt5Mini.toFixed(0)} USD`, - ` gpt-5.4-mini: ~$${result.costEstimateUsd.gpt54Mini.toFixed(0)} USD`, - '', - 'Changing summary model or embedding basis will make the next refresh rebuild vectors and clusters.', - ]; - return `${lines.join('\n')}\n`; -} - function closeService(service: GHCrawlService | null): void { if (service) { service.close(); @@ -1625,6 +1501,8 @@ if (import.meta.url === `file://${process.argv[1]}`) { } } +export { formatConfigureReport, formatDoctorReport } from './reports.js'; + function loadCliVersion(): string { const here = path.dirname(fileURLToPath(import.meta.url)); const packageJsonPath = path.resolve(here, '..', 'package.json'); diff --git a/apps/cli/src/reports.ts b/apps/cli/src/reports.ts new file mode 100644 index 0000000..b4df04e --- /dev/null +++ b/apps/cli/src/reports.ts @@ -0,0 +1,126 @@ +import type { GHCrawlService } from '@ghcrawl/api-core'; + +export type DoctorResult = Awaited>; +export type DoctorReport = DoctorResult & { + version: string; + vectorlite?: { + configured: boolean; + runtimeOk: boolean; + error: string | null; + }; +}; + +export type ConfigureReport = { + configPath: string; + updated: boolean; + summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; + embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; + vectorBackend: 'vectorlite'; + costEstimateUsd: { + sampleThreads: number; + pricingDate: string; + gpt54: number | null; + gpt5Mini: number; + gpt54Mini: number; + }; +}; + +export function buildConfigureReport(options: { + configPath: string; + updated: boolean; + summaryModel: 'gpt-5.4' | 'gpt-5-mini' | 'gpt-5.4-mini'; + embeddingBasis: 'title_original' | 'title_summary' | 'llm_key_summary'; + vectorBackend: 'vectorlite'; +}): ConfigureReport { + return { + ...options, + costEstimateUsd: { + sampleThreads: 20_000, + pricingDate: 'April 1, 2026', + gpt54: null, + gpt5Mini: 12, + gpt54Mini: 30, + }, + }; +} + +export function formatDoctorReport(result: DoctorReport): string { + const lines = [ + 'ghcrawl doctor', + `version: ${result.version}`, + '', + 'Health', + ` ok: ${formatBooleanStatus(result.health.ok)}`, + ` config path: ${result.health.configPath}`, + ` config file exists: ${formatBooleanStatus(result.health.configFileExists)}`, + ` db path: ${result.health.dbPath}`, + ` api port: ${result.health.apiPort}`, + '', + 'GitHub', + ` configured: ${formatBooleanStatus(result.github.configured)}`, + ` source: ${result.github.source}`, + ` token present: ${formatBooleanStatus(result.github.tokenPresent)}`, + ]; + if (result.github.error) { + lines.push(` note: ${result.github.error}`); + } + lines.push( + '', + 'OpenAI', + ` configured: ${formatBooleanStatus(result.openai.configured)}`, + ` source: ${result.openai.source}`, + ` token present: ${formatBooleanStatus(result.openai.tokenPresent)}`, + ); + if (result.openai.error) { + lines.push(` note: ${result.openai.error}`); + } + lines.push( + '', + 'Vectorlite', + ` configured: ${formatBooleanStatus(result.vectorlite?.configured ?? false)}`, + ` runtime ok: ${formatBooleanStatus(result.vectorlite?.runtimeOk ?? false)}`, + ); + if (result.vectorlite?.error) { + lines.push(` note: ${result.vectorlite.error}`); + } + return `${lines.join('\n')}\n`; +} + +export function formatConfigureReport(result: ConfigureReport): string { + const basisLabel = + result.embeddingBasis === 'title_summary' + ? 'title + dedupe summary' + : result.embeddingBasis === 'llm_key_summary' + ? 'title + structured LLM key summary' + : 'title + original body'; + const summaryModeNote = + result.embeddingBasis === 'title_summary' + ? 'enabled automatically during refresh' + : result.embeddingBasis === 'llm_key_summary' + ? 'requires key-summaries before embedding' + : 'disabled by default; enable title_summary or llm_key_summary before embedding'; + const lines = [ + 'ghcrawl configure', + `config path: ${result.configPath}`, + `updated: ${result.updated ? 'yes' : 'no'}`, + '', + 'Active settings', + ` summary model: ${result.summaryModel}`, + ` embedding basis: ${result.embeddingBasis} (${basisLabel})`, + ` llm summaries: ${summaryModeNote}`, + ` vector backend: ${result.vectorBackend}`, + '', + `Estimated one-time summary cost for ~${result.costEstimateUsd.sampleThreads.toLocaleString()} threads`, + ` pricing date: ${result.costEstimateUsd.pricingDate}`, + ` gpt-5.4: ${result.costEstimateUsd.gpt54 === null ? 'not estimated locally' : `~$${result.costEstimateUsd.gpt54.toFixed(0)} USD`}`, + ` gpt-5-mini: ~$${result.costEstimateUsd.gpt5Mini.toFixed(0)} USD`, + ` gpt-5.4-mini: ~$${result.costEstimateUsd.gpt54Mini.toFixed(0)} USD`, + '', + 'Changing summary model or embedding basis will make the next refresh rebuild vectors and clusters.', + ]; + return `${lines.join('\n')}\n`; +} + +function formatBooleanStatus(value: boolean): string { + return value ? 'yes' : 'no'; +} From a5d2ed41c72b394121f0e926c116bad1d18bd3ce Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:55:08 -0700 Subject: [PATCH 196/215] refactor: extract thread listing query --- packages/api-core/src/service.ts | 52 ++----------------- packages/api-core/src/threads/list.ts | 73 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 47 deletions(-) create mode 100644 packages/api-core/src/threads/list.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index ac566d6..5a0e36c 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -25,7 +25,6 @@ import { repositoriesResponseSchema, searchResponseSchema, syncResultSchema, - threadsResponseSchema, type ActionRequest, type ActionResponse, type CloseResponse, @@ -57,7 +56,6 @@ import { type SetClusterCanonicalRequest, type SplitClusterRequest, type SyncResultDto, - type ThreadDto, type ThreadsResponse, } from '@ghcrawl/api-contract'; @@ -145,6 +143,7 @@ import { persistThreadCodeSnapshot, upsertRepository, upsertThread } from './syn import { applyClosedOverlapSweep, countStaleOpenThreads, reconcileMissingOpenThreads } from './sync/reconcile.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { compareTuiClusterSummary } from './tui/cluster-format.js'; +import { listRepositoryThreads } from './threads/list.js'; import { getDurableTuiClusterSummary, getRawTuiClusterSummary, @@ -302,52 +301,11 @@ export class GHCrawlService { listThreads(params: { owner: string; repo: string; kind?: 'issue' | 'pull_request'; numbers?: number[]; includeClosed?: boolean }): ThreadsResponse { const repository = this.requireRepository(params.owner, params.repo); - const clusterIds = new Map(); - const clusterRows = this.db - .prepare( - `select cm.thread_id, cm.cluster_id - from cluster_members cm - join clusters c on c.id = cm.cluster_id - where c.repo_id = ? and c.cluster_run_id = ( - select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1 - )`, - ) - .all(repository.id, repository.id) as Array<{ thread_id: number; cluster_id: number }>; - for (const row of clusterRows) clusterIds.set(row.thread_id, row.cluster_id); - - let sql = 'select * from threads where repo_id = ?'; - const args: Array = [repository.id]; - if (!params.includeClosed) { - sql += " and state = 'open' and closed_at_local is null"; - } - if (params.kind) { - sql += ' and kind = ?'; - args.push(params.kind); - } - if (params.numbers && params.numbers.length > 0) { - const uniqueNumbers = Array.from(new Set(params.numbers.filter((value) => Number.isSafeInteger(value) && value > 0))); - if (uniqueNumbers.length === 0) { - return threadsResponseSchema.parse({ - repository, - threads: [], - }); - } - sql += ` and number in (${uniqueNumbers.map(() => '?').join(', ')})`; - args.push(...uniqueNumbers); - } - sql += ' order by updated_at_gh desc, number desc'; - const rows = this.db.prepare(sql).all(...args) as ThreadRow[]; - const orderedRows = - params.numbers && params.numbers.length > 0 - ? (() => { - const byNumber = new Map(rows.map((row) => [row.number, row] as const)); - const uniqueRequested = Array.from(new Set(params.numbers)); - return uniqueRequested.map((number) => byNumber.get(number)).filter((row): row is ThreadRow => row !== undefined); - })() - : rows; - return threadsResponseSchema.parse({ + return listRepositoryThreads(this.db, { repository, - threads: orderedRows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)), + kind: params.kind, + numbers: params.numbers, + includeClosed: params.includeClosed, }); } diff --git a/packages/api-core/src/threads/list.ts b/packages/api-core/src/threads/list.ts new file mode 100644 index 0000000..d8ae873 --- /dev/null +++ b/packages/api-core/src/threads/list.ts @@ -0,0 +1,73 @@ +import { threadsResponseSchema, type RepositoryDto, type ThreadsResponse } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { ThreadRow } from '../service-types.js'; +import { threadToDto } from '../service-utils.js'; + +export function listRepositoryThreads( + db: SqliteDatabase, + params: { + repository: RepositoryDto; + kind?: 'issue' | 'pull_request'; + numbers?: number[]; + includeClosed?: boolean; + }, +): ThreadsResponse { + const clusterIds = loadLatestClusterIds(db, params.repository.id); + let sql = 'select * from threads where repo_id = ?'; + const args: Array = [params.repository.id]; + if (!params.includeClosed) { + sql += " and state = 'open' and closed_at_local is null"; + } + if (params.kind) { + sql += ' and kind = ?'; + args.push(params.kind); + } + if (params.numbers && params.numbers.length > 0) { + const uniqueNumbers = Array.from(new Set(params.numbers.filter((value) => Number.isSafeInteger(value) && value > 0))); + if (uniqueNumbers.length === 0) { + return threadsResponseSchema.parse({ + repository: params.repository, + threads: [], + }); + } + sql += ` and number in (${uniqueNumbers.map(() => '?').join(', ')})`; + args.push(...uniqueNumbers); + } + sql += ' order by updated_at_gh desc, number desc'; + + const rows = db.prepare(sql).all(...args) as ThreadRow[]; + const orderedRows = orderRowsByRequestedNumbers(rows, params.numbers); + return threadsResponseSchema.parse({ + repository: params.repository, + threads: orderedRows.map((row) => threadToDto(row, clusterIds.get(row.id) ?? null)), + }); +} + +function loadLatestClusterIds(db: SqliteDatabase, repoId: number): Map { + const clusterIds = new Map(); + const rows = db + .prepare( + `select cm.thread_id, cm.cluster_id + from cluster_members cm + join clusters c on c.id = cm.cluster_id + where c.repo_id = ? and c.cluster_run_id = ( + select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1 + )`, + ) + .all(repoId, repoId) as Array<{ thread_id: number; cluster_id: number }>; + for (const row of rows) { + clusterIds.set(row.thread_id, row.cluster_id); + } + return clusterIds; +} + +function orderRowsByRequestedNumbers(rows: ThreadRow[], numbers: number[] | undefined): ThreadRow[] { + if (!numbers || numbers.length === 0) { + return rows; + } + const byNumber = new Map(rows.map((row) => [row.number, row] as const)); + return Array.from(new Set(numbers)) + .map((number) => byNumber.get(number)) + .filter((row): row is ThreadRow => row !== undefined); +} From 18b3411750a22ab471678055431e92ea177e208e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:56:03 -0700 Subject: [PATCH 197/215] test: cover portable sync drift --- packages/api-core/src/service.test.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 413fd34..7eca580 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -349,6 +349,21 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.equal(status.drift.portableOnlyThreads, 0); assert.equal(status.drift.changedThreads, 0); + const driftDb = openDb(outputPath); + try { + driftDb.prepare("update threads set title = 'Gateway crash from stale portable copy' where number = 42").run(); + } finally { + driftDb.close(); + } + const driftStatus = service.portableSyncStatus({ + owner: 'openclaw', + repo: 'openclaw', + portablePath: outputPath, + }); + assert.equal(driftStatus.drift.liveOnlyThreads, 0); + assert.equal(driftStatus.drift.portableOnlyThreads, 0); + assert.equal(driftStatus.drift.changedThreads, 1); + const portable = openDb(outputPath); try { const thread = portable.prepare('select body_excerpt, body_length from threads where number = 42').get() as { From 8295564bcfa01a75293fe1c2462d5c142a877706 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:57:42 -0700 Subject: [PATCH 198/215] refactor: extract repository listing query --- packages/api-core/src/repositories/list.ts | 9 +++++++++ packages/api-core/src/service.ts | 5 ++--- 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 packages/api-core/src/repositories/list.ts diff --git a/packages/api-core/src/repositories/list.ts b/packages/api-core/src/repositories/list.ts new file mode 100644 index 0000000..5792873 --- /dev/null +++ b/packages/api-core/src/repositories/list.ts @@ -0,0 +1,9 @@ +import { repositoriesResponseSchema, type RepositoriesResponse } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import { repositoryToDto } from '../service-utils.js'; + +export function listStoredRepositories(db: SqliteDatabase): RepositoriesResponse { + const rows = db.prepare('select * from repositories order by full_name asc').all() as Array>; + return repositoriesResponseSchema.parse({ repositories: rows.map(repositoryToDto) }); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 5a0e36c..1428245 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -22,7 +22,6 @@ import { neighborsResponseSchema, optimizeResponseSchema, refreshResponseSchema, - repositoriesResponseSchema, searchResponseSchema, syncResultSchema, type ActionRequest, @@ -135,6 +134,7 @@ import { type PortableSyncValidationResponse, } from './portable/sync-store.js'; import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; +import { listStoredRepositories } from './repositories/list.js'; import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; import { fetchThreadComments } from './sync/comments.js'; @@ -285,8 +285,7 @@ export class GHCrawlService { } listRepositories(): RepositoriesResponse { - const rows = this.db.prepare('select * from repositories order by full_name asc').all() as Array>; - return repositoriesResponseSchema.parse({ repositories: rows.map(repositoryToDto) }); + return listStoredRepositories(this.db); } listRunHistory(params: { owner: string; repo: string; kind?: RunKind; limit?: number }): RunHistoryResponse { From a645a3b0192bb28530b19ce55f260b81b723b364 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 18:59:41 -0700 Subject: [PATCH 199/215] test: cover portable sync row drift --- packages/api-core/src/service.test.ts | 92 ++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 7eca580..6d2a4ab 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -364,6 +364,88 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl assert.equal(driftStatus.drift.portableOnlyThreads, 0); assert.equal(driftStatus.drift.changedThreads, 1); + service.db + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body, author_login, author_type, html_url, + labels_json, assignees_json, raw_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 11, + 1, + '101', + 43, + 'issue', + 'open', + 'Live-only gateway crash', + 'new live row', + 'alice', + 'User', + 'https://github.com/openclaw/openclaw/issues/43', + '[]', + '[]', + '{}', + 'live-only-hash', + 0, + now, + now, + null, + null, + now, + now, + now, + ); + const portableDriftDb = openDb(outputPath); + try { + portableDriftDb + .prepare( + `insert into threads ( + id, repo_id, github_id, number, kind, state, title, body_excerpt, body_length, author_login, author_type, + html_url, labels_json, assignees_json, content_hash, is_draft, created_at_gh, updated_at_gh, + closed_at_gh, merged_at_gh, first_pulled_at, last_pulled_at, updated_at, closed_at_local, close_reason_local + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ) + .run( + 12, + 1, + '102', + 44, + 'issue', + 'closed', + 'Portable-only stale crash', + 'old portable row', + 16, + 'bob', + 'User', + 'https://github.com/openclaw/openclaw/issues/44', + '[]', + '[]', + 'portable-only-hash', + 0, + now, + now, + now, + null, + now, + now, + now, + null, + null, + ); + } finally { + portableDriftDb.close(); + } + const divergentStatus = service.portableSyncStatus({ + owner: 'openclaw', + repo: 'openclaw', + portablePath: outputPath, + }); + assert.equal(divergentStatus.drift.liveOnlyThreads, 1); + assert.equal(divergentStatus.drift.portableOnlyThreads, 1); + assert.equal(divergentStatus.drift.changedThreads, 1); + const portable = openDb(outputPath); try { const thread = portable.prepare('select body_excerpt, body_length from threads where number = 42').get() as { @@ -415,7 +497,7 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl const importResult = importService.importPortableSync(outputPath); assert.equal(importResult.ok, true); assert.equal(importResult.repository.fullName, 'openclaw/openclaw'); - assert.equal(importResult.imported.threads, 1); + assert.equal(importResult.imported.threads, 2); assert.equal(importResult.imported.clusterGroups, 1); assert.equal(importResult.imported.clusterMemberships, 1); const importedThread = importService.db.prepare('select body, raw_json from threads where number = 42').get() as { @@ -424,6 +506,14 @@ test('exportPortableSync writes a compact sync database without bulky cache tabl }; assert.equal(importedThread.body.length, 64); assert.equal(importedThread.raw_json, '{}'); + const importedPortableOnlyThread = importService.db.prepare('select state, body, raw_json from threads where number = 44').get() as { + state: string; + body: string; + raw_json: string; + }; + assert.equal(importedPortableOnlyThread.state, 'closed'); + assert.equal(importedPortableOnlyThread.body, 'old portable row'); + assert.equal(importedPortableOnlyThread.raw_json, '{}'); } finally { importService.close(); } From ae4a10b47b3e9c52f9d080d0be5791a63763a27a Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:00:25 -0700 Subject: [PATCH 200/215] docs: mark portable drift tests complete --- docs/PLAN.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/PLAN.md b/docs/PLAN.md index b26d3e7..47f6422 100644 --- a/docs/PLAN.md +++ b/docs/PLAN.md @@ -166,7 +166,7 @@ Decision note: - [x] Tune cluster quality on real output and validate in the TUI. - [x] Capture operator docs for refresh, manual pipeline control, closed clusters, durable overrides, portable git-sync export, and optimize. - [ ] Finish service decomposition so `service.ts`, `apps/cli/src/main.ts`, and `apps/cli/src/tui/app.ts` stay small enough to maintain. -- [ ] Add focused tests around portable import conflict handling and sync drift reporting. +- [x] Add focused tests around portable import conflict handling and sync drift reporting. - [ ] Add a release-readiness pass for packaged `vectorlite` installs across supported Node versions. ## Recommended Execution Order From afa8e5585f3ce34cfe2e4e9eecf5c708731d5183 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:01:51 -0700 Subject: [PATCH 201/215] refactor: extract local thread closure --- packages/api-core/src/service.ts | 33 +++-------------------- packages/api-core/src/threads/close.ts | 37 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 30 deletions(-) create mode 100644 packages/api-core/src/threads/close.ts diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 1428245..363c5ce 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -78,7 +78,7 @@ import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { listStoredClusterNeighbors } from './cluster/neighbor-queries.js'; import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; -import { getLatestClusterRun, getLatestRunClusterIdsForThread } from './cluster/run-queries.js'; +import { getLatestClusterRun } from './cluster/run-queries.js'; import { createPipelineRun, finishPipelineRun, @@ -143,6 +143,7 @@ import { persistThreadCodeSnapshot, upsertRepository, upsertThread } from './syn import { applyClosedOverlapSweep, countStaleOpenThreads, reconcileMissingOpenThreads } from './sync/reconcile.js'; import { buildKeySummaryInputText, buildSummarySource } from './summary/source.js'; import { compareTuiClusterSummary } from './tui/cluster-format.js'; +import { closeRepositoryThreadLocally } from './threads/close.js'; import { listRepositoryThreads } from './threads/list.js'; import { getDurableTuiClusterSummary, @@ -310,35 +311,7 @@ export class GHCrawlService { closeThreadLocally(params: { owner: string; repo: string; threadNumber: number }): CloseResponse { const repository = this.requireRepository(params.owner, params.repo); - const row = this.db - .prepare('select * from threads where repo_id = ? and number = ? limit 1') - .get(repository.id, params.threadNumber) as ThreadRow | undefined; - if (!row) { - throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); - } - - const closedAt = nowIso(); - this.db - .prepare( - `update threads - set closed_at_local = ?, - close_reason_local = 'manual', - updated_at = ? - where id = ?`, - ) - .run(closedAt, closedAt, row.id); - const clusterIds = getLatestRunClusterIdsForThread(this.db, repository.id, row.id); - const clusterClosed = reconcileClusterCloseState(this.db, repository.id, clusterIds) > 0; - const updated = this.db.prepare('select * from threads where id = ? limit 1').get(row.id) as ThreadRow; - - return closeResponseSchema.parse({ - ok: true, - repository, - thread: threadToDto(updated), - clusterId: clusterIds[0] ?? null, - clusterClosed, - message: `Marked ${updated.kind} #${updated.number} closed locally.`, - }); + return closeRepositoryThreadLocally(this.db, repository, params.threadNumber); } closeClusterLocally(params: { owner: string; repo: string; clusterId: number }): CloseResponse { diff --git a/packages/api-core/src/threads/close.ts b/packages/api-core/src/threads/close.ts new file mode 100644 index 0000000..ff21c53 --- /dev/null +++ b/packages/api-core/src/threads/close.ts @@ -0,0 +1,37 @@ +import { closeResponseSchema, type CloseResponse, type RepositoryDto } from '@ghcrawl/api-contract'; + +import { reconcileClusterCloseState } from '../cluster/close-state.js'; +import { getLatestRunClusterIdsForThread } from '../cluster/run-queries.js'; +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { ThreadRow } from '../service-types.js'; +import { nowIso, threadToDto } from '../service-utils.js'; + +export function closeRepositoryThreadLocally(db: SqliteDatabase, repository: RepositoryDto, threadNumber: number): CloseResponse { + const row = db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(repository.id, threadNumber) as ThreadRow | undefined; + if (!row) { + throw new Error(`Thread #${threadNumber} was not found for ${repository.fullName}.`); + } + + const closedAt = nowIso(); + db.prepare( + `update threads + set closed_at_local = ?, + close_reason_local = 'manual', + updated_at = ? + where id = ?`, + ).run(closedAt, closedAt, row.id); + const clusterIds = getLatestRunClusterIdsForThread(db, repository.id, row.id); + const clusterClosed = reconcileClusterCloseState(db, repository.id, clusterIds) > 0; + const updated = db.prepare('select * from threads where id = ? limit 1').get(row.id) as ThreadRow; + + return closeResponseSchema.parse({ + ok: true, + repository, + thread: threadToDto(updated), + clusterId: clusterIds[0] ?? null, + clusterClosed, + message: `Marked ${updated.kind} #${updated.number} closed locally.`, + }); +} From a629ce017db93be9e9f1d13e94c905e94e62d2c0 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:04:38 -0700 Subject: [PATCH 202/215] refactor: extract durable cluster queries --- .../api-core/src/cluster/durable-queries.ts | 273 ++++++++++++++++++ packages/api-core/src/service.ts | 259 +---------------- 2 files changed, 276 insertions(+), 256 deletions(-) create mode 100644 packages/api-core/src/cluster/durable-queries.ts diff --git a/packages/api-core/src/cluster/durable-queries.ts b/packages/api-core/src/cluster/durable-queries.ts new file mode 100644 index 0000000..976c904 --- /dev/null +++ b/packages/api-core/src/cluster/durable-queries.ts @@ -0,0 +1,273 @@ +import { + clusterExplainResponseSchema, + durableClustersResponseSchema, + type ClusterExplainResponse, + type DurableClustersResponse, + type RepositoryDto, +} from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import type { ThreadRow } from '../service-types.js'; +import { parseObjectJson, threadToDto } from '../service-utils.js'; + +type DurableClusterStatus = 'active' | 'closed' | 'merged' | 'split'; +type DurableMemberRole = 'canonical' | 'duplicate' | 'related'; +type DurableMemberState = 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; + +type DurableClusterRow = { + id: number; + stable_key: string; + stable_slug: string; + status: DurableClusterStatus; + cluster_type: string | null; + representative_thread_id: number | null; + title: string | null; +}; + +type DurableMemberRow = ThreadRow & { + membership_role: DurableMemberRole; + membership_state: DurableMemberState; + membership_score: number | null; +}; + +export function listStoredDurableClusters( + db: SqliteDatabase, + repository: RepositoryDto, + params: { includeInactive?: boolean; memberLimit?: number } = {}, +): DurableClustersResponse { + const clusterRows = db + .prepare( + `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title + from cluster_groups + where repo_id = ? + and (? = 1 or status = 'active') + order by updated_at desc, id asc`, + ) + .all(repository.id, params.includeInactive ? 1 : 0) as DurableClusterRow[]; + if (clusterRows.length === 0) { + return durableClustersResponseSchema.parse({ repository, clusters: [] }); + } + + const clusterIds = clusterRows.map((row) => row.id); + const placeholders = clusterIds.map(() => '?').join(','); + const memberRows = db + .prepare( + `select + cm.cluster_id, + cm.role as membership_role, + cm.state as membership_state, + cm.score_to_representative as membership_score, + t.* + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id in (${placeholders}) + order by + case cm.role when 'canonical' then 0 else 1 end, + case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, + t.number asc`, + ) + .all(...clusterIds) as Array; + const membersByCluster = new Map>(); + for (const row of memberRows) { + const members = membersByCluster.get(row.cluster_id) ?? []; + members.push(row); + membersByCluster.set(row.cluster_id, members); + } + + return durableClustersResponseSchema.parse({ + repository, + clusters: clusterRows.map((cluster) => { + const rows = membersByCluster.get(cluster.id) ?? []; + const visibleRows = params.memberLimit === undefined ? rows : rows.slice(0, params.memberLimit); + return { + clusterId: cluster.id, + stableKey: cluster.stable_key, + stableSlug: cluster.stable_slug, + status: cluster.status, + clusterType: cluster.cluster_type, + title: cluster.title, + representativeThreadId: cluster.representative_thread_id, + activeCount: rows.filter((row) => row.membership_state === 'active').length, + removedCount: rows.filter((row) => row.membership_state === 'removed_by_user').length, + blockedCount: rows.filter((row) => row.membership_state === 'blocked_by_override').length, + members: visibleRows.map((row) => ({ + thread: threadToDto(row), + role: row.membership_role, + state: row.membership_state, + scoreToRepresentative: row.membership_score, + })), + }; + }), + }); +} + +export function explainStoredDurableCluster( + db: SqliteDatabase, + repository: RepositoryDto, + params: { clusterId: number; memberLimit?: number; eventLimit?: number }, +): ClusterExplainResponse { + const cluster = db + .prepare( + `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title + from cluster_groups + where repo_id = ? + and id = ? + limit 1`, + ) + .get(repository.id, params.clusterId) as DurableClusterRow | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); + } + + const allMembers = db + .prepare( + `select + cm.role as membership_role, + cm.state as membership_state, + cm.score_to_representative as membership_score, + t.* + from cluster_memberships cm + join threads t on t.id = cm.thread_id + where cm.cluster_id = ? + order by + case cm.role when 'canonical' then 0 else 1 end, + case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, + t.number asc`, + ) + .all(cluster.id) as DurableMemberRow[]; + const visibleMembers = allMembers.slice(0, params.memberLimit ?? 50); + const visibleThreadIds = visibleMembers.map((row) => row.id); + + const aliases = db + .prepare( + `select alias_slug, reason, created_at + from cluster_aliases + where cluster_id = ? + order by created_at desc, alias_slug asc`, + ) + .all(cluster.id) as Array<{ alias_slug: string; reason: string; created_at: string }>; + const overrides = db + .prepare( + `select t.number, co.action, co.reason, co.created_at, co.expires_at + from cluster_overrides co + join threads t on t.id = co.thread_id + where co.cluster_id = ? + order by co.created_at desc, t.number asc`, + ) + .all(cluster.id) as Array<{ + number: number; + action: 'exclude' | 'force_include' | 'force_canonical'; + reason: string | null; + created_at: string; + expires_at: string | null; + }>; + const events = db + .prepare( + `select event_type, actor_kind, payload_json, created_at + from cluster_events + where cluster_id = ? + order by created_at desc, id desc + limit ?`, + ) + .all(cluster.id, params.eventLimit ?? 25) as Array<{ event_type: string; actor_kind: string; payload_json: string; created_at: string }>; + + let evidence: Array<{ + leftThreadNumber: number; + rightThreadNumber: number; + score: number; + tier: 'strong' | 'weak'; + state: 'active' | 'stale' | 'rejected'; + sources: string[]; + breakdown: Record; + lastSeenRunId: number | null; + updatedAt: string; + }> = []; + if (visibleThreadIds.length >= 2) { + const placeholders = visibleThreadIds.map(() => '?').join(','); + const rows = db + .prepare( + `select + le.number as left_number, + re.number as right_number, + e.score, + e.tier, + e.state, + e.breakdown_json, + e.last_seen_run_id, + e.updated_at + from similarity_edge_evidence e + join threads le on le.id = e.left_thread_id + join threads re on re.id = e.right_thread_id + where e.repo_id = ? + and e.left_thread_id in (${placeholders}) + and e.right_thread_id in (${placeholders}) + order by e.score desc, le.number asc, re.number asc`, + ) + .all(repository.id, ...visibleThreadIds, ...visibleThreadIds) as Array<{ + left_number: number; + right_number: number; + score: number; + tier: 'strong' | 'weak'; + state: 'active' | 'stale' | 'rejected'; + breakdown_json: string; + last_seen_run_id: number | null; + updated_at: string; + }>; + evidence = rows.map((row) => { + const breakdown = parseObjectJson(row.breakdown_json) ?? {}; + const rawSources = breakdown.sources; + return { + leftThreadNumber: row.left_number, + rightThreadNumber: row.right_number, + score: row.score, + tier: row.tier, + state: row.state, + sources: Array.isArray(rawSources) ? rawSources.filter((source): source is string => typeof source === 'string') : [], + breakdown, + lastSeenRunId: row.last_seen_run_id, + updatedAt: row.updated_at, + }; + }); + } + + return clusterExplainResponseSchema.parse({ + repository, + cluster: { + clusterId: cluster.id, + stableKey: cluster.stable_key, + stableSlug: cluster.stable_slug, + status: cluster.status, + clusterType: cluster.cluster_type, + title: cluster.title, + representativeThreadId: cluster.representative_thread_id, + activeCount: allMembers.filter((row) => row.membership_state === 'active').length, + removedCount: allMembers.filter((row) => row.membership_state === 'removed_by_user').length, + blockedCount: allMembers.filter((row) => row.membership_state === 'blocked_by_override').length, + members: visibleMembers.map((row) => ({ + thread: threadToDto(row), + role: row.membership_role, + state: row.membership_state, + scoreToRepresentative: row.membership_score, + })), + }, + aliases: aliases.map((alias) => ({ + aliasSlug: alias.alias_slug, + reason: alias.reason, + createdAt: alias.created_at, + })), + overrides: overrides.map((override) => ({ + threadNumber: override.number, + action: override.action, + reason: override.reason, + createdAt: override.created_at, + expiresAt: override.expires_at, + })), + events: events.map((event) => ({ + eventType: event.event_type, + actorKind: event.actor_kind, + payload: parseObjectJson(event.payload_json), + createdAt: event.created_at, + })), + evidence, + }); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 363c5ce..fac241f 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -12,11 +12,9 @@ import { clusterMergeResponseSchema, clusterSplitResponseSchema, clusterDetailResponseSchema, - clusterExplainResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, clustersResponseSchema, - durableClustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, @@ -62,6 +60,7 @@ import { buildClusters, buildRefinedClusters, buildSizeBoundedClusters } from '. import { reconcileClusterCloseState } from './cluster/close-state.js'; import { buildDeterministicClusterGraphFromFingerprints } from './cluster/deterministic-engine.js'; import { loadDeterministicClusterableThreadMeta } from './cluster/deterministic-thread-loader.js'; +import { explainStoredDurableCluster, listStoredDurableClusters } from './cluster/durable-queries.js'; import { collectSourceKindScores, edgeKey, @@ -212,7 +211,6 @@ import { nowIso, parseArray, parseIso, - parseObjectJson, repositoryToDto, snippetText, stableContentHash, @@ -2451,263 +2449,12 @@ export class GHCrawlService { listDurableClusters(params: { owner: string; repo: string; includeInactive?: boolean; memberLimit?: number }): DurableClustersResponse { const repository = this.requireRepository(params.owner, params.repo); - const clusterRows = this.db - .prepare( - `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title - from cluster_groups - where repo_id = ? - and (? = 1 or status = 'active') - order by updated_at desc, id asc`, - ) - .all(repository.id, params.includeInactive ? 1 : 0) as Array<{ - id: number; - stable_key: string; - stable_slug: string; - status: 'active' | 'closed' | 'merged' | 'split'; - cluster_type: string | null; - representative_thread_id: number | null; - title: string | null; - }>; - if (clusterRows.length === 0) { - return durableClustersResponseSchema.parse({ repository, clusters: [] }); - } - - const clusterIds = clusterRows.map((row) => row.id); - const placeholders = clusterIds.map(() => '?').join(','); - const memberRows = this.db - .prepare( - `select - cm.cluster_id, - cm.role as membership_role, - cm.state as membership_state, - cm.score_to_representative as membership_score, - t.* - from cluster_memberships cm - join threads t on t.id = cm.thread_id - where cm.cluster_id in (${placeholders}) - order by - case cm.role when 'canonical' then 0 else 1 end, - case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, - t.number asc`, - ) - .all(...clusterIds) as Array< - ThreadRow & { - cluster_id: number; - membership_role: 'canonical' | 'duplicate' | 'related'; - membership_state: 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; - membership_score: number | null; - } - >; - const membersByCluster = new Map(); - for (const row of memberRows) { - const members = membersByCluster.get(row.cluster_id) ?? []; - members.push(row); - membersByCluster.set(row.cluster_id, members); - } - - return durableClustersResponseSchema.parse({ - repository, - clusters: clusterRows.map((cluster) => { - const rows = membersByCluster.get(cluster.id) ?? []; - const visibleRows = params.memberLimit === undefined ? rows : rows.slice(0, params.memberLimit); - return { - clusterId: cluster.id, - stableKey: cluster.stable_key, - stableSlug: cluster.stable_slug, - status: cluster.status, - clusterType: cluster.cluster_type, - title: cluster.title, - representativeThreadId: cluster.representative_thread_id, - activeCount: rows.filter((row) => row.membership_state === 'active').length, - removedCount: rows.filter((row) => row.membership_state === 'removed_by_user').length, - blockedCount: rows.filter((row) => row.membership_state === 'blocked_by_override').length, - members: visibleRows.map((row) => ({ - thread: threadToDto(row), - role: row.membership_role, - state: row.membership_state, - scoreToRepresentative: row.membership_score, - })), - }; - }), - }); + return listStoredDurableClusters(this.db, repository, params); } explainDurableCluster(params: { owner: string; repo: string; clusterId: number; memberLimit?: number; eventLimit?: number }): ClusterExplainResponse { const repository = this.requireRepository(params.owner, params.repo); - const cluster = this.db - .prepare( - `select id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title - from cluster_groups - where repo_id = ? - and id = ? - limit 1`, - ) - .get(repository.id, params.clusterId) as - | { - id: number; - stable_key: string; - stable_slug: string; - status: 'active' | 'closed' | 'merged' | 'split'; - cluster_type: string | null; - representative_thread_id: number | null; - title: string | null; - } - | undefined; - if (!cluster) { - throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); - } - - const allMembers = this.db - .prepare( - `select - cm.role as membership_role, - cm.state as membership_state, - cm.score_to_representative as membership_score, - t.* - from cluster_memberships cm - join threads t on t.id = cm.thread_id - where cm.cluster_id = ? - order by - case cm.role when 'canonical' then 0 else 1 end, - case cm.state when 'active' then 0 when 'pending_review' then 1 else 2 end, - t.number asc`, - ) - .all(cluster.id) as Array< - ThreadRow & { - membership_role: 'canonical' | 'duplicate' | 'related'; - membership_state: 'active' | 'removed_by_user' | 'blocked_by_override' | 'pending_review' | 'stale'; - membership_score: number | null; - } - >; - const visibleMembers = allMembers.slice(0, params.memberLimit ?? 50); - const visibleThreadIds = visibleMembers.map((row) => row.id); - - const aliases = this.db - .prepare( - `select alias_slug, reason, created_at - from cluster_aliases - where cluster_id = ? - order by created_at desc, alias_slug asc`, - ) - .all(cluster.id) as Array<{ alias_slug: string; reason: string; created_at: string }>; - const overrides = this.db - .prepare( - `select t.number, co.action, co.reason, co.created_at, co.expires_at - from cluster_overrides co - join threads t on t.id = co.thread_id - where co.cluster_id = ? - order by co.created_at desc, t.number asc`, - ) - .all(cluster.id) as Array<{ number: number; action: 'exclude' | 'force_include' | 'force_canonical'; reason: string | null; created_at: string; expires_at: string | null }>; - const events = this.db - .prepare( - `select event_type, actor_kind, payload_json, created_at - from cluster_events - where cluster_id = ? - order by created_at desc, id desc - limit ?`, - ) - .all(cluster.id, params.eventLimit ?? 25) as Array<{ event_type: string; actor_kind: string; payload_json: string; created_at: string }>; - - let evidence: Array<{ - leftThreadNumber: number; - rightThreadNumber: number; - score: number; - tier: 'strong' | 'weak'; - state: 'active' | 'stale' | 'rejected'; - sources: string[]; - breakdown: Record; - lastSeenRunId: number | null; - updatedAt: string; - }> = []; - if (visibleThreadIds.length >= 2) { - const placeholders = visibleThreadIds.map(() => '?').join(','); - const rows = this.db - .prepare( - `select - le.number as left_number, - re.number as right_number, - e.score, - e.tier, - e.state, - e.breakdown_json, - e.last_seen_run_id, - e.updated_at - from similarity_edge_evidence e - join threads le on le.id = e.left_thread_id - join threads re on re.id = e.right_thread_id - where e.repo_id = ? - and e.left_thread_id in (${placeholders}) - and e.right_thread_id in (${placeholders}) - order by e.score desc, le.number asc, re.number asc`, - ) - .all(repository.id, ...visibleThreadIds, ...visibleThreadIds) as Array<{ - left_number: number; - right_number: number; - score: number; - tier: 'strong' | 'weak'; - state: 'active' | 'stale' | 'rejected'; - breakdown_json: string; - last_seen_run_id: number | null; - updated_at: string; - }>; - evidence = rows.map((row) => { - const breakdown = parseObjectJson(row.breakdown_json) ?? {}; - const rawSources = breakdown.sources; - return { - leftThreadNumber: row.left_number, - rightThreadNumber: row.right_number, - score: row.score, - tier: row.tier, - state: row.state, - sources: Array.isArray(rawSources) ? rawSources.filter((source): source is string => typeof source === 'string') : [], - breakdown, - lastSeenRunId: row.last_seen_run_id, - updatedAt: row.updated_at, - }; - }); - } - - return clusterExplainResponseSchema.parse({ - repository, - cluster: { - clusterId: cluster.id, - stableKey: cluster.stable_key, - stableSlug: cluster.stable_slug, - status: cluster.status, - clusterType: cluster.cluster_type, - title: cluster.title, - representativeThreadId: cluster.representative_thread_id, - activeCount: allMembers.filter((row) => row.membership_state === 'active').length, - removedCount: allMembers.filter((row) => row.membership_state === 'removed_by_user').length, - blockedCount: allMembers.filter((row) => row.membership_state === 'blocked_by_override').length, - members: visibleMembers.map((row) => ({ - thread: threadToDto(row), - role: row.membership_role, - state: row.membership_state, - scoreToRepresentative: row.membership_score, - })), - }, - aliases: aliases.map((alias) => ({ - aliasSlug: alias.alias_slug, - reason: alias.reason, - createdAt: alias.created_at, - })), - overrides: overrides.map((override) => ({ - threadNumber: override.number, - action: override.action, - reason: override.reason, - createdAt: override.created_at, - expiresAt: override.expires_at, - })), - events: events.map((event) => ({ - eventType: event.event_type, - actorKind: event.actor_kind, - payload: parseObjectJson(event.payload_json), - createdAt: event.created_at, - })), - evidence, - }); + return explainStoredDurableCluster(this.db, repository, params); } async refreshRepository(params: { From 8153ddb75fe40de2f1909e974da52b2537eae1e9 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:06:48 -0700 Subject: [PATCH 203/215] refactor: extract cluster list query --- packages/api-core/src/cluster/list-query.ts | 80 +++++++++++++++++++++ packages/api-core/src/service.ts | 74 +------------------ 2 files changed, 82 insertions(+), 72 deletions(-) create mode 100644 packages/api-core/src/cluster/list-query.ts diff --git a/packages/api-core/src/cluster/list-query.ts b/packages/api-core/src/cluster/list-query.ts new file mode 100644 index 0000000..e7c31ff --- /dev/null +++ b/packages/api-core/src/cluster/list-query.ts @@ -0,0 +1,80 @@ +import { clustersResponseSchema, type ClusterDto, type ClustersResponse, type RepositoryDto } from '@ghcrawl/api-contract'; + +import type { SqliteDatabase } from '../db/sqlite.js'; +import { isEffectivelyClosed } from '../service-utils.js'; + +export function listStoredClusters( + db: SqliteDatabase, + repository: RepositoryDto, + params: { includeClosed?: boolean } = {}, +): ClustersResponse { + const latestRun = db + .prepare("select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") + .get(repository.id) as { id: number } | undefined; + + if (!latestRun) { + return clustersResponseSchema.parse({ repository, clusters: [] }); + } + + const rows = db + .prepare( + `select c.id, c.repo_id, c.representative_thread_id, c.member_count, + c.closed_at_local, c.close_reason_local, + cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title, t.state, t.closed_at_local as thread_closed_at_local + from clusters c + left join cluster_members cm on cm.cluster_id = c.id + left join threads t on t.id = cm.thread_id + where c.cluster_run_id = ? + order by c.member_count desc, c.id asc, t.number asc`, + ) + .all(latestRun.id) as Array<{ + id: number; + repo_id: number; + representative_thread_id: number | null; + member_count: number; + closed_at_local: string | null; + close_reason_local: string | null; + thread_id: number | null; + score_to_representative: number | null; + number: number | null; + kind: 'issue' | 'pull_request' | null; + title: string | null; + state: string | null; + thread_closed_at_local: string | null; + }>; + + const clusters = new Map(); + for (const row of rows) { + const cluster = clusters.get(row.id) ?? { + id: row.id, + repoId: row.repo_id, + isClosed: row.close_reason_local !== null, + closedAtLocal: row.closed_at_local, + closeReasonLocal: row.close_reason_local, + representativeThreadId: row.representative_thread_id, + memberCount: row.member_count, + members: [], + }; + if (row.thread_id !== null && row.number !== null && row.kind !== null && row.title !== null) { + cluster.members.push({ + threadId: row.thread_id, + number: row.number, + kind: row.kind, + isClosed: row.state !== null && isEffectivelyClosed({ state: row.state, closed_at_local: row.thread_closed_at_local }), + title: row.title, + scoreToRepresentative: row.score_to_representative, + }); + } + clusters.set(row.id, cluster); + } + + const clusterValues = Array.from(clusters.values()).map((cluster) => ({ + ...cluster, + isClosed: cluster.isClosed || (cluster.memberCount > 0 && cluster.members.every((member) => member.isClosed)), + })); + + return clustersResponseSchema.parse({ + repository, + clusters: clusterValues.filter((cluster) => (params.includeClosed ?? true ? true : !cluster.isClosed)), + }); +} diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index fac241f..fe4f7f1 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -14,7 +14,6 @@ import { clusterDetailResponseSchema, clusterResultSchema, clusterSummariesResponseSchema, - clustersResponseSchema, embedResultSchema, healthResponseSchema, neighborsResponseSchema, @@ -30,7 +29,6 @@ import { type ClusterSplitResponse, type ClusterDetailResponse, type ClusterExplainResponse, - type ClusterDto, type ClusterResultDto, type ClusterSummariesResponse, type ClustersResponse, @@ -74,6 +72,7 @@ import { buildSourceKindEdges } from './cluster/exact-edges.js'; import { loadLatestDeterministicFingerprints } from './cluster/fingerprint-loader.js'; import { materializeLatestDeterministicFingerprints } from './cluster/fingerprint-materializer.js'; import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; +import { listStoredClusters } from './cluster/list-query.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; import { listStoredClusterNeighbors } from './cluster/neighbor-queries.js'; import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; @@ -206,7 +205,6 @@ import { asJson, deriveIncrementalSince, isClosedGitHubPayload, - isEffectivelyClosed, isPullRequestPayload, nowIso, parseArray, @@ -2376,75 +2374,7 @@ export class GHCrawlService { listClusters(params: { owner: string; repo: string; includeClosed?: boolean }): ClustersResponse { const repository = this.requireRepository(params.owner, params.repo); - const latestRun = this.db - .prepare("select id from cluster_runs where repo_id = ? and status = 'completed' order by id desc limit 1") - .get(repository.id) as { id: number } | undefined; - - if (!latestRun) { - return clustersResponseSchema.parse({ repository, clusters: [] }); - } - - const rows = this.db - .prepare( - `select c.id, c.repo_id, c.representative_thread_id, c.member_count, - c.closed_at_local, c.close_reason_local, - cm.thread_id, cm.score_to_representative, t.number, t.kind, t.title, t.state, t.closed_at_local as thread_closed_at_local - from clusters c - left join cluster_members cm on cm.cluster_id = c.id - left join threads t on t.id = cm.thread_id - where c.cluster_run_id = ? - order by c.member_count desc, c.id asc, t.number asc`, - ) - .all(latestRun.id) as Array<{ - id: number; - repo_id: number; - representative_thread_id: number | null; - member_count: number; - closed_at_local: string | null; - close_reason_local: string | null; - thread_id: number | null; - score_to_representative: number | null; - number: number | null; - kind: 'issue' | 'pull_request' | null; - title: string | null; - state: string | null; - thread_closed_at_local: string | null; - }>; - - const clusters = new Map(); - for (const row of rows) { - const cluster = clusters.get(row.id) ?? { - id: row.id, - repoId: row.repo_id, - isClosed: row.close_reason_local !== null, - closedAtLocal: row.closed_at_local, - closeReasonLocal: row.close_reason_local, - representativeThreadId: row.representative_thread_id, - memberCount: row.member_count, - members: [], - }; - if (row.thread_id !== null && row.number !== null && row.kind !== null && row.title !== null) { - cluster.members.push({ - threadId: row.thread_id, - number: row.number, - kind: row.kind, - isClosed: row.state !== null && isEffectivelyClosed({ state: row.state, closed_at_local: row.thread_closed_at_local }), - title: row.title, - scoreToRepresentative: row.score_to_representative, - }); - } - clusters.set(row.id, cluster); - } - - const clusterValues = Array.from(clusters.values()).map((cluster) => ({ - ...cluster, - isClosed: cluster.isClosed || (cluster.memberCount > 0 && cluster.members.every((member) => member.isClosed)), - })); - - return clustersResponseSchema.parse({ - repository, - clusters: clusterValues.filter((cluster) => (params.includeClosed ?? true ? true : !cluster.isClosed)), - }); + return listStoredClusters(this.db, repository, params); } listDurableClusters(params: { owner: string; repo: string; includeInactive?: boolean; memberLimit?: number }): DurableClustersResponse { From 470f1a8a1e65da58b54c58dc1a76c264d18823f7 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:09:31 -0700 Subject: [PATCH 204/215] refactor: extract tui repository picker --- apps/cli/src/tui/app.ts | 165 +------------------------- apps/cli/src/tui/repository-picker.ts | 165 ++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 159 deletions(-) create mode 100644 apps/cli/src/tui/repository-picker.ts diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 897779e..4c4f780 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -50,6 +50,11 @@ import { } from './cluster-render.js'; import { promptHelp } from './help.js'; import { copyTextToClipboard, openUrl } from './platform.js'; +import { + promptRepositoryChoice, + promptRepositoryInput, + type RepositoryTarget, +} from './repository-picker.js'; import { applyRect, createWidgets, @@ -61,6 +66,7 @@ import { export { resolveBlessedTerminal } from './widgets.js'; export { buildHelpContent } from './help.js'; +export { getRepositoryChoices, parseOwnerRepoValue } from './repository-picker.js'; type StartTuiParams = { service: GHCrawlService; @@ -68,22 +74,6 @@ type StartTuiParams = { repo?: string; }; -type RepositoryTarget = { - owner: string; - repo: string; -}; - -type RepositoryChoice = - | { - kind: 'existing'; - target: RepositoryTarget; - label: string; - } - | { - kind: 'new'; - label: string; - }; - type ThreadDetailCacheEntry = { detail: TuiThreadDetail; hasNeighbors: boolean; @@ -1606,149 +1596,6 @@ function formatTuiRefreshStateKey(state: TuiRefreshState): string { ].join('|'); } -export function getRepositoryChoices(service: Pick, now: Date = new Date()): RepositoryChoice[] { - const repositories = service.listRepositories().repositories - .slice() - .sort((left, right) => Date.parse(right.updatedAt) - Date.parse(left.updatedAt) || left.fullName.localeCompare(right.fullName)); - - return [ - ...repositories.map((repository) => ({ - kind: 'existing' as const, - target: { owner: repository.owner, repo: repository.name }, - label: `${repository.fullName} ${formatRelativeTime(repository.updatedAt, now)}`, - })), - { kind: 'new' as const, label: '+ Select another repository path' }, - ]; -} - -async function promptRepositoryChoice( - screen: blessed.Widgets.Screen, - service: GHCrawlService, -): Promise { - const choices = getRepositoryChoices(service); - const box = blessed.list({ - parent: screen, - border: 'line', - label: ' Repositories ', - keys: true, - vi: true, - mouse: true, - top: 'center', - left: 'center', - width: '70%', - height: '70%', - style: { - border: { fg: '#5bc0eb' }, - item: { fg: 'white' }, - selected: { bg: '#5bc0eb', fg: 'black', bold: true }, - }, - items: choices.map((choice) => choice.label), - }); - const help = blessed.box({ - parent: screen, - bottom: 0, - left: 0, - width: '100%', - height: 1, - content: 'Select a repository with Enter. Press n for a new repo. Esc cancels.', - style: { fg: 'black', bg: '#5bc0eb' }, - }); - - box.focus(); - box.select(0); - screen.render(); - - return await new Promise((resolve) => { - let closed = false; - const teardown = (): void => { - if (closed) return; - closed = true; - screen.off('keypress', handleKeypress); - screen.off('mousedown', handleMouse); - box.destroy(); - help.destroy(); - screen.render(); - }; - const finish = (value: RepositoryChoice | null): void => { - teardown(); - resolve(value); - }; - const handleKeypress = (_char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { - if (key.name === 'escape' || key.name === 'q') { - finish(null); - return; - } - if (key.name === 'n') { - const newIndex = choices.findIndex((choice) => choice.kind === 'new'); - if (newIndex >= 0) { - box.select(newIndex); - screen.render(); - } - } - }; - const handleMouse = (event: MouseEventArg): void => { - if (event.button === 'right') { - finish(null); - } - }; - - screen.on('keypress', handleKeypress); - screen.on('mousedown', handleMouse); - box.on('select', (_item, index) => finish(choices[index] ?? null)); - }); -} - -async function promptRepositoryInput(screen: blessed.Widgets.Screen): Promise { - const prompt = blessed.prompt({ - parent: screen, - border: 'line', - height: 7, - width: '60%', - top: 'center', - left: 'center', - label: ' Repository ', - tags: true, - keys: true, - vi: true, - style: { - border: { fg: 'cyan' }, - bg: '#101522', - }, - }); - - return await new Promise((resolve) => { - let closed = false; - const finish = (value: RepositoryTarget | null): void => { - if (closed) return; - closed = true; - screen.off('mousedown', handleMouse); - prompt.destroy(); - screen.render(); - resolve(value); - }; - const handleMouse = (event: MouseEventArg): void => { - if (event.button === 'right') { - finish(null); - } - }; - - screen.on('mousedown', handleMouse); - prompt.key(['escape'], () => finish(null)); - prompt.input('Repository to open (owner/repo)', '', (_error, value) => { - const parsed = parseOwnerRepoValue((value ?? '').trim()); - finish(parsed); - }); - }); -} - -export function parseOwnerRepoValue(value: string): { owner: string; repo: string } | null { - const parts = value.trim().split('/'); - if (parts.length !== 2 || !parts[0] || !parts[1]) { - return null; - } - return { owner: parts[0], repo: parts[1] }; -} - function formatActivityTimestamp(now: Date = new Date()): string { return now.toISOString().slice(11, 19); } diff --git a/apps/cli/src/tui/repository-picker.ts b/apps/cli/src/tui/repository-picker.ts new file mode 100644 index 0000000..c6cf9fa --- /dev/null +++ b/apps/cli/src/tui/repository-picker.ts @@ -0,0 +1,165 @@ +import blessed from 'neo-blessed'; + +import type { GHCrawlService } from '@ghcrawl/api-core'; + +import { formatRelativeTime } from './state.js'; +import type { MouseEventArg } from './widgets.js'; + +export type RepositoryTarget = { + owner: string; + repo: string; +}; + +export type RepositoryChoice = + | { + kind: 'existing'; + target: RepositoryTarget; + label: string; + } + | { + kind: 'new'; + label: string; + }; + +export function getRepositoryChoices(service: Pick, now: Date = new Date()): RepositoryChoice[] { + const repositories = service.listRepositories().repositories + .slice() + .sort((left, right) => Date.parse(right.updatedAt) - Date.parse(left.updatedAt) || left.fullName.localeCompare(right.fullName)); + + return [ + ...repositories.map((repository) => ({ + kind: 'existing' as const, + target: { owner: repository.owner, repo: repository.name }, + label: `${repository.fullName} ${formatRelativeTime(repository.updatedAt, now)}`, + })), + { kind: 'new' as const, label: '+ Select another repository path' }, + ]; +} + +export async function promptRepositoryChoice( + screen: blessed.Widgets.Screen, + service: GHCrawlService, +): Promise { + const choices = getRepositoryChoices(service); + const box = blessed.list({ + parent: screen, + border: 'line', + label: ' Repositories ', + keys: true, + vi: true, + mouse: true, + top: 'center', + left: 'center', + width: '70%', + height: '70%', + style: { + border: { fg: '#5bc0eb' }, + item: { fg: 'white' }, + selected: { bg: '#5bc0eb', fg: 'black', bold: true }, + }, + items: choices.map((choice) => choice.label), + }); + const help = blessed.box({ + parent: screen, + bottom: 0, + left: 0, + width: '100%', + height: 1, + content: 'Select a repository with Enter. Press n for a new repo. Esc cancels.', + style: { fg: 'black', bg: '#5bc0eb' }, + }); + + box.focus(); + box.select(0); + screen.render(); + + return await new Promise((resolve) => { + let closed = false; + const teardown = (): void => { + if (closed) return; + closed = true; + screen.off('keypress', handleKeypress); + screen.off('mousedown', handleMouse); + box.destroy(); + help.destroy(); + screen.render(); + }; + const finish = (value: RepositoryChoice | null): void => { + teardown(); + resolve(value); + }; + const handleKeypress = (_char: string, key: blessed.Widgets.Events.IKeyEventArg): void => { + if (key.name === 'escape' || key.name === 'q') { + finish(null); + return; + } + if (key.name === 'n') { + const newIndex = choices.findIndex((choice) => choice.kind === 'new'); + if (newIndex >= 0) { + box.select(newIndex); + screen.render(); + } + } + }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(null); + } + }; + + screen.on('keypress', handleKeypress); + screen.on('mousedown', handleMouse); + box.on('select', (_item, index) => finish(choices[index] ?? null)); + }); +} + +export async function promptRepositoryInput(screen: blessed.Widgets.Screen): Promise { + const prompt = blessed.prompt({ + parent: screen, + border: 'line', + height: 7, + width: '60%', + top: 'center', + left: 'center', + label: ' Repository ', + tags: true, + keys: true, + vi: true, + style: { + border: { fg: 'cyan' }, + bg: '#101522', + }, + }); + + return await new Promise((resolve) => { + let closed = false; + const finish = (value: RepositoryTarget | null): void => { + if (closed) return; + closed = true; + screen.off('mousedown', handleMouse); + prompt.destroy(); + screen.render(); + resolve(value); + }; + const handleMouse = (event: MouseEventArg): void => { + if (event.button === 'right') { + finish(null); + } + }; + + screen.on('mousedown', handleMouse); + prompt.key(['escape'], () => finish(null)); + prompt.input('Repository to open (owner/repo)', '', (_error, value) => { + const parsed = parseOwnerRepoValue((value ?? '').trim()); + finish(parsed); + }); + }); +} + +export function parseOwnerRepoValue(value: string): RepositoryTarget | null { + const parts = value.trim().split('/'); + if (parts.length !== 2 || !parts[0] || !parts[1]) { + return null; + } + return { owner: parts[0], repo: parts[1] }; +} From 0a83bcf089ca1acddbb4a7111a5bb700e430839c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:11:02 -0700 Subject: [PATCH 205/215] refactor: extract storage optimization flow --- packages/api-core/src/service.ts | 52 ++-------------- packages/api-core/src/storage-maintenance.ts | 63 +++++++++++++++++++- 2 files changed, 67 insertions(+), 48 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index fe4f7f1..941d418 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -1,5 +1,4 @@ import fs from 'node:fs'; -import { existsSync } from 'node:fs'; import os from 'node:os'; import path from 'node:path'; import { Worker } from 'node:worker_threads'; @@ -17,7 +16,6 @@ import { embedResultSchema, healthResponseSchema, neighborsResponseSchema, - optimizeResponseSchema, refreshResponseSchema, searchResponseSchema, syncResultSchema, @@ -134,7 +132,7 @@ import { import { finishServiceRun, listRunHistoryForRepository, startServiceRun } from './run-history.js'; import { listStoredRepositories } from './repositories/list.js'; import { cosineSimilarity, dotProduct, rankNearestNeighbors, rankNearestNeighborsByScore } from './search/exact.js'; -import { missingVectorStoreTarget, optimizeSqliteTarget } from './storage-maintenance.js'; +import { optimizeStorageStores } from './storage-maintenance.js'; import { fetchThreadComments } from './sync/comments.js'; import { getSyncCursorState, writeSyncCursorState } from './sync/cursor.js'; import { persistThreadCodeSnapshot, upsertRepository, upsertThread } from './sync/persistence.js'; @@ -223,7 +221,7 @@ import { rebuildRepositoryVectorStore, resetRepositoryVectors, } from './vector/repository-maintenance.js'; -import { isCorruptedVectorIndexError, repositoryVectorStorePath, vectorStoreSidecarPath } from './vector/repository-store.js'; +import { isCorruptedVectorIndexError, repositoryVectorStorePath } from './vector/repository-store.js'; import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; @@ -2457,54 +2455,16 @@ export class GHCrawlService { } optimizeStorage(params: { owner?: string; repo?: string } = {}): OptimizeResponse { - const startedAt = nowIso(); const repository = params.owner && params.repo ? this.requireRepository(params.owner, params.repo) : null; - const targets = [ - optimizeSqliteTarget({ - name: 'main', - db: this.db, - dbPath: this.config.dbPath, - }), - ]; - - if (repository) { - const storePath = repositoryVectorStorePath(this.config.configDir, repository.fullName); - const sidecarPath = vectorStoreSidecarPath(storePath); - if (existsSync(storePath)) { - this.vectorStore.close(); - const vectorDb = openDb(storePath) as SqliteDatabase & { loadExtension: (extensionPath: string) => void }; - try { - const vectorlite = requireFromHere('vectorlite') as { vectorlitePath: () => string }; - vectorDb.loadExtension(vectorlite.vectorlitePath()); - targets.push( - optimizeSqliteTarget({ - name: 'vector', - db: vectorDb, - dbPath: storePath, - sidecarPath, - }), - ); - } finally { - vectorDb.close(); - } - } else { - targets.push(missingVectorStoreTarget(storePath, sidecarPath)); - } - } - - const bytesReclaimed = targets.reduce((sum, target) => sum + target.bytesReclaimed, 0); - return optimizeResponseSchema.parse({ - ok: true, + return optimizeStorageStores({ + config: this.config, + db: this.db, + vectorStore: this.vectorStore, repository, - startedAt, - finishedAt: nowIso(), - targets, - bytesReclaimed, - message: `Optimized ${targets.filter((target) => target.existed).length} SQLite store(s); reclaimed ${bytesReclaimed} byte(s).`, }); } diff --git a/packages/api-core/src/storage-maintenance.ts b/packages/api-core/src/storage-maintenance.ts index 9a1ad7c..c92ebe0 100644 --- a/packages/api-core/src/storage-maintenance.ts +++ b/packages/api-core/src/storage-maintenance.ts @@ -1,13 +1,72 @@ import fs from 'node:fs'; import { existsSync } from 'node:fs'; -import type { OptimizeResponse } from '@ghcrawl/api-contract'; +import { optimizeResponseSchema, type OptimizeResponse, type RepositoryDto } from '@ghcrawl/api-contract'; -import type { SqliteDatabase } from './db/sqlite.js'; +import type { GitcrawlConfig } from './config.js'; +import { openDb, type SqliteDatabase } from './db/sqlite.js'; +import { requireFromHere } from './service-constants.js'; import type { SqliteMaintenanceStats } from './service-types.js'; +import { nowIso } from './service-utils.js'; +import { repositoryVectorStorePath, vectorStoreSidecarPath } from './vector/repository-store.js'; +import type { VectorStore } from './vector/store.js'; type OptimizeTarget = OptimizeResponse['targets'][number]; +export function optimizeStorageStores(params: { + config: GitcrawlConfig; + db: SqliteDatabase; + vectorStore: VectorStore; + repository?: RepositoryDto | null; +}): OptimizeResponse { + const startedAt = nowIso(); + const repository = params.repository ?? null; + + const targets = [ + optimizeSqliteTarget({ + name: 'main', + db: params.db, + dbPath: params.config.dbPath, + }), + ]; + + if (repository) { + const storePath = repositoryVectorStorePath(params.config.configDir, repository.fullName); + const sidecarPath = vectorStoreSidecarPath(storePath); + if (existsSync(storePath)) { + params.vectorStore.close(); + const vectorDb = openDb(storePath) as SqliteDatabase & { loadExtension: (extensionPath: string) => void }; + try { + const vectorlite = requireFromHere('vectorlite') as { vectorlitePath: () => string }; + vectorDb.loadExtension(vectorlite.vectorlitePath()); + targets.push( + optimizeSqliteTarget({ + name: 'vector', + db: vectorDb, + dbPath: storePath, + sidecarPath, + }), + ); + } finally { + vectorDb.close(); + } + } else { + targets.push(missingVectorStoreTarget(storePath, sidecarPath)); + } + } + + const bytesReclaimed = targets.reduce((sum, target) => sum + target.bytesReclaimed, 0); + return optimizeResponseSchema.parse({ + ok: true, + repository, + startedAt, + finishedAt: nowIso(), + targets, + bytesReclaimed, + message: `Optimized ${targets.filter((target) => target.existed).length} SQLite store(s); reclaimed ${bytesReclaimed} byte(s).`, + }); +} + export function missingVectorStoreTarget(storePath: string, sidecarPath: string): OptimizeTarget { const sidecarBytes = fileSize(sidecarPath); return { From ab3d7762b8b421be87f87d62ee5eab98a877127f Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:12:09 -0700 Subject: [PATCH 206/215] refactor: import repository picker helpers directly --- apps/cli/src/tui/app.test.ts | 3 +-- apps/cli/src/tui/app.ts | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/apps/cli/src/tui/app.test.ts b/apps/cli/src/tui/app.test.ts index 05b429a..f24fde2 100644 --- a/apps/cli/src/tui/app.test.ts +++ b/apps/cli/src/tui/app.test.ts @@ -5,10 +5,9 @@ import type { TuiClusterDetail, TuiThreadDetail } from '@ghcrawl/api-core'; import { buildHelpContent, - getRepositoryChoices, - parseOwnerRepoValue, resolveBlessedTerminal, } from './app.js'; +import { getRepositoryChoices, parseOwnerRepoValue } from './repository-picker.js'; import { buildThreadContextMenuItems, escapeBlessedText, diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 4c4f780..790016a 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -66,7 +66,6 @@ import { export { resolveBlessedTerminal } from './widgets.js'; export { buildHelpContent } from './help.js'; -export { getRepositoryChoices, parseOwnerRepoValue } from './repository-picker.js'; type StartTuiParams = { service: GHCrawlService; From aae508efb900b25fbebebfc4931ed08992ce0763 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:14:16 -0700 Subject: [PATCH 207/215] refactor: extract cli command catalog --- apps/cli/src/commands.ts | 472 ++++++++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 481 +-------------------------------------- 2 files changed, 480 insertions(+), 473 deletions(-) create mode 100644 apps/cli/src/commands.ts diff --git a/apps/cli/src/commands.ts b/apps/cli/src/commands.ts new file mode 100644 index 0000000..789d702 --- /dev/null +++ b/apps/cli/src/commands.ts @@ -0,0 +1,472 @@ +export type CommandName = + | 'doctor' + | 'configure' + | 'version' + | 'sync' + | 'export-sync' + | 'validate-sync' + | 'portable-size' + | 'sync-status' + | 'import-sync' + | 'refresh' + | 'optimize' + | 'runs' + | 'threads' + | 'close-thread' + | 'close-cluster' + | 'exclude-cluster-member' + | 'include-cluster-member' + | 'set-cluster-canonical' + | 'merge-clusters' + | 'split-cluster' + | 'summarize' + | 'key-summaries' + | 'purge-comments' + | 'embed' + | 'cluster' + | 'cluster-experiment' + | 'clusters' + | 'durable-clusters' + | 'cluster-detail' + | 'cluster-explain' + | 'search' + | 'neighbors' + | 'tui' + | 'serve'; + +export type CommandSpec = { + name: CommandName; + synopsis: string; + description: string; + options: string[]; + examples: string[]; + devOnly?: boolean; + agentJson?: boolean; +}; + +const COMMAND_SPECS: readonly CommandSpec[] = [ + { + name: 'doctor', + synopsis: 'doctor [--json]', + description: 'Check local config, database wiring, and auth health.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl doctor', 'ghcrawl doctor --json'], + agentJson: true, + }, + { + name: 'configure', + synopsis: 'configure [--summary-model gpt-5.4|gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary|llm_key_summary] [--json]', + description: 'Show or update persisted summarization and embedding settings.', + options: [ + '--summary-model Select gpt-5.4, gpt-5-mini, or gpt-5.4-mini for summarization', + '--embedding-basis Select title_original, title_summary, or llm_key_summary for active vectors', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl configure', 'ghcrawl configure --summary-model gpt-5.4', 'ghcrawl configure --embedding-basis title_original --json'], + agentJson: true, + }, + { + name: 'version', + synopsis: 'version', + description: 'Print the installed ghcrawl version.', + options: [], + examples: ['ghcrawl version', 'ghcrawl --version'], + }, + { + name: 'sync', + synopsis: 'sync [--since ] [--limit ] [--include-comments] [--include-code] [--full-reconcile] [--json]', + description: 'Sync open GitHub issues and PRs into the local database.', + options: [ + '--since Limit sync window using ISO time or 15m/2h/7d/1mo', + '--limit Limit the number of synced items', + '--include-comments Hydrate issue comments, PR reviews, and review comments', + '--include-code Hydrate pull request file metadata and patch signatures', + '--full-reconcile Reconcile stale open items instead of metadata-only incrementals', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl sync openclaw/openclaw --limit 1', 'ghcrawl sync openclaw/openclaw --since 7d --json'], + agentJson: true, + }, + { + name: 'export-sync', + synopsis: 'export-sync [--output ] [--profile lean|review] [--manifest] [--body-chars ] [--json]', + description: 'Export a compact portable SQLite core for git-style file sync.', + options: [ + '--output Output SQLite path; defaults to the ghcrawl config exports directory', + '--profile lean|review Use a preset body excerpt budget for git sync', + '--manifest Write a JSON sidecar with counts, SHA256, and validation status', + '--body-chars Maximum body excerpt characters per thread; default 512', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl export-sync openclaw/openclaw --profile lean --manifest --output ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'validate-sync', + synopsis: 'validate-sync [--json]', + description: 'Validate a portable git-sync SQLite database without mutating it.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl validate-sync ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'portable-size', + synopsis: 'portable-size [--json]', + description: 'Report portable git-sync SQLite table sizes.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl portable-size ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'sync-status', + synopsis: 'sync-status --portable [--json]', + description: 'Compare the live repository store against a portable git-sync SQLite database.', + options: ['--portable Portable SQLite path to compare', '--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl sync-status openclaw/openclaw --portable ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'import-sync', + synopsis: 'import-sync [--json]', + description: 'Import a portable git-sync SQLite database into the configured live store.', + options: ['--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl import-sync ./openclaw.sync.db --json'], + agentJson: true, + }, + { + name: 'refresh', + synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', + description: 'Run sync, embed, and cluster in one staged pipeline.', + options: [ + '--no-sync Skip the GitHub sync stage', + '--include-code Hydrate pull request file metadata during sync', + '--no-embed Skip the embeddings stage', + '--no-cluster Skip the clustering stage', + '--heap-snapshot-dir Write heap snapshots during long-running work', + '--heap-log-interval-ms Emit periodic heap diagnostics', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl refresh openclaw/openclaw', 'ghcrawl refresh openclaw/openclaw --no-sync --json'], + agentJson: true, + }, + { + name: 'optimize', + synopsis: 'optimize [owner/repo] [--json]', + description: 'Checkpoint, analyze, optimize, and vacuum local SQLite stores.', + options: [ + 'owner/repo Also optimize this repository vector store when present', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl optimize --json', 'ghcrawl optimize openclaw/openclaw --json'], + agentJson: true, + }, + { + name: 'runs', + synopsis: 'runs [--kind sync|summary|embedding|cluster] [--limit ] [--json]', + description: 'List recent local pipeline runs and failures for one repo.', + options: [ + '--kind sync|summary|embedding|cluster Restrict to one run table', + '--limit Maximum number of records to return', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl runs openclaw/openclaw --limit 20 --json', 'ghcrawl runs openclaw/openclaw --kind cluster --json'], + agentJson: true, + }, + { + name: 'threads', + synopsis: 'threads [--numbers ] [--kind issue|pull_request] [--include-closed] [--json]', + description: 'Read specific local issue and PR records from SQLite.', + options: [ + '--numbers Fetch one or more thread numbers in one call', + '--kind issue|pull_request Filter by issue or pull request', + '--include-closed Include locally closed items', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl threads openclaw/openclaw --numbers 42,43,44 --json', 'ghcrawl threads openclaw/openclaw --numbers 42 --include-closed --json'], + agentJson: true, + }, + { + name: 'close-thread', + synopsis: 'close-thread --number [--json]', + description: 'Mark one local issue or PR closed immediately.', + options: ['--number Thread number to close locally', '--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl close-thread openclaw/openclaw --number 42 --json'], + agentJson: true, + }, + { + name: 'close-cluster', + synopsis: 'close-cluster --id [--json]', + description: 'Mark one local cluster closed immediately.', + options: ['--id Cluster id to close locally', '--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl close-cluster openclaw/openclaw --id 123 --json'], + agentJson: true, + }, + { + name: 'exclude-cluster-member', + synopsis: 'exclude-cluster-member --id --number [--reason ] [--json]', + description: 'Remove one issue or PR from a durable cluster and block automatic re-entry.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to exclude', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl exclude-cluster-member openclaw/openclaw --id 123 --number 42 --reason "false positive" --json'], + agentJson: true, + }, + { + name: 'include-cluster-member', + synopsis: 'include-cluster-member --id --number [--reason ] [--json]', + description: 'Add one issue or PR to a durable cluster and keep it included across rebuilds.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to include', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl include-cluster-member openclaw/openclaw --id 123 --number 42 --reason "same root cause" --json'], + agentJson: true, + }, + { + name: 'set-cluster-canonical', + synopsis: 'set-cluster-canonical --id --number [--reason ] [--json]', + description: 'Pin one durable cluster member as the canonical representative.', + options: [ + '--id Durable cluster id', + '--number Issue or PR number to mark canonical', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl set-cluster-canonical openclaw/openclaw --id 123 --number 42 --reason "best root issue" --json'], + agentJson: true, + }, + { + name: 'merge-clusters', + synopsis: 'merge-clusters --source --target [--reason ] [--json]', + description: 'Merge one durable cluster into another and preserve the source slug as an alias.', + options: [ + '--source Durable cluster id to merge from', + '--target Durable cluster id to merge into', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl merge-clusters openclaw/openclaw --source 123 --target 456 --reason "same root cause" --json'], + agentJson: true, + }, + { + name: 'split-cluster', + synopsis: 'split-cluster --source --numbers [--reason ] [--json]', + description: 'Split selected active members into a new durable cluster and block automatic re-entry into the source.', + options: [ + '--source Durable cluster id to split from', + '--numbers Issue or PR numbers to move into the new cluster', + '--reason Optional maintainer reason', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl split-cluster openclaw/openclaw --source 123 --numbers 42,43 --reason "separate root cause" --json'], + agentJson: true, + }, + { + name: 'embed', + synopsis: 'embed [--number ] [--json]', + description: 'Generate or refresh embeddings for one repo or one thread.', + options: ['--number Restrict embedding work to one thread', '--json Emit machine-readable JSON output explicitly'], + examples: ['ghcrawl embed openclaw/openclaw --json', 'ghcrawl embed openclaw/openclaw --number 42 --json'], + agentJson: true, + }, + { + name: 'key-summaries', + synopsis: 'key-summaries [--number ] [--limit ] [--json]', + description: 'Generate cached structured LLM key summaries for clustering enrichment.', + options: [ + '--number Restrict key summary work to one thread', + '--limit Limit the number of generated summaries', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl key-summaries openclaw/openclaw --limit 25 --json'], + agentJson: true, + }, + { + name: 'cluster', + synopsis: 'cluster [--number ] [--k ] [--threshold ] [--max-cluster-size ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', + description: 'Build or refresh local similarity clusters.', + options: [ + '--number Refresh only one durable cluster neighborhood', + '--k Limit nearest-neighbor fanout', + '--threshold Minimum similarity score', + '--max-cluster-size Soft cap for automatic cluster components before starting a new component', + '--heap-snapshot-dir Write heap snapshots during long-running work', + '--heap-log-interval-ms Emit periodic heap diagnostics', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl cluster openclaw/openclaw --json', 'ghcrawl cluster openclaw/openclaw --number 42 --threshold 0.82 --json'], + agentJson: true, + }, + { + name: 'clusters', + synopsis: 'clusters [--min-size ] [--limit ] [--sort recent|size] [--search ] [--hide-closed] [--json]', + description: 'List local cluster summaries for one repository.', + options: [ + '--min-size Minimum cluster size to return', + '--limit Maximum number of clusters to return', + '--sort recent|size Sort by recency or cluster size', + '--search Filter clusters by text', + '--hide-closed Hide locally closed clusters', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl clusters openclaw/openclaw --min-size 5 --limit 20 --sort recent --json'], + agentJson: true, + }, + { + name: 'cluster-detail', + synopsis: 'cluster-detail --id [--member-limit ] [--body-chars ] [--hide-closed] [--json]', + description: 'Dump one local cluster and its members.', + options: [ + '--id Cluster id to inspect', + '--member-limit Limit member rows in the response', + '--body-chars Limit body snippet size', + '--hide-closed Hide locally closed clusters', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl cluster-detail openclaw/openclaw --id 123 --member-limit 20 --body-chars 280 --json'], + agentJson: true, + }, + { + name: 'cluster-explain', + synopsis: 'cluster-explain --id [--member-limit ] [--event-limit ] [--json]', + description: 'Explain one durable cluster with evidence, overrides, aliases, and event history.', + options: [ + '--id Durable cluster id to inspect', + '--member-limit Limit member rows and evidence scope', + '--event-limit Limit event history rows', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl cluster-explain openclaw/openclaw --id 123 --member-limit 20 --event-limit 50 --json'], + agentJson: true, + }, + { + name: 'durable-clusters', + synopsis: 'durable-clusters [--include-inactive] [--member-limit ] [--json]', + description: 'List persistent cluster identities, stable slugs, and governed memberships.', + options: [ + '--include-inactive Include closed, merged, and split durable clusters', + '--member-limit Limit returned members per cluster', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl durable-clusters openclaw/openclaw --member-limit 10 --json'], + agentJson: true, + }, + { + name: 'search', + synopsis: 'search --query [--mode keyword|semantic|hybrid] [--json]', + description: 'Search local cluster and thread data.', + options: [ + '--query Query string to search for', + '--mode keyword|semantic|hybrid Choose search mode explicitly', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl search openclaw/openclaw --query "download stalls" --mode hybrid --json'], + agentJson: true, + }, + { + name: 'neighbors', + synopsis: 'neighbors --number [--limit ] [--threshold ] [--json]', + description: 'List nearest semantic matches for one thread.', + options: [ + '--number Thread number to inspect', + '--limit Maximum number of neighbors to return', + '--threshold Minimum similarity score', + '--json Emit machine-readable JSON output explicitly', + ], + examples: ['ghcrawl neighbors openclaw/openclaw --number 42 --limit 10 --json'], + agentJson: true, + }, + { + name: 'tui', + synopsis: 'tui [owner/repo]', + description: 'Start the interactive terminal UI.', + options: [], + examples: ['ghcrawl tui', 'ghcrawl tui openclaw/openclaw'], + }, + { + name: 'serve', + synopsis: 'serve [--port ]', + description: 'Start the local HTTP API server.', + options: ['--port Override the configured local API port'], + examples: ['ghcrawl serve', 'ghcrawl serve --port 5179'], + }, + { + name: 'summarize', + synopsis: 'summarize [--number ] [--include-comments]', + description: 'Generate or refresh summaries for local thread content.', + options: ['--number Restrict summary work to one thread', '--include-comments Include comments in the summary input'], + examples: ['ghcrawl --dev summarize openclaw/openclaw', 'ghcrawl --dev summarize openclaw/openclaw --number 42 --include-comments'], + devOnly: true, + }, + { + name: 'purge-comments', + synopsis: 'purge-comments [--number ]', + description: 'Delete stored comments for one repo or one thread.', + options: ['--number Restrict purge to one thread'], + examples: ['ghcrawl --dev purge-comments openclaw/openclaw', 'ghcrawl --dev purge-comments openclaw/openclaw --number 42'], + devOnly: true, + }, +]; + +function visibleCommandSpecs(devMode: boolean): CommandSpec[] { + return COMMAND_SPECS.filter((spec) => devMode || spec.devOnly !== true); +} + +export function getCommandSpec(name: string, devMode: boolean): CommandSpec | undefined { + return visibleCommandSpecs(devMode).find((spec) => spec.name === name); +} + +function renderCommandList(devMode: boolean): string[] { + const specs = visibleCommandSpecs(devMode); + const width = Math.max(...specs.map((spec) => spec.name.length)); + return specs.map((spec) => ` ${spec.name.padEnd(width)} ${spec.description}`); +} + +function commonGlobalOptions(): string[] { + return [ + '--config-path Override the persisted config.json path', + '--workspace-root Override workspace root detection for .env.local and data/ghcrawl.db', + '--dev Enable dev-only commands and help output', + ]; +} + +export function usage(devMode = false): string { + const lines = [ + 'ghcrawl [options]', + '', + 'Commands:', + ...renderCommandList(devMode), + '', + 'Global options:', + ...commonGlobalOptions().map((line) => ` ${line}`), + '', + "Use 'ghcrawl help ' or 'ghcrawl --help' for details.", + ]; + return `${lines.join('\n')}\n`; +} + +export function commandUsage(spec: CommandSpec): string { + const lines = [`ghcrawl ${spec.synopsis}`, '', spec.description]; + if (spec.options.length > 0) { + lines.push('', 'Options:', ...spec.options.map((line) => ` ${line}`)); + } + lines.push('', 'Global options:', ...commonGlobalOptions().map((line) => ` ${line}`)); + if (spec.agentJson) { + lines.push('', 'Machine output:', ' Supports explicit --json. JSON remains the default in this compatibility pass.'); + } + lines.push('', 'Examples:', ...spec.examples.map((example) => ` ${example}`)); + return `${lines.join('\n')}\n`; +} + +export function hasHelpFlag(args: string[]): boolean { + return args.includes('--help') || args.includes('-h'); +} + +export function usageHint(command?: CommandName): string { + return command ? `Run 'ghcrawl ${command} --help' for usage.` : "Run 'ghcrawl --help' for usage."; +} diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 02fbe2d..8c171dc 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -15,56 +15,18 @@ import { writePersistedConfig, type LoadConfigOptions, } from '@ghcrawl/api-core'; +import { + commandUsage, + getCommandSpec, + hasHelpFlag, + usage, + usageHint, + type CommandName, +} from './commands.js'; import { createHeapDiagnostics, type HeapDiagnostics } from './heap-diagnostics.js'; import { buildConfigureReport, formatConfigureReport, formatDoctorReport, type DoctorReport } from './reports.js'; import { startTui } from './tui/app.js'; -type CommandName = - | 'doctor' - | 'configure' - | 'version' - | 'sync' - | 'export-sync' - | 'validate-sync' - | 'portable-size' - | 'sync-status' - | 'import-sync' - | 'refresh' - | 'optimize' - | 'runs' - | 'threads' - | 'close-thread' - | 'close-cluster' - | 'exclude-cluster-member' - | 'include-cluster-member' - | 'set-cluster-canonical' - | 'merge-clusters' - | 'split-cluster' - | 'summarize' - | 'key-summaries' - | 'purge-comments' - | 'embed' - | 'cluster' - | 'cluster-experiment' - | 'clusters' - | 'durable-clusters' - | 'cluster-detail' - | 'cluster-explain' - | 'search' - | 'neighbors' - | 'tui' - | 'serve'; - -type CommandSpec = { - name: CommandName; - synopsis: string; - description: string; - options: string[]; - examples: string[]; - devOnly?: boolean; - agentJson?: boolean; -}; - type ParsedGlobalFlags = { argv: string[]; devMode: boolean; @@ -84,375 +46,6 @@ type ParsedRepoFlags = { owner: string; repo: string; values: RepoCommandValues const CLI_VERSION = loadCliVersion(); -const COMMAND_SPECS: readonly CommandSpec[] = [ - { - name: 'doctor', - synopsis: 'doctor [--json]', - description: 'Check local config, database wiring, and auth health.', - options: ['--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl doctor', 'ghcrawl doctor --json'], - agentJson: true, - }, - { - name: 'configure', - synopsis: 'configure [--summary-model gpt-5.4|gpt-5-mini|gpt-5.4-mini] [--embedding-basis title_original|title_summary|llm_key_summary] [--json]', - description: 'Show or update persisted summarization and embedding settings.', - options: [ - '--summary-model Select gpt-5.4, gpt-5-mini, or gpt-5.4-mini for summarization', - '--embedding-basis Select title_original, title_summary, or llm_key_summary for active vectors', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl configure', 'ghcrawl configure --summary-model gpt-5.4', 'ghcrawl configure --embedding-basis title_original --json'], - agentJson: true, - }, - { - name: 'version', - synopsis: 'version', - description: 'Print the installed ghcrawl version.', - options: [], - examples: ['ghcrawl version', 'ghcrawl --version'], - }, - { - name: 'sync', - synopsis: 'sync [--since ] [--limit ] [--include-comments] [--include-code] [--full-reconcile] [--json]', - description: 'Sync open GitHub issues and PRs into the local database.', - options: [ - '--since Limit sync window using ISO time or 15m/2h/7d/1mo', - '--limit Limit the number of synced items', - '--include-comments Hydrate issue comments, PR reviews, and review comments', - '--include-code Hydrate pull request file metadata and patch signatures', - '--full-reconcile Reconcile stale open items instead of metadata-only incrementals', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl sync openclaw/openclaw --limit 1', 'ghcrawl sync openclaw/openclaw --since 7d --json'], - agentJson: true, - }, - { - name: 'export-sync', - synopsis: 'export-sync [--output ] [--profile lean|review] [--manifest] [--body-chars ] [--json]', - description: 'Export a compact portable SQLite core for git-style file sync.', - options: [ - '--output Output SQLite path; defaults to the ghcrawl config exports directory', - '--profile lean|review Use a preset body excerpt budget for git sync', - '--manifest Write a JSON sidecar with counts, SHA256, and validation status', - '--body-chars Maximum body excerpt characters per thread; default 512', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl export-sync openclaw/openclaw --profile lean --manifest --output ./openclaw.sync.db --json'], - agentJson: true, - }, - { - name: 'validate-sync', - synopsis: 'validate-sync [--json]', - description: 'Validate a portable git-sync SQLite database without mutating it.', - options: ['--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl validate-sync ./openclaw.sync.db --json'], - agentJson: true, - }, - { - name: 'portable-size', - synopsis: 'portable-size [--json]', - description: 'Report portable git-sync SQLite table sizes.', - options: ['--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl portable-size ./openclaw.sync.db --json'], - agentJson: true, - }, - { - name: 'sync-status', - synopsis: 'sync-status --portable [--json]', - description: 'Compare the live repository store against a portable git-sync SQLite database.', - options: ['--portable Portable SQLite path to compare', '--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl sync-status openclaw/openclaw --portable ./openclaw.sync.db --json'], - agentJson: true, - }, - { - name: 'import-sync', - synopsis: 'import-sync [--json]', - description: 'Import a portable git-sync SQLite database into the configured live store.', - options: ['--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl import-sync ./openclaw.sync.db --json'], - agentJson: true, - }, - { - name: 'refresh', - synopsis: 'refresh [--include-code] [--no-sync] [--no-embed] [--no-cluster] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', - description: 'Run sync, embed, and cluster in one staged pipeline.', - options: [ - '--no-sync Skip the GitHub sync stage', - '--include-code Hydrate pull request file metadata during sync', - '--no-embed Skip the embeddings stage', - '--no-cluster Skip the clustering stage', - '--heap-snapshot-dir Write heap snapshots during long-running work', - '--heap-log-interval-ms Emit periodic heap diagnostics', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl refresh openclaw/openclaw', 'ghcrawl refresh openclaw/openclaw --no-sync --json'], - agentJson: true, - }, - { - name: 'optimize', - synopsis: 'optimize [owner/repo] [--json]', - description: 'Checkpoint, analyze, optimize, and vacuum local SQLite stores.', - options: [ - 'owner/repo Also optimize this repository vector store when present', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl optimize --json', 'ghcrawl optimize openclaw/openclaw --json'], - agentJson: true, - }, - { - name: 'runs', - synopsis: 'runs [--kind sync|summary|embedding|cluster] [--limit ] [--json]', - description: 'List recent local pipeline runs and failures for one repo.', - options: [ - '--kind sync|summary|embedding|cluster Restrict to one run table', - '--limit Maximum number of records to return', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl runs openclaw/openclaw --limit 20 --json', 'ghcrawl runs openclaw/openclaw --kind cluster --json'], - agentJson: true, - }, - { - name: 'threads', - synopsis: 'threads [--numbers ] [--kind issue|pull_request] [--include-closed] [--json]', - description: 'Read specific local issue and PR records from SQLite.', - options: [ - '--numbers Fetch one or more thread numbers in one call', - '--kind issue|pull_request Filter by issue or pull request', - '--include-closed Include locally closed items', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl threads openclaw/openclaw --numbers 42,43,44 --json', 'ghcrawl threads openclaw/openclaw --numbers 42 --include-closed --json'], - agentJson: true, - }, - { - name: 'close-thread', - synopsis: 'close-thread --number [--json]', - description: 'Mark one local issue or PR closed immediately.', - options: ['--number Thread number to close locally', '--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl close-thread openclaw/openclaw --number 42 --json'], - agentJson: true, - }, - { - name: 'close-cluster', - synopsis: 'close-cluster --id [--json]', - description: 'Mark one local cluster closed immediately.', - options: ['--id Cluster id to close locally', '--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl close-cluster openclaw/openclaw --id 123 --json'], - agentJson: true, - }, - { - name: 'exclude-cluster-member', - synopsis: 'exclude-cluster-member --id --number [--reason ] [--json]', - description: 'Remove one issue or PR from a durable cluster and block automatic re-entry.', - options: [ - '--id Durable cluster id', - '--number Issue or PR number to exclude', - '--reason Optional maintainer reason', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl exclude-cluster-member openclaw/openclaw --id 123 --number 42 --reason "false positive" --json'], - agentJson: true, - }, - { - name: 'include-cluster-member', - synopsis: 'include-cluster-member --id --number [--reason ] [--json]', - description: 'Add one issue or PR to a durable cluster and keep it included across rebuilds.', - options: [ - '--id Durable cluster id', - '--number Issue or PR number to include', - '--reason Optional maintainer reason', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl include-cluster-member openclaw/openclaw --id 123 --number 42 --reason "same root cause" --json'], - agentJson: true, - }, - { - name: 'set-cluster-canonical', - synopsis: 'set-cluster-canonical --id --number [--reason ] [--json]', - description: 'Pin one durable cluster member as the canonical representative.', - options: [ - '--id Durable cluster id', - '--number Issue or PR number to mark canonical', - '--reason Optional maintainer reason', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl set-cluster-canonical openclaw/openclaw --id 123 --number 42 --reason "best root issue" --json'], - agentJson: true, - }, - { - name: 'merge-clusters', - synopsis: 'merge-clusters --source --target [--reason ] [--json]', - description: 'Merge one durable cluster into another and preserve the source slug as an alias.', - options: [ - '--source Durable cluster id to merge from', - '--target Durable cluster id to merge into', - '--reason Optional maintainer reason', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl merge-clusters openclaw/openclaw --source 123 --target 456 --reason "same root cause" --json'], - agentJson: true, - }, - { - name: 'split-cluster', - synopsis: 'split-cluster --source --numbers [--reason ] [--json]', - description: 'Split selected active members into a new durable cluster and block automatic re-entry into the source.', - options: [ - '--source Durable cluster id to split from', - '--numbers Issue or PR numbers to move into the new cluster', - '--reason Optional maintainer reason', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl split-cluster openclaw/openclaw --source 123 --numbers 42,43 --reason "separate root cause" --json'], - agentJson: true, - }, - { - name: 'embed', - synopsis: 'embed [--number ] [--json]', - description: 'Generate or refresh embeddings for one repo or one thread.', - options: ['--number Restrict embedding work to one thread', '--json Emit machine-readable JSON output explicitly'], - examples: ['ghcrawl embed openclaw/openclaw --json', 'ghcrawl embed openclaw/openclaw --number 42 --json'], - agentJson: true, - }, - { - name: 'key-summaries', - synopsis: 'key-summaries [--number ] [--limit ] [--json]', - description: 'Generate cached structured LLM key summaries for clustering enrichment.', - options: [ - '--number Restrict key summary work to one thread', - '--limit Limit the number of generated summaries', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl key-summaries openclaw/openclaw --limit 25 --json'], - agentJson: true, - }, - { - name: 'cluster', - synopsis: 'cluster [--number ] [--k ] [--threshold ] [--max-cluster-size ] [--heap-snapshot-dir ] [--heap-log-interval-ms ] [--json]', - description: 'Build or refresh local similarity clusters.', - options: [ - '--number Refresh only one durable cluster neighborhood', - '--k Limit nearest-neighbor fanout', - '--threshold Minimum similarity score', - '--max-cluster-size Soft cap for automatic cluster components before starting a new component', - '--heap-snapshot-dir Write heap snapshots during long-running work', - '--heap-log-interval-ms Emit periodic heap diagnostics', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl cluster openclaw/openclaw --json', 'ghcrawl cluster openclaw/openclaw --number 42 --threshold 0.82 --json'], - agentJson: true, - }, - { - name: 'clusters', - synopsis: 'clusters [--min-size ] [--limit ] [--sort recent|size] [--search ] [--hide-closed] [--json]', - description: 'List local cluster summaries for one repository.', - options: [ - '--min-size Minimum cluster size to return', - '--limit Maximum number of clusters to return', - '--sort recent|size Sort by recency or cluster size', - '--search Filter clusters by text', - '--hide-closed Hide locally closed clusters', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl clusters openclaw/openclaw --min-size 5 --limit 20 --sort recent --json'], - agentJson: true, - }, - { - name: 'cluster-detail', - synopsis: 'cluster-detail --id [--member-limit ] [--body-chars ] [--hide-closed] [--json]', - description: 'Dump one local cluster and its members.', - options: [ - '--id Cluster id to inspect', - '--member-limit Limit member rows in the response', - '--body-chars Limit body snippet size', - '--hide-closed Hide locally closed clusters', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl cluster-detail openclaw/openclaw --id 123 --member-limit 20 --body-chars 280 --json'], - agentJson: true, - }, - { - name: 'cluster-explain', - synopsis: 'cluster-explain --id [--member-limit ] [--event-limit ] [--json]', - description: 'Explain one durable cluster with evidence, overrides, aliases, and event history.', - options: [ - '--id Durable cluster id to inspect', - '--member-limit Limit member rows and evidence scope', - '--event-limit Limit event history rows', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl cluster-explain openclaw/openclaw --id 123 --member-limit 20 --event-limit 50 --json'], - agentJson: true, - }, - { - name: 'durable-clusters', - synopsis: 'durable-clusters [--include-inactive] [--member-limit ] [--json]', - description: 'List persistent cluster identities, stable slugs, and governed memberships.', - options: [ - '--include-inactive Include closed, merged, and split durable clusters', - '--member-limit Limit returned members per cluster', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl durable-clusters openclaw/openclaw --member-limit 10 --json'], - agentJson: true, - }, - { - name: 'search', - synopsis: 'search --query [--mode keyword|semantic|hybrid] [--json]', - description: 'Search local cluster and thread data.', - options: [ - '--query Query string to search for', - '--mode keyword|semantic|hybrid Choose search mode explicitly', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl search openclaw/openclaw --query "download stalls" --mode hybrid --json'], - agentJson: true, - }, - { - name: 'neighbors', - synopsis: 'neighbors --number [--limit ] [--threshold ] [--json]', - description: 'List nearest semantic matches for one thread.', - options: [ - '--number Thread number to inspect', - '--limit Maximum number of neighbors to return', - '--threshold Minimum similarity score', - '--json Emit machine-readable JSON output explicitly', - ], - examples: ['ghcrawl neighbors openclaw/openclaw --number 42 --limit 10 --json'], - agentJson: true, - }, - { - name: 'tui', - synopsis: 'tui [owner/repo]', - description: 'Start the interactive terminal UI.', - options: [], - examples: ['ghcrawl tui', 'ghcrawl tui openclaw/openclaw'], - }, - { - name: 'serve', - synopsis: 'serve [--port ]', - description: 'Start the local HTTP API server.', - options: ['--port Override the configured local API port'], - examples: ['ghcrawl serve', 'ghcrawl serve --port 5179'], - }, - { - name: 'summarize', - synopsis: 'summarize [--number ] [--include-comments]', - description: 'Generate or refresh summaries for local thread content.', - options: ['--number Restrict summary work to one thread', '--include-comments Include comments in the summary input'], - examples: ['ghcrawl --dev summarize openclaw/openclaw', 'ghcrawl --dev summarize openclaw/openclaw --number 42 --include-comments'], - devOnly: true, - }, - { - name: 'purge-comments', - synopsis: 'purge-comments [--number ]', - description: 'Delete stored comments for one repo or one thread.', - options: ['--number Restrict purge to one thread'], - examples: ['ghcrawl --dev purge-comments openclaw/openclaw', 'ghcrawl --dev purge-comments openclaw/openclaw --number 42'], - devOnly: true, - }, -]; - class CliError extends Error { readonly exitCode: number; readonly command?: CommandName; @@ -472,64 +65,6 @@ class CliUsageError extends CliError { } } -function visibleCommandSpecs(devMode: boolean): CommandSpec[] { - return COMMAND_SPECS.filter((spec) => devMode || spec.devOnly !== true); -} - -function getCommandSpec(name: string, devMode: boolean): CommandSpec | undefined { - return visibleCommandSpecs(devMode).find((spec) => spec.name === name); -} - -function renderCommandList(devMode: boolean): string[] { - const specs = visibleCommandSpecs(devMode); - const width = Math.max(...specs.map((spec) => spec.name.length)); - return specs.map((spec) => ` ${spec.name.padEnd(width)} ${spec.description}`); -} - -function commonGlobalOptions(): string[] { - return [ - '--config-path Override the persisted config.json path', - '--workspace-root Override workspace root detection for .env.local and data/ghcrawl.db', - '--dev Enable dev-only commands and help output', - ]; -} - -function usage(devMode = false): string { - const lines = [ - 'ghcrawl [options]', - '', - 'Commands:', - ...renderCommandList(devMode), - '', - 'Global options:', - ...commonGlobalOptions().map((line) => ` ${line}`), - '', - "Use 'ghcrawl help ' or 'ghcrawl --help' for details.", - ]; - return `${lines.join('\n')}\n`; -} - -function commandUsage(spec: CommandSpec): string { - const lines = [`ghcrawl ${spec.synopsis}`, '', spec.description]; - if (spec.options.length > 0) { - lines.push('', 'Options:', ...spec.options.map((line) => ` ${line}`)); - } - lines.push('', 'Global options:', ...commonGlobalOptions().map((line) => ` ${line}`)); - if (spec.agentJson) { - lines.push('', 'Machine output:', ' Supports explicit --json. JSON remains the default in this compatibility pass.'); - } - lines.push('', 'Examples:', ...spec.examples.map((example) => ` ${example}`)); - return `${lines.join('\n')}\n`; -} - -function hasHelpFlag(args: string[]): boolean { - return args.includes('--help') || args.includes('-h'); -} - -function usageHint(command?: CommandName): string { - return command ? `Run 'ghcrawl ${command} --help' for usage.` : "Run 'ghcrawl --help' for usage."; -} - function readFlagValue(argv: string[], index: number, flag: string): { value: string; nextIndex: number } { const arg = argv[index]; const inlinePrefix = `${flag}=`; From 81937dc2325983629a25a203ef7a9deddf4113dc Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 19:15:39 -0700 Subject: [PATCH 208/215] refactor: compose tui thread detail in helper --- packages/api-core/src/service.ts | 56 +++++----------------- packages/api-core/src/tui/thread-detail.ts | 51 +++++++++++++++++++- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 941d418..560ef3e 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -72,7 +72,6 @@ import { materializeLatestDeterministicFingerprints } from './cluster/fingerprin import { humanKeyForValue, humanKeyStableSlug } from './cluster/human-key.js'; import { listStoredClusters } from './cluster/list-query.js'; import { LLM_KEY_SUMMARY_PROMPT_VERSION, llmKeyInputHash } from './cluster/llm-key-summary.js'; -import { listStoredClusterNeighbors } from './cluster/neighbor-queries.js'; import { summarizeClusterQuality, summarizeClusterSizes } from './cluster/quality.js'; import { getLatestClusterRun } from './cluster/run-queries.js'; import { @@ -150,11 +149,7 @@ import { } from './tui/cluster-queries.js'; import { getTuiRepoStats, getTuiRepositoryRefreshState } from './tui/repo-stats.js'; import { - getLatestTuiKeySummary, - getLatestTuiThreadClusterId, - getTopChangedFiles, - getTuiThreadRow, - getTuiThreadSummaries, + buildTuiThreadDetail, } from './tui/thread-detail.js'; import { ACTIVE_EMBED_DIMENSIONS, @@ -2697,47 +2692,22 @@ export class GHCrawlService { includeNeighbors?: boolean; }): TuiThreadDetail { const repository = this.requireRepository(params.owner, params.repo); - const row = getTuiThreadRow({ + return buildTuiThreadDetail({ db: this.db, - repoId: repository.id, + repository, + summaryModel: this.config.summaryModel, threadId: params.threadId, threadNumber: params.threadNumber, + includeNeighbors: params.includeNeighbors, + neighborFallback: (threadNumber) => + this.listNeighbors({ + owner: params.owner, + repo: params.repo, + threadNumber, + limit: 8, + minScore: 0.2, + }).neighbors, }); - - if (!row) { - throw new Error(`Thread was not found for ${repository.fullName}.`); - } - - const clusterId = getLatestTuiThreadClusterId(this.db, repository.id, row.id); - const summaries = getTuiThreadSummaries(this.db, row.id, this.config.summaryModel); - const topFiles = getTopChangedFiles(this.db, row.id, 5); - const keySummary = getLatestTuiKeySummary(this.db, row.id, this.config.summaryModel); - - let neighbors: SearchHitDto['neighbors'] = []; - if (params.includeNeighbors !== false) { - neighbors = listStoredClusterNeighbors({ db: this.db, repoId: repository.id, threadId: row.id, limit: 8 }); - if (neighbors.length === 0) { - try { - neighbors = this.listNeighbors({ - owner: params.owner, - repo: params.repo, - threadNumber: row.number, - limit: 8, - minScore: 0.2, - }).neighbors; - } catch { - neighbors = []; - } - } - } - - return { - thread: threadToDto(row, clusterId), - summaries, - keySummary, - topFiles, - neighbors, - }; } async rerunAction(request: ActionRequest): Promise { diff --git a/packages/api-core/src/tui/thread-detail.ts b/packages/api-core/src/tui/thread-detail.ts index 7f66df4..adbb03e 100644 --- a/packages/api-core/src/tui/thread-detail.ts +++ b/packages/api-core/src/tui/thread-detail.ts @@ -1,8 +1,57 @@ +import type { RepositoryDto, SearchHitDto } from '@ghcrawl/api-contract'; + +import { listStoredClusterNeighbors } from '../cluster/neighbor-queries.js'; import { getLatestClusterRun } from '../cluster/run-queries.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import { SUMMARY_PROMPT_VERSION } from '../service-constants.js'; import type { ThreadRow, TuiThreadDetail } from '../service-types.js'; -import { normalizeKeySummaryDisplayText } from '../service-utils.js'; +import { normalizeKeySummaryDisplayText, threadToDto } from '../service-utils.js'; + +export function buildTuiThreadDetail(params: { + db: SqliteDatabase; + repository: RepositoryDto; + summaryModel: string; + threadId?: number; + threadNumber?: number; + includeNeighbors?: boolean; + neighborFallback?: (threadNumber: number) => SearchHitDto['neighbors']; +}): TuiThreadDetail { + const row = getTuiThreadRow({ + db: params.db, + repoId: params.repository.id, + threadId: params.threadId, + threadNumber: params.threadNumber, + }); + + if (!row) { + throw new Error(`Thread was not found for ${params.repository.fullName}.`); + } + + const clusterId = getLatestTuiThreadClusterId(params.db, params.repository.id, row.id); + const summaries = getTuiThreadSummaries(params.db, row.id, params.summaryModel); + const topFiles = getTopChangedFiles(params.db, row.id, 5); + const keySummary = getLatestTuiKeySummary(params.db, row.id, params.summaryModel); + + let neighbors: SearchHitDto['neighbors'] = []; + if (params.includeNeighbors !== false) { + neighbors = listStoredClusterNeighbors({ db: params.db, repoId: params.repository.id, threadId: row.id, limit: 8 }); + if (neighbors.length === 0) { + try { + neighbors = params.neighborFallback?.(row.number) ?? []; + } catch { + neighbors = []; + } + } + } + + return { + thread: threadToDto(row, clusterId), + summaries, + keySummary, + topFiles, + neighbors, + }; +} export function getTuiThreadRow(params: { db: SqliteDatabase; From 66c685cfce6997394bd713bdeb848add49ed4277 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:21:52 -0700 Subject: [PATCH 209/215] refactor: extract cli argument parsing --- apps/cli/src/args.ts | 271 +++++++++++++++++++++++++++++++++++++++++ apps/cli/src/main.ts | 282 +++---------------------------------------- 2 files changed, 287 insertions(+), 266 deletions(-) create mode 100644 apps/cli/src/args.ts diff --git a/apps/cli/src/args.ts b/apps/cli/src/args.ts new file mode 100644 index 0000000..b855f7e --- /dev/null +++ b/apps/cli/src/args.ts @@ -0,0 +1,271 @@ +import { parseArgs } from 'node:util'; + +import type { CommandName } from './commands.js'; + +export type ParsedGlobalFlags = { + argv: string[]; + devMode: boolean; + configPathOverride?: string; + workspaceRootOverride?: string; +}; + +export type RepoCommandValues = Record; +export type ParsedRepoFlags = { owner: string; repo: string; values: RepoCommandValues }; + +type ParseArgsOptions = NonNullable[0]>['options']; + +export class CliError extends Error { + readonly exitCode: number; + readonly command?: CommandName; + + constructor(message: string, exitCode: number, command?: CommandName) { + super(message); + this.name = 'CliError'; + this.exitCode = exitCode; + this.command = command; + } +} + +export class CliUsageError extends CliError { + constructor(message: string, command?: CommandName) { + super(message, 2, command); + this.name = 'CliUsageError'; + } +} + +function readFlagValue(argv: string[], index: number, flag: string): { value: string; nextIndex: number } { + const arg = argv[index]; + const inlinePrefix = `${flag}=`; + if (arg.startsWith(inlinePrefix)) { + return { value: arg.slice(inlinePrefix.length), nextIndex: index }; + } + const value = argv[index + 1]; + if (value === undefined) { + throw new CliUsageError(`Missing value for ${flag}`); + } + return { value, nextIndex: index + 1 }; +} + +export function parseGlobalFlags(argv: string[], env: NodeJS.ProcessEnv = process.env): ParsedGlobalFlags { + let devMode = env.GHCRAWL_DEV_MODE === '1'; + let configPathOverride: string | undefined; + let workspaceRootOverride: string | undefined; + const filtered: string[] = []; + + for (let index = 0; index < argv.length; index += 1) { + const arg = argv[index]; + if (arg === '--dev') { + devMode = true; + continue; + } + if (arg === '--config-path' || arg.startsWith('--config-path=')) { + const { value, nextIndex } = readFlagValue(argv, index, '--config-path'); + configPathOverride = value; + index = nextIndex; + continue; + } + if (arg === '--workspace-root' || arg.startsWith('--workspace-root=')) { + const { value, nextIndex } = readFlagValue(argv, index, '--workspace-root'); + workspaceRootOverride = value; + index = nextIndex; + continue; + } + filtered.push(arg); + } + + return { argv: filtered, devMode, configPathOverride, workspaceRootOverride }; +} + +export function parseArgsForCommand( + command: CommandName, + args: string[], + options: ParseArgsOptions, + allowPositionals = false, +) { + try { + return parseArgs({ + args, + allowPositionals, + options, + }); + } catch (error) { + throw new CliUsageError(error instanceof Error ? error.message : String(error), command); + } +} + +export function parseOwnerRepo(value: string): { owner: string; repo: string } { + const trimmed = value.trim(); + const parts = trimmed.split('/'); + if (parts.length !== 2 || !parts[0] || !parts[1]) { + throw new CliUsageError(`Expected owner/repo, received: ${value}`); + } + return { owner: parts[0], repo: parts[1] }; +} + +export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepoFlags { + const parsed = parseArgsForCommand( + command, + args, + { + owner: { type: 'string' }, + repo: { type: 'string' }, + since: { type: 'string' }, + limit: { type: 'string' }, + json: { type: 'boolean' }, + 'include-comments': { type: 'boolean' }, + 'include-code': { type: 'boolean' }, + 'full-reconcile': { type: 'boolean' }, + 'include-closed': { type: 'boolean' }, + 'hide-closed': { type: 'boolean' }, + 'include-inactive': { type: 'boolean' }, + kind: { type: 'string' }, + number: { type: 'string' }, + numbers: { type: 'string' }, + login: { type: 'string' }, + query: { type: 'string' }, + mode: { type: 'string' }, + k: { type: 'string' }, + backend: { type: 'string' }, + 'candidate-k': { type: 'string' }, + threshold: { type: 'string' }, + 'max-cluster-size': { type: 'string' }, + port: { type: 'string' }, + id: { type: 'string' }, + source: { type: 'string' }, + target: { type: 'string' }, + reason: { type: 'string' }, + sort: { type: 'string' }, + search: { type: 'string' }, + 'min-size': { type: 'string' }, + 'member-limit': { type: 'string' }, + 'event-limit': { type: 'string' }, + 'body-chars': { type: 'string' }, + output: { type: 'string' }, + profile: { type: 'string' }, + manifest: { type: 'boolean' }, + portable: { type: 'string' }, + 'no-sync': { type: 'boolean' }, + 'no-embed': { type: 'boolean' }, + 'no-cluster': { type: 'boolean' }, + 'heap-snapshot-dir': { type: 'string' }, + 'heap-log-interval-ms': { type: 'string' }, + }, + true, + ); + const values = parsed.values as RepoCommandValues; + + if (parsed.positionals.length > 1) { + throw new CliUsageError(`Too many positional arguments for ${command}`, command); + } + + if (typeof values.repo === 'string' && values.repo.includes('/')) { + try { + return { ...parseOwnerRepo(values.repo), values }; + } catch (error) { + throw new CliUsageError(error instanceof Error ? error.message : String(error), command); + } + } + + if (parsed.positionals.length === 1) { + try { + return { ...parseOwnerRepo(parsed.positionals[0]), values }; + } catch (error) { + throw new CliUsageError(error instanceof Error ? error.message : String(error), command); + } + } + + const owner = values.owner; + const repo = values.repo; + if (typeof owner === 'string' && typeof repo === 'string') { + return { owner, repo, values }; + } + + throw new CliUsageError('Use --repo owner/repo or provide owner/repo as the first positional argument', command); +} + +export function resolveSinceValue(value: string, now: Date = new Date()): string { + const trimmed = value.trim(); + const absolute = new Date(trimmed); + if (!Number.isNaN(absolute.getTime())) { + return absolute.toISOString(); + } + + const match = trimmed.match(/^(\d+)(s|m|h|d|w|mo|y)$/i); + if (!match) { + throw new CliUsageError(`Invalid --since value: ${value}. Use an ISO timestamp or duration like 15m, 2h, 7d, or 1mo.`); + } + + const amount = Number(match[1]); + const unit = match[2].toLowerCase(); + const resolved = new Date(now); + + switch (unit) { + case 's': + resolved.setTime(resolved.getTime() - amount * 1000); + break; + case 'm': + resolved.setTime(resolved.getTime() - amount * 60 * 1000); + break; + case 'h': + resolved.setTime(resolved.getTime() - amount * 60 * 60 * 1000); + break; + case 'd': + resolved.setTime(resolved.getTime() - amount * 24 * 60 * 60 * 1000); + break; + case 'w': + resolved.setTime(resolved.getTime() - amount * 7 * 24 * 60 * 60 * 1000); + break; + case 'mo': + resolved.setUTCMonth(resolved.getUTCMonth() - amount); + break; + case 'y': + resolved.setUTCFullYear(resolved.getUTCFullYear() - amount); + break; + default: + throw new CliUsageError(`Unsupported --since unit: ${unit}`); + } + + return resolved.toISOString(); +} + +export function parsePositiveInteger(name: string, value: string, command: CommandName): number { + const parsed = Number(value); + if (!Number.isSafeInteger(parsed) || parsed <= 0) { + throw new CliUsageError(`Invalid --${name}: ${value}`, command); + } + return parsed; +} + +export function parseFiniteNumber(name: string, value: string, command: CommandName): number { + const parsed = Number(value); + if (!Number.isFinite(parsed)) { + throw new CliUsageError(`Invalid --${name}: ${value}`, command); + } + return parsed; +} + +export function parsePositiveIntegerList(name: string, value: string, command: CommandName): number[] { + const parts = value + .split(',') + .map((part) => part.trim()) + .filter(Boolean); + if (parts.length === 0) { + throw new CliUsageError(`Invalid --${name}: ${value}`, command); + } + return parts.map((part) => parsePositiveInteger(name, part, command)); +} + +export function parseEnum( + command: CommandName, + flagName: string, + value: string | boolean | undefined, + allowed: readonly T[], +): T | undefined { + if (typeof value !== 'string') { + return undefined; + } + if ((allowed as readonly string[]).includes(value)) { + return value as T; + } + throw new CliUsageError(`Invalid --${flagName}: ${value}. Use one of ${allowed.join(', ')}.`, command); +} diff --git a/apps/cli/src/main.ts b/apps/cli/src/main.ts index 8c171dc..6e65bf8 100644 --- a/apps/cli/src/main.ts +++ b/apps/cli/src/main.ts @@ -2,7 +2,6 @@ import { once } from 'node:events'; import { readFileSync } from 'node:fs'; import path from 'node:path'; -import { parseArgs } from 'node:util'; import { fileURLToPath } from 'node:url'; import { @@ -15,6 +14,21 @@ import { writePersistedConfig, type LoadConfigOptions, } from '@ghcrawl/api-core'; +import { + CliError, + CliUsageError, + parseArgsForCommand, + parseEnum, + parseFiniteNumber, + parseGlobalFlags, + parseOwnerRepo, + parsePositiveInteger, + parsePositiveIntegerList, + parseRepoFlags, + resolveSinceValue, + type ParsedGlobalFlags, + type RepoCommandValues, +} from './args.js'; import { commandUsage, getCommandSpec, @@ -27,13 +41,6 @@ import { createHeapDiagnostics, type HeapDiagnostics } from './heap-diagnostics. import { buildConfigureReport, formatConfigureReport, formatDoctorReport, type DoctorReport } from './reports.js'; import { startTui } from './tui/app.js'; -type ParsedGlobalFlags = { - argv: string[]; - devMode: boolean; - configPathOverride?: string; - workspaceRootOverride?: string; -}; - type RunContext = { stdout?: NodeJS.WritableStream; stderr?: NodeJS.WritableStream; @@ -41,229 +48,8 @@ type RunContext = { cwd?: string; }; -type RepoCommandValues = Record; -type ParsedRepoFlags = { owner: string; repo: string; values: RepoCommandValues }; - const CLI_VERSION = loadCliVersion(); -class CliError extends Error { - readonly exitCode: number; - readonly command?: CommandName; - - constructor(message: string, exitCode: number, command?: CommandName) { - super(message); - this.name = 'CliError'; - this.exitCode = exitCode; - this.command = command; - } -} - -class CliUsageError extends CliError { - constructor(message: string, command?: CommandName) { - super(message, 2, command); - this.name = 'CliUsageError'; - } -} - -function readFlagValue(argv: string[], index: number, flag: string): { value: string; nextIndex: number } { - const arg = argv[index]; - const inlinePrefix = `${flag}=`; - if (arg.startsWith(inlinePrefix)) { - return { value: arg.slice(inlinePrefix.length), nextIndex: index }; - } - const value = argv[index + 1]; - if (value === undefined) { - throw new CliUsageError(`Missing value for ${flag}`); - } - return { value, nextIndex: index + 1 }; -} - -function parseGlobalFlags(argv: string[], env: NodeJS.ProcessEnv = process.env): ParsedGlobalFlags { - let devMode = env.GHCRAWL_DEV_MODE === '1'; - let configPathOverride: string | undefined; - let workspaceRootOverride: string | undefined; - const filtered: string[] = []; - - for (let index = 0; index < argv.length; index += 1) { - const arg = argv[index]; - if (arg === '--dev') { - devMode = true; - continue; - } - if (arg === '--config-path' || arg.startsWith('--config-path=')) { - const { value, nextIndex } = readFlagValue(argv, index, '--config-path'); - configPathOverride = value; - index = nextIndex; - continue; - } - if (arg === '--workspace-root' || arg.startsWith('--workspace-root=')) { - const { value, nextIndex } = readFlagValue(argv, index, '--workspace-root'); - workspaceRootOverride = value; - index = nextIndex; - continue; - } - filtered.push(arg); - } - - return { argv: filtered, devMode, configPathOverride, workspaceRootOverride }; -} - -function parseArgsForCommand( - command: CommandName, - args: string[], - options: NonNullable[0]>['options'], - allowPositionals = false, -) { - try { - return parseArgs({ - args, - allowPositionals, - options, - }); - } catch (error) { - throw new CliUsageError(error instanceof Error ? error.message : String(error), command); - } -} - -export function parseOwnerRepo(value: string): { owner: string; repo: string } { - const trimmed = value.trim(); - const parts = trimmed.split('/'); - if (parts.length !== 2 || !parts[0] || !parts[1]) { - throw new CliUsageError(`Expected owner/repo, received: ${value}`); - } - return { owner: parts[0], repo: parts[1] }; -} - -export function parseRepoFlags(command: CommandName, args: string[]): ParsedRepoFlags { - const parsed = parseArgsForCommand( - command, - args, - { - owner: { type: 'string' }, - repo: { type: 'string' }, - since: { type: 'string' }, - limit: { type: 'string' }, - json: { type: 'boolean' }, - 'include-comments': { type: 'boolean' }, - 'include-code': { type: 'boolean' }, - 'full-reconcile': { type: 'boolean' }, - 'include-closed': { type: 'boolean' }, - 'hide-closed': { type: 'boolean' }, - 'include-inactive': { type: 'boolean' }, - kind: { type: 'string' }, - number: { type: 'string' }, - numbers: { type: 'string' }, - login: { type: 'string' }, - query: { type: 'string' }, - mode: { type: 'string' }, - k: { type: 'string' }, - backend: { type: 'string' }, - 'candidate-k': { type: 'string' }, - threshold: { type: 'string' }, - 'max-cluster-size': { type: 'string' }, - port: { type: 'string' }, - id: { type: 'string' }, - source: { type: 'string' }, - target: { type: 'string' }, - reason: { type: 'string' }, - sort: { type: 'string' }, - search: { type: 'string' }, - 'min-size': { type: 'string' }, - 'member-limit': { type: 'string' }, - 'event-limit': { type: 'string' }, - 'body-chars': { type: 'string' }, - output: { type: 'string' }, - profile: { type: 'string' }, - manifest: { type: 'boolean' }, - portable: { type: 'string' }, - 'no-sync': { type: 'boolean' }, - 'no-embed': { type: 'boolean' }, - 'no-cluster': { type: 'boolean' }, - 'heap-snapshot-dir': { type: 'string' }, - 'heap-log-interval-ms': { type: 'string' }, - }, - true, - ); - const values = parsed.values as RepoCommandValues; - - if (parsed.positionals.length > 1) { - throw new CliUsageError(`Too many positional arguments for ${command}`, command); - } - - if (typeof values.repo === 'string' && values.repo.includes('/')) { - let target: { owner: string; repo: string }; - try { - target = parseOwnerRepo(values.repo); - } catch (error) { - throw new CliUsageError(formatErrorMessage(error), command); - } - return { ...target, values }; - } - - if (parsed.positionals.length === 1) { - let target: { owner: string; repo: string }; - try { - target = parseOwnerRepo(parsed.positionals[0]); - } catch (error) { - throw new CliUsageError(formatErrorMessage(error), command); - } - return { ...target, values }; - } - - const owner = values.owner; - const repo = values.repo; - if (typeof owner === 'string' && typeof repo === 'string') { - return { owner, repo, values }; - } - - throw new CliUsageError('Use --repo owner/repo or provide owner/repo as the first positional argument', command); -} - -export function resolveSinceValue(value: string, now: Date = new Date()): string { - const trimmed = value.trim(); - const absolute = new Date(trimmed); - if (!Number.isNaN(absolute.getTime())) { - return absolute.toISOString(); - } - - const match = trimmed.match(/^(\d+)(s|m|h|d|w|mo|y)$/i); - if (!match) { - throw new CliUsageError(`Invalid --since value: ${value}. Use an ISO timestamp or duration like 15m, 2h, 7d, or 1mo.`); - } - - const amount = Number(match[1]); - const unit = match[2].toLowerCase(); - const resolved = new Date(now); - - switch (unit) { - case 's': - resolved.setTime(resolved.getTime() - amount * 1000); - break; - case 'm': - resolved.setTime(resolved.getTime() - amount * 60 * 1000); - break; - case 'h': - resolved.setTime(resolved.getTime() - amount * 60 * 60 * 1000); - break; - case 'd': - resolved.setTime(resolved.getTime() - amount * 24 * 60 * 60 * 1000); - break; - case 'w': - resolved.setTime(resolved.getTime() - amount * 7 * 24 * 60 * 60 * 1000); - break; - case 'mo': - resolved.setUTCMonth(resolved.getUTCMonth() - amount); - break; - case 'y': - resolved.setUTCFullYear(resolved.getUTCFullYear() - amount); - break; - default: - throw new CliUsageError(`Unsupported --since unit: ${unit}`); - } - - return resolved.toISOString(); -} - export function formatLogLine(message: string, now: Date = new Date()): string { return `[${now.toISOString()}] ${message}`; } @@ -272,43 +58,6 @@ function writeProgress(message: string, stderr: NodeJS.WritableStream): void { stderr.write(`${formatLogLine(message)}\n`); } -function parsePositiveInteger(name: string, value: string, command: CommandName): number { - const parsed = Number(value); - if (!Number.isSafeInteger(parsed) || parsed <= 0) { - throw new CliUsageError(`Invalid --${name}: ${value}`, command); - } - return parsed; -} - -function parseFiniteNumber(name: string, value: string, command: CommandName): number { - const parsed = Number(value); - if (!Number.isFinite(parsed)) { - throw new CliUsageError(`Invalid --${name}: ${value}`, command); - } - return parsed; -} - -function parsePositiveIntegerList(name: string, value: string, command: CommandName): number[] { - const parts = value - .split(',') - .map((part) => part.trim()) - .filter(Boolean); - if (parts.length === 0) { - throw new CliUsageError(`Invalid --${name}: ${value}`, command); - } - return parts.map((part) => parsePositiveInteger(name, part, command)); -} - -function parseEnum(command: CommandName, flagName: string, value: string | boolean | undefined, allowed: readonly T[]): T | undefined { - if (typeof value !== 'string') { - return undefined; - } - if ((allowed as readonly string[]).includes(value)) { - return value as T; - } - throw new CliUsageError(`Invalid --${flagName}: ${value}. Use one of ${allowed.join(', ')}.`, command); -} - function closeService(service: GHCrawlService | null): void { if (service) { service.close(); @@ -1037,6 +786,7 @@ if (import.meta.url === `file://${process.argv[1]}`) { } export { formatConfigureReport, formatDoctorReport } from './reports.js'; +export { parseOwnerRepo, parseRepoFlags, resolveSinceValue } from './args.js'; function loadCliVersion(): string { const here = path.dirname(fileURLToPath(import.meta.url)); From 89ff0dfa9b0ac10a012385f4aa4e5c9c229504a4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:23:51 -0700 Subject: [PATCH 210/215] refactor: share service entity lookups --- packages/api-core/src/service.ts | 65 +++++++++++++------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 560ef3e..0c7e03b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -361,19 +361,8 @@ export class GHCrawlService { excludeThreadFromCluster(params: ExcludeClusterMemberRequest): ClusterOverrideResponse { const repository = this.requireRepository(params.owner, params.repo); - const cluster = this.db - .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') - .get(repository.id, params.clusterId) as { id: number } | undefined; - if (!cluster) { - throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); - } - - const thread = this.db - .prepare('select * from threads where repo_id = ? and number = ? limit 1') - .get(repository.id, params.threadNumber) as ThreadRow | undefined; - if (!thread) { - throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); - } + const cluster = this.requireDurableCluster(repository, params.clusterId); + const thread = this.requireThread(repository, params.threadNumber); const existingMembership = this.db .prepare('select role, score_to_representative from cluster_memberships where cluster_id = ? and thread_id = ? limit 1') @@ -431,19 +420,8 @@ export class GHCrawlService { includeThreadInCluster(params: IncludeClusterMemberRequest): ClusterOverrideResponse { const repository = this.requireRepository(params.owner, params.repo); - const cluster = this.db - .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') - .get(repository.id, params.clusterId) as { id: number } | undefined; - if (!cluster) { - throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); - } - - const thread = this.db - .prepare('select * from threads where repo_id = ? and number = ? limit 1') - .get(repository.id, params.threadNumber) as ThreadRow | undefined; - if (!thread) { - throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); - } + const cluster = this.requireDurableCluster(repository, params.clusterId); + const thread = this.requireThread(repository, params.threadNumber); const timestamp = nowIso(); this.db.transaction(() => { @@ -500,19 +478,8 @@ export class GHCrawlService { setClusterCanonicalThread(params: SetClusterCanonicalRequest): ClusterOverrideResponse { const repository = this.requireRepository(params.owner, params.repo); - const cluster = this.db - .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') - .get(repository.id, params.clusterId) as { id: number } | undefined; - if (!cluster) { - throw new Error(`Durable cluster ${params.clusterId} was not found for ${repository.fullName}.`); - } - - const thread = this.db - .prepare('select * from threads where repo_id = ? and number = ? limit 1') - .get(repository.id, params.threadNumber) as ThreadRow | undefined; - if (!thread) { - throw new Error(`Thread #${params.threadNumber} was not found for ${repository.fullName}.`); - } + const cluster = this.requireDurableCluster(repository, params.clusterId); + const thread = this.requireThread(repository, params.threadNumber); const membership = this.db .prepare('select score_to_representative from cluster_memberships where cluster_id = ? and thread_id = ? limit 1') @@ -2873,6 +2840,26 @@ export class GHCrawlService { return repositoryToDto(row); } + private requireDurableCluster(repository: RepositoryDto, clusterId: number): { id: number } { + const cluster = this.db + .prepare('select id from cluster_groups where repo_id = ? and id = ? limit 1') + .get(repository.id, clusterId) as { id: number } | undefined; + if (!cluster) { + throw new Error(`Durable cluster ${clusterId} was not found for ${repository.fullName}.`); + } + return cluster; + } + + private requireThread(repository: RepositoryDto, threadNumber: number): ThreadRow { + const thread = this.db + .prepare('select * from threads where repo_id = ? and number = ? limit 1') + .get(repository.id, threadNumber) as ThreadRow | undefined; + if (!thread) { + throw new Error(`Thread #${threadNumber} was not found for ${repository.fullName}.`); + } + return thread; + } + private async aggregateRepositoryEdges( repoId: number, sourceKinds: EmbeddingSourceKind[], From f39fffa833791f8a665c0a1fabe1265599904f03 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:25:14 -0700 Subject: [PATCH 211/215] perf: load only active summary basis --- packages/api-core/src/embedding/workset.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/api-core/src/embedding/workset.ts b/packages/api-core/src/embedding/workset.ts index 8476450..2c4f3fc 100644 --- a/packages/api-core/src/embedding/workset.ts +++ b/packages/api-core/src/embedding/workset.ts @@ -55,8 +55,14 @@ export function getEmbeddingWorkset(params: { for (const row of existingRows) { existing.set(String(row.thread_id), row.content_hash); } - const summaryTexts = loadDedupeSummaryTextMap(params); - const keySummaryTexts = loadKeySummaryTextMap(params); + const summaryTexts = + params.config.embeddingBasis === 'title_summary' + ? loadDedupeSummaryTextMap(params) + : new Map(); + const keySummaryTexts = + params.config.embeddingBasis === 'llm_key_summary' + ? loadKeySummaryTextMap(params) + : new Map(); const missingSummaryThreadNumbers: number[] = []; const tasks = rows.flatMap((row) => { const task = buildActiveVectorTask({ From 734dbc31f8b2abfe964af615a597c6bdc38be139 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:28:06 -0700 Subject: [PATCH 212/215] perf: use fast tui embedding stats --- apps/cli/src/tui/app.ts | 2 + packages/api-core/src/service.ts | 8 +++- packages/api-core/src/tui/repo-stats.ts | 61 +++++++++++++++++++++++-- 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 790016a..56de0f0 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -272,6 +272,7 @@ export async function startTui(params: StartTuiParams): Promise { sort: sortMode, search, includeClosedClusters: showClosed, + statsMode: 'pipeline', }); lastRefreshState = params.service.getTuiRefreshState({ owner: currentRepository.owner, @@ -291,6 +292,7 @@ export async function startTui(params: StartTuiParams): Promise { sort: sortMode, search, includeClosedClusters: showClosed, + statsMode: 'pipeline', }); rebuildClusterItems(); selectedClusterId = preserveSelectedId(snapshot.clusters.map((cluster) => cluster.clusterId), null); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 0c7e03b..3b7d20b 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -2584,9 +2584,15 @@ export class GHCrawlService { sort?: TuiClusterSortMode; search?: string; includeClosedClusters?: boolean; + statsMode?: 'exact' | 'pipeline'; }): TuiSnapshot { const repository = this.requireRepository(params.owner, params.repo); - const stats = getTuiRepoStats({ db: this.db, config: this.config, repoId: repository.id }); + const stats = getTuiRepoStats({ + db: this.db, + config: this.config, + repoId: repository.id, + embeddingStatsMode: params.statsMode, + }); const latestRun = getLatestClusterRun(this.db, repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; const minSize = params.minSize ?? 1; diff --git a/packages/api-core/src/tui/repo-stats.ts b/packages/api-core/src/tui/repo-stats.ts index 802c7bb..df593f9 100644 --- a/packages/api-core/src/tui/repo-stats.ts +++ b/packages/api-core/src/tui/repo-stats.ts @@ -2,9 +2,17 @@ import { getLatestClusterRun } from '../cluster/run-queries.js'; import type { GitcrawlConfig } from '../config.js'; import type { SqliteDatabase } from '../db/sqlite.js'; import { getEmbeddingWorkset } from '../embedding/workset.js'; +import { isRepoVectorStateCurrent } from '../pipeline-state.js'; import type { TuiRefreshState, TuiRepoStats } from '../service-types.js'; -export function getTuiRepoStats(params: { db: SqliteDatabase; config: GitcrawlConfig; repoId: number }): TuiRepoStats { +type TuiEmbeddingStatsMode = 'exact' | 'pipeline'; + +export function getTuiRepoStats(params: { + db: SqliteDatabase; + config: GitcrawlConfig; + repoId: number; + embeddingStatsMode?: TuiEmbeddingStatsMode; +}): TuiRepoStats { const counts = params.db .prepare( `select kind, count(*) as count @@ -22,20 +30,63 @@ export function getTuiRepoStats(params: { db: SqliteDatabase; config: GitcrawlCo (params.db .prepare("select finished_at from embedding_runs where repo_id = ? and status = 'completed' order by id desc limit 1") .get(params.repoId) as { finished_at: string | null } | undefined) ?? null; - const embeddingWorkset = getEmbeddingWorkset({ db: params.db, config: params.config, repoId: params.repoId }); - const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId)); + const embeddingStats = + params.embeddingStatsMode === 'pipeline' + ? getPipelineEmbeddingStats(params) + : getExactEmbeddingStats(params); return { openIssueCount: counts.find((row) => row.kind === 'issue')?.count ?? 0, openPullRequestCount: counts.find((row) => row.kind === 'pull_request')?.count ?? 0, lastGithubReconciliationAt: latestSync?.finished_at ?? null, lastEmbedRefreshAt: latestEmbed?.finished_at ?? null, - staleEmbedThreadCount: staleThreadIds.size, - staleEmbedSourceCount: embeddingWorkset.pending.length, + staleEmbedThreadCount: embeddingStats.staleThreadCount, + staleEmbedSourceCount: embeddingStats.staleSourceCount, latestClusterRunId: latestRun?.id ?? null, latestClusterRunFinishedAt: latestRun?.finished_at ?? null, }; } +function getExactEmbeddingStats(params: { db: SqliteDatabase; config: GitcrawlConfig; repoId: number }): { + staleThreadCount: number; + staleSourceCount: number; +} { + const embeddingWorkset = getEmbeddingWorkset({ db: params.db, config: params.config, repoId: params.repoId }); + const staleThreadIds = new Set(embeddingWorkset.pending.map((task) => task.threadId)); + return { + staleThreadCount: staleThreadIds.size, + staleSourceCount: embeddingWorkset.pending.length, + }; +} + +function getPipelineEmbeddingStats(params: { db: SqliteDatabase; config: GitcrawlConfig; repoId: number }): { + staleThreadCount: number; + staleSourceCount: number; +} { + if (isRepoVectorStateCurrent(params.db, params.config, params.repoId)) { + return { staleThreadCount: 0, staleSourceCount: 0 }; + } + const row = params.db + .prepare( + `select count(*) as count + from threads t + where t.repo_id = ? + and t.state = 'open' + and t.closed_at_local is null + and not exists ( + select 1 + from cluster_closures cc + join cluster_memberships cm on cm.cluster_id = cc.cluster_id + where cm.thread_id = t.id + and cm.state <> 'removed_by_user' + )`, + ) + .get(params.repoId) as { count: number }; + return { + staleThreadCount: row.count, + staleSourceCount: row.count, + }; +} + export function getTuiRepositoryRefreshState(params: { db: SqliteDatabase; repository: { id: number; updatedAt: string }; From b38ebe6a2698e921255c2f9ae612672e32b9df85 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:29:23 -0700 Subject: [PATCH 213/215] refactor: name tui embedding stats mode --- apps/cli/src/tui/app.ts | 4 ++-- packages/api-core/src/service.ts | 6 +++--- packages/api-core/src/tui/repo-stats.ts | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/tui/app.ts b/apps/cli/src/tui/app.ts index 56de0f0..27f93bc 100644 --- a/apps/cli/src/tui/app.ts +++ b/apps/cli/src/tui/app.ts @@ -272,7 +272,7 @@ export async function startTui(params: StartTuiParams): Promise { sort: sortMode, search, includeClosedClusters: showClosed, - statsMode: 'pipeline', + embeddingStatsMode: 'pipeline', }); lastRefreshState = params.service.getTuiRefreshState({ owner: currentRepository.owner, @@ -292,7 +292,7 @@ export async function startTui(params: StartTuiParams): Promise { sort: sortMode, search, includeClosedClusters: showClosed, - statsMode: 'pipeline', + embeddingStatsMode: 'pipeline', }); rebuildClusterItems(); selectedClusterId = preserveSelectedId(snapshot.clusters.map((cluster) => cluster.clusterId), null); diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 3b7d20b..9eccd68 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -147,7 +147,7 @@ import { listClosedDurableTuiClusters, listRawTuiClusters, } from './tui/cluster-queries.js'; -import { getTuiRepoStats, getTuiRepositoryRefreshState } from './tui/repo-stats.js'; +import { getTuiRepoStats, getTuiRepositoryRefreshState, type TuiEmbeddingStatsMode } from './tui/repo-stats.js'; import { buildTuiThreadDetail, } from './tui/thread-detail.js'; @@ -2584,14 +2584,14 @@ export class GHCrawlService { sort?: TuiClusterSortMode; search?: string; includeClosedClusters?: boolean; - statsMode?: 'exact' | 'pipeline'; + embeddingStatsMode?: TuiEmbeddingStatsMode; }): TuiSnapshot { const repository = this.requireRepository(params.owner, params.repo); const stats = getTuiRepoStats({ db: this.db, config: this.config, repoId: repository.id, - embeddingStatsMode: params.statsMode, + embeddingStatsMode: params.embeddingStatsMode, }); const latestRun = getLatestClusterRun(this.db, repository.id); const includeClosedClusters = params.includeClosedClusters ?? true; diff --git a/packages/api-core/src/tui/repo-stats.ts b/packages/api-core/src/tui/repo-stats.ts index df593f9..de8c1ba 100644 --- a/packages/api-core/src/tui/repo-stats.ts +++ b/packages/api-core/src/tui/repo-stats.ts @@ -5,7 +5,7 @@ import { getEmbeddingWorkset } from '../embedding/workset.js'; import { isRepoVectorStateCurrent } from '../pipeline-state.js'; import type { TuiRefreshState, TuiRepoStats } from '../service-types.js'; -type TuiEmbeddingStatsMode = 'exact' | 'pipeline'; +export type TuiEmbeddingStatsMode = 'exact' | 'pipeline'; export function getTuiRepoStats(params: { db: SqliteDatabase; From 6645f8ccc087bde6daa09cdc24042cb0dbea805c Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:30:49 -0700 Subject: [PATCH 214/215] refactor: share cluster summary dto mapping --- packages/api-core/src/service.ts | 49 ++++++++++++++------------------ 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/packages/api-core/src/service.ts b/packages/api-core/src/service.ts index 9eccd68..579ef71 100644 --- a/packages/api-core/src/service.ts +++ b/packages/api-core/src/service.ts @@ -28,6 +28,7 @@ import { type ClusterDetailResponse, type ClusterExplainResponse, type ClusterResultDto, + type ClusterSummaryDto, type ClusterSummariesResponse, type ClustersResponse, type DurableClustersResponse, @@ -189,6 +190,7 @@ import type { SyncRunStats, ThreadRow, TuiClusterDetail, + TuiClusterSummary, TuiClusterSortMode, TuiRefreshState, TuiSnapshot, @@ -222,6 +224,23 @@ import { VectorliteStore } from './vector/vectorlite-store.js'; export type { DoctorResult, TuiClusterDetail, TuiClusterMember, TuiClusterSortMode, TuiClusterSummary, TuiRefreshState, TuiRepoStats, TuiSnapshot, TuiThreadDetail } from './service-types.js'; export { parseRepoParams } from './api/params.js'; +function tuiClusterSummaryToDto(cluster: TuiClusterSummary): ClusterSummaryDto { + return { + clusterId: cluster.clusterId, + displayTitle: cluster.displayTitle, + isClosed: cluster.isClosed, + closedAtLocal: cluster.closedAtLocal, + closeReasonLocal: cluster.closeReasonLocal, + totalCount: cluster.totalCount, + issueCount: cluster.issueCount, + pullRequestCount: cluster.pullRequestCount, + latestUpdatedAt: cluster.latestUpdatedAt, + representativeThreadId: cluster.representativeThreadId, + representativeNumber: cluster.representativeNumber, + representativeKind: cluster.representativeKind, + }; +} + export class GHCrawlService { readonly config: GitcrawlConfig; readonly db: SqliteDatabase; @@ -2497,20 +2516,7 @@ export class GHCrawlService { return clusterSummariesResponseSchema.parse({ repository: snapshot.repository, stats: snapshot.stats, - clusters: clusters.map((cluster) => ({ - clusterId: cluster.clusterId, - displayTitle: cluster.displayTitle, - isClosed: cluster.isClosed, - closedAtLocal: cluster.closedAtLocal, - closeReasonLocal: cluster.closeReasonLocal, - totalCount: cluster.totalCount, - issueCount: cluster.issueCount, - pullRequestCount: cluster.pullRequestCount, - latestUpdatedAt: cluster.latestUpdatedAt, - representativeThreadId: cluster.representativeThreadId, - representativeNumber: cluster.representativeNumber, - representativeKind: cluster.representativeKind, - })), + clusters: clusters.map(tuiClusterSummaryToDto), }); } @@ -2559,20 +2565,7 @@ export class GHCrawlService { return clusterDetailResponseSchema.parse({ repository: snapshot.repository, stats: snapshot.stats, - cluster: { - clusterId: cluster.clusterId, - displayTitle: cluster.displayTitle, - isClosed: cluster.isClosed, - closedAtLocal: cluster.closedAtLocal, - closeReasonLocal: cluster.closeReasonLocal, - totalCount: cluster.totalCount, - issueCount: cluster.issueCount, - pullRequestCount: cluster.pullRequestCount, - latestUpdatedAt: cluster.latestUpdatedAt, - representativeThreadId: cluster.representativeThreadId, - representativeNumber: cluster.representativeNumber, - representativeKind: cluster.representativeKind, - }, + cluster: tuiClusterSummaryToDto(cluster), members, }); } From 3ac7ea905bb9d07e26f4c80f9c588be94fa4872e Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Sat, 25 Apr 2026 20:32:08 -0700 Subject: [PATCH 215/215] test: cover pipeline tui embedding stats --- packages/api-core/src/service.test.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/packages/api-core/src/service.test.ts b/packages/api-core/src/service.test.ts index 6d2a4ab..9777415 100644 --- a/packages/api-core/src/service.test.ts +++ b/packages/api-core/src/service.test.ts @@ -6,6 +6,7 @@ import path from 'node:path'; import { humanKeyForValue } from './cluster/human-key.js'; import { openDb } from './db/sqlite.js'; +import { markRepoVectorsCurrent } from './pipeline-state.js'; import { GHCrawlService } from './service.js'; import type { VectorStore } from './vector/store.js'; @@ -3863,6 +3864,21 @@ test('tui snapshot returns mixed issue and pull request counts with default visi assert.equal(snapshot.stats.staleEmbedThreadCount, 5); assert.equal(snapshot.stats.staleEmbedSourceCount, 5); assert.equal(snapshot.stats.latestClusterRunId, 1); + + const stalePipelineStats = service.getTuiSnapshot({ + owner: 'openclaw', + repo: 'openclaw', + embeddingStatsMode: 'pipeline', + }); + assert.equal(stalePipelineStats.stats.staleEmbedThreadCount, 5); + markRepoVectorsCurrent(service.db, service.config, 1); + const currentPipelineStats = service.getTuiSnapshot({ + owner: 'openclaw', + repo: 'openclaw', + embeddingStatsMode: 'pipeline', + }); + assert.equal(currentPipelineStats.stats.staleEmbedThreadCount, 0); + assert.deepEqual( snapshot.clusters.map((cluster) => cluster.clusterId), [101, 100],