diff --git a/src/app/runtime-container.ts b/src/app/runtime-container.ts index 7f3dec7..fc5ffa2 100644 --- a/src/app/runtime-container.ts +++ b/src/app/runtime-container.ts @@ -113,6 +113,10 @@ export interface CoreRuntimeConfig { retrievalProfileSettings: RetrievalProfile; temporalQueryConstraintBoost: number; temporalQueryConstraintEnabled: boolean; + retrievalConfidenceGateEnabled: boolean; + retrievalConfidenceMarginNormalizer: number; + retrievalConfidenceSimilarityNormalizer: number; + retrievalConfidenceFloor: number; } /** Repositories constructed by the runtime container. */ diff --git a/src/config.ts b/src/config.ts index 10025d0..eed7ec4 100644 --- a/src/config.ts +++ b/src/config.ts @@ -127,6 +127,10 @@ export interface RuntimeConfig { literalListProtectionMaxProtected: number; temporalQueryConstraintEnabled: boolean; temporalQueryConstraintBoost: number; + retrievalConfidenceGateEnabled: boolean; + retrievalConfidenceMarginNormalizer: number; + retrievalConfidenceSimilarityNormalizer: number; + retrievalConfidenceFloor: number; deferredAudnEnabled: boolean; deferredAudnBatchSize: number; compositeGroupingEnabled: boolean; @@ -374,6 +378,10 @@ export const config: RuntimeConfig = { literalListProtectionMaxProtected: parsePositiveIntEnv('LITERAL_LIST_PROTECTION_MAX_PROTECTED', 3), temporalQueryConstraintEnabled: (optionalEnv('TEMPORAL_QUERY_CONSTRAINT_ENABLED') ?? 'false') === 'true', temporalQueryConstraintBoost: parseFloat(optionalEnv('TEMPORAL_QUERY_CONSTRAINT_BOOST') ?? '2'), + retrievalConfidenceGateEnabled: (optionalEnv('RETRIEVAL_CONFIDENCE_GATE_ENABLED') ?? 'false') === 'true', + retrievalConfidenceMarginNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_MARGIN_NORMALIZER') ?? '0.05'), + retrievalConfidenceSimilarityNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_SIMILARITY_NORMALIZER') ?? '0.5'), + retrievalConfidenceFloor: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_FLOOR') ?? 
'0.3'), deferredAudnEnabled: (optionalEnv('DEFERRED_AUDN_ENABLED') ?? 'false') === 'true', deferredAudnBatchSize: parseInt(optionalEnv('DEFERRED_AUDN_BATCH_SIZE') ?? '20', 10), compositeGroupingEnabled: (optionalEnv('COMPOSITE_GROUPING_ENABLED') ?? 'true') === 'true', @@ -514,6 +522,9 @@ export const INTERNAL_POLICY_CONFIG_FIELDS = [ 'literalListProtectionEnabled', 'literalListProtectionMaxProtected', // Temporal query selection 'temporalQueryConstraintEnabled', 'temporalQueryConstraintBoost', + // Retrieval confidence gate + 'retrievalConfidenceGateEnabled', 'retrievalConfidenceMarginNormalizer', + 'retrievalConfidenceSimilarityNormalizer', 'retrievalConfidenceFloor', // Fast AUDN 'fastAudnEnabled', 'fastAudnDuplicateThreshold', // Observation / deferred diff --git a/src/routes/memories.ts b/src/routes/memories.ts index 0a182cf..cf91b31 100644 --- a/src/routes/memories.ts +++ b/src/routes/memories.ts @@ -824,5 +824,13 @@ function formatSearchResponse(result: RetrievalResult, scope: MemoryScope) { }, } : {}), ...(observability ? { observability: formatObservability(observability) } : {}), + ...(result.retrievalConfidence ? { + retrieval_confidence: { + low_confidence: result.retrievalConfidence.lowConfidence, + confidence: result.retrievalConfidence.confidence, + top_similarity: result.retrievalConfidence.topSimilarity, + margin: result.retrievalConfidence.margin, + }, + } : {}), }; } diff --git a/src/services/__tests__/retrieval-confidence-gate.test.ts b/src/services/__tests__/retrieval-confidence-gate.test.ts new file mode 100644 index 0000000..e4e6edd --- /dev/null +++ b/src/services/__tests__/retrieval-confidence-gate.test.ts @@ -0,0 +1,122 @@ +/** + * Tests for retrieval-confidence-gate.ts + * + * Validates the confidence computation used by EXP-14 (retrieval-side + * abstention gate). 
/**
 * Tests for retrieval-confidence-gate.ts
 *
 * Validates the confidence computation used by EXP-14 (retrieval-side
 * abstention gate). The gate must:
 *   - Return null when disabled
 *   - Flag low confidence on empty results
 *   - Flag low confidence on narrow margin + weak top similarity
 *   - NOT flag when separation is strong or top similarity is high
 *   - Respect config overrides for normalizers and floor
 *
 * All expected values below assume the gate's defaults:
 * margin normalizer 0.05, similarity normalizer 0.5, floor 0.3, and
 * confidence = 0.6 * marginConf + 0.4 * absConf.
 */

import { describe, it, expect } from 'vitest';
import { computeRetrievalConfidence } from '../retrieval-confidence-gate.js';

type GateConfig = Parameters<typeof computeRetrievalConfidence>[1];
type GateResult = NonNullable<ReturnType<typeof computeRetrievalConfidence>>;

/** Build a minimal search result carrying only the field the gate reads. */
function result(similarity: number): { similarity: number } {
  return { similarity };
}

const enabledCfg = {
  retrievalConfidenceGateEnabled: true,
} as const;

const disabledCfg = {
  retrievalConfidenceGateEnabled: false,
} as const;

/**
 * Run the gate and fail the test immediately when it returns null, so each
 * case can read fields without a non-null assertion on every line.
 */
function computeEnabled(similarities: readonly number[], cfg: GateConfig = enabledCfg): GateResult {
  const res = computeRetrievalConfidence(similarities.map(result), cfg);
  expect(res).not.toBeNull();
  if (res === null) throw new Error('gate unexpectedly returned null');
  return res;
}

describe('computeRetrievalConfidence', () => {
  it('returns null when the gate is disabled', () => {
    const res = computeRetrievalConfidence([result(0.9), result(0.8)], disabledCfg);
    expect(res).toBeNull();
  });

  it('flags low confidence on empty results', () => {
    const res = computeEnabled([]);
    expect(res.lowConfidence).toBe(true);
    expect(res.confidence).toBe(0);
    expect(res.topSimilarity).toBe(0);
    expect(res.margin).toBe(0);
  });

  it('does NOT flag single result with decent absolute similarity', () => {
    // top=0.4, second=0 → margin=0.4 → marginConf=1.0, absConf=0.8
    // confidence = 0.6*1.0 + 0.4*0.8 = 0.92 ≥ 0.3
    const res = computeEnabled([0.4]);
    expect(res.lowConfidence).toBe(false);
    expect(res.margin).toBeCloseTo(0.4, 10);
    expect(res.confidence).toBeCloseTo(0.92, 5);
  });

  it('does NOT flag when top is strong and well-separated', () => {
    const res = computeEnabled([0.9, 0.4]);
    expect(res.lowConfidence).toBe(false);
    expect(res.confidence).toBeGreaterThan(0.8);
  });

  it('flags narrow margin and weak top similarity', () => {
    // top=0.10, second=0.09 → margin=0.01
    // marginConf=0.01/0.05=0.2, absConf=0.10/0.5=0.2
    // confidence = 0.6*0.2 + 0.4*0.2 = 0.20 < 0.3
    const res = computeEnabled([0.10, 0.09]);
    expect(res.lowConfidence).toBe(true);
    expect(res.margin).toBeCloseTo(0.01, 5);
    expect(res.confidence).toBeCloseTo(0.20, 2);
  });

  it('does NOT flag weak top when margin is strong', () => {
    // top=0.15, second=0.02 → margin=0.13 → marginConf=min(1, 0.13/0.05)=1.0
    // absConf=min(1, 0.15/0.5)=0.3
    // confidence = 0.6*1.0 + 0.4*0.3 = 0.72 ≥ 0.3
    const res = computeEnabled([0.15, 0.02]);
    expect(res.lowConfidence).toBe(false);
    expect(res.confidence).toBeCloseTo(0.72, 2);
  });

  it('respects margin normalizer override', () => {
    // Default normalizer: marginConf=0.02/0.05=0.4, absConf=0.5 → 0.44.
    // Override 0.01:      marginConf=min(1, 0.02/0.01)=1.0     → 0.80.
    const narrow = computeEnabled([0.25, 0.23], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceMarginNormalizer: 0.01,
    });
    expect(narrow.margin).toBeCloseTo(0.02, 5);
    expect(narrow.confidence).toBeCloseTo(0.8, 5);
  });

  it('respects similarity normalizer override', () => {
    // top=0.10, second=0.09 is flagged by default (confidence 0.20).
    // Normalizer 0.1 → absConf=1.0 → confidence = 0.6*0.2 + 0.4*1.0 = 0.52.
    const res = computeEnabled([0.10, 0.09], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceSimilarityNormalizer: 0.1,
    });
    expect(res.confidence).toBeCloseTo(0.52, 5);
    expect(res.lowConfidence).toBe(false);
  });

  it('respects floor override', () => {
    // confidence = 0.20 for this input: flagged under the default floor
    // (0.3) but not once the floor is lowered to 0.05. Asserting both sides
    // proves the override, not the default, decides the outcome.
    const byDefault = computeEnabled([0.10, 0.09]);
    expect(byDefault.lowConfidence).toBe(true);

    const lowered = computeEnabled([0.10, 0.09], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceFloor: 0.05,
    });
    expect(lowered.confidence).toBeCloseTo(byDefault.confidence, 10);
    expect(lowered.lowConfidence).toBe(false);
  });

  it('uses similarity, not score, for computation', () => {
    // The gate reads `similarity` directly; it does not depend on `score`.
    const res = computeRetrievalConfidence(
      [{ similarity: 0.8 }, { similarity: 0.3 }],
      enabledCfg,
    );
    expect(res).not.toBeNull();
    if (res === null) throw new Error('gate unexpectedly returned null');
    expect(res.topSimilarity).toBe(0.8);
    expect(res.margin).toBeCloseTo(0.5, 10);
    expect(res.lowConfidence).toBe(false);
  });

  it('computes exact confidence for a mid-range case', () => {
    // top=0.5, second=0.4 → margin=0.1
    // marginConf=min(1, 0.1/0.05)=1.0
    // absConf=min(1, 0.5/0.5)=1.0
    // confidence = 0.6*1.0 + 0.4*1.0 = 1.0
    const res = computeEnabled([0.5, 0.4]);
    expect(res.confidence).toBeCloseTo(1.0, 5);
    expect(res.lowConfidence).toBe(false);
  });
});
deps.stores.link, memory: deps.stores.memory, entity: deps.stores.entity, pool: deps.stores.pool }; const pipelineResult = await runSearchPipelineWithTrace(pipelineStores, userId, query, effectiveLimit, sourceSite, referenceTime, { @@ -78,7 +78,7 @@ async function executeSearchStep( skipReranking: retrievalOptions?.skipReranking, runtimeConfig: deps.config, }); - return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace }; + return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace, retrievalConfidence: pipelineResult.retrievalConfidence }; } /** Filter workspace-scoped, stale composites, and consensus-violating memories. */ @@ -135,6 +135,7 @@ function assembleResponse( asOf: string | undefined, sourceSite: string | undefined, lessonCheck: LessonCheckResult | undefined, + retrievalConfidence: import('./retrieval-confidence-gate.js').RetrievalConfidence | null, ): RetrievalResult { const mode = retrievalOptions?.retrievalMode ?? 'flat'; const packaged = applyFlatPackagingPolicy(postProcessed.memories, query, mode, activeTrace); @@ -150,7 +151,7 @@ function assembleResponse( }); activeTrace.finalize(outputMemories); - return { + const result: RetrievalResult = { memories: outputMemories, injectionText, citations: buildRichCitations(outputMemories).map((c) => c.memory_id), retrievalMode: mode, tierAssignments, expandIds, estimatedContextTokens, @@ -159,6 +160,10 @@ function assembleResponse( retrievalSummary: activeTrace.getRetrievalSummary(), packagingSummary, assemblySummary, }; + if (retrievalConfidence) { + result.retrievalConfidence = retrievalConfidence; + } + return result; } /** Full search with lesson check, URI resolution, pipeline, post-processing, and packaging. 
*/ @@ -185,9 +190,9 @@ export async function performSearch( const uriResult = await tryUriResolution(deps, query, userId, retrievalOptions, trace); if (uriResult) return uriResult; - const { memories: rawMemories, activeTrace } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace); + const { memories: rawMemories, activeTrace, retrievalConfidence } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace); const filteredMemories = await postProcessResults(deps, rawMemories, activeTrace, userId, query, asOf); - return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck); + return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck, retrievalConfidence); } /** diff --git a/src/services/memory-service-types.ts b/src/services/memory-service-types.ts index 0187247..ba5fbc5 100644 --- a/src/services/memory-service-types.ts +++ b/src/services/memory-service-types.ts @@ -190,6 +190,7 @@ export interface RetrievalResult { retrievalSummary?: import('./retrieval-trace.js').RetrievalTraceSummary; packagingSummary?: import('./retrieval-trace.js').PackagingTraceSummary; assemblySummary?: import('./retrieval-trace.js').AssemblyTraceSummary; + retrievalConfidence?: import('./retrieval-confidence-gate.js').RetrievalConfidence; } /** Options controlling retrieval packaging. */ diff --git a/src/services/retrieval-confidence-gate.ts b/src/services/retrieval-confidence-gate.ts new file mode 100644 index 0000000..148e812 --- /dev/null +++ b/src/services/retrieval-confidence-gate.ts @@ -0,0 +1,78 @@ +/** + * Retrieval confidence gate — computes a confidence score based on the + * separation between top results. 
/**
 * Retrieval confidence gate — computes a confidence score based on the
 * separation between top results. When confidence is low, signals to the
 * caller that retrieval may be insufficient for a definitive answer.
 *
 * This targets the abstention ability (ABS) on BEAM, where Honcho scores
 * below the no-memory baseline because over-retrieval poisons "I don't know"
 * answers.
 */

/** Outcome of the confidence gate for one ranked result list. */
export interface RetrievalConfidence {
  /** True when the confidence composite falls below the configured floor. */
  lowConfidence: boolean;
  /** Composite confidence in [0, 1]. */
  confidence: number;
  /** Similarity of the top result (the stable, scale-invariant signal). */
  topSimilarity: number;
  /** Margin between top and second result similarities. */
  margin: number;
}

/** Tunable knobs for the gate; everything except the enable flag is optional at the call site. */
export interface RetrievalConfidenceConfig {
  retrievalConfidenceGateEnabled: boolean;
  /** Margin at (or above) which the margin component saturates at 1. */
  retrievalConfidenceMarginNormalizer: number;
  /** Top similarity at (or above) which the absolute component saturates at 1. */
  retrievalConfidenceSimilarityNormalizer: number;
  /** Composite confidence strictly below this value is reported as low. */
  retrievalConfidenceFloor: number;
}

const DEFAULT_MARGIN_NORMALIZER = 0.05;
const DEFAULT_SIMILARITY_NORMALIZER = 0.5;
const DEFAULT_CONFIDENCE_FLOOR = 0.3;
const MARGIN_WEIGHT = 0.6;
const ABSOLUTE_WEIGHT = 0.4;

/** Return `value` when it is a finite number, otherwise `fallback`. */
function finiteOrDefault(value: number | undefined, fallback: number): number {
  return value !== undefined && Number.isFinite(value) ? value : fallback;
}

/** Return `value` when it is a finite number greater than zero, otherwise `fallback`. */
function positiveOrDefault(value: number | undefined, fallback: number): number {
  return value !== undefined && Number.isFinite(value) && value > 0 ? value : fallback;
}

/** Clamp `value` into the closed interval [0, 1]. */
function clampUnit(value: number): number {
  return Math.min(1, Math.max(0, value));
}

/**
 * Compute retrieval confidence from a ranked list of results.
 *
 * Uses `similarity` (not `score`) because `score` is rewritten by RRF,
 * cross-encoder, MMR, and additive boosts mid-pipeline. `similarity` is the
 * only stable, scale-invariant signal that survives all stages.
 *
 * The composite is `0.6 * marginConf + 0.4 * absConf`, where
 * `marginConf = margin / marginNormalizer` and
 * `absConf = topSimilarity / similarityNormalizer`, each clamped to [0, 1].
 * With a single result the runner-up similarity is taken as 0.
 *
 * Invalid tuning values never disable the gate: a normalizer that is missing,
 * non-finite, or not strictly positive falls back to its default, as does a
 * non-finite floor. Without that guard a `NaN` or `0` normalizer would turn
 * the composite into `NaN`, and `NaN < floor` is always false.
 *
 * @param results — ranked search results; must expose `similarity: number`.
 *   Only the first two entries are read. A non-finite similarity is treated as 0.
 * @param cfg — gate configuration; when disabled returns `null`.
 * @returns the confidence breakdown, or `null` when the gate is disabled.
 */
export function computeRetrievalConfidence(
  results: ReadonlyArray<{ similarity: number }>,
  cfg: Partial<RetrievalConfidenceConfig> & { retrievalConfidenceGateEnabled: boolean },
): RetrievalConfidence | null {
  if (!cfg.retrievalConfidenceGateEnabled) return null;

  const [best, runnerUp] = results;
  if (best === undefined) {
    // Nothing retrieved: the least confident outcome possible.
    return {
      lowConfidence: true,
      confidence: 0,
      topSimilarity: 0,
      margin: 0,
    };
  }

  const top = finiteOrDefault(best.similarity, 0);
  const second = finiteOrDefault(runnerUp?.similarity, 0);
  // Ranking may be by a different score, so the runner-up can have the higher
  // similarity; a negative gap counts as "no separation".
  const margin = Math.max(0, top - second);

  const marginNormalizer = positiveOrDefault(
    cfg.retrievalConfidenceMarginNormalizer,
    DEFAULT_MARGIN_NORMALIZER,
  );
  const similarityNormalizer = positiveOrDefault(
    cfg.retrievalConfidenceSimilarityNormalizer,
    DEFAULT_SIMILARITY_NORMALIZER,
  );
  const floor = finiteOrDefault(cfg.retrievalConfidenceFloor, DEFAULT_CONFIDENCE_FLOOR);

  // Clamp both components so a negative similarity cannot push the composite
  // outside the documented [0, 1] range.
  const marginConf = clampUnit(margin / marginNormalizer);
  const absConf = clampUnit(top / similarityNormalizer);
  const confidence = MARGIN_WEIGHT * marginConf + ABSOLUTE_WEIGHT * absConf;

  return {
    lowConfidence: confidence < floor,
    confidence,
    topSimilarity: top,
    margin,
  };
}
@@ -142,7 +147,7 @@ export async function runSearchPipelineWithTrace( sourceSite?: string, referenceTime?: Date, options: SearchPipelineOptions = {}, -): Promise<{ filtered: SearchResult[]; trace: TraceCollector }> { +): Promise<{ filtered: SearchResult[]; trace: TraceCollector; retrievalConfidence: RetrievalConfidence | null }> { const trace = new TraceCollector(query, userId); const policyConfig: SearchPipelineRuntimeConfig = options.runtimeConfig ?? config; const mmrPoolMultiplier = policyConfig.mmrEnabled ? 3 : 1; @@ -267,6 +272,15 @@ export async function runSearchPipelineWithTrace( policyConfig, )); + const retrievalConfidence = computeRetrievalConfidence(selected, policyConfig); + if (retrievalConfidence?.lowConfidence) { + trace.event('low-confidence-gate', { + confidence: retrievalConfidence.confidence, + topSimilarity: retrievalConfidence.topSimilarity, + margin: retrievalConfidence.margin, + }); + } + const namespaceScope = options.namespaceScope ?? null; trace.setRetrievalSummary({ candidateIds: selected.map((result) => result.id), @@ -281,7 +295,7 @@ export async function runSearchPipelineWithTrace( ? selected.filter((r) => isInScope(r.namespace, namespaceScope)) : selected; - return { filtered, trace }; + return { filtered, trace, retrievalConfidence }; } async function runInitialRetrieval(