Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/app/runtime-container.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ export interface CoreRuntimeConfig {
retrievalProfileSettings: RetrievalProfile;
temporalQueryConstraintBoost: number;
temporalQueryConstraintEnabled: boolean;
retrievalConfidenceGateEnabled: boolean;
retrievalConfidenceMarginNormalizer: number;
retrievalConfidenceSimilarityNormalizer: number;
retrievalConfidenceFloor: number;
}

/** Repositories constructed by the runtime container. */
Expand Down
11 changes: 11 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ export interface RuntimeConfig {
literalListProtectionMaxProtected: number;
temporalQueryConstraintEnabled: boolean;
temporalQueryConstraintBoost: number;
retrievalConfidenceGateEnabled: boolean;
retrievalConfidenceMarginNormalizer: number;
retrievalConfidenceSimilarityNormalizer: number;
retrievalConfidenceFloor: number;
deferredAudnEnabled: boolean;
deferredAudnBatchSize: number;
compositeGroupingEnabled: boolean;
Expand Down Expand Up @@ -374,6 +378,10 @@ export const config: RuntimeConfig = {
literalListProtectionMaxProtected: parsePositiveIntEnv('LITERAL_LIST_PROTECTION_MAX_PROTECTED', 3),
temporalQueryConstraintEnabled: (optionalEnv('TEMPORAL_QUERY_CONSTRAINT_ENABLED') ?? 'false') === 'true',
temporalQueryConstraintBoost: parseFloat(optionalEnv('TEMPORAL_QUERY_CONSTRAINT_BOOST') ?? '2'),
retrievalConfidenceGateEnabled: (optionalEnv('RETRIEVAL_CONFIDENCE_GATE_ENABLED') ?? 'false') === 'true',
retrievalConfidenceMarginNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_MARGIN_NORMALIZER') ?? '0.05'),
retrievalConfidenceSimilarityNormalizer: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_SIMILARITY_NORMALIZER') ?? '0.5'),
retrievalConfidenceFloor: parseFloat(optionalEnv('RETRIEVAL_CONFIDENCE_FLOOR') ?? '0.3'),
deferredAudnEnabled: (optionalEnv('DEFERRED_AUDN_ENABLED') ?? 'false') === 'true',
deferredAudnBatchSize: parseInt(optionalEnv('DEFERRED_AUDN_BATCH_SIZE') ?? '20', 10),
compositeGroupingEnabled: (optionalEnv('COMPOSITE_GROUPING_ENABLED') ?? 'true') === 'true',
Expand Down Expand Up @@ -514,6 +522,9 @@ export const INTERNAL_POLICY_CONFIG_FIELDS = [
'literalListProtectionEnabled', 'literalListProtectionMaxProtected',
// Temporal query selection
'temporalQueryConstraintEnabled', 'temporalQueryConstraintBoost',
// Retrieval confidence gate
'retrievalConfidenceGateEnabled', 'retrievalConfidenceMarginNormalizer',
'retrievalConfidenceSimilarityNormalizer', 'retrievalConfidenceFloor',
// Fast AUDN
'fastAudnEnabled', 'fastAudnDuplicateThreshold',
// Observation / deferred
Expand Down
8 changes: 8 additions & 0 deletions src/routes/memories.ts
Original file line number Diff line number Diff line change
Expand Up @@ -824,5 +824,13 @@ function formatSearchResponse(result: RetrievalResult, scope: MemoryScope) {
},
} : {}),
...(observability ? { observability: formatObservability(observability) } : {}),
...(result.retrievalConfidence ? {
retrieval_confidence: {
low_confidence: result.retrievalConfidence.lowConfidence,
confidence: result.retrievalConfidence.confidence,
top_similarity: result.retrievalConfidence.topSimilarity,
margin: result.retrievalConfidence.margin,
},
} : {}),
};
}
122 changes: 122 additions & 0 deletions src/services/__tests__/retrieval-confidence-gate.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/**
* Tests for retrieval-confidence-gate.ts
*
* Validates the confidence computation used by EXP-14 (retrieval-side
* abstention gate). The gate must:
* - Return null when disabled
* - Flag low confidence on empty results
* - Flag low confidence on narrow margin + weak top similarity
* - NOT flag when separation is strong or top similarity is high
* - Respect config overrides for normalizers and floor
*/

import { describe, it, expect } from 'vitest';
import { computeRetrievalConfidence } from '../retrieval-confidence-gate.js';

/** Builds the smallest ranked-result stub the gate accepts: an object carrying only `similarity`. */
function result(similarity: number): { similarity: number } {
  const stub = { similarity };
  return stub;
}

// Gate switched on; normalizers and floor are left unset so the gate's own
// defaults apply.
const enabledCfg = { retrievalConfidenceGateEnabled: true } as const;

// Gate switched off; the gate is expected to return null for this config.
const disabledCfg = { retrievalConfidenceGateEnabled: false } as const;

// Behavioural suite for computeRetrievalConfidence. Worked arithmetic in the
// comments uses the gate defaults: margin normalizer 0.05, similarity
// normalizer 0.5, floor 0.3, weights 0.6 (margin) / 0.4 (absolute).
describe('computeRetrievalConfidence', () => {
  it('returns null when the gate is disabled', () => {
    const res = computeRetrievalConfidence([result(0.9), result(0.8)], disabledCfg);
    expect(res).toBeNull();
  });

  it('flags low confidence on empty results', () => {
    const res = computeRetrievalConfidence([], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.lowConfidence).toBe(true);
    expect(res!.confidence).toBe(0);
    expect(res!.topSimilarity).toBe(0);
    expect(res!.margin).toBe(0);
  });

  it('does NOT flag single result with decent absolute similarity', () => {
    // top=0.4, second=0 → margin=0.4 → marginConf=1.0, absConf=0.8
    // confidence = 0.6*1.0 + 0.4*0.8 = 0.92 ≥ 0.3
    const res = computeRetrievalConfidence([result(0.4)], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.lowConfidence).toBe(false);
    expect(res!.margin).toBe(0.4);
  });

  it('does NOT flag when top is strong and well-separated', () => {
    const res = computeRetrievalConfidence([result(0.9), result(0.4)], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.lowConfidence).toBe(false);
    expect(res!.confidence).toBeGreaterThan(0.8);
  });

  it('flags narrow margin and weak top similarity', () => {
    // top=0.10, second=0.09 → margin=0.01
    // marginConf=0.01/0.05=0.2, absConf=0.10/0.5=0.2
    // confidence = 0.6*0.2 + 0.4*0.2 = 0.20 < 0.3
    const res = computeRetrievalConfidence([result(0.10), result(0.09)], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.lowConfidence).toBe(true);
    expect(res!.margin).toBeCloseTo(0.01, 5);
    expect(res!.confidence).toBeCloseTo(0.20, 2);
  });

  it('does NOT flag weak top when margin is strong', () => {
    // top=0.15, second=0.02 → margin=0.13 → marginConf=min(1, 0.13/0.05)=1.0
    // absConf=min(1, 0.15/0.5)=0.3
    // confidence = 0.6*1.0 + 0.4*0.3 = 0.72 ≥ 0.3
    const res = computeRetrievalConfidence([result(0.15), result(0.02)], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.lowConfidence).toBe(false);
    expect(res!.confidence).toBeCloseTo(0.72, 2);
  });

  it('respects margin normalizer override', () => {
    // margin=0.02. Default normalizer (0.05) → marginConf=0.4 → confidence=0.44.
    // Override 0.01 → marginConf=1.0, absConf=0.5 → confidence = 0.6 + 0.2 = 0.8.
    const narrow = computeRetrievalConfidence([result(0.25), result(0.23)], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceMarginNormalizer: 0.01,
    });
    expect(narrow).not.toBeNull();
    expect(narrow!.margin).toBeCloseTo(0.02, 5);
    expect(narrow!.confidence).toBeCloseTo(0.8, 5);
  });

  it('respects a lowered floor override', () => {
    // confidence = 0.20 (see the narrow-margin case): low under the default
    // floor 0.3, so only an honoured override of 0.05 can clear the flag.
    const byDefault = computeRetrievalConfidence([result(0.10), result(0.09)], enabledCfg);
    expect(byDefault).not.toBeNull();
    expect(byDefault!.lowConfidence).toBe(true);

    const lowered = computeRetrievalConfidence([result(0.10), result(0.09)], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceFloor: 0.05,
    });
    expect(lowered).not.toBeNull();
    expect(lowered!.confidence).toBeCloseTo(0.20, 2);
    expect(lowered!.lowConfidence).toBe(false);
  });

  it('respects a raised floor override', () => {
    // top=0.25, second=0.23 → marginConf=0.4, absConf=0.5
    // confidence = 0.6*0.4 + 0.4*0.5 = 0.44: fine under the default floor 0.3,
    // but below an overridden floor of 0.5.
    const byDefault = computeRetrievalConfidence([result(0.25), result(0.23)], enabledCfg);
    expect(byDefault).not.toBeNull();
    expect(byDefault!.lowConfidence).toBe(false);

    const raised = computeRetrievalConfidence([result(0.25), result(0.23)], {
      retrievalConfidenceGateEnabled: true,
      retrievalConfidenceFloor: 0.5,
    });
    expect(raised).not.toBeNull();
    expect(raised!.confidence).toBeCloseTo(0.44, 5);
    expect(raised!.lowConfidence).toBe(true);
  });

  it('uses similarity, not score, for computation', () => {
    // The gate reads `similarity` directly; it does not depend on `score`.
    const res = computeRetrievalConfidence(
      [{ similarity: 0.8 }, { similarity: 0.3 }],
      enabledCfg,
    );
    expect(res).not.toBeNull();
    expect(res!.topSimilarity).toBe(0.8);
    expect(res!.margin).toBe(0.5);
    expect(res!.lowConfidence).toBe(false);
  });

  it('computes exact confidence for a mid-range case', () => {
    // top=0.5, second=0.4 → margin=0.1
    // marginConf=min(1, 0.1/0.05)=1.0
    // absConf=min(1, 0.5/0.5)=1.0
    // confidence = 0.6*1.0 + 0.4*1.0 = 1.0
    const res = computeRetrievalConfidence([result(0.5), result(0.4)], enabledCfg);
    expect(res).not.toBeNull();
    expect(res!.confidence).toBeCloseTo(1.0, 5);
    expect(res!.lowConfidence).toBe(false);
  });
});
2 changes: 1 addition & 1 deletion src/services/extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import {
type ExtractionOptions,
} from './observation-date-extraction.js';

const EXTRACTION_MAX_TOKENS = 4096;
const EXTRACTION_MAX_TOKENS = 8192;
const AUDN_MAX_TOKENS = 2048;

export type { ExtractionOptions };
Expand Down
17 changes: 11 additions & 6 deletions src/services/memory-search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ async function executeSearchStep(
retrievalOptions: RetrievalOptions | undefined,
asOf: string | undefined,
trace: TraceCollector,
): Promise<{ memories: SearchResult[]; activeTrace: TraceCollector }> {
): Promise<{ memories: SearchResult[]; activeTrace: TraceCollector; retrievalConfidence: import('./retrieval-confidence-gate.js').RetrievalConfidence | null }> {
if (asOf) {
const memories = await deps.stores.claim.searchClaimVersions(userId, await embedText(query, 'query'), effectiveLimit, asOf, sourceSite);
trace.stage('as-of-search', memories, { asOf });
return { memories, activeTrace: trace };
return { memories, activeTrace: trace, retrievalConfidence: null };
}
const pipelineStores = { search: deps.stores.search, link: deps.stores.link, memory: deps.stores.memory, entity: deps.stores.entity, pool: deps.stores.pool };
const pipelineResult = await runSearchPipelineWithTrace(pipelineStores, userId, query, effectiveLimit, sourceSite, referenceTime, {
Expand All @@ -78,7 +78,7 @@ async function executeSearchStep(
skipReranking: retrievalOptions?.skipReranking,
runtimeConfig: deps.config,
});
return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace };
return { memories: pipelineResult.filtered, activeTrace: pipelineResult.trace, retrievalConfidence: pipelineResult.retrievalConfidence };
}

/** Filter workspace-scoped, stale composites, and consensus-violating memories. */
Expand Down Expand Up @@ -135,6 +135,7 @@ function assembleResponse(
asOf: string | undefined,
sourceSite: string | undefined,
lessonCheck: LessonCheckResult | undefined,
retrievalConfidence: import('./retrieval-confidence-gate.js').RetrievalConfidence | null,
): RetrievalResult {
const mode = retrievalOptions?.retrievalMode ?? 'flat';
const packaged = applyFlatPackagingPolicy(postProcessed.memories, query, mode, activeTrace);
Expand All @@ -150,7 +151,7 @@ function assembleResponse(
});
activeTrace.finalize(outputMemories);

return {
const result: RetrievalResult = {
memories: outputMemories, injectionText,
citations: buildRichCitations(outputMemories).map((c) => c.memory_id),
retrievalMode: mode, tierAssignments, expandIds, estimatedContextTokens,
Expand All @@ -159,6 +160,10 @@ function assembleResponse(
retrievalSummary: activeTrace.getRetrievalSummary(),
packagingSummary, assemblySummary,
};
if (retrievalConfidence) {
result.retrievalConfidence = retrievalConfidence;
}
return result;
}

/** Full search with lesson check, URI resolution, pipeline, post-processing, and packaging. */
Expand All @@ -185,9 +190,9 @@ export async function performSearch(
const uriResult = await tryUriResolution(deps, query, userId, retrievalOptions, trace);
if (uriResult) return uriResult;

const { memories: rawMemories, activeTrace } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace);
const { memories: rawMemories, activeTrace, retrievalConfidence } = await executeSearchStep(deps, userId, query, effectiveLimit, sourceSite, referenceTime, namespaceScope, retrievalOptions, asOf, trace);
const filteredMemories = await postProcessResults(deps, rawMemories, activeTrace, userId, query, asOf);
return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck);
return assembleResponse(deps, filteredMemories, query, userId, activeTrace, retrievalOptions, asOf, sourceSite, lessonCheck, retrievalConfidence);
}

/**
Expand Down
1 change: 1 addition & 0 deletions src/services/memory-service-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ export interface RetrievalResult {
retrievalSummary?: import('./retrieval-trace.js').RetrievalTraceSummary;
packagingSummary?: import('./retrieval-trace.js').PackagingTraceSummary;
assemblySummary?: import('./retrieval-trace.js').AssemblyTraceSummary;
retrievalConfidence?: import('./retrieval-confidence-gate.js').RetrievalConfidence;
}

/** Options controlling retrieval packaging. */
Expand Down
78 changes: 78 additions & 0 deletions src/services/retrieval-confidence-gate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* Retrieval confidence gate — computes a confidence score based on the
* separation between top results. When confidence is low, signals to the
* caller that retrieval may be insufficient for a definitive answer.
*
* This targets the abstention ability (ABS) on BEAM, where Honcho scores
* below the no-memory baseline because over-retrieval poisons "I don't know"
* answers.
*/

/** Outcome of the retrieval confidence gate for one ranked result list. */
export interface RetrievalConfidence {
  /** Set when `confidence` is below the configured floor (always set for an empty result list). */
  lowConfidence: boolean;
  /** Weighted blend of the margin signal and the top-similarity signal; nominally in [0, 1]. */
  confidence: number;
  /** Similarity of the highest-ranked result; 0 when there are no results. */
  topSimilarity: number;
  /**
   * Gap between the top and second similarities, never negative. With a
   * single result the second similarity counts as 0.
   */
  margin: number;
}

/** Tunables read by the retrieval confidence gate. */
export interface RetrievalConfidenceConfig {
  /** Master switch; when false the gate returns `null` instead of a verdict. */
  retrievalConfidenceGateEnabled: boolean;
  /** Top-vs-second similarity gap at which the margin signal saturates at 1. */
  retrievalConfidenceMarginNormalizer: number;
  /** Top similarity at which the absolute-similarity signal saturates at 1. */
  retrievalConfidenceSimilarityNormalizer: number;
  /** Composite confidence below this value is reported as low confidence. */
  retrievalConfidenceFloor: number;
}

/** Top-vs-second similarity gap at which the margin signal saturates at 1. */
const DEFAULT_MARGIN_NORMALIZER = 0.05;
/** Top similarity at which the absolute-similarity signal saturates at 1. */
const DEFAULT_SIMILARITY_NORMALIZER = 0.5;
/** Composite confidence below this value is reported as low confidence. */
const DEFAULT_CONFIDENCE_FLOOR = 0.3;
/** Share of the composite contributed by the margin signal. */
const MARGIN_WEIGHT = 0.6;
/** Share of the composite contributed by the absolute-similarity signal. */
const ABSOLUTE_WEIGHT = 0.4;

/** Returns `value` when it is a finite number, otherwise `fallback`. */
function finiteOr(value: number | undefined, fallback: number): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : fallback;
}

/** Returns `value` when it is a finite number greater than zero, otherwise `fallback`. */
function positiveFiniteOr(value: number | undefined, fallback: number): number {
  return typeof value === 'number' && Number.isFinite(value) && value > 0 ? value : fallback;
}

/**
 * Compute retrieval confidence from a ranked list of results.
 *
 * Uses `similarity` (not `score`) because `score` is rewritten by RRF,
 * cross-encoder, MMR, and additive boosts mid-pipeline. `similarity` is the
 * only stable, scale-invariant signal that survives all stages.
 *
 * The composite is `0.6 * min(1, margin / marginNormalizer) +
 * 0.4 * min(1, top / similarityNormalizer)`, clamped to [0, 1].
 *
 * Malformed numeric inputs are sanitised rather than propagated, because a
 * NaN composite or NaN floor would make `confidence < floor` false and
 * silently disable the gate:
 * - a missing, non-finite, or non-positive normalizer falls back to its default;
 * - a missing or non-finite floor falls back to its default;
 * - a non-finite similarity is treated as 0.
 *
 * @param results — ranked search results; must expose `similarity: number`.
 * @param cfg — gate configuration; when disabled returns `null`.
 * @returns `null` when the gate is disabled; otherwise the verdict. An empty
 *   result list always yields low confidence with all numeric fields at 0.
 */
export function computeRetrievalConfidence(
  results: ReadonlyArray<{ similarity: number }>,
  cfg: Partial<RetrievalConfidenceConfig> & { retrievalConfidenceGateEnabled: boolean },
): RetrievalConfidence | null {
  if (!cfg.retrievalConfidenceGateEnabled) return null;

  if (results.length === 0) {
    return {
      lowConfidence: true,
      confidence: 0,
      topSimilarity: 0,
      margin: 0,
    };
  }

  // A missing second result counts as similarity 0, so a lone result's
  // margin equals its own similarity.
  const top = finiteOr(results[0]?.similarity, 0);
  const second = finiteOr(results[1]?.similarity, 0);
  const margin = Math.max(0, top - second);

  const marginNormalizer = positiveFiniteOr(cfg.retrievalConfidenceMarginNormalizer, DEFAULT_MARGIN_NORMALIZER);
  const similarityNormalizer = positiveFiniteOr(
    cfg.retrievalConfidenceSimilarityNormalizer,
    DEFAULT_SIMILARITY_NORMALIZER,
  );
  const floor = finiteOr(cfg.retrievalConfidenceFloor, DEFAULT_CONFIDENCE_FLOOR);

  const marginConf = Math.min(1, margin / marginNormalizer);
  const absConf = Math.min(1, top / similarityNormalizer);
  // Clamp only the final composite: a negative top similarity still drags the
  // score down, but the reported value stays inside the documented [0, 1].
  const composite = MARGIN_WEIGHT * marginConf + ABSOLUTE_WEIGHT * absConf;
  const confidence = Math.min(1, Math.max(0, composite));

  return {
    lowConfidence: confidence < floor,
    confidence,
    topSimilarity: top,
    margin,
  };
}
18 changes: 16 additions & 2 deletions src/services/search-pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import { applyCurrentStateRanking } from './current-state-ranking.js';
import { applyConcisenessPenalty } from './conciseness-preference.js';
import { protectLiteralListAnswerCandidates } from './literal-list-protection.js';
import { applyTemporalQueryConstraints } from './temporal-query-constraints.js';
import { computeRetrievalConfidence, type RetrievalConfidence } from './retrieval-confidence-gate.js';

const TEMPORAL_NEIGHBOR_WINDOW_MINUTES = 30;
const SEMANTIC_RRF_WEIGHT = 1.2;
Expand Down Expand Up @@ -85,6 +86,10 @@ export type SearchPipelineRuntimeConfig = Pick<
| 'retrievalProfileSettings'
| 'temporalQueryConstraintBoost'
| 'temporalQueryConstraintEnabled'
| 'retrievalConfidenceGateEnabled'
| 'retrievalConfidenceMarginNormalizer'
| 'retrievalConfidenceSimilarityNormalizer'
| 'retrievalConfidenceFloor'
>;
/**
* Decide whether to auto-skip cross-encoder reranking.
Expand Down Expand Up @@ -142,7 +147,7 @@ export async function runSearchPipelineWithTrace(
sourceSite?: string,
referenceTime?: Date,
options: SearchPipelineOptions = {},
): Promise<{ filtered: SearchResult[]; trace: TraceCollector }> {
): Promise<{ filtered: SearchResult[]; trace: TraceCollector; retrievalConfidence: RetrievalConfidence | null }> {
const trace = new TraceCollector(query, userId);
const policyConfig: SearchPipelineRuntimeConfig = options.runtimeConfig ?? config;
const mmrPoolMultiplier = policyConfig.mmrEnabled ? 3 : 1;
Expand Down Expand Up @@ -267,6 +272,15 @@ export async function runSearchPipelineWithTrace(
policyConfig,
));

const retrievalConfidence = computeRetrievalConfidence(selected, policyConfig);
if (retrievalConfidence?.lowConfidence) {
trace.event('low-confidence-gate', {
confidence: retrievalConfidence.confidence,
topSimilarity: retrievalConfidence.topSimilarity,
margin: retrievalConfidence.margin,
});
}

const namespaceScope = options.namespaceScope ?? null;
trace.setRetrievalSummary({
candidateIds: selected.map((result) => result.id),
Expand All @@ -281,7 +295,7 @@ export async function runSearchPipelineWithTrace(
? selected.filter((r) => isInScope(r.namespace, namespaceScope))
: selected;

return { filtered, trace };
return { filtered, trace, retrievalConfidence };
}

async function runInitialRetrieval(
Expand Down
Loading