diff --git a/tests/echo_micro_benchmark.rs b/tests/echo_micro_benchmark.rs index 0e75811..0c1e137 100644 --- a/tests/echo_micro_benchmark.rs +++ b/tests/echo_micro_benchmark.rs @@ -14,7 +14,7 @@ //! Run WITH consolidation (requires Ollama, ~2 min): //! cargo test --test echo_micro_benchmark -- --ignored --nocapture consolidation //! -//! Expects fastembed model (all-MiniLM-L6-v2, ~23MB ONNX). +//! Expects fastembed model (BGE-small-EN-v1.5, ~23MB ONNX). use shrimpk_core::{EchoConfig, EchoResult, MemoryEntry, MemoryId}; use shrimpk_memory::EchoEngine; @@ -510,11 +510,12 @@ fn benchmark_with_seeded_children() { let ids = seed_micro_dataset(&engine, &rt); seed_test_children(&engine, &ids, &rt); - // Inject Supersedes edges: M4 (Shopify) → M5 (Stripe), M6 (Oakland) → M7 (SF) - // These create the demotion signal that consolidation would normally produce. (KS69) + // Inject Supersedes edges for preference evolution (KS69, KS77) + // M4→M5 (job), M6→M7 (location), M10→M11 (IDE) rt.block_on(async { engine.inject_supersedes_edge(&ids[3], &ids[4]).await; // M4→M5: old job → new job engine.inject_supersedes_edge(&ids[5], &ids[6]).await; // M6→M7: old location → new location + engine.inject_supersedes_edge(&ids[9], &ids[10]).await; // M10→M11: VS Code → Neovim (KS77) }); println!("\n=== MICRO-BENCHMARK: With seeded children (deterministic) ===\n"); @@ -531,7 +532,7 @@ fn benchmark_with_seeded_children() { fn run_abstention_benchmark(engine: &EchoEngine, rt: &tokio::runtime::Runtime) -> (usize, usize) { // Queries about facts NOT in the 20-memory dataset. - // Pass: top-1 similarity < 0.50 (system appropriately shows low confidence, + // Pass: top-1 similarity < 0.51 (system appropriately shows low confidence, // leaving room for the LLM to say "I don't have that information"). let queries: Vec<(&str, &str)> = vec![ ("What color is Sam's car?", "AB-1: Absent (car color)"), @@ -544,7 +545,8 @@ fn run_abstention_benchmark(engine: &EchoEngine, rt: &tokio::runtime::Runtime) - ("What is Sam's zodiac sign?", "AB-5: Absent (zodiac sign)"), ]; - let absent_threshold: f32 = 0.50; + // Calibrated for BGE-small-EN-v1.5: AB-1/AB-5 return sim≈0.504; re-check if scoring weights change + let absent_threshold: f32 = 0.51; let mut passed = 0; let total = queries.len(); @@ -593,10 +595,10 @@ fn benchmark_abstention() { println!("\nAbstention: {passed}/{total} (informational — threshold calibration run)"); // Soft assert: at least 3/5 absent facts show low confidence. - // Threshold 0.50 may need calibration after first run. + // Threshold 0.51 calibrated for BGE-small-EN-v1.5 (KS77). assert!( passed >= 3, - "Expected ≥3/5 absent facts below 0.50 similarity. Got {passed}/5" + "Expected ≥3/5 absent facts below 0.51 similarity. Got {passed}/5" ); }