diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index e1ee93c..fbc5ba8 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -381,7 +381,21 @@ class BenchmarkForegroundService : Service() {
                 put("rag_only", ragOnly)
                 put("query_filter", queryFilter ?: JSONObject.NULL)
                 put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
-                put("model", "gemma-4-E4B-it.litertlm")
+                // Read the model name from the same app_config.json asset the RagPipeline uses,
+                // so the JSON metadata reflects whatever model is actually loaded rather than
+                // a hardcoded string that goes stale when we switch model artifacts.
+                // Wrapped in try/catch: this read runs at the END of the benchmark when we
+                // serialize all results — an asset/parse error here would discard 20+ minutes
+                // of completed runs that are still in-memory. Ship an "unknown" tag instead,
+                // and hand the exception to Log.w so its stack trace is logged with the warning.
+                put("model", try {
+                    JSONObject(
+                        application.assets.open("app_config.json").bufferedReader().use { it.readText() }
+                    ).getString("llm_model")
+                } catch (e: Exception) {
+                    Log.w("mam-ai-bench", "[BENCHMARK] Failed to read llm_model from app_config.json — recording 'unknown'", e)
+                    "unknown"
+                })
                 // Read backend from BuildConfig at compile time. Older builds
                 // hard-coded "CPU" here even when GPU was active — fixed so the
                 // JSON metadata matches reality.
diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index d11e390..d911392 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -1,15 +1,19 @@
 #!/usr/bin/env python3
-"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report.
+"""Aggregate per-k latency-sweep JSONs into a single model × backend × k report.
 
 Reads all benchmark_*.json files produced by benchmark_latency.py, groups them
-by (backend, k_override), and writes a markdown report at
+by (model, backend, k_override), and writes a markdown report at
 evaluation/reports/latency_report_v2.md.
 
 Notes on backend identification: post-fix benchmark JSONs (commit ef96538
 onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep
 JSONs hard-code `backend="CPU"` even though they were measured on GPU; we
 backfill those using an explicit filename allowlist (see `backend_of`).
-Future runs of any backend are unaffected.
+
+Notes on model identification: post-fix JSONs (commit 976a8ac onward) record
+`config.model` from the app asset; earlier runs do not. For any JSON missing
+`config.model` we default to `gemma-4-E4B-it.litertlm` since the only sweeps
+that predate the fix were E4B. Future runs of any model are unaffected.
 """
 from __future__ import annotations
 
@@ -47,6 +51,19 @@ def backend_of(filename: str, recorded: str) -> str:
     return recorded
 
 
+# Default model for any pre-fix JSON missing config.model. All such files in
+# the current repo are E4B; this default is purely defensive in case an old
+# JSON resurfaces. New runs always record their own model.
+LEGACY_DEFAULT_MODEL = "gemma-4-E4B-it.litertlm"
+
+
+def model_of(filename: str, recorded: str | None) -> str:
+    """Return `recorded` unless it is None, in which case use LEGACY_DEFAULT_MODEL."""
+    # `filename` is accepted but not consulted; only `recorded` decides the result.
+    fallback_needed = recorded is None
+    return LEGACY_DEFAULT_MODEL if fallback_needed else recorded
+
+
 def load_runs() -> list[dict]:
     files = sorted(glob.glob(os.path.join(
         os.path.dirname(os.path.abspath(__file__)),
@@ -95,9 +112,19 @@ def load_runs() -> list[dict]:
                 )
             recorded_backend = "CPU"
         backend = backend_of(os.path.basename(f), recorded_backend)
+        recorded_model = d["config"].get("model")
+        if recorded_model is None:
+            print(
+                f"WARN: {os.path.basename(f)} has no config.model field; "
+                f"defaulting to {LEGACY_DEFAULT_MODEL}. If this was a "
+                "different model, the JSON predates the model-recording fix.",
+                file=sys.stderr,
+            )
+        model = model_of(os.path.basename(f), recorded_model)
         runs.append({
             "file": os.path.basename(f),
             "timestamp": ts,
+            "model": model,
             "backend": backend,
             "k": k_label,
             "data": d,
@@ -167,79 +194,35 @@ def fmt_s(v: int | None) -> str:
     return f"{v / 1000:.1f}" if v is not None else "—"
 
 
-def write_report(runs: list[dict], out_path: Path) -> None:
-    # Build {(backend, k) -> latest canonical run}
-    matrix: dict[tuple[str, int], dict] = {}
-    for r in runs:
-        key = (r["backend"], r["k"])
-        if key in matrix:
-            # Keep the run with most successful entries (resolves duplicates)
-            ex = matrix[key]
-            ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error"))
-            r_ok = sum(1 for x in r["data"]["results"] if not x.get("error"))
-            if r_ok > ex_ok:
-                matrix[key] = r
-        else:
-            matrix[key] = r
+def _short_model_label(model: str) -> str:
+    """Map a model filename to a short display name; unrecognised names pass through."""
+    # First matching tag wins, so "E4B" is tested before "E2B".
+    for tag in ("E4B", "E2B"):
+        if tag in model:
+            return f"Gemma 4 {tag}"
+    return model
 
-    gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"])
-    cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"])
-    all_ks = sorted(set(gpu_ks + cpu_ks))
 
-    # Sample run for device info
-    sample = next(iter(matrix.values()))
-    dev = sample["data"]["device"]
+def _write_per_model_section(
+    md: list[str], matrix: dict, model: str, all_ks: list[int]
+) -> None:
+    """Emit the six per-model tables (headline / TTFT / decode / p95 / errors / wall-clock).
 
-    md = []
-    md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n")
-    md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n")
-    md.append("")
-    md.append("## Device & stack\n")
-    md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}")
-    md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)")
-    md.append(f"- **LiteRT-LM**: 0.11.0")
-    md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU")
-    md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000")
-    md.append("")
-    # Pull the actual values from the sample run's config instead of hard-coding
-    # text that can lie. If different runs used different settings, this won't
-    # catch that — but we'd rather report the sample's truth than fabricate a
-    # round-number claim.
-    sample_cfg = sample["data"].get("config", {})
-    sample_repeats = sample_cfg.get("repeats", "?")
-    sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0
-    sample_n_results = len(sample["data"]["results"])
-    # Infer queries × modes from total runs / repeats. Default to "?" if the
-    # math doesn't divide evenly.
-    queries_x_modes: object = "?"
-    if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0:
-        queries_x_modes = sample_n_results // sample_repeats
-    md.append("## Methodology\n")
-    md.append(
-        f"Per backend × k configuration: {queries_x_modes} (query × mode) cells "
-        f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a "
-        f"No-RAG baseline per backend (k=0 via `--no-retrieval`). "
-        f"{sample_cooldown_s:g}-second cooldown between runs for thermal "
-        "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so "
-        "the run survives screen-off and device-lock; OPPO Hans whitelist set "
-        "manually."
-    )
-    md.append("")
-    md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.")
-    md.append("- `decode` is first-token to last-token.")
-    md.append("- `total_query` is everything: `retrieval + TTFT + decode`.")
-    md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).")
-    md.append("")
+    Each table follows the same `(GPU, CPU, ratio)` shape as the original
+    single-model report; we just scope to one model at a time.
+    """
+    label = _short_model_label(model)
+    md.append(f"## {label} (`{model}`)\n")
 
-    # ─────────── Headline table: total_query_ms by (backend, k) ───────────
-    md.append("## Headline — Median total query latency (seconds)\n")
-    md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |")
-    md.append(f"|---:|---:|---:|---:|---:|")
+    md.append("### Median total query latency (seconds)\n")
+    md.append("| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |")
+    md.append("|---:|---:|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
-        # doc chars: take from GPU if available, else CPU
-        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"])
         gpu_cells = "—"
         cpu_cells = "—"
         if gpu_run:
@@ -248,152 +231,293 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         if cpu_run:
             c_ = aggregate_per_category(cpu_run["data"], "total_query_ms")
             cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"])
-        # ratio
         ratio = ""
         if gpu_run and cpu_run:
             gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median")
             cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median")
             if gov is not None and cov is not None and gov > 0:
                 ratio = f"{cov / gov:.2f}×"
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
     md.append("")
 
-    # ─────────── TTFT detail ───────────
-    md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n")
-    md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |")
-    md.append(f"|---:|---:|---:|---:|---:|")
+    md.append("### TTFT (ms, median)\n")
+    md.append("| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |")
+    md.append("|---:|---:|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
-        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"])
         gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None
-        # Explicit None checks; also guard against div-by-zero on a 0 median.
         ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else ""
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
     md.append("")
 
-    # ─────────── Decode detail ───────────
-    md.append("## Decode (ms, median) — first token to last token\n")
-    md.append("Decode time mostly tracks output length, not k or doc content. Variation across k reflects ")
-    md.append("the model writing *longer answers* when given more context (more material to draw on).")
-    md.append("")
-    md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |")
-    md.append(f"|---:|---:|---:|---:|")
+    md.append("### Decode (ms, median)\n")
+    md.append("| k | GPU decode | CPU decode | CPU÷GPU |")
+    md.append("|---:|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
         gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None
         ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else ""
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
     md.append("")
 
-    # ─────────── p95 totals ───────────
-    md.append("## p95 total query latency (s) — tail-latency view\n")
-    md.append(f"| k | GPU p95 | CPU p95 |")
-    md.append(f"|---:|---:|---:|")
+    md.append("### p95 total query latency (s)\n")
+    md.append("| k | GPU p95 | CPU p95 |")
+    md.append("|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
         gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {fmt_s(gv)} | {fmt_s(cv)} |")
     md.append("")
 
-    # ─────────── Errors / context limit ───────────
-    md.append("## Errors and the 4096-token context wall\n")
-    md.append(f"| k | GPU errors / 54 | CPU errors / 54 |")
-    md.append(f"|---:|---:|---:|")
+    md.append("### Errors (count / 54 runs)\n")
+    md.append("| k | GPU errors | CPU errors |")
+    md.append("|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
         ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None
         ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |")
-    md.append("")
-    md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ")
-    md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ")
-    md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ")
-    md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ")
-    md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ")
-    md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.")
-    md.append("")
-    md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ")
-    md.append("deployment budget at this depth even when the request fits in the context window.")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {fmt_ms(ge)} | {fmt_ms(ce)} |")
     md.append("")
 
-    # ─────────── Wall-clock comparison ───────────
-    md.append("## Wall-clock comparison\n")
+    md.append("### Wall-clock\n")
     md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |")
     md.append("|---:|---:|---:|---:|")
     for k in all_ks:
-        gpu_run = matrix.get(("GPU", k))
-        cpu_run = matrix.get(("CPU", k))
+        gpu_run = matrix.get((model, "GPU", k))
+        cpu_run = matrix.get((model, "CPU", k))
+        if not gpu_run and not cpu_run:
+            continue
         gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None
         cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None
         gw_s = f"{gw:.1f}" if gw is not None else "—"
         cw_s = f"{cw:.1f}" if cw is not None else "—"
         ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else ""
-        label = "**0 (no-RAG)**" if k == 0 else str(k)
-        md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |")
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | {gw_s} | {cw_s} | {ratio} |")
+    md.append("")
+
 
-    # Findings / interpretation
+def _write_cross_model_table(
+    md: list[str],
+    matrix: dict,
+    baseline_model: str,
+    other_model: str,
+    all_ks: list[int],
+    metric: str,
+    fmt: callable,
+) -> None:
+    """Append one baseline-vs-comparator markdown table for `metric` to `md`.
+
+    Layout: `| k | base GPU | other GPU | GPU ratio | base CPU | other CPU | CPU ratio |`.
+    Cells are `aggregate_overall(...)["median"]` rendered by `fmt`; ratio is baseline÷other
+    (>1 means the other model is faster). A k with no run for either model is skipped.
+    """
+    b_label = _short_model_label(baseline_model)
+    o_label = _short_model_label(other_model)
+    md.append(
+        f"| k | {b_label} GPU | {o_label} GPU | GPU ratio | "
+        f"{b_label} CPU | {o_label} CPU | CPU ratio |"
+    )
+    md.append("|---:|---:|---:|---:|---:|---:|---:|")
+    for k in all_ks:
+        pairs = [(matrix.get((baseline_model, b, k)), matrix.get((other_model, b, k))) for b in ("GPU", "CPU")]
+        if not any(base_run or other_run for base_run, other_run in pairs):
+            continue  # neither compared model ran this k; don't emit an empty row
+        cells = []
+        for base_run, other_run in pairs:
+            base_v = aggregate_overall(base_run["data"], metric).get("median") if base_run else None
+            other_v = aggregate_overall(other_run["data"], metric).get("median") if other_run else None
+            ratio = f"{base_v / other_v:.2f}×" if (base_v is not None and other_v is not None and other_v > 0) else ""
+            cells.extend([fmt(base_v), fmt(other_v), ratio])
+        k_label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {k_label} | " + " | ".join(cells) + " |")
     md.append("")
-    md.append("## Key findings\n")
+
+
+def write_report(runs: list[dict], out_path: Path) -> None:
+    # Build {(model, backend, k) -> canonical run}. On a key collision (e.g. a
+    # same-day re-run) keep the run with the most successful entries — almost
+    # always the longer, cleaner sweep; ties keep whichever run was seen first.
+    matrix: dict[tuple[str, str, int], dict] = {}
+    for r in runs:
+        key = (r["model"], r["backend"], r["k"])
+        if key in matrix:
+            ex = matrix[key]
+            ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error"))
+            r_ok = sum(1 for x in r["data"]["results"] if not x.get("error"))
+            if r_ok > ex_ok:
+                matrix[key] = r
+        else:
+            matrix[key] = r
+
+    if not matrix:
+        # latency_results/ is gitignored, so a fresh checkout can hit this. Exit
+        # with a directional error rather than crashing on StopIteration below.
+        results_dir = Path(__file__).resolve().parent / "latency_results"
+        raise SystemExit(
+            f"No canonical benchmark_*.json found under {results_dir}. "
+            "Run `python evaluation/benchmark_latency.py …` to produce JSONs "
+            "(see evaluation/runbooks/ for the sweep procedure), then re-run "
+            "this aggregator."
+        )
+
+    models = sorted(set(m for (m, _b, _k) in matrix.keys()))
+    all_ks = sorted(set(k for (_m, _b, k) in matrix.keys()))
+
+    sample = next(iter(matrix.values()))
+    dev = sample["data"]["device"]
+
+    md: list[str] = []
+    md.append("# MAM-AI On-Device Latency Sweep — Model × Backend × k\n")
+    md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n")
     md.append("")
-    md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite")
-    md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ")
-    md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ")
-    md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.")
+    md.append("## Device & stack\n")
+    md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}")
+    md.append(f"- **Models tested**: " + ", ".join(f"{_short_model_label(m)} (`{m}`)" for m in models))
+    md.append("- **LiteRT-LM**: 0.11.0")
+    md.append("- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU")
+    md.append("- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000")
+    md.append("")
+    sample_cfg = sample["data"].get("config", {})
+    sample_repeats = sample_cfg.get("repeats", "?")
+    sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0
+    sample_n_results = len(sample["data"]["results"])
+    queries_x_modes: object = "?"
+    if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0:
+        queries_x_modes = sample_n_results // sample_repeats
+    md.append("## Methodology\n")
+    md.append(
+        f"Per (model × backend × k) configuration: {queries_x_modes} (query × mode) cells "
+        f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a "
+        f"No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). "
+        f"{sample_cooldown_s:g}-second cooldown between runs for thermal "
+        "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so "
+        "the run survives screen-off and device-lock; OPPO Hans whitelist set "
+        "manually."
+    )
+    md.append("")
+    md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.")
+    md.append("- `decode` is first-token to last-token.")
+    md.append("- `total_query` is everything: `retrieval + TTFT + decode`.")
+    md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).")
+    md.append("")
+
+    # ─────────── Per-model sections ───────────
+    for m in models:
+        _write_per_model_section(md, matrix, m, all_ks)
+
+    # ─────────── Cross-model comparison ───────────
+    # Use E4B as baseline when present; ratio is baseline/other so >1 means
+    # the (smaller) comparator model is faster on that cell.
+    if len(models) > 1:
+        baseline = "gemma-4-E4B-it.litertlm" if "gemma-4-E4B-it.litertlm" in models else models[0]
+        others = [m for m in models if m != baseline]
+        others_label = ", ".join(_short_model_label(m) for m in others)
+        md.append("## Cross-model comparison\n")
+        md.append(
+            f"Each table below compares **{_short_model_label(baseline)}** "
+            f"(baseline) against each comparator model ({others_label}). "
+            "Ratios are reported as **baseline ÷ comparator** at the same "
+            "backend × k cell, so values **> 1.0× mean the comparator is faster**. "
+            "Reading the columns: GPU prefill (TTFT) is compute-bound and tracks "
+            "parameter count closely; GPU decode is bandwidth-bound and gains less "
+            "from model shrinkage; CPU is compute-bound throughout."
+        )
+        md.append("")
+        for other in others:
+            md.append(f"### {_short_model_label(baseline)} vs {_short_model_label(other)}")
+            md.append("")
+            md.append("**Total query latency (median, seconds)**")
+            md.append("")
+            _write_cross_model_table(md, matrix, baseline, other, all_ks, "total_query_ms", fmt_s)
+            md.append("**TTFT (median, ms)** — prefill speedup")
+            md.append("")
+            _write_cross_model_table(md, matrix, baseline, other, all_ks, "ttft_ms", fmt_ms)
+            md.append("**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU")
+            md.append("")
+            _write_cross_model_table(md, matrix, baseline, other, all_ks, "decode_ms", fmt_ms)
+
+    md.append("## Errors and the 4096-token context wall\n")
+    md.append("At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every "
+              "(model × backend) combination tested: ")
+    md.append("`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. ")
+    md.append("Each failure reports `Input token ids are too long. Exceeding the maximum "
+              "number of tokens allowed: …>= 4096`. ")
+    md.append("Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; "
+              "the wall is a property of the `.litertlm` artifact format, not the "
+              "parameter count or backend. **k_max ≈ 17–18** for both models.")
+    md.append("")
+
+    md.append("## Key findings\n")
+    md.append("### 1. Prefill (TTFT) scales ~2× with parameter count on both backends")
+    md.append("Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** "
+              "and **~2.3–3.2× on CPU**. Prefill is compute-heavy (one parallel forward pass over the "
+              "entire prompt), so halving the parameter count halves the compute and the speedup is "
+              "near-proportional on both backends.")
     md.append("")
-    md.append("### 2. The model's 4096-token context window is the binding ceiling at high k")
-    md.append("k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — ")
-    md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ")
-    md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ")
-    md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ")
-    md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ")
-    md.append("Latency is *not* the constraint at the upper end; the model's context window is.")
+    md.append("### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU")
+    md.append("Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is "
+              "sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding "
+              "weights into compute units — the smaller model helps less than its parameter count "
+              "would predict. On CPU the constraint is compute, so the speedup tracks the model shrink.")
     md.append("")
-    md.append("### 3. Latency is not the binding factor on GPU below k=15")
-    md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ")
-    md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ")
-    md.append("not by what fits in the latency budget.")
+    md.append("### 3. Total speedup is decode-dominated, hence smaller than TTFT")
+    md.append("**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since "
+              "decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks "
+              "decode rather than prefill. At high k where prefill grows large, total speedup climbs "
+              "toward the prefill ratio (~1.7–1.9× GPU at k=15+).")
     md.append("")
-    md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow")
-    md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ")
-    md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ")
-    md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ")
-    md.append("or **k ≤ 1** if you want sub-40s p95.")
+    md.append("### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier")
+    md.append("E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend "
+              "where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), "
+              "which means devices that previously could *not* deploy MAM-AI at acceptable latency "
+              "(mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: "
+              "ship E2B on CPU, restrict k to small values.")
     md.append("")
-    md.append("### 5. Decode time is content-driven, not k-driven")
-    md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ")
-    md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ")
-    md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ")
-    md.append("not compute-bound on this hardware.")
+    md.append("### 5. 4096-token context wall is the binding ceiling at high k")
+    md.append("k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically "
+              "across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the "
+              "model artifact, not the runtime, and is **shared between E4B and E2B**. "
+              "**Latency is not the constraint at the upper end of k — context window is.**")
     md.append("")
     md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3")
-    md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ")
-    md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ")
-    md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.")
+    md.append("On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so "
+              "the prefill story scales predictably. The model shrink translates directly into a TTFT "
+              "shrink across the whole range.")
     md.append("")
 
     # File inventory
-    md.append("## Data inventory (per `(backend, k)`)\n")
-    md.append("| Backend | k | File | Wall (min) | Runs | Errors |")
-    md.append("|---|---:|---|---:|---:|---:|")
-    for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])):
-        r = matrix[(b, k)]
+    md.append("## Data inventory (per `(model, backend, k)`)\n")
+    md.append("| Model | Backend | k | File | Wall (min) | Runs | Errors |")
+    md.append("|---|---|---:|---|---:|---:|---:|")
+    for (m, b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1], x[2])):
+        r = matrix[(m, b, k)]
         wall = r["data"]["total_benchmark_time_ms"] / 60000
         n = len(r["data"]["results"])
         e = sum(1 for x in r["data"]["results"] if x.get("error"))
-        label = "0 (no-RAG)" if k == 0 else str(k)
-        md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |")
+        k_label = "0 (no-RAG)" if k == 0 else str(k)
+        md.append(f"| {_short_model_label(m)} | {b} | {k_label} | `{r['file']}` | {wall:.1f} | {n} | {e} |")
     md.append("")
     md.append("---")
     md.append("")
@@ -407,7 +531,8 @@ def write_report(runs: list[dict], out_path: Path) -> None:
 
 def main() -> int:
     runs = load_runs()
-    print(f"Loaded {len(runs)} canonical runs")
+    models = sorted(set(r["model"] for r in runs))
+    print(f"Loaded {len(runs)} canonical runs across {len(models)} models: {', '.join(models)}")
     out = Path(__file__).resolve().parent / "reports" / "latency_report_v2.md"
     write_report(runs, out)
     return 0
diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md
index 7cef7a7..8213775 100644
--- a/evaluation/reports/device_compatibility_notes.md
+++ b/evaluation/reports/device_compatibility_notes.md
@@ -1,12 +1,15 @@
 # MAM-AI Device Compatibility — On Which Phones the Model Can Run
 
-_Last updated: 2026-05-15. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._
+_Last updated: 2026-05-16. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._
 
-## TL;DR — three load-bearing rules
+**On the Snapdragon 8 Elite test device under a 60 s latency budget, E4B (6 GB RAM floor) is deployable on GPU at every RAG depth k ≤ 15 but on CPU only at k ≤ 3, while E2B (4 GB RAM floor) is deployable on both GPU and CPU at every k ≤ 15 — and k = 20 is ruled out for both models by the 4096-token context wall, regardless of backend.**
+
+## TL;DR — four load-bearing rules
 
 1. **E4B minimum RAM: 6 GB** total. 4 GB phones cannot run E4B reliably (model alone needs ~3.3 GB at runtime; Android + bundled apps eat 1.5–2 GB).
 2. **E2B minimum RAM: 4 GB** total. The smaller model halves the runtime memory footprint (~1.7 GB), opening up the $100–$150 device tier that's the largest slice of the African market.
 3. **E4B on CPU: k=3 is the borderline.** Beyond k=3, CPU totals exceed the 60 s budget on most mid-tier silicon. **E4B on GPU: no latency worry** — totals stay 13–25 s across k=0–15 on Snapdragon 8 Elite + Adreno.
+4. **E2B on CPU: k=10 is comfortable on flagship CPU; on mid-tier MediaTek, k ≤ 3 is comfortable and k=5–10 is borderline.** Measured E2B CPU at k=10 on Snapdragon 8 Elite is 26 s median; extrapolating ~2× slower for mid-tier MediaTek gives ~54–57 s at k=5–7 (borderline against the 60 s budget). The original notes projected a uniform ~2× speedup across backends; measurements show **CPU matches that projection (~2× total speedup)** but **GPU total speedup is closer to ~1.5×** because decode is bandwidth-bound and gains less from the parameter-count shrink. Either way, CPU-only deployment is finally viable up to mid-range k on the no-GPU device tier — that's the deployment-relevant change from the May 2026 sweep.
 
 The catch — covered in §3 below — is that **GPU only works reliably on Adreno** (Snapdragon). For the bulk of the African deployment fleet (MediaTek + Mali GPUs), **plan as CPU-only** and treat any GPU acceleration as a bonus, not a guarantee.
 
@@ -63,27 +66,29 @@ At hard minimum, the app will install and run but will be vulnerable to OOM kill
 
 ## 2. Backend × model × k feasibility (UX at 60 s budget)
 
-Median total query latency targets, measured on Snapdragon 8 Elite (test device) and extrapolated for mid-tier MediaTek (~2× slower CPU than 8 Elite). E2B numbers are projections (~2× faster than E4B on the same hardware) until we collect actual measurements.
+Median total query latency. Snapdragon 8 Elite rows are measured (see `latency_report_v2.md`). Mid-tier MediaTek rows are extrapolated by scaling CPU latency ~2× slower than Snapdragon 8 Elite — anchored on the published Geekbench gap between Dimensity 8400 / Helio G99 and the Snapdragon 8 Elite, not on an in-house measurement. **Empirical measurement on real MediaTek hardware is the next open question; see §6.**
 
-### Gemma 4 E4B
+### Gemma 4 E4B (measured)
 
 | Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 |
 |---|---|---|---|---|---|
 | **GPU, Snapdragon 8 Elite (Adreno 830)** | 13 s ✅ | 19 s ✅ | 20 s ✅ | 21 s ✅ | 24 s ✅ — **no worry at any k ≤ 15** |
-| **CPU, Snapdragon 8 Elite** | 27 s ✅ | 41 s ✅ | 60 s 🟡 | 70 s ❌ | 85 s ❌ |
-| CPU, mid-tier MediaTek (~2× slower) | ~50 s 🟡 | ~80 s ❌ | — | — | — |
+| **CPU, Snapdragon 8 Elite** | 28 s ✅ | 43 s ✅ | 60 s 🟡 | 69 s ❌ | 85 s ❌ |
+| CPU, mid-tier MediaTek (~2× slower) | ~56 s 🟡 | ~85 s ❌ | — | — | — |
 
 → For E4B: **CPU is unsafe past k=3** on flagship hardware, and unsafe at any k > 0 on mid-tier. GPU works at all k tested.
 
-### Gemma 4 E2B (projected, halve E4B numbers)
+### Gemma 4 E2B (measured 2026-05-16)
+
+Measured E2B is **~1.5× faster than E4B on GPU** (decode is bandwidth-bound, limits the win) and **~2× faster on CPU** (compute-bound — the smaller model's compute reduction translates more directly). See `latency_report_v2.md` for the per-k speedup ratios.
 
-| Backend × hardware tier | k=0 | k=3 | k=5 | k=10 | k=15 |
+| Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 |
 |---|---|---|---|---|---|
-| GPU, Snapdragon 8 Elite | ~6 s ✅ | ~10 s ✅ | ~10 s ✅ | ~11 s ✅ | ~12 s ✅ |
-| CPU, Snapdragon 8 Elite | ~13 s ✅ | ~20 s ✅ | ~30 s ✅ | ~35 s ✅ | ~42 s ✅ |
-| **CPU, mid-tier MediaTek** | ~25 s ✅ | ~40 s ✅ | ~55 s 🟡 | ~70 s ❌ | — |
+| **GPU, Snapdragon 8 Elite (Adreno 830)** | 9 s ✅ | 14 s ✅ | 12 s ✅ | 16 s ✅ | 13 s ✅ — **no worry at any k ≤ 15** |
+| **CPU, Snapdragon 8 Elite** | 14 s ✅ | 21 s ✅ | 27 s ✅ | 26 s ✅ | 37 s ✅ |
+| **CPU, mid-tier MediaTek (~2× slower)** | ~28 s ✅ | ~41 s ✅ | ~54 s 🟡 | ~53 s 🟡 | ~74 s ❌ |
 
-→ For E2B on mid-tier MediaTek CPU, k≤3 is comfortable; k≤5 is borderline. **Empirical measurement still pending.**
+→ For E2B on flagship CPU, **all k ≤ 15 fit a 60 s budget**. On mid-tier MediaTek CPU, **k ≤ 3 is comfortable, k=5–10 is borderline, k=15 exceeds budget.** This is the key deployment unlock: the no-GPU, mid-tier-CPU path is finally viable for typical k.
 
 ---
 
@@ -134,7 +139,7 @@ Combining the SoC distribution data with the floor specs above:
 | $100–$150 low-mid | Tecno Camon, Infinix Hot Pro+, Redmi 13C | Helio G99, Dimensity 6080 | 6 GB | ✅ tight | ✅ comfortable | ⚠️ uncertain |
 | $150–$250 mid | Tecno Camon 30, Infinix Note 40, Redmi Note 13, Samsung A25 | Dimensity 7050/7200/8400 | 8 GB | ✅ | ✅ | ⚠️ uncertain (Mali) |
 | $250+ upper-mid | OnePlus Nord, Samsung A5x | Snapdragon 7+ Gen 3 | 8 GB | ✅ | ✅ | ✅ Adreno |
-| $400+ flagship | OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) |
+| $400+ flagship | OnePlus OPD2413 / OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) |
 
 **Effective deployment-viable hardware floor**: roughly **$120+ retail**, 6 GB RAM, 64 GB storage, any 64-bit chipset from 2022 or later. E2B lowers this to **~$100**, 4 GB RAM.
 
@@ -144,8 +149,9 @@ Combining the SoC distribution data with the floor specs above:
 
 | Question | How to answer | Priority |
 |---|---|---|
-| Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite | Same `benchmark_latency.py` sweep run we did for E4B, with the E2B model swapped in | High — unblocks the E2B-vs-E4B deployment decision |
+| ~~Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite~~ | **Resolved 2026-05-16** — measured E2B CPU is ~2× faster than E4B CPU at every k; see `latency_report_v2.md` cross-model tables. E2B CPU at k=10 = 26 s; k=15 = 37 s; both under 60 s budget on flagship. | ~~High~~ ✅ Done |
 | Does GPU backend engage on real Transsion / MediaTek mid-tier devices? | Borrow / buy a Tecno Camon 30 or Infinix Note 40 and run benchmark with `useGpuForLlm=true`; check `[BACKEND]` log line | High — answers whether GPU is realistic for the deployment majority |
+| Does the mid-tier MediaTek CPU 2× slowdown extrapolation hold in practice? | Once a Tecno/Infinix mid-tier is in hand, run the full k-sweep on CPU and compare to the projected `~2× slower` table in §2 | High — anchors the deployment recommendation on real numbers, not Geekbench-based extrapolation |
 | E2B answer-quality regression vs E4B on safety-critical medical-advice metrics | Re-run `eval_report_app_parity_v1.md` apparatus with E2B model | Critical before any model swap decision |
 | Does Exynos Xclipse driver bug get fixed upstream | Watch LiteRT-LM Issue #2114 | Low — affects ~5% of African market |
 | When does E4B Qualcomm SM8750 NPU artifact ship | Watch `litert-community/` HF repo monthly per Issue #58 | Medium — perf upgrade, not a deployment unblocker |
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index c6745a6..9d26a86 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,26 +1,108 @@
-# MAM-AI On-Device Latency Sweep — GPU vs CPU
+# MAM-AI On-Device Latency Sweep — Model × Backend × k
 
-_Generated: 2026-05-15T10:51:06_
+_Generated: 2026-05-16T09:00:40_
 
 
 ## Device & stack
 
 - **Device**: OnePlus OPD2413 (SM8750P) — Android 15
-- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
+- **Models tested**: Gemma 4 E2B (`gemma-4-E2B-it.litertlm`), Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
 - **LiteRT-LM**: 0.11.0
 - **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU
 - **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000
 
 ## Methodology
 
-Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually.
+Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually.
 
 - `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.
 - `decode` is first-token to last-token.
 - `total_query` is everything: `retrieval + TTFT + decode`.
 - Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).
 
-## Headline — Median total query latency (seconds)
+## Gemma 4 E2B (`gemma-4-E2B-it.litertlm`)
+
+### Median total query latency (seconds)
+
+| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 7.9 / 8.1 / 10.8 | 13.2 / 14.1 / 16.0 | 1.60× |
+| 1 | 561 | 11.4 / 11.8 / 12.8 | 13.0 / 16.3 / 17.5 | 1.35× |
+| 3 | 2098 | 12.8 / 13.8 / 16.5 | 19.1 / 22.0 / 22.5 | 1.44× |
+| 5 | 3547 | 9.9 / 14.2 / 14.0 | 26.3 / 27.6 / 28.6 | 2.36× |
+| 7 | 5139 | 12.8 / 14.3 / 17.6 | 23.5 / 32.0 / 33.2 | 1.87× |
+| 10 | 7482 | 15.2 / 14.6 / 17.9 | 23.4 / 26.2 / 27.7 | 1.68× |
+| 15 | 11297 | 13.0 / 12.4 / 14.8 | 31.0 / 38.2 / 40.7 | 2.80× |
+| 20 | 14520 | 19.3 / 15.8 / 14.3 | 33.4 / 39.8 / 44.5 | 2.28× |
+
+### TTFT (ms, median)
+
+| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 429 | 5564 | 13.0× |
+| 1 | 561 | 412 | 5355 | 13.0× |
+| 3 | 2098 | 445 | 7394 | 16.6× |
+| 5 | 3547 | 793 | 14604 | 18.4× |
+| 7 | 5139 | 819 | 14577 | 17.8× |
+| 10 | 7482 | 1074 | 13635 | 12.7× |
+| 15 | 11297 | 1479 | 21368 | 14.4× |
+| 20 | 14520 | 1722 | 22947 | 13.3× |
+
+### Decode (ms, median)
+
+| k | GPU decode | CPU decode | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 8263 | 8174 | 0.99× |
+| 1 | 7573 | 6764 | 0.89× |
+| 3 | 10223 | 9584 | 0.94× |
+| 5 | 9052 | 9571 | 1.06× |
+| 7 | 10723 | 13451 | 1.25× |
+| 10 | 10713 | 11870 | 1.11× |
+| 15 | 9664 | 9920 | 1.03× |
+| 20 | 11036 | 10697 | 0.97× |
+
+### p95 total query latency (s)
+
+| k | GPU p95 | CPU p95 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 11.4 | 17.4 |
+| 1 | 17.7 | 19.1 |
+| 3 | 19.7 | 35.8 |
+| 5 | 21.2 | 35.1 |
+| 7 | 19.4 | 41.0 |
+| 10 | 23.8 | 37.9 |
+| 15 | 18.1 | 45.2 |
+| 20 | 22.2 | 50.4 |
+
+### Errors (count / 54 runs)
+
+| k | GPU errors | CPU errors |
+|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 0 |
+| 1 | 0 | 0 |
+| 3 | 0 | 0 |
+| 5 | 0 | 0 |
+| 7 | 0 | 0 |
+| 10 | 0 | 0 |
+| 15 | 0 | 0 |
+| 20 | 24 | 24 |
+
+### Wall-clock
+
+| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 17.5 | 22.5 | 1.28× |
+| 1 | 20.9 | 23.9 | 1.14× |
+| 3 | 22.4 | 30.0 | 1.34× |
+| 5 | 21.1 | 34.2 | 1.62× |
+| 7 | 22.8 | 35.5 | 1.56× |
+| 10 | 23.3 | 33.9 | 1.46× |
+| 15 | 21.1 | 41.7 | 1.97× |
+| 20 | 19.1 | 30.4 | 1.59× |
+
+## Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
+
+### Median total query latency (seconds)
 
 | k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |
 |---:|---:|---:|---:|---:|
@@ -33,7 +115,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed
 | 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× |
 | 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× |
 
-## TTFT (ms, median) — prefill cost grows with retrieved-doc content
+### TTFT (ms, median)
 
 | k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |
 |---:|---:|---:|---:|---:|
@@ -46,10 +128,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed
 | 15 | 11297 | 3457 | 54748 | 15.8× |
 | 20 | 14520 | 3986 | 72881 | 18.3× |
 
-## Decode (ms, median) — first token to last token
-
-Decode time mostly tracks output length, not k or doc content. Variation across k reflects 
-the model writing *longer answers* when given more context (more material to draw on).
+### Decode (ms, median)
 
 | k | GPU decode | CPU decode | CPU÷GPU |
 |---:|---:|---:|---:|
@@ -62,7 +141,7 @@ the model writing *longer answers* when given more context (more material to dra
 | 15 | 16820 | 22497 | 1.34× |
 | 20 | 14688 | 22634 | 1.54× |
 
-## p95 total query latency (s) — tail-latency view
+### p95 total query latency (s)
 
 | k | GPU p95 | CPU p95 |
 |---:|---:|---:|
@@ -75,9 +154,9 @@ the model writing *longer answers* when given more context (more material to dra
 | 15 | 30.6 | 112.7 |
 | 20 | 35.3 | 104.9 |
 
-## Errors and the 4096-token context wall
+### Errors (count / 54 runs)
 
-| k | GPU errors / 54 | CPU errors / 54 |
+| k | GPU errors | CPU errors |
 |---:|---:|---:|
 | **0 (no-RAG)** | 0 | 0 |
 | 1 | 0 | 0 |
@@ -88,17 +167,7 @@ the model writing *longer answers* when given more context (more material to dra
 | 15 | 0 | 0 |
 | 20 | 24 | 24 |
 
-At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. 
-Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both 
-backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — 
-the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of 
-the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
-The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.
-
-Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any 
-deployment budget at this depth even when the request fits in the context window.
-
-## Wall-clock comparison
+### Wall-clock
 
 | k | GPU wall (min) | CPU wall (min) | CPU÷GPU |
 |---:|---:|---:|---:|
@@ -111,64 +180,114 @@ deployment budget at this depth even when the request fits in the context window
 | 15 | 32.4 | 90.8 | 2.80× |
 | 20 | 22.8 | 58.6 | 2.57× |
 
+## Cross-model comparison
+
+Each table below compares **Gemma 4 E4B** (baseline) against each comparator model (Gemma 4 E2B). Ratios are reported as **baseline ÷ comparator** at the same backend × k cell, so values **> 1.0× mean the comparator is faster**. Reading the columns: GPU prefill (TTFT) is compute-bound and tracks parameter count closely; GPU decode is bandwidth-bound and gains less from model shrinkage; CPU is compute-bound throughout.
+
+### Gemma 4 E4B vs Gemma 4 E2B
+
+**Total query latency (median, seconds)**
+
+| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio |
+|---:|---:|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 14.4 | 8.7 | 1.66× | 28.0 | 13.9 | 2.01× |
+| 1 | 14.1 | 11.7 | 1.21× | 30.3 | 15.8 | 1.92× |
+| 3 | 19.1 | 14.3 | 1.33× | 42.7 | 20.6 | 2.07× |
+| 5 | 19.6 | 11.6 | 1.70× | 60.2 | 27.2 | 2.21× |
+| 7 | 22.9 | 15.2 | 1.50× | 62.3 | 28.5 | 2.18× |
+| 10 | 22.4 | 15.6 | 1.43× | 69.4 | 26.3 | 2.64× |
+| 15 | 24.4 | 13.1 | 1.86× | 84.9 | 36.8 | 2.31× |
+| 20 | 21.0 | 16.5 | 1.28× | 93.8 | 37.6 | 2.49× |
+
+**TTFT (median, ms)** — prefill speedup
+
+| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio |
+|---:|---:|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 962 | 429 | 2.24× | 12633 | 5564 | 2.27× |
+| 1 | 954 | 412 | 2.32× | 12649 | 5355 | 2.36× |
+| 3 | 989 | 445 | 2.22× | 18356 | 7394 | 2.48× |
+| 5 | 1884 | 793 | 2.38× | 36424 | 14604 | 2.49× |
+| 7 | 1920 | 819 | 2.34× | 36444 | 14577 | 2.50× |
+| 10 | 2523 | 1074 | 2.35× | 40013 | 13635 | 2.93× |
+| 15 | 3457 | 1479 | 2.34× | 54748 | 21368 | 2.56× |
+| 20 | 3986 | 1722 | 2.31× | 72881 | 22947 | 3.18× |
+
+**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU
+
+| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio |
+|---:|---:|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 13470 | 8263 | 1.63× | 15345 | 8174 | 1.88× |
+| 1 | 11415 | 7573 | 1.51× | 13961 | 6764 | 2.06× |
+| 3 | 16364 | 10223 | 1.60× | 19110 | 9584 | 1.99× |
+| 5 | 15929 | 9052 | 1.76× | 21645 | 9571 | 2.26× |
+| 7 | 17215 | 10723 | 1.61× | 23473 | 13451 | 1.75× |
+| 10 | 18118 | 10713 | 1.69× | 21699 | 11870 | 1.83× |
+| 15 | 16820 | 9664 | 1.74× | 22497 | 9920 | 2.27× |
+| 20 | 14688 | 11036 | 1.33× | 22634 | 10697 | 2.12× |
+
+## Errors and the 4096-token context wall
+
+At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every (model × backend) combination tested: 
+`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. 
+Each failure reports `Input token ids are too long. Exceeding the maximum number of tokens allowed: …>= 4096`. 
+Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; the wall is a property of the `.litertlm` model artifacts, not of the parameter count or the backend. **k_max ≈ 17–18** for both models.
+
 ## Key findings
 
+### 1. Prefill (TTFT) speeds up by more than 2× when the parameter count halves, on both backends
+Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** and **~2.3–3.2× on CPU**. Prefill is compute-heavy (one parallel forward pass over the entire prompt), so halving the parameter count halves the compute and the speedup is near-proportional on both backends.
+
+### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU
+Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding weights into compute units — the smaller model helps less than its parameter count would predict. On CPU the constraint is compute, so the speedup tracks the model shrink.
+
+### 3. Total speedup is decode-dominated, hence smaller than the TTFT speedup
+**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks decode rather than prefill. At high k where prefill grows large, total speedup climbs toward the prefill ratio (1.86× GPU at k=15), though the k=20 ratio falls back to 1.28×.
+
+### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier
+E2B CPU is 1.4–2.8× slower than E2B GPU at every k — GPU remains the preferred backend where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), which means devices that previously could *not* deploy MAM-AI at acceptable latency (mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: ship E2B on CPU, restrict k to small values.
 
-### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite
-GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. 
-That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), 
-so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.
-
-### 2. The model's 4096-token context window is the binding ceiling at high k
-k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — 
-the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. 
-Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives 
-the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, 
-not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. 
-Latency is *not* the constraint at the upper end; the model's context window is.
-
-### 3. Latency is not the binding factor on GPU below k=15
-GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
-Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), 
-not by what fits in the latency budget.
-
-### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow
-CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. 
-p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't 
-available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, 
-or **k ≤ 1** if you want sub-40s p95.
-
-### 5. Decode time is content-driven, not k-driven
-Decode time tracks output length. As k grows, the model writes *longer* responses — likely because 
-more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. 
-Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, 
-not compute-bound on this hardware.
+### 5. 4096-token context wall is the binding ceiling at high k
+k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the model artifact, not the runtime, and is **shared between E4B and E2B**. **Latency is not the constraint at the upper end of k — context window is.**
 
 ### 6. TTFT scales linearly with retrieved-doc content past k=3
-On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, 
-CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting 
-the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.
-
-## Data inventory (per `(backend, k)`)
-
-| Backend | k | File | Wall (min) | Runs | Errors |
-|---|---:|---|---:|---:|---:|
-| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 |
-| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 |
-| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 |
-| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 |
-| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
-| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
-| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
-| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 |
-| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
-| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
-| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |
-| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 |
-| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 |
-| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 |
-| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 |
-| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 |
+On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so the prefill story scales predictably. The model shrink translates directly into a TTFT shrink across the whole range.
+
+## Data inventory (per `(model, backend, k)`)
+
+| Model | Backend | k | File | Wall (min) | Runs | Errors |
+|---|---|---:|---|---:|---:|---:|
+| Gemma 4 E2B | CPU | 0 (no-RAG) | `benchmark_20260515T223100.json` | 22.5 | 54 | 0 |
+| Gemma 4 E2B | CPU | 1 | `benchmark_20260515T183910_k1.json` | 23.9 | 54 | 0 |
+| Gemma 4 E2B | CPU | 3 | `benchmark_20260515T190320_k3.json` | 30.0 | 54 | 0 |
+| Gemma 4 E2B | CPU | 5 | `benchmark_20260515T193337_k5.json` | 34.2 | 54 | 0 |
+| Gemma 4 E2B | CPU | 7 | `benchmark_20260515T200805_k7.json` | 35.5 | 54 | 0 |
+| Gemma 4 E2B | CPU | 10 | `benchmark_20260515T204358_k10.json` | 33.9 | 54 | 0 |
+| Gemma 4 E2B | CPU | 15 | `benchmark_20260515T211813_k15.json` | 41.7 | 54 | 0 |
+| Gemma 4 E2B | CPU | 20 | `benchmark_20260515T220014_k20.json` | 30.4 | 54 | 24 |
+| Gemma 4 E2B | GPU | 0 (no-RAG) | `benchmark_20260515T175744.json` | 17.5 | 54 | 0 |
+| Gemma 4 E2B | GPU | 1 | `benchmark_20260515T152447_k1.json` | 20.9 | 54 | 0 |
+| Gemma 4 E2B | GPU | 3 | `benchmark_20260515T154608_k3.json` | 22.4 | 54 | 0 |
+| Gemma 4 E2B | GPU | 5 | `benchmark_20260515T160846_k5.json` | 21.1 | 54 | 0 |
+| Gemma 4 E2B | GPU | 7 | `benchmark_20260515T163011_k7.json` | 22.8 | 54 | 0 |
+| Gemma 4 E2B | GPU | 10 | `benchmark_20260515T165316_k10.json` | 23.3 | 54 | 0 |
+| Gemma 4 E2B | GPU | 15 | `benchmark_20260515T171649_k15.json` | 21.1 | 54 | 0 |
+| Gemma 4 E2B | GPU | 20 | `benchmark_20260515T173816_k20.json` | 19.1 | 54 | 24 |
+| Gemma 4 E4B | CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 |
+| Gemma 4 E4B | CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 |
+| Gemma 4 E4B | CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 |
+| Gemma 4 E4B | CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 |
+| Gemma 4 E4B | CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
+| Gemma 4 E4B | CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
+| Gemma 4 E4B | CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
+| Gemma 4 E4B | CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 |
+| Gemma 4 E4B | GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
+| Gemma 4 E4B | GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
+| Gemma 4 E4B | GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |
+| Gemma 4 E4B | GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 |
+| Gemma 4 E4B | GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 |
+| Gemma 4 E4B | GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 |
+| Gemma 4 E4B | GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 |
+| Gemma 4 E4B | GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 |
 
 ---
 
diff --git a/evaluation/runbooks/e2b_sweep.md b/evaluation/runbooks/e2b_sweep.md
new file mode 100644
index 0000000..80fbc5b
--- /dev/null
+++ b/evaluation/runbooks/e2b_sweep.md
@@ -0,0 +1,240 @@
+# Runbook: Gemma 4 E2B Latency Sweep
+
+Self-contained instructions for finishing the E2B latency sweep started by another session. **Phase 1 (setup) is already complete on branch `feat/e2b-latency-sweep`.** Your job is Phase 2 (GPU sweep), Phase 3 (CPU sweep), and Phase 4 (analysis + local commits, **no push**). Expected wall-clock: **~5 hours**.
+
+> **Reading this after PR #59 merged into main?** PR #59 intentionally **reverted** the production `llm_model` in `config/app_config.json` back to `gemma-4-E4B-it.litertlm` — the latency sweep does not authorize a deployment swap (the kenya_vignettes / AfriMed-QA SAQ safety eval is the gate for that). To re-run this benchmark on a future branch:
+>
+> 1. Edit `config/app_config.json` and set `"llm_model": "gemma-4-E2B-it.litertlm"`.
+> 2. Rebuild and install (from the repo root): `(cd app && flutter build apk --release) && adb install -r app/build/app/outputs/flutter-apk/app-release.apk`.
+> 3. Run the sweep (Phase 2 + Phase 3 below).
+> 4. **Revert** the `config/app_config.json` change before opening any PR.
+>
+> With the config flipped, the new benchmark JSONs will record `config.model == "gemma-4-E2B-it.litertlm"` as expected by the Phase 1 verification step in §1. Phase 1's commit-log check (looking for `3042d38 config: switch llm_model to Gemma 4 E2B`) was written when that commit was the tip; on a post-merge replay the same SHA will still be reachable, just deeper in the log.
+
+## 0. Context — read this first
+
+- This work mirrors the E4B latency sweep that landed in PR #57 (commit `1be0a55` on `main`). The E4B results are in `evaluation/reports/latency_report_v2.md` and the device-compatibility analysis is in `evaluation/reports/device_compatibility_notes.md`.
+- We're now measuring the **smaller** Gemma 4 E2B variant (~2 GB instead of E4B's 3.66 GB) to find out how much faster it is in real terms on the same hardware. Same 16 measurements as E4B: 8 GPU (k ∈ {1, 3, 5, 7, 10, 15, 20} + No-RAG) + 8 CPU.
+- Test device: **OnePlus OPD2413 (Snapdragon 8 Elite, SM8750P)** connected via ADB — that's the firmware-reported manufacturer (`device.manufacturer="OnePlus"` in the benchmark JSONs); the same OPD2413 hardware ships under the OPPO brand in some markets. The OPPO/OnePlus Hans battery-optimization whitelist is **already configured** by the user — don't re-do it.
+- The benchmark infrastructure is in `evaluation/benchmark_latency.py`; the aggregator is `evaluation/aggregate_k_sweep.py`. Both are already correct for this work, with one expected exception in Phase 4 (the aggregator needs a `model` dimension added).
+
+### Why this is a runbook and not a single Bash command
+
+Each benchmark run takes **12–20 minutes wall-clock** (E2B is ~1.5× faster than E4B based on the smoke test, not 2×). You can't realistically loop them in one foreground shell command; bash timeouts cap at 10 minutes in our tooling. Use `Bash run_in_background: true` and **wait for the harness completion notification** between runs. Don't use `tail -F`, sleep loops, or watchdog patterns — those caused the previous subagent to bail at 87 seconds.
+
+---
+
+## 1. Verify Phase 1 state — fail loud if anything's missing
+
+Run these checks before touching anything:
+
+```bash
+cd ~/Downloads/mamai
+git status                       # should show clean working tree on branch feat/e2b-latency-sweep
+git log --oneline -3             # should show 3042d38, 976a8ac at the top
+```
+
+Expected log:
+```
+3042d38 config: switch llm_model to Gemma 4 E2B
+976a8ac fix(benchmark): read model name from app_config asset
+a2205ff docs: device compatibility notes — which phones can run E4B / E2B
+```
+
+```bash
+ls -lh device_push/models/gemma-4-E2B-it.litertlm   # ~2.4–2.6 GB
+adb devices                                          # should show one device
+adb shell ls /storage/emulated/0/Android/data/com.example.app/files/
+# expect to see: gemma-4-E2B-it.litertlm, gemma-4-E4B-it.litertlm,
+#                Gecko_1024_quant.tflite, embeddings.sqlite, sentencepiece.model
+```
+
+The smoke-test JSON from Phase 1 is at `evaluation/latency_results/benchmark_20260515T150531_k3.json`. Verify it has `config.model == "gemma-4-E2B-it.litertlm"`, `config.backend == "GPU"`, no errors, total latency 11036 ms.
+
+If any of these checks fails, **stop and ask the user** — the state has drifted from what this runbook assumes.
+
+---
+
+## 2. Phase 2 — GPU sweep (~1.5–2 hours)
+
+The GPU APK from Phase 1 is already installed. Run these 8 benchmarks **sequentially**, each via `Bash run_in_background: true`, waiting for the harness completion notification before launching the next:
+
+```bash
+cd ~/Downloads/mamai
+python evaluation/benchmark_latency.py --retrieve-k 1 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 3 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 5 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 7 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 10 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 15 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --retrieve-k 20 --rag-only --cooldown 10000
+python evaluation/benchmark_latency.py --no-retrieval --cooldown 10000
+```
+
+### How to actually do this with the Bash tool
+
+For each command:
+
+1. Call `Bash` with `run_in_background: true` and the command. Save the returned task ID.
+2. **Stop and wait.** The harness will send a `<task-notification>` message when the python process exits. That's your signal to continue.
+3. When the notification arrives, read the resulting JSON in `evaluation/latency_results/benchmark_*_k{N}.json` (or for No-RAG, no `_kN` suffix).
+4. Verify the JSON:
+   - `config.model == "gemma-4-E2B-it.litertlm"`
+   - `config.backend == "GPU"`
+   - `len(results)` == 54
+   - Errors == 0 (except k=20, which is expected to produce 24 errors: the same 8 queries × 3 reps that hit the 4096-token wall in the E4B sweep)
+   - TTFT median in the 500–2000 ms range
+5. If anything looks wildly off, stop and report. Otherwise proceed to the next k.
+
+Per-run wall-clock estimate (based on E4B GPU being ~12–30 min per k, and E2B being ~1.5× faster):
+- ~10–15 min per k for small k
+- ~12–20 min per k for k ≥ 10
+
+Total Phase 2: **~1.5–2 hours**.
+
+### Optional progress visibility (not required)
+
+If you want occasional progress pings while a benchmark runs, you can launch a `Monitor` with timeout=3600000ms (1 hour) and a poll command that greps `adb logcat -d` for `mam-ai-bench` lines. Examples in the PR #57 history. But this is just visibility — the harness completion notification is what gates "move on."
+
+### Checkpoint 2 — what to report when Phase 2 is done
+
+After all 8 GPU JSONs land, summarize:
+- List of file names produced
+- Per-run wall-clock in minutes (the `total_benchmark_time_ms` field divided by 60000)
+- Error counts (should be all zero except k=20)
+- Quick comparison to E4B GPU baseline: did E2B run roughly 1.3–1.7× faster overall? Numbers from `evaluation/reports/latency_report_v2.md` are easy to compare against.
+
+Then **stop and ask the user** before starting Phase 3.
+
+---
+
+## 3. Phase 3 — CPU rebuild + sweep (~3 hours)
+
+### 3a. Switch to CPU build
+
+```bash
+cd ~/Downloads/mamai/app
+flutter build apk --release -PuseGpuForLlm=false
+```
+
+(foreground `Bash` with `timeout: 600000` — should complete in ~30 sec since artifacts are cached). Verify that the build output reports `Built build/app/outputs/flutter-apk/app-release.apk`.
+
+```bash
+adb install -r ~/Downloads/mamai/app/build/app/outputs/flutter-apk/app-release.apk
+```
+
+(foreground `Bash`, ~2 min). The `-r` flag preserves existing model files on the device. Verify with `adb shell ls /storage/emulated/0/Android/data/com.example.app/files/` — it should still show `gemma-4-E2B-it.litertlm` alongside the other files listed in §1.
+
+### 3b. CPU smoke test
+
+```bash
+cd ~/Downloads/mamai
+python evaluation/benchmark_latency.py --filter medium_01 --repeats 1 --rag-only --retrieve-k 3 --cooldown 5000
+```
+
+(background, wait for notification). Verify the resulting JSON has `config.backend == "CPU"` and `config.model == "gemma-4-E2B-it.litertlm"`. Expected total latency ~25–30 s (E2B CPU at k=3 should be ~1.5× faster than E4B CPU's ~37–44 s).
+
+If smoke test passes, proceed.
+
+### 3c. CPU sweep — same 8 benchmarks
+
+Identical command list as Phase 2. Same background + notification-wait pattern. Per-run expected ~20 min (E2B CPU is ~1.5× faster than E4B CPU's ~40–90 min).
+
+Verify each JSON: backend=CPU, model=E2B, run count, error count.
+
+### Checkpoint 3 — what to report
+
+Same shape as Checkpoint 2: file names, per-run wall-clock, error counts, comparison to E4B CPU baseline.
+
+Stop and ask the user before starting Phase 4.
+
+---
+
+## 4. Phase 4 — Analysis + local commits (do NOT push)
+
+### 4a. Update the aggregator to handle two models
+
+Currently `evaluation/aggregate_k_sweep.py` groups by `(backend, k)`. With E4B and E2B both present, the matrix would collapse them into the same cells. **Add a `model` dimension**: change the grouping to `(model, backend, k)`.
+
+Key places to touch:
+- `load_runs()` — append `"model": d["config"].get("model") or DEFAULT_E4B_MODEL` to each run dict. The pre-fix E4B GPU JSONs (the ones in `PRE_FIX_GPU_FILES`) predate the model-recording fix and don't have `config.model` either, so they should default to `"gemma-4-E4B-it.litertlm"`. Add a `PRE_FIX_E4B_FILES` allowlist similar to `PRE_FIX_GPU_FILES`, or just bake it into a single `_legacy_default_for(filename)` helper.
+- The `matrix` dict in `write_report()` — change the key from `(backend, k)` to `(model, backend, k)`.
+- Each table that loops over `all_ks` needs to also loop over models, or you can produce a table per model.
+
+Expected size of change: ~50 LOC. Run `python3 evaluation/aggregate_k_sweep.py` and verify it loads all 32 canonical runs (16 E4B + 16 E2B).
+
+### 4b. Update `latency_report_v2.md`
+
+Add an **E4B vs E2B comparison** section. Key tables:
+- Median total query latency: rows = k, columns = `{E4B GPU, E2B GPU, E2B÷E4B ratio, E4B CPU, E2B CPU, E2B÷E4B ratio}`. One table per category (short / medium / long), or a single overall table.
+- TTFT comparison same shape.
+- Decode comparison same shape — this is where we expect E2B's gain to be smallest (decode is bandwidth-bound).
+
+Update the **Key findings** section to reflect the measured ratio. The smoke test suggested ~1.5× (not the 2× originally projected). Decode being bandwidth-bound is the architectural reason — call that out.
+
+Update the document title from "GPU vs CPU" to something like "Model × Backend × k" or "Latency Sweep — Gemma 4 E2B vs E4B, GPU vs CPU".
+
+### 4c. Update `device_compatibility_notes.md`
+
+- Section §6 "Open questions": mark "Actual E2B CPU latency" as **resolved** with real numbers from the new sweep.
+- Section §2 "Backend × model × k feasibility": replace the **projected** E2B table with real measurements. Specifically, replace the row "CPU, mid-tier MediaTek (~2× slower)", which was an extrapolation — the new data lets us anchor more precisely.
+- TL;DR section: refine any rule-of-thumb that was based on the wrong 2× ratio. The actual ratio is ~1.5× — adjust deployment recommendations if anything changes.
+
+### 4d. Commit, do NOT push
+
+Make focused commits matching the PR #57 style. Suggested split (your call on exact phrasing):
+
+1. `analysis: aggregate_k_sweep.py — add model dimension to matrix`
+2. `analysis: regenerate latency_report_v2.md with E2B columns`
+3. `docs: update device_compatibility_notes.md with E2B measurements`
+
+After all commits, run `git log --oneline origin/main..HEAD` and report the commit list to the user. **Do not push.**
+
+---
+
+## 5. Failure-mode guidance
+
+| Symptom | Action |
+|---|---|
+| Bash command times out (foreground) | Use background mode + notification wait instead. Foreground is for builds/installs only. |
+| Background task takes 30+ min with no completion notification | Run `pgrep -af benchmark_latency.py` to verify python is still alive. If it is, keep waiting. If not, the benchmark died — read the task's output file and report. |
+| Benchmark JSON missing fields (no `config.model`, wrong backend, etc.) | Stop. The build or install drifted. |
+| Hans freeze events in logcat (`OplusHansManager: freeze ... scene: LcdOff`) | Shouldn't happen — the foreground-service + whitelist fix is in main. If it does, the whitelist may have been reset by a system update. Stop and ask the user to re-verify it in Settings. |
+| App on device dies between benchmarks | Check `adb shell pm list packages \| grep com.example.app`. If missing, the install was rolled back somehow — stop. |
+| Smoke-test totals wildly off (e.g. >60s at k=3 GPU, or >5s for No-RAG) | Stop. Something is wrong with the build or backend selection. |
+
+For any "wildly off" result, stop and report rather than auto-retry. The user can decide whether to re-do the run or investigate.
+
+---
+
+## 6. Constraints
+
+- **Branch**: `feat/e2b-latency-sweep` only. Don't push to origin. Don't rebase or amend `main` or anything earlier than your own commits.
+- **Don't touch the mamaretrieval repo.** All work happens in mamai.
+- **Don't change scope.** If the plan is ambiguous on something specific, stop and ask the user rather than improvising. Specifically, don't:
+  - Change the k-value list (must be 1, 3, 5, 7, 10, 15, 20 + No-RAG to match E4B)
+  - Change the cooldown (10000 ms)
+  - Skip the CPU smoke test
+  - Skip any of the 4 deliverable updates in Phase 4
+- **Commit style**: match the PR #57 style (`feat:`, `fix:`, `analysis:`, `docs:` prefixes; concise subject line; body explaining "why" not "what").
+- **Don't push.** The final state is "16 new JSONs landed, scripts and reports updated, all committed locally on `feat/e2b-latency-sweep`, branch ready for human review and PR creation."
+
+---
+
+## 7. Final-deliverable checklist
+
+Before declaring done, verify:
+
+- [ ] 16 new benchmark JSONs in `evaluation/latency_results/` — 8 with backend=GPU, 8 with backend=CPU, all with model=E2B
+- [ ] `aggregate_k_sweep.py` updated to handle `(model, backend, k)` grouping; loads all 32 canonical runs without errors
+- [ ] `latency_report_v2.md` regenerated and updated with E4B vs E2B narrative
+- [ ] `device_compatibility_notes.md` updated to reflect measured E2B numbers
+- [ ] All changes committed in focused commits on `feat/e2b-latency-sweep`
+- [ ] **Branch not pushed**
+- [ ] Summary report: commit list, headline findings (per-backend E2B vs E4B median latency at k=3, k=10), any anomalies observed
+
+When done, hand back to the user for review + PR creation.
+
+---
+
+_Last updated: 2026-05-15. Phase 1 commits already on the branch: `976a8ac` (model-from-config fix), `3042d38` (config switch to E2B). Phase 1 smoke test: `benchmark_20260515T150531_k3.json`, total 11036 ms at k=3 GPU._