diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index e1ee93c..fbc5ba8 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -381,7 +381,21 @@ class BenchmarkForegroundService : Service() { put("rag_only", ragOnly) put("query_filter", queryFilter ?: JSONObject.NULL) put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) - put("model", "gemma-4-E4B-it.litertlm") + // Read model name from the same app_config.json asset the RagPipeline uses, + // so the JSON metadata reflects whatever model is actually loaded rather than + // a hardcoded string that goes stale when we switch model artifacts. + // Wrapped in try/catch: this read runs at the END of the benchmark when we + // serialize all results — an asset/parse error here would discard 20+ minutes + // of completed runs that are still in-memory. Better to ship an "unknown" tag + // and preserve the timing data than lose the whole sweep. + put("model", try { + JSONObject( + application.assets.open("app_config.json").bufferedReader().use { it.readText() } + ).getString("llm_model") + } catch (e: Exception) { + Log.w("mam-ai-bench", "[BENCHMARK] Failed to read llm_model from app_config.json — recording 'unknown': $e") + "unknown" + }) // Read backend from BuildConfig at compile time. Older builds // hard-coded "CPU" here even when GPU was active — fixed so the // JSON metadata matches reality. diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index d11e390..d911392 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -1,15 +1,19 @@ #!/usr/bin/env python3 -"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report. 
+"""Aggregate per-k latency-sweep JSONs into a single model × backend × k report. Reads all benchmark_*.json files produced by benchmark_latency.py, groups them -by (backend, k_override), and writes a markdown report at +by (model, backend, k_override), and writes a markdown report at evaluation/reports/latency_report_v2.md. Notes on backend identification: post-fix benchmark JSONs (commit ef96538 onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep JSONs hard-code `backend="CPU"` even though they were measured on GPU; we backfill those using an explicit filename allowlist (see `backend_of`). -Future runs of any backend are unaffected. + +Notes on model identification: post-fix JSONs (commit 976a8ac onward) record +`config.model` from the app asset; earlier runs do not. For any JSON missing +`config.model` we default to `gemma-4-E4B-it.litertlm` since the only sweeps +that predate the fix were E4B. Future runs of any model are unaffected. """ from __future__ import annotations @@ -47,6 +51,19 @@ def backend_of(filename: str, recorded: str) -> str: return recorded +# Default model for any pre-fix JSON missing config.model. All such files in +# the current repo are E4B; this default is purely defensive in case an old +# JSON resurfaces. New runs always record their own model. 
+LEGACY_DEFAULT_MODEL = "gemma-4-E4B-it.litertlm" + + +def model_of(filename: str, recorded: str | None) -> str: + """Trust the recorded model; default to E4B for legacy JSONs that lack it.""" + if recorded is not None: + return recorded + return LEGACY_DEFAULT_MODEL + + def load_runs() -> list[dict]: files = sorted(glob.glob(os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -95,9 +112,19 @@ def load_runs() -> list[dict]: ) recorded_backend = "CPU" backend = backend_of(os.path.basename(f), recorded_backend) + recorded_model = d["config"].get("model") + if recorded_model is None: + print( + f"WARN: {os.path.basename(f)} has no config.model field; " + f"defaulting to {LEGACY_DEFAULT_MODEL}. If this was a " + "different model, the JSON predates the model-recording fix.", + file=sys.stderr, + ) + model = model_of(os.path.basename(f), recorded_model) runs.append({ "file": os.path.basename(f), "timestamp": ts, + "model": model, "backend": backend, "k": k_label, "data": d, @@ -167,79 +194,35 @@ def fmt_s(v: int | None) -> str: return f"{v / 1000:.1f}" if v is not None else "—" -def write_report(runs: list[dict], out_path: Path) -> None: - # Build {(backend, k) -> latest canonical run} - matrix: dict[tuple[str, int], dict] = {} - for r in runs: - key = (r["backend"], r["k"]) - if key in matrix: - # Keep the run with most successful entries (resolves duplicates) - ex = matrix[key] - ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) - r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) - if r_ok > ex_ok: - matrix[key] = r - else: - matrix[key] = r +def _short_model_label(model: str) -> str: + """Human-friendly short label, e.g. 
'Gemma 4 E4B' for 'gemma-4-E4B-it.litertlm'.""" + if "E4B" in model: + return "Gemma 4 E4B" + if "E2B" in model: + return "Gemma 4 E2B" + return model - gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"]) - cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"]) - all_ks = sorted(set(gpu_ks + cpu_ks)) - # Sample run for device info - sample = next(iter(matrix.values())) - dev = sample["data"]["device"] +def _write_per_model_section( + md: list[str], matrix: dict, model: str, all_ks: list[int] +) -> None: + """Emit the six per-model tables (headline / TTFT / decode / p95 / errors / wall-clock). - md = [] - md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n") - md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") - md.append("") - md.append("## Device & stack\n") - md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") - md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)") - md.append(f"- **LiteRT-LM**: 0.11.0") - md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") - md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") - md.append("") - # Pull the actual values from the sample run's config instead of hard-coding - # text that can lie. If different runs used different settings, this won't - # catch that — but we'd rather report the sample's truth than fabricate a - # round-number claim. - sample_cfg = sample["data"].get("config", {}) - sample_repeats = sample_cfg.get("repeats", "?") - sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 - sample_n_results = len(sample["data"]["results"]) - # Infer queries × modes from total runs / repeats. Default to "?" if the - # math doesn't divide evenly. - queries_x_modes: object = "?" 
- if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: - queries_x_modes = sample_n_results // sample_repeats - md.append("## Methodology\n") - md.append( - f"Per backend × k configuration: {queries_x_modes} (query × mode) cells " - f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " - f"No-RAG baseline per backend (k=0 via `--no-retrieval`). " - f"{sample_cooldown_s:g}-second cooldown between runs for thermal " - "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so " - "the run survives screen-off and device-lock; OPPO Hans whitelist set " - "manually." - ) - md.append("") - md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") - md.append("- `decode` is first-token to last-token.") - md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") - md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") - md.append("") + Each table follows the same `(GPU, CPU, ratio)` shape as the original + single-model report; we just scope to one model at a time. 
+ """ + label = _short_model_label(model) + md.append(f"## {label} (`{model}`)\n") - # ─────────── Headline table: total_query_ms by (backend, k) ─────────── - md.append("## Headline — Median total query latency (seconds)\n") - md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|---:|") + md.append("### Median total query latency (seconds)\n") + md.append("| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") + md.append("|---:|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) - # doc chars: take from GPU if available, else CPU - doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) gpu_cells = "—" cpu_cells = "—" if gpu_run: @@ -248,152 +231,293 @@ def write_report(runs: list[dict], out_path: Path) -> None: if cpu_run: c_ = aggregate_per_category(cpu_run["data"], "total_query_ms") cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"]) - # ratio ratio = "" if gpu_run and cpu_run: gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median") cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median") if gov is not None and cov is not None and gov > 0: ratio = f"{cov / gov:.2f}×" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") md.append("") - # ─────────── TTFT detail ─────────── - md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n") - 
md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|---:|") + md.append("### TTFT (ms, median)\n") + md.append("| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") + md.append("|---:|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) - doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None - # Explicit None checks; also guard against div-by-zero on a 0 median. ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") - # ─────────── Decode detail ─────────── - md.append("## Decode (ms, median) — first token to last token\n") - md.append("Decode time mostly tracks output length, not k or doc content. 
Variation across k reflects ") - md.append("the model writing *longer answers* when given more context (more material to draw on).") - md.append("") - md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|") + md.append("### Decode (ms, median)\n") + md.append("| k | GPU decode | CPU decode | CPU÷GPU |") + md.append("|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") - # ─────────── p95 totals ─────────── - md.append("## p95 total query latency (s) — tail-latency view\n") - md.append(f"| k | GPU p95 | CPU p95 |") - md.append(f"|---:|---:|---:|") + md.append("### p95 total query latency (s)\n") + md.append("| k | GPU p95 | CPU p95 |") + md.append("|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | 
{fmt_s(gv)} | {fmt_s(cv)} |") md.append("") - # ─────────── Errors / context limit ─────────── - md.append("## Errors and the 4096-token context wall\n") - md.append(f"| k | GPU errors / 54 | CPU errors / 54 |") - md.append(f"|---:|---:|---:|") + md.append("### Errors (count / 54 runs)\n") + md.append("| k | GPU errors | CPU errors |") + md.append("|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") - md.append("") - md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ") - md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ") - md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ") - md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ") - md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
") - md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.") - md.append("") - md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ") - md.append("deployment budget at this depth even when the request fits in the context window.") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") md.append("") - # ─────────── Wall-clock comparison ─────────── - md.append("## Wall-clock comparison\n") + md.append("### Wall-clock\n") md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |") md.append("|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None gw_s = f"{gw:.1f}" if gw is not None else "—" cw_s = f"{cw:.1f}" if cw is not None else "—" ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {gw_s} | {cw_s} | {ratio} |") + md.append("") + - # Findings / interpretation +def _write_cross_model_table( + md: list[str], + matrix: dict, + baseline_model: str, + other_model: str, + all_ks: list[int], + metric: str, + fmt: callable, +) -> None: + """Emit one E4B-vs-E2B comparison table for the given metric. + + Layout: `| k | E4B GPU | E2B GPU | GPU ratio | E4B CPU | E2B CPU | CPU ratio |`. + Ratio is baseline÷other (so >1 means the other model is faster). 
+ """ + b_label = _short_model_label(baseline_model) + o_label = _short_model_label(other_model) + md.append( + f"| k | {b_label} GPU | {o_label} GPU | GPU ratio | " + f"{b_label} CPU | {o_label} CPU | CPU ratio |" + ) + md.append("|---:|---:|---:|---:|---:|---:|---:|") + for k in all_ks: + cells = [] + for backend in ("GPU", "CPU"): + base_run = matrix.get((baseline_model, backend, k)) + other_run = matrix.get((other_model, backend, k)) + base_v = aggregate_overall(base_run["data"], metric).get("median") if base_run else None + other_v = aggregate_overall(other_run["data"], metric).get("median") if other_run else None + ratio = "" + if base_v is not None and other_v is not None and other_v > 0: + ratio = f"{base_v / other_v:.2f}×" + cells.extend([fmt(base_v), fmt(other_v), ratio]) + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | " + " | ".join(cells) + " |") md.append("") - md.append("## Key findings\n") + + +def write_report(runs: list[dict], out_path: Path) -> None: + # Build {(model, backend, k) -> latest canonical run}. If two runs collide + # on the same key (e.g. a re-run on the same day), keep the one with the + # most successful entries — that's almost always the longer, cleaner sweep. + matrix: dict[tuple[str, str, int], dict] = {} + for r in runs: + key = (r["model"], r["backend"], r["k"]) + if key in matrix: + ex = matrix[key] + ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) + r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) + if r_ok > ex_ok: + matrix[key] = r + else: + matrix[key] = r + + if not matrix: + # latency_results/ is gitignored, so a fresh checkout can hit this. Exit + # with a directional error rather than crashing on StopIteration below. + results_dir = Path(__file__).resolve().parent / "latency_results" + raise SystemExit( + f"No canonical benchmark_*.json found under {results_dir}. 
" + "Run `python evaluation/benchmark_latency.py …` to produce JSONs " + "(see evaluation/runbooks/ for the sweep procedure), then re-run " + "this aggregator." + ) + + models = sorted(set(m for (m, _b, _k) in matrix.keys())) + all_ks = sorted(set(k for (_m, _b, k) in matrix.keys())) + + sample = next(iter(matrix.values())) + dev = sample["data"]["device"] + + md: list[str] = [] + md.append("# MAM-AI On-Device Latency Sweep — Model × Backend × k\n") + md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") md.append("") - md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite") - md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ") - md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ") - md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.") + md.append("## Device & stack\n") + md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") + md.append(f"- **Models tested**: " + ", ".join(f"{_short_model_label(m)} (`{m}`)" for m in models)) + md.append("- **LiteRT-LM**: 0.11.0") + md.append("- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") + md.append("- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") + md.append("") + sample_cfg = sample["data"].get("config", {}) + sample_repeats = sample_cfg.get("repeats", "?") + sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 + sample_n_results = len(sample["data"]["results"]) + queries_x_modes: object = "?" 
+ if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: + queries_x_modes = sample_n_results // sample_repeats + md.append("## Methodology\n") + md.append( + f"Per (model × backend × k) configuration: {queries_x_modes} (query × mode) cells " + f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " + f"No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). " + f"{sample_cooldown_s:g}-second cooldown between runs for thermal " + "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so " + "the run survives screen-off and device-lock; OPPO Hans whitelist set " + "manually." + ) + md.append("") + md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") + md.append("- `decode` is first-token to last-token.") + md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") + md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") + md.append("") + + # ─────────── Per-model sections ─────────── + for m in models: + _write_per_model_section(md, matrix, m, all_ks) + + # ─────────── Cross-model comparison ─────────── + # Use E4B as baseline when present; ratio is baseline/other so >1 means + # the (smaller) comparator model is faster on that cell. + if len(models) > 1: + baseline = "gemma-4-E4B-it.litertlm" if "gemma-4-E4B-it.litertlm" in models else models[0] + others = [m for m in models if m != baseline] + others_label = ", ".join(_short_model_label(m) for m in others) + md.append("## Cross-model comparison\n") + md.append( + f"Each table below compares **{_short_model_label(baseline)}** " + f"(baseline) against each comparator model ({others_label}). " + "Ratios are reported as **baseline ÷ comparator** at the same " + "backend × k cell, so values **> 1.0× mean the comparator is faster**. 
" + "Reading the columns: GPU prefill (TTFT) is compute-bound and tracks " + "parameter count closely; GPU decode is bandwidth-bound and gains less " + "from model shrinkage; CPU is compute-bound throughout." + ) + md.append("") + for other in others: + md.append(f"### {_short_model_label(baseline)} vs {_short_model_label(other)}") + md.append("") + md.append("**Total query latency (median, seconds)**") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "total_query_ms", fmt_s) + md.append("**TTFT (median, ms)** — prefill speedup") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "ttft_ms", fmt_ms) + md.append("**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "decode_ms", fmt_ms) + + md.append("## Errors and the 4096-token context wall\n") + md.append("At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every " + "(model × backend) combination tested: ") + md.append("`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. ") + md.append("Each failure reports `Input token ids are too long. Exceeding the maximum " + "number of tokens allowed: …>= 4096`. ") + md.append("Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; " + "the wall is a property of the `.litertlm` artifact format, not the " + "parameter count or backend. **k_max ≈ 17–18** for both models.") + md.append("") + + md.append("## Key findings\n") + md.append("### 1. Prefill (TTFT) scales ~2× with parameter count on both backends") + md.append("Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** " + "and **~2.3–3.2× on CPU**. 
Prefill is compute-heavy (one parallel forward pass over the " + "entire prompt), so halving the parameter count halves the compute and the speedup is " + "near-proportional on both backends.") md.append("") - md.append("### 2. The model's 4096-token context window is the binding ceiling at high k") - md.append("k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — ") - md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ") - md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ") - md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ") - md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ") - md.append("Latency is *not* the constraint at the upper end; the model's context window is.") + md.append("### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU") + md.append("Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is " + "sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding " + "weights into compute units — the smaller model helps less than its parameter count " + "would predict. On CPU the constraint is compute, so the speedup tracks the model shrink.") md.append("") - md.append("### 3. Latency is not the binding factor on GPU below k=15") - md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ") - md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ") - md.append("not by what fits in the latency budget.") + md.append("### 3. Total speedup is decode-dominated, hence smaller than TTFT") + md.append("**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. 
Total = TTFT + decode + retrieval; since " + "decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks " + "decode rather than prefill. At high k where prefill grows large, total speedup climbs " + "toward the prefill ratio (~1.7–1.9× GPU at k=15+).") md.append("") - md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow") - md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ") - md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ") - md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ") - md.append("or **k ≤ 1** if you want sub-40s p95.") + md.append("### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier") + md.append("E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend " + "where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), " + "which means devices that previously could *not* deploy MAM-AI at acceptable latency " + "(mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: " + "ship E2B on CPU, restrict k to small values.") md.append("") - md.append("### 5. Decode time is content-driven, not k-driven") - md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ") - md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ") - md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ") - md.append("not compute-bound on this hardware.") + md.append("### 5. 4096-token context wall is the binding ceiling at high k") + md.append("k=15 works cleanly on all four (model × backend) combinations. 
k=20 fails identically " + "across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the " + "model artifact, not the runtime, and is **shared between E4B and E2B**. " + "**Latency is not the constraint at the upper end of k — context window is.**") md.append("") md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3") - md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ") - md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ") - md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.") + md.append("On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so " + "the prefill story scales predictably. The model shrink translates directly into a TTFT " + "shrink across the whole range.") md.append("") # File inventory - md.append("## Data inventory (per `(backend, k)`)\n") - md.append("| Backend | k | File | Wall (min) | Runs | Errors |") - md.append("|---|---:|---|---:|---:|---:|") - for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])): - r = matrix[(b, k)] + md.append("## Data inventory (per `(model, backend, k)`)\n") + md.append("| Model | Backend | k | File | Wall (min) | Runs | Errors |") + md.append("|---|---|---:|---|---:|---:|---:|") + for (m, b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1], x[2])): + r = matrix[(m, b, k)] wall = r["data"]["total_benchmark_time_ms"] / 60000 n = len(r["data"]["results"]) e = sum(1 for x in r["data"]["results"] if x.get("error")) - label = "0 (no-RAG)" if k == 0 else str(k) - md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") + k_label = "0 (no-RAG)" if k == 0 else str(k) + md.append(f"| {_short_model_label(m)} | {b} | {k_label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") md.append("") md.append("---") md.append("") @@ -407,7 +531,8 @@ def 
write_report(runs: list[dict], out_path: Path) -> None: def main() -> int: runs = load_runs() - print(f"Loaded {len(runs)} canonical runs") + models = sorted(set(r["model"] for r in runs)) + print(f"Loaded {len(runs)} canonical runs across {len(models)} models: {', '.join(models)}") out = Path(__file__).resolve().parent / "reports" / "latency_report_v2.md" write_report(runs, out) return 0 diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md index 7cef7a7..8213775 100644 --- a/evaluation/reports/device_compatibility_notes.md +++ b/evaluation/reports/device_compatibility_notes.md @@ -1,12 +1,15 @@ # MAM-AI Device Compatibility — On Which Phones the Model Can Run -_Last updated: 2026-05-15. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._ +_Last updated: 2026-05-16. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._ -## TL;DR — three load-bearing rules +**On the Snapdragon 8 Elite test device under a 60 s latency budget, E4B (6 GB RAM floor) deploys only on GPU across all RAG depths k ≤ 15 or on CPU at k ≤ 3, while E2B (4 GB RAM floor) deploys on both GPU and CPU across all k ≤ 15 — with k = 20 ruled out for both models by the 4096-token context wall, regardless of backend.** + +## TL;DR — four load-bearing rules 1. **E4B minimum RAM: 6 GB** total. 4 GB phones cannot run E4B reliably (model alone needs ~3.3 GB at runtime; Android + bundled apps eat 1.5–2 GB). 2. **E2B minimum RAM: 4 GB** total. The smaller model halves the runtime memory footprint (~1.7 GB), opening up the $100–$150 device tier that's the largest slice of the African market. 3. **E4B on CPU: k=3 is the borderline.** Beyond k=3, CPU totals exceed the 60 s budget on most mid-tier silicon. 
**E4B on GPU: no latency worry** — totals stay 13–25 s across k=0–15 on Snapdragon 8 Elite + Adreno. +4. **E2B on CPU: k=10 is comfortable on flagship CPU; k ≤ 3 on mid-tier MediaTek, with k=5–10 borderline.** Measured E2B CPU at k=10 on Snapdragon 8 Elite is 26 s median; extrapolating ~2× slower for mid-tier MediaTek gives ~50 s at k=5–7 (borderline). The original notes projected a uniform ~2× speedup across backends; measurements show **CPU matches that projection (~2× total speedup)** but **GPU total speedup is closer to ~1.5×** because decode is bandwidth-bound and gains less from the parameter-count shrink. Either way, CPU-only deployment is finally viable up to mid-range k on the no-GPU device tier — that's the deployment-relevant change from the May 2026 sweep. The catch — covered in §3 below — is that **GPU only works reliably on Adreno** (Snapdragon). For the bulk of the African deployment fleet (MediaTek + Mali GPUs), **plan as CPU-only** and treat any GPU acceleration as a bonus, not a guarantee. @@ -63,27 +66,29 @@ At hard minimum, the app will install and run but will be vulnerable to OOM kill ## 2. Backend × model × k feasibility (UX at 60 s budget) -Median total query latency targets, measured on Snapdragon 8 Elite (test device) and extrapolated for mid-tier MediaTek (~2× slower CPU than 8 Elite). E2B numbers are projections (~2× faster than E4B on the same hardware) until we collect actual measurements. +Median total query latency. Snapdragon 8 Elite rows are measured (see `latency_report_v2.md`). Mid-tier MediaTek rows are extrapolated by scaling CPU latency ~2× slower than Snapdragon 8 Elite — anchored on the published Geekbench gap between Dimensity 8400 / Helio G99 and the Snapdragon 8 Elite, not on an in-house measurement. 
**Empirical measurement on real MediaTek hardware is the next open question; see §6.** -### Gemma 4 E4B +### Gemma 4 E4B (measured) | Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 | |---|---|---|---|---|---| | **GPU, Snapdragon 8 Elite (Adreno 830)** | 13 s ✅ | 19 s ✅ | 20 s ✅ | 21 s ✅ | 24 s ✅ — **no worry at any k ≤ 15** | -| **CPU, Snapdragon 8 Elite** | 27 s ✅ | 41 s ✅ | 60 s 🟡 | 70 s ❌ | 85 s ❌ | -| CPU, mid-tier MediaTek (~2× slower) | ~50 s 🟡 | ~80 s ❌ | — | — | — | +| **CPU, Snapdragon 8 Elite** | 28 s ✅ | 43 s ✅ | 60 s 🟡 | 69 s ❌ | 85 s ❌ | +| CPU, mid-tier MediaTek (~2× slower) | ~56 s 🟡 | ~85 s ❌ | — | — | — | → For E4B: **CPU is unsafe past k=3** on flagship hardware, and unsafe at any k > 0 on mid-tier. GPU works at all k tested. -### Gemma 4 E2B (projected, halve E4B numbers) +### Gemma 4 E2B (measured 2026-05-16) + +Measured E2B is **~1.5× faster than E4B on GPU** (decode is bandwidth-bound, limits the win) and **~2× faster on CPU** (compute-bound — the smaller model's compute reduction translates more directly). See `latency_report_v2.md` for the per-k speedup ratios. -| Backend × hardware tier | k=0 | k=3 | k=5 | k=10 | k=15 | +| Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 | |---|---|---|---|---|---| -| GPU, Snapdragon 8 Elite | ~6 s ✅ | ~10 s ✅ | ~10 s ✅ | ~11 s ✅ | ~12 s ✅ | -| CPU, Snapdragon 8 Elite | ~13 s ✅ | ~20 s ✅ | ~30 s ✅ | ~35 s ✅ | ~42 s ✅ | -| **CPU, mid-tier MediaTek** | ~25 s ✅ | ~40 s ✅ | ~55 s 🟡 | ~70 s ❌ | — | +| **GPU, Snapdragon 8 Elite (Adreno 830)** | 9 s ✅ | 14 s ✅ | 12 s ✅ | 16 s ✅ | 13 s ✅ — **no worry at any k ≤ 15** | +| **CPU, Snapdragon 8 Elite** | 14 s ✅ | 21 s ✅ | 27 s ✅ | 26 s ✅ | 37 s ✅ | +| **CPU, mid-tier MediaTek (~2× slower)** | ~28 s ✅ | ~41 s ✅ | ~54 s 🟡 | ~53 s 🟡 | ~74 s ❌ | -→ For E2B on mid-tier MediaTek CPU, k≤3 is comfortable; k≤5 is borderline. **Empirical measurement still pending.** +→ For E2B on flagship CPU, **all k ≤ 15 fit a 60 s budget**. 
On mid-tier MediaTek CPU, **k ≤ 3 is comfortable, k=5–10 is borderline, k=15 exceeds budget.** This is the key deployment unlock: the no-GPU, mid-tier-CPU path is finally viable for typical k. --- @@ -134,7 +139,7 @@ Combining the SoC distribution data with the floor specs above: | $100–$150 low-mid | Tecno Camon, Infinix Hot Pro+, Redmi 13C | Helio G99, Dimensity 6080 | 6 GB | ✅ tight | ✅ comfortable | ⚠️ uncertain | | $150–$250 mid | Tecno Camon 30, Infinix Note 40, Redmi Note 13, Samsung A25 | Dimensity 7050/7200/8400 | 8 GB | ✅ | ✅ | ⚠️ uncertain (Mali) | | $250+ upper-mid | OnePlus Nord, Samsung A5x | Snapdragon 7+ Gen 3 | 8 GB | ✅ | ✅ | ✅ Adreno | -| $400+ flagship | OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) | +| $400+ flagship | OnePlus OPD2413 / OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) | **Effective deployment-viable hardware floor**: roughly **$120+ retail**, 6 GB RAM, 64 GB storage, any 64-bit chipset from 2022 or later. E2B lowers this to **~$100**, 4 GB RAM. @@ -144,8 +149,9 @@ Combining the SoC distribution data with the floor specs above: | Question | How to answer | Priority | |---|---|---| -| Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite | Same `benchmark_latency.py` sweep run we did for E4B, with the E2B model swapped in | High — unblocks the E2B-vs-E4B deployment decision | +| ~~Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite~~ | **Resolved 2026-05-16** — measured E2B CPU is ~2× faster than E4B CPU at every k; see `latency_report_v2.md` cross-model tables. E2B CPU at k=10 = 26 s; k=15 = 37 s; both under 60 s budget on flagship. | ~~High~~ ✅ Done | | Does GPU backend engage on real Transsion / MediaTek mid-tier devices? 
| Borrow / buy a Tecno Camon 30 or Infinix Note 40 and run benchmark with `useGpuForLlm=true`; check `[BACKEND]` log line | High — answers whether GPU is realistic for the deployment majority | +| Does the mid-tier MediaTek CPU 2× slowdown extrapolation hold in practice? | Once a Tecno/Infinix mid-tier is in hand, run the full k-sweep on CPU and compare to the projected `~2× slower` table in §2 | High — anchors the deployment recommendation on real numbers, not Geekbench-based extrapolation | | E2B answer-quality regression vs E4B on safety-critical medical-advice metrics | Re-run `eval_report_app_parity_v1.md` apparatus with E2B model | Critical before any model swap decision | | Does Exynos Xclipse driver bug get fixed upstream | Watch LiteRT-LM Issue #2114 | Low — affects ~5% of African market | | When does E4B Qualcomm SM8750 NPU artifact ship | Watch `litert-community/` HF repo monthly per Issue #58 | Medium — perf upgrade, not a deployment unblocker | diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index c6745a6..9d26a86 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,26 +1,108 @@ -# MAM-AI On-Device Latency Sweep — GPU vs CPU +# MAM-AI On-Device Latency Sweep — Model × Backend × k -_Generated: 2026-05-15T10:51:06_ +_Generated: 2026-05-16T09:00:40_ ## Device & stack - **Device**: OnePlus OPD2413 (SM8750P) — Android 15 -- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) +- **Models tested**: Gemma 4 E2B (`gemma-4-E2B-it.litertlm`), Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) - **LiteRT-LM**: 0.11.0 - **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU - **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000 ## Methodology -Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. 
Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. +Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. - `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token. - `decode` is first-token to last-token. - `total_query` is everything: `retrieval + TTFT + decode`. - Reported as median across the 54 runs unless noted (p95 in tables marked `p95`). -## Headline — Median total query latency (seconds) +## Gemma 4 E2B (`gemma-4-E2B-it.litertlm`) + +### Median total query latency (seconds) + +| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 7.9 / 8.1 / 10.8 | 13.2 / 14.1 / 16.0 | 1.60× | +| 1 | 561 | 11.4 / 11.8 / 12.8 | 13.0 / 16.3 / 17.5 | 1.35× | +| 3 | 2098 | 12.8 / 13.8 / 16.5 | 19.1 / 22.0 / 22.5 | 1.44× | +| 5 | 3547 | 9.9 / 14.2 / 14.0 | 26.3 / 27.6 / 28.6 | 2.36× | +| 7 | 5139 | 12.8 / 14.3 / 17.6 | 23.5 / 32.0 / 33.2 | 1.87× | +| 10 | 7482 | 15.2 / 14.6 / 17.9 | 23.4 / 26.2 / 27.7 | 1.68× | +| 15 | 11297 | 13.0 / 12.4 / 14.8 | 31.0 / 38.2 / 40.7 | 2.80× | +| 20 | 14520 | 19.3 / 15.8 / 14.3 | 33.4 / 39.8 / 44.5 | 2.28× | + +### TTFT (ms, median) + +| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 429 | 5564 | 13.0× | +| 1 | 561 | 412 | 5355 | 13.0× | +| 3 | 2098 | 445 | 7394 | 16.6× | +| 5 | 3547 | 793 | 14604 | 18.4× | +| 7 | 5139 | 819 | 14577 | 17.8× | +| 10 | 7482 | 1074 | 13635 | 12.7× | +| 15 | 11297 | 1479 | 21368 | 14.4× | +| 20 | 14520 | 1722 | 22947 | 13.3× | + +### Decode (ms, median) + +| k | GPU decode | 
CPU decode | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 8263 | 8174 | 0.99× | +| 1 | 7573 | 6764 | 0.89× | +| 3 | 10223 | 9584 | 0.94× | +| 5 | 9052 | 9571 | 1.06× | +| 7 | 10723 | 13451 | 1.25× | +| 10 | 10713 | 11870 | 1.11× | +| 15 | 9664 | 9920 | 1.03× | +| 20 | 11036 | 10697 | 0.97× | + +### p95 total query latency (s) + +| k | GPU p95 | CPU p95 | +|---:|---:|---:| +| **0 (no-RAG)** | 11.4 | 17.4 | +| 1 | 17.7 | 19.1 | +| 3 | 19.7 | 35.8 | +| 5 | 21.2 | 35.1 | +| 7 | 19.4 | 41.0 | +| 10 | 23.8 | 37.9 | +| 15 | 18.1 | 45.2 | +| 20 | 22.2 | 50.4 | + +### Errors (count / 54 runs) + +| k | GPU errors | CPU errors | +|---:|---:|---:| +| **0 (no-RAG)** | 0 | 0 | +| 1 | 0 | 0 | +| 3 | 0 | 0 | +| 5 | 0 | 0 | +| 7 | 0 | 0 | +| 10 | 0 | 0 | +| 15 | 0 | 0 | +| 20 | 24 | 24 | + +### Wall-clock + +| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 17.5 | 22.5 | 1.28× | +| 1 | 20.9 | 23.9 | 1.14× | +| 3 | 22.4 | 30.0 | 1.34× | +| 5 | 21.1 | 34.2 | 1.62× | +| 7 | 22.8 | 35.5 | 1.56× | +| 10 | 23.3 | 33.9 | 1.46× | +| 15 | 21.1 | 41.7 | 1.97× | +| 20 | 19.1 | 30.4 | 1.59× | + +## Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) + +### Median total query latency (seconds) | k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | |---:|---:|---:|---:|---:| @@ -33,7 +115,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed | 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× | | 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× | -## TTFT (ms, median) — prefill cost grows with retrieved-doc content +### TTFT (ms, median) | k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | |---:|---:|---:|---:|---:| @@ -46,10 +128,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed | 15 | 11297 | 3457 | 54748 | 15.8× | | 20 | 14520 | 3986 | 72881 | 18.3× | -## Decode (ms, median) — first token to last token - -Decode time mostly tracks output 
length, not k or doc content. Variation across k reflects -the model writing *longer answers* when given more context (more material to draw on). +### Decode (ms, median) | k | GPU decode | CPU decode | CPU÷GPU | |---:|---:|---:|---:| @@ -62,7 +141,7 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 16820 | 22497 | 1.34× | | 20 | 14688 | 22634 | 1.54× | -## p95 total query latency (s) — tail-latency view +### p95 total query latency (s) | k | GPU p95 | CPU p95 | |---:|---:|---:| @@ -75,9 +154,9 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 30.6 | 112.7 | | 20 | 35.3 | 104.9 | -## Errors and the 4096-token context wall +### Errors (count / 54 runs) -| k | GPU errors / 54 | CPU errors / 54 | +| k | GPU errors | CPU errors | |---:|---:|---:| | **0 (no-RAG)** | 0 | 0 | | 1 | 0 | 0 | @@ -88,17 +167,7 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 0 | 0 | | 20 | 24 | 24 | -At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. -Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both -backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — -the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of -the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. -The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter. - -Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any -deployment budget at this depth even when the request fits in the context window. 
- -## Wall-clock comparison +### Wall-clock | k | GPU wall (min) | CPU wall (min) | CPU÷GPU | |---:|---:|---:|---:| @@ -111,64 +180,114 @@ deployment budget at this depth even when the request fits in the context window | 15 | 32.4 | 90.8 | 2.80× | | 20 | 22.8 | 58.6 | 2.57× | +## Cross-model comparison + +Each table below compares **Gemma 4 E4B** (baseline) against each comparator model (Gemma 4 E2B). Ratios are reported as **baseline ÷ comparator** at the same backend × k cell, so values **> 1.0× mean the comparator is faster**. Reading the columns: GPU prefill (TTFT) is compute-bound and tracks parameter count closely; GPU decode is bandwidth-bound and gains less from model shrinkage; CPU is compute-bound throughout. + +### Gemma 4 E4B vs Gemma 4 E2B + +**Total query latency (median, seconds)** + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | +|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 14.4 | 8.7 | 1.66× | 28.0 | 13.9 | 2.01× | +| 1 | 14.1 | 11.7 | 1.21× | 30.3 | 15.8 | 1.92× | +| 3 | 19.1 | 14.3 | 1.33× | 42.7 | 20.6 | 2.07× | +| 5 | 19.6 | 11.6 | 1.70× | 60.2 | 27.2 | 2.21× | +| 7 | 22.9 | 15.2 | 1.50× | 62.3 | 28.5 | 2.18× | +| 10 | 22.4 | 15.6 | 1.43× | 69.4 | 26.3 | 2.64× | +| 15 | 24.4 | 13.1 | 1.86× | 84.9 | 36.8 | 2.31× | +| 20 | 21.0 | 16.5 | 1.28× | 93.8 | 37.6 | 2.49× | + +**TTFT (median, ms)** — prefill speedup + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | +|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 962 | 429 | 2.24× | 12633 | 5564 | 2.27× | +| 1 | 954 | 412 | 2.32× | 12649 | 5355 | 2.36× | +| 3 | 989 | 445 | 2.22× | 18356 | 7394 | 2.48× | +| 5 | 1884 | 793 | 2.38× | 36424 | 14604 | 2.49× | +| 7 | 1920 | 819 | 2.34× | 36444 | 14577 | 2.50× | +| 10 | 2523 | 1074 | 2.35× | 40013 | 13635 | 2.93× | +| 15 | 3457 | 1479 | 2.34× | 54748 | 21368 | 2.56× | +| 20 | 3986 | 1722 | 2.31× | 72881 | 22947 | 3.18× | + 
+**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | +|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 13470 | 8263 | 1.63× | 15345 | 8174 | 1.88× | +| 1 | 11415 | 7573 | 1.51× | 13961 | 6764 | 2.06× | +| 3 | 16364 | 10223 | 1.60× | 19110 | 9584 | 1.99× | +| 5 | 15929 | 9052 | 1.76× | 21645 | 9571 | 2.26× | +| 7 | 17215 | 10723 | 1.61× | 23473 | 13451 | 1.75× | +| 10 | 18118 | 10713 | 1.69× | 21699 | 11870 | 1.83× | +| 15 | 16820 | 9664 | 1.74× | 22497 | 9920 | 2.27× | +| 20 | 14688 | 11036 | 1.33× | 22634 | 10697 | 2.12× | + +## Errors and the 4096-token context wall + +At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every (model × backend) combination tested: +`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. +Each failure reports `Input token ids are too long. Exceeding the maximum number of tokens allowed: …>= 4096`. +Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; the wall is a property of the `.litertlm` artifact format, not the parameter count or backend. **k_max ≈ 17–18** for both models. + ## Key findings +### 1. Prefill (TTFT) scales ~2× with parameter count on both backends +Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** and **~2.3–3.2× on CPU**. Prefill is compute-heavy (one parallel forward pass over the entire prompt), so halving the parameter count halves the compute and the speedup is near-proportional on both backends. + +### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU +Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding weights into compute units — the smaller model helps less than its parameter count would predict. 
On CPU the constraint is compute, so the speedup tracks the model shrink. + +### 3. Total speedup is decode-dominated, hence smaller than TTFT +**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks decode rather than prefill. At high k where prefill grows large, total speedup climbs toward the prefill ratio (~1.9× GPU at k=15, though the k=20 cell falls back to ~1.3×). + +### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier +E2B CPU is 1.35–2.8× slower than E2B GPU across k=0–20 (median total query latency) — GPU remains the preferred backend where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), which means devices that previously could *not* deploy MAM-AI at acceptable latency (mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: ship E2B on CPU, restrict k to small values. -### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite -GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. -That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), -so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency. - -### 2. The model's 4096-token context window is the binding ceiling at high k -k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — -the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. -Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives -the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, -not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. -Latency is *not* the constraint at the upper end; the model's context window is. - -### 3.
Latency is not the binding factor on GPU below k=15 -GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. -Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), -not by what fits in the latency budget. - -### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow -CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. -p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't -available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, -or **k ≤ 1** if you want sub-40s p95. - -### 5. Decode time is content-driven, not k-driven -Decode time tracks output length. As k grows, the model writes *longer* responses — likely because -more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. -Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, -not compute-bound on this hardware. +### 5. 4096-token context wall is the binding ceiling at high k +k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the model artifact, not the runtime, and is **shared between E4B and E2B**. **Latency is not the constraint at the upper end of k — context window is.** ### 6. TTFT scales linearly with retrieved-doc content past k=3 -On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, -CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting -the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both. 
- -## Data inventory (per `(backend, k)`) - -| Backend | k | File | Wall (min) | Runs | Errors | -|---|---:|---|---:|---:|---:| -| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | -| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | -| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | -| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | -| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | -| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | -| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | -| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | -| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | -| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | -| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | -| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | -| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | -| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | -| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | -| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | +On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so the prefill story scales predictably. The model shrink translates directly into a TTFT shrink across the whole range. 
+ +## Data inventory (per `(model, backend, k)`) + +| Model | Backend | k | File | Wall (min) | Runs | Errors | +|---|---|---:|---|---:|---:|---:| +| Gemma 4 E2B | CPU | 0 (no-RAG) | `benchmark_20260515T223100.json` | 22.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 1 | `benchmark_20260515T183910_k1.json` | 23.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 3 | `benchmark_20260515T190320_k3.json` | 30.0 | 54 | 0 | +| Gemma 4 E2B | CPU | 5 | `benchmark_20260515T193337_k5.json` | 34.2 | 54 | 0 | +| Gemma 4 E2B | CPU | 7 | `benchmark_20260515T200805_k7.json` | 35.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 10 | `benchmark_20260515T204358_k10.json` | 33.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 15 | `benchmark_20260515T211813_k15.json` | 41.7 | 54 | 0 | +| Gemma 4 E2B | CPU | 20 | `benchmark_20260515T220014_k20.json` | 30.4 | 54 | 24 | +| Gemma 4 E2B | GPU | 0 (no-RAG) | `benchmark_20260515T175744.json` | 17.5 | 54 | 0 | +| Gemma 4 E2B | GPU | 1 | `benchmark_20260515T152447_k1.json` | 20.9 | 54 | 0 | +| Gemma 4 E2B | GPU | 3 | `benchmark_20260515T154608_k3.json` | 22.4 | 54 | 0 | +| Gemma 4 E2B | GPU | 5 | `benchmark_20260515T160846_k5.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 7 | `benchmark_20260515T163011_k7.json` | 22.8 | 54 | 0 | +| Gemma 4 E2B | GPU | 10 | `benchmark_20260515T165316_k10.json` | 23.3 | 54 | 0 | +| Gemma 4 E2B | GPU | 15 | `benchmark_20260515T171649_k15.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 20 | `benchmark_20260515T173816_k20.json` | 19.1 | 54 | 24 | +| Gemma 4 E4B | CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | +| Gemma 4 E4B | CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | +| Gemma 4 E4B | CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | +| Gemma 4 E4B | CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | +| Gemma 4 E4B | CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | +| Gemma 4 E4B | CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | +| Gemma 4 E4B | CPU | 15 | 
`benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | +| Gemma 4 E4B | CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | +| Gemma 4 E4B | GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | +| Gemma 4 E4B | GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | +| Gemma 4 E4B | GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | +| Gemma 4 E4B | GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | +| Gemma 4 E4B | GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | +| Gemma 4 E4B | GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | +| Gemma 4 E4B | GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | +| Gemma 4 E4B | GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | --- diff --git a/evaluation/runbooks/e2b_sweep.md b/evaluation/runbooks/e2b_sweep.md new file mode 100644 index 0000000..80fbc5b --- /dev/null +++ b/evaluation/runbooks/e2b_sweep.md @@ -0,0 +1,240 @@ +# Runbook: Gemma 4 E2B Latency Sweep + +Self-contained instructions for finishing the E2B latency sweep started by another session. **Phase 1 (setup) is already complete on branch `feat/e2b-latency-sweep`.** Your job is Phase 2 (GPU sweep), Phase 3 (CPU sweep), and Phase 4 (analysis + local commits, **no push**). Expected wall-clock: **~5 hours**. + +> **Reading this after PR #59 merged into main?** PR #59 intentionally **reverted** the production `llm_model` in `config/app_config.json` back to `gemma-4-E4B-it.litertlm` — the latency sweep does not authorize a deployment swap (the kenya_vignettes / AfriMed-QA SAQ safety eval is the gate for that). To re-run this benchmark on a future branch: +> +> 1. Edit `config/app_config.json` and set `"llm_model": "gemma-4-E2B-it.litertlm"`. +> 2. Rebuild and install: `flutter build apk --release && adb install -r app/build/app/outputs/flutter-apk/app-release.apk`. +> 3. Run the sweep (Phase 2 + Phase 3 below). +> 4. 
**Revert** the `config/app_config.json` change before opening any PR. +> +> With the config flipped, the new benchmark JSONs will record `config.model == "gemma-4-E2B-it.litertlm"` as expected by the Phase 1 verification step in §1. Phase 1's commit-log check (looking for `3042d38 config: switch llm_model to Gemma 4 E2B`) was written when that commit was the tip; on a post-merge replay the same SHA will still be reachable, just deeper in the log. + +## 0. Context — read this first + +- This work mirrors the E4B latency sweep that landed in PR #57 (commit `1be0a55` on `main`). The E4B results are in `evaluation/reports/latency_report_v2.md` and the device-compatibility analysis is in `evaluation/reports/device_compatibility_notes.md`. +- We're now measuring the **smaller** Gemma 4 E2B variant (~2 GB instead of E4B's 3.66 GB) to find out how much faster it is in real terms on the same hardware. Same 16 measurements as E4B: 8 GPU (k ∈ {1, 3, 5, 7, 10, 15, 20} + No-RAG) + 8 CPU. +- Test device: **OnePlus OPD2413 (Snapdragon 8 Elite, SM8750P)** connected via ADB — that's the firmware-reported manufacturer (`device.manufacturer="OnePlus"` in the benchmark JSONs); the same OPD2413 hardware ships under the OPPO brand in some markets. The OPPO/OnePlus Hans battery-optimization whitelist is **already configured** by the user — don't re-do it. +- The benchmark infrastructure is in `evaluation/benchmark_latency.py`; the aggregator is `evaluation/aggregate_k_sweep.py`. Both are already correct for this work, with one expected exception in Phase 4 (the aggregator needs a `model` dimension added). + +### Why this is a runbook and not a single Bash command + +Each benchmark run takes **12–20 minutes wall-clock** (E2B is ~1.5× faster than E4B based on the smoke test, not 2×). You can't realistically loop them in one foreground shell command; bash timeouts cap at 10 minutes in our tooling. 
Use `Bash run_in_background: true` and **wait for the harness completion notification** between runs. Don't use `tail -F`, sleep loops, or watchdog patterns — those caused the previous subagent to bail at 87 seconds. + +--- + +## 1. Verify Phase 1 state — fail loud if anything's missing + +Run these checks before touching anything: + +```bash +cd ~/Downloads/mamai +git status # should show clean working tree on branch feat/e2b-latency-sweep +git log --oneline -3 # should show 3042d38, 976a8ac at the top +``` + +Expected log: +``` +3042d38 config: switch llm_model to Gemma 4 E2B +976a8ac fix(benchmark): read model name from app_config asset +a2205ff docs: device compatibility notes — which phones can run E4B / E2B +``` + +```bash +ls -lh device_push/models/gemma-4-E2B-it.litertlm # ~2.4–2.6 GB +adb devices # should show one device +adb shell ls /storage/emulated/0/Android/data/com.example.app/files/ +# expect to see: gemma-4-E2B-it.litertlm, gemma-4-E4B-it.litertlm, +# Gecko_1024_quant.tflite, embeddings.sqlite, sentencepiece.model +``` + +The smoke-test JSON from Phase 1 is at `evaluation/latency_results/benchmark_20260515T150531_k3.json`. Verify it has `config.model == "gemma-4-E2B-it.litertlm"`, `config.backend == "GPU"`, no errors, total latency 11036 ms. + +If anything fails any of these checks, **stop and ask the user** — the state has drifted from what this runbook assumes. + +--- + +## 2. Phase 2 — GPU sweep (~1.5–2 hours) + +The GPU APK from Phase 1 is already installed. 
Run these 8 benchmarks **sequentially**, each via `Bash run_in_background: true`, waiting for the harness completion notification before launching the next: + +```bash +cd ~/Downloads/mamai +python evaluation/benchmark_latency.py --retrieve-k 1 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 3 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 5 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 7 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 10 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 15 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 20 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --no-retrieval --cooldown 10000 +``` + +### How to actually do this with the Bash tool + +For each command: + +1. Call `Bash` with `run_in_background: true` and the command. Save the returned task ID. +2. **Stop and wait.** The harness will send a completion-notification message when the python process exits. That's your signal to continue. +3. When the notification arrives, read the resulting JSON in `evaluation/latency_results/benchmark_*_k{N}.json` (or for No-RAG, no `_kN` suffix). +4. Verify the JSON: + - `config.model == "gemma-4-E2B-it.litertlm"` + - `config.backend == "GPU"` + - `len(results)` == 54 + - Errors == 0 (except k=20, which is expected to error on the same 8 queries × 3 reps = 24 errors that hit the 4096-token wall in the E4B sweep) + - TTFT median in the 500–2000 ms range +5. If anything looks wildly off, stop and report. Otherwise proceed to the next k. + +Per-run wall-clock estimate (based on E4B GPU being ~12–30 min per k, and E2B being ~1.5× faster): +- ~10–15 min per k for small k +- ~12–20 min per k for k ≥ 10 + +Total Phase 2: **~1.5–2 hours**.
+ +### Optional progress visibility (not required) + +If you want occasional progress pings while a benchmark runs, you can launch a `Monitor` with timeout=3600000ms (1 hour) and a poll command that greps `adb logcat -d` for `mam-ai-bench` lines. Examples in the PR #57 history. But this is just visibility — the harness completion notification is what gates "move on." + +### Checkpoint 2 — what to report when Phase 2 is done + +After all 8 GPU JSONs land, summarize: +- List of file names produced +- Per-run wall-clock (from `total_benchmark_time_ms` field divided by 60000) +- Error counts (should be all zero except k=20) +- Quick comparison to E4B GPU baseline: did E2B run roughly 1.3–1.7× faster overall? Numbers from `evaluation/reports/latency_report_v2.md` are easy to compare against. + +Then **stop and ask the user** before starting Phase 3. + +--- + +## 3. Phase 3 — CPU rebuild + sweep (~3 hours) + +### 3a. Switch to CPU build + +```bash +cd ~/Downloads/mamai/app +flutter build apk --release -PuseGpuForLlm=false +``` + +(foreground `Bash` with `timeout: 600000` — should complete in ~30 sec since artifacts are cached). Verify `Built build/app/outputs/flutter-apk/app-release.apk`. + +```bash +adb install -r ~/Downloads/mamai/app/build/app/outputs/flutter-apk/app-release.apk +``` + +(foreground `Bash`, ~2 min). The `-r` flag preserves existing model files on the device. Verify with `adb shell ls /storage/emulated/0/Android/data/com.example.app/files/` — should still show both `gemma-4-E2B-it.litertlm` and the others. + +### 3b. CPU smoke test + +```bash +cd ~/Downloads/mamai +python evaluation/benchmark_latency.py --filter medium_01 --repeats 1 --rag-only --retrieve-k 3 --cooldown 5000 +``` + +(background, wait for notification). Verify the resulting JSON has `config.backend == "CPU"`, `config.model == "gemma-4-E2B-it.litertlm"`. Expected total latency ~20–30 s (E2B CPU at k=3 should be ~1.5–2× faster than E4B CPU's ~37–44 s). + +If smoke test passes, proceed.
+ +### 3c. CPU sweep — same 8 benchmarks + +Use the identical command list as Phase 2, with the same background + notification-wait pattern. Per-run expected ~20 min (E2B CPU is ~1.5× faster than E4B CPU's ~40–90 min). + +Verify each JSON: backend=CPU, model=E2B, run count, error count. + +### Checkpoint 3 — what to report + +Same shape as Checkpoint 2: file names, per-run wall-clock, error counts, comparison to E4B CPU baseline. + +Stop and ask the user before starting Phase 4. + +--- + +## 4. Phase 4 — Analysis + local commits (do NOT push) + +### 4a. Update the aggregator to handle two models + +Currently `evaluation/aggregate_k_sweep.py` groups by `(backend, k)`. With E4B and E2B both present, the matrix would collapse them into the same cells. **Add a `model` dimension**: change the grouping to `(model, backend, k)`. + +Key places to touch: +- `load_runs()` — append `"model": d["config"].get("model") or DEFAULT_E4B_MODEL` to each run dict. The pre-fix E4B GPU JSONs (the ones in `PRE_FIX_GPU_FILES`) predate the model-recording fix and don't have `config.model` either — they should default to `"gemma-4-E4B-it.litertlm"`. Add a `PRE_FIX_E4B_FILES` allowlist similar to `PRE_FIX_GPU_FILES`, or just bake it into a single `_legacy_default_for(filename)` helper. +- The `matrix` dict in `write_report()` — change the key from `(backend, k)` to `(model, backend, k)`. +- Each table that loops over `all_ks` needs to also loop over models, or you can produce a table per model. + +Expected size of change: ~50 LOC. Run `python3 evaluation/aggregate_k_sweep.py` and verify it loads all 32 canonical runs (16 E4B + 16 E2B). + +### 4b. Update `latency_report_v2.md` + +Add an **E4B vs E2B comparison** section. Key tables: +- Median total query latency: rows = k, columns = `{E4B GPU, E2B GPU, E2B÷E4B ratio, E4B CPU, E2B CPU, E2B÷E4B ratio}`. Produce one table per category (short / medium / long), or a single overall table. +- TTFT comparison: same shape. 
+- Decode comparison same shape — this is where we expect E2B's gain to be smallest (decode is bandwidth-bound). + +Update the **Key findings** section to reflect the measured ratio. The smoke test suggested ~1.5× (not the 2× originally projected). Decode being bandwidth-bound is the architectural reason — call that out. + +Update the document title from "GPU vs CPU" to something like "Model × Backend × k" or "Latency Sweep — Gemma 4 E2B vs E4B, GPU vs CPU". + +### 4c. Update `device_compatibility_notes.md` + +- Section §6 "Open questions": mark "Actual E2B CPU latency" as **resolved** with real numbers from the new sweep. +- Section §2 "Backend × model × k feasibility": replace the **projected** E2B table with real measurements. Specifically replace the row "CPU, mid-tier MediaTek (~2× slower)" which was extrapolation — the new data lets us anchor more precisely. +- TL;DR section: refine any rule-of-thumb that was based on the wrong 2× ratio. The actual ratio is ~1.5× — adjust deployment recommendations if anything changes. + +### 4d. Commit, do NOT push + +Make focused commits matching the PR #57 style. Suggested split (your call on exact phrasing): + +1. `analysis: aggregate_k_sweep.py — add model dimension to matrix` +2. `analysis: regenerate latency_report_v2.md with E2B columns` +3. `docs: update device_compatibility_notes.md with E2B measurements` + +After all commits, run `git log --oneline origin/main..HEAD` and report the commit list to the user. **Do not push.** + +--- + +## 5. Failure-mode guidance + +| Symptom | Action | +|---|---| +| Bash command times out (foreground) | Use background mode + notification wait instead. Foreground is for builds/installs only. | +| Background task takes 30+ min with no completion notification | Run `pgrep -af benchmark_latency.py` to verify python is still alive. If it is, keep waiting. If not, the benchmark died — read the task's output file and report. 
| +| Benchmark JSON missing fields (no `config.model`, wrong backend, etc.) | Stop. The build or install drifted. | +| Hans freeze events in logcat (`OplusHansManager: freeze ... scene: LcdOff`) | Shouldn't happen — the foreground-service + whitelist fix is in main. If it does, the whitelist may have been reset by a system update. Stop and ask the user to re-verify it in Settings. | +| App on device dies between benchmarks | Check `adb shell pm list packages \| grep com.example.app`. If missing, the install was rolled back somehow — stop. | +| Smoke-test totals wildly off (e.g. >60s at k=3 GPU, or >5s at k=3 No-RAG) | Stop. Something is wrong with the build or backend selection. | + +For any "wildly off" result, stop and report rather than auto-retry. The user can decide whether to re-do the run or investigate. + +--- + +## 6. Constraints + +- **Branch**: `feat/e2b-latency-sweep` only. Don't push to origin. Don't rebase or amend `main` or anything earlier than your own commits. +- **Don't touch the mamaretrieval repo.** All work happens in mamai. +- **Don't change scope.** If the plan is ambiguous on something specific, stop and ask the user rather than improvising. Specifically, don't: + - Change the k-value list (must be 1, 3, 5, 7, 10, 15, 20 + No-RAG to match E4B) + - Change the cooldown (10000 ms) + - Skip the CPU smoke test + - Skip any of the 4 deliverable updates in Phase 4 +- **Commit style**: match the PR #57 style (`feat:`, `fix:`, `analysis:`, `docs:` prefixes; concise subject line; body explaining "why" not "what"). +- **Don't push.** The final state is "16 new JSONs landed, scripts and reports updated, all committed locally on `feat/e2b-latency-sweep`, branch ready for human review and PR creation." + +--- + +## 7. 
Final-deliverable checklist + +Before declaring done, verify: + +- [ ] 16 new benchmark JSONs in `evaluation/latency_results/` — 8 with backend=GPU, 8 with backend=CPU, all with model=E2B +- [ ] `aggregate_k_sweep.py` updated to handle `(model, backend, k)` grouping; loads all 32 canonical runs without errors +- [ ] `latency_report_v2.md` regenerated and updated with E4B vs E2B narrative +- [ ] `device_compatibility_notes.md` updated to reflect measured E2B numbers +- [ ] All changes committed in focused commits on `feat/e2b-latency-sweep` +- [ ] **Branch not pushed** +- [ ] Summary report: commit list, headline findings (per-backend E2B vs E4B median latency at k=3, k=10), any anomalies observed + +When done, hand back to the user for review + PR creation. + +--- + +_Last updated: 2026-05-15. Phase 1 commits already on the branch: `976a8ac` (model-from-config fix), `3042d38` (config switch to E2B). Phase 1 smoke test: `benchmark_20260515T150531_k3.json`, total 11036 ms at k=3 GPU._