From 976a8ac17432a25198014fb185a6ce0f956348a9 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 15:00:18 +0800 Subject: [PATCH 01/14] fix(benchmark): read model name from app_config asset The benchmark service hardcoded "gemma-4-E4B-it.litertlm" in the results JSON metadata, so switching models in app_config.json silently left stale data in benchmark output. Read llm_model from the same asset RagPipeline uses so the JSON always reflects what was actually loaded. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkForegroundService.kt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index e1ee93c..d504e38 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -381,7 +381,12 @@ class BenchmarkForegroundService : Service() { put("rag_only", ragOnly) put("query_filter", queryFilter ?: JSONObject.NULL) put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) - put("model", "gemma-4-E4B-it.litertlm") + // Read model name from the same app_config.json asset the RagPipeline uses, + // so the JSON metadata reflects whatever model is actually loaded rather than + // a hardcoded string that goes stale when we switch model artifacts. + put("model", JSONObject( + application.assets.open("app_config.json").bufferedReader().use { it.readText() } + ).getString("llm_model")) // Read backend from BuildConfig at compile time. Older builds // hard-coded "CPU" here even when GPU was active — fixed so the // JSON metadata matches reality. 
From 3042d387620f056a096ddade30a3bc38c8886500 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 15:00:22 +0800 Subject: [PATCH 02/14] config: switch llm_model to Gemma 4 E2B Pointing the app at gemma-4-E2B-it.litertlm so we can run the same k-sweep benchmarks as the E4B baseline for a direct comparison on the OPPO Snapdragon 8 Elite device. Co-Authored-By: Claude Opus 4.7 (1M context) --- config/app_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/app_config.json b/config/app_config.json index cf0aa38..acdcb71 100644 --- a/config/app_config.json +++ b/config/app_config.json @@ -1,5 +1,5 @@ { - "llm_model": "gemma-4-E4B-it.litertlm", + "llm_model": "gemma-4-E2B-it.litertlm", "embedding_model": "Gecko_1024_quant.tflite", "tokenizer": "sentencepiece.model", "embedding_dim": 768, From 457977586c6f370a76afba389a8d12b567c12a63 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 15:17:55 +0800 Subject: [PATCH 03/14] docs: runbook for finishing the E2B latency sweep in a fresh session MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The E2B sweep started by another session (Phase 1 committed at 976a8ac and 3042d38) hit issues with the agent-orchestration wait pattern in two attempts. Writing a self-contained runbook so a new CLI session can pick it up from Phase 2 with full context — what's already done, what to verify, the exact 16 measurements to collect, the analysis work for Phase 4, and the constraints (no push, no scope change). Lives at evaluation/runbooks/e2b_sweep.md. If we end up doing more sweeps in this style, the directory is a natural home for similar job-aid documents. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/runbooks/e2b_sweep.md | 231 +++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 evaluation/runbooks/e2b_sweep.md diff --git a/evaluation/runbooks/e2b_sweep.md b/evaluation/runbooks/e2b_sweep.md new file mode 100644 index 0000000..848d44b --- /dev/null +++ b/evaluation/runbooks/e2b_sweep.md @@ -0,0 +1,231 @@ +# Runbook: Gemma 4 E2B Latency Sweep + +Self-contained instructions for finishing the E2B latency sweep started by another session. **Phase 1 (setup) is already complete on branch `feat/e2b-latency-sweep`.** Your job is Phase 2 (GPU sweep), Phase 3 (CPU sweep), and Phase 4 (analysis + local commits, **no push**). Expected wall-clock: **~5 hours**. + +## 0. Context — read this first + +- This work mirrors the E4B latency sweep that landed in PR #57 (commit `1be0a55` on `main`). The E4B results are in `evaluation/reports/latency_report_v2.md` and the device-compatibility analysis is in `evaluation/reports/device_compatibility_notes.md`. +- We're now measuring the **smaller** Gemma 4 E2B variant (~2 GB instead of E4B's 3.66 GB) to find out how much faster it is in real terms on the same hardware. Same 16 measurements as E4B: 8 GPU (k ∈ {1, 3, 5, 7, 10, 15, 20} + No-RAG) + 8 CPU. +- Test device: **OPPO OPD2413 (Snapdragon 8 Elite, SM8750P)** connected via ADB. OPPO Hans battery-optimization whitelist is **already configured** by the user — don't re-do it. +- The benchmark infrastructure is in `evaluation/benchmark_latency.py`; the aggregator is `evaluation/aggregate_k_sweep.py`. Both are already correct for this work, with one expected exception in Phase 4 (the aggregator needs a `model` dimension added). + +### Why this is a runbook and not a single Bash command + +Each benchmark run takes **12–20 minutes wall-clock** (E2B is ~1.5× faster than E4B based on the smoke test, not 2×). 
You can't realistically loop them in one foreground shell command; bash timeouts cap at 10 minutes in our tooling. Use `Bash run_in_background: true` and **wait for the harness completion notification** between runs. Don't use `tail -F`, sleep loops, or watchdog patterns — those caused the previous subagent to bail at 87 seconds. + +--- + +## 1. Verify Phase 1 state — fail loud if anything's missing + +Run these checks before touching anything: + +```bash +cd ~/Downloads/mamai +git status # should show clean working tree on branch feat/e2b-latency-sweep +git log --oneline -3 # should show 3042d38, 976a8ac at the top +``` + +Expected log: +``` +3042d38 config: switch llm_model to Gemma 4 E2B +976a8ac fix(benchmark): read model name from app_config asset +a2205ff docs: device compatibility notes — which phones can run E4B / E2B +``` + +```bash +ls -lh device_push/models/gemma-4-E2B-it.litertlm # ~2.4–2.6 GB +adb devices # should show one device +adb shell ls /storage/emulated/0/Android/data/com.example.app/files/ +# expect to see: gemma-4-E2B-it.litertlm, gemma-4-E4B-it.litertlm, +# Gecko_1024_quant.tflite, embeddings.sqlite, sentencepiece.model +``` + +The smoke-test JSON from Phase 1 is at `evaluation/latency_results/benchmark_20260515T150531_k3.json`. Verify it has `config.model == "gemma-4-E2B-it.litertlm"`, `config.backend == "GPU"`, no errors, total latency 11036 ms. + +If anything fails any of these checks, **stop and ask the user** — the state has drifted from what this runbook assumes. + +--- + +## 2. Phase 2 — GPU sweep (~1.5–2 hours) + +The GPU APK from Phase 1 is already installed. 
Run these 8 benchmarks **sequentially**, each via `Bash run_in_background: true`, waiting for the harness completion notification before launching the next: + +```bash +cd ~/Downloads/mamai +python evaluation/benchmark_latency.py --retrieve-k 1 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 3 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 5 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 7 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 10 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 15 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --retrieve-k 20 --rag-only --cooldown 10000 +python evaluation/benchmark_latency.py --no-retrieval --cooldown 10000 +``` + +### How to actually do this with the Bash tool + +For each command: + +1. Call `Bash` with `run_in_background: true` and the command. Save the returned task ID. +2. **Stop and wait.** The harness will send a completion notification message when the python process exits. That's your signal to continue. +3. When the notification arrives, read the resulting JSON in `evaluation/latency_results/benchmark_*_k{N}.json` (or for No-RAG, no `_kN` suffix). +4. Verify the JSON: + - `config.model == "gemma-4-E2B-it.litertlm"` + - `config.backend == "GPU"` + - `len(results)` == 54 + - Errors == 0 (except k=20, which is expected to error on the same 8 queries × 3 reps = 24 errors that hit the 4096-token wall in the E4B sweep) + - TTFT median in the 500–2000 ms range +5. If anything looks wildly off, stop and report. Otherwise proceed to the next k. + +Per-run wall-clock estimate (based on E4B GPU being ~12–30 min per k, and E2B being ~1.5× faster): +- ~10–15 min per k for small k +- ~12–20 min per k for k ≥ 10 + +Total Phase 2: **~1.5–2 hours**. 
+ +### Optional progress visibility (not required) + +If you want occasional progress pings while a benchmark runs, you can launch a `Monitor` with timeout=3600000ms (1 hour) and a poll command that greps `adb logcat -d` for `mam-ai-bench` lines. Examples in the PR #57 history. But this is just visibility — the harness completion notification is what gates "move on." + +### Checkpoint 2 — what to report when Phase 2 is done + +After all 8 GPU JSONs land, summarize: +- List of file names produced +- Per-run wall-clock (from `total_benchmark_time_ms` field divided by 60000) +- Error counts (should be all zero except k=20) +- Quick comparison to E4B GPU baseline: did E2B run roughly 1.3–1.7× faster overall? Numbers from `evaluation/reports/latency_report_v2.md` are easy to compare against. + +Then **stop and ask the user** before starting Phase 3. + +--- + +## 3. Phase 3 — CPU rebuild + sweep (~3 hours) + +### 3a. Switch to CPU build + +```bash +cd ~/Downloads/mamai/app +flutter build apk --release -PuseGpuForLlm=false +``` + +(foreground `Bash` with `timeout: 600000` — should complete in ~30 sec since artifacts are cached). Verify `Built build/app/outputs/flutter-apk/app-release.apk`. + +```bash +adb install -r ~/Downloads/mamai/app/build/app/outputs/flutter-apk/app-release.apk +``` + +(foreground `Bash`, ~2 min). The `-r` flag preserves existing model files on the device. Verify with `adb shell ls /storage/emulated/0/Android/data/com.example.app/files/` — should still show both `gemma-4-E2B-it.litertlm` and the others. + +### 3b. CPU smoke test + +```bash +cd ~/Downloads/mamai +python evaluation/benchmark_latency.py --filter medium_01 --repeats 1 --rag-only --retrieve-k 3 --cooldown 5000 +``` + +(background, wait for notification). Verify the resulting JSON has `config.backend == "CPU"`, `config.model == "gemma-4-E2B-it.litertlm"`. Expected total latency ~15–20 s (E2B CPU at k=3 should be ~1.5× faster than E4B CPU's ~37–44 s). + +If smoke test passes, proceed. 
+ +### 3c. CPU sweep — same 8 benchmarks + +Identical command list as Phase 2. Same background + notification-wait pattern. Per-run expected ~20 min (E2B CPU is ~1.5× faster than E4B CPU's ~40–90 min). + +Verify each JSON: backend=CPU, model=E2B, run count, error count. + +### Checkpoint 3 — what to report + +Same shape as Checkpoint 2: file names, per-run wall-clock, error counts, comparison to E4B CPU baseline. + +Stop and ask the user before starting Phase 4. + +--- + +## 4. Phase 4 — Analysis + local commits (do NOT push) + +### 4a. Update the aggregator to handle two models + +Currently `evaluation/aggregate_k_sweep.py` groups by `(backend, k)`. With E4B and E2B both present, the matrix would collapse them into the same cells. **Add a `model` dimension**: change the grouping to `(model, backend, k)`. + +Key places to touch: +- `load_runs()` — append `"model": d["config"].get("model") or DEFAULT_E4B_MODEL` to each run dict. For the pre-fix E4B GPU JSONs (the ones in `PRE_FIX_GPU_FILES`), they predate the model-recording fix and don't have `config.model` either — they should default to `"gemma-4-E4B-it.litertlm"`. Add a `PRE_FIX_E4B_FILES` allowlist similar to `PRE_FIX_GPU_FILES`, or just bake it into a single `_legacy_default_for(filename)` helper. +- The `matrix` dict in `write_report()` — change the key from `(backend, k)` to `(model, backend, k)`. +- Each table that loops over `all_ks` needs to also loop over models, or you can produce a table per model. + +Expected size of change: ~50 LOC. Run `python3 evaluation/aggregate_k_sweep.py` and verify it loads all 32 canonical runs (16 E4B + 16 E2B). + +### 4b. Update `latency_report_v2.md` + +Add an **E4B vs E2B comparison** section. Key tables: +- Median total query latency: rows = k, columns = `{E4B GPU, E2B GPU, E2B÷E4B ratio, E4B CPU, E2B CPU, E2B÷E4B ratio}`. One table per category (short / medium / long) or per overall. +- TTFT comparison same shape. 
+- Decode comparison same shape — this is where we expect E2B's gain to be smallest (decode is bandwidth-bound). + +Update the **Key findings** section to reflect the measured ratio. The smoke test suggested ~1.5× (not the 2× originally projected). Decode being bandwidth-bound is the architectural reason — call that out. + +Update the document title from "GPU vs CPU" to something like "Model × Backend × k" or "Latency Sweep — Gemma 4 E2B vs E4B, GPU vs CPU". + +### 4c. Update `device_compatibility_notes.md` + +- Section §6 "Open questions": mark "Actual E2B CPU latency" as **resolved** with real numbers from the new sweep. +- Section §2 "Backend × model × k feasibility": replace the **projected** E2B table with real measurements. Specifically replace the row "CPU, mid-tier MediaTek (~2× slower)" which was extrapolation — the new data lets us anchor more precisely. +- TL;DR section: refine any rule-of-thumb that was based on the wrong 2× ratio. The actual ratio is ~1.5× — adjust deployment recommendations if anything changes. + +### 4d. Commit, do NOT push + +Make focused commits matching the PR #57 style. Suggested split (your call on exact phrasing): + +1. `analysis: aggregate_k_sweep.py — add model dimension to matrix` +2. `analysis: regenerate latency_report_v2.md with E2B columns` +3. `docs: update device_compatibility_notes.md with E2B measurements` + +After all commits, run `git log --oneline origin/main..HEAD` and report the commit list to the user. **Do not push.** + +--- + +## 5. Failure-mode guidance + +| Symptom | Action | +|---|---| +| Bash command times out (foreground) | Use background mode + notification wait instead. Foreground is for builds/installs only. | +| Background task takes 30+ min with no completion notification | Run `pgrep -af benchmark_latency.py` to verify python is still alive. If it is, keep waiting. If not, the benchmark died — read the task's output file and report. 
| +| Benchmark JSON missing fields (no `config.model`, wrong backend, etc.) | Stop. The build or install drifted. | +| Hans freeze events in logcat (`OplusHansManager: freeze ... scene: LcdOff`) | Shouldn't happen — the foreground-service + whitelist fix is in main. If it does, the whitelist may have been reset by a system update. Stop and ask the user to re-verify it in Settings. | +| App on device dies between benchmarks | Check `adb shell pm list packages \| grep com.example.app`. If missing, the install was rolled back somehow — stop. | +| Smoke-test totals wildly off (e.g. >60s at k=3 GPU, or >5s at k=3 No-RAG) | Stop. Something is wrong with the build or backend selection. | + +For any "wildly off" result, stop and report rather than auto-retry. The user can decide whether to re-do the run or investigate. + +--- + +## 6. Constraints + +- **Branch**: `feat/e2b-latency-sweep` only. Don't push to origin. Don't rebase or amend `main` or anything earlier than your own commits. +- **Don't touch the mamaretrieval repo.** All work happens in mamai. +- **Don't change scope.** If the plan is ambiguous on something specific, stop and ask the user rather than improvising. Specifically, don't: + - Change the k-value list (must be 1, 3, 5, 7, 10, 15, 20 + No-RAG to match E4B) + - Change the cooldown (10000 ms) + - Skip the CPU smoke test + - Skip any of the 4 deliverable updates in Phase 4 +- **Commit style**: match the PR #57 style (`feat:`, `fix:`, `analysis:`, `docs:` prefixes; concise subject line; body explaining "why" not "what"). +- **Don't push.** The final state is "16 new JSONs landed, scripts and reports updated, all committed locally on `feat/e2b-latency-sweep`, branch ready for human review and PR creation." + +--- + +## 7. 
Final-deliverable checklist + +Before declaring done, verify: + +- [ ] 16 new benchmark JSONs in `evaluation/latency_results/` — 8 with backend=GPU, 8 with backend=CPU, all with model=E2B +- [ ] `aggregate_k_sweep.py` updated to handle `(model, backend, k)` grouping; loads all 32 canonical runs without errors +- [ ] `latency_report_v2.md` regenerated and updated with E4B vs E2B narrative +- [ ] `device_compatibility_notes.md` updated to reflect measured E2B numbers +- [ ] All changes committed in focused commits on `feat/e2b-latency-sweep` +- [ ] **Branch not pushed** +- [ ] Summary report: commit list, headline findings (per-backend E2B vs E4B median latency at k=3, k=10), any anomalies observed + +When done, hand back to the user for review + PR creation. + +--- + +_Last updated: 2026-05-15. Phase 1 commits already on the branch: `976a8ac` (model-from-config fix), `3042d38` (config switch to E2B). Phase 1 smoke test: `benchmark_20260515T150531_k3.json`, total 11036 ms at k=3 GPU._ From 32dcccdc29bfb80aa673b1cbebf0865864bf5cf0 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 08:04:46 +0800 Subject: [PATCH 04/14] =?UTF-8?q?analysis:=20aggregate=5Fk=5Fsweep.py=20?= =?UTF-8?q?=E2=80=94=20add=20model=20dimension=20to=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Group runs by (model, backend, k) instead of (backend, k) so the matrix can hold both Gemma 4 E4B and E2B side-by-side. Emit one per-model section (the existing six tables) for each model, plus a new cross-model comparison section with total/TTFT/decode ratio tables. Why: with the E2B sweep landing, the prior aggregator would have collapsed E4B and E2B into the same cells. The (model, backend, k) grouping keeps the existing single-model view intact while making the E4B-vs-E2B story explicit. Notes: - Added LEGACY_DEFAULT_MODEL fallback for any JSON missing config.model. All current files have it set, so this is purely defensive. 
- Refactored the per-model tables into _write_per_model_section() and the cross-model tables into _write_cross_model_table() to keep write_report() readable. --- evaluation/aggregate_k_sweep.py | 438 ++++++++++++++++++++------------ 1 file changed, 273 insertions(+), 165 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index d11e390..6142299 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -1,15 +1,19 @@ #!/usr/bin/env python3 -"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report. +"""Aggregate per-k latency-sweep JSONs into a single model × backend × k report. Reads all benchmark_*.json files produced by benchmark_latency.py, groups them -by (backend, k_override), and writes a markdown report at +by (model, backend, k_override), and writes a markdown report at evaluation/reports/latency_report_v2.md. Notes on backend identification: post-fix benchmark JSONs (commit ef96538 onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep JSONs hard-code `backend="CPU"` even though they were measured on GPU; we backfill those using an explicit filename allowlist (see `backend_of`). -Future runs of any backend are unaffected. + +Notes on model identification: post-fix JSONs (commit 976a8ac onward) record +`config.model` from the app asset; earlier runs do not. For any JSON missing +`config.model` we default to `gemma-4-E4B-it.litertlm` since the only sweeps +that predate the fix were E4B. Future runs of any model are unaffected. """ from __future__ import annotations @@ -47,6 +51,19 @@ def backend_of(filename: str, recorded: str) -> str: return recorded +# Default model for any pre-fix JSON missing config.model. All such files in +# the current repo are E4B; this default is purely defensive in case an old +# JSON resurfaces. New runs always record their own model. 
+LEGACY_DEFAULT_MODEL = "gemma-4-E4B-it.litertlm" + + +def model_of(filename: str, recorded: str | None) -> str: + """Trust the recorded model; default to E4B for legacy JSONs that lack it.""" + if recorded is not None: + return recorded + return LEGACY_DEFAULT_MODEL + + def load_runs() -> list[dict]: files = sorted(glob.glob(os.path.join( os.path.dirname(os.path.abspath(__file__)), @@ -95,9 +112,19 @@ def load_runs() -> list[dict]: ) recorded_backend = "CPU" backend = backend_of(os.path.basename(f), recorded_backend) + recorded_model = d["config"].get("model") + if recorded_model is None: + print( + f"WARN: {os.path.basename(f)} has no config.model field; " + f"defaulting to {LEGACY_DEFAULT_MODEL}. If this was a " + "different model, the JSON predates the model-recording fix.", + file=sys.stderr, + ) + model = model_of(os.path.basename(f), recorded_model) runs.append({ "file": os.path.basename(f), "timestamp": ts, + "model": model, "backend": backend, "k": k_label, "data": d, @@ -167,79 +194,35 @@ def fmt_s(v: int | None) -> str: return f"{v / 1000:.1f}" if v is not None else "—" -def write_report(runs: list[dict], out_path: Path) -> None: - # Build {(backend, k) -> latest canonical run} - matrix: dict[tuple[str, int], dict] = {} - for r in runs: - key = (r["backend"], r["k"]) - if key in matrix: - # Keep the run with most successful entries (resolves duplicates) - ex = matrix[key] - ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) - r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) - if r_ok > ex_ok: - matrix[key] = r - else: - matrix[key] = r +def _short_model_label(model: str) -> str: + """Human-friendly short label, e.g. 
'Gemma 4 E4B' for 'gemma-4-E4B-it.litertlm'.""" + if "E4B" in model: + return "Gemma 4 E4B" + if "E2B" in model: + return "Gemma 4 E2B" + return model - gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"]) - cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"]) - all_ks = sorted(set(gpu_ks + cpu_ks)) - # Sample run for device info - sample = next(iter(matrix.values())) - dev = sample["data"]["device"] +def _write_per_model_section( + md: list[str], matrix: dict, model: str, all_ks: list[int] +) -> None: + """Emit the six per-model tables (headline / TTFT / decode / p95 / errors / wall-clock). - md = [] - md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n") - md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") - md.append("") - md.append("## Device & stack\n") - md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") - md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)") - md.append(f"- **LiteRT-LM**: 0.11.0") - md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") - md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") - md.append("") - # Pull the actual values from the sample run's config instead of hard-coding - # text that can lie. If different runs used different settings, this won't - # catch that — but we'd rather report the sample's truth than fabricate a - # round-number claim. - sample_cfg = sample["data"].get("config", {}) - sample_repeats = sample_cfg.get("repeats", "?") - sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 - sample_n_results = len(sample["data"]["results"]) - # Infer queries × modes from total runs / repeats. Default to "?" if the - # math doesn't divide evenly. - queries_x_modes: object = "?" 
- if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: - queries_x_modes = sample_n_results // sample_repeats - md.append("## Methodology\n") - md.append( - f"Per backend × k configuration: {queries_x_modes} (query × mode) cells " - f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " - f"No-RAG baseline per backend (k=0 via `--no-retrieval`). " - f"{sample_cooldown_s:g}-second cooldown between runs for thermal " - "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so " - "the run survives screen-off and device-lock; OPPO Hans whitelist set " - "manually." - ) - md.append("") - md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") - md.append("- `decode` is first-token to last-token.") - md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") - md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") - md.append("") + Each table follows the same `(GPU, CPU, ratio)` shape as the original + single-model report; we just scope to one model at a time. 
+ """ + label = _short_model_label(model) + md.append(f"## {label} (`{model}`)\n") - # ─────────── Headline table: total_query_ms by (backend, k) ─────────── - md.append("## Headline — Median total query latency (seconds)\n") - md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|---:|") + md.append("### Median total query latency (seconds)\n") + md.append("| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") + md.append("|---:|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) - # doc chars: take from GPU if available, else CPU - doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) gpu_cells = "—" cpu_cells = "—" if gpu_run: @@ -248,152 +231,276 @@ def write_report(runs: list[dict], out_path: Path) -> None: if cpu_run: c_ = aggregate_per_category(cpu_run["data"], "total_query_ms") cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"]) - # ratio ratio = "" if gpu_run and cpu_run: gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median") cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median") if gov is not None and cov is not None and gov > 0: ratio = f"{cov / gov:.2f}×" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") md.append("") - # ─────────── TTFT detail ─────────── - md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n") - 
md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|---:|") + md.append("### TTFT (ms, median)\n") + md.append("| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") + md.append("|---:|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) - doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None - # Explicit None checks; also guard against div-by-zero on a 0 median. ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") - # ─────────── Decode detail ─────────── - md.append("## Decode (ms, median) — first token to last token\n") - md.append("Decode time mostly tracks output length, not k or doc content. 
Variation across k reflects ") - md.append("the model writing *longer answers* when given more context (more material to draw on).") - md.append("") - md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |") - md.append(f"|---:|---:|---:|---:|") + md.append("### Decode (ms, median)\n") + md.append("| k | GPU decode | CPU decode | CPU÷GPU |") + md.append("|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") - # ─────────── p95 totals ─────────── - md.append("## p95 total query latency (s) — tail-latency view\n") - md.append(f"| k | GPU p95 | CPU p95 |") - md.append(f"|---:|---:|---:|") + md.append("### p95 total query latency (s)\n") + md.append("| k | GPU p95 | CPU p95 |") + md.append("|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | 
{fmt_s(gv)} | {fmt_s(cv)} |") md.append("") - # ─────────── Errors / context limit ─────────── - md.append("## Errors and the 4096-token context wall\n") - md.append(f"| k | GPU errors / 54 | CPU errors / 54 |") - md.append(f"|---:|---:|---:|") + md.append("### Errors (count / 54 runs)\n") + md.append("| k | GPU errors | CPU errors |") + md.append("|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") - md.append("") - md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ") - md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ") - md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ") - md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ") - md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
") - md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.") - md.append("") - md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ") - md.append("deployment budget at this depth even when the request fits in the context window.") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") md.append("") - # ─────────── Wall-clock comparison ─────────── - md.append("## Wall-clock comparison\n") + md.append("### Wall-clock\n") md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |") md.append("|---:|---:|---:|---:|") for k in all_ks: - gpu_run = matrix.get(("GPU", k)) - cpu_run = matrix.get(("CPU", k)) + gpu_run = matrix.get((model, "GPU", k)) + cpu_run = matrix.get((model, "CPU", k)) + if not gpu_run and not cpu_run: + continue gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None gw_s = f"{gw:.1f}" if gw is not None else "—" cw_s = f"{cw:.1f}" if cw is not None else "—" ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else "" - label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |") + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | {gw_s} | {cw_s} | {ratio} |") + md.append("") + + +def _write_cross_model_table( + md: list[str], + matrix: dict, + baseline_model: str, + other_model: str, + all_ks: list[int], + metric: str, + fmt: callable, +) -> None: + """Emit one E4B-vs-E2B comparison table for the given metric. - # Findings / interpretation + Layout: `| k | E4B GPU | E2B GPU | GPU ratio | E4B CPU | E2B CPU | CPU ratio |`. + Ratio is baseline÷other (so >1 means the other model is faster). 
+ """ + b_label = _short_model_label(baseline_model) + o_label = _short_model_label(other_model) + md.append( + f"| k | {b_label} GPU | {o_label} GPU | GPU ratio | " + f"{b_label} CPU | {o_label} CPU | CPU ratio |" + ) + md.append("|---:|---:|---:|---:|---:|---:|---:|") + for k in all_ks: + cells = [] + for backend in ("GPU", "CPU"): + base_run = matrix.get((baseline_model, backend, k)) + other_run = matrix.get((other_model, backend, k)) + base_v = aggregate_overall(base_run["data"], metric).get("median") if base_run else None + other_v = aggregate_overall(other_run["data"], metric).get("median") if other_run else None + ratio = "" + if base_v is not None and other_v is not None and other_v > 0: + ratio = f"{base_v / other_v:.2f}×" + cells.extend([fmt(base_v), fmt(other_v), ratio]) + k_label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {k_label} | " + " | ".join(cells) + " |") md.append("") - md.append("## Key findings\n") + + +def write_report(runs: list[dict], out_path: Path) -> None: + # Build {(model, backend, k) -> latest canonical run}. If two runs collide + # on the same key (e.g. a re-run on the same day), keep the one with the + # most successful entries — that's almost always the longer, cleaner sweep. 
+ matrix: dict[tuple[str, str, int], dict] = {} + for r in runs: + key = (r["model"], r["backend"], r["k"]) + if key in matrix: + ex = matrix[key] + ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) + r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) + if r_ok > ex_ok: + matrix[key] = r + else: + matrix[key] = r + + models = sorted(set(m for (m, _b, _k) in matrix.keys())) + all_ks = sorted(set(k for (_m, _b, k) in matrix.keys())) + + sample = next(iter(matrix.values())) + dev = sample["data"]["device"] + + md: list[str] = [] + md.append("# MAM-AI On-Device Latency Sweep — Model × Backend × k\n") + md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") md.append("") - md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite") - md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ") - md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ") - md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.") + md.append("## Device & stack\n") + md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") + md.append(f"- **Models tested**: " + ", ".join(f"{_short_model_label(m)} (`{m}`)" for m in models)) + md.append("- **LiteRT-LM**: 0.11.0") + md.append("- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") + md.append("- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") + md.append("") + sample_cfg = sample["data"].get("config", {}) + sample_repeats = sample_cfg.get("repeats", "?") + sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 + sample_n_results = len(sample["data"]["results"]) + queries_x_modes: object = "?" 
+ if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: + queries_x_modes = sample_n_results // sample_repeats + md.append("## Methodology\n") + md.append( + f"Per (model × backend × k) configuration: {queries_x_modes} (query × mode) cells " + f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " + f"No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). " + f"{sample_cooldown_s:g}-second cooldown between runs for thermal " + "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so " + "the run survives screen-off and device-lock; OPPO Hans whitelist set " + "manually." + ) + md.append("") + md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") + md.append("- `decode` is first-token to last-token.") + md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") + md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") + md.append("") + + # ─────────── Per-model sections ─────────── + for m in models: + _write_per_model_section(md, matrix, m, all_ks) + + # ─────────── Cross-model comparison ─────────── + # Use E4B as baseline when present; ratio is E4B/E2B so >1 means E2B is faster. + if len(models) > 1: + baseline = "gemma-4-E4B-it.litertlm" if "gemma-4-E4B-it.litertlm" in models else models[0] + others = [m for m in models if m != baseline] + md.append("## Cross-model comparison\n") + md.append( + f"Ratios below are **{_short_model_label(baseline)} ÷ {_short_model_label(others[0])}**, " + "so values **> 1.0× mean the smaller model is faster** at the same backend×k. " + "GPU compares prefill bandwidth-dominance; CPU exposes raw compute-cost scaling with parameter count." 
+ ) + md.append("") + for other in others: + md.append(f"### {_short_model_label(baseline)} vs {_short_model_label(other)}") + md.append("") + md.append("**Total query latency (median, seconds)**") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "total_query_ms", fmt_s) + md.append("**TTFT (median, ms)** — prefill speedup") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "ttft_ms", fmt_ms) + md.append("**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU") + md.append("") + _write_cross_model_table(md, matrix, baseline, other, all_ks, "decode_ms", fmt_ms) + + md.append("## Errors and the 4096-token context wall\n") + md.append("At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every " + "(model × backend) combination tested: ") + md.append("`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. ") + md.append("Each failure reports `Input token ids are too long. Exceeding the maximum " + "number of tokens allowed: …>= 4096`. ") + md.append("Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; " + "the wall is a property of the `.litertlm` artifact format, not the " + "parameter count or backend. **k_max ≈ 17–18** for both models.") + md.append("") + + md.append("## Key findings\n") + md.append("### 1. Prefill (TTFT) scales ~2× with parameter count on both backends") + md.append("Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** " + "and **~2.3–3.2× on CPU**. Prefill is compute-heavy (one parallel forward pass over the " + "entire prompt), so halving the parameter count halves the compute and the speedup is " + "near-proportional on both backends.") md.append("") - md.append("### 2. The model's 4096-token context window is the binding ceiling at high k") - md.append("k=15 works cleanly (54/54 on both GPU and CPU). 
k=20 fails identically on **both backends** — ") - md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ") - md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ") - md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ") - md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ") - md.append("Latency is *not* the constraint at the upper end; the model's context window is.") + md.append("### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU") + md.append("Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is " + "sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding " + "weights into compute units — the smaller model helps less than its parameter count " + "would predict. On CPU the constraint is compute, so the speedup tracks the model shrink.") md.append("") - md.append("### 3. Latency is not the binding factor on GPU below k=15") - md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ") - md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ") - md.append("not by what fits in the latency budget.") + md.append("### 3. Total speedup is decode-dominated, hence smaller than TTFT") + md.append("**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since " + "decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks " + "decode rather than prefill. At high k where prefill grows large, total speedup climbs " + "toward the prefill ratio (~1.7–1.9× GPU at k=15+).") md.append("") - md.append("### 4. 
CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow") - md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ") - md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ") - md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ") - md.append("or **k ≤ 1** if you want sub-40s p95.") + md.append("### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier") + md.append("E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend " + "where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), " + "which means devices that previously could *not* deploy MAM-AI at acceptable latency " + "(mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: " + "ship E2B on CPU, restrict k to small values.") md.append("") - md.append("### 5. Decode time is content-driven, not k-driven") - md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ") - md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ") - md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ") - md.append("not compute-bound on this hardware.") + md.append("### 5. 4096-token context wall is the binding ceiling at high k") + md.append("k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically " + "across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the " + "model artifact, not the runtime, and is **shared between E4B and E2B**. " + "**Latency is not the constraint at the upper end of k — context window is.**") md.append("") md.append("### 6. 
TTFT scales linearly with retrieved-doc content past k=3") - md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ") - md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ") - md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.") + md.append("On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so " + "the prefill story scales predictably. The model shrink translates directly into a TTFT " + "shrink across the whole range.") md.append("") # File inventory - md.append("## Data inventory (per `(backend, k)`)\n") - md.append("| Backend | k | File | Wall (min) | Runs | Errors |") - md.append("|---|---:|---|---:|---:|---:|") - for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])): - r = matrix[(b, k)] + md.append("## Data inventory (per `(model, backend, k)`)\n") + md.append("| Model | Backend | k | File | Wall (min) | Runs | Errors |") + md.append("|---|---|---:|---|---:|---:|---:|") + for (m, b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1], x[2])): + r = matrix[(m, b, k)] wall = r["data"]["total_benchmark_time_ms"] / 60000 n = len(r["data"]["results"]) e = sum(1 for x in r["data"]["results"] if x.get("error")) - label = "0 (no-RAG)" if k == 0 else str(k) - md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") + k_label = "0 (no-RAG)" if k == 0 else str(k) + md.append(f"| {_short_model_label(m)} | {b} | {k_label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") md.append("") md.append("---") md.append("") @@ -407,7 +514,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: def main() -> int: runs = load_runs() - print(f"Loaded {len(runs)} canonical runs") + models = sorted(set(r["model"] for r in runs)) + print(f"Loaded {len(runs)} canonical runs across {len(models)} models: {', '.join(models)}") out = 
Path(__file__).resolve().parent / "reports" / "latency_report_v2.md" write_report(runs, out) return 0 From f67b4d2bceb7e7d678d4a05c9566f0ae8c6cfcb6 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 08:04:56 +0800 Subject: [PATCH 05/14] analysis: regenerate latency_report_v2.md with E2B columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-run aggregate_k_sweep.py over the full set of 32 canonical runs (16 E4B + 16 E2B). The report now has: - Per-model sections for both Gemma 4 E4B and Gemma 4 E2B, each with the existing six tables (headline / TTFT / decode / p95 / errors / wall-clock). - A new cross-model comparison section with E4B-vs-E2B ratio tables for total query latency, TTFT, and decode. - Rewritten Key findings to reflect the measured speedup ratios: ~1.5× total GPU, ~2.2× total CPU. Prefill is compute-bound on both backends (~2.3× speedup); decode is bandwidth-bound on GPU (~1.5×) and compute-bound on CPU (~2×). The architectural story cleanly explains why total speedup is decode-dominated at low k and climbs toward the prefill ratio at high k. The 4096-token context wall is now confirmed across all four (model × backend) combinations: same 8 queries × 3 reps = 24 errors on each, anchored as a property of the .litertlm artifact format. 
--- evaluation/reports/latency_report_v2.md | 273 +++++++++++++++++------- 1 file changed, 196 insertions(+), 77 deletions(-) diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index c6745a6..02abadb 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,26 +1,108 @@ -# MAM-AI On-Device Latency Sweep — GPU vs CPU +# MAM-AI On-Device Latency Sweep — Model × Backend × k -_Generated: 2026-05-15T10:51:06_ +_Generated: 2026-05-16T08:02:21_ ## Device & stack - **Device**: OnePlus OPD2413 (SM8750P) — Android 15 -- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) +- **Models tested**: Gemma 4 E2B (`gemma-4-E2B-it.litertlm`), Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) - **LiteRT-LM**: 0.11.0 - **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU - **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000 ## Methodology -Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. +Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per (model × backend) (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. - `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token. - `decode` is first-token to last-token. - `total_query` is everything: `retrieval + TTFT + decode`. - Reported as median across the 54 runs unless noted (p95 in tables marked `p95`). 
-## Headline — Median total query latency (seconds) +## Gemma 4 E2B (`gemma-4-E2B-it.litertlm`) + +### Median total query latency (seconds) + +| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 7.9 / 8.1 / 10.8 | 13.2 / 14.1 / 16.0 | 1.60× | +| 1 | 561 | 11.4 / 11.8 / 12.8 | 13.0 / 16.3 / 17.5 | 1.35× | +| 3 | 2098 | 12.8 / 13.8 / 16.5 | 19.1 / 22.0 / 22.5 | 1.44× | +| 5 | 3547 | 9.9 / 14.2 / 14.0 | 26.3 / 27.6 / 28.6 | 2.36× | +| 7 | 5139 | 12.8 / 14.3 / 17.6 | 23.5 / 32.0 / 33.2 | 1.87× | +| 10 | 7482 | 15.2 / 14.6 / 17.9 | 23.4 / 26.2 / 27.7 | 1.68× | +| 15 | 11297 | 13.0 / 12.4 / 14.8 | 31.0 / 38.2 / 40.7 | 2.80× | +| 20 | 14520 | 19.3 / 15.8 / 14.3 | 33.4 / 39.8 / 44.5 | 2.28× | + +### TTFT (ms, median) + +| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 429 | 5564 | 13.0× | +| 1 | 561 | 412 | 5355 | 13.0× | +| 3 | 2098 | 445 | 7394 | 16.6× | +| 5 | 3547 | 793 | 14604 | 18.4× | +| 7 | 5139 | 819 | 14577 | 17.8× | +| 10 | 7482 | 1074 | 13635 | 12.7× | +| 15 | 11297 | 1479 | 21368 | 14.4× | +| 20 | 14520 | 1722 | 22947 | 13.3× | + +### Decode (ms, median) + +| k | GPU decode | CPU decode | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 8263 | 8174 | 0.99× | +| 1 | 7573 | 6764 | 0.89× | +| 3 | 10223 | 9584 | 0.94× | +| 5 | 9052 | 9571 | 1.06× | +| 7 | 10723 | 13451 | 1.25× | +| 10 | 10713 | 11870 | 1.11× | +| 15 | 9664 | 9920 | 1.03× | +| 20 | 11036 | 10697 | 0.97× | + +### p95 total query latency (s) + +| k | GPU p95 | CPU p95 | +|---:|---:|---:| +| **0 (no-RAG)** | 11.4 | 17.4 | +| 1 | 17.7 | 19.1 | +| 3 | 19.7 | 35.8 | +| 5 | 21.2 | 35.1 | +| 7 | 19.4 | 41.0 | +| 10 | 23.8 | 37.9 | +| 15 | 18.1 | 45.2 | +| 20 | 22.2 | 50.4 | + +### Errors (count / 54 runs) + +| k | GPU errors | CPU errors | +|---:|---:|---:| +| **0 (no-RAG)** | 0 | 0 | +| 1 | 0 | 0 | +| 3 | 0 | 0 | +| 5 | 0 | 0 | +| 7 | 0 | 0 | +| 10 | 0 | 0 | +| 15 
| 0 | 0 | +| 20 | 24 | 24 | + +### Wall-clock + +| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 17.5 | 22.5 | 1.28× | +| 1 | 20.9 | 23.9 | 1.14× | +| 3 | 22.4 | 30.0 | 1.34× | +| 5 | 21.1 | 34.2 | 1.62× | +| 7 | 22.8 | 35.5 | 1.56× | +| 10 | 23.3 | 33.9 | 1.46× | +| 15 | 21.1 | 41.7 | 1.97× | +| 20 | 19.1 | 30.4 | 1.59× | + +## Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) + +### Median total query latency (seconds) | k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | |---:|---:|---:|---:|---:| @@ -33,7 +115,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed | 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× | | 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× | -## TTFT (ms, median) — prefill cost grows with retrieved-doc content +### TTFT (ms, median) | k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | |---:|---:|---:|---:|---:| @@ -46,10 +128,7 @@ Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed | 15 | 11297 | 3457 | 54748 | 15.8× | | 20 | 14520 | 3986 | 72881 | 18.3× | -## Decode (ms, median) — first token to last token - -Decode time mostly tracks output length, not k or doc content. Variation across k reflects -the model writing *longer answers* when given more context (more material to draw on). 
+### Decode (ms, median) | k | GPU decode | CPU decode | CPU÷GPU | |---:|---:|---:|---:| @@ -62,7 +141,7 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 16820 | 22497 | 1.34× | | 20 | 14688 | 22634 | 1.54× | -## p95 total query latency (s) — tail-latency view +### p95 total query latency (s) | k | GPU p95 | CPU p95 | |---:|---:|---:| @@ -75,9 +154,9 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 30.6 | 112.7 | | 20 | 35.3 | 104.9 | -## Errors and the 4096-token context wall +### Errors (count / 54 runs) -| k | GPU errors / 54 | CPU errors / 54 | +| k | GPU errors | CPU errors | |---:|---:|---:| | **0 (no-RAG)** | 0 | 0 | | 1 | 0 | 0 | @@ -88,17 +167,7 @@ the model writing *longer answers* when given more context (more material to dra | 15 | 0 | 0 | | 20 | 24 | 24 | -At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. -Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both -backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — -the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of -the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. -The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter. - -Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any -deployment budget at this depth even when the request fits in the context window. 
- -## Wall-clock comparison +### Wall-clock | k | GPU wall (min) | CPU wall (min) | CPU÷GPU | |---:|---:|---:|---:| @@ -111,64 +180,114 @@ deployment budget at this depth even when the request fits in the context window | 15 | 32.4 | 90.8 | 2.80× | | 20 | 22.8 | 58.6 | 2.57× | +## Cross-model comparison + +Ratios below are **Gemma 4 E4B ÷ Gemma 4 E2B**, so values **> 1.0× mean the smaller model is faster** at the same backend×k. GPU compares prefill bandwidth-dominance; CPU exposes raw compute-cost scaling with parameter count. + +### Gemma 4 E4B vs Gemma 4 E2B + +**Total query latency (median, seconds)** + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | +|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 14.4 | 8.7 | 1.66× | 28.0 | 13.9 | 2.01× | +| 1 | 14.1 | 11.7 | 1.21× | 30.3 | 15.8 | 1.92× | +| 3 | 19.1 | 14.3 | 1.33× | 42.7 | 20.6 | 2.07× | +| 5 | 19.6 | 11.6 | 1.70× | 60.2 | 27.2 | 2.21× | +| 7 | 22.9 | 15.2 | 1.50× | 62.3 | 28.5 | 2.18× | +| 10 | 22.4 | 15.6 | 1.43× | 69.4 | 26.3 | 2.64× | +| 15 | 24.4 | 13.1 | 1.86× | 84.9 | 36.8 | 2.31× | +| 20 | 21.0 | 16.5 | 1.28× | 93.8 | 37.6 | 2.49× | + +**TTFT (median, ms)** — prefill speedup + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | +|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 962 | 429 | 2.24× | 12633 | 5564 | 2.27× | +| 1 | 954 | 412 | 2.32× | 12649 | 5355 | 2.36× | +| 3 | 989 | 445 | 2.22× | 18356 | 7394 | 2.48× | +| 5 | 1884 | 793 | 2.38× | 36424 | 14604 | 2.49× | +| 7 | 1920 | 819 | 2.34× | 36444 | 14577 | 2.50× | +| 10 | 2523 | 1074 | 2.35× | 40013 | 13635 | 2.93× | +| 15 | 3457 | 1479 | 2.34× | 54748 | 21368 | 2.56× | +| 20 | 3986 | 1722 | 2.31× | 72881 | 22947 | 3.18× | + +**Decode (median, ms)** — bandwidth-limited on GPU, compute-limited on CPU + +| k | Gemma 4 E4B GPU | Gemma 4 E2B GPU | GPU ratio | Gemma 4 E4B CPU | Gemma 4 E2B CPU | CPU ratio | 
+|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 13470 | 8263 | 1.63× | 15345 | 8174 | 1.88× | +| 1 | 11415 | 7573 | 1.51× | 13961 | 6764 | 2.06× | +| 3 | 16364 | 10223 | 1.60× | 19110 | 9584 | 1.99× | +| 5 | 15929 | 9052 | 1.76× | 21645 | 9571 | 2.26× | +| 7 | 17215 | 10723 | 1.61× | 23473 | 13451 | 1.75× | +| 10 | 18118 | 10713 | 1.69× | 21699 | 11870 | 1.83× | +| 15 | 16820 | 9664 | 1.74× | 22497 | 9920 | 2.27× | +| 20 | 14688 | 11036 | 1.33× | 22634 | 10697 | 2.12× | + +## Errors and the 4096-token context wall + +At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every (model × backend) combination tested: +`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. +Each failure reports `Input token ids are too long. Exceeding the maximum number of tokens allowed: …>= 4096`. +Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; the wall is a property of the `.litertlm` artifact format, not the parameter count or backend. **k_max ≈ 17–18** for both models. + ## Key findings +### 1. Prefill (TTFT) scales ~2× with parameter count on both backends +Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** and **~2.3–3.2× on CPU**. Prefill is compute-heavy (one parallel forward pass over the entire prompt), so halving the parameter count halves the compute and the speedup is near-proportional on both backends. + +### 2. Decode is bandwidth-bound on GPU, compute-bound on CPU +Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding weights into compute units — the smaller model helps less than its parameter count would predict. On CPU the constraint is compute, so the speedup tracks the model shrink. + +### 3. Total speedup is decode-dominated, hence smaller than TTFT +**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. 
Total = TTFT + decode + retrieval; since decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks decode rather than prefill. At high k where prefill grows large, total speedup climbs toward the prefill ratio (~1.7–1.9× GPU at k=15+). + +### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier +E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), which means devices that previously could *not* deploy MAM-AI at acceptable latency (mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: ship E2B on CPU, restrict k to small values. -### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite -GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. -That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), -so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency. - -### 2. The model's 4096-token context window is the binding ceiling at high k -k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — -the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. -Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives -the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, -not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. -Latency is *not* the constraint at the upper end; the model's context window is. - -### 3. Latency is not the binding factor on GPU below k=15 -GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
-Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), -not by what fits in the latency budget. - -### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow -CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. -p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't -available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, -or **k ≤ 1** if you want sub-40s p95. - -### 5. Decode time is content-driven, not k-driven -Decode time tracks output length. As k grows, the model writes *longer* responses — likely because -more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. -Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, -not compute-bound on this hardware. +### 5. 4096-token context wall is the binding ceiling at high k +k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the model artifact, not the runtime, and is **shared between E4B and E2B**. **Latency is not the constraint at the upper end of k — context window is.** ### 6. TTFT scales linearly with retrieved-doc content past k=3 -On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, -CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting -the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both. 
- -## Data inventory (per `(backend, k)`) - -| Backend | k | File | Wall (min) | Runs | Errors | -|---|---:|---|---:|---:|---:| -| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | -| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | -| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | -| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | -| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | -| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | -| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | -| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | -| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | -| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | -| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | -| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | -| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | -| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | -| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | -| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | +On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so the prefill story scales predictably. The model shrink translates directly into a TTFT shrink across the whole range. 
+ +## Data inventory (per `(model, backend, k)`) + +| Model | Backend | k | File | Wall (min) | Runs | Errors | +|---|---|---:|---|---:|---:|---:| +| Gemma 4 E2B | CPU | 0 (no-RAG) | `benchmark_20260515T223100.json` | 22.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 1 | `benchmark_20260515T183910_k1.json` | 23.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 3 | `benchmark_20260515T190320_k3.json` | 30.0 | 54 | 0 | +| Gemma 4 E2B | CPU | 5 | `benchmark_20260515T193337_k5.json` | 34.2 | 54 | 0 | +| Gemma 4 E2B | CPU | 7 | `benchmark_20260515T200805_k7.json` | 35.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 10 | `benchmark_20260515T204358_k10.json` | 33.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 15 | `benchmark_20260515T211813_k15.json` | 41.7 | 54 | 0 | +| Gemma 4 E2B | CPU | 20 | `benchmark_20260515T220014_k20.json` | 30.4 | 54 | 24 | +| Gemma 4 E2B | GPU | 0 (no-RAG) | `benchmark_20260515T175744.json` | 17.5 | 54 | 0 | +| Gemma 4 E2B | GPU | 1 | `benchmark_20260515T152447_k1.json` | 20.9 | 54 | 0 | +| Gemma 4 E2B | GPU | 3 | `benchmark_20260515T154608_k3.json` | 22.4 | 54 | 0 | +| Gemma 4 E2B | GPU | 5 | `benchmark_20260515T160846_k5.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 7 | `benchmark_20260515T163011_k7.json` | 22.8 | 54 | 0 | +| Gemma 4 E2B | GPU | 10 | `benchmark_20260515T165316_k10.json` | 23.3 | 54 | 0 | +| Gemma 4 E2B | GPU | 15 | `benchmark_20260515T171649_k15.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 20 | `benchmark_20260515T173816_k20.json` | 19.1 | 54 | 24 | +| Gemma 4 E4B | CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | +| Gemma 4 E4B | CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | +| Gemma 4 E4B | CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | +| Gemma 4 E4B | CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | +| Gemma 4 E4B | CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | +| Gemma 4 E4B | CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | +| Gemma 4 E4B | CPU | 15 | 
`benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | +| Gemma 4 E4B | CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | +| Gemma 4 E4B | GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | +| Gemma 4 E4B | GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | +| Gemma 4 E4B | GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | +| Gemma 4 E4B | GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | +| Gemma 4 E4B | GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | +| Gemma 4 E4B | GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | +| Gemma 4 E4B | GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | +| Gemma 4 E4B | GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | --- From f560256cc2189aa2f86e0667d882d04d0f9fe078 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 08:05:08 +0800 Subject: [PATCH 06/14] docs: update device_compatibility_notes.md with E2B measurements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the projected E2B latency table in §2 with measured numbers from the 2026-05-16 E2B sweep on Snapdragon 8 Elite. The measured speedup ratio is ~1.5× on GPU and ~2× on CPU (not the originally projected uniform 2×), and the architectural reason is now spelled out in latency_report_v2.md: GPU prefill is compute-bound, GPU decode is bandwidth-bound, CPU is compute-bound throughout. Specific changes: - §6 Open questions: mark "Actual E2B CPU latency" as resolved with a pointer to the new report. Add a new follow-up question about validating the mid-tier MediaTek 2×-slowdown extrapolation on real hardware. - §2 Backend × model × k feasibility: replace projected E2B table with measured Snapdragon 8 Elite values; keep the MediaTek row flagged as extrapolation and call that out in the section preamble. 
- TL;DR: add a fourth rule covering E2B CPU's newly-measured deployment envelope (k=10 comfortable on flagship CPU, k=3–5 borderline on mid-tier). Why: the original notes shipped with projections marked clearly as "halve E4B numbers". With real measurements in hand, those rows now carry actual data, and the deployment-relevant rule of thumb ("E2B CPU opens up the no-GPU device tier") is anchored on numbers rather than a speculative ratio. --- .../reports/device_compatibility_notes.md | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md index 7cef7a7..8cd37ad 100644 --- a/evaluation/reports/device_compatibility_notes.md +++ b/evaluation/reports/device_compatibility_notes.md @@ -1,12 +1,13 @@ # MAM-AI Device Compatibility — On Which Phones the Model Can Run -_Last updated: 2026-05-15. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._ +_Last updated: 2026-05-16. Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._ -## TL;DR — three load-bearing rules +## TL;DR — four load-bearing rules 1. **E4B minimum RAM: 6 GB** total. 4 GB phones cannot run E4B reliably (model alone needs ~3.3 GB at runtime; Android + bundled apps eat 1.5–2 GB). 2. **E2B minimum RAM: 4 GB** total. The smaller model halves the runtime memory footprint (~1.7 GB), opening up the $100–$150 device tier that's the largest slice of the African market. 3. **E4B on CPU: k=3 is the borderline.** Beyond k=3, CPU totals exceed the 60 s budget on most mid-tier silicon. **E4B on GPU: no latency worry** — totals stay 13–25 s across k=0–15 on Snapdragon 8 Elite + Adreno. +4. 
**E2B on CPU: k=10 is comfortable on flagship CPU; k=3–5 on mid-tier MediaTek.** Measured E2B CPU at k=10 on Snapdragon 8 Elite is 26 s median; extrapolating ~2× slower for mid-tier MediaTek gives ~50 s at k=5–7 (borderline). This is the deployment-relevant change from the May 2026 sweep: E2B CPU is **~2× faster than E4B CPU**, not the originally-projected speedup — and that means CPU-only deployment is finally viable up to mid-range k on the no-GPU device tier. The catch — covered in §3 below — is that **GPU only works reliably on Adreno** (Snapdragon). For the bulk of the African deployment fleet (MediaTek + Mali GPUs), **plan as CPU-only** and treat any GPU acceleration as a bonus, not a guarantee. @@ -63,27 +64,29 @@ At hard minimum, the app will install and run but will be vulnerable to OOM kill ## 2. Backend × model × k feasibility (UX at 60 s budget) -Median total query latency targets, measured on Snapdragon 8 Elite (test device) and extrapolated for mid-tier MediaTek (~2× slower CPU than 8 Elite). E2B numbers are projections (~2× faster than E4B on the same hardware) until we collect actual measurements. +Median total query latency. Snapdragon 8 Elite rows are measured (see `latency_report_v2.md`). Mid-tier MediaTek rows are extrapolated by scaling CPU latency ~2× slower than Snapdragon 8 Elite — anchored on the published Geekbench gap between Dimensity 8400 / Helio G99 and the Snapdragon 8 Elite, not on an in-house measurement. 
**Empirical measurement on real MediaTek hardware is the next open question; see §6.** -### Gemma 4 E4B +### Gemma 4 E4B (measured) | Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 | |---|---|---|---|---|---| | **GPU, Snapdragon 8 Elite (Adreno 830)** | 13 s ✅ | 19 s ✅ | 20 s ✅ | 21 s ✅ | 24 s ✅ — **no worry at any k ≤ 15** | -| **CPU, Snapdragon 8 Elite** | 27 s ✅ | 41 s ✅ | 60 s 🟡 | 70 s ❌ | 85 s ❌ | -| CPU, mid-tier MediaTek (~2× slower) | ~50 s 🟡 | ~80 s ❌ | — | — | — | +| **CPU, Snapdragon 8 Elite** | 28 s ✅ | 43 s ✅ | 60 s 🟡 | 69 s ❌ | 85 s ❌ | +| CPU, mid-tier MediaTek (~2× slower) | ~56 s 🟡 | ~85 s ❌ | — | — | — | → For E4B: **CPU is unsafe past k=3** on flagship hardware, and unsafe at any k > 0 on mid-tier. GPU works at all k tested. -### Gemma 4 E2B (projected, halve E4B numbers) +### Gemma 4 E2B (measured 2026-05-16) -| Backend × hardware tier | k=0 | k=3 | k=5 | k=10 | k=15 | +Measured E2B is **~1.5× faster than E4B on GPU** (decode is bandwidth-bound, limits the win) and **~2× faster on CPU** (compute-bound — the smaller model's compute reduction translates more directly). See `latency_report_v2.md` for the per-k speedup ratios. + +| Backend × hardware tier | k=0 (no-RAG) | k=3 | k=5 | k=10 | k=15 | |---|---|---|---|---|---| -| GPU, Snapdragon 8 Elite | ~6 s ✅ | ~10 s ✅ | ~10 s ✅ | ~11 s ✅ | ~12 s ✅ | -| CPU, Snapdragon 8 Elite | ~13 s ✅ | ~20 s ✅ | ~30 s ✅ | ~35 s ✅ | ~42 s ✅ | -| **CPU, mid-tier MediaTek** | ~25 s ✅ | ~40 s ✅ | ~55 s 🟡 | ~70 s ❌ | — | +| **GPU, Snapdragon 8 Elite (Adreno 830)** | 9 s ✅ | 14 s ✅ | 12 s ✅ | 16 s ✅ | 13 s ✅ — **no worry at any k ≤ 15** | +| **CPU, Snapdragon 8 Elite** | 14 s ✅ | 21 s ✅ | 27 s ✅ | 26 s ✅ | 37 s ✅ | +| **CPU, mid-tier MediaTek (~2× slower)** | ~28 s ✅ | ~41 s ✅ | ~54 s 🟡 | ~53 s 🟡 | ~74 s ❌ | -→ For E2B on mid-tier MediaTek CPU, k≤3 is comfortable; k≤5 is borderline. **Empirical measurement still pending.** +→ For E2B on flagship CPU, **all k ≤ 15 fit a 60 s budget**. 
On mid-tier MediaTek CPU, **k ≤ 3 is comfortable, k=5–10 is borderline, k=15 exceeds budget.** This is the key deployment unlock: the no-GPU, mid-tier-CPU path is finally viable for typical k. --- @@ -144,8 +147,9 @@ Combining the SoC distribution data with the floor specs above: | Question | How to answer | Priority | |---|---|---| -| Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite | Same `benchmark_latency.py` sweep run we did for E4B, with the E2B model swapped in | High — unblocks the E2B-vs-E4B deployment decision | +| ~~Actual E2B CPU latency at k=0/3/5/7/10/15 on Snapdragon 8 Elite~~ | **Resolved 2026-05-16** — measured E2B CPU is ~2× faster than E4B CPU at every k; see `latency_report_v2.md` cross-model tables. E2B CPU at k=10 = 26 s; k=15 = 37 s; both under 60 s budget on flagship. | ~~High~~ ✅ Done | | Does GPU backend engage on real Transsion / MediaTek mid-tier devices? | Borrow / buy a Tecno Camon 30 or Infinix Note 40 and run benchmark with `useGpuForLlm=true`; check `[BACKEND]` log line | High — answers whether GPU is realistic for the deployment majority | +| Does the mid-tier MediaTek CPU 2× slowdown extrapolation hold in practice? 
| Once a Tecno/Infinix mid-tier is in hand, run the full k-sweep on CPU and compare to the projected `~2× slower` table in §2 | High — anchors the deployment recommendation on real numbers, not Geekbench-based extrapolation | | E2B answer-quality regression vs E4B on safety-critical medical-advice metrics | Re-run `eval_report_app_parity_v1.md` apparatus with E2B model | Critical before any model swap decision | | Does Exynos Xclipse driver bug get fixed upstream | Watch LiteRT-LM Issue #2114 | Low — affects ~5% of African market | | When does E4B Qualcomm SM8750 NPU artifact ship | Watch `litert-community/` HF repo monthly per Issue #58 | Medium — perf upgrade, not a deployment unblocker | From 8ee394850d015b43c2de15ee79aae4900a369c0c Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 08:42:54 +0800 Subject: [PATCH 07/14] docs: add one-line deployment summary to device_compatibility_notes.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Open the doc with a bolded single sentence that captures the deployment-relevant takeaway: which (model × backend × k) cells fit a 60 s latency budget on the Snapdragon 8 Elite test device, anchored on the RAM floors and the 4096-token context wall. Why: a reader skimming for "can I ship E2B on CPU?" should get the answer from the first line, before the TL;DR's four rules and the detail tables. --- evaluation/reports/device_compatibility_notes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md index 8cd37ad..6b5bdd3 100644 --- a/evaluation/reports/device_compatibility_notes.md +++ b/evaluation/reports/device_compatibility_notes.md @@ -2,6 +2,8 @@ _Last updated: 2026-05-16. 
Companion to `latency_report_v2.md` (timing data) and the NPU feasibility report (`mamaretrieval/notes/npu_feasibility_report.md`)._ +**On the Snapdragon 8 Elite test device under a 60 s latency budget, E4B (6 GB RAM floor) deploys only on GPU across all RAG depths k ≤ 15 or on CPU at k ≤ 3, while E2B (4 GB RAM floor) deploys on both GPU and CPU across all k ≤ 15 — with k = 20 ruled out for both models by the 4096-token context wall, regardless of backend.** + ## TL;DR — four load-bearing rules 1. **E4B minimum RAM: 6 GB** total. 4 GB phones cannot run E4B reliably (model alone needs ~3.3 GB at runtime; Android + bundled apps eat 1.5–2 GB). From b6346fea3144beca7f46b355f283f62e35ba554e Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:02:18 +0800 Subject: [PATCH 08/14] review: generalize cross-model comparison intro in aggregator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comparison-section intro previously named the first comparator model explicitly (`others[0]`), which would silently mislead readers if a third model were ever added to the matrix — the intro would still mention only one comparator while the loop below rendered a table per `other`. Reword the intro to describe the comparison generically (baseline ÷ each comparator) and list all comparators inline so it self-describes no matter how many models the matrix contains. Also align the architectural-context phrasing with the Key-findings section: GPU prefill is compute-bound (tracks parameter count), GPU decode is bandwidth-bound (gains less from shrinkage), CPU is compute-bound throughout. Regenerate the report. Addresses Copilot review comment on PR #59. 
--- evaluation/aggregate_k_sweep.py | 14 ++++++++++---- evaluation/reports/latency_report_v2.md | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 6142299..31c9c0f 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -417,15 +417,21 @@ def write_report(runs: list[dict], out_path: Path) -> None: _write_per_model_section(md, matrix, m, all_ks) # ─────────── Cross-model comparison ─────────── - # Use E4B as baseline when present; ratio is E4B/E2B so >1 means E2B is faster. + # Use E4B as baseline when present; ratio is baseline/other so >1 means + # the (smaller) comparator model is faster on that cell. if len(models) > 1: baseline = "gemma-4-E4B-it.litertlm" if "gemma-4-E4B-it.litertlm" in models else models[0] others = [m for m in models if m != baseline] + others_label = ", ".join(_short_model_label(m) for m in others) md.append("## Cross-model comparison\n") md.append( - f"Ratios below are **{_short_model_label(baseline)} ÷ {_short_model_label(others[0])}**, " - "so values **> 1.0× mean the smaller model is faster** at the same backend×k. " - "GPU compares prefill bandwidth-dominance; CPU exposes raw compute-cost scaling with parameter count." + f"Each table below compares **{_short_model_label(baseline)}** " + f"(baseline) against each comparator model ({others_label}). " + "Ratios are reported as **baseline ÷ comparator** at the same " + "backend × k cell, so values **> 1.0× mean the comparator is faster**. " + "Reading the columns: GPU prefill (TTFT) is compute-bound and tracks " + "parameter count closely; GPU decode is bandwidth-bound and gains less " + "from model shrinkage; CPU is compute-bound throughout." 
) md.append("") for other in others: diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 02abadb..9d26a86 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — Model × Backend × k -_Generated: 2026-05-16T08:02:21_ +_Generated: 2026-05-16T09:00:40_ ## Device & stack @@ -182,7 +182,7 @@ Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats ## Cross-model comparison -Ratios below are **Gemma 4 E4B ÷ Gemma 4 E2B**, so values **> 1.0× mean the smaller model is faster** at the same backend×k. GPU compares prefill bandwidth-dominance; CPU exposes raw compute-cost scaling with parameter count. +Each table below compares **Gemma 4 E4B** (baseline) against each comparator model (Gemma 4 E2B). Ratios are reported as **baseline ÷ comparator** at the same backend × k cell, so values **> 1.0× mean the comparator is faster**. Reading the columns: GPU prefill (TTFT) is compute-bound and tracks parameter count closely; GPU decode is bandwidth-bound and gains less from model shrinkage; CPU is compute-bound throughout. ### Gemma 4 E4B vs Gemma 4 E2B From d66bb0e7875a5ca6373715557f55641aa9e950b6 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:02:47 +0800 Subject: [PATCH 09/14] review: align test-device naming to OnePlus OPD2413 across docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark JSONs record `device.manufacturer="OnePlus"`, but the runbook and the §5 deployment-market table in device_compatibility_notes both called the device "OPPO" (OPPO Find X8). Same physical hardware, different brand label — OPPO and OnePlus share platforms and the OPD2413 ships under both brands depending on market — but the inconsistency makes it hard for a reader to cross-reference the runbook against the regenerated latency report. 
Settle on "OnePlus OPD2413" (firmware-reported) as the canonical name and add the OPPO branding parenthetically wherever the term first appears in a doc. Addresses Copilot review comment on PR #59. --- evaluation/reports/device_compatibility_notes.md | 2 +- evaluation/runbooks/e2b_sweep.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md index 6b5bdd3..04486ca 100644 --- a/evaluation/reports/device_compatibility_notes.md +++ b/evaluation/reports/device_compatibility_notes.md @@ -139,7 +139,7 @@ Combining the SoC distribution data with the floor specs above: | $100–$150 low-mid | Tecno Camon, Infinix Hot Pro+, Redmi 13C | Helio G99, Dimensity 6080 | 6 GB | ✅ tight | ✅ comfortable | ⚠️ uncertain | | $150–$250 mid | Tecno Camon 30, Infinix Note 40, Redmi Note 13, Samsung A25 | Dimensity 7050/7200/8400 | 8 GB | ✅ | ✅ | ⚠️ uncertain (Mali) | | $250+ upper-mid | OnePlus Nord, Samsung A5x | Snapdragon 7+ Gen 3 | 8 GB | ✅ | ✅ | ✅ Adreno | -| $400+ flagship | OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) | +| $400+ flagship | OnePlus OPD2413 / OPPO Find X8 (our test device), Pixel, Galaxy S | Snapdragon 8 Elite, Dimensity 9400, Tensor | 12+ GB | ✅ | ✅ | ✅ Adreno (Pixel ❌) | **Effective deployment-viable hardware floor**: roughly **$120+ retail**, 6 GB RAM, 64 GB storage, any 64-bit chipset from 2022 or later. E2B lowers this to **~$100**, 4 GB RAM. diff --git a/evaluation/runbooks/e2b_sweep.md b/evaluation/runbooks/e2b_sweep.md index 848d44b..85b4c62 100644 --- a/evaluation/runbooks/e2b_sweep.md +++ b/evaluation/runbooks/e2b_sweep.md @@ -6,7 +6,7 @@ Self-contained instructions for finishing the E2B latency sweep started by anoth - This work mirrors the E4B latency sweep that landed in PR #57 (commit `1be0a55` on `main`). 
The E4B results are in `evaluation/reports/latency_report_v2.md` and the device-compatibility analysis is in `evaluation/reports/device_compatibility_notes.md`. - We're now measuring the **smaller** Gemma 4 E2B variant (~2 GB instead of E4B's 3.66 GB) to find out how much faster it is in real terms on the same hardware. Same 16 measurements as E4B: 8 GPU (k ∈ {1, 3, 5, 7, 10, 15, 20} + No-RAG) + 8 CPU. -- Test device: **OPPO OPD2413 (Snapdragon 8 Elite, SM8750P)** connected via ADB. OPPO Hans battery-optimization whitelist is **already configured** by the user — don't re-do it. +- Test device: **OnePlus OPD2413 (Snapdragon 8 Elite, SM8750P)** connected via ADB — that's the firmware-reported manufacturer (`device.manufacturer="OnePlus"` in the benchmark JSONs); the same OPD2413 hardware ships under the OPPO brand in some markets. The OPPO/OnePlus Hans battery-optimization whitelist is **already configured** by the user — don't re-do it. - The benchmark infrastructure is in `evaluation/benchmark_latency.py`; the aggregator is `evaluation/aggregate_k_sweep.py`. Both are already correct for this work, with one expected exception in Phase 4 (the aggregator needs a `model` dimension added). ### Why this is a runbook and not a single Bash command From 75a6c5b861d8e2661b3475ffc31ab176a8f97060 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:02:55 +0800 Subject: [PATCH 10/14] =?UTF-8?q?review:=20clarify=20TL;DR=20rule=20#4=20?= =?UTF-8?q?=E2=80=94=20distinguish=20projected=20vs=20measured=20ratios?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous wording said E2B CPU is "~2× faster than E4B CPU, not the originally-projected speedup," which reads as a contradiction since the original notes projected a uniform ~2× speedup across both backends. 
The thing that actually diverged from the projection is **GPU total speedup** (~1.5×, not the projected 2×), and the reason is architectural: GPU decode is bandwidth-bound and benefits less from parameter-count shrinkage. CPU matches the projection. Rewrite the rule to make the backend split explicit instead of contrasting CPU against an unspecified projection. Addresses Copilot review comment on PR #59. --- evaluation/reports/device_compatibility_notes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/reports/device_compatibility_notes.md b/evaluation/reports/device_compatibility_notes.md index 04486ca..8213775 100644 --- a/evaluation/reports/device_compatibility_notes.md +++ b/evaluation/reports/device_compatibility_notes.md @@ -9,7 +9,7 @@ _Last updated: 2026-05-16. Companion to `latency_report_v2.md` (timing data) and 1. **E4B minimum RAM: 6 GB** total. 4 GB phones cannot run E4B reliably (model alone needs ~3.3 GB at runtime; Android + bundled apps eat 1.5–2 GB). 2. **E2B minimum RAM: 4 GB** total. The smaller model halves the runtime memory footprint (~1.7 GB), opening up the $100–$150 device tier that's the largest slice of the African market. 3. **E4B on CPU: k=3 is the borderline.** Beyond k=3, CPU totals exceed the 60 s budget on most mid-tier silicon. **E4B on GPU: no latency worry** — totals stay 13–25 s across k=0–15 on Snapdragon 8 Elite + Adreno. -4. **E2B on CPU: k=10 is comfortable on flagship CPU; k=3–5 on mid-tier MediaTek.** Measured E2B CPU at k=10 on Snapdragon 8 Elite is 26 s median; extrapolating ~2× slower for mid-tier MediaTek gives ~50 s at k=5–7 (borderline). This is the deployment-relevant change from the May 2026 sweep: E2B CPU is **~2× faster than E4B CPU**, not the originally-projected speedup — and that means CPU-only deployment is finally viable up to mid-range k on the no-GPU device tier. +4. 
**E2B on CPU: k=10 is comfortable on flagship CPU; on mid-tier MediaTek, k=3 is comfortable and k=5 is borderline.** Measured E2B CPU at k=10 on Snapdragon 8 Elite is 26 s median; extrapolating ~2× slower for mid-tier MediaTek gives ~50 s at k=5–7 (borderline). The original notes projected a uniform ~2× speedup across backends; measurements show **CPU matches that projection (~2× total speedup)** but **GPU total speedup is closer to ~1.5×** because decode is bandwidth-bound and gains less from the parameter-count shrink. Either way, CPU-only deployment is finally viable up to mid-range k on the no-GPU device tier — that's the deployment-relevant change from the May 2026 sweep. The catch — covered in §3 below — is that **GPU only works reliably on Adreno** (Snapdragon). For the bulk of the African deployment fleet (MediaTek + Mali GPUs), **plan as CPU-only** and treat any GPU acceleration as a bonus, not a guarantee. From 84e1bfd52ecf0607ade30ec149a90a75b3e25278 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:03:10 +0800 Subject: [PATCH 11/14] review: revert llm_model production default to E4B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 3042d38 (Phase 1 of the E2B latency sweep) switched the production `llm_model` in config/app_config.json from E4B to E2B so the benchmark would load the smaller model. That change rode into PR #59 even though the PR's own body — and the safety note in device_compatibility_notes.md §6 — explicitly state that this work does **not** authorize a deployment swap: the answer-quality regression check on kenya_vignettes and the AfriMed-QA SAQ judge run is still listed as "Critical before any model swap decision". Per CLAUDE.md, evaluation quality and response safety are the top priorities for this medical app. Shipping a default-model swap without the safety eval would mean the production app loads a model whose accuracy on safety-critical medical-advice metrics has not been validated.
Revert the default to `gemma-4-E4B-it.litertlm`. The benchmark measurements that motivated the PR are unaffected — the .litertlm file used for the sweep is recorded in each benchmark JSON's config.model field, so the data is preserved regardless of the deployed default. To re-run the E2B benchmark on a future branch, the runbook author will need to temporarily flip llm_model to E2B before kicking off benchmark_latency.py (and revert before opening the PR). Addresses Copilot review comment on PR #59. --- config/app_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/app_config.json b/config/app_config.json index acdcb71..cf0aa38 100644 --- a/config/app_config.json +++ b/config/app_config.json @@ -1,5 +1,5 @@ { - "llm_model": "gemma-4-E2B-it.litertlm", + "llm_model": "gemma-4-E4B-it.litertlm", "embedding_model": "Gecko_1024_quant.tflite", "tokenizer": "sentencepiece.model", "embedding_dim": 768, From 59954386d75fe8711443c617c493d124e56f0790 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:38:41 +0800 Subject: [PATCH 12/14] =?UTF-8?q?review:=20aggregate=5Fk=5Fsweep.py=20?= =?UTF-8?q?=E2=80=94=20fail=20with=20directional=20error=20on=20empty=20ma?= =?UTF-8?q?trix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The matrix builder assumed at least one canonical benchmark JSON was loaded and indexed via `next(iter(matrix.values()))`. On a fresh checkout `evaluation/latency_results/` is gitignored and may not exist, so the script would crash with a bare StopIteration that gives a new contributor no useful direction. Detect the empty case and exit with a message pointing at benchmark_latency.py and the runbooks directory. Verified two paths: existing 32-JSON workflow still produces the report; a /tmp checkout with no JSONs now prints the directional error and exits cleanly. Addresses Copilot review comment on PR #59. 
--- evaluation/aggregate_k_sweep.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 31c9c0f..d911392 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -371,6 +371,17 @@ def write_report(runs: list[dict], out_path: Path) -> None: else: matrix[key] = r + if not matrix: + # latency_results/ is gitignored, so a fresh checkout can hit this. Exit + # with a directional error rather than crashing on StopIteration below. + results_dir = Path(__file__).resolve().parent / "latency_results" + raise SystemExit( + f"No canonical benchmark_*.json found under {results_dir}. " + "Run `python evaluation/benchmark_latency.py …` to produce JSONs " + "(see evaluation/runbooks/ for the sweep procedure), then re-run " + "this aggregator." + ) + models = sorted(set(m for (m, _b, _k) in matrix.keys())) all_ks = sorted(set(k for (_m, _b, k) in matrix.keys())) From 76c57e92c8d684245ccaaab68f37ad2ce76f4949 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:38:50 +0800 Subject: [PATCH 13/14] review: reconcile e2b_sweep runbook with the E4B-default revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #59 reverted llm_model in config/app_config.json back to E4B (commit 84e1bfd), but the runbook's Phase 1 verification still expects fresh benchmark JSONs to record `config.model == "gemma-4-E2B-it.litertlm"`. Anyone reading this runbook on a fresh checkout would hit that mismatch without an explanation. Add a preamble note at the top of the runbook covering the four-step re-run procedure (flip config → rebuild/install → sweep → revert config) and a sentence explaining the git-log expectation drift for replays after PR merge. Keep the rest of the runbook intact — the Phase 2/3 sweep instructions still apply verbatim once the config is flipped. Addresses Copilot review comment on PR #59. 
--- evaluation/runbooks/e2b_sweep.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/evaluation/runbooks/e2b_sweep.md b/evaluation/runbooks/e2b_sweep.md index 85b4c62..80fbc5b 100644 --- a/evaluation/runbooks/e2b_sweep.md +++ b/evaluation/runbooks/e2b_sweep.md @@ -2,6 +2,15 @@ Self-contained instructions for finishing the E2B latency sweep started by another session. **Phase 1 (setup) is already complete on branch `feat/e2b-latency-sweep`.** Your job is Phase 2 (GPU sweep), Phase 3 (CPU sweep), and Phase 4 (analysis + local commits, **no push**). Expected wall-clock: **~5 hours**. +> **Reading this after PR #59 merged into main?** PR #59 intentionally **reverted** the production `llm_model` in `config/app_config.json` back to `gemma-4-E4B-it.litertlm` — the latency sweep does not authorize a deployment swap (the kenya_vignettes / AfriMed-QA SAQ safety eval is the gate for that). To re-run this benchmark on a future branch: +> +> 1. Edit `config/app_config.json` and set `"llm_model": "gemma-4-E2B-it.litertlm"`. +> 2. Rebuild and install: `flutter build apk --release && adb install -r app/build/app/outputs/flutter-apk/app-release.apk`. +> 3. Run the sweep (Phase 2 + Phase 3 below). +> 4. **Revert** the `config/app_config.json` change before opening any PR. +> +> With the config flipped, the new benchmark JSONs will record `config.model == "gemma-4-E2B-it.litertlm"` as expected by the Phase 1 verification step in §1. Phase 1's commit-log check (looking for `3042d38 config: switch llm_model to Gemma 4 E2B`) was written when that commit was the tip; on a post-merge replay the same SHA will still be reachable, just deeper in the log — provided PR #59 was merged with a merge commit. A squash or rebase merge rewrites the SHA; in that case, search the log for the commit subject instead. + ## 0. Context — read this first - This work mirrors the E4B latency sweep that landed in PR #57 (commit `1be0a55` on `main`). The E4B results are in `evaluation/reports/latency_report_v2.md` and the device-compatibility analysis is in `evaluation/reports/device_compatibility_notes.md`.
From 3ae8dae186940089c3fb69b23424fd5fadbd6191 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Sat, 16 May 2026 09:39:02 +0800 Subject: [PATCH 14/14] review: protect benchmark result serialization from config-asset failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The model-name lookup in writeResults() opens and parses app_config.json from the APK assets at the moment the final benchmark JSON is being built. That happens at the *end* of a benchmark run that takes 20+ minutes, after all 54 result records have been accumulated in memory but before they hit disk. An IOException on the asset open, or a JSONException on a malformed config, would propagate up and discard the entire results array. The failure probability is low (the asset is bundled inside the APK and the file is generated at build time), but the consequence is severe — we'd lose every measurement from that run because we couldn't read a metadata string. Wrap the read in try/catch and fall back to `"unknown"` for the model field if the asset can't be read or parsed. The model field in the JSON is metadata for downstream analysis (aggregate_k_sweep.py groups by it); an "unknown" tag will just route the run into its own matrix cell rather than colliding with a real model, which is the right failure mode for an unexpected build state. Addresses Copilot review comment on PR #59.
--- .../com/example/app/BenchmarkForegroundService.kt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index d504e38..fbc5ba8 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -384,9 +384,18 @@ class BenchmarkForegroundService : Service() { // Read model name from the same app_config.json asset the RagPipeline uses, // so the JSON metadata reflects whatever model is actually loaded rather than // a hardcoded string that goes stale when we switch model artifacts. - put("model", JSONObject( - application.assets.open("app_config.json").bufferedReader().use { it.readText() } - ).getString("llm_model")) + // Wrapped in try/catch: this read runs at the END of the benchmark when we + // serialize all results — an asset/parse error here would discard 20+ minutes + // of completed runs that are still in-memory. Better to ship an "unknown" tag + // and preserve the timing data than lose the whole sweep. + put("model", try { + JSONObject( + application.assets.open("app_config.json").bufferedReader().use { it.readText() } + ).getString("llm_model") + } catch (e: Exception) { + Log.w("mam-ai-bench", "[BENCHMARK] Failed to read llm_model from app_config.json — recording 'unknown': $e") + "unknown" + }) // Read backend from BuildConfig at compile time. Older builds // hard-coded "CPU" here even when GPU was active — fixed so the // JSON metadata matches reality.