diff --git a/.gitignore b/.gitignore index 85f370d..364d040 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ _scratch/ evaluation/models/ evaluation/results/ -evaluation/latency_results/ +evaluation/latency_results/* +# Keep the canonical FP16 GPU failure example checked in as a reference for the +# maxNumTokens investigation (long_01 at k=20 → asterisk repetition loop past +# total context ~5000). Referenced by evaluation/reports/maxnumtoken_investigation.md. +!evaluation/latency_results/benchmark_20260516T104730_k20.json # IDE .idea/ diff --git a/app/android/app/build.gradle.kts b/app/android/app/build.gradle.kts index b352671..ea5e68c 100644 --- a/app/android/app/build.gradle.kts +++ b/app/android/app/build.gradle.kts @@ -23,6 +23,25 @@ val useMtpForLlm = project.findProperty("useMtpForLlm")?.toString()?.toBoolean() // can surface what's actually linked at build time. Update in lockstep with the dependency. val litertlmVersion = "0.11.0" +// Capture the current git commit SHA at build time so benchmark JSONs (and any other +// runtime-emitted metadata) can record which code state produced the data. Falls back to +// "unknown" outside a git checkout. Uses --short for compactness; reviewers can `git show` +// the prefix to disambiguate. +fun gitShortSha(): String { + return try { + val proc = ProcessBuilder("git", "rev-parse", "--short", "HEAD") + .directory(rootDir.parentFile?.parentFile ?: rootDir) + .redirectErrorStream(true) + .start() + val out = proc.inputStream.bufferedReader().readText().trim() + proc.waitFor() + if (out.isNotEmpty() && proc.exitValue() == 0) out else "unknown" + } catch (e: Exception) { + "unknown" + } +} +val gitSha = gitShortSha() + fun propOrEnv(envName: String, propertyName: String): String? 
= System.getenv(envName)?.takeIf { it.isNotBlank() } ?: (keystoreProperties.getProperty(propertyName)?.takeIf { it.isNotBlank() }) @@ -62,6 +81,7 @@ android { buildConfigField("boolean", "USE_GPU_FOR_LLM", useGpuForLlm.toString()) buildConfigField("boolean", "USE_MTP_FOR_LLM", useMtpForLlm.toString()) buildConfigField("String", "LITERTLM_VERSION", "\"$litertlmVersion\"") + buildConfigField("String", "GIT_SHA", "\"$gitSha\"") } sourceSets { diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index fbc5ba8..289e4f4 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -23,6 +23,7 @@ import kotlinx.coroutines.withContext import org.json.JSONArray import org.json.JSONObject import java.io.File +import java.security.MessageDigest import java.text.SimpleDateFormat import java.util.Date import java.util.Locale @@ -388,23 +389,60 @@ class BenchmarkForegroundService : Service() { // serialize all results — an asset/parse error here would discard 20+ minutes // of completed runs that are still in-memory. Better to ship an "unknown" tag // and preserve the timing data than lose the whole sweep. - put("model", try { + val llmModelName = try { JSONObject( application.assets.open("app_config.json").bufferedReader().use { it.readText() } ).getString("llm_model") } catch (e: Exception) { Log.w("mam-ai-bench", "[BENCHMARK] Failed to read llm_model from app_config.json — recording 'unknown': $e") "unknown" - }) - // Read backend from BuildConfig at compile time. Older builds - // hard-coded "CPU" here even when GPU was active — fixed so the - // JSON metadata matches reality. 
- put("backend", if (BuildConfig.USE_GPU_FOR_LLM) "GPU" else "CPU") + } + put("model", llmModelName) + // Record the ACTUAL backend the engine initialized on (RagPipeline + // falls back GPU→CPU if GPU construction or init throws). Reading + // BuildConfig.USE_GPU_FOR_LLM here would mislabel a silent fallback + // as the originally-requested backend and make GPU-vs-CPU comparisons + // invalid. activeBackend is set before llmReady flips true. + put("backend", pipeline.activeBackend) put("mtp_enabled", BuildConfig.USE_MTP_FOR_LLM) - put("max_tokens", 32000) - put("temperature", 1.0) - put("top_p", 0.95) - put("top_k", 64) + // Read max_num_tokens + sampler params from runtime_config.json — the same + // source RagPipeline reads from. Single source of truth, so the JSON metadata + // can't drift from what the engine actually used. + try { + val rt = JSONObject(application.assets.open("runtime_config.json").bufferedReader().use { it.readText() }) + put("max_num_tokens", rt.getJSONObject("engine").getInt("max_num_tokens")) + val gen = rt.getJSONObject("generation") + put("temperature", gen.getDouble("temperature")) + put("top_p", gen.getDouble("top_p")) + put("top_k", gen.getInt("top_k")) + } catch (e: Exception) { + Log.w("mam-ai-bench", "[BENCHMARK] Failed to read runtime_config.json — recording defaults: $e") + put("max_num_tokens", -1) + put("temperature", -1.0) + put("top_p", -1.0) + put("top_k", -1) + } + // Artifact fingerprint: SHA-256 of the first 64 KB of the loaded .litertlm. + // The header (FlatBuffers metadata) lives in the first few KB, so this hash + // uniquely distinguishes artifact variants — e.g. the default Gemma 4 E4B + // build from the FP32-tagged rebuild that adds `prefer_activation_type=float32` + // to the prefill_decode section. Without this, JSON metadata can't tell which + // .litertlm variant was loaded. 
+ put("artifact_fingerprint", try { + val modelFile = File(application.getExternalFilesDir(null), llmModelName) + val buf = ByteArray(65536) + // InputStream.read(buf) may legally return fewer bytes than requested, and a + // short read would silently change the hash. Keep reading until the 64 KB + // window is full or EOF so the fingerprint is stable for a given file. + var filled = 0 + modelFile.inputStream().use { stream -> + while (filled < buf.size) { + val n = stream.read(buf, filled, buf.size - filled) + if (n < 0) break + filled += n + } + } + val digest = MessageDigest.getInstance("SHA-256") + .digest(buf.copyOf(filled)) + digest.joinToString("") { "%02x".format(it) } + } catch (e: Exception) { + Log.w("mam-ai-bench", "[BENCHMARK] Failed to fingerprint artifact: $e") + "unknown" + }) + // Provenance: the git commit SHA that produced this APK. Wired via + // BuildConfig.GIT_SHA at Gradle build time. + put("git_commit_sha", BuildConfig.GIT_SHA) + put("litertlm_version", BuildConfig.LITERTLM_VERSION) }) put("init", JSONObject().apply { put("gecko_sqlite_ms", syncInitMs) diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt index e13e391..9f486b2 100644 --- a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt +++ b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt @@ -53,6 +53,7 @@ class RagPipeline(application: Application) { JSONObject(application.assets.open("runtime_config.json").bufferedReader().use { it.readText() }) private val appConfig: JSONObject = JSONObject(application.assets.open("app_config.json").bufferedReader().use { it.readText() }) + private val engineRuntimeConfig = runtimeConfig.getJSONObject("engine") private val generationConfig = runtimeConfig.getJSONObject("generation") private val retrievalConfig = runtimeConfig.getJSONObject("retrieval") private val contextInjectionConfig = runtimeConfig.getJSONObject("context_injection") @@ -87,6 +88,17 @@ class RagPipeline(application: Application) { @Volatile var llmReady = false + // The actual backend the engine initialized on. May differ from the requested + // BuildConfig.USE_GPU_FOR_LLM flag because the init below falls back to CPU + // if GPU construction or GPU engine init throws. 
Set to the final value before + // llmReady flips true, so any reader awaiting awaitLlmReady() can trust it. + // Used by BenchmarkForegroundService to record the ACTUAL backend in the + // benchmark JSON's config block rather than the requested flag (which would + // mislabel a fallback as the originally-requested backend). + @Volatile + var activeBackend: String = "unknown" + private set + // Shared init result so any number of callers can await the same success/failure. private val llmInit = CompletableDeferred<Result<Unit>>() @@ -142,6 +154,7 @@ class RagPipeline(application: Application) { // memorizeChunks(application.applicationContext, "mamai_trim.txt") // Log.i("mam-ai", "Chunks loaded!") + this.activeBackend = activeBackend llmReady = true llmInit.complete(Result.success(Unit)) } catch (t: Throwable) { @@ -332,7 +345,24 @@ class RagPipeline(application: Application) { } private fun buildEngine(modelPath: String, backend: Backend, cacheDir: String) { - val e = Engine(EngineConfig(modelPath = modelPath, backend = backend, cacheDir = cacheDir)) + // maxNumTokens is the total context budget (prompt + generated response, equal + // to the KV-cache size). Sourced from runtime_config.json `engine.max_num_tokens` + // — single source of truth, also read by BenchmarkForegroundService for accurate + // metadata recording. + // + // Why the current value (4096): empirically, on Android GPU the FP16 attention + // kernels produce off-distribution K/V values past the artifact's calibrated + // zone, causing the response to deterministically collapse into a `*` repetition + // loop at total context ~5000 (see evaluation/reports/maxnumtoken_investigation.md). + // 4096 stays ~900 tokens below the cliff. To push higher on GPU, also force + // FP32 via the artifact's `prefer_activation_type=float32` metadata key. 
+ val maxNumTokens = engineRuntimeConfig.getInt("max_num_tokens") + val e = Engine(EngineConfig( + modelPath = modelPath, + backend = backend, + maxNumTokens = maxNumTokens, + cacheDir = cacheDir, + )) e.initialize() engine = e } diff --git a/config/runtime_config.json b/config/runtime_config.json index 86b40f0..5e6241e 100644 --- a/config/runtime_config.json +++ b/config/runtime_config.json @@ -1,4 +1,7 @@ { + "engine": { + "max_num_tokens": 4096 + }, "generation": { "temperature": 1.0, "top_p": 0.95, diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index d911392..d436394 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -286,36 +286,13 @@ def _write_per_model_section( md.append(f"| {k_label} | {fmt_s(gv)} | {fmt_s(cv)} |") md.append("") - md.append("### Errors (count / 54 runs)\n") - md.append("| k | GPU errors | CPU errors |") - md.append("|---:|---:|---:|") - for k in all_ks: - gpu_run = matrix.get((model, "GPU", k)) - cpu_run = matrix.get((model, "CPU", k)) - if not gpu_run and not cpu_run: - continue - ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None - ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None - k_label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {k_label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") - md.append("") + # Errors per (model × backend × k) are uniform: 0 everywhere except at k=20, where 24 of 54 runs error. + # Don't waste a table per model on the same finding — the FP16-vs-FP32 GPU + # section discusses errors in prose, and the data inventory below records + # per-run error counts. Per-model error tables removed 2026-05-17. 
- md.append("### Wall-clock\n") - md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |") - md.append("|---:|---:|---:|---:|") - for k in all_ks: - gpu_run = matrix.get((model, "GPU", k)) - cpu_run = matrix.get((model, "CPU", k)) - if not gpu_run and not cpu_run: - continue - gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None - cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None - gw_s = f"{gw:.1f}" if gw is not None else "—" - cw_s = f"{cw:.1f}" if cw is not None else "—" - ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else "" - k_label = "**0 (no-RAG)**" if k == 0 else str(k) - md.append(f"| {k_label} | {gw_s} | {cw_s} | {ratio} |") - md.append("") + # Wall-clock (benchmark runtime, not user-facing UX latency) is an + # operational metric — moved to the Appendix at the bottom of the report. def _write_cross_model_table( @@ -382,7 +359,14 @@ def write_report(runs: list[dict], out_path: Path) -> None: "this aggregator." ) - models = sorted(set(m for (m, _b, _k) in matrix.keys())) + # Order: production-deployed model first (currently E4B), then others + # alphabetically. Keeps per-model sections + the cross-model comparison + # baseline consistent. 
+ def _model_priority(m: str) -> tuple[int, str]: + if m == "gemma-4-E4B-it.litertlm": + return (0, m) + return (1, m) + models = sorted(set(m for (m, _b, _k) in matrix.keys()), key=_model_priority) all_ks = sorted(set(k for (_m, _b, k) in matrix.keys())) sample = next(iter(matrix.values())) @@ -393,11 +377,23 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") md.append("") md.append("## Device & stack\n") - md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") + soc = dev.get('soc', '?') + soc_display = f"{soc} / Snapdragon 8 Elite" if soc == "SM8750P" else soc + md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({soc_display}) — Android {dev.get('android_version', '?')}, 16 GB RAM") md.append(f"- **Models tested**: " + ", ".join(f"{_short_model_label(m)} (`{m}`)" for m in models)) md.append("- **LiteRT-LM**: 0.11.0") - md.append("- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") - md.append("- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") + md.append("- **Backends tested**: GPU (OpenCL on Adreno) and CPU (XNNPACK)") + md.append("- **Activation precision**: GPU defaults to **FP16**, CPU defaults to **FP32** — this asymmetry matters at lifted context (see [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md) §Step 4). All measurement tables use the defaults; one explicit FP32-on-GPU sweep is summarised in the FP16-vs-FP32 GPU section below.") + md.append("- **Sampling**: temp=1.0, top_p=0.95, top_k=64 — read from `runtime_config.json`. 
No explicit `max_output_tokens` cap is enforced; the runtime decodes until a stop token or until total context hits `maxNumTokens=4096`.") + md.append("- **Total context budget** (`maxNumTokens` passed to `EngineConfig`): **4096** for every measurement table in this report. The FP16/FP32 section's prose discusses lifted values (5000, 8192) used purely to characterize the cliff — those measurements are not in any table.") + md.append("") + md.append("## TL;DR — today's deployment") + md.append("") + md.append("> **Ship configuration: FP16 GPU running Gemma 4 E4B at `maxNumTokens=4096` on Snapdragon 8 Elite.** Median total query latency **13–25 s across k=0–15 for E4B** (7.9–18 s for the smaller E2B); cleanly below the FP16 quality cliff at total context ~5000.") + md.append(">") + md.append("> k=20 is **partial**: the 8 longest of 18 query types produce prompts >4096 tokens and get runtime-rejected (24/54 errors in every sweep); the other 10 query types complete normally.") + md.append(">") + md.append("> Fallbacks: **FP32 GPU at max=4096** (~21–34% slower at k=10–15, no precision cliff) for extra correctness margin on the same hardware; **FP32 GPU at max=5000–6000** for higher context (verified on this 16 GB device; max=8192 OOMs because FP32 doubles the KV cache, so the practical ceiling is around 6500–7500); **CPU FP32** (~2–4× slower than FP16 GPU) for devices without working OpenCL.") md.append("") sample_cfg = sample["data"].get("config", {}) sample_repeats = sample_cfg.get("repeats", "?") @@ -422,6 +418,9 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") md.append("") + md.append("### Provenance fields in benchmark JSONs (post-`52e11e9`)\n") + md.append("Each benchmark JSON's `config` block records `max_num_tokens`, `artifact_fingerprint` (SHA-256 of first 64 KB 
of the loaded `.litertlm`), and `git_commit_sha`. Together these let any reviewer cryptographically verify which artifact variant + code state produced the JSON, without trusting the filename. Earlier sweep JSONs (PR #57/#59) lack these fields but their content is unaffected.") + md.append("") # ─────────── Per-model sections ─────────── for m in models: @@ -435,14 +434,14 @@ def write_report(runs: list[dict], out_path: Path) -> None: others = [m for m in models if m != baseline] others_label = ", ".join(_short_model_label(m) for m in others) md.append("## Cross-model comparison\n") + comparator_phrase = f"the comparator ({others_label})" if len(others) == 1 else f"each comparator ({others_label})" md.append( f"Each table below compares **{_short_model_label(baseline)}** " - f"(baseline) against each comparator model ({others_label}). " - "Ratios are reported as **baseline ÷ comparator** at the same " - "backend × k cell, so values **> 1.0× mean the comparator is faster**. " - "Reading the columns: GPU prefill (TTFT) is compute-bound and tracks " - "parameter count closely; GPU decode is bandwidth-bound and gains less " - "from model shrinkage; CPU is compute-bound throughout." + f"(baseline) against {comparator_phrase}. Ratios are reported as " + "**baseline ÷ comparator** at the same backend × k cell, so values " + "**> 1.0× mean the comparator is faster**. The architectural " + "story behind these ratios (prefill compute-bound vs decode " + "bandwidth-bound) is in Key findings #1–#2 below." 
) md.append("") for other in others: @@ -458,17 +457,45 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("") _write_cross_model_table(md, matrix, baseline, other, all_ks, "decode_ms", fmt_ms) - md.append("## Errors and the 4096-token context wall\n") - md.append("At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every " - "(model × backend) combination tested: ") - md.append("`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. ") - md.append("Each failure reports `Input token ids are too long. Exceeding the maximum " - "number of tokens allowed: …>= 4096`. ") - md.append("Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; " - "the wall is a property of the `.litertlm` artifact format, not the " - "parameter count or backend. **k_max ≈ 17–18** for both models.") + md.append("## FP16 vs FP32 GPU (and why the context cap is 4096)") + md.append("") + md.append("All cross-model tables above use the **default** GPU activation precision, which on Android is **FP16**. That choice is not a knob in our code — LiteRT-LM picks FP16 for the GPU text-decoder path and FP32 for CPU (XNNPACK). The 4096 `maxNumTokens` value we ship was chosen because of how the two precisions behave at lifted context; the full investigation is in [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md). Headlines:") + md.append("") + md.append("- **The 4096 cap is a runtime config check, not an architectural constant.** It's `maxNumTokens` in `EngineConfig`, sourced from `runtime_config.json`. When the prompt alone exceeds it, LiteRT-LM rejects the request before any decoding starts (verified in `liblitertlm_jni.so`). At k=20, the same 8 of 18 query types in every sweep produce prompts above 4096 and get rejected — that's the 24/54 errors visible in every k=20 cell across all (model × backend) combinations.") + md.append("- ⚠️ **The FP16 default has a quality cliff** at total context ~5000 tokens. 
If you lift the cap to admit larger prompts, GPU output silently collapses into a `*` repetition loop, deterministically. Concrete example: [`benchmark_20260516T104730_k20.json`](../latency_results/benchmark_20260516T104730_k20.json) (long_01, k=20, FP16 GPU, maxNumTokens=8192).") + md.append("- **CPU (FP32) stays clean** for the same lifted-cap prompt — the asymmetry isolates precision as the cause, not the artifact or backend choice.") + md.append("- **Confirming the fix**: forcing GPU to FP32 (via injecting `prefer_activation_type=float32` into the `.litertlm` metadata) eliminates the cliff. Direct A/B on the exact `long_01` k=20 case wasn't possible — FP32 KV cache at maxNumTokens=8192 OOMs the test device — but the closest-comparable test (`long_01` k=15, max=5000, response ending at total context ~4514) produced clean output through the same FP16-cliff zone.") + md.append("- **Our 4096 ship value gives ~900 tokens of safety margin** below the FP16 cliff. Anyone lifting the cap on FP16 GPU enters the silent-failure zone; switch to FP32 GPU first.") + md.append("") + md.append("### Latency cost of FP32 on GPU (E4B at maxNumTokens=4096, 2026-05-17)") + md.append("") + md.append("Apples-to-apples sweep with `artifact_fingerprint`-verified provenance. 
Full 8×2 table is in the investigation doc §Step 6; the medians at a representative subset:") + md.append("") + md.append("| k | FP16 GPU total | FP32 GPU total | T ratio | FP16 TTFT | FP32 TTFT | TTFT ratio | FP16 decode | FP32 decode |") + md.append("|---:|---:|---:|---:|---:|---:|---:|---:|---:|") + md.append("| **0 (no-RAG)** | 14.5 s | 16.5 s | 1.14× | 0.97 s | 2.03 s | 2.10× | 13.5 s | 14.4 s |") + md.append("| 1 | 14.1 s | 18.0 s | 1.28× | 0.95 s | 2.06 s | 2.16× | 11.4 s | 12.8 s |") + md.append("| 5 | 19.6 s | 24.3 s | 1.24× | 1.88 s | 4.28 s | 2.28× | 16.0 s | 16.3 s |") + md.append("| 10 | 22.6 s | 27.4 s | 1.21× | 2.53 s | 5.85 s | 2.32× | 18.2 s | 18.6 s |") + md.append("| 15 | 23.1 s | 30.9 s | 1.34× | 3.45 s | 8.37 s | 2.43× | 16.9 s | 18.4 s |") + md.append("") + md.append("Two clean stories:") + md.append("") + md.append("- **Prefill (TTFT) is ~2.1–2.5× slower on FP32** — prefill is compute-bound, and FP16 doubles arithmetic throughput on Adreno. The ratio is stable across k.") + md.append("- **Decode is essentially identical** (within ~9% on every cell) — decode is bandwidth-bound, so precision barely matters in steady-state generation.") + md.append("- **Total query is 6–34% slower on FP32**, depending on how much of total is prefill vs decode at the given k. At our typical k=10–15 cells, ~21–34% slower (~5–8 s extra wait per query).") + md.append("") + md.append("### When to ship FP32 GPU instead of FP16 GPU") + md.append("") + md.append("| Use case | Choice | Why |") + md.append("|---|---|---|") + md.append("| **Today's deployment** | FP16 GPU, max=4096 | Clean output below the cliff; fastest UX |") + md.append("| Extra correctness margin without changing context | FP32 GPU, max=4096 | ~25% slower at k=15 but eliminates the FP16 cliff as a risk class entirely |") + md.append("| Higher context (e.g., k>15 desired in future) | FP32 GPU, max=5000–6000 | No cliff. 
Memory: KV cache doubles → ~6500–7500 ceiling on 16 GB devices |") + md.append("| GPU unavailable (MediaTek / older Snapdragon) | CPU FP32 | Always clean, but ~2–4× slower than FP16 GPU |") + md.append("") + md.append("---") md.append("") - md.append("## Key findings\n") md.append("### 1. Prefill (TTFT) scales ~2× with parameter count on both backends") md.append("Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speedup on GPU** " @@ -485,8 +512,11 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("### 3. Total speedup is decode-dominated, hence smaller than TTFT") md.append("**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since " "decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks " - "decode rather than prefill. At high k where prefill grows large, total speedup climbs " - "toward the prefill ratio (~1.7–1.9× GPU at k=15+).") + "decode rather than prefill. The GPU total ratio peaks at k=15 (~1.86×) where prefill is " + "a larger fraction of the budget, then drops back at k=20 (~1.28×) — but the k=20 cell " + "is a **survivor-bias artifact**: 24 of 54 queries error on the prompt-cap check (the " + "8 longest queries × 3 reps), so the k=20 median is computed over the 30 *shorter* " + "queries that happen to fit. The trend is not a real reversal.") md.append("") md.append("### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier") md.append("E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend " @@ -495,11 +525,13 @@ def write_report(runs: list[dict], out_path: Path) -> None: "(mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: " "ship E2B on CPU, restrict k to small values.") md.append("") - md.append("### 5. 4096-token context wall is the binding ceiling at high k") - md.append("k=15 works cleanly on all four (model × backend) combinations. 
k=20 fails identically " - "across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the " - "model artifact, not the runtime, and is **shared between E4B and E2B**. " - "**Latency is not the constraint at the upper end of k — context window is.**") + md.append("### 5. The 4096-token cap is a precision-driven safety margin, not a hard runtime limit") + md.append("k=15 works cleanly on every (model × backend) cell. At k=20, the 8 longest of 18 query " + "types exceed the cap and get rejected by the runtime (24/54 errors per cell, identical " + "across all backends). The cap itself is *liftable* — but on the default **FP16** GPU " + "path, lifted output silently collapses past total context ~5000. FP32 GPU removes the " + "cliff at ~25% latency cost. See the FP16-vs-FP32 GPU section above for details. " + "**The constraint at high k is precision, not latency or memory.**") md.append("") md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3") md.append("On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so " @@ -511,7 +543,7 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("## Data inventory (per `(model, backend, k)`)\n") md.append("| Model | Backend | k | File | Wall (min) | Runs | Errors |") md.append("|---|---|---:|---|---:|---:|---:|") - for (m, b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1], x[2])): + for (m, b, k) in sorted(matrix.keys(), key=lambda x: (_model_priority(x[0]), x[1], x[2])): r = matrix[(m, b, k)] wall = r["data"]["total_benchmark_time_ms"] / 60000 n = len(r["data"]["results"]) @@ -519,6 +551,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: k_label = "0 (no-RAG)" if k == 0 else str(k) md.append(f"| {_short_model_label(m)} | {b} | {k_label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") md.append("") + md.append("> **Note:** the table above lists the canonical FP16-default runs (which is what every table in this report 
tabulates). The `Wall (min)` column is benchmark runtime (operational), not user-facing latency. The aggregator dedupes by `(model, backend, k)`, so the **8 FP32 GPU sweep JSONs (2026-05-16)** and the **16 today-instrumented runs (2026-05-17, FP32 + FP16 with `artifact_fingerprint` provenance)** referenced by the FP16-vs-FP32 GPU section are not listed here. Their full filenames + fingerprints are in [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md) §References.") + md.append("") md.append("---") md.append("") md.append("_Source benchmark JSONs live in `evaluation/latency_results/`. ") diff --git a/evaluation/latency_results/benchmark_20260516T104730_k20.json b/evaluation/latency_results/benchmark_20260516T104730_k20.json new file mode 100644 index 0000000..56544d7 --- /dev/null +++ b/evaluation/latency_results/benchmark_20260516T104730_k20.json @@ -0,0 +1,190 @@ +{ + "benchmark_version": 1, + "timestamp": "20260516T104838", + "device": { + "manufacturer": "OnePlus", + "model": "OPD2413", + "device": "OP615EL1", + "hardware": "qcom", + "board": "sun", + "soc": "SM8750P", + "android_version": "15", + "sdk_int": 35, + "abi": "arm64-v8a" + }, + "config": { + "repeats": 1, + "cooldown_ms": 1000, + "skip_retrieval": false, + "rag_only": true, + "query_filter": "long_01", + "retrieval_top_k_override": 20, + "model": "gemma-4-E4B-it.litertlm", + "backend": "GPU", + "mtp_enabled": false, + "max_tokens": 32000, + "temperature": 1, + "top_p": 0.95, + "top_k": 64 + }, + "init": { + "gecko_sqlite_ms": 63, + "llm_load_ms": 10008, + "warmup_query_ms": 72496, + "total_init_ms": 82570 + }, + "memory": { + "used_mb": 5, + "free_mb": 0, + "total_mb": 5, + "max_mb": 384 + }, + "results": [ + { + "query_id": "long_01", + "category": "long", + "query_text": "I am a midwife at a rural clinic in Zanzibar. A 28 year old woman, gravida 4 para 3, is at 38 weeks gestation. She came to the clinic complaining of severe headache, swelling in her hands and face, and epigastric pain. 
Her blood pressure is 170 over 115. She has protein in her urine. The nearest hospital is 2 hours away. What should I do while waiting for transport?", + "query_word_count": 68, + "use_retrieval": true, + "repetition": 1, + "retrieval_time_ms": 2085, + "ttft_ms": 6408, + "prefill_ms": 6408, + "decode_ms": 263190, + "total_generation_ms": 269598, + "total_query_ms": 271683, + "response_length_chars": 6027, + "estimated_tokens": 1506, + "decode_throughput_tps": 5.72, + "num_retrieved_docs": 20, + "retrieved_chunks": [ + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n### **PROTEINURIA**\n\nThe presence of proteinuria changes the diagnosis from gestational hypertension to pre-eclampsia. Because vaginal secretions or amniotic fluid may contaminate a urine specimen, only clean-catch midstream specimens should be used. Catheterization for this purpose is not justified due to the risk of urinary tract infection.\n\nDiagnostic criteria for proteinuria include: two urine dipstick measurements of at least 2+ (30 mg per dL) taken six hours apart; at least 300 mg of protein in a 24-hour urine sample; or a urinary protein\/creatinine ratio of 0.3 or greater.\n\nIt is important to rule out pre-eclampsia before assigning another etiology for the presence of proteinuria in a pregnant woman with elevated blood pressure. However, other conditions can cause proteinuria and false positive results are possible. Urinary tract infection, severe anaemia, heart failure and difficult labour may all cause proteinuria. Blood in the urine due to catheter trauma or schistosomiasis and contamination from vaginal blood can give false positive results.\n\nRandom urine sampling, such as the dipstick test for protein, is a useful screening tool. A change from negative to positive during pregnancy is a warning sign. If dipsticks are not available, a sample of urine can be heated to boiling in a clean test tube. 
Add a drop of 2% acetic acid to check for persistent precipitates that can be quantified as a percentage of protein to the volume of the total sample.", + "source": "WHO_Complications_2017", + "page": 195, + "chars": 1530 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\n-------------------------------------------------------|------------------------------------------|\n| • SBP 160 mmHg or higher and\/or DBP 110 mmHg or higher after 20 weeks of gestation • Proteinuria 2+ on dipstick | • Headache (increasing frequency, unrelieved by regular analgesics) • Vision changes (e.g. blurred vision) • Oliguria (passing less than 400 mL urine in 24 hours) • Upper abdominal pain (epigastric pain or pain in right upper quadrant) • Difficulty breathing (rales on auscultation of lungs due to fluid in lungs) • Nausea and vomiting • Hyperreflexia or clonus In facilities with laboratory capacity: • Liver enzymes (transaminases) more than twice the normal range • Serum creatinine higher than 1.1 mg\/dL or a doubling, or higher, of the baseline serum creatinine concentration in the absence of other renal disease • Platelets less than 100,000 cells\/mcL (100 × 109\/L) | Severe pre eclampsia,a,b page S-57 |", + "source": "WHO_Complications_2017", + "page": 197, + "chars": 1110 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > TABLE S-14. Oral antihypertensive medications for non-severe hypertension\n\n#### **Gestation between 24 and 34 Weeks**\n\nIn women with severe pre-eclampsia and a viable fetus before 34 weeks of gestation, expectant management is recommended, provided that uncontrolled maternal hypertension, maternal danger signs (e.g. severe headache, visual changes and abdominal pain) and fetal distress are absent and can be monitored. 
When laboratory services are available, it is advisable to monitor the maternal laboratory values outlined in **Table S-12 (page S-52)** (creatinine, liver transaminases and platelets).\n\nIf it is not possible to monitor maternal and fetal well-being, transfer to a tertiary care hospital is recommended. If referral to a tertiary hospital is not possible, manage severe pre-eclampsia as eclampsia.\n\n- Give antenatal corticosteroids to accelerate fetal lung maturation. Antenatal corticosteroid therapy is recommended for women with pregnancies at a gestational age of 24–34 weeks for whom preterm birth is considered imminent (due to severe pre-eclampsia or eclampsia), if the following conditions are met:\n - Gestational age assessment can be accurately undertaken.\n - There is no clinical evidence of maternal infection.\n - Adequate childbirth care is available (including the capacity to recognize and safely manage preterm labour and birth), and the preterm newborn can receive adequate care if needed (including\n\nresuscitation, thermal care, feeding support, infection treatment and safe oxygen use).", + "source": "WHO_Complications_2017", + "page": 207, + "chars": 1580 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > TABLE S-14. 
Oral antihypertensive medications for non-severe hypertension\n\n#### **Gestation Less than 24 Weeks (Pre-Viable Fetus)**\n\nInduction of labour is recommended for women with severe pre-eclampsia if the fetus is not viable or is unlikely to achieve viability within one or two weeks.\n\n- Assess the cervix (**page P-19**) and induce labour as per medical management of inevitable abortion if the gestational age is less than 24 weeks (**Table S-4, page S-13); or offer dilatation and evacuation (S-18) for expedited birth.**\n- Hysterotomy (incision of the uterus through the abdominal wall at less than 24 weeks of gestation) should be avoided.\n\n**Note:** Before performing a **hysterotomy**, ensure that:\n\n- coagulopathy has been ruled out;\n- safe general or regional anaesthesia is available. Spinal anaesthesia is associated with a risk of hypotension. This risk can be reduced if adequate IV fluids (500–1000 mL) are infused prior to administration of the spinal anaesthesia (**page P-11**).\n\nDo not use local anaesthesia or ketamine in women with pre-eclampsia or eclampsia.", + "source": "WHO_Complications_2017", + "page": 206, + "chars": 1139 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > GESTATION AT OR MORE THAN 37 + 0\/7 WEEKS\n\n#### **SEVERE PRE-ECLAMPSIA AND ECLAMPSIA**\n\nSevere pre-eclampsia and eclampsia are managed similarly, except that birth must occur within 12 hours of onset of convulsions in eclampsia.\n\n**Note:** All cases of severe pre-eclampsia should be managed actively. Symptoms and signs of \"impending eclampsia\" (e.g. blurred vision, hyperreflexia) are unreliable. Once symptoms consistent with severe pre-eclampsia begin, expectant management is not recommended.", + "source": "WHO_Complications_2017", + "page": 201, + "chars": 549 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. 
Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\n| Presenting Symptom and Other Symptoms and Signs Typically Present | Symptoms, Signs and Laboratory Findings Sometimes Present | Probable Diagnosis |\n|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------|-----------------------------------------------------------------------------|\n| • SBP 140 mmHg or higher and\/or DBP 90 mmHg or higher before 20 weeks of gestation • After 20 weeks: Proteinuria 2+ on – dipstick – Presence of any pre eclampsia features | | Chronic hypertension with superimposed pre-eclampsia, page S-66 |", + "source": "WHO_Complications_2017", + "page": 196, + "chars": 974 + }, + { + "text": "> WHO LABOUR CARE GUIDE Name Mary Jane Williams Parity 2 Labour onset spontaneous Active labour diagnosis [Date 06\/07\/20 ] Ruptured membranes [Date 06\/07\/20 Time 5:00 ] Risk factors History of stillbirth; anaemia\n\n**Table 5. Guidance for completing Section 4 of the LCG**\n\n--------------------------------------------------------------------------------------------------|\n| Urine | Check protein and acetone in urine with a reagent strip. | Record readings of protein (P) and acetone (A) as Negative, Trace, +, ++, +++, ++++. | Alert: P++, A++ A 2+ protein (P++) could guide further management, although confirmation may be done with a second dipstick of 2+ at the next urine void. Proteinuria could be a sign of pre-eclampsia, urinary tract infection, severe anaemia, or previously undiagnosed renal or cardiac disease. 
Ketonuria could be a sign of dehydration secondary to reduced fluid intake or excessive losses (vomiting or diarrhea), prolonged labour or previously undiagnosed diabetes (13). | If P++, A++ or more, interpret measurements in the context of a full clinical examination. Alert a senior provider and follow local guidelines. If P = Negative, Trace or +, assess every 4 hours or each time the woman voids during labour. |", + "source": "WHO_LabourCare_2020", + "page": 23, + "chars": 1240 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\n| Presenting Symptom and Other Symptoms and Signs Typically Present | Symptoms, Signs and Laboratory Findings Sometimes Present | Probable Diagnosis |\n|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------|-----------------------------------------------------------------------------|\n| • Two readings of SBP 140 mmHg or higher but lower than 160 mmHg and\/or DBP 90 mmHg or higher but lower than 110 mmHg four hours apart after 20 weeks of gestation • Proteinuria 2+ on dipstick | | Mild pre eclampsia, page S-56 |", + "source": "WHO_Complications_2017", + "page": 196, + "chars": 961 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. 
Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\n| Presenting Symptom and Other Symptoms and Signs Typically Present | Symptoms, Signs and Laboratory Findings Sometimes Present | Probable Diagnosis |\n|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------|-----------------------------------------------------------------------------|\n| • Two readings of SBP 140 mmHg or higher but lower than 160 mmHg and\/or DBP 90 mmHg or higher but lower than 110 mmHg four hours apart after 20 weeks of gestation • No proteinuria • No features of pre eclampsia | | Gestational hypertension, page S-55 |", + "source": "WHO_Complications_2017", + "page": 196, + "chars": 986 + }, + { + "text": "# **12.1 SUBENABLING OUTCOMES, RELATED TASKS, ASSESSMENT CRITERIA, ASSESSMENT METHODS AND ASSESSMENT INSTRUMENT**\n\n| Sub Enabling | Related Tasks | Assessment Criteria | Assessment | Assessment |\n|---------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------|----------------------------------|-----------------------------------------|\n| | woman with mild pre eclampsia during antenatal j) Give care to a pregnant woman with severe pre eclampsia during antenatal | disorders of pregnancy is correctly provided | | |", + "source": "Curr_NTA Level 6_27.07 Tanzania", + "page": 45, + "chars": 742 + }, + { + "text": "# IF ELEVATED DIASTOLIC BLOOD PRESSURE\n\n| SIGNS | CLASSIFY | TREAT AND ADVISE 
|\n|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n| Diastolic blood pressure ≥110 mmHg and 3+ proteinuria, or Diastolic blood pressure ≥90 mmHg on two readings and 2+ proteinuria, and any of: ⇒ severe headache ⇒ blurred vision ⇒ epigastric pain. | SEVERE PRE-ECLAMPSIA | ■ Give magnesium sulphate ■ If in early labour or postpartum, refer urgently to hospital ■ If late labour: → continue magnesium sulphate treatment ■ B13 → monitor blood pressure every hour. → DO NOT give ergometrine after delivery. ■ Refer urgently to hospital after delivery |", + "source": "WHO_IntegratedPregBirth_2015", + "page": 86, + "chars": 1259 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\na If a woman has any one of the symptoms or signs listed for severe pre-eclampsia (with the exception of proteinuria 2+ on the dipstick), diagnose severe pre-eclampsia.\n\n**A small proportion of women with eclampsia have normal blood pressure. 
Treat all women with convulsions as if they have eclampsia until another diagnosis is confirmed.**", + "source": "WHO_Complications_2017", + "page": 198, + "chars": 524 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > SPECIFIC MANAGEMENT OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n#### *GESTATION LESS THAN 37 + 0\/7 WEEKS*\n\n- Advise her to watch for symptoms and signs of severe pre-eclampsia (**Table S-12, page S-52**).\n- See her twice weekly to monitor blood pressure and fetal well-being and to assess for symptoms and signs of severe pre-eclampsia.\n\n**If systolic blood pressure is 160 mmHg or higher and\/or diastolic blood pressure is 110 mmHg or higher, or if signs of severe preeclampsia appear, even if her blood pressure is normal**, admit the woman and follow recommendations for management of severe preeclampsia and eclampsia (**page S-57**).", + "source": "WHO_Complications_2017", + "page": 201, + "chars": 687 + }, + { + "text": "> Recommendations > 1.2.11 At every antenatal contact, update the woman's antenatal records to include details of history, test results, examination findings, medicines and discussions.\n\n#### **Pre-eclampsia and hypertension in pregnancy**\n\n- 1.2.27 Urgently refer women with severe hypertension (blood pressure of 160\/ 110 mmHg or higher) to secondary care to be seen on the same day. 
The urgency of the referral should be determined by an overall clinical assessment.\n- 1.2.28 Offer a urine dipstick test for proteinuria at every routine face-to-face antenatal appointment.\n\nFor a short explanation of why the committee made the recommendations and how they might affect practice, see the rationale and impact section on pre-eclampsia and hypertension in pregnancy.", + "source": "NICE_Antenatal_2021", + "page": 17, + "chars": 767 + }, + { + "text": "# **ASSESSMENT OF MATERNAL CONDITION**\n\n- Monitor the woman's condition:\n - During the **latent phase** of the first stage of labour: Check maternal mood and behaviour (distressed, anxious) at least once every hour; check blood pressure, pulse and temperature at least once every four hours.\n - During the **active phase** of first stage of labour: Check maternal mood and behaviour (distressed, anxious) at least once every 30 minutes; check blood pressure at least once every four hours, temperature at least once every two hours and pulse once every 30 minutes.\n - During **second stage**: Check maternal mood and behaviour (distressed, anxious) at least once every five minutes.\n- Evaluate the woman for emergency signs and for signs of distress, and respond appropriately:\n - If the **woman's pulse is increasing**, she may be dehydrated or in pain or she may be developing a fever.\n\nNormal labour and childbirth **C-89**\n\n- Ensure adequate hydration via oral or IV routes.\n- Provide adequate analgesia (**page C-55).**\n- If the **woman's temperature is higher than 38°C**, manage the cause of the fever **(page S-113**) and monitor temperature at least every two hours.\n- If the **woman's blood pressure decreases**, suspect occult or frank haemorrhage.\n- If **acetone is present in the woman's urine**, suspect poor nutrition or dehydration and encourage her to eat or drink; otherwise, give dextrose IV.", + "source": "WHO_Complications_2017", + "page": 112, + "chars": 1411 + }, + { + "text": "> 
DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > SPECIFIC MANAGEMENT OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n#### **GESTATIONAL HYPERTENSION**\n\nManage on an outpatient basis:\n\n- Monitor blood pressure, urine (for proteinuria) and fetal condition weekly.\n- If **blood pressure worsens or the woman develops features of pre-eclampsia,** manage as pre-eclampsia (**page S-56**).\n- If there are **signs of severe fetal growth restriction or fetal compromise**, admit the woman to the hospital for assessment and possible expedited birth.\n- Counsel the woman and her family about danger signs indicating severe pre-eclampsia or eclampsia.\n- If all **observations remain stable**, allow to proceed with spontaneous labour and childbirth (**page C-77**).\n\n• In women with gestational hypertension, if **spontaneous labour has not occurred before term**, induce labour at term.", + "source": "WHO_Complications_2017", + "page": 199, + "chars": 873 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n### **Gestation after 37 + 0\/7 Completed Weeks**\n\nFor women with pre-eclampsia at term (37 + 0\/7 weeks), regardless of pre-eclampsia severity, giving birth is recommended.\n\n- Assess the cervix (**page P-19**) and induce labour (**page P-17**).\n- If **vaginal birth is not anticipated** within 12 hours (eclampsia) or 24 hours (severe pre-eclampsia), perform a caesarean (**page P-53**).\n- If there are **fetal heart rate abnormalities** (less than 100 or more than 180 beats per minute), perform a caesarean (**page P-53**).\n- If **safe anaesthesia is not available for caesarean** or if the **fetus is dead**:\n\n- Aim for vaginal birth.\n- If the **cervix is unfavourable** (firm, thick, closed), ripen the cervix (**page P-21**).\n\n**Note:** Before performing a **caesarean**, ensure that:\n\n- coagulopathy has been ruled out;\n- safe general or regional anaesthesia is available. Spinal anaesthesia is associated with a risk of hypotension. 
This risk can be reduced if adequate IV fluids (500–1000 mL) are infused prior to administration of the spinal anaesthesia (**page P-11**).\n\nDo not use local anaesthesia or ketamine in women with pre-eclampsia or eclampsia.", + "source": "WHO_Complications_2017", + "page": 209, + "chars": 1214 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY\n\n**TABLE S-12. Differential diagnosis of elevated blood pressure, headache, blurred vision, convulsions or loss of consciousness**\n\n| Presenting Symptom and Other Symptoms and Signs Typically Present | Symptoms, Signs and Laboratory Findings Sometimes Present | Probable Diagnosis |\n|----------------------------------------------------------------------------------|--------------------------------------------------------------------|-------------------------------------|\n| • Fever • Chills\/rigors • Headache • Muscle\/joint pain | • Enlarged spleen | Malaria, page S-117 |\n| • Symptoms and signs of uncomplicated malaria • Coma • Anaemia | • Convulsions • Jaundice | Severe malaria, page S-121 |", + "source": "WHO_Complications_2017", + "page": 198, + "chars": 749 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > TABLE S-14. Oral antihypertensive medications for non-severe hypertension\n\n#### *OPTIMAL TIMING FOR BIRTH*\n\nBirth should be considered as soon as the woman's condition has stabilized. The decision about the optimal timing of childbirth should be made on an individual basis, taking into account, among other factors, gestational age, maternal and fetal status and well-being, cervical favourability, and urgency.\n\nFollowing an eclamptic convulsion, birth of the baby should occur within 12 hours of the onset of convulsions.", + "source": "WHO_Complications_2017", + "page": 206, + "chars": 577 + }, + { + "text": "> DIAGNOSIS OF HYPERTENSIVE DISORDERS OF PREGNANCY > TABLE S-14. 
Oral antihypertensive medications for non-severe hypertension\n\n#### **Gestation 34 to 36 6\/7 Weeks**\n\n**Note:** After 34 completed weeks of gestation, corticosteroids are not recommended for the indication of fetal lung maturation.\n\nIn women with severe pre-eclampsia and a viable fetus that is between 34 and 36 + 6\/7 weeks of gestation, a policy of expectant management may be recommended, **provided that** uncontrolled maternal hypertension,\n\nworsening maternal status and fetal distress are absent and can be closely monitored.\n\nIf any features of worsening severe pre-eclampsia or eclampsia are present, or if close monitoring of the woman and fetus is not feasible, transfer to a higherlevel facility. If transfer is not possible, the birth should be expedited.", + "source": "WHO_Complications_2017", + "page": 208, + "chars": 833 + } + ], + "retrieved_total_chars": 19705, + "response_text": "This is a **medical emergency**. You must act immediately.\n\n**Escalate to a doctor or arrange urgent referral now.**\n\nThe patient has signs of **severe pre-eclampsia**.\n\n* **Severe pre-eclampsia** is diagnosed by:\n * Systolic blood pressure (SBP) of 160 mmHg or higher and\/ or Diastolic blood pressure (DBP) of 10 mmHg or higher.\n * **Severe pre-eclampsia**\n * * **Severe pre-eclampsia**\n * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** * * * * * * * * * * * * * * * * * * * * * * * * * * * * * ** * * * * * * * * * * * * *** * * * * * * * * * * * * * * ** * * * * * * * * * * * * ** * * * * * * * * * * * * * **** * * * * * * * * * * * ** * * * * * * * * * * * * * **** ** * * * * * * * * * * * * **** * * * * ** * * * * **** ** * * * * * * * * ** * ** * * * * * ** * * * * ** *** * * ** * * * * * * * * * ** ** * * * * * * * * * * * * *** * * * * * * * * * * * ** ** * * * * * * * * * ** ***** * * * * * * * * * * ** ** ** ** ** * * * * * * * * * * ** **** *** ** * * * * * * **** ** ****** * * * * * * ** * ****** * * * ** * * * * ***** * ****** * * * * * * ** * ***** *** ** * * * ** * * * * ** **** ** **** * * * * * * * * * ***** *** ***** * * * * * * * * **** ***** **** * * * * * * * * ***** ***** * **** * * * * ** * * * * **** *** * * * * * * * ** ******** *** * * * * * * * *** * * ***** ******** * * * * * * ** **** *** ****** * * * * * * * * * *** * * **** ********* **** * * * * * * * ** ******* ******** * * * * * * * * * ** *** ********* * * *** * * * * * ** ************ * * * * * * * * * ****** * * **** * * * ***** * ********* * * * * * * * * ***** ******** * * * * * * **** ************ * * * * * * * ** ** *********** * * * * * * * ** * ************ * * * * * * * * *** ************** * * * * * * * * ***** ********** * * * * * * * * * * ** ************ * * * * * * * * **** ********** * * * * * * * * ** ************ * * * * * * * * ** ************** * * * * * * * ** ************* * * * * * * * * * **** ********** * * * * * * * * **** ********** * * * * * * * * ****** * * **** ** ********* * * * * * * * * ** *************** * * * * * * * * **** *************** * * * * * * * * * * * ******* ********* * * * * * * * * *** *************** * * * * * * * * * * ** ************** * * * * * * * * * * **** ** ********** * * * * * * * * * ***** ************* * * * * 
* * * * * * *************** * * * * * * * * **** *********** * * * * * * * * * * ** ************ * * * * * * * * ** ************* * * * * * * * * * **** ************* * * * * * * * * *** * * **** ** *********** * * * * * * * * * * *** * * **** ****** * * * * * * * ** ************** * * * * * * * * * * * *** ************* * * * * * * * ** *************** * * * * * * * * **** ********** * * * * * * * * ************ * * * * * * * * ********** * * * * * * * * *************** ", + "error": null, + "heap_before_mb": 2, + "heap_after_mb": 2 + } + ], + "total_benchmark_time_ms": 355264 +} \ No newline at end of file diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 9d26a86..27b984a 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,15 +1,25 @@ # MAM-AI On-Device Latency Sweep — Model × Backend × k -_Generated: 2026-05-16T09:00:40_ +_Generated: 2026-05-17T14:56:54_ ## Device & stack -- **Device**: OnePlus OPD2413 (SM8750P) — Android 15 -- **Models tested**: Gemma 4 E2B (`gemma-4-E2B-it.litertlm`), Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) +- **Device**: OnePlus OPD2413 (SM8750P / Snapdragon 8 Elite) — Android 15, 16 GB RAM +- **Models tested**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`), Gemma 4 E2B (`gemma-4-E2B-it.litertlm`) - **LiteRT-LM**: 0.11.0 -- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU -- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000 +- **Backends tested**: GPU (OpenCL on Adreno) and CPU (XNNPACK) +- **Activation precision**: GPU defaults to **FP16**, CPU defaults to **FP32** — this asymmetry matters at lifted context (see [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md) §Step 4). All measurement tables use the defaults; one explicit FP32-on-GPU sweep is summarised in the FP16-vs-FP32 GPU section below. +- **Sampling**: temp=1.0, top_p=0.95, top_k=64 — read from `runtime_config.json`. 
No explicit `max_output_tokens` cap is enforced; the runtime decodes until a stop token or until total context hits `maxNumTokens=4096`. +- **Total context budget** (`maxNumTokens` passed to `EngineConfig`): **4096** for every measurement table in this report. The FP16/FP32 section's prose discusses lifted values (5000, 8192) used purely to characterize the cliff — those measurements are not in any table. + +## TL;DR — today's deployment + +> **Ship configuration: FP16 GPU running Gemma 4 E4B at `maxNumTokens=4096` on Snapdragon 8 Elite.** Median total query latency **13–25 s across k=0–15 for E4B** (7.9–18 s for the smaller E2B); cleanly below the FP16 quality cliff at total context ~5000. +> +> k=20 is **partial**: the 8 longest of 18 query types produce prompts >4096 tokens and get runtime-rejected (24/54 errors in every sweep); the other 10 query types complete normally. +> +> Fallbacks: **FP32 GPU at max=4096** (~21–34% slower at k=10–15, no precision cliff) for extra correctness margin on the same hardware; **FP32 GPU at max=5000–6000** for higher context (verified on this 16 GB device; max=8192 OOMs because FP32 doubles the KV cache, so the practical ceiling is around 6500–7500); **CPU FP32** (~2–4× slower than FP16 GPU) for devices without working OpenCL. ## Methodology @@ -20,85 +30,9 @@ Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats - `total_query` is everything: `retrieval + TTFT + decode`. - Reported as median across the 54 runs unless noted (p95 in tables marked `p95`). 
-## Gemma 4 E2B (`gemma-4-E2B-it.litertlm`) - -### Median total query latency (seconds) - -| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | -|---:|---:|---:|---:|---:| -| **0 (no-RAG)** | 0 | 7.9 / 8.1 / 10.8 | 13.2 / 14.1 / 16.0 | 1.60× | -| 1 | 561 | 11.4 / 11.8 / 12.8 | 13.0 / 16.3 / 17.5 | 1.35× | -| 3 | 2098 | 12.8 / 13.8 / 16.5 | 19.1 / 22.0 / 22.5 | 1.44× | -| 5 | 3547 | 9.9 / 14.2 / 14.0 | 26.3 / 27.6 / 28.6 | 2.36× | -| 7 | 5139 | 12.8 / 14.3 / 17.6 | 23.5 / 32.0 / 33.2 | 1.87× | -| 10 | 7482 | 15.2 / 14.6 / 17.9 | 23.4 / 26.2 / 27.7 | 1.68× | -| 15 | 11297 | 13.0 / 12.4 / 14.8 | 31.0 / 38.2 / 40.7 | 2.80× | -| 20 | 14520 | 19.3 / 15.8 / 14.3 | 33.4 / 39.8 / 44.5 | 2.28× | +### Provenance fields in benchmark JSONs (post-`52e11e9`) -### TTFT (ms, median) - -| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | -|---:|---:|---:|---:|---:| -| **0 (no-RAG)** | 0 | 429 | 5564 | 13.0× | -| 1 | 561 | 412 | 5355 | 13.0× | -| 3 | 2098 | 445 | 7394 | 16.6× | -| 5 | 3547 | 793 | 14604 | 18.4× | -| 7 | 5139 | 819 | 14577 | 17.8× | -| 10 | 7482 | 1074 | 13635 | 12.7× | -| 15 | 11297 | 1479 | 21368 | 14.4× | -| 20 | 14520 | 1722 | 22947 | 13.3× | - -### Decode (ms, median) - -| k | GPU decode | CPU decode | CPU÷GPU | -|---:|---:|---:|---:| -| **0 (no-RAG)** | 8263 | 8174 | 0.99× | -| 1 | 7573 | 6764 | 0.89× | -| 3 | 10223 | 9584 | 0.94× | -| 5 | 9052 | 9571 | 1.06× | -| 7 | 10723 | 13451 | 1.25× | -| 10 | 10713 | 11870 | 1.11× | -| 15 | 9664 | 9920 | 1.03× | -| 20 | 11036 | 10697 | 0.97× | - -### p95 total query latency (s) - -| k | GPU p95 | CPU p95 | -|---:|---:|---:| -| **0 (no-RAG)** | 11.4 | 17.4 | -| 1 | 17.7 | 19.1 | -| 3 | 19.7 | 35.8 | -| 5 | 21.2 | 35.1 | -| 7 | 19.4 | 41.0 | -| 10 | 23.8 | 37.9 | -| 15 | 18.1 | 45.2 | -| 20 | 22.2 | 50.4 | - -### Errors (count / 54 runs) - -| k | GPU errors | CPU errors | -|---:|---:|---:| -| **0 (no-RAG)** | 0 | 0 | -| 1 | 0 | 0 | -| 3 | 0 | 0 | -| 5 | 0 | 0 | -| 7 | 0 | 0 | -| 10 | 0 | 0 | 
-| 15 | 0 | 0 | -| 20 | 24 | 24 | - -### Wall-clock - -| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | -|---:|---:|---:|---:| -| **0 (no-RAG)** | 17.5 | 22.5 | 1.28× | -| 1 | 20.9 | 23.9 | 1.14× | -| 3 | 22.4 | 30.0 | 1.34× | -| 5 | 21.1 | 34.2 | 1.62× | -| 7 | 22.8 | 35.5 | 1.56× | -| 10 | 23.3 | 33.9 | 1.46× | -| 15 | 21.1 | 41.7 | 1.97× | -| 20 | 19.1 | 30.4 | 1.59× | +Each benchmark JSON's `config` block records `max_num_tokens`, `artifact_fingerprint` (SHA-256 of first 64 KB of the loaded `.litertlm`), and `git_commit_sha`. Together these let any reviewer cryptographically verify which artifact variant + code state produced the JSON, without trusting the filename. Earlier sweep JSONs (PR #57/#59) lack these fields but their content is unaffected. ## Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) @@ -154,35 +88,63 @@ Per (model × backend × k) configuration: 18 (query × mode) cells × 3 repeats | 15 | 30.6 | 112.7 | | 20 | 35.3 | 104.9 | -### Errors (count / 54 runs) +## Gemma 4 E2B (`gemma-4-E2B-it.litertlm`) -| k | GPU errors | CPU errors | -|---:|---:|---:| -| **0 (no-RAG)** | 0 | 0 | -| 1 | 0 | 0 | -| 3 | 0 | 0 | -| 5 | 0 | 0 | -| 7 | 0 | 0 | -| 10 | 0 | 0 | -| 15 | 0 | 0 | -| 20 | 24 | 24 | - -### Wall-clock - -| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | +### Median total query latency (seconds) + +| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 7.9 / 8.1 / 10.8 | 13.2 / 14.1 / 16.0 | 1.60× | +| 1 | 561 | 11.4 / 11.8 / 12.8 | 13.0 / 16.3 / 17.5 | 1.35× | +| 3 | 2098 | 12.8 / 13.8 / 16.5 | 19.1 / 22.0 / 22.5 | 1.44× | +| 5 | 3547 | 9.9 / 14.2 / 14.0 | 26.3 / 27.6 / 28.6 | 2.36× | +| 7 | 5139 | 12.8 / 14.3 / 17.6 | 23.5 / 32.0 / 33.2 | 1.87× | +| 10 | 7482 | 15.2 / 14.6 / 17.9 | 23.4 / 26.2 / 27.7 | 1.68× | +| 15 | 11297 | 13.0 / 12.4 / 14.8 | 31.0 / 38.2 / 40.7 | 2.80× | +| 20 | 14520 | 19.3 / 15.8 / 14.3 | 33.4 / 39.8 / 44.5 | 2.28× | + +### TTFT (ms, median) + +| k | 
doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 429 | 5564 | 13.0× | +| 1 | 561 | 412 | 5355 | 13.0× | +| 3 | 2098 | 445 | 7394 | 16.6× | +| 5 | 3547 | 793 | 14604 | 18.4× | +| 7 | 5139 | 819 | 14577 | 17.8× | +| 10 | 7482 | 1074 | 13635 | 12.7× | +| 15 | 11297 | 1479 | 21368 | 14.4× | +| 20 | 14520 | 1722 | 22947 | 13.3× | + +### Decode (ms, median) + +| k | GPU decode | CPU decode | CPU÷GPU | |---:|---:|---:|---:| -| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× | -| 1 | 23.0 | 38.7 | 1.68× | -| 3 | 27.3 | 50.2 | 1.84× | -| 5 | 28.2 | 63.0 | 2.23× | -| 7 | 30.0 | 66.5 | 2.22× | -| 10 | 29.1 | 73.2 | 2.51× | -| 15 | 32.4 | 90.8 | 2.80× | -| 20 | 22.8 | 58.6 | 2.57× | +| **0 (no-RAG)** | 8263 | 8174 | 0.99× | +| 1 | 7573 | 6764 | 0.89× | +| 3 | 10223 | 9584 | 0.94× | +| 5 | 9052 | 9571 | 1.06× | +| 7 | 10723 | 13451 | 1.25× | +| 10 | 10713 | 11870 | 1.11× | +| 15 | 9664 | 9920 | 1.03× | +| 20 | 11036 | 10697 | 0.97× | + +### p95 total query latency (s) + +| k | GPU p95 | CPU p95 | +|---:|---:|---:| +| **0 (no-RAG)** | 11.4 | 17.4 | +| 1 | 17.7 | 19.1 | +| 3 | 19.7 | 35.8 | +| 5 | 21.2 | 35.1 | +| 7 | 19.4 | 41.0 | +| 10 | 23.8 | 37.9 | +| 15 | 18.1 | 45.2 | +| 20 | 22.2 | 50.4 | ## Cross-model comparison -Each table below compares **Gemma 4 E4B** (baseline) against each comparator model (Gemma 4 E2B). Ratios are reported as **baseline ÷ comparator** at the same backend × k cell, so values **> 1.0× mean the comparator is faster**. Reading the columns: GPU prefill (TTFT) is compute-bound and tracks parameter count closely; GPU decode is bandwidth-bound and gains less from model shrinkage; CPU is compute-bound throughout. +Each table below compares **Gemma 4 E4B** (baseline) against the comparator (Gemma 4 E2B). Ratios are reported as **baseline ÷ comparator** at the same backend × k cell, so values **> 1.0× mean the comparator is faster**. 
The architectural story behind these ratios (prefill compute-bound vs decode bandwidth-bound) is in Key findings #1–#2 below. ### Gemma 4 E4B vs Gemma 4 E2B @@ -225,12 +187,44 @@ Each table below compares **Gemma 4 E4B** (baseline) against each comparator mod | 15 | 16820 | 9664 | 1.74× | 22497 | 9920 | 2.27× | | 20 | 14688 | 11036 | 1.33× | 22634 | 10697 | 2.12× | -## Errors and the 4096-token context wall +## FP16 vs FP32 GPU (and why the context cap is 4096) + +All cross-model tables above use the **default** GPU activation precision, which on Android is **FP16**. That choice is not a knob in our code — LiteRT-LM picks FP16 for the GPU text-decoder path and FP32 for CPU (XNNPACK). The 4096 `maxNumTokens` value we ship was chosen because of how the two precisions behave at lifted context; the full investigation is in [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md). Headlines: + +- **The 4096 cap is a runtime config check, not an architectural constant.** It's `maxNumTokens` in `EngineConfig`, sourced from `runtime_config.json`. When the prompt alone exceeds it, LiteRT-LM rejects the request before any decoding starts (verified in `liblitertlm_jni.so`). At k=20, the same 8 of 18 query types in every sweep produce prompts above 4096 and get rejected — that's the 24/54 errors visible in every k=20 cell across all (model × backend) combinations. +- ⚠️ **The FP16 default has a quality cliff** at total context ~5000 tokens. If you lift the cap to admit larger prompts, GPU output silently collapses into a `*` repetition loop, deterministically. Concrete example: [`benchmark_20260516T104730_k20.json`](../latency_results/benchmark_20260516T104730_k20.json) (long_01, k=20, FP16 GPU, maxNumTokens=8192). +- **CPU (FP32) stays clean** for the same lifted-cap prompt — the asymmetry isolates precision as the cause, not the artifact or backend choice. 
+- **Confirming the fix**: forcing GPU to FP32 (via injecting `prefer_activation_type=float32` into the `.litertlm` metadata) eliminates the cliff. Direct A/B on the exact `long_01` k=20 case wasn't possible — FP32 KV cache at maxNumTokens=8192 OOMs the test device — but the closest-comparable test (`long_01` k=15, max=5000, response ending at total context ~4514) produced clean output through the same FP16-cliff zone. +- **Our 4096 ship value gives ~900 tokens of safety margin** below the FP16 cliff. Anyone lifting the cap on FP16 GPU enters the silent-failure zone; switch to FP32 GPU first. + +### Latency cost of FP32 on GPU (E4B at maxNumTokens=4096, 2026-05-17) + +Apples-to-apples sweep with `artifact_fingerprint`-verified provenance. Full 8×2 table is in the investigation doc §Step 6; the medians at a representative subset: + +| k | FP16 GPU total | FP32 GPU total | T ratio | FP16 TTFT | FP32 TTFT | TTFT ratio | FP16 decode | FP32 decode | +|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 14.5 s | 16.5 s | 1.14× | 0.97 s | 2.03 s | 2.10× | 13.5 s | 14.4 s | +| 1 | 14.1 s | 18.0 s | 1.28× | 0.95 s | 2.06 s | 2.16× | 11.4 s | 12.8 s | +| 5 | 19.6 s | 24.3 s | 1.24× | 1.88 s | 4.28 s | 2.28× | 16.0 s | 16.3 s | +| 10 | 22.6 s | 27.4 s | 1.21× | 2.53 s | 5.85 s | 2.32× | 18.2 s | 18.6 s | +| 15 | 23.1 s | 30.9 s | 1.34× | 3.45 s | 8.37 s | 2.43× | 16.9 s | 18.4 s | -At k=20, the **same 8 queries × 3 reps = 24 runs** failed across every (model × backend) combination tested: -`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`. -Each failure reports `Input token ids are too long. Exceeding the maximum number of tokens allowed: …>= 4096`. -Both Gemma 4 E4B and Gemma 4 E2B ship the same 4096-token context window; the wall is a property of the `.litertlm` artifact format, not the parameter count or backend. **k_max ≈ 17–18** for both models. 
+Three takeaways: + +- **Prefill (TTFT) is ~2.1–2.5× slower on FP32** — prefill is compute-bound, and FP16 doubles arithmetic throughput on Adreno. The ratio is stable across k. +- **Decode is essentially identical** (within ~12% on every cell in the table above) — decode is bandwidth-bound, so precision barely matters in steady-state generation. +- **Total query is 6–34% slower on FP32**, depending on how much of total is prefill vs decode at the given k. At our typical k=10–15 cells, ~21–34% slower (~5–8 s extra wait per query). + +### When to ship FP32 GPU instead of FP16 GPU + +| Use case | Choice | Why | +|---|---|---| +| **Today's deployment** | FP16 GPU, max=4096 | Clean output below the cliff; fastest UX | +| Extra correctness margin without changing context | FP32 GPU, max=4096 | ~34% slower at k=15 in the table above (~26% in the earlier 2026-05-16 sweep) but eliminates the FP16 cliff as a risk class entirely | +| Higher context (e.g., k>15 desired in future) | FP32 GPU, max=5000–6000 | No cliff. Memory: KV cache doubles → ~6500–7500 ceiling on 16 GB devices | +| GPU unavailable (MediaTek / older Snapdragon) | CPU FP32 | Always clean, but ~2–4× slower than FP16 GPU | + +--- ## Key findings @@ -241,13 +235,13 @@ Halving the parameter count (E4B → E2B) gives a **consistent ~2.3× TTFT speed Decode speedup from E4B → E2B is **~1.5× on GPU** but **~2× on CPU**. Decode is sequential (one token at a time), so on GPU it's limited by memory bandwidth feeding weights into compute units — the smaller model helps less than its parameter count would predict. On CPU the constraint is compute, so the speedup tracks the model shrink. ### 3. Total speedup is decode-dominated, hence smaller than TTFT -**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks decode rather than prefill. At high k where prefill grows large, total speedup climbs toward the prefill ratio (~1.7–1.9× GPU at k=15+).
+**Total-query speedup**: ~1.5× GPU, ~2.2× CPU. Total = TTFT + decode + retrieval; since decode dominates total at low-to-mid k (TTFT is small there), the total speedup tracks decode rather than prefill. The GPU total ratio peaks at k=15 (~1.86×) where prefill is a larger fraction of the budget, then drops back at k=20 (~1.28×) — but the k=20 cell is a **survivor-bias artifact**: 24 of 54 queries error on the prompt-cap check (the 8 longest queries × 3 reps), so the k=20 median is computed over the 30 *shorter* queries that happen to fit. The trend is not a real reversal. ### 4. GPU still wins, but E2B CPU opens up the no-GPU device tier E2B CPU is 1.4–2.4× slower than E2B GPU at every k — GPU remains the preferred backend where available. But E2B CPU at k=1 (~16 s median) is comparable to E4B GPU at k=1 (~14 s), which means devices that previously could *not* deploy MAM-AI at acceptable latency (mid-tier MediaTek, older Snapdragon without OpenCL) now have a realistic path: ship E2B on CPU, restrict k to small values. -### 5. 4096-token context wall is the binding ceiling at high k -k=15 works cleanly on all four (model × backend) combinations. k=20 fails identically across all four: same 8 queries, same 24 (query × rep) failures. The cap is in the model artifact, not the runtime, and is **shared between E4B and E2B**. **Latency is not the constraint at the upper end of k — context window is.** +### 5. The 4096-token cap is a precision-driven safety margin, not a hard runtime limit +k=15 works cleanly on every (model × backend) cell. At k=20, the 8 longest of 18 query types exceed the cap and get rejected by the runtime (24/54 errors per cell, identical across all backends). The cap itself is *liftable* — but on the default **FP16** GPU path, lifted output silently collapses past total context ~5000. FP32 GPU removes the cliff at ~25% latency cost. See the FP16-vs-FP32 GPU section above for details. 
**The constraint at high k is precision, not latency or memory.** ### 6. TTFT scales linearly with retrieved-doc content past k=3 On both backends and both models, TTFT-per-doc-char is roughly constant past k=3, so the prefill story scales predictably. The model shrink translates directly into a TTFT shrink across the whole range. @@ -256,22 +250,6 @@ On both backends and both models, TTFT-per-doc-char is roughly constant past k=3 | Model | Backend | k | File | Wall (min) | Runs | Errors | |---|---|---:|---|---:|---:|---:| -| Gemma 4 E2B | CPU | 0 (no-RAG) | `benchmark_20260515T223100.json` | 22.5 | 54 | 0 | -| Gemma 4 E2B | CPU | 1 | `benchmark_20260515T183910_k1.json` | 23.9 | 54 | 0 | -| Gemma 4 E2B | CPU | 3 | `benchmark_20260515T190320_k3.json` | 30.0 | 54 | 0 | -| Gemma 4 E2B | CPU | 5 | `benchmark_20260515T193337_k5.json` | 34.2 | 54 | 0 | -| Gemma 4 E2B | CPU | 7 | `benchmark_20260515T200805_k7.json` | 35.5 | 54 | 0 | -| Gemma 4 E2B | CPU | 10 | `benchmark_20260515T204358_k10.json` | 33.9 | 54 | 0 | -| Gemma 4 E2B | CPU | 15 | `benchmark_20260515T211813_k15.json` | 41.7 | 54 | 0 | -| Gemma 4 E2B | CPU | 20 | `benchmark_20260515T220014_k20.json` | 30.4 | 54 | 24 | -| Gemma 4 E2B | GPU | 0 (no-RAG) | `benchmark_20260515T175744.json` | 17.5 | 54 | 0 | -| Gemma 4 E2B | GPU | 1 | `benchmark_20260515T152447_k1.json` | 20.9 | 54 | 0 | -| Gemma 4 E2B | GPU | 3 | `benchmark_20260515T154608_k3.json` | 22.4 | 54 | 0 | -| Gemma 4 E2B | GPU | 5 | `benchmark_20260515T160846_k5.json` | 21.1 | 54 | 0 | -| Gemma 4 E2B | GPU | 7 | `benchmark_20260515T163011_k7.json` | 22.8 | 54 | 0 | -| Gemma 4 E2B | GPU | 10 | `benchmark_20260515T165316_k10.json` | 23.3 | 54 | 0 | -| Gemma 4 E2B | GPU | 15 | `benchmark_20260515T171649_k15.json` | 21.1 | 54 | 0 | -| Gemma 4 E2B | GPU | 20 | `benchmark_20260515T173816_k20.json` | 19.1 | 54 | 24 | | Gemma 4 E4B | CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | | Gemma 4 E4B | CPU | 1 | `benchmark_20260514T213337_k1.json` 
| 38.7 | 54 | 0 | | Gemma 4 E4B | CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | @@ -288,6 +266,24 @@ On both backends and both models, TTFT-per-doc-char is roughly constant past k=3 | Gemma 4 E4B | GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | | Gemma 4 E4B | GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | | Gemma 4 E4B | GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | +| Gemma 4 E2B | CPU | 0 (no-RAG) | `benchmark_20260515T223100.json` | 22.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 1 | `benchmark_20260515T183910_k1.json` | 23.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 3 | `benchmark_20260515T190320_k3.json` | 30.0 | 54 | 0 | +| Gemma 4 E2B | CPU | 5 | `benchmark_20260515T193337_k5.json` | 34.2 | 54 | 0 | +| Gemma 4 E2B | CPU | 7 | `benchmark_20260515T200805_k7.json` | 35.5 | 54 | 0 | +| Gemma 4 E2B | CPU | 10 | `benchmark_20260515T204358_k10.json` | 33.9 | 54 | 0 | +| Gemma 4 E2B | CPU | 15 | `benchmark_20260515T211813_k15.json` | 41.7 | 54 | 0 | +| Gemma 4 E2B | CPU | 20 | `benchmark_20260515T220014_k20.json` | 30.4 | 54 | 24 | +| Gemma 4 E2B | GPU | 0 (no-RAG) | `benchmark_20260515T175744.json` | 17.5 | 54 | 0 | +| Gemma 4 E2B | GPU | 1 | `benchmark_20260515T152447_k1.json` | 20.9 | 54 | 0 | +| Gemma 4 E2B | GPU | 3 | `benchmark_20260515T154608_k3.json` | 22.4 | 54 | 0 | +| Gemma 4 E2B | GPU | 5 | `benchmark_20260515T160846_k5.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 7 | `benchmark_20260515T163011_k7.json` | 22.8 | 54 | 0 | +| Gemma 4 E2B | GPU | 10 | `benchmark_20260515T165316_k10.json` | 23.3 | 54 | 0 | +| Gemma 4 E2B | GPU | 15 | `benchmark_20260515T171649_k15.json` | 21.1 | 54 | 0 | +| Gemma 4 E2B | GPU | 20 | `benchmark_20260515T173816_k20.json` | 19.1 | 54 | 24 | + +> **Note:** the table above lists the canonical FP16-default runs (which is what every table in this report tabulates). The `Wall (min)` column is benchmark runtime (operational), not user-facing latency. 
The aggregator dedupes by `(model, backend, k)`, so the **8 FP32 GPU sweep JSONs (2026-05-16)** and the **16 instrumented runs from 2026-05-17 (FP32 + FP16, with `artifact_fingerprint` provenance)** referenced by the FP16-vs-FP32 GPU section are not listed here. Their full filenames + fingerprints are in [`maxnumtoken_investigation.md`](maxnumtoken_investigation.md) §References. --- diff --git a/evaluation/reports/maxnumtoken_investigation.md b/evaluation/reports/maxnumtoken_investigation.md new file mode 100644 index 0000000..4616b69 --- /dev/null +++ b/evaluation/reports/maxnumtoken_investigation.md @@ -0,0 +1,519 @@ +# Investigation: What the 4096 `maxNumTokens` Wall Actually Is + +_Last updated: 2026-05-17. Companion to [latency_report_v2.md](latency_report_v2.md) §"FP16 vs FP32 GPU (and why the context cap is 4096)" (formerly §"Errors and the 4096-token context wall")._ + +> ⚠️ **Critical for anyone shipping on-device Gemma 4 with LiteRT-LM**: **the default activation precision on Android GPU is FP16**, and **FP16 attention causes a deterministic decoding failure** (repetition loop into `* * * * ...`) once the total context length exceeds the artifact's calibrated zone (around total context ~5000 tokens on the Gemma 4 E4B/E2B `.litertlm` artifacts we tested). The breakdown is **silent** — no error, no warning, just garbage tokens — and it's **bit-exactly reproducible** across runs because GPU uses greedy decoding by default. +> +> A concrete example of this failure is captured in [`benchmark_20260516T104730_k20.json`](../latency_results/benchmark_20260516T104730_k20.json): query `long_01` at k=20, FP16 GPU, maxNumTokens=8192. The response opens with coherent medical reasoning for the first ~50 generated tokens, then deterministically collapses into an asterisk-repetition loop for the remaining ~1450 tokens.
**Keep this file in the repo as the reference example of the failure mode.** +> +> The current MAM-AI deployment is safe because we ship `maxNumTokens=4096`, which is well below the breakdown point — but anyone lifting that cap on FP16 GPU will hit this wall. Force FP32 via the artifact-metadata override (see Step 3) if you need higher context on GPU. + +## TL;DR + +- **GPU on Android runs attention in FP16; CPU runs FP32 (XNNPACK)** — verified from LiteRT-LM source and from strings inside `liblitertlm_jni.so`. This is why the two backends behave differently at lifted context. +- The 4096 cap is **not artifact-baked** — passing `maxNumTokens=8192` to `EngineConfig` succeeds at init on both backends and the artifact happily ran prefill over a 4917-token prompt. +- At `maxNumTokens=8192`, FP16 GPU output **collapses into a repetition loop at total-context position ~5000 tokens** — about 50 generated tokens into the response. The transition is **sharp**, deterministic across runs. +- CPU at `maxNumTokens=8192` stays coherent for the same prompt. The asymmetry is the precision difference. +- **FP16 is confirmed as the root cause** (Step 3, 2026-05-16): forcing GPU to FP32 via a `prefer_activation_type=float32` metadata override on the `.litertlm` artifact produces clean output past total context 4096 — the cliff zone where FP16 degenerates. *Caveat*: the exact direct A/B on the FP16 failure case (`long_01` k=20 max=8192) couldn't be run on this device — FP32's larger KV cache OOMs the app. The fix is verified on the closest-comparable test (`long_01` k=15 max=5000, response ending at total context ~4514, clean); the direct head-to-head would need a 24 GB-RAM device. See Step 3 §"Important caveat" for the full evidence map. +- **Operational conclusion**: the 4096 deployment ceiling on FP16 GPU is conservative — ~900 tokens of safety margin to the actual breakdown. Current k=15 deployment is nowhere near the cliff. 
+- **FP32 GPU latency cost** (Step 5 full sweep, 2026-05-16): only **~25% slower than FP16 GPU at k=15** (~6 s extra wait), almost entirely in TTFT (compute-bound prefill). Decode is essentially identical (bandwidth-bound). FP32 GPU is a real shipping option for use cases that want extra correctness margin or higher k — not just an experiment. Memory ceiling on this device is in the 5000–8000 maxNumTokens range; KV cache doubles vs FP16. + +--- + +## Context + +PR #59 measured latency for Gemma 4 E4B and E2B at k ∈ {0,1,3,5,7,10,15,20}. At k=20, the same 8 queries failed across every (model × backend) combination with `Input token ids are too long. Exceeding the maximum number of tokens allowed: N >= 4096`. The report originally claimed the wall was *"a property of the .litertlm artifact format."* + +PR #60 made `maxNumTokens` explicit at the call site (`EngineConfig(..., maxNumTokens = 4096, ...)`) and ran two experiments to figure out what 4096 actually is: + +| Test | What we found | +|---|---| +| Pass `maxNumTokens = 2048` (CPU + GPU) | Engine clamps to 2048; same prompt fails with `>= 2048` instead of `>= 4096`. **Knob is wired through.** | +| Pass `maxNumTokens = 8192` (CPU) | Engine init succeeds. Queries that previously failed at 4096 now run end-to-end with **clean, coherent responses**. | +| Pass `maxNumTokens = 8192` (GPU) | Engine init succeeds. Queries run end-to-end but produce **garbage past a certain point** — first ~1000 chars are real medical reasoning, then ~5000 chars of `* * * *...` repetition. | + +This file documents the follow-up investigation into *why GPU breaks and CPU doesn't*, and *where* exactly GPU breaks. 
+ +--- + +## Step 4 — Why the backends diverge: precision + +### Source: LiteRT-LM OSS repo (`google-ai-edge/LiteRT-LM`) + +[`runtime/executor/executor_settings_base.h`](https://github.com/google-ai-edge/LiteRT-LM/blob/main/runtime/executor/executor_settings_base.h) defines the activation-precision enum: + +```cpp +enum class ActivationDataType { + FLOAT32, + FLOAT16, + INT16, + INT8, +}; +``` + +And the comment on the field that holds it (line 308–316): + +> *"Optional setting for specific activation data type. **If not set, the default activation data type for each OS & backend will be used.** [...] OpenCL backend only support fp32 on Linux."* + +The factory `LlmExecutorSettings::CreateDefault()` in [`runtime/executor/llm_executor_settings.cc`](https://github.com/google-ai-edge/LiteRT-LM/blob/main/runtime/executor/llm_executor_settings.cc) does **not** set `activation_data_type_` — it leaves the `std::optional` empty, so the OS/backend default takes over. Our Kotlin `EngineConfig` doesn't set it either. + +### Source: strings inside `liblitertlm_jni.so` + +The smoking-gun string, lifted from the native lib (`grep` over strings in the AAR-extracted .so): + +> *"not found for prefer activation type. Use system's default backend activation type. 
**System's default activation type for Text decoder is fp16.** Vision encoder and audio encoder default is fp32."* + +Supporting strings from the same binary: + +- `#pragma OPENCL EXTENSION cl_khr_fp16 : enable` — OpenCL FP16 explicitly enabled +- `#define FLT16 float16` / `#define FLT16 half16` — preprocessor macros for FP16 type +- FP16-specific kernels: `Softmax (NC, F16)`, `Batch Matrix Multiply (NC, F16)`, `Average Pooling (NHWC, F16)`, `intel_sub_group_f16_f16_matrix_mad_k16` +- `CalculationsPrecision::F16/F32_F16 is not supported on this GPU(no fp16 support).` — FP16 is the preferred path, FP32 is the fallback when hardware doesn't support FP16 + +CPU side, strings show **XNNPACK** as the delegate (`LlmLiteRTXnnpackExecutor`, `TfLiteXNNPackDelegate`). XNNPACK on ARM64 defaults to FP32 for floating-point ops; no FP16 attention kernels appear in the CPU code paths. + +### Conclusion + +| Backend | Activation precision | How we know | +|---|---|---| +| **GPU (OpenCL on Adreno)** | **FP16** | Native log line: *"System's default activation type for Text decoder is fp16"* + explicit FP16 kernels in OpenCL backend | +| **CPU (XNNPACK on ARM64)** | **FP32** | XNNPACK default for ARM64 FP ops + no FP16 attention kernels in CPU paths | +| GPU on Linux | FP32 | Documented in header — but not our deployment target | + +We are not overriding these defaults from Kotlin. So when we deploy on Snapdragon, GPU = FP16 and CPU = FP32 in attention. + +### Another finding worth flagging + +From [`llm_executor_settings.h`](https://github.com/google-ai-edge/LiteRT-LM/blob/main/runtime/executor/llm_executor_settings.h) line 387–389: + +> *"Maximum number of the sum of input and output tokens. **It is equivalent to the size of the kv-cache.**"* + +So `maxNumTokens` is **total context size** (prompt + response), and equals the KV-cache allocation. The 4096 cap isn't an "input prompt cap" — it's the total prompt + response budget. 
At k=20 with a 4917-token prompt, no response is even possible at `maxNumTokens=4096`; the prompt alone exceeds the budget. This subtly corrects the prior framing. + +--- + +## Step 1 — Where the GPU output actually breaks + +### Setup + +- File: [benchmark_20260516T104730_k20.json](../latency_results/benchmark_20260516T104730_k20.json) +- Query: `long_01` at k=20 +- Backend: GPU (`useGpuForLlm=true`) +- `maxNumTokens = 8192` (Phase C experiment, uncommitted) +- Response: 6027 chars, ~1506 estimated tokens (4.00 chars/token average) +- Prompt: **4917 tokens** (deterministic — confirmed by the failure messages across all four (model × backend) k=20 sweep cells) + +### Method + +200-char sliding-window over the response, computing: +- % letters per window (proxy for "is this English prose?") +- % asterisks per window (proxy for "is this the repetition loop?") +- Approximate response-token position (`char_pos / 4.00`) +- Total context position (`prompt_tokens + response_tokens`) + +### Result + +| Char position | Response tokens (est) | Total context | % letters | % asterisks | What it looks like | +|---:|---:|---:|---:|---:|---| +| 0 | 0 | 4917 | 72.0% | 8.5% | `This is a **medical emergency**...` | +| 200 | 50 | 4967 | 63.0% | 5.0% | Still mostly prose, structure degrading | +| 400 | 100 | 5017 | **2.0%** | **25.0%** | Collapsed into `*` pattern | +| 600 | 150 | 5067 | 0.0% | 25.0% | Pure repetition | +| 1000+ | 250+ | 5167+ | 0.0% | 25–45% | Sustained `* * *...` | + +The transition is **sharp** — from 72% letters to 2% letters across a 200-char window (~50 generated tokens). After that, response stays at 0% letters for the remaining ~5500 chars. 
+ +### Where on the timeline this happens + +``` +Prompt: [============================== 4917 tokens ==============================] + ↑ +Coherent decode: [≈50 tokens of medical prose] + ↑ collapse +Garbage decode: [≈1450 tokens of asterisks] + +Calibration boundary [============= 4096 =============] + |←──~900 tokens slack──→| + collapse at ≈5000 +``` + +### Three findings, in order of importance + +**1. The transition is sharp, not gradual.** A pure FP16-noise-compounding story predicts a gradual decay. We see a near-binary cliff in ~50 tokens. This points more toward a **kernel-level boundary** (a tile size, a buffer dimension, a lookup-table size hardcoded for the calibrated context) than pure precision drift. FP16 likely plays a role by removing the precision headroom that would have absorbed borderline kernel artifacts on CPU. + +**2. It's at total context ~5000, not 4096.** The model successfully ran prefill over a 4917-token prompt (already 821 tokens past the 4096 cap) and produced ~50 tokens of coherent decoded output drawing on that prompt. The cliff is around total context position **4967–5017** tokens. There's ~900 tokens of margin between the 4096 deployment ceiling and the actual breakdown point. + +**3. It looks like a *decode-side* failure, not a prefill-side failure.** Prefill works fine over the 4917-token prompt, and the first ~50 decoded tokens are coherent — so reading from KV cache at positions 0–4916 is fine. What breaks is when the model **writes new K/V entries** at positions ≥ 4917 during decode and then has to attend back to them. The decode-side KV-update kernels look like the prime suspect. + +### Operational implication + +The 4096 deployment ceiling is **more conservative than necessary** from a pure quality standpoint — GPU output stays coherent up to ~5000 total context. 
But: + +- 4096 is the artifact's published/calibrated value, and the breakdown past 5000 is dramatic (full collapse, not graceful degradation) +- The ~50-token settling window means the safety margin past 4917 is fragile +- Pushing closer to 5000 invites unpredictable transition + +So **4096 remains the right ship value** for production. The new understanding is that we have **~900 tokens of headroom**, not zero. At our current k=15 deployment (typical prompt ~3500 tokens), we are nowhere near the cliff and not silently shipping degraded output. **This rules out the safety concern of "the cap is tighter than we think."** + +--- + +## Step 2 — Reproducibility: bit-exact across 3 runs + +### Setup + +- Same APK as Step 1 (GPU, `maxNumTokens = 8192`). +- Same query: `long_01` at k=20, deterministic 4917-token prompt. +- `--repeats 3`, `--cooldown 1000`. +- File: [benchmark_20260516T151036_k20.json](../latency_results/benchmark_20260516T151036_k20.json) + +### Result + +| | Rep 1 | Rep 2 | Rep 3 | +|---|---|---|---| +| Response chars | 6027 | 6027 | 6027 | +| Estimated tokens | 1506 | 1506 | 1506 | +| Transition position (char) | 400 | 400 | 400 | +| Total context at transition | ~5017 | ~5017 | ~5017 | +| Response head (first 80 chars) | `This is a **medical emergency**. You must act immediately. **Escalate to a doct` | identical | identical | +| Response tail (last 80 chars) | ` * * * * ********** * * * * * * * * ***************` | identical | identical | +| Decode time (ms) | 263 158 | 263 223 | 263 185 | + +Decode-time variance is ~0.05% — pure system jitter. The **model outputs are bit-identical** across all three runs. + +### Interpretation + +This rules out stochastic FP16 noise as the proximate cause. 
The GPU backend defaults to greedy decoding (`max_top_k = 1`, the `GpuConfig` default we found in `LlmExecutorSettings::CreateDefault()`); with identical prompts, identical KV cache state, and identical numerical paths through deterministic FP16 OpenCL kernels, every decode step picks the same next token. There is no randomness in the system to mask whatever is going wrong. + +So the failure is not "FP16 sometimes drifts past 4096" — it is "FP16 kernels **deterministically produce broken K/V** at positions past the artifact's calibrated zone, in a way that always degenerates into the same output." + +--- + +## Refined mechanism hypothesis (after Steps 4, 1, 2) + +1. The Gemma 4 `.litertlm` artifact has KV-cache and attention kernels **calibrated** for a 4096-token context. The OpenCL implementations of those kernels assume something about position layout (tile size, buffer dimension, position-embedding cache) that's accurate up to ~4096 and starts to misbehave past it. + +2. **Prefill is robust past 4096** — at engine init the KV cache is allocated to whatever `maxNumTokens` we pass (8192 in our test), and the prefill kernels appear to handle the longer prompt correctly (the model produces real medical content for ~50 decoded tokens). + +3. **Decode-side KV writes past position ~4917 produce *deterministically* off-distribution K/V values** (Step 2 evidence). As soon as the model attends back to those bad-write positions in subsequent decode steps, the attention scores collapse onto a small set of high-probability tokens (asterisks, in our case — a tokenizer-common character). The model then enters a self-reinforcing loop: bad outputs → bad self-attention → bad outputs. Same RNG-free state every run, so the same garbage every run. + +4. **FP16 is the root cause across backends** (confirmed Step 3, 2026-05-16). 
The kernel calibration mismatch exists on both backends — but FP32's larger dynamic range absorbs the off-distribution values on CPU (XNNPACK default), while FP16 on GPU has no precision headroom and the bad values dominate. Forcing the GPU artifact to FP32 via metadata override eliminates the breakdown — same artifact, same query, clean output. + +The "exactly 4096" framing was wrong. The real picture: a ~900-token "uncalibrated zone" between ~4096 and ~5000, at the end of which output on the FP16 path collapses sharply and catastrophically (Step 1: within ~50 generated tokens), while FP32 stays coherent. The deterministic kernel/precision interaction is what makes the collapse visible only on GPU. + +--- + +## Reachability constraint: cannot force FP32 on GPU from Kotlin in 0.11.0 + +The natural follow-up — force the GPU backend to use FP32 instead of FP16 and re-run — turned out to be **not possible from the public Kotlin API** in LiteRT-LM 0.11.0. Verified across four sources: + +1. **`Config.kt`**: `EngineConfig` data class has 7 fields (`modelPath, backend, visionBackend, audioBackend, maxNumTokens, maxNumImages, cacheDir`). No precision field. `Backend.GPU()` is zero-arg. +2. **`Engine.kt:initialize()`**: calls `LiteRtLmJni.nativeCreateEngine(...)` with 14 args. None of them is precision-related. +3. **`LiteRtLmJni.kt`**: the JNI bridge declaration. The `nativeCreateEngine` signature takes exactly those 14 parameters — that is the entire JNI surface for engine creation. No `nativeSetActivationDataType` method anywhere in the bridge. +4. **`llm_executor_settings.cc:CreateDefault()`**: doesn't set `activation_data_type_`, leaving it as `std::nullopt`. So when the JNI bridge constructs the engine, no activation type ever gets set, and the runtime falls back to its system default (FP16 for text decoder on Android GPU per the native-lib log string from Step 4).
+ +I also scanned the native lib for environment-variable overrides (`LITERTLM_*`, `LITERT_*`, `OPENCL_*`, `FORCE_FP32`) — none exist. + +The C++ `LlmExecutorSettings::SetActivationDataType(...)` method **exists** in LiteRT-LM source, but it is **not bridged** to the Kotlin/JNI layer in version 0.11.0. The hooks are there server-side but not wired to client-side. + +### Paths that would unblock the FP32 control test + +- **(b) Modify the `.litertlm` artifact header.** ✅ **Used — see Step 3 below.** The FlatBuffers schema (`schema/core/litertlm_header_schema.fbs`) defines per-section `items` as a list of arbitrary `KeyValuePair` entries. The native runtime looks for a key `prefer_activation_type` attached to the prefill_decode model section and, if present, honors it; otherwise it falls back to the system default (FP16 on Android GPU). Setting `prefer_activation_type = "float32"` in the artifact's section metadata forces FP32 on GPU without any code changes. +- **(c) File an upstream issue with `google-ai-edge/LiteRT-LM`** to expose `SetActivationDataType` in the Kotlin `EngineConfig` API. Still worth filing as the right systemic fix, but no longer the unblocker — option (b) works. +- **(d) Build a custom LiteRT-LM AAR.** Clone the repo, add a field to the Kotlin `EngineConfig` + parameter to `nativeCreateEngine` + plumbing to the C++ setter. Multi-day project; **avoided** thanks to option (b). + +--- + +## Step 3 — FP32 control test (2026-05-16) — **FP16 confirmed as root cause** + +### Procedure + +Used the official `litert-lm-builder` Python package (installed via pip in a Python 3.14 venv, since the published 0.11.0 package needs `tomllib`). + +1. **Peek + dump** the existing `gemma-4-E4B-it.litertlm` into its 12 constituent sections plus a `model.toml` build spec — `litert-lm-peek --litertlm_file gemma-4-E4B-it.litertlm --dump_files_dir /tmp/litertlm-dump-e4b/` +2. 
**Edit the TOML** to add `prefer_activation_type` as `additional_metadata` on the prefill_decode section (the 0.11.0 builder doesn't surface `prefer_activation_type` as a first-class TOML key, but `additional_metadata` lets us inject any KeyValuePair): + + ```toml + [[section]] + model_type = "prefill_decode" + section_type = "TFLiteModel" + data_path = "Section10_TFLiteModel_tf_lite_prefill_decode.tflite" + additional_metadata = [ + { key = "prefer_activation_type", value = "float32", value_type = "String" }, + ] + ``` + +3. **Rebuild** — `litert-lm-builder toml --path model_fp32.toml output --path /tmp/gemma-4-E4B-it-fp32.litertlm`. Output is the same 3.4 GB, byte-identical data sections, only the metadata header changed. +4. **Verify with peek**: confirmed Section 10 now has `Key: prefer_activation_type, Value (String): float32`. +5. **Push to device, install GPU APK, run benchmark**. + +### Confirmation at engine init + +Logcat showed: + +``` +litert_lm_loader.cc:234] section_prefer_activation_type: float32 +activation_data_type: FLOAT32 +``` + +The runtime parsed the metadata override and switched to FP32 attention. + +### First attempt: `maxNumTokens = 8192` → silent OOM + +The k=20 query crashed the app process in **7 seconds**, before any output token was generated. No native crash log, no OOM-killer line, no tombstone — just a generic "process died" entry. Memory math explains it: FP32 doubles the KV cache (~5.8 GB at 8192) and the peak demand (~11–13 GB) exceeded the device's available RAM (~10 GB after Android baseline). The GPU allocator silently failed and the process was killed. 
+ +### Retry: `maxNumTokens = 5000`, k=15 → ✅ clean output + +| | Value | +|---|---| +| TTFT | 10.5 s | +| Decode | 19.2 s | +| Total | 31.8 s | +| Response | 998 chars / 249 tokens, coherent medical reasoning | +| Sliding-window analysis | All windows 60–72% letters, 7–11% asterisks — **no transition to garbage** | + +Response began with `"This is a **severe pre-eclampsia** situation. You must act quickly. **Immediate Actions:**..."` and ended with `"...Consult a doctor immediately for guidance on any medications you can safely give while waiting."` — a complete, well-structured medical answer. The total context at end of response was ~4514 tokens, comfortably past the 4096 deployment cap but below the FP16 cliff at ~5000. + +### Conclusion + +**FP16 is the root cause of the GPU breakdown.** Same artifact, same `long_01` query (at k=15 with `maxNumTokens=5000` rather than k=20 with 8192, because the latter OOMs on FP32 — see the caveat below), same Adreno 830 OpenCL backend, same greedy decoding. The only intended change was activation precision — and it eliminated the degeneration. The "kernel boundary independent of precision" hypothesis is ruled out. + +The mechanism is now fully understood: + +- FP16 OpenCL attention kernels produce off-distribution K/V values for decode positions past the artifact's calibrated zone +- CPU's XNNPACK FP32 path has enough numerical headroom to absorb the same calibration mismatch and stays coherent +- GPU's FP16 path doesn't; once attention scores drift onto the asterisk token, the model self-reinforces into the repetition loop we observed + +### Memory ceiling for FP32 GPU on the test device + +| maxNumTokens | KV cache (FP32) | Peak demand | Result | +|---|---|---|---| +| 4096 | 2.9 GB | ~7.8 GB | ✅ Fits | +| 5000 | 3.5 GB | ~8.4 GB | ✅ Confirmed working | +| 6000–7000 | 4.2–4.9 GB | ~9.1–9.8 GB | ❓ Untested, likely OK | +| 8192 | 5.8 GB | ~11–13 GB | ❌ OOM crash | + +Practical FP32-GPU ceiling on this device is somewhere in **6500–7500**; we didn't bisect to find the exact value.
+ +### Important caveat: the FP32 fix is not *directly* verified on the same `long_01` k=20 case where FP16 fails + +The natural head-to-head test — running `long_01` at k=20 with `maxNumTokens=8192` on **FP32 GPU** to directly compare against the FP16 GPU asterisk-loop captured in [`benchmark_20260516T104730_k20.json`](../latency_results/benchmark_20260516T104730_k20.json) — **could not be executed on this device**. FP32 KV cache at maxNumTokens=8192 plus the model plus activation buffers exceeds the ~10 GB of RAM available to the app, and the OS killed the benchmark process 7 seconds into the first large-prompt prefill, before any output was generated. + +So strictly, for `long_01` specifically: + +| Configuration | Observed | Source | +|---|---|---| +| FP16 GPU, max=8192, k=20 | ❌ Asterisk repetition loop (1506 garbage tokens past first ~50 coherent) | Steps 1–2, [benchmark_20260516T104730_k20.json](../latency_results/benchmark_20260516T104730_k20.json) | +| FP32 GPU, max=8192, k=20 | 🔥 OOM crash (no output produced) | Step 3 first attempt | +| FP32 GPU, max=5000, **k=15** | ✅ Clean response, no degeneration, total context reached ~4514 (past 4096 cap, into FP16 cliff zone) | Step 3 retry, [benchmark_20260516T162810_k15.json](../latency_results/benchmark_20260516T162810_k15.json) | + +**The closest-comparable evidence** — FP32 GPU on `long_01` at k=15 with maxNumTokens=5000, where the total context at end of response reached ~4514 tokens, **safely past 4096 where FP16 starts degrading toward its ~5000 cliff** — produced clean output. This is strong indirect evidence that FP32 would also resolve the k=20 case if memory allowed it; the only thing standing between the indirect and direct test is the device's GPU memory ceiling, not anything about the precision mechanism. + +**Recommended next test if anyone wants the airtight A/B**: reproduce the FP32 GPU `long_01` k=20 maxNumTokens=8192 run on a higher-RAM device variant (e.g. 
a 24 GB Snapdragon 8 Elite phone), where the ~12 GB peak demand fits. The mechanism predicts clean output; this is what would confirm it. + +--- + +## Step 5 — FP32 GPU latency sweep at maxNumTokens=4096 (2026-05-16) + +Once Step 3 established that FP32 GPU produces clean output, the next question was: **how slow is it, really?** A single-data-point measurement (k=15, maxNumTokens=5000) had suggested ~3× slower decode, but that turned out to be a confused comparison (FP32-E4B vs FP16-**E2B**, two different models). The right comparison is FP32-E4B vs FP16-E4B at the same maxNumTokens. + +So we ran the full 8-k sweep with the FP32-tagged artifact at `maxNumTokens=4096` (the production cap) on GPU. **Total wall-clock: ~4.5 hours**, mirroring the original FP16 GPU sweep cell-by-cell. + +### Result + +| k | FP16 total | **FP32 total** | ratio | FP16 TTFT | FP32 TTFT | FP16 decode | FP32 decode | +|---|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 14.4 s | 16.5 s | **1.14×** | 0.96 s | 2.03 s | 13.5 s | 14.4 s | +| 1 | 14.1 s | 16.6 s | **1.17×** | 0.95 s | 2.06 s | 11.4 s | 12.7 s | +| 3 | 19.1 s | 20.2 s | **1.06×** | 0.99 s | 2.16 s | 16.4 s | 16.2 s | +| 5 | 19.6 s | 23.8 s | **1.21×** | 1.88 s | 4.28 s | 15.9 s | 16.3 s | +| 7 | 22.9 s | 27.2 s | **1.19×** | 1.92 s | 4.38 s | 17.2 s | 19.0 s | +| 10 | 22.4 s | 27.5 s | **1.23×** | 2.52 s | 5.87 s | 18.1 s | 18.6 s | +| 15 | 24.4 s | 30.8 s | **1.26×** | 3.46 s | 8.37 s | 16.8 s | 18.3 s | +| 20 (ok only) | 21.0 s | 29.0 s | **1.38×** | 3.99 s | 9.76 s | 14.7 s | 18.0 s | + +**Same 24 errors at k=20 on both FP16 and FP32** (the prompt-cap rejection; identical 8 queries fail). Confirms the 4096 cap behavior is precision-agnostic — it's a runtime config check, not a numerical thing. + +### Two cleanly separated stories + +**1. Decode speed is essentially identical in FP16 vs FP32 GPU.** Looking at the decode columns: FP16 11.4–18.1 s vs FP32 12.7–19.0 s — within ~12% at every k through 15 (the k=20 ok-only row is the outlier at ~22%). 
**Decode is bandwidth-bound**, not compute-bound; the bottleneck is loading model weights through memory each step, not the arithmetic precision. So FP32 barely costs anything in steady-state token generation. + +**2. Prefill (TTFT) is ~2–2.5× slower under FP32.** TTFT: FP16 ~1–4 s vs FP32 ~2–10 s. **Prefill is compute-bound** — the model processes the entire input prompt in parallel through attention, and FP16 doubles arithmetic throughput on Adreno. The 2× FP32 cost reflects the parallel-compute hit. + +**The entire FP32 slowdown lives in TTFT.** The total slowdown ratio grows with k purely because prefill is a larger fraction of total query time at higher k. + +### Corrected slowdown summary + +- **No-RAG, k=1, k=3**: FP32 only **6–17% slower** — UX-invisible. +- **Mid k (k=5–10)**: FP32 **19–23% slower** — noticeable but not painful. +- **Largest viable k (k=15)**: FP32 **26% slower** — noticeable (~6 s extra wait). +- **k=20 ok cells**: 38% slower (~8 s extra) — but with the 4096 cap, the actual fail rate is 24/54 the same either way. + +That's a **much smaller** hit than the ~3× I'd reported from the single-data-point measurement. The error there was comparing FP32-E4B against FP16-**E2B** by mistake. + +### What this means for deployment + +The FP32-GPU path is **a real deployment option, not just an experiment**: + +| Config | Latency at k=15 | Memory peak | Quality | When to ship | +|---|---|---|---|---| +| FP16 GPU, max=4096 | ~24 s | ~7 GB | Clean (below cliff) | **Today's ship** | +| FP32 GPU, max=4096 | ~31 s | ~7 GB | Clean (no FP16 cliff) | If we want extra correctness margin | +| FP32 GPU, max=5500 | ~32 s + cliff lift | ~9 GB | Clean past 4096 | If we want higher k *and* the device has ≥12 GB | +| CPU FP32, max=4096 | ~85 s | ~7 GB | Clean | Fallback when GPU isn't available | + +The headline: **at maxNumTokens=4096, FP32 GPU is ~25% slower than FP16 GPU at our typical operating points (~6 s extra at k=15)**. 
That's a real UX hit but not catastrophic. The choice between FP16 GPU and FP32 GPU is now a UX-vs-margin tradeoff — not an "is FP32 even feasible" question. + +If we ever want to push past 4096 in production, FP32 GPU becomes the right backend (it doesn't have the cliff); for staying at 4096 there's no functional reason to switch from FP16. + +--- + +## Step 6 — Apples-to-apples FP32 vs FP16 GPU sweep with instrumented JSONs (2026-05-17) + +Step 5's data was directionally correct, but its JSONs lacked the fields needed to self-verify which precision condition each run used. After landing the instrumentation commit (`52e11e9`) — which: + +- consolidates `max_num_tokens` to a single source of truth in `runtime_config.json`, +- removes the hardcoded `max_tokens=32000` fiction the benchmark used to record, +- adds `artifact_fingerprint`, `git_commit_sha`, and `litertlm_version` to every benchmark JSON's `config` block, + +…we re-ran both sweeps overnight at maxNumTokens=4096 on GPU: 8 FP32 runs (k ∈ {0, 1, 3, 5, 7, 10, 15, 20}), then pushed the original FP16 artifact to the device, then 8 FP16 runs. Total wall-clock ~10 hours. All 16 JSONs carry `artifact_fingerprint`-verified provenance and the `git_commit_sha` of the instrumentation commit. 
+ +### Result — full comparison + +| k | FP16 total | FP32 total | T ratio | FP16 TTFT | FP32 TTFT | TTFT ratio | FP16 decode | FP32 decode | dec ratio | +|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 14.5 s | 16.5 s | **1.14×** | 0.97 s | 2.03 s | **2.10×** | 13.5 s | 14.4 s | 1.07× | +| 1 | 14.1 s | 18.0 s | **1.28×** | 0.95 s | 2.06 s | **2.16×** | 11.4 s | 12.8 s | 1.12× | +| 3 | 21.0 s | 22.2 s | **1.06×** | 0.99 s | 2.15 s | **2.17×** | 16.4 s | 16.2 s | 0.99× | +| 5 | 19.6 s | 24.3 s | **1.24×** | 1.88 s | 4.28 s | **2.28×** | 16.0 s | 16.3 s | 1.02× | +| 7 | 20.9 s | 27.3 s | **1.30×** | 1.91 s | 4.38 s | **2.29×** | 17.3 s | 19.0 s | 1.10× | +| 10 | 22.6 s | 27.4 s | **1.21×** | 2.53 s | 5.85 s | **2.32×** | 18.2 s | 18.6 s | 1.02× | +| 15 | 23.1 s | 30.9 s | **1.34×** | 3.45 s | 8.37 s | **2.43×** | 16.9 s | 18.4 s | 1.09× | +| 20 (ok only) | 20.7 s | 27.3 s | **1.32×** | 3.99 s | 9.79 s | **2.46×** | 14.8 s | 18.0 s | 1.22× | + +Errors at k=20: 24 on **both** FP32 and FP16 (identical 8 queries — confirms the 4096 prompt-cap rejection is precision-agnostic, just a runtime config check). + +### Key findings — cleanly separated and confirmed + +**1. TTFT (prefill) is consistently ~2.1–2.5× slower on FP32.** Across every k, the ratio is narrow and predictable, growing modestly with prompt length. This is the compute-bound part: prefill processes the whole prompt in a parallel forward pass and FP16 doubles arithmetic throughput on Adreno. Halving precision halves the parallel-compute time; the ~2.2× ratio matches that prediction within noise. + +**2. Decode (generation) is essentially identical** between FP32 and FP16 — every ratio in the decode column is between 0.99× and 1.22×. **Decode is bandwidth-bound** (sequential token generation requires loading weights through memory each step), so the FP16/FP32 distinction barely matters in steady state. + +**3. 
Total query latency slowdown averages ~1.24×** (range 1.06×–1.34×), with the ratio mostly tracking how much of total query time is prefill vs decode. Larger k → longer prompt → relatively more prefill → larger FP32 hit. The "worst case" we measured was k=15 at 1.34× (~8 s extra wait per query); typical k (k=1, 3, 5) sits around 1.06×–1.28×. + +### Confirmation — these numbers match Step 5 + +The Step 5 data (yesterday, before instrumentation) gave ratios in the same range (1.06×–1.38×). The instrumented sweep confirms those numbers were genuine, not artifacts of timing variance. We now have JSON-level provenance for both sides of the comparison. + +### Verified-mapping at the data level + +Spot-checks of the JSONs verify the instrumentation matches reality: + +- FP32 sweep JSONs all carry `artifact_fingerprint = 9fdf9dd11f4e79507bc06df5ba0ddbf33eb12ed8524852d728dfec72d1aadabc` (FP32-tagged build) +- FP16 sweep JSONs all carry `artifact_fingerprint = cfa067b69af3ccd147c234be3058525774379c1349bacf0b3e85d7a26a42b868` (FP16 default) +- All 16 JSONs carry `git_commit_sha = 52e11e9` and `max_num_tokens = 4096` +- TTFT-from-logcat is consistent with the recorded precision (FP16 ~1 s at k=1, FP32 ~2 s at k=1) + +A reviewer reading any one of these JSONs cold can cross-reference the fingerprint against the mapping table in the "Reference: artifact fingerprint mapping" section below and immediately know which precision condition the run used. 
+ +### Deployment implications (refined from Step 5) + +The shipping decision matrix is now well-anchored: + +| Config | Median latency at k=15 | KV-cache memory | Quality | When to ship | +|---|---|---|---|---| +| **FP16 GPU, max=4096** | **23.1 s** | ~2.9 GB | Clean (below cliff) | **Today's deployment** | +| FP32 GPU, max=4096 | 30.9 s (+34%) | ~2.9 GB | Clean (no FP16 cliff to worry about) | If we want extra correctness margin | +| FP32 GPU, max=5500 | ~33 s + lift | ~3.5 GB | Clean past 4096 (FP16 would degenerate) | If we want higher k *and* device has ≥12 GB | +| CPU FP32, max=4096 | ~85 s | ~2.9 GB | Clean | Fallback when GPU isn't available | + +The Step 5 narrative stands: **at maxNumTokens=4096, FP16 GPU is the right ship choice** (cleanest UX, no quality cost since we stay below the FP16 cliff at ~5000 total context); FP32 GPU is a real fallback option for either extra correctness margin or future higher-k use cases. + +--- + +## Reference: artifact fingerprint mapping + +Benchmark JSONs from commit `52e11e9` onward record an `artifact_fingerprint` field in `config.artifact_fingerprint` — the SHA-256 of the first 64 KB of the loaded `.litertlm` file. This uniquely identifies which artifact variant was loaded at benchmark time, **since both the FP32-tagged rebuild and the original FP16 default share the same filename** (`gemma-4-E4B-it.litertlm`). Without this, a reviewer reading a JSON cold cannot tell which precision condition the run used. 
+ +Cryptographically-verified mapping for the artifacts referenced in this investigation: + +| `artifact_fingerprint` | Artifact source | Activation precision used | Where it appears | +|---|---|---|---| +| **`9fdf9dd11f4e79507bc06df5ba0ddbf33eb12ed8524852d728dfec72d1aadabc`** | FP32-tagged Gemma 4 E4B `.litertlm`, rebuilt locally via `litert-lm-builder` with `additional_metadata = [{key="prefer_activation_type", value="float32", value_type="String"}]` injected on the `prefill_decode` section | **FP32** (runtime honors the override; verified via logcat `"activation_data_type: FLOAT32"`) | All FP32 GPU runs from Step 3 onward, including the full Step 5 sweep | +| **`cfa067b69af3ccd147c234be3058525774379c1349bacf0b3e85d7a26a42b868`** | Default `litert-community/gemma-4-E4B-it-litert-lm` from HuggingFace, as published | **FP16** (runtime falls back to the system default for Android GPU text decoder, per Step 4) | All FP16 GPU runs in this PR's instrumented sweep and the earlier sweeps in PR #57/#59 (when re-fingerprinted) | + +Both files are exactly **3,654,467,584 bytes** — the `litert-lm-builder` preserves section-byte-offsets when rewriting the header, so only the metadata bytes differ between the two variants. Identical model weights, identical tokenizer, identical retrieval embeddings. + +### Reproducing the mapping locally + +```bash +python3 -c " +import hashlib +for label, path in [ + ('FP16 default', 'device_push/models/gemma-4-E4B-it.litertlm'), + ('FP32 tagged', '/tmp/gemma-4-E4B-it-fp32.litertlm'), +]: + with open(path, 'rb') as f: + sha = hashlib.sha256(f.read(65536)).hexdigest() + print(f'{sha} {label}') +" +``` + +If you produce a new `.litertlm` variant (e.g. an INT16-activations rebuild, or a higher-context build from upstream), compute and add its fingerprint to this table. 
+ +--- + +## What's still open + +| Question | Status | Cost to answer | +|---|---|---| +| ~~Is the GPU cliff deterministic (same position every run) or stochastic?~~ | **Resolved (Step 2)** — bit-exactly deterministic across 3 reps | — | +| ~~Is the Kotlin API able to force FP32 on GPU?~~ | **Resolved (reachability check)** — no via Kotlin/JNI, but **yes via the .litertlm metadata override path** (option b) | — | +| ~~Is FP16 attention the root cause, or just one factor among others?~~ | **Resolved (Step 3, 2026-05-16)** — FP16 is the root cause. FP32 GPU produces clean output where FP16 GPU produced garbage on the same artifact, in the *cliff zone past 4096*. Direct A/B on the exact same `long_01` k=20 case where FP16 fails was prevented by GPU OOM (FP32 KV cache at max=8192 exceeds device RAM); strong indirect evidence via the closest-comparable test (FP32 on `long_01` k=15 max=5000, response ending at total context ~4514, clean). See Step 3 §"Important caveat" for the full evidence map. | — | +| ~~Does the artifact's `prefer_activation_type` field explicitly set FP16, or rely on the system default?~~ | **Resolved (peek)** — the published Gemma 4 artifacts do **not** set the field; the runtime falls back to its per-backend default (FP16 on Android GPU text decoder) | — | +| What is the tight memory ceiling for FP32 GPU on this device? | **Open** — bracketed 5000 ≤ ceiling < 8192; we'd bisect to find the exact value if FP32 GPU were a deployment candidate | ~30 min: 2–3 build/install/run cycles | +| Direct head-to-head: does FP32 GPU produce clean output on the exact `long_01` k=20 maxNumTokens=8192 case where FP16 fails? | **Open** — blocked by GPU OOM on this device (peak ~12 GB demand > ~10 GB available). Mechanism is established by indirect evidence (Step 3); confirming the direct A/B requires a higher-RAM device (24 GB Snapdragon 8 Elite variant). 
| ~10 min on a 24 GB device | +| ~~What is the FP32-GPU latency curve across k?~~ | **Resolved (Step 5)** — ~25% slower than FP16 GPU at k=15, dominated by 2–2.5× TTFT cost; decode essentially unchanged (bandwidth-bound) | — | +| Does the cliff position depend on prompt length, or is it fixed at total context ~5000? | **Open** — would help characterize the kernel boundary; no longer deployment-relevant given FP16 is confirmed as the cause | ~30 min if anyone wants the characterization | + +The deployment recommendation is unchanged (4096 stays the ship value with FP16 GPU). **FP32 GPU is now a real shipping option** for use cases that want extra correctness margin or higher k, at the cost of ~25% slower TTFT-driven latency and ~2× larger KV cache. + +--- + +## References + +- [latency_report_v2.md](latency_report_v2.md) §"Errors and the 4096-token context wall" — the high-level summary that points here +- [evaluation/latency_results/benchmark_20260516T100105_k20.json](../latency_results/benchmark_20260516T100105_k20.json) — CPU at maxNumTokens=8192, long_01, k=20 (clean output) +- [evaluation/latency_results/benchmark_20260516T103614_k20.json](../latency_results/benchmark_20260516T103614_k20.json) — CPU at maxNumTokens=8192, long_03, k=20 (clean output) +- [evaluation/latency_results/benchmark_20260516T104730_k20.json](../latency_results/benchmark_20260516T104730_k20.json) — GPU at maxNumTokens=8192, long_01, k=20, 1 rep (degenerate output — the one analyzed in Step 1) +- [evaluation/latency_results/benchmark_20260516T151036_k20.json](../latency_results/benchmark_20260516T151036_k20.json) — GPU at maxNumTokens=8192, long_01, k=20, 3 reps (bit-identical degenerate output — Step 2 reproducibility) +- [evaluation/latency_results/benchmark_20260516T162810_k15.json](../latency_results/benchmark_20260516T162810_k15.json) — **FP32 GPU** at maxNumTokens=5000, long_01, k=15 (clean output — Step 3 control test) +- **FP32 GPU sweep at maxNumTokens=4096 (Step 5)** — full 8-run 
sweep, 2026-05-16, pre-instrumentation: + - [benchmark_20260516T164144_k1.json](../latency_results/benchmark_20260516T164144_k1.json) — k=1 + - [benchmark_20260516T170710_k3.json](../latency_results/benchmark_20260516T170710_k3.json) — k=3 + - [benchmark_20260516T173631_k5.json](../latency_results/benchmark_20260516T173631_k5.json) — k=5 + - [benchmark_20260516T180851_k7.json](../latency_results/benchmark_20260516T180851_k7.json) — k=7 + - [benchmark_20260516T184348_k10.json](../latency_results/benchmark_20260516T184348_k10.json) — k=10 + - [benchmark_20260516T191934_k15.json](../latency_results/benchmark_20260516T191934_k15.json) — k=15 + - [benchmark_20260516T195750_k20.json](../latency_results/benchmark_20260516T195750_k20.json) — k=20 (24 errors / same 8 queries as FP16 baseline) + - [benchmark_20260516T202455.json](../latency_results/benchmark_20260516T202455.json) — No-RAG baseline +- **Step 6 instrumented sweeps (2026-05-17, all carry `git_commit_sha=52e11e9`)** — apples-to-apples FP32 vs FP16 with `artifact_fingerprint`-verified provenance: + - **FP32 GPU** (fingerprint `9fdf9dd1...`): + - [benchmark_20260516T222725_k1.json](../latency_results/benchmark_20260516T222725_k1.json) — k=1 + - [benchmark_20260517T064848_k3.json](../latency_results/benchmark_20260517T064848_k3.json) — k=3 + - [benchmark_20260517T071939_k5.json](../latency_results/benchmark_20260517T071939_k5.json) — k=5 + - [benchmark_20260517T075203_k7.json](../latency_results/benchmark_20260517T075203_k7.json) — k=7 + - [benchmark_20260517T082707_k10.json](../latency_results/benchmark_20260517T082707_k10.json) — k=10 + - [benchmark_20260517T090233_k15.json](../latency_results/benchmark_20260517T090233_k15.json) — k=15 + - [benchmark_20260517T094050_k20.json](../latency_results/benchmark_20260517T094050_k20.json) — k=20 + - [benchmark_20260517T100654.json](../latency_results/benchmark_20260517T100654.json) — No-RAG + - **FP16 GPU** (fingerprint `cfa067b6...`): + - 
[benchmark_20260517T103622_k1.json](../latency_results/benchmark_20260517T103622_k1.json) — k=1 + - [benchmark_20260517T105948_k3.json](../latency_results/benchmark_20260517T105948_k3.json) — k=3 + - [benchmark_20260517T112913_k5.json](../latency_results/benchmark_20260517T112913_k5.json) — k=5 + - [benchmark_20260517T115742_k7.json](../latency_results/benchmark_20260517T115742_k7.json) — k=7 + - [benchmark_20260517T122632_k10.json](../latency_results/benchmark_20260517T122632_k10.json) — k=10 + - [benchmark_20260517T125701_k15.json](../latency_results/benchmark_20260517T125701_k15.json) — k=15 + - [benchmark_20260517T132844_k20.json](../latency_results/benchmark_20260517T132844_k20.json) — k=20 + - [benchmark_20260517T135131.json](../latency_results/benchmark_20260517T135131.json) — No-RAG +- LiteRT-LM source: +- [app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt:buildEngine()](../../app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt) — where `maxNumTokens = 4096` is set diff --git a/evaluation/tests/test_config_schema.py b/evaluation/tests/test_config_schema.py index 7533b5f..359ed08 100644 --- a/evaluation/tests/test_config_schema.py +++ b/evaluation/tests/test_config_schema.py @@ -95,11 +95,26 @@ def test_runtime_parses_as_json(): def test_runtime_required_keys(): cfg = _runtime() + assert "engine" in cfg assert "generation" in cfg assert "retrieval" in cfg assert "context_injection" in cfg +def test_runtime_engine_max_num_tokens(): + # RagPipeline.kt reads this at startup via runtimeConfig.getJSONObject("engine") + # .getInt("max_num_tokens") and passes it to EngineConfig. Missing key or + # non-positive value would crash on first launch, which CI must catch here + # rather than on a test device. See evaluation/reports/maxnumtoken_investigation.md + # for why 4096 is the deployment-safe value (FP16 GPU cliff at total context ~5000). 
+ engine = _runtime()["engine"] + assert "max_num_tokens" in engine, "runtime_config.engine missing 'max_num_tokens'" + val = engine["max_num_tokens"] + assert isinstance(val, int) and val > 0, ( + f"engine.max_num_tokens must be a positive integer, got {val!r}" + ) + + def test_runtime_generation_params(): gen = _runtime()["generation"] assert 0.0 < gen["temperature"] <= 2.0