nmrenyi · nmrenyi · May 15, 2026 · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
@@ -4,6 +4,10 @@
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE"/>
     <!-- Required on Android 14+ for network-data foreground services -->
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE_DATA_SYNC"/>
+    <!-- The benchmark path (BenchmarkActivity -> BenchmarkForegroundService) holds a
+         PARTIAL_WAKE_LOCK so the CPU keeps running when the screen is off or locked.
+         Without this, multi-hour benchmarks stall silently when the device idles. Benchmark-only. -->
+    <uses-permission android:name="android.permission.WAKE_LOCK"/>
 
     <application
         android:label="MAM-AI"
@@ -45,6 +49,16 @@
             android:foregroundServiceType="dataSync"
             android:exported="false" />
 
+        <!-- Foreground service for the on-device latency benchmark. Holds a
+             PARTIAL_WAKE_LOCK + sticky notification so the work survives
+             screen-off and device-lock through hours-long k-sweeps. Runs
+             in its own :benchmark process to keep the main app isolated. -->
+        <service
+            android:name=".BenchmarkForegroundService"
+            android:foregroundServiceType="dataSync"
+            android:exported="false"
+            android:process=":benchmark" />
+
         <!-- FileProvider for sharing PDF files from getExternalFilesDir with viewer apps -->
         <provider
             android:name="androidx.core.content.FileProvider"

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
@@ -219,14 +219,20 @@ class RagPipeline(application: Application) {
         }
     }
 
-    /** Generates the response from the LLM with conversation history support. */
+    /** Generates the response from the LLM with conversation history support.
+     *
+     *  [retrieveKOverride] — when non-null, replaces `retrievalConfig.top_k`
+     *  for this call only. Used by [BenchmarkActivity] for the per-k latency
+     *  sweep; production callers leave it null and inherit the runtime config.
+     */
     suspend fun generateResponse(
         prompt: String,
         history: List<Map<String, String>>,
         useRetrieval: Boolean = true,
         language: String = "en",
         retrievalListener: (docs: List<RetrievedDoc>) -> Unit,
         generationListener: (partial: String, done: Boolean) -> Unit,
+        retrieveKOverride: Int? = null,
     ): String =
         coroutineScope {
             awaitLlmReady()
@@ -235,10 +241,11 @@ class RagPipeline(application: Application) {
             val qStart = System.currentTimeMillis()
 
             val docs = if (useRetrieval) {
+                val effectiveTopK = retrieveKOverride ?: retrievalConfig.getInt("top_k")
                 val retrievalRequest = RetrievalRequest.create(
                     prompt,
                     RetrievalConfig.create(
-                        retrievalConfig.getInt("top_k"),
+                        effectiveTopK,
                         retrievalConfig.getDouble("similarity_threshold").toFloat(),
                         TaskType.RETRIEVAL_QUERY,
                     ),

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
@@ -12,6 +12,7 @@
     python evaluation/benchmark_latency.py --filter long_01         # Single specific query
     python evaluation/benchmark_latency.py --no-retrieval           # Skip RAG retrieval
     python evaluation/benchmark_latency.py --cooldown 10000         # Longer cooldown (thermal)
+    python evaluation/benchmark_latency.py --retrieve-k 5           # Override retrieval top_k for this session
 """
 
 import argparse
@@ -68,9 +69,15 @@ def check_device(device_serial=None):
 
 
 def check_models_downloaded(device_serial=None):
-    """Check if model files exist on device."""
+    """Check if model files exist on device.
+
+    Filenames must match config/app_config.json — the app loads
+    "llm_model" / "embedding_model" / "tokenizer" from there. Updated
+    for the Gemma 4 E4B / LiteRT-LM 0.11.0 stack; the old Gemma 3n
+    .task name is no longer in production.
+    """
     required_files = [
-        "gemma-3n-E4B-it-int4.task",
+        "gemma-4-E4B-it.litertlm",
         "Gecko_1024_quant.tflite",
         "sentencepiece.model",
         "embeddings.sqlite",
@@ -103,7 +110,8 @@ def clear_logcat(device_serial=None):
 
 
 def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
-                     skip_retrieval=False, query_filter=None):
+                     skip_retrieval=False, rag_only=False,
+                     query_filter=None, retrieve_k=None):
     """Launch BenchmarkActivity via ADB."""
     cmd = _adb(device_serial) + [
         "shell", "am", "start",
@@ -113,8 +121,12 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
     ]
     if skip_retrieval:
         cmd += ["--ez", "skip_retrieval", "true"]
+    if rag_only:
+        cmd += ["--ez", "rag_only", "true"]
     if query_filter:
         cmd += ["--es", "query_filter", query_filter]
+    if retrieve_k is not None:
+        cmd += ["--ei", "retrieve_k", str(retrieve_k)]
 
     result = subprocess.run(cmd, capture_output=True, text=True)
     if "Error" in result.stderr:
@@ -458,8 +470,16 @@ def main():
                         help="Cooldown between queries in ms (default: 5000)")
     parser.add_argument("--no-retrieval", action="store_true",
                         help="Skip RAG retrieval (generation only)")
+    parser.add_argument("--rag-only", action="store_true",
+                        help="Skip the No-RAG mode (only run with retrieval). "
+                             "Pair with --retrieve-k to do a k-sweep without "
+                             "re-running the No-RAG baseline at every k.")
     parser.add_argument("--filter", type=str, default=None,
                         help="Filter by category (short/medium/long) or query ID (e.g., long_01)")
+    parser.add_argument("--retrieve-k", type=int, default=None,
+                        help="Override retrieval top_k for this session "
+                             "(default: use runtime_config.json's value, currently 3). "
+                             "Used for the per-k latency sweep.")
     parser.add_argument("--output-dir", type=str, default="evaluation/latency_results",
                         help="Directory for output files")
     parser.add_argument("--device", type=str, default=None,
@@ -494,13 +514,16 @@ def main():
         clear_logcat(args.device)
 
         # Launch benchmark
-        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}")
+        k_msg = f", retrieve_k={args.retrieve_k}" if args.retrieve_k is not None else ""
+        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}{k_msg}")
         launch_benchmark(
             device_serial=args.device,
             repeats=args.repeats,
             cooldown_ms=args.cooldown,
             skip_retrieval=args.no_retrieval,
+            rag_only=args.rag_only,
             query_filter=args.filter,
+            retrieve_k=args.retrieve_k,
         )
 
         # Wait for completion
@@ -509,8 +532,9 @@ def main():
             print("Benchmark did not complete successfully.")
             sys.exit(1)
 
-        # Pull results
-        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}.json")
+        # Pull results — include k in the filename so a sweep across k values is legible.
+        k_suffix = f"_k{args.retrieve_k}" if args.retrieve_k is not None else ""
+        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}{k_suffix}.json")
         pull_results(args.device, json_path)
 
     # Load and analyze

diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
@@ -0,0 +1,179 @@
+# MAM-AI On-Device Latency Sweep — GPU vs CPU
+
+_Generated: 2026-05-15T07:40:25_
+
+
+## Device & stack
+
+- **Device**: OnePlus OPD2413 (SM8750P) — Android 15
+- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
+- **LiteRT-LM**: 0.11.0
+- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU
+- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000
+
+## Methodology
+
+Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. 
+Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs 
+for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives 
+screen-off and device-lock; OPPO Hans whitelist set manually.
+
+- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.
+- `decode` is first-token to last-token.
+- `total_query` is everything: `retrieval + TTFT + decode`.
+- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`); the headline table splits medians by query category (short / medium / long).
+
+## Headline — Median total query latency (seconds)
+
+| k | doc_chars med | GPU short / medium / long | CPU short / medium / long | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 12.9 / 15.6 / 16.1 | 27.2 / 26.9 / 29.8 | 1.94× |
+| 1 | 561 | 13.1 / 12.6 / 17.3 | 29.3 / 31.9 / 30.3 | 2.14× |
+| 3 | 2098 | 18.6 / 18.6 / 21.0 | 37.3 / 44.5 / 42.5 | 2.24× |
+| 5 | 3547 | 18.2 / 20.0 / 21.4 | 54.8 / 60.7 / 63.0 | 3.07× |
+| 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× |
+| 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× |
+| 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× |
+| 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× |
+
+## TTFT (ms, median) — prefill cost grows with retrieved-doc content
+
+| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 962 | 12633 | 13.1× |
+| 1 | 561 | 954 | 12649 | 13.3× |
+| 3 | 2098 | 989 | 18356 | 18.6× |
+| 5 | 3547 | 1884 | 36424 | 19.3× |
+| 7 | 5139 | 1920 | 36444 | 19.0× |
+| 10 | 7482 | 2523 | 40013 | 15.9× |
+| 15 | 11297 | 3457 | 54748 | 15.8× |
+| 20 | 14520 | 3986 | 72881 | 18.3× |
+
+## Decode (ms, median) — first token to last token
+
+Decode time mostly tracks output length, not k or doc content. Variation across k reflects 
+the model writing *longer answers* when given more context (more material to draw on).
+
+| k | GPU decode | CPU decode | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 13470 | 15345 | 1.14× |
+| 1 | 11415 | 13961 | 1.22× |
+| 3 | 16364 | 19110 | 1.17× |
+| 5 | 15929 | 21645 | 1.36× |
+| 7 | 17215 | 23473 | 1.36× |
+| 10 | 18118 | 21699 | 1.20× |
+| 15 | 16820 | 22497 | 1.34× |
+| 20 | 14688 | 22634 | 1.54× |
+
+## p95 total query latency (s) — tail-latency view
+
+| k | GPU p95 | CPU p95 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 26.1 | 38.4 |
+| 1 | 26.1 | 37.1 |
+| 3 | 30.2 | 64.3 |
+| 5 | 30.7 | 74.6 |
+| 7 | 35.1 | 81.7 |
+| 10 | 29.0 | 84.5 |
+| 15 | 30.6 | 112.6 |
+| 20 | 35.3 | 104.9 |
+
+## Errors and the 4096-token context wall
+
+| k | GPU errors / 54 | CPU errors / 54 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 0 |
+| 1 | 0 | 0 |
+| 3 | 0 | 0 |
+| 5 | 0 | 0 |
+| 7 | 0 | 0 |
+| 10 | 0 | 0 |
+| 15 | 0 | 0 |
+| 20 | 24 | 24 |
+
+At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. 
+Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both 
+backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — 
+the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of 
+the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
+The 10 surviving queries (30 of 54 runs) on each backend were the ones whose retrieved chunks happened to be shorter.
+
+Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any 
+deployment budget at this depth even when the request fits in the context window.
+
+## Wall-clock comparison
+
+| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× |
+| 1 | 23.0 | 38.7 | 1.68× |
+| 3 | 27.3 | 50.2 | 1.84× |
+| 5 | 28.2 | 63.0 | 2.23× |
+| 7 | 30.0 | 66.5 | 2.22× |
+| 10 | 29.1 | 73.2 | 2.51× |
+| 15 | 32.4 | 90.8 | 2.80× |
+| 20 | 22.8 | 58.6 | 2.57× |
+
+## Key findings
+
+
+### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite
+GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. 
+That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), 
+so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.
+
+### 2. The model's 4096-token context window is the binding ceiling at high k
+k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — 
+the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. 
+Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives 
+the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, 
+not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact (an estimate — no k between 15 and 20 was benchmarked). 
+Latency is *not* the constraint at the upper end; the model's context window is.
+
+### 3. Latency is not the binding factor on GPU at k ≤ 15
+GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
+Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), 
+not by what fits in the latency budget.
+
+### 4. CPU at k≥5 runs up against any reasonable UX budget; at k=15 it's prohibitively slow
+CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. 
+p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't 
+available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, 
+or **k ≤ 1** if you want sub-40s p95.
+
+### 5. Decode time is content-driven, not k-driven
+Decode time tracks output length. As k grows, the model writes *longer* responses — likely because 
+more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. 
+Decode-time difference between GPU and CPU is only ~1.1–1.4× for k ≤ 15 (1.54× at k=20), since decode is memory-bandwidth-bound, 
+not compute-bound on this hardware.
+
+### 6. TTFT scales linearly with retrieved-doc content past k=3
+On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, 
+CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting 
+the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.
+
+## Data inventory (per `(backend, k)`)
+
+| Backend | k | File | Wall (min) | Runs | Errors |
+|---|---:|---|---:|---:|---:|
+| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 |
+| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 |
+| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 |
+| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 |
+| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
+| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
+| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
+| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 |
+| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
+| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
+| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |
+| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 |
+| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 |
+| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 |
+| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 |
+| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 |
+
+---
+
+_Source benchmark JSONs live in `evaluation/latency_results/`. 
+Aggregation script: `evaluation/aggregate_k_sweep.py`._