From fd85cd7ca742a08ee831ca3a6406ab701ea4ebbf Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 14:24:39 +0800
Subject: [PATCH 01/30] feat(benchmark): add --retrieve-k override for per-k
 latency sweep
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lets the latency benchmark vary retrieval top_k without rebuilding the APK
or editing runtime_config.json. One build + install up front, then sweep
k via the CLI flag — needed to bound k_max on the Snapdragon 8 Elite + GPU
stack now that TTFT (~1–2 s at k=3) is no longer the binding constraint.

Wiring:
- RagPipeline.generateResponse() gains an optional retrieveKOverride:
  Int? parameter (default null). When non-null it replaces
  retrievalConfig.top_k for that call only; production callers leave it
  null. Param added at the end of the list so RagStream's positional call
  is unaffected.
- BenchmarkActivity reads a "retrieve_k" Intent extra (-1 sentinel = no
  override), threads it through runBenchmark → runQuery →
  generateResponse, and records "retrieval_top_k_override" in the
  config block of the results JSON.
- benchmark_latency.py adds --retrieve-k N, forwards via
  am start --ei retrieve_k N, and appends "_kN" to the output filename
  so a sweep across k values is legible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../com/example/app/BenchmarkActivity.kt      | 23 +++++++++++++++----
 .../kotlin/com/example/app/RagPipeline.kt     | 11 +++++++--
 evaluation/benchmark_latency.py               | 18 +++++++++++----
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index bce61ea..bbe9fa9 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -32,6 +32,9 @@ import java.util.concurrent.Executors
  * Optional extras:
  *   --ez skip_retrieval true     Skip RAG retrieval (generation only)
  *   --es query_filter short      Filter by category or specific query ID
+ *   --ei retrieve_k N            Override retrieval top_k for this session
+ *                                (default: use runtime_config.json's value).
+ *                                Used by the per-k latency sweep.
  */
 class BenchmarkActivity : Activity() {
 
@@ -68,10 +71,12 @@ class BenchmarkActivity : Activity() {
         val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS)
         val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false)
         val queryFilter = intent.getStringExtra("query_filter")
+        // -1 sentinel = no override; any non-negative value overrides runtime_config's top_k.
+        val retrieveKOverride: Int? = intent.getIntExtra("retrieve_k", -1).takeIf { it >= 0 }
 
         scope.launch {
             try {
-                runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter)
+                runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter, retrieveKOverride)
             } catch (t: Throwable) {
                 Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t)
                 Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
@@ -96,6 +101,7 @@ class BenchmarkActivity : Activity() {
         cooldownMs: Long,
         skipRetrieval: Boolean,
         queryFilter: String?,
+        retrieveKOverride: Int?,
     ) {
         val benchmarkStart = System.currentTimeMillis()
         val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date())
@@ -205,7 +211,7 @@ class BenchmarkActivity : Activity() {
                     logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr")
 
                     val preMemory = collectMemoryInfo()
-                    val result = runQuery(pipeline, query.text, useRetrieval)
+                    val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride)
                     val postMemory = collectMemoryInfo()
 
                     val decodeTps = if (result.decodeMs > 0)
@@ -261,6 +267,9 @@ class BenchmarkActivity : Activity() {
                 put("cooldown_ms", cooldownMs)
                 put("skip_retrieval", skipRetrieval)
                 put("query_filter", queryFilter ?: JSONObject.NULL)
+                // retrieval_top_k_override is null when the session uses runtime_config.json's
+                // value; non-null records the override value used for this whole session.
+                put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
                 put("model", "gemma-4-E4B-it.litertlm")
                 put("backend", "CPU")
                 put("max_tokens", 32000)
@@ -302,7 +311,12 @@ class BenchmarkActivity : Activity() {
         val error: String?,
     )
 
-    private suspend fun runQuery(pipeline: RagPipeline, queryText: String, useRetrieval: Boolean): QueryResult {
+    private suspend fun runQuery(
+        pipeline: RagPipeline,
+        queryText: String,
+        useRetrieval: Boolean,
+        retrieveKOverride: Int?,
+    ): QueryResult {
         var retrievalTimeMs = 0L
         var numDocs = 0
         var firstTokenTime = 0L
@@ -328,7 +342,8 @@ class BenchmarkActivity : Activity() {
                         if (firstTokenTime == 0L && partial.isNotEmpty()) {
                             firstTokenTime = System.currentTimeMillis()
                         }
-                    }
+                    },
+                    retrieveKOverride = retrieveKOverride,
                 )
             }
         } catch (e: Exception) {
diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
index 19f582e..e13e391 100644
--- a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
@@ -219,7 +219,12 @@ class RagPipeline(application: Application) {
         }
     }
 
-    /** Generates the response from the LLM with conversation history support. */
+    /** Generates the response from the LLM with conversation history support.
+     *
+     *  [retrieveKOverride] — when non-null, replaces `retrievalConfig.top_k`
+     *  for this call only. Used by [BenchmarkActivity] for the per-k latency
+     *  sweep; production callers leave it null and inherit the runtime config.
+     */
     suspend fun generateResponse(
         prompt: String,
         history: List<Map<String, String>>,
@@ -227,6 +232,7 @@ class RagPipeline(application: Application) {
         language: String = "en",
         retrievalListener: (docs: List<RetrievedDoc>) -> Unit,
         generationListener: (partial: String, done: Boolean) -> Unit,
+        retrieveKOverride: Int? = null,
     ): String =
         coroutineScope {
             awaitLlmReady()
@@ -235,10 +241,11 @@ class RagPipeline(application: Application) {
             val qStart = System.currentTimeMillis()
 
             val docs = if (useRetrieval) {
+                val effectiveTopK = retrieveKOverride ?: retrievalConfig.getInt("top_k")
                 val retrievalRequest = RetrievalRequest.create(
                     prompt,
                     RetrievalConfig.create(
-                        retrievalConfig.getInt("top_k"),
+                        effectiveTopK,
                         retrievalConfig.getDouble("similarity_threshold").toFloat(),
                         TaskType.RETRIEVAL_QUERY,
                     ),
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index 5611e21..77f177d 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -12,6 +12,7 @@
     python evaluation/benchmark_latency.py --filter long_01         # Single specific query
     python evaluation/benchmark_latency.py --no-retrieval           # Skip RAG retrieval
     python evaluation/benchmark_latency.py --cooldown 10000         # Longer cooldown (thermal)
+    python evaluation/benchmark_latency.py --retrieve-k 5           # Override retrieval top_k for this session
 """
 
 import argparse
@@ -103,7 +104,7 @@ def clear_logcat(device_serial=None):
 
 
 def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
-                     skip_retrieval=False, query_filter=None):
+                     skip_retrieval=False, query_filter=None, retrieve_k=None):
     """Launch BenchmarkActivity via ADB."""
     cmd = _adb(device_serial) + [
         "shell", "am", "start",
@@ -115,6 +116,8 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
         cmd += ["--ez", "skip_retrieval", "true"]
     if query_filter:
         cmd += ["--es", "query_filter", query_filter]
+    if retrieve_k is not None:
+        cmd += ["--ei", "retrieve_k", str(retrieve_k)]
 
     result = subprocess.run(cmd, capture_output=True, text=True)
     if "Error" in result.stderr:
@@ -460,6 +463,10 @@ def main():
                         help="Skip RAG retrieval (generation only)")
     parser.add_argument("--filter", type=str, default=None,
                         help="Filter by category (short/medium/long) or query ID (e.g., long_01)")
+    parser.add_argument("--retrieve-k", type=int, default=None,
+                        help="Override retrieval top_k for this session "
+                             "(default: use runtime_config.json's value, currently 3). "
+                             "Used for the per-k latency sweep.")
     parser.add_argument("--output-dir", type=str, default="evaluation/latency_results",
                         help="Directory for output files")
     parser.add_argument("--device", type=str, default=None,
@@ -494,13 +501,15 @@ def main():
         clear_logcat(args.device)
 
         # Launch benchmark
-        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}")
+        k_msg = f", retrieve_k={args.retrieve_k}" if args.retrieve_k is not None else ""
+        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}{k_msg}")
         launch_benchmark(
             device_serial=args.device,
             repeats=args.repeats,
             cooldown_ms=args.cooldown,
             skip_retrieval=args.no_retrieval,
             query_filter=args.filter,
+            retrieve_k=args.retrieve_k,
         )
 
         # Wait for completion
@@ -509,8 +518,9 @@ def main():
             print("Benchmark did not complete successfully.")
             sys.exit(1)
 
-        # Pull results
-        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}.json")
+        # Pull results — include k in the filename so a sweep across k values is legible.
+        k_suffix = f"_k{args.retrieve_k}" if args.retrieve_k is not None else ""
+        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}{k_suffix}.json")
         pull_results(args.device, json_path)
 
     # Load and analyze

From 8848beef46f35b6ffbd52327aa1bd84bd629ca03 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 14:34:31 +0800
Subject: [PATCH 02/30] feat(benchmark): capture retrieved chunks + response
 text per run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the benchmark recorded only counts/lengths (num_retrieved_docs,
response_length_chars). For the per-k latency sweep we want to see what
the retriever actually surfaced at each k and what the model generated —
both for content review and because the total chunk-text length is what
drives prefill cost as k grows.

New per-run fields in the results JSON:
- retrieved_chunks: array of {text, source, page, chars} for every chunk
  the retriever returned. Lets us inspect what changed as k grew.
- retrieved_total_chars: sum of chunk text lengths. The real
  prompt-length proxy (vs. query_word_count which is static).
- response_text: full model response (the generation listener was
  already accumulating it; we now record the final string).

Size: at k=20 with ~1500-3000 chars/chunk + ~3KB response, per-run
overhead is ~30-60KB. A full 108-run benchmark file grows from ~50KB to
~4-6MB. Acceptable; can add a --no-content opt-out later if needed.

Note: Gemma 4 E4B doesn't emit a separate reasoning channel — any inline
reasoning the model writes shows up in response_text.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../com/example/app/BenchmarkActivity.kt      | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index bbe9fa9..af6c0a8 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -235,6 +235,21 @@ class BenchmarkActivity : Activity() {
                         put("estimated_tokens", result.estimatedTokens)
                         put("decode_throughput_tps", decodeTps)
                         put("num_retrieved_docs", result.numRetrievedDocs)
+                        // Full content for downstream analysis: chunks the retriever surfaced,
+                        // total chunk-text length (drives prefill cost), and the model's
+                        // generated response.
+                        put("retrieved_chunks", JSONArray().apply {
+                            result.retrievedChunks.forEach { doc ->
+                                put(JSONObject().apply {
+                                    put("text", doc.text)
+                                    put("source", doc.source)
+                                    put("page", doc.page)
+                                    put("chars", doc.text.length)
+                                })
+                            }
+                        })
+                        put("retrieved_total_chars", result.retrievedTotalChars)
+                        put("response_text", result.responseText)
                         put("error", result.error ?: JSONObject.NULL)
                         put("heap_before_mb", preMemory.getInt("used_mb"))
                         put("heap_after_mb", postMemory.getInt("used_mb"))
@@ -308,6 +323,9 @@ class BenchmarkActivity : Activity() {
         val responseChars: Int,
         val estimatedTokens: Int,
         val numRetrievedDocs: Int,
+        val retrievedChunks: List<RetrievedDoc>,
+        val retrievedTotalChars: Int,
+        val responseText: String,
         val error: String?,
     )
 
@@ -322,6 +340,7 @@ class BenchmarkActivity : Activity() {
         var firstTokenTime = 0L
         var error: String? = null
         val responseBuilder = StringBuilder()
+        var retrievedChunks: List<RetrievedDoc> = emptyList()
 
         val qStart = System.currentTimeMillis()
         var retrievalDoneTime = 0L
@@ -336,6 +355,7 @@ class BenchmarkActivity : Activity() {
                         retrievalDoneTime = System.currentTimeMillis()
                         retrievalTimeMs = retrievalDoneTime - qStart
                         numDocs = docs.size
+                        retrievedChunks = docs
                     },
                     generationListener = { partial, _ ->
                         responseBuilder.append(partial)
@@ -372,6 +392,9 @@ class BenchmarkActivity : Activity() {
             responseChars = responseChars,
             estimatedTokens = estimatedTokens,
             numRetrievedDocs = numDocs,
+            retrievedChunks = retrievedChunks,
+            retrievedTotalChars = retrievedChunks.sumOf { it.text.length },
+            responseText = responseBuilder.toString(),
             error = error,
         )
     }

From 33604df7305ace807ca405f88dea970f3e2e8014 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 14:52:55 +0800
Subject: [PATCH 03/30] fix(benchmark): update model-file check to match
 production stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

check_models_downloaded() was looking for gemma-3n-E4B-it-int4.task —
left over from the pre-Gemma-4 era. config/app_config.json now declares
"llm_model": "gemma-4-E4B-it.litertlm", so the script's pre-flight check
falsely failed even when the right model was on device.

Caught during the smoke test for the --retrieve-k override feature: the
script aborted before launching BenchmarkActivity.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/benchmark_latency.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index 77f177d..4c2d152 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -69,9 +69,15 @@ def check_device(device_serial=None):
 
 
 def check_models_downloaded(device_serial=None):
-    """Check if model files exist on device."""
+    """Check if model files exist on device.
+
+    Filenames must match config/app_config.json — the app loads
+    "llm_model" / "embedding_model" / "tokenizer" from there. Updated
+    for the Gemma 4 E4B / LiteRT-LM 0.11.0 stack; the old Gemma 3n
+    .task name is no longer in production.
+    """
     required_files = [
-        "gemma-3n-E4B-it-int4.task",
+        "gemma-4-E4B-it.litertlm",
         "Gecko_1024_quant.tflite",
         "sentencepiece.model",
         "embeddings.sqlite",

From 197a7bc439f0fd413e5452030dcc8b377335dde4 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 15:06:11 +0800
Subject: [PATCH 04/30] feat(benchmark): add --rag-only flag to skip No-RAG
 mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For the k-sweep, the No-RAG baseline doesn't change with k (retrieval is
disabled, so the override is ignored). Without this flag, running the
sweep at 7 k-values would re-run the identical 54 No-RAG measurements
seven times — ~1.5 hours of redundant work.

The Intent extra "rag_only" (bool, default false) tells BenchmarkActivity
to run only the RAG mode. It is intended as an alternative to
skip_retrieval, not a companion: if both are set, skip_retrieval wins.
Python --rag-only forwards via am start --ez.

Recorded in the session config JSON as "rag_only" so reruns are
unambiguous.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/com/example/app/BenchmarkActivity.kt   | 15 +++++++++++++--
 evaluation/benchmark_latency.py                   | 10 +++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index af6c0a8..a805e7e 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -31,6 +31,9 @@ import java.util.concurrent.Executors
  *
  * Optional extras:
  *   --ez skip_retrieval true     Skip RAG retrieval (generation only)
+ *   --ez rag_only true           Skip the No-RAG mode (useful for k-sweeps
+ *                                where the No-RAG baseline only needs to
+ *                                be captured once)
  *   --es query_filter short      Filter by category or specific query ID
  *   --ei retrieve_k N            Override retrieval top_k for this session
  *                                (default: use runtime_config.json's value).
@@ -70,13 +73,14 @@ class BenchmarkActivity : Activity() {
         val repeats = intent.getIntExtra("repeats", DEFAULT_REPEATS)
         val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS)
         val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false)
+        val ragOnly = intent.getBooleanExtra("rag_only", false)
         val queryFilter = intent.getStringExtra("query_filter")
         // -1 sentinel = no override; any non-negative value overrides runtime_config's top_k.
         val retrieveKOverride: Int? = intent.getIntExtra("retrieve_k", -1).takeIf { it >= 0 }
 
         scope.launch {
             try {
-                runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter, retrieveKOverride)
+                runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride)
             } catch (t: Throwable) {
                 Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t)
                 Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
@@ -100,6 +104,7 @@ class BenchmarkActivity : Activity() {
         repeats: Int,
         cooldownMs: Long,
         skipRetrieval: Boolean,
+        ragOnly: Boolean,
         queryFilter: String?,
         retrieveKOverride: Int?,
     ) {
@@ -183,7 +188,12 @@ class BenchmarkActivity : Activity() {
             return
         }
 
-        val retrievalModes = if (skipRetrieval) listOf(false) else listOf(true, false)
+        // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins if both set).
+        val retrievalModes = when {
+            skipRetrieval -> listOf(false)
+            ragOnly -> listOf(true)
+            else -> listOf(true, false)
+        }
         val totalRuns = queries.size * retrievalModes.size * repeats
         Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs")
 
@@ -281,6 +291,7 @@ class BenchmarkActivity : Activity() {
                 put("repeats", repeats)
                 put("cooldown_ms", cooldownMs)
                 put("skip_retrieval", skipRetrieval)
+                put("rag_only", ragOnly)
                 put("query_filter", queryFilter ?: JSONObject.NULL)
                 // retrieval_top_k_override is null when the session uses runtime_config.json's
                 // value; non-null records the override value used for this whole session.
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index 4c2d152..30dec4c 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -110,7 +110,8 @@ def clear_logcat(device_serial=None):
 
 
 def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
-                     skip_retrieval=False, query_filter=None, retrieve_k=None):
+                     skip_retrieval=False, rag_only=False,
+                     query_filter=None, retrieve_k=None):
     """Launch BenchmarkActivity via ADB."""
     cmd = _adb(device_serial) + [
         "shell", "am", "start",
@@ -120,6 +121,8 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
     ]
     if skip_retrieval:
         cmd += ["--ez", "skip_retrieval", "true"]
+    if rag_only:
+        cmd += ["--ez", "rag_only", "true"]
     if query_filter:
         cmd += ["--es", "query_filter", query_filter]
     if retrieve_k is not None:
@@ -467,6 +470,10 @@ def main():
                         help="Cooldown between queries in ms (default: 5000)")
     parser.add_argument("--no-retrieval", action="store_true",
                         help="Skip RAG retrieval (generation only)")
+    parser.add_argument("--rag-only", action="store_true",
+                        help="Skip the No-RAG mode (only run with retrieval). "
+                             "Pair with --retrieve-k to do a k-sweep without "
+                             "re-running the No-RAG baseline at every k.")
     parser.add_argument("--filter", type=str, default=None,
                         help="Filter by category (short/medium/long) or query ID (e.g., long_01)")
     parser.add_argument("--retrieve-k", type=int, default=None,
@@ -514,6 +521,7 @@ def main():
             repeats=args.repeats,
             cooldown_ms=args.cooldown,
             skip_retrieval=args.no_retrieval,
+            rag_only=args.rag_only,
             query_filter=args.filter,
             retrieve_k=args.retrieve_k,
         )

From 795ac8492869bec4fc790a15551473ac5ca0b4f3 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 15:39:39 +0800
Subject: [PATCH 05/30] fix(benchmark): use suspending delay() instead of
 Thread.sleep()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thread.sleep(cooldownMs) was running on the UI thread (BenchmarkActivity
is an Activity; the calling coroutine was scope.launch(Dispatchers.Main)
by default). With cooldown ≥ 5000 ms, Android's input-dispatching timeout
fires and the activity gets killed mid-sweep — exactly what happened on
the OPPO Snapdragon 8 Elite at cooldown=10000:

  AnrInfo{reason='Input Dispatching Timeout',
    stackTrace='at java.lang.Thread.sleep(...)
                at BenchmarkActivity.runBenchmark(BenchmarkActivity.kt:279)'}

delay() is a suspending function that doesn't block the underlying
thread, so the UI stays responsive while the benchmark waits. Both
cooldown call sites (post-init and between-runs) are inside the same
suspend coroutine, so this is a drop-in replacement.

The existing latency_report.md used cooldown=10000 on Pixel 7 without
issue, which suggests either that the older Tensor G2 device enforces
ANRs less strictly, or that the kill on the OPPO is driven by its Athena
memory-kill behavior — either way, the correct-by-construction fix is to
never block the UI thread.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../main/kotlin/com/example/app/BenchmarkActivity.kt  | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index a805e7e..ed16231 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -11,6 +11,7 @@ import kotlinx.coroutines.CoroutineScope
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.SupervisorJob
 import kotlinx.coroutines.asCoroutineDispatcher
+import kotlinx.coroutines.delay
 import kotlinx.coroutines.launch
 import kotlinx.coroutines.withContext
 import org.json.JSONArray
@@ -173,7 +174,7 @@ class BenchmarkActivity : Activity() {
         // Step 4: Cooldown before timed runs
         logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
         logStatus("Cooldown ${cooldownMs}ms...")
-        Thread.sleep(cooldownMs)
+        delay(cooldownMs)
 
         // Filter queries
         val queries = if (queryFilter != null) {
@@ -274,9 +275,13 @@ class BenchmarkActivity : Activity() {
                     val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000
                     logStatus("  [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)")
 
-                    // Cooldown between queries (skip after last run)
+                    // Cooldown between queries (skip after last run).
+                    // delay() vs Thread.sleep(): the suspending variant doesn't block the
+                    // UI thread, which is essential — cooldowns >5s with Thread.sleep
+                    // trigger an ANR (Input Dispatching Timeout) and Android kills the
+                    // activity mid-benchmark.
                     if (runIndex < totalRuns) {
-                        Thread.sleep(cooldownMs)
+                        delay(cooldownMs)
                     }
                 }
             }

From 12fd358b08fc49bfc37b726b261bb18fa1493423 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 16:15:27 +0800
Subject: [PATCH 06/30] fix(benchmark): keep CPU alive through screen-off
 (PARTIAL_WAKE_LOCK + Default dispatcher)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The benchmark coroutine was running on Dispatchers.Main, so as soon as the
device screen went off (~10 min on OPPO) the activity backgrounded and the
coroutine's delay() never resumed. The process stayed alive but stopped
making progress — the Python wrapper hung waiting for [BENCHMARK] COMPLETE
that would never come. Observed on the OPPO Snapdragon 8 Elite mid-sweep:
~24 min between the last [BENCHMARK] log line and the most recent Athena
heartbeat, with mWakefulness=Asleep.

Two changes:

1. Acquire a PARTIAL_WAKE_LOCK in onCreate (released in onDestroy). Keeps
   the CPU running even when the screen is off; the screen itself can
   still sleep. 6-hour failsafe timeout. Required permission added to
   AndroidManifest.xml — used only by BenchmarkActivity.

2. Switch the coroutine scope from Dispatchers.Main to Dispatchers.Default.
   The benchmark logic doesn't touch the UI directly (logStatus already
   marshals back to Main via runOnUiThread), so there's no reason to run
   on Main — doing so just makes the work pause whenever the activity
   loses focus. Default keeps running in any lifecycle state.

These together let the sweep run while the device is screen-off or
locked. Screen lifespan and battery thank you.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/android/app/src/main/AndroidManifest.xml  |  4 +++
 .../com/example/app/BenchmarkActivity.kt      | 33 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
index a44e8c2..4c65729 100644
--- a/app/android/app/src/main/AndroidManifest.xml
+++ b/app/android/app/src/main/AndroidManifest.xml
@@ -4,6 +4,10 @@
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE"/>
     <!-- Required on Android 14+ for network-data foreground services -->
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE_DATA_SYNC"/>
+    <!-- BenchmarkActivity acquires a PARTIAL_WAKE_LOCK so the CPU keeps running
+         when the screen is off or locked. Without this, multi-hour benchmarks
+         stall silently when the device idles. Used only by BenchmarkActivity. -->
+    <uses-permission android:name="android.permission.WAKE_LOCK"/>
 
     <application
         android:label="MAM-AI"
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index ed16231..e23dd6c 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -1,8 +1,10 @@
 package com.example.app
 
 import android.app.Activity
+import android.content.Context
 import android.os.Build
 import android.os.Bundle
+import android.os.PowerManager
 import android.util.Log
 import android.widget.LinearLayout
 import android.widget.ScrollView
@@ -50,14 +52,32 @@ class BenchmarkActivity : Activity() {
         private const val CHARS_PER_TOKEN_ESTIMATE = 4.0
     }
 
-    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Main)
+    // Default dispatcher so the benchmark coroutine isn't tied to the UI thread —
+    // if it were on Dispatchers.Main, the entire run would stall as soon as the
+    // screen sleeps or the activity loses focus.
+    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
     private val executor = Executors.newSingleThreadExecutor()
     private lateinit var logView: TextView
     private lateinit var scrollView: ScrollView
+    private var wakeLock: PowerManager.WakeLock? = null
 
     override fun onCreate(savedInstanceState: Bundle?) {
         super.onCreate(savedInstanceState)
 
+        // Acquire a PARTIAL_WAKE_LOCK so the CPU keeps running even if the
+        // screen turns off or the device is locked. Released in onDestroy.
+        // Without this, multi-hour benchmarks stall silently when the device
+        // idles (OPPO and other vendors aggressively pause background work).
+        val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager
+        wakeLock = powerManager.newWakeLock(
+            PowerManager.PARTIAL_WAKE_LOCK,
+            "mam-ai:benchmark"
+        ).apply {
+            setReferenceCounted(false)
+            acquire(6L * 60L * 60L * 1000L)  // 6 h max — failsafe upper bound
+        }
+        Log.w(BENCH_TAG, "[BENCHMARK] Acquired PARTIAL_WAKE_LOCK (CPU stays on through screen-off)")
+
         // Scrollable log console UI
         scrollView = ScrollView(this).apply {
             setBackgroundColor(0xFF000000.toInt())
@@ -92,6 +112,17 @@ class BenchmarkActivity : Activity() {
         }
     }
 
+    override fun onDestroy() {
+        super.onDestroy()
+        wakeLock?.let {
+            if (it.isHeld) {
+                it.release()
+                Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK")
+            }
+        }
+        wakeLock = null
+    }
+
     private fun logStatus(text: String) {
         runOnUiThread {
             logView.append(text + "\n")

From 7ac3b36e405e7015142774d47646d4c54818f619 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 17:44:52 +0800
Subject: [PATCH 07/30] refactor(benchmark): move benchmark to a foreground
 service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OPPO ColorOS aggressively freezes background activities and even
force-releases PARTIAL_WAKE_LOCKs held by plain Activities (visible as
'add wakelock … to ForceReleaseWakeLock list' in OplusProxyWakeLock).
Foreground services with a sticky notification are respected — once the
app is also whitelisted in Settings → Battery → App Battery Management
("Allow background activity"), the benchmark runs cleanly with the
screen off and the device locked.

Architecture:
- BenchmarkForegroundService (NEW) — holds the wake lock, posts a
  sticky progress notification, and runs the entire benchmark loop.
  Uses Dispatchers.Default so it isn't tied to a UI thread. Stops
  itself when done; the OS reclaims the process.
- BenchmarkActivity — reduced from ~470 lines to ~60. Now a thin
  launcher: receives `am start` Intent extras, forwards them to the
  service via startForegroundService(), and finishes immediately.
  Existing Python wrapper (benchmark_latency.py) is unchanged — it
  still launches the Activity and reads progress from logcat.
- AndroidManifest — registers the new service with
  foregroundServiceType="dataSync" (reuses the existing
  FOREGROUND_SERVICE_DATA_SYNC permission) and android:process=
  ":benchmark" (same isolated process as BenchmarkActivity).

Verified end-to-end on OPPO Snapdragon 8 Elite (OPD2413): launched the
benchmark, locked the screen 30 s in, and watched a full medium_01 RAG
run complete with zero Athena freeze events (`hans_freeze=0`) and the
screen asleep through the entire decode phase. TTFT 1025 ms, total
13.8 s — within the same envelope as foreground runs.

Pre-flight on a new device:
  Settings → Battery → App Battery Management → MAM-AI
                                              → "Allow background activity"

Without that, OPPO's OplusHansManager freezes the process at the OS
level regardless of foreground-service status.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/android/app/src/main/AndroidManifest.xml  |  10 +
 .../com/example/app/BenchmarkActivity.kt      | 476 ++---------------
 .../example/app/BenchmarkForegroundService.kt | 487 ++++++++++++++++++
 3 files changed, 530 insertions(+), 443 deletions(-)
 create mode 100644 app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt

diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
index 4c65729..bb35442 100644
--- a/app/android/app/src/main/AndroidManifest.xml
+++ b/app/android/app/src/main/AndroidManifest.xml
@@ -49,6 +49,16 @@
             android:foregroundServiceType="dataSync"
             android:exported="false" />
 
+        <!-- Foreground service for the on-device latency benchmark. Holds a
+             PARTIAL_WAKE_LOCK + sticky notification so the work survives
+             screen-off and device-lock through hours-long k-sweeps. Runs
+             in its own :benchmark process to keep the main app isolated. -->
+        <service
+            android:name=".BenchmarkForegroundService"
+            android:foregroundServiceType="dataSync"
+            android:exported="false"
+            android:process=":benchmark" />
+
         <!-- FileProvider for sharing PDF files from getExternalFilesDir with viewer apps -->
         <provider
             android:name="androidx.core.content.FileProvider"
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index e23dd6c..887f841 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -1,474 +1,64 @@
 package com.example.app
 
 import android.app.Activity
-import android.content.Context
+import android.content.Intent
 import android.os.Build
 import android.os.Bundle
-import android.os.PowerManager
 import android.util.Log
-import android.widget.LinearLayout
-import android.widget.ScrollView
-import android.widget.TextView
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.SupervisorJob
-import kotlinx.coroutines.asCoroutineDispatcher
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.launch
-import kotlinx.coroutines.withContext
-import org.json.JSONArray
-import org.json.JSONObject
-import java.io.File
-import java.text.SimpleDateFormat
-import java.util.Date
-import java.util.Locale
-import java.util.concurrent.Executors
 
 /**
- * Benchmark activity that runs predefined queries through [RagPipeline]
- * and writes structured timing results to a JSON file on device storage.
+ * Thin launcher for [BenchmarkForegroundService].
+ *
+ * All benchmark logic lives in the service so it survives screen-off and
+ * device-lock — vendor power managers (OPPO, Xiaomi, Huawei) will idle a
+ * plain Activity but respect a foreground service with a sticky
+ * notification.
+ *
+ * Launch via ADB exactly as before — the Activity forwards all extras
+ * straight to the service, then finishes immediately:
  *
- * Launch via ADB:
  *   adb shell am start -n com.example.app/.BenchmarkActivity \
  *       --ei repeats 3 --el cooldown_ms 5000
  *
  * Optional extras:
  *   --ez skip_retrieval true     Skip RAG retrieval (generation only)
- *   --ez rag_only true           Skip the No-RAG mode (useful for k-sweeps
- *                                where the No-RAG baseline only needs to
- *                                be captured once)
+ *   --ez rag_only true           Skip the No-RAG mode (k-sweep helper)
  *   --es query_filter short      Filter by category or specific query ID
  *   --ei retrieve_k N            Override retrieval top_k for this session
  *                                (default: use runtime_config.json's value).
- *                                Used by the per-k latency sweep.
  */
 class BenchmarkActivity : Activity() {
 
     companion object {
-        private const val TAG = "mam-ai"
         private const val BENCH_TAG = "mam-ai-bench"
-        private const val DEFAULT_COOLDOWN_MS = 5_000L
-        private const val DEFAULT_REPEATS = 3
-        private const val CHARS_PER_TOKEN_ESTIMATE = 4.0
     }
 
-    // Default dispatcher so the benchmark coroutine isn't tied to the UI thread —
-    // if it were on Dispatchers.Main, the entire run would stall as soon as the
-    // screen sleeps or the activity loses focus.
-    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
-    private val executor = Executors.newSingleThreadExecutor()
-    private lateinit var logView: TextView
-    private lateinit var scrollView: ScrollView
-    private var wakeLock: PowerManager.WakeLock? = null
-
     override fun onCreate(savedInstanceState: Bundle?) {
         super.onCreate(savedInstanceState)
 
-        // Acquire a PARTIAL_WAKE_LOCK so the CPU keeps running even if the
-        // screen turns off or the device is locked. Released in onDestroy.
-        // Without this, multi-hour benchmarks stall silently when the device
-        // idles (OPPO and other vendors aggressively pause background work).
-        val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager
-        wakeLock = powerManager.newWakeLock(
-            PowerManager.PARTIAL_WAKE_LOCK,
-            "mam-ai:benchmark"
-        ).apply {
-            setReferenceCounted(false)
-            acquire(6L * 60L * 60L * 1000L)  // 6 h max — failsafe upper bound
-        }
-        Log.w(BENCH_TAG, "[BENCHMARK] Acquired PARTIAL_WAKE_LOCK (CPU stays on through screen-off)")
-
-        // Scrollable log console UI
-        scrollView = ScrollView(this).apply {
-            setBackgroundColor(0xFF000000.toInt())
-        }
-        logView = TextView(this).apply {
-            setTextColor(0xFF00FF00.toInt())
-            textSize = 13f
-            setPadding(32, 48, 32, 48)
-            text = "=== MAM-AI Benchmark ===\n"
-        }
-        scrollView.addView(logView)
-        setContentView(scrollView)
-
-        val repeats = intent.getIntExtra("repeats", DEFAULT_REPEATS)
-        val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS)
-        val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false)
-        val ragOnly = intent.getBooleanExtra("rag_only", false)
-        val queryFilter = intent.getStringExtra("query_filter")
-        // -1 sentinel = no override; any non-negative value overrides runtime_config's top_k.
-        val retrieveKOverride: Int? = intent.getIntExtra("retrieve_k", -1).takeIf { it >= 0 }
-
-        scope.launch {
-            try {
-                runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride)
-            } catch (t: Throwable) {
-                Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t)
-                Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
-                logStatus("FAILED: ${t.message}")
-            } finally {
-                finish()
-            }
-        }
-    }
-
-    override fun onDestroy() {
-        super.onDestroy()
-        wakeLock?.let {
-            if (it.isHeld) {
-                it.release()
-                Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK")
-            }
-        }
-        wakeLock = null
-    }
-
-    private fun logStatus(text: String) {
-        runOnUiThread {
-            logView.append(text + "\n")
-            scrollView.post { scrollView.fullScroll(ScrollView.FOCUS_DOWN) }
-        }
-    }
-
-    // ── Main benchmark loop ──────────────────────────────────────────────
-
-    private suspend fun runBenchmark(
-        repeats: Int,
-        cooldownMs: Long,
-        skipRetrieval: Boolean,
-        ragOnly: Boolean,
-        queryFilter: String?,
-        retrieveKOverride: Int?,
-    ) {
-        val benchmarkStart = System.currentTimeMillis()
-        val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date())
-
-        Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter")
-
-        // Device info
-        val deviceInfo = collectDeviceInfo()
-        Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})")
-
-        // Step 1: Gecko + SQLite init (synchronous part of RagPipeline constructor)
-        logStatus("Step 1/4: Initializing Gecko embedder + SQLite...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...")
-        val initStart = System.currentTimeMillis()
-        val pipeline = withContext(executor.asCoroutineDispatcher()) {
-            RagPipeline(application)
-        }
-        val syncInitMs = System.currentTimeMillis() - initStart
-        Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms")
-        logStatus("Step 1/4: Gecko + SQLite done (${syncInitMs}ms)")
-
-        // Step 2: Wait for LLM model load (async, started by RagPipeline constructor)
-        logStatus("Step 2/4: Loading Gemma 4 LLM model...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...")
-        val llmWaitStart = System.currentTimeMillis()
-        withContext(executor.asCoroutineDispatcher()) {
-            pipeline.awaitLlmReady()
-        }
-        val llmInitMs = System.currentTimeMillis() - llmWaitStart
-        Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)")
-        logStatus("Step 2/4: LLM loaded (${llmInitMs}ms)")
-
-        // Step 3: 5 warmup queries of varying length — warms JIT / LiteRT-LM / shader caches
-        val warmupQueries = listOf(
-            "Normal fetal heart rate",
-            "Signs of infection after delivery",
-            "A mother has heavy bleeding after birth. What should I do first?",
-            "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?",
-            "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?",
-        )
-        logStatus("Step 3/4: Running ${warmupQueries.size} warmup queries...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...")
-        val warmupStart = System.currentTimeMillis()
-        warmupQueries.forEachIndexed { i, prompt ->
-            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"")
-            withContext(executor.asCoroutineDispatcher()) {
-                pipeline.generateResponse(
-                    prompt = prompt,
-                    history = emptyList(),
-                    useRetrieval = false,
-                    retrievalListener = {},
-                    generationListener = { _, _ -> }
-                )
-            }
-            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)")
-        }
-        val warmupMs = System.currentTimeMillis() - warmupStart
-        val totalInitMs = System.currentTimeMillis() - initStart
-        Log.w(BENCH_TAG, "[BENCHMARK] Warmup complete: ${warmupMs}ms total (${warmupQueries.size} queries)")
-        Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
-
-        val postInitMemory = collectMemoryInfo()
-
-        // Step 4: Cooldown before timed runs
-        logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
-        logStatus("Cooldown ${cooldownMs}ms...")
-        delay(cooldownMs)
-
-        // Filter queries
-        val queries = if (queryFilter != null) {
-            BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter }
+        val serviceIntent = Intent(this, BenchmarkForegroundService::class.java).apply {
+            // Forward every extra the user might have passed via `am start`.
+            // Defaults are resolved inside the service.
+            if (intent.hasExtra("repeats"))
+                putExtra("repeats", intent.getIntExtra("repeats", 3))
+            if (intent.hasExtra("cooldown_ms"))
+                putExtra("cooldown_ms", intent.getLongExtra("cooldown_ms", 5000L))
+            if (intent.hasExtra("skip_retrieval"))
+                putExtra("skip_retrieval", intent.getBooleanExtra("skip_retrieval", false))
+            if (intent.hasExtra("rag_only"))
+                putExtra("rag_only", intent.getBooleanExtra("rag_only", false))
+            if (intent.hasExtra("query_filter"))
+                putExtra("query_filter", intent.getStringExtra("query_filter"))
+            if (intent.hasExtra("retrieve_k"))
+                putExtra("retrieve_k", intent.getIntExtra("retrieve_k", -1))
+        }
+
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+            startForegroundService(serviceIntent)
         } else {
-            BenchmarkQueries.ALL
-        }
-
-        if (queries.isEmpty()) {
-            Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'")
-            Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
-            return
+            startService(serviceIntent)
         }
-
-        // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins if both set).
-        val retrievalModes = when {
-            skipRetrieval -> listOf(false)
-            ragOnly -> listOf(true)
-            else -> listOf(true, false)
-        }
-        val totalRuns = queries.size * retrievalModes.size * repeats
-        Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs")
-
-        // Execution loop
-        val results = mutableListOf<JSONObject>()
-        var runIndex = 0
-        val loopStart = System.currentTimeMillis()
-
-        for (query in queries) {
-            for (useRetrieval in retrievalModes) {
-                for (rep in 1..repeats) {
-                    runIndex++
-
-                    // Estimate time remaining based on average time per completed run
-                    val etaStr = if (runIndex > 1) {
-                        val elapsedMs = System.currentTimeMillis() - loopStart
-                        val avgPerRun = elapsedMs.toDouble() / (runIndex - 1)
-                        val remainingMs = (avgPerRun * (totalRuns - runIndex + 1)).toLong()
-                        val remainMin = remainingMs / 60000
-                        val remainSec = (remainingMs % 60000) / 1000
-                        "ETA: ${remainMin}m ${remainSec}s"
-                    } else "ETA: calculating..."
-
-                    Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats")
-                    logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr")
-
-                    val preMemory = collectMemoryInfo()
-                    val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride)
-                    val postMemory = collectMemoryInfo()
-
-                    val decodeTps = if (result.decodeMs > 0)
-                        round2(result.estimatedTokens / (result.decodeMs / 1000.0))
-                    else 0.0
-
-                    val entry = JSONObject().apply {
-                        put("query_id", query.id)
-                        put("category", query.category)
-                        put("query_text", query.text)
-                        put("query_word_count", query.wordCount)
-                        put("use_retrieval", useRetrieval)
-                        put("repetition", rep)
-                        put("retrieval_time_ms", result.retrievalTimeMs)
-                        put("ttft_ms", result.ttftMs)
-                        put("prefill_ms", result.prefillMs)
-                        put("decode_ms", result.decodeMs)
-                        put("total_generation_ms", result.generationTotalMs)
-                        put("total_query_ms", result.totalQueryMs)
-                        put("response_length_chars", result.responseChars)
-                        put("estimated_tokens", result.estimatedTokens)
-                        put("decode_throughput_tps", decodeTps)
-                        put("num_retrieved_docs", result.numRetrievedDocs)
-                        // Full content for downstream analysis: chunks the retriever surfaced,
-                        // total chunk-text length (drives prefill cost), and the model's
-                        // generated response.
-                        put("retrieved_chunks", JSONArray().apply {
-                            result.retrievedChunks.forEach { doc ->
-                                put(JSONObject().apply {
-                                    put("text", doc.text)
-                                    put("source", doc.source)
-                                    put("page", doc.page)
-                                    put("chars", doc.text.length)
-                                })
-                            }
-                        })
-                        put("retrieved_total_chars", result.retrievedTotalChars)
-                        put("response_text", result.responseText)
-                        put("error", result.error ?: JSONObject.NULL)
-                        put("heap_before_mb", preMemory.getInt("used_mb"))
-                        put("heap_after_mb", postMemory.getInt("used_mb"))
-                    }
-                    results.add(entry)
-
-                    val resultLine = "  -> ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms tps=$decodeTps"
-                    Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps")
-                    logStatus(resultLine)
-
-                    val pct = (runIndex * 100) / totalRuns
-                    val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000
-                    logStatus("  [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)")
-
-                    // Cooldown between queries (skip after last run).
-                    // delay() vs Thread.sleep(): the suspending variant doesn't block the
-                    // UI thread, which is essential — cooldowns >5s with Thread.sleep
-                    // trigger an ANR (Input Dispatching Timeout) and Android kills the
-                    // activity mid-benchmark.
-                    if (runIndex < totalRuns) {
-                        delay(cooldownMs)
-                    }
-                }
-            }
-        }
-
-        // Assemble output JSON
-        val output = JSONObject().apply {
-            put("benchmark_version", 1)
-            put("timestamp", timestamp)
-            put("device", deviceInfo)
-            put("config", JSONObject().apply {
-                put("repeats", repeats)
-                put("cooldown_ms", cooldownMs)
-                put("skip_retrieval", skipRetrieval)
-                put("rag_only", ragOnly)
-                put("query_filter", queryFilter ?: JSONObject.NULL)
-                // retrieval_top_k_override is null when the session uses runtime_config.json's
-                // value; non-null records the override value used for this whole session.
-                put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
-                put("model", "gemma-4-E4B-it.litertlm")
-                put("backend", "CPU")
-                put("max_tokens", 32000)
-                put("temperature", 1.0)
-                put("top_p", 0.95)
-                put("top_k", 64)
-            })
-            put("init", JSONObject().apply {
-                put("gecko_sqlite_ms", syncInitMs)
-                put("llm_load_ms", llmInitMs)
-                put("warmup_query_ms", warmupMs)
-                put("total_init_ms", totalInitMs)
-            })
-            put("memory", postInitMemory)
-            put("results", JSONArray(results))
-            put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart)
-        }
-
-        // Write to file
-        val outFile = File(getExternalFilesDir(null), "benchmark_results.json")
-        outFile.writeText(output.toString(2))
-        Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}")
-        Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE")
-        logStatus("COMPLETE\nResults written to:\n${outFile.absolutePath}")
-    }
-
-    // ── Single query execution ───────────────────────────────────────────
-
-    private data class QueryResult(
-        val retrievalTimeMs: Long,
-        val ttftMs: Long,
-        val prefillMs: Long,
-        val decodeMs: Long,
-        val generationTotalMs: Long,
-        val totalQueryMs: Long,
-        val responseChars: Int,
-        val estimatedTokens: Int,
-        val numRetrievedDocs: Int,
-        val retrievedChunks: List<RetrievedDoc>,
-        val retrievedTotalChars: Int,
-        val responseText: String,
-        val error: String?,
-    )
-
-    private suspend fun runQuery(
-        pipeline: RagPipeline,
-        queryText: String,
-        useRetrieval: Boolean,
-        retrieveKOverride: Int?,
-    ): QueryResult {
-        var retrievalTimeMs = 0L
-        var numDocs = 0
-        var firstTokenTime = 0L
-        var error: String? = null
-        val responseBuilder = StringBuilder()
-        var retrievedChunks: List<RetrievedDoc> = emptyList()
-
-        val qStart = System.currentTimeMillis()
-        var retrievalDoneTime = 0L
-
-        try {
-            withContext(executor.asCoroutineDispatcher()) {
-                pipeline.generateResponse(
-                    prompt = queryText,
-                    history = emptyList(),
-                    useRetrieval = useRetrieval,
-                    retrievalListener = { docs ->
-                        retrievalDoneTime = System.currentTimeMillis()
-                        retrievalTimeMs = retrievalDoneTime - qStart
-                        numDocs = docs.size
-                        retrievedChunks = docs
-                    },
-                    generationListener = { partial, _ ->
-                        responseBuilder.append(partial)
-                        if (firstTokenTime == 0L && partial.isNotEmpty()) {
-                            firstTokenTime = System.currentTimeMillis()
-                        }
-                    },
-                    retrieveKOverride = retrieveKOverride,
-                )
-            }
-        } catch (e: Exception) {
-            error = e.message
-            Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e)
-        }
-
-        val qEnd = System.currentTimeMillis()
-        val totalQueryMs = qEnd - qStart
-        val responseChars = responseBuilder.length
-
-        // Generation timing — measure from after retrieval (or query start if no retrieval)
-        val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart
-        val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0
-        val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0
-        val generationTotalMs = qEnd - genStart
-        val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt()
-
-        return QueryResult(
-            retrievalTimeMs = retrievalTimeMs,
-            ttftMs = ttftMs,
-            prefillMs = ttftMs,
-            decodeMs = decodeMs,
-            generationTotalMs = generationTotalMs,
-            totalQueryMs = totalQueryMs,
-            responseChars = responseChars,
-            estimatedTokens = estimatedTokens,
-            numRetrievedDocs = numDocs,
-            retrievedChunks = retrievedChunks,
-            retrievedTotalChars = retrievedChunks.sumOf { it.text.length },
-            responseText = responseBuilder.toString(),
-            error = error,
-        )
-    }
-
-    // ── Helpers ──────────────────────────────────────────────────────────
-
-    private fun collectDeviceInfo(): JSONObject = JSONObject().apply {
-        put("manufacturer", Build.MANUFACTURER)
-        put("model", Build.MODEL)
-        put("device", Build.DEVICE)
-        put("hardware", Build.HARDWARE)
-        put("board", Build.BOARD)
-        put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown")
-        put("android_version", Build.VERSION.RELEASE)
-        put("sdk_int", Build.VERSION.SDK_INT)
-        put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown")
+        Log.w(BENCH_TAG, "[BENCHMARK] BenchmarkActivity → forwarded extras to BenchmarkForegroundService, finishing.")
+        finish()
     }
-
-    private fun collectMemoryInfo(): JSONObject {
-        val rt = Runtime.getRuntime()
-        return JSONObject().apply {
-            put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024)
-            put("free_mb", rt.freeMemory() / 1024 / 1024)
-            put("total_mb", rt.totalMemory() / 1024 / 1024)
-            put("max_mb", rt.maxMemory() / 1024 / 1024)
-        }
-    }
-
-    private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0
 }
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
new file mode 100644
index 0000000..76358d0
--- /dev/null
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -0,0 +1,487 @@
+package com.example.app
+
+import android.app.Notification
+import android.app.NotificationChannel
+import android.app.NotificationManager
+import android.app.Service
+import android.content.Context
+import android.content.Intent
+import android.content.pm.ServiceInfo
+import android.os.Build
+import android.os.IBinder
+import android.os.PowerManager
+import android.util.Log
+import androidx.core.app.NotificationCompat
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.SupervisorJob
+import kotlinx.coroutines.asCoroutineDispatcher
+import kotlinx.coroutines.cancel
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.launch
+import kotlinx.coroutines.withContext
+import org.json.JSONArray
+import org.json.JSONObject
+import java.io.File
+import java.text.SimpleDateFormat
+import java.util.Date
+import java.util.Locale
+import java.util.concurrent.Executors
+
+/**
+ * Foreground service that runs the on-device latency benchmark.
+ *
+ * The service holds a PARTIAL_WAKE_LOCK and posts a sticky notification so
+ * the OS keeps the process alive — unlike a plain Activity, which the
+ * vendor power manager (e.g. OPPO's OplusProxyWakeLock) will idle as soon
+ * as the screen sleeps. This lets multi-hour k-sweeps run while the
+ * device is locked or the screen is off.
+ *
+ * Launched via [BenchmarkActivity] which forwards Intent extras from `am
+ * start`. All benchmark logic lives here; the Activity is a thin shim.
+ *
+ * Intent extras (forwarded from the Activity):
+ *   repeats:Int                Repetitions per query
+ *   cooldown_ms:Long           Sleep between runs
+ *   skip_retrieval:Boolean     Run No-RAG mode only
+ *   rag_only:Boolean           Run RAG mode only
+ *   query_filter:String?       Category or query ID filter
+ *   retrieve_k:Int (>=0)       Override retrieval top_k; -1 = use config
+ */
+class BenchmarkForegroundService : Service() {
+
+    companion object {
+        private const val TAG = "mam-ai"
+        private const val BENCH_TAG = "mam-ai-bench"
+        private const val NOTIFICATION_ID = 1002
+        const val CHANNEL_ID = "mam_ai_benchmark"
+        private const val DEFAULT_COOLDOWN_MS = 5_000L
+        private const val DEFAULT_REPEATS = 3
+        private const val CHARS_PER_TOKEN_ESTIMATE = 4.0
+    }
+
+    // Dispatchers.Default so the long-running coroutine isn't tied to the UI
+    // thread. The service has no UI anyway, but Default also ensures the work
+    // continues regardless of any activity lifecycle event.
+    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
+    private val executor = Executors.newSingleThreadExecutor()
+    private var wakeLock: PowerManager.WakeLock? = null
+
+    override fun onBind(intent: Intent?): IBinder? = null
+
+    override fun onCreate() {
+        super.onCreate()
+        ensureChannel(this)
+
+        // PARTIAL_WAKE_LOCK lets the CPU keep running through screen-off.
+        // Vendor power managers (OPPO ColorOS, Xiaomi MIUI, etc.) respect
+        // wake locks held by foreground services — they aggressively
+        // release locks held by background activities.
+        val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager
+        wakeLock = powerManager.newWakeLock(
+            PowerManager.PARTIAL_WAKE_LOCK,
+            "mam-ai:benchmark"
+        ).apply {
+            setReferenceCounted(false)
+            acquire(6L * 60L * 60L * 1000L)  // 6 h failsafe
+        }
+        Log.w(BENCH_TAG, "[BENCHMARK] Service onCreate, PARTIAL_WAKE_LOCK acquired")
+    }
+
+    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
+        startForegroundCompat("MAM-AI benchmark starting…", -1, 0)
+
+        val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS
+        val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS
+        val skipRetrieval = intent?.getBooleanExtra("skip_retrieval", false) ?: false
+        val ragOnly = intent?.getBooleanExtra("rag_only", false) ?: false
+        val queryFilter = intent?.getStringExtra("query_filter")
+        val retrieveKOverride: Int? = intent?.getIntExtra("retrieve_k", -1)?.takeIf { it >= 0 }
+
+        scope.launch {
+            try {
+                runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride)
+            } catch (t: Throwable) {
+                Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t)
+                Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
+            } finally {
+                stopSelf()
+            }
+        }
+        // START_NOT_STICKY: don't auto-restart on kill — the benchmark is a
+        // one-shot job; restarting halfway through would corrupt the run.
+        return START_NOT_STICKY
+    }
+
+    /** Releases the wake lock, cancels the benchmark and stops its worker thread. */
+    override fun onDestroy() {
+        super.onDestroy()
+        wakeLock?.takeIf { it.isHeld }?.let {
+            it.release()
+            Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK")
+        }
+        wakeLock = null
+        scope.cancel()
+        executor.shutdown()  // otherwise the single worker thread outlives the service
+        @Suppress("DEPRECATION")
+        stopForeground(true)
+    }
+
+    // ── Notification plumbing ────────────────────────────────────────────
+
+    private fun startForegroundCompat(message: String, progress: Int, max: Int) {
+        val notification = buildNotification(this, message, progress, max)
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.UPSIDE_DOWN_CAKE) {
+            startForeground(
+                NOTIFICATION_ID,
+                notification,
+                ServiceInfo.FOREGROUND_SERVICE_TYPE_DATA_SYNC,
+            )
+        } else {
+            startForeground(NOTIFICATION_ID, notification)
+        }
+    }
+
+    private fun updateNotification(message: String, progress: Int, max: Int) {
+        val nm = getSystemService(NotificationManager::class.java) ?: return
+        nm.notify(NOTIFICATION_ID, buildNotification(this, message, progress, max))
+    }
+
+    // ── Main benchmark loop ──────────────────────────────────────────────
+
+    private suspend fun runBenchmark(
+        repeats: Int,
+        cooldownMs: Long,
+        skipRetrieval: Boolean,
+        ragOnly: Boolean,
+        queryFilter: String?,
+        retrieveKOverride: Int?,
+    ) {
+        val benchmarkStart = System.currentTimeMillis()
+        val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date())
+
+        Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter retrieve_k=${retrieveKOverride ?: "default"} rag_only=$ragOnly")
+
+        val deviceInfo = collectDeviceInfo()
+        Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})")
+
+        updateNotification("Initializing pipeline…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...")
+        val initStart = System.currentTimeMillis()
+        val pipeline = withContext(executor.asCoroutineDispatcher()) {
+            RagPipeline(application)
+        }
+        val syncInitMs = System.currentTimeMillis() - initStart
+        Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms")
+
+        updateNotification("Loading Gemma 4 LLM…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...")
+        val llmWaitStart = System.currentTimeMillis()
+        withContext(executor.asCoroutineDispatcher()) { pipeline.awaitLlmReady() }
+        val llmInitMs = System.currentTimeMillis() - llmWaitStart
+        Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)")
+
+        val warmupQueries = listOf(
+            "Normal fetal heart rate",
+            "Signs of infection after delivery",
+            "A mother has heavy bleeding after birth. What should I do first?",
+            "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?",
+            "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?",
+        )
+        updateNotification("Warmup queries (${warmupQueries.size})…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...")
+        val warmupStart = System.currentTimeMillis()
+        warmupQueries.forEachIndexed { i, prompt ->
+            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"")
+            withContext(executor.asCoroutineDispatcher()) {
+                pipeline.generateResponse(
+                    prompt = prompt,
+                    history = emptyList(),
+                    useRetrieval = false,
+                    retrievalListener = {},
+                    generationListener = { _, _ -> }
+                )
+            }
+            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)")
+        }
+        val warmupMs = System.currentTimeMillis() - warmupStart
+        val totalInitMs = System.currentTimeMillis() - initStart
+        Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
+
+        val postInitMemory = collectMemoryInfo()
+        delay(cooldownMs)
+
+        val queries = if (queryFilter != null) {
+            BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter }
+        } else {
+            BenchmarkQueries.ALL
+        }
+        if (queries.isEmpty()) {
+            Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'")
+            Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
+            return
+        }
+
+        // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins).
+        val retrievalModes = when {
+            skipRetrieval -> listOf(false)
+            ragOnly -> listOf(true)
+            else -> listOf(true, false)
+        }
+        val totalRuns = queries.size * retrievalModes.size * repeats
+        Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs")
+
+        val results = mutableListOf<JSONObject>()
+        var runIndex = 0
+        val loopStart = System.currentTimeMillis()
+
+        for (query in queries) {
+            for (useRetrieval in retrievalModes) {
+                for (rep in 1..repeats) {
+                    runIndex++
+
+                    val pct = (runIndex * 100) / totalRuns
+                    updateNotification("[$runIndex/$totalRuns] ${query.id} rep=$rep", runIndex, totalRuns)
+
+                    Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats")
+
+                    val preMemory = collectMemoryInfo()
+                    val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride)
+                    val postMemory = collectMemoryInfo()
+
+                    val decodeTps = if (result.decodeMs > 0)
+                        round2(result.estimatedTokens / (result.decodeMs / 1000.0))
+                    else 0.0
+
+                    val entry = JSONObject().apply {
+                        put("query_id", query.id)
+                        put("category", query.category)
+                        put("query_text", query.text)
+                        put("query_word_count", query.wordCount)
+                        put("use_retrieval", useRetrieval)
+                        put("repetition", rep)
+                        put("retrieval_time_ms", result.retrievalTimeMs)
+                        put("ttft_ms", result.ttftMs)
+                        put("prefill_ms", result.prefillMs)
+                        put("decode_ms", result.decodeMs)
+                        put("total_generation_ms", result.generationTotalMs)
+                        put("total_query_ms", result.totalQueryMs)
+                        put("response_length_chars", result.responseChars)
+                        put("estimated_tokens", result.estimatedTokens)
+                        put("decode_throughput_tps", decodeTps)
+                        put("num_retrieved_docs", result.numRetrievedDocs)
+                        put("retrieved_chunks", JSONArray().apply {
+                            result.retrievedChunks.forEach { doc ->
+                                put(JSONObject().apply {
+                                    put("text", doc.text)
+                                    put("source", doc.source)
+                                    put("page", doc.page)
+                                    put("chars", doc.text.length)
+                                })
+                            }
+                        })
+                        put("retrieved_total_chars", result.retrievedTotalChars)
+                        put("response_text", result.responseText)
+                        put("error", result.error ?: JSONObject.NULL)
+                        put("heap_before_mb", preMemory.getInt("used_mb"))
+                        put("heap_after_mb", postMemory.getInt("used_mb"))
+                    }
+                    results.add(entry)
+
+                    Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps")
+
+                    if (runIndex < totalRuns) {
+                        delay(cooldownMs)
+                    }
+                }
+            }
+        }
+
+        val output = JSONObject().apply {
+            put("benchmark_version", 1)
+            put("timestamp", timestamp)
+            put("device", deviceInfo)
+            put("config", JSONObject().apply {
+                put("repeats", repeats)
+                put("cooldown_ms", cooldownMs)
+                put("skip_retrieval", skipRetrieval)
+                put("rag_only", ragOnly)
+                put("query_filter", queryFilter ?: JSONObject.NULL)
+                put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
+                put("model", "gemma-4-E4B-it.litertlm")
+                put("backend", "CPU")
+                put("max_tokens", 32000)
+                put("temperature", 1.0)
+                put("top_p", 0.95)
+                put("top_k", 64)
+            })
+            put("init", JSONObject().apply {
+                put("gecko_sqlite_ms", syncInitMs)
+                put("llm_load_ms", llmInitMs)
+                put("warmup_query_ms", warmupMs)
+                put("total_init_ms", totalInitMs)
+            })
+            put("memory", postInitMemory)
+            put("results", JSONArray(results))
+            put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart)
+        }
+
+        val outFile = File(getExternalFilesDir(null), "benchmark_results.json")
+        outFile.writeText(output.toString(2))
+        Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}")
+        Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE")
+    }
+
+    // ── Single-query execution ───────────────────────────────────────────
+
+    private data class QueryResult(
+        val retrievalTimeMs: Long,
+        val ttftMs: Long,
+        val prefillMs: Long,
+        val decodeMs: Long,
+        val generationTotalMs: Long,
+        val totalQueryMs: Long,
+        val responseChars: Int,
+        val estimatedTokens: Int,
+        val numRetrievedDocs: Int,
+        val retrievedChunks: List<RetrievedDoc>,
+        val retrievedTotalChars: Int,
+        val responseText: String,
+        val error: String?,
+    )
+    /** Runs one query on the worker executor and derives latency metrics from listener timestamps. */
+    private suspend fun runQuery(
+        pipeline: RagPipeline,
+        queryText: String,
+        useRetrieval: Boolean,
+        retrieveKOverride: Int?,
+    ): QueryResult {
+        var retrievalTimeMs = 0L
+        var numDocs = 0
+        var firstTokenTime = 0L  // 0 = no non-empty partial received yet
+        var error: String? = null
+        val responseBuilder = StringBuilder()
+        var retrievedChunks: List<RetrievedDoc> = emptyList()
+        // All timestamps below are wall-clock System.currentTimeMillis() values.
+        val qStart = System.currentTimeMillis()
+        var retrievalDoneTime = 0L  // 0 = retrievalListener never fired
+
+        try {
+            withContext(executor.asCoroutineDispatcher()) {
+                pipeline.generateResponse(
+                    prompt = queryText,
+                    history = emptyList(),
+                    useRetrieval = useRetrieval,
+                    retrievalListener = { docs ->
+                        retrievalDoneTime = System.currentTimeMillis()
+                        retrievalTimeMs = retrievalDoneTime - qStart
+                        numDocs = docs.size
+                        retrievedChunks = docs
+                    },
+                    generationListener = { partial, _ ->
+                        responseBuilder.append(partial)
+                        if (firstTokenTime == 0L && partial.isNotEmpty()) {
+                            firstTokenTime = System.currentTimeMillis()
+                        }
+                    },
+                    retrieveKOverride = retrieveKOverride,
+                )
+            }
+        } catch (e: Exception) {  // NOTE(review): also catches coroutine CancellationException; confirm intended
+            error = e.message  // reported via QueryResult.error; metrics below use whatever was captured
+            Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e)
+        }
+
+        val qEnd = System.currentTimeMillis()
+        val totalQueryMs = qEnd - qStart
+        val responseChars = responseBuilder.length
+
+        // TTFT excludes retrieval; we measure from end-of-retrieval to first token.
+        val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart  // falls back to query start
+        val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0
+        val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0  // first non-empty partial to call return
+        val generationTotalMs = qEnd - genStart
+        val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt()  // rough chars-per-token estimate
+
+        return QueryResult(
+            retrievalTimeMs = retrievalTimeMs,
+            ttftMs = ttftMs,
+            prefillMs = ttftMs,  // reported as an alias of TTFT
+            decodeMs = decodeMs,
+            generationTotalMs = generationTotalMs,
+            totalQueryMs = totalQueryMs,
+            responseChars = responseChars,
+            estimatedTokens = estimatedTokens,
+            numRetrievedDocs = numDocs,
+            retrievedChunks = retrievedChunks,
+            retrievedTotalChars = retrievedChunks.sumOf { it.text.length },
+            responseText = responseBuilder.toString(),
+            error = error,
+        )
+    }
+
+    // ── Helpers ──────────────────────────────────────────────────────────
+
+    private fun collectDeviceInfo(): JSONObject = JSONObject().apply {
+        put("manufacturer", Build.MANUFACTURER)
+        put("model", Build.MODEL)
+        put("device", Build.DEVICE)
+        put("hardware", Build.HARDWARE)
+        put("board", Build.BOARD)
+        put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown")
+        put("android_version", Build.VERSION.RELEASE)
+        put("sdk_int", Build.VERSION.SDK_INT)
+        put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown")
+    }
+
+    private fun collectMemoryInfo(): JSONObject {
+        val rt = Runtime.getRuntime()
+        return JSONObject().apply {
+            put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024)
+            put("free_mb", rt.freeMemory() / 1024 / 1024)
+            put("total_mb", rt.totalMemory() / 1024 / 1024)
+            put("max_mb", rt.maxMemory() / 1024 / 1024)
+        }
+    }
+
+    private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0
+
+    private fun ensureChannel(context: Context) {
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+            val nm = context.getSystemService(NotificationManager::class.java)
+            if (nm?.getNotificationChannel(CHANNEL_ID) == null) {
+                val channel = NotificationChannel(
+                    CHANNEL_ID,
+                    "MAM-AI Benchmark",
+                    NotificationManager.IMPORTANCE_LOW,
+                ).apply {
+                    description = "Foreground notification while the on-device latency benchmark runs"
+                    setShowBadge(false)
+                }
+                nm?.createNotificationChannel(channel)
+            }
+        }
+    }
+
+    private fun buildNotification(
+        context: Context,
+        message: String,
+        progress: Int,
+        max: Int,
+    ): Notification {
+        val builder = NotificationCompat.Builder(context, CHANNEL_ID)
+            .setContentTitle("MAM-AI Benchmark")
+            .setContentText(message)
+            .setSmallIcon(android.R.drawable.stat_sys_download)
+            .setOngoing(true)
+            .setOnlyAlertOnce(true)
+            .setPriority(NotificationCompat.PRIORITY_LOW)
+
+        if (max > 0 && progress >= 0) {
+            builder.setProgress(max, progress, false)
+        } else {
+            builder.setProgress(0, 0, true)
+        }
+        return builder.build()
+    }
+}

From ef965381ab3ee4b072a42de4759968166940ea12 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Thu, 14 May 2026 21:33:35 +0800
Subject: [PATCH 08/30] fix(benchmark): record actual backend (GPU/CPU) in
 config metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The config block dump in benchmark_results.json was hard-coded to
"backend":"CPU" — wrong for any build with useGpuForLlm=true. The
GPU-sweep JSONs we just ran on the OPPO Snapdragon 8 Elite all carry
the incorrect "CPU" label even though they were measured on GPU.

Now reads from BuildConfig.USE_GPU_FOR_LLM at compile time and writes
"GPU" or "CPU" accordingly. Also adds "mtp_enabled" from
BuildConfig.USE_MTP_FOR_LLM for full provenance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/com/example/app/BenchmarkForegroundService.kt    | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 76358d0..9506786 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -309,7 +309,11 @@ class BenchmarkForegroundService : Service() {
                 put("query_filter", queryFilter ?: JSONObject.NULL)
                 put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
                 put("model", "gemma-4-E4B-it.litertlm")
-                put("backend", "CPU")
+                // Read backend from BuildConfig at compile time. Older builds
+                // hard-coded "CPU" here even when GPU was active — fixed so the
+                // JSON metadata matches reality.
+                put("backend", if (BuildConfig.USE_GPU_FOR_LLM) "GPU" else "CPU")
+                put("mtp_enabled", BuildConfig.USE_MTP_FOR_LLM)
                 put("max_tokens", 32000)
                 put("temperature", 1.0)
                 put("top_p", 0.95)

From ede273f5b6045e59f0fa10a54636109f29542540 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 04:36:07 +0800
Subject: [PATCH 09/30] analysis: k-sweep latency report (GPU + CPU on
 Snapdragon 8 Elite)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aggregates the 15 canonical 54-run benchmark JSONs into a single
GPU↔CPU comparison report covering k ∈ {0, 1, 3, 5, 7, 10, 15} (and
GPU-only k=20). Produced by a new evaluation/aggregate_k_sweep.py
that's re-runnable as more JSONs land.

Headline numbers (median total query latency on OPPO OPD2413,
Snapdragon 8 Elite, Gemma 4 E4B, LiteRT-LM 0.11.0):

  k=0 (no-RAG):  GPU 13–16 s | CPU 27–30 s    (1.9× slower on CPU)
  k=3         :  GPU 19–21 s | CPU 37–45 s    (2.2× slower)
  k=10        :  GPU 21–22 s | CPU 62–78 s    (3.1× slower)
  k=15        :  GPU 22–25 s | CPU 81–90 s    (3.5× slower)
  k=20        :  GPU 44% of runs fail at the 4096-token model ceiling

Key findings:
- GPU is the practical choice for this device tier — TTFT is 13–19×
  faster than CPU; total latency is 2–3.5× faster.
- The model's 4096-token context window is the binding upper limit
  (k_max ≈ 17–18), not latency. GPU has comfortable headroom below
  that ceiling.
- CPU is unusable past k≈3 for any reasonable UX budget. At k=15,
  CPU p95 latency hits 113 s.
- Decode is memory-bandwidth-bound (GPU/CPU within ~1.4×); the GPU
  win is entirely in compute-heavy prefill.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py         | 335 ++++++++++++++++++++++++
 evaluation/reports/latency_report_v2.md | 171 ++++++++++++
 2 files changed, 506 insertions(+)
 create mode 100644 evaluation/aggregate_k_sweep.py
 create mode 100644 evaluation/reports/latency_report_v2.md

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
new file mode 100644
index 0000000..3aa18d4
--- /dev/null
+++ b/evaluation/aggregate_k_sweep.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report.
+
+Reads all benchmark_*.json files produced by benchmark_latency.py, groups them
+by (backend, k_override), and writes a markdown report at
+evaluation/reports/latency_report_v2.md.
+
+Notes on backend identification: GPU sweep JSONs from before commit ef96538
+(2026-05-14 ~21:34) have config.backend="CPU" hard-coded (bug fixed later);
+we identify them by timestamp instead. Anything before the threshold is GPU.
+"""
+from __future__ import annotations
+
+import datetime
+import glob
+import json
+import os
+import statistics
+from collections import defaultdict
+from pathlib import Path
+
+# Timestamp threshold separating GPU runs (before) from CPU runs (after).
+# The CPU rebuild + reinstall happened at ~21:34 on 2026-05-14.
+THRESHOLD_TS = "20260514T2130"
+
+
+def backend_of(timestamp: str, recorded: str) -> str:
+    """Pick the backend label to trust for a run with the given filename timestamp.
+
+    Runs stamped before THRESHOLD_TS are GPU whatever they recorded; later labels stand."""
+    return "GPU" if timestamp < THRESHOLD_TS else recorded
+
+
+def load_runs() -> list[dict]:
+    """Load every canonical (>= 30 results) sweep JSON next to this script."""
+    pattern = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                           "latency_results", "benchmark_2026051*.json")
+    runs = []
+    for f in sorted(glob.glob(pattern)):
+        try:
+            # Context manager closes the handle; JSON is read as UTF-8 explicitly.
+            with open(f, encoding="utf-8") as fh:
+                d = json.load(fh)
+        except (ValueError, OSError):  # ValueError covers JSON + decode errors
+            continue
+        if "config" not in d or "results" not in d:
+            continue
+        if len(d["results"]) < 30:
+            continue  # skip ad-hoc smoke tests; the canonical sweep is 54 runs
+        ts = os.path.basename(f).replace("benchmark_", "").split(".")[0].split("_")[0]
+        skip_retrieval = d["config"].get("skip_retrieval", False)
+        # No-RAG runs are labelled k=0; runs without an explicit override are skipped.
+        k_label = 0 if skip_retrieval else d["config"].get("retrieval_top_k_override")
+        if k_label is None:
+            continue
+        runs.append({
+            "file": os.path.basename(f),
+            "timestamp": ts,
+            "backend": backend_of(ts, d["config"].get("backend", "CPU")),
+            "k": k_label,
+            "data": d,
+        })
+    return runs
+
+
+def aggregate_per_category(d: dict, key: str) -> dict[str, dict]:
+    """Summarise `key` per query category, ignoring results that recorded an error.
+
+    Returns {category: {"n", "median", "p95"}} with median/p95 truncated to int.
+    """
+    buckets: dict[str, list] = defaultdict(list)
+    for row in d["results"]:
+        if not row.get("error"):
+            buckets[row["category"]].append(row[key])
+
+    def _stats(vals: list) -> dict:
+        ranked = sorted(vals)
+        # p95 = element at index floor(0.95 * n), clamped to the last element.
+        idx = min(len(ranked) - 1, int(len(ranked) * 0.95))
+        return {"n": len(vals), "median": int(statistics.median(vals)), "p95": int(ranked[idx])}
+
+    # Buckets only exist once a value was appended, so none can be empty.
+    return {cat: _stats(vals) for cat, vals in buckets.items()}
+
+
+def aggregate_overall(d: dict, key: str) -> dict:
+    """Summarise `key` over all error-free results: {"n", "median", "p95"}, or {} if none."""
+    ok = sorted(r[key] for r in d["results"] if not r.get("error"))
+    if not ok:
+        return {}
+    return {
+        "n": len(ok),
+        "median": int(statistics.median(ok)),
+        "p95": int(ok[min(len(ok) - 1, int(len(ok) * 0.95))]),  # index floor(0.95*n), clamped
+    }
+
+
+def avg_doc_chars(d: dict) -> int:  # NB: median, not mean, despite the name
+    chars = [r.get("retrieved_total_chars", 0) for r in d["results"] if not r.get("error")]
+    return int(statistics.median(chars or [0]))  # 0 when there are no error-free results
+
+
+def fmt_ms(v: int | None) -> str:  # milliseconds as-is; em dash for a missing value
+    return "—" if v is None else str(v)
+
+
+def fmt_s(v: int | None) -> str:  # milliseconds -> seconds with one decimal; em dash if missing
+    return "—" if v is None else f"{v / 1000:.1f}"
+
+
+def write_report(runs: list[dict], out_path: Path) -> None:
+    # Build {(backend, k) -> canonical run with the most error-free results}
+    matrix: dict[tuple[str, int], dict] = {}
+    for r in runs:
+        key = (r["backend"], r["k"])
+        if key in matrix:
+            # Keep the run with most successful entries (resolves duplicates)
+            ex = matrix[key]
+            ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error"))
+            r_ok = sum(1 for x in r["data"]["results"] if not x.get("error"))
+            if r_ok > ex_ok:
+                matrix[key] = r
+        else:
+            matrix[key] = r
+
+    gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"])
+    cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"])
+    all_ks = sorted(set(gpu_ks + cpu_ks))
+
+    # Sample run for device info
+    sample = next(iter(matrix.values()))
+    dev = sample["data"]["device"]
+
+    md = []
+    md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n")
+    md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n")
+    md.append("")
+    md.append("## Device & stack\n")
+    md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}")
+    md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)")
+    md.append(f"- **LiteRT-LM**: 0.11.0")
+    md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU")
+    md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000")
+    md.append("")
+    md.append("## Methodology\n")
+    md.append("Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. ")
+    md.append("Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs ")
+    md.append("for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives ")
+    md.append("screen-off and device-lock; OPPO Hans whitelist set manually.")
+    md.append("")
+    md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.")
+    md.append("- `decode` is first-token to last-token.")
+    md.append("- `total_query` is everything: `retrieval + TTFT + decode`.")
+    md.append("- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).")
+    md.append("")
+
+    # ─────────── Headline table: total_query_ms by (backend, k) ───────────
+    md.append("## Headline — Median total query latency (seconds)\n")
+    md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        # doc chars: take from GPU if available, else CPU
+        doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gpu_cells = "—"
+        cpu_cells = "—"
+        ratios = []
+        for col, run, key in [("gpu", gpu_run, "gpu"), ("cpu", cpu_run, "cpu")]:
+            pass
+        if gpu_run:
+            g = aggregate_per_category(gpu_run["data"], "total_query_ms")
+            gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"])
+        if cpu_run:
+            c_ = aggregate_per_category(cpu_run["data"], "total_query_ms")
+            cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"])
+        # ratio
+        ratio = ""
+        if gpu_run and cpu_run:
+            gov = aggregate_overall(gpu_run["data"], "total_query_ms")["median"]
+            cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"]
+            if gov:
+                ratio = f"{cov / gov:.2f}×"
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
+    md.append("")
+
+    # ─────────── TTFT detail ───────────
+    md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n")
+    md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None
+        ratio = f"{cv / gv:.1f}×" if gv and cv else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+    md.append("")
+
+    # ─────────── Decode detail ───────────
+    md.append("## Decode (ms, median) — first token to last token\n")
+    md.append("Decode time mostly tracks output length, not k or doc content. Variation across k reflects ")
+    md.append("the model writing *longer answers* when given more context (more material to draw on).")
+    md.append("")
+    md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None
+        ratio = f"{cv / gv:.2f}×" if gv and cv else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+    md.append("")
+
+    # ─────────── p95 totals ───────────
+    md.append("## p95 total query latency (s) — tail-latency view\n")
+    md.append(f"| k | GPU p95 | CPU p95 |")
+    md.append(f"|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gv = aggregate_overall(gpu_run["data"], "total_query_ms")["p95"] if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "total_query_ms")["p95"] if cpu_run else None
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |")
+    md.append("")
+
+    # ─────────── Errors / context limit ───────────
+    md.append("## Errors and the 4096-token context wall\n")
+    md.append(f"| k | GPU errors / 54 | CPU errors / 54 |")
+    md.append(f"|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None
+        ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |")
+    md.append("")
+    md.append("At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum ")
+    md.append("number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B ")
+    md.append("`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped ")
+    md.append("for the same reason (would hit identical limit).")
+    md.append("")
+
+    # ─────────── Wall-clock comparison ───────────
+    md.append("## Wall-clock comparison\n")
+    md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |")
+    md.append("|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None
+        cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None
+        gw_s = f"{gw:.1f}" if gw else "—"
+        cw_s = f"{cw:.1f}" if cw else "—"
+        ratio = f"{cw / gw:.2f}×" if gw and cw else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |")
+
+    # Findings / interpretation
+    md.append("")
+    md.append("## Key findings\n")
+    md.append("")
+    md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite")
+    md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ")
+    md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ")
+    md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.")
+    md.append("")
+    md.append("### 2. The model's 4096-token context window is the binding ceiling at high k")
+    md.append("k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens ")
+    md.append("with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this ")
+    md.append("`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. ")
+    md.append("CPU k=20 was skipped — same model, same limit.")
+    md.append("")
+    md.append("### 3. Latency is not the binding factor on GPU below k=15")
+    md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ")
+    md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ")
+    md.append("not by what fits in the latency budget.")
+    md.append("")
+    md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow")
+    md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ")
+    md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ")
+    md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ")
+    md.append("or **k ≤ 1** if you want sub-40s p95.")
+    md.append("")
+    md.append("### 5. Decode time is content-driven, not k-driven")
+    md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ")
+    md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ")
+    md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ")
+    md.append("not compute-bound on this hardware.")
+    md.append("")
+    md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3")
+    md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ")
+    md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ")
+    md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.")
+    md.append("")
+
+    # File inventory
+    md.append("## Data inventory (per `(backend, k)`)\n")
+    md.append("| Backend | k | File | Wall (min) | Runs | Errors |")
+    md.append("|---|---:|---|---:|---:|---:|")
+    for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])):
+        r = matrix[(b, k)]
+        wall = r["data"]["total_benchmark_time_ms"] / 60000
+        n = len(r["data"]["results"])
+        e = sum(1 for x in r["data"]["results"] if x.get("error"))
+        label = "0 (no-RAG)" if k == 0 else str(k)
+        md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |")
+    md.append("")
+    md.append("---")
+    md.append("")
+    md.append("_Source benchmark JSONs live in `evaluation/latency_results/`. ")
+    md.append("Aggregation script: `evaluation/aggregate_k_sweep.py`._")
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text("\n".join(md) + "\n")
+    print(f"Report written to: {out_path}")
+
+
+def main() -> int:
+    """Aggregate the canonical runs into reports/latency_report_v2.md beside this script; return 0."""
+    canonical_runs = load_runs()
+    print(f"Loaded {len(canonical_runs)} canonical runs")
+    write_report(canonical_runs, Path(__file__).resolve().parent / "reports" / "latency_report_v2.md")
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
new file mode 100644
index 0000000..80a026a
--- /dev/null
+++ b/evaluation/reports/latency_report_v2.md
@@ -0,0 +1,171 @@
+# MAM-AI On-Device Latency Sweep — GPU vs CPU
+
+_Generated: 2026-05-15T04:35:52_
+
+
+## Device & stack
+
+- **Device**: OnePlus OPD2413 (SM8750P) — Android 15
+- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
+- **LiteRT-LM**: 0.11.0
+- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU
+- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000
+
+## Methodology
+
+Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. 
+Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs 
+for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives 
+screen-off and device-lock; OPPO Hans whitelist set manually.
+
+- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.
+- `decode` is first-token to last-token.
+- `total_query` is everything: `retrieval + TTFT + decode`.
+- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).
+
+## Headline — Median total query latency (seconds)
+
+| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 12.9 / 15.6 / 16.1 | 27.2 / 26.9 / 29.8 | 1.94× |
+| 1 | 561 | 13.1 / 12.6 / 17.3 | 29.3 / 31.9 / 30.3 | 2.14× |
+| 3 | 2098 | 18.6 / 18.6 / 21.0 | 37.3 / 44.5 / 42.5 | 2.24× |
+| 5 | 3547 | 18.2 / 20.0 / 21.4 | 54.8 / 60.7 / 63.0 | 3.07× |
+| 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× |
+| 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× |
+| 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× |
+| 20 | 14520 | 23.9 / 20.5 / 18.5 | — |  |
+
+## TTFT (ms, median) — prefill cost grows with retrieved-doc content
+
+| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 962 | 12633 | 13.1× |
+| 1 | 561 | 954 | 12649 | 13.3× |
+| 3 | 2098 | 989 | 18356 | 18.6× |
+| 5 | 3547 | 1884 | 36424 | 19.3× |
+| 7 | 5139 | 1920 | 36444 | 19.0× |
+| 10 | 7482 | 2523 | 40013 | 15.9× |
+| 15 | 11297 | 3457 | 54748 | 15.8× |
+| 20 | 14520 | 3986 | — |  |
+
+## Decode (ms, median) — first token to last token
+
+Decode time mostly tracks output length, not k or doc content. Variation across k reflects 
+the model writing *longer answers* when given more context (more material to draw on).
+
+| k | GPU decode | CPU decode | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 13470 | 15345 | 1.14× |
+| 1 | 11415 | 13961 | 1.22× |
+| 3 | 16364 | 19110 | 1.17× |
+| 5 | 15929 | 21645 | 1.36× |
+| 7 | 17215 | 23473 | 1.36× |
+| 10 | 18118 | 21699 | 1.20× |
+| 15 | 16820 | 22497 | 1.34× |
+| 20 | 14688 | — |  |
+
+## p95 total query latency (s) — tail-latency view
+
+| k | GPU p95 | CPU p95 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 26.1 | 38.4 |
+| 1 | 26.1 | 37.1 |
+| 3 | 30.2 | 64.3 |
+| 5 | 30.7 | 74.6 |
+| 7 | 35.1 | 81.7 |
+| 10 | 29.0 | 84.5 |
+| 15 | 30.6 | 112.6 |
+| 20 | 35.3 | — |
+
+## Errors and the 4096-token context wall
+
+| k | GPU errors / 54 | CPU errors / 54 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 0 |
+| 1 | 0 | 0 |
+| 3 | 0 | 0 |
+| 5 | 0 | 0 |
+| 7 | 0 | 0 |
+| 10 | 0 | 0 |
+| 15 | 0 | 0 |
+| 20 | 24 | — |
+
+At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum 
+number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B 
+`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped 
+for the same reason (would hit identical limit).
+
+## Wall-clock comparison
+
+| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× |
+| 1 | 23.0 | 38.7 | 1.68× |
+| 3 | 27.3 | 50.2 | 1.84× |
+| 5 | 28.2 | 63.0 | 2.23× |
+| 7 | 30.0 | 66.5 | 2.22× |
+| 10 | 29.1 | 73.2 | 2.51× |
+| 15 | 32.4 | 90.8 | 2.80× |
+| 20 | 22.8 | — |  |
+
+## Key findings
+
+
+### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite
+GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. 
+That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), 
+so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.
+
+### 2. The model's 4096-token context window is the binding ceiling at high k
+k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens 
+with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this 
+`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. 
+CPU k=20 was skipped — same model, same limit.
+
+### 3. Latency is not the binding factor on GPU below k=15
+GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
+Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), 
+not by what fits in the latency budget.
+
+### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow
+CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. 
+p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't 
+available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, 
+or **k ≤ 1** if you want sub-40s p95.
+
+### 5. Decode time is content-driven, not k-driven
+Decode time tracks output length. As k grows, the model writes *longer* responses — likely because 
+more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. 
+Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, 
+not compute-bound on this hardware.
+
+### 6. TTFT scales linearly with retrieved-doc content past k=3
+On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, 
+CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting 
+the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.
+
+## Data inventory (per `(backend, k)`)
+
+| Backend | k | File | Wall (min) | Runs | Errors |
+|---|---:|---|---:|---:|---:|
+| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 |
+| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 |
+| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 |
+| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 |
+| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
+| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
+| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
+| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
+| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
+| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |
+| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 |
+| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 |
+| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 |
+| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 |
+| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 |
+
+---
+
+_Source benchmark JSONs live in `evaluation/latency_results/`. 
+Aggregation script: `evaluation/aggregate_k_sweep.py`._

From 4daf6266453c21579e05858e23df044aac7ee00c Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 07:40:55 +0800
Subject: [PATCH 10/30] =?UTF-8?q?analysis:=20add=20CPU=20k=3D20=20?=
 =?UTF-8?q?=E2=80=94=20confirms=204096-token=20wall=20is=20backend-invaria?=
 =?UTF-8?q?nt?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CPU k=20 reproduces the GPU k=20 failure pattern exactly:
- 24/54 errors on both backends (44% failure rate)
- Identical 8 queries fail on both (long_01, long_03, medium_02,
  medium_04, short_01, short_03, short_04, short_05)
- Same 24 (query × rep) pairs across both runs

This is direct evidence that the 4096-token context cap is a property
of the .litertlm model artifact itself — not a runtime config, not a
backend choice. Strengthens finding #2 from "model is the ceiling, GPU
specifically hits it" to "model is the ceiling, both backends hit it
identically."

Successful CPU k=20 runs: TTFT 65–73 s, total 89–96 s — well past any
deployment budget even when the request fits the window.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py         | 23 +++++++++------
 evaluation/reports/latency_report_v2.md | 38 +++++++++++++++----------
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 3aa18d4..e014225 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -242,10 +242,15 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |")
     md.append("")
-    md.append("At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum ")
-    md.append("number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B ")
-    md.append("`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped ")
-    md.append("for the same reason (would hit identical limit).")
+    md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ")
+    md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ")
+    md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ")
+    md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ")
+    md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ")
+    md.append("The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter.")
+    md.append("")
+    md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ")
+    md.append("deployment budget at this depth even when the request fits in the context window.")
     md.append("")
 
     # ─────────── Wall-clock comparison ───────────
@@ -273,10 +278,12 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.")
     md.append("")
     md.append("### 2. The model's 4096-token context window is the binding ceiling at high k")
-    md.append("k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens ")
-    md.append("with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this ")
-    md.append("`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. ")
-    md.append("CPU k=20 was skipped — same model, same limit.")
+    md.append("k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — ")
+    md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ")
+    md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ")
+    md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ")
+    md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ")
+    md.append("Latency is *not* the constraint at the upper end; the model's context window is.")
     md.append("")
     md.append("### 3. Latency is not the binding factor on GPU below k=15")
     md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ")
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index 80a026a..61661c5 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,6 +1,6 @@
 # MAM-AI On-Device Latency Sweep — GPU vs CPU
 
-_Generated: 2026-05-15T04:35:52_
+_Generated: 2026-05-15T07:40:25_
 
 
 ## Device & stack
@@ -34,7 +34,7 @@ screen-off and device-lock; OPPO Hans whitelist set manually.
 | 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× |
 | 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× |
 | 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× |
-| 20 | 14520 | 23.9 / 20.5 / 18.5 | — |  |
+| 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× |
 
 ## TTFT (ms, median) — prefill cost grows with retrieved-doc content
 
@@ -47,7 +47,7 @@ screen-off and device-lock; OPPO Hans whitelist set manually.
 | 7 | 5139 | 1920 | 36444 | 19.0× |
 | 10 | 7482 | 2523 | 40013 | 15.9× |
 | 15 | 11297 | 3457 | 54748 | 15.8× |
-| 20 | 14520 | 3986 | — |  |
+| 20 | 14520 | 3986 | 72881 | 18.3× |
 
 ## Decode (ms, median) — first token to last token
 
@@ -63,7 +63,7 @@ the model writing *longer answers* when given more context (more material to dra
 | 7 | 17215 | 23473 | 1.36× |
 | 10 | 18118 | 21699 | 1.20× |
 | 15 | 16820 | 22497 | 1.34× |
-| 20 | 14688 | — |  |
+| 20 | 14688 | 22634 | 1.54× |
 
 ## p95 total query latency (s) — tail-latency view
 
@@ -76,7 +76,7 @@ the model writing *longer answers* when given more context (more material to dra
 | 7 | 35.1 | 81.7 |
 | 10 | 29.0 | 84.5 |
 | 15 | 30.6 | 112.6 |
-| 20 | 35.3 | — |
+| 20 | 35.3 | 104.9 |
 
 ## Errors and the 4096-token context wall
 
@@ -89,12 +89,17 @@ the model writing *longer answers* when given more context (more material to dra
 | 7 | 0 | 0 |
 | 10 | 0 | 0 |
 | 15 | 0 | 0 |
-| 20 | 24 | — |
+| 20 | 24 | 24 |
 
-At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum 
-number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B 
-`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped 
-for the same reason (would hit identical limit).
+At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. 
+Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both 
+backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — 
+the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of 
+the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
+The 10 surviving queries on each backend were the ones whose retrieved chunks happened to be shorter.
+
+Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any 
+deployment budget at this depth even when the request fits in the context window.
 
 ## Wall-clock comparison
 
@@ -107,7 +112,7 @@ for the same reason (would hit identical limit).
 | 7 | 30.0 | 66.5 | 2.22× |
 | 10 | 29.1 | 73.2 | 2.51× |
 | 15 | 32.4 | 90.8 | 2.80× |
-| 20 | 22.8 | — |  |
+| 20 | 22.8 | 58.6 | 2.57× |
 
 ## Key findings
 
@@ -118,10 +123,12 @@ That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invaria
 so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.
 
 ### 2. The model's 4096-token context window is the binding ceiling at high k
-k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens 
-with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this 
-`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. 
-CPU k=20 was skipped — same model, same limit.
+k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — 
+the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. 
+Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives 
+the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, 
+not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. 
+Latency is *not* the constraint at the upper end; the model's context window is.
 
 ### 3. Latency is not the binding factor on GPU below k=15
 GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
@@ -156,6 +163,7 @@ the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays
 | CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
 | CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
 | CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
+| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 |
 | GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
 | GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
 | GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |

From 2a592d2836e94457addf333646114f5b85b9f19e Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 07:57:58 +0800
Subject: [PATCH 11/30] review: address Copilot feedback on PR #57
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real fixes:

- BenchmarkForegroundService: shut down the single-thread executor in
  onDestroy. Without this its worker thread keeps the :benchmark process
  alive after the service stops.
- BenchmarkForegroundService: remove dead `pct` variable left over from
  the Activity-era ASCII progress bar.
- AndroidManifest: stale comment said BenchmarkActivity holds the wake
  lock; updated to reflect the foreground-service refactor.
- benchmark_latency.py: error out if both --no-retrieval and --rag-only
  are passed (previously they silently coexisted; on-device skipRetrieval
  won, but the result was confusing).
- aggregate_k_sweep.py:
  * backend_of() now only overrides recorded="CPU" when the timestamp
    predates the metadata fix. Future GPU runs (which write backend="GPU"
    correctly) and future CPU runs are trusted as-is — fixes the silent
    mislabeling Copilot flagged.
  * Drop the May-2026-only glob (`benchmark_2026051*`) — use
    `benchmark_*.json` and rely on the schema/length filters.
  * Use `with open(...)` context manager — avoid file-handle leak.
  * Rename `avg_doc_chars` → `median_doc_chars` (function used median
    despite the name).
  * Remove dead loop `for col, run, key in [...]: pass`.
  * Update module docstring to describe the new backfill-only logic.

Verified: Kotlin still compiles (flutter build apk --release succeeds);
aggregate script still loads all 16 canonical runs and regenerates the
same report; the new mutual-exclusion error fires when both Python flags
are passed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/android/app/src/main/AndroidManifest.xml  |  8 ++--
 .../example/app/BenchmarkForegroundService.kt |  6 ++-
 evaluation/aggregate_k_sweep.py               | 41 ++++++++++++-------
 evaluation/benchmark_latency.py               |  3 ++
 evaluation/reports/latency_report_v2.md       |  2 +-
 5 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
index bb35442..c805436 100644
--- a/app/android/app/src/main/AndroidManifest.xml
+++ b/app/android/app/src/main/AndroidManifest.xml
@@ -4,9 +4,11 @@
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE"/>
     <!-- Required on Android 14+ for network-data foreground services -->
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE_DATA_SYNC"/>
-    <!-- BenchmarkActivity acquires a PARTIAL_WAKE_LOCK so the CPU keeps running
-         when the screen is off or locked. Without this, multi-hour benchmarks
-         stall silently when the device idles. Used only by BenchmarkActivity. -->
+    <!-- BenchmarkForegroundService acquires a PARTIAL_WAKE_LOCK so the CPU
+         keeps running when the screen is off or locked. Without this,
+         multi-hour benchmarks stall silently when the device idles.
+         (BenchmarkActivity is now a thin launcher that just starts the
+         service; the wake lock lives in the service.) -->
     <uses-permission android:name="android.permission.WAKE_LOCK"/>
 
     <application
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 9506786..e6e5401 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -123,6 +123,10 @@ class BenchmarkForegroundService : Service() {
         }
         wakeLock = null
         scope.cancel()
+        // Shut down the single-thread executor that ferries pipeline calls off
+        // the coroutine dispatchers. Otherwise its worker thread keeps the
+        // :benchmark process alive after the service stops.
+        executor.shutdown()
         @Suppress("DEPRECATION")
         stopForeground(true)
     }
@@ -239,8 +243,6 @@ class BenchmarkForegroundService : Service() {
             for (useRetrieval in retrievalModes) {
                 for (rep in 1..repeats) {
                     runIndex++
-
-                    val pct = (runIndex * 100) / totalRuns
                     updateNotification("[$runIndex/$totalRuns] ${query.id} rep=$rep", runIndex, totalRuns)
 
                     Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats")
diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index e014225..252a463 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -5,9 +5,10 @@
 by (backend, k_override), and writes a markdown report at
 evaluation/reports/latency_report_v2.md.
 
-Notes on backend identification: GPU sweep JSONs from before commit ef96538
-(2026-05-14 ~21:34) have config.backend="CPU" hard-coded (bug fixed later);
-we identify them by timestamp instead. Anything before the threshold is GPU.
+Notes on backend identification: post-fix benchmark JSONs (commit ef96538
+onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep
+JSONs hard-code `backend="CPU"`; we backfill those using a one-time timestamp
+threshold (see `backend_of`). Future runs of any backend are unaffected.
 """
 from __future__ import annotations
 
@@ -19,14 +20,24 @@
 from collections import defaultdict
 from pathlib import Path
 
-# Timestamp threshold separating GPU runs (before) from CPU runs (after).
-# The CPU rebuild + reinstall happened at ~21:34 on 2026-05-14.
+# One-time backfill: GPU sweep JSONs from before commit ef96538 ("fix(benchmark):
+# record actual backend (GPU/CPU) in config metadata") have config.backend="CPU"
+# hard-coded. For just those files, the timestamp identifies them as the GPU
+# sweep we ran before the rebuild at ~21:34 on 2026-05-14. Files with a
+# timestamp at or after the threshold have correct metadata and are trusted.
 THRESHOLD_TS = "20260514T2130"
 
 
 def backend_of(timestamp: str, recorded: str) -> str:
-    """Override stale GPU-era "CPU" labels using the timestamp."""
-    if timestamp < THRESHOLD_TS:
+    """Trust the recorded backend, but backfill pre-fix GPU runs.
+
+    Pre-fix files always say "CPU" in metadata even when GPU was active.
+    We override only when (a) the recorded value is "CPU" AND (b) the
+    timestamp predates the metadata fix. New GPU runs (which write
+    backend="GPU" correctly) and any CPU run from any time are trusted
+    as-is.
+    """
+    if recorded == "CPU" and timestamp < THRESHOLD_TS:
         return "GPU"
     return recorded
 
@@ -34,12 +45,13 @@ def backend_of(timestamp: str, recorded: str) -> str:
 def load_runs() -> list[dict]:
     files = sorted(glob.glob(os.path.join(
         os.path.dirname(os.path.abspath(__file__)),
-        "latency_results", "benchmark_2026051*.json",
+        "latency_results", "benchmark_*.json",
     )))
     runs = []
     for f in files:
         try:
-            d = json.load(open(f))
+            with open(f) as fp:
+                d = json.load(fp)
         except (json.JSONDecodeError, OSError):
             continue
         if "config" not in d or "results" not in d:
@@ -95,7 +107,9 @@ def aggregate_overall(d: dict, key: str) -> dict:
     }
 
 
-def avg_doc_chars(d: dict) -> int:
+def median_doc_chars(d: dict) -> int:
+    """Median retrieved_total_chars across successful runs (the table column
+    is labeled 'doc_chars med', so this is the median by definition)."""
     vs = [r.get("retrieved_total_chars", 0) for r in d["results"] if not r.get("error")]
     return int(statistics.median(vs)) if vs else 0
 
@@ -162,12 +176,9 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         gpu_run = matrix.get(("GPU", k))
         cpu_run = matrix.get(("CPU", k))
         # doc chars: take from GPU if available, else CPU
-        doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
         gpu_cells = "—"
         cpu_cells = "—"
-        ratios = []
-        for col, run, key in [("gpu", gpu_run, "gpu"), ("cpu", cpu_run, "cpu")]:
-            pass
         if gpu_run:
             g = aggregate_per_category(gpu_run["data"], "total_query_ms")
             gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"])
@@ -192,7 +203,7 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     for k in all_ks:
         gpu_run = matrix.get(("GPU", k))
         cpu_run = matrix.get(("CPU", k))
-        doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
         gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None
         ratio = f"{cv / gv:.1f}×" if gv and cv else ""
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index 30dec4c..e5c02a9 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -488,6 +488,9 @@ def main():
                         help="Timeout in seconds (default: 7200)")
     args = parser.parse_args()
 
+    if args.no_retrieval and args.rag_only:
+        parser.error("--no-retrieval and --rag-only are mutually exclusive")
+
     print("=" * 60)
     print("MAM-AI On-Device Latency Benchmark")
     print("=" * 60)
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index 61661c5..25356b3 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,6 +1,6 @@
 # MAM-AI On-Device Latency Sweep — GPU vs CPU
 
-_Generated: 2026-05-15T07:40:25_
+_Generated: 2026-05-15T07:56:55_
 
 
 ## Device & stack

From 659d3f0d51024a1c04d29d92452f856933b7b3c9 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:13:17 +0800
Subject: [PATCH 12/30] review: explicit None checks in aggregate report
 formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace truthy `if gw else "—"` style with `if gw is not None else "—"`,
and add `> 0` guards on division ratios. Defensive against a hypothetical
0-valued median from a corrupted/aborted run JSON, which the truthy form
would have silently rendered as "—" instead of "0.0".

Affects four spots: TTFT ratio, decode ratio, wall-clock ratio +
formatting, and the headline-table ratio.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 252a463..11307a3 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -190,7 +190,7 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         if gpu_run and cpu_run:
             gov = aggregate_overall(gpu_run["data"], "total_query_ms")["median"]
             cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"]
-            if gov:
+            if gov is not None and gov > 0:
                 ratio = f"{cov / gov:.2f}×"
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
@@ -206,7 +206,8 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
         gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None
-        ratio = f"{cv / gv:.1f}×" if gv and cv else ""
+        # Explicit None checks; also guard against div-by-zero on a 0 median.
+        ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else ""
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
     md.append("")
@@ -223,7 +224,7 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         cpu_run = matrix.get(("CPU", k))
         gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None
         cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None
-        ratio = f"{cv / gv:.2f}×" if gv and cv else ""
+        ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else ""
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
     md.append("")
@@ -273,9 +274,9 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         cpu_run = matrix.get(("CPU", k))
         gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None
         cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None
-        gw_s = f"{gw:.1f}" if gw else "—"
-        cw_s = f"{cw:.1f}" if cw else "—"
-        ratio = f"{cw / gw:.2f}×" if gw and cw else ""
+        gw_s = f"{gw:.1f}" if gw is not None else "—"
+        cw_s = f"{cw:.1f}" if cw is not None else "—"
+        ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else ""
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |")
 

From 497d2fcc0a5ffb2e652a909a8a75d68ab73b2277 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:13:59 +0800
Subject: [PATCH 13/30] review: replace THRESHOLD_TS heuristic with explicit
 allowlist

The timestamp threshold (`if timestamp < "20260514T2130": return "GPU"`)
silently rewrites any pre-threshold CPU JSON as GPU. Anyone running this
aggregator with their own historical genuine-CPU runs in
latency_results/ would have those mislabeled as GPU and potentially
double-counted via the "most successful entries" tiebreaker in
write_report.

Replace with `PRE_FIX_GPU_FILES`: a frozenset of the exact 8 filenames
known to predate the metadata fix in commit ef96538. Any file not in
the allowlist uses its recorded backend value. Anyone else's historical
files are unaffected.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py | 46 ++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 11307a3..f34acfe 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -7,8 +7,9 @@
 
 Notes on backend identification: post-fix benchmark JSONs (commit ef96538
 onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep
-JSONs hard-code `backend="CPU"`; we backfill those using a one-time timestamp
-threshold (see `backend_of`). Future runs of any backend are unaffected.
+JSONs hard-code `backend="CPU"` even though they were measured on GPU; we
+backfill those using an explicit filename allowlist (see `backend_of`).
+Future runs of any backend are unaffected.
 """
 from __future__ import annotations
 
@@ -20,24 +21,27 @@
 from collections import defaultdict
 from pathlib import Path
 
-# One-time backfill: GPU sweep JSONs from before commit ef96538 ("fix(benchmark):
-# record actual backend (GPU/CPU) in config metadata") have config.backend="CPU"
-# hard-coded. For just those files, the timestamp identifies them as the GPU
-# sweep we ran before the rebuild at ~21:34 on 2026-05-14. Files with a
-# timestamp at or after the threshold have correct metadata and are trusted.
-THRESHOLD_TS = "20260514T2130"
-
-
-def backend_of(timestamp: str, recorded: str) -> str:
-    """Trust the recorded backend, but backfill pre-fix GPU runs.
-
-    Pre-fix files always say "CPU" in metadata even when GPU was active.
-    We override only when (a) the recorded value is "CPU" AND (b) the
-    timestamp predates the metadata fix. New GPU runs (which write
-    backend="GPU" correctly) and any CPU run from any time are trusted
-    as-is.
-    """
-    if recorded == "CPU" and timestamp < THRESHOLD_TS:
+# Backfill for the specific historical GPU sweep files that predate the
+# metadata-recording fix in commit ef96538. Those JSONs hard-code
+# config.backend="CPU" even though they were measured on GPU. We use an
+# explicit filename allowlist (rather than a timestamp threshold) so the
+# rewrite cannot accidentally fire on anyone else's pre-threshold *genuine
+# CPU* JSONs that happen to share latency_results/.
+PRE_FIX_GPU_FILES = frozenset({
+    "benchmark_20260514T174502_k1.json",
+    "benchmark_20260514T180830_k3.json",
+    "benchmark_20260514T183604_k5.json",
+    "benchmark_20260514T190438_k7.json",
+    "benchmark_20260514T193453_k10.json",
+    "benchmark_20260514T200414_k15.json",
+    "benchmark_20260514T203653_k20.json",
+    "benchmark_20260514T210522.json",
+})
+
+
+def backend_of(filename: str, recorded: str) -> str:
+    """Trust the recorded backend except for the listed pre-fix GPU files."""
+    if filename in PRE_FIX_GPU_FILES:
         return "GPU"
     return recorded
 
@@ -64,7 +68,7 @@ def load_runs() -> list[dict]:
         k_label = 0 if skip_retrieval else (k_override if k_override is not None else None)
         if k_label is None:
             continue
-        backend = backend_of(ts, d["config"].get("backend", "CPU"))
+        backend = backend_of(os.path.basename(f), d["config"].get("backend", "CPU"))
         runs.append({
             "file": os.path.basename(f),
             "timestamp": ts,

From f372f8879d0abc401a138644dd3013fd697784ae Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:25:47 +0800
Subject: [PATCH 14/30] review: document :benchmark process model in service
 KDoc

The service runs in android:process=":benchmark", separate from the main
app process. RagPipeline(application) here therefore constructs an
entirely fresh pipeline (Gecko + SQLite + LLM load) in that benchmark
process, not the main app's. Worth documenting because:

1. Application.onCreate() will run a second time when the benchmark
   process spawns.
2. If the main app has the LLM loaded simultaneously, two LLM instances
   may briefly contend for GPU/memory during init.

Add a "Process model" section to the class KDoc explaining both.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../com/example/app/BenchmarkForegroundService.kt  | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index e6e5401..4056d6f 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -40,6 +40,20 @@ import java.util.concurrent.Executors
  * Launched via [BenchmarkActivity] which forwards Intent extras from `am
  * start`. All benchmark logic lives here; the Activity is a thin shim.
  *
+ * **Process model.** Both this service and [BenchmarkActivity] declare
+ * `android:process=":benchmark"` in the manifest, so they run in a
+ * separate process from the main MAM-AI app. That process is fresh on
+ * each `am start`: this service constructs its own [RagPipeline]
+ * (Gecko + SQLite + LLM load) on entry, independent of any pipeline
+ * already loaded in the main app process. Two consequences worth
+ * knowing about:
+ *
+ *  1. The application's `Application` subclass initializes once per
+ *     process — anything in your custom Application.onCreate() will
+ *     run a second time when the benchmark process spawns.
+ *  2. If the main app is also running with the LLM loaded, two LLM
+ *     instances may briefly contend for GPU/memory during init.
+ *
  * Intent extras (forwarded from the Activity):
  *   repeats:Int                Repetitions per query
  *   cooldown_ms:Long           Sleep between runs

From e205fdf078d11854ed7b68b24c36c63f8caad10d Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:26:14 +0800
Subject: [PATCH 15/30] review: document retrieve_k=-1 sentinel and other
 intent-extra defaults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "-1 = use config default" semantic for retrieve_k is non-obvious and
wasn't documented anywhere. Update both KDoc blocks (BenchmarkActivity
and BenchmarkForegroundService) to spell out:

- retrieve_k: pass any value >= 0 to override; pass -1 (or omit) to use
  runtime_config.json's value. The activity normalises -1 → null before
  forwarding to the service.
- repeats default 3, cooldown_ms default 5000 (were missing).
- skip_retrieval and rag_only are mutually exclusive; skip_retrieval
  wins if both are set.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/com/example/app/BenchmarkActivity.kt     | 11 ++++++++---
 .../com/example/app/BenchmarkForegroundService.kt   | 13 +++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index 887f841..f94cc1b 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -22,10 +22,15 @@ import android.util.Log
  *
  * Optional extras:
  *   --ez skip_retrieval true     Skip RAG retrieval (generation only)
- *   --ez rag_only true           Skip the No-RAG mode (k-sweep helper)
+ *   --ez rag_only true           Skip the No-RAG mode (k-sweep helper).
+ *                                Mutually exclusive with skip_retrieval —
+ *                                if both are set, skip_retrieval wins.
  *   --es query_filter short      Filter by category or specific query ID
- *   --ei retrieve_k N            Override retrieval top_k for this session
- *                                (default: use runtime_config.json's value).
+ *   --ei retrieve_k N            Override retrieval top_k for this session.
+ *                                Pass any value >= 0 to override; pass -1
+ *                                (or omit) to use runtime_config.json's
+ *                                value. The activity normalises -1 to null
+ *                                before forwarding to the service.
  */
 class BenchmarkActivity : Activity() {
 
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 4056d6f..9a37df3 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -55,12 +55,17 @@ import java.util.concurrent.Executors
  *     instances may briefly contend for GPU/memory during init.
  *
  * Intent extras (forwarded from the Activity):
- *   repeats:Int                Repetitions per query
- *   cooldown_ms:Long           Sleep between runs
+ *   repeats:Int                Repetitions per query (default 3)
+ *   cooldown_ms:Long           Sleep between runs in ms (default 5000)
  *   skip_retrieval:Boolean     Run No-RAG mode only
  *   rag_only:Boolean           Run RAG mode only
- *   query_filter:String?       Category or query ID filter
- *   retrieve_k:Int (>=0)       Override retrieval top_k; -1 = use config
+ *                              (skip_retrieval and rag_only are mutually
+ *                              exclusive; skip_retrieval wins if both set)
+ *   query_filter:String?       Category or specific query ID filter
+ *   retrieve_k:Int             Override retrieval top_k for this session.
+ *                              Pass -1 (or omit) to use the value from
+ *                              runtime_config.json. Any value >= 0 takes
+ *                              effect for every query in this run.
  */
 class BenchmarkForegroundService : Service() {
 

From 574601c99adc2c8e1a8fca5e69ae8637ef008faf Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:26:37 +0800
Subject: [PATCH 16/30] review: mark BenchmarkForegroundService dataSync type
 as dev-only
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

foregroundServiceType="dataSync" is documented for "transferring data
between a device and the cloud or other peers" — clearly a misuse for an
on-device latency benchmark. Google Play's FGS-type enforcement would
reject this on submission.

Acceptable here because BenchmarkForegroundService is only launched via
`adb shell am start` for in-house benchmarking and never appears in any
user-facing flow. Add an explicit DEV-ONLY comment documenting:

- the type is technically wrong,
- the correct type to switch to if we ever ship benchmark capabilities
  (specialUse + the FOREGROUND_SERVICE_SPECIAL_USE permission +
  PROPERTY_SPECIAL_USE_FGS_SUBTYPE),
- this declaration should be stripped from any Play Store build.

No runtime change; only a comment to prevent surprises later.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 app/android/app/src/main/AndroidManifest.xml | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
index c805436..d19101a 100644
--- a/app/android/app/src/main/AndroidManifest.xml
+++ b/app/android/app/src/main/AndroidManifest.xml
@@ -54,7 +54,17 @@
         <!-- Foreground service for the on-device latency benchmark. Holds a
              PARTIAL_WAKE_LOCK + sticky notification so the work survives
              screen-off and device-lock through hours-long k-sweeps. Runs
-             in its own :benchmark process to keep the main app isolated. -->
+             in its own :benchmark process to keep the main app isolated.
+             DEV-ONLY: foregroundServiceType="dataSync" is technically a
+             misuse here (no actual data sync) — Google Play would reject
+             this declaration. Acceptable because BenchmarkForegroundService
+             is launched only via `adb shell am start` for in-house
+             benchmarking; it never appears in any user-facing flow and
+             this manifest entry should be stripped from any Play Store
+             build. If we ever need to ship benchmark capabilities, switch
+             to foregroundServiceType="specialUse" and add the corresponding
+             android.permission.FOREGROUND_SERVICE_SPECIAL_USE permission
+             plus the PROPERTY_SPECIAL_USE_FGS_SUBTYPE property. -->
         <service
             android:name=".BenchmarkForegroundService"
             android:foregroundServiceType="dataSync"

From d3d475cf2ab9d91709766826bc5abb83d402758f Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 08:28:14 +0800
Subject: [PATCH 17/30] review: guard aggregate_overall subscripts against
 empty dicts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aggregate_overall returns {} when every run in a (backend, k) cell errored
out (all results have an error field). Callers then subscript with
["median"] / ["p95"] and crash with KeyError.

Today no cell fails completely — e.g. GPU k=20 has 24/54 errors but still
30 successful runs, so the dict is populated. But a future sweep where
every run in a cell errors (entirely possible at higher k once the
4096-token wall is hit broadly, or if the LLM dies during init) would
crash the report generation.

Switch all four subscript sites in write_report() to .get("median") and
.get("p95") so an empty dict propagates as None, which fmt_ms / fmt_s
already render as "—".

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index f34acfe..6d43656 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -192,9 +192,9 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         # ratio
         ratio = ""
         if gpu_run and cpu_run:
-            gov = aggregate_overall(gpu_run["data"], "total_query_ms")["median"]
-            cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"]
-            if gov is not None and gov > 0:
+            gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median")
+            cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median")
+            if gov is not None and cov is not None and gov > 0:
                 ratio = f"{cov / gov:.2f}×"
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
@@ -208,8 +208,8 @@ def write_report(runs: list[dict], out_path: Path) -> None:
         gpu_run = matrix.get(("GPU", k))
         cpu_run = matrix.get(("CPU", k))
         doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
-        gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None
-        cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None
+        gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None
         # Explicit None checks; also guard against div-by-zero on a 0 median.
         ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else ""
         label = "**0 (no-RAG)**" if k == 0 else str(k)
@@ -226,8 +226,8 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     for k in all_ks:
         gpu_run = matrix.get(("GPU", k))
         cpu_run = matrix.get(("CPU", k))
-        gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None
-        cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None
+        gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None
         ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else ""
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
@@ -240,8 +240,8 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     for k in all_ks:
         gpu_run = matrix.get(("GPU", k))
         cpu_run = matrix.get(("CPU", k))
-        gv = aggregate_overall(gpu_run["data"], "total_query_ms")["p95"] if gpu_run else None
-        cv = aggregate_overall(cpu_run["data"], "total_query_ms")["p95"] if cpu_run else None
+        gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None
         label = "**0 (no-RAG)**" if k == 0 else str(k)
         md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |")
     md.append("")

From e29443ec021d127827eda5da34a4a62385d654ac Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:22:29 +0800
Subject: [PATCH 18/30] =?UTF-8?q?review:=20fix=20wrong=20survivor=20count?=
 =?UTF-8?q?=20in=20k=3D20=20narrative=20(8=20=E2=86=92=2010)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Report claimed "The 8 surviving queries on either side" — but the math
doesn't check out: 18 queries × 3 reps = 54 runs, of which 24 errored,
i.e. 8 unique queries failed × 3 reps each. So 18 − 8 = 10 unique queries
survived, not 8.

Corrected to "The other 10 queries (10 × 3 reps = 30 successful runs)".

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py         | 2 +-
 evaluation/reports/latency_report_v2.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 6d43656..32c39df 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -263,7 +263,7 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ")
     md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ")
     md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ")
-    md.append("The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter.")
+    md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.")
     md.append("")
     md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ")
     md.append("deployment budget at this depth even when the request fits in the context window.")
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index 25356b3..3edee07 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,6 +1,6 @@
 # MAM-AI On-Device Latency Sweep — GPU vs CPU
 
-_Generated: 2026-05-15T07:56:55_
+_Generated: 2026-05-15T10:22:29_
 
 
 ## Device & stack
@@ -96,7 +96,7 @@ Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8
 backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — 
 the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of 
 the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
-The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter.
+The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.
 
 Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any 
 deployment budget at this depth even when the request fits in the context window.

From 57722db6b15f5350ff4a757e56e1e90410b93086 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:23:03 +0800
Subject: [PATCH 19/30] review: reject --retrieve-k 0 (footgun)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The service normalises -1 to null but treats any value >= 0 as an
explicit override, so passing --retrieve-k 0 would silently call
RetrievalConfig.create(0, ...) — a confusing footgun. If you want to
disable retrieval entirely, --no-retrieval is the proper flag.

argparse now rejects --retrieve-k < 1 with a clear error pointing to
--no-retrieval. Negative values were already filtered by the service's
takeIf { it >= 0 } but a 0 was slipping through.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/benchmark_latency.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index e5c02a9..7486f96 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -490,6 +490,11 @@ def main():
 
     if args.no_retrieval and args.rag_only:
         parser.error("--no-retrieval and --rag-only are mutually exclusive")
+    if args.retrieve_k is not None and args.retrieve_k < 1:
+        # The service treats any value >= 0 as an explicit override. Passing 0
+        # would call RetrievalConfig.create(0, …), which is a silent footgun
+        # — use --no-retrieval if you actually want to disable retrieval.
+        parser.error("--retrieve-k must be >= 1; use --no-retrieval to disable retrieval entirely")
 
     print("=" * 60)
     print("MAM-AI On-Device Latency Benchmark")

From 42326ee491cb486fcb57b4fd2698ed9b64d9cf5e Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:23:52 +0800
Subject: [PATCH 20/30] review: acquire wake lock in onStartCommand after
 startForeground
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the PARTIAL_WAKE_LOCK was acquired in onCreate(), before
onStartCommand calls startForegroundCompat. Two problems with that:

1. If the system creates the service but onStartCommand is delayed or
   never invoked (bind-only path, framework deferral), the wake lock is
   held without a foreground notification — and Android 12+ can trip
   the foreground-service-start-while-in-background restriction in that
   state.
2. Even on the normal path, there is a brief window where the CPU is
   pinned awake without the user-visible notification that justifies it.

Move the wake-lock acquisition into onStartCommand, immediately AFTER
startForegroundCompat. The lock is now strictly paired with the
foreground notification's lifetime. Guarded with `wakeLock == null` so
duplicate onStartCommand invocations (which can happen when another
start intent reaches the already-running service) don't try to
re-acquire.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../example/app/BenchmarkForegroundService.kt | 32 ++++++++++++-------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 9a37df3..d341c58 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -91,24 +91,32 @@ class BenchmarkForegroundService : Service() {
     override fun onCreate() {
         super.onCreate()
         ensureChannel(this)
+    }
+
+    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
+        // Promote to foreground FIRST so the wake lock is always paired with
+        // a visible notification (Android 12+ enforces this pairing for new
+        // foreground-service starts). Acquiring the wake lock in onCreate
+        // before startForeground would briefly hold the CPU awake without a
+        // notification — and would leak if onStartCommand never ran (e.g.
+        // bind-only path or framework deferral).
+        startForegroundCompat("MAM-AI benchmark starting…", -1, 0)
 
         // PARTIAL_WAKE_LOCK lets the CPU keep running through screen-off.
         // Vendor power managers (OPPO ColorOS, Xiaomi MIUI, etc.) respect
         // wake locks held by foreground services — they aggressively
         // release locks held by background activities.
-        val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager
-        wakeLock = powerManager.newWakeLock(
-            PowerManager.PARTIAL_WAKE_LOCK,
-            "mam-ai:benchmark"
-        ).apply {
-            setReferenceCounted(false)
-            acquire(6L * 60L * 60L * 1000L)  // 6 h failsafe
+        if (wakeLock == null) {
+            val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager
+            wakeLock = powerManager.newWakeLock(
+                PowerManager.PARTIAL_WAKE_LOCK,
+                "mam-ai:benchmark"
+            ).apply {
+                setReferenceCounted(false)
+                acquire(6L * 60L * 60L * 1000L)  // 6 h failsafe
+            }
+            Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired")
         }
-        Log.w(BENCH_TAG, "[BENCHMARK] Service onCreate, PARTIAL_WAKE_LOCK acquired")
-    }
-
-    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
-        startForegroundCompat("MAM-AI benchmark starting…", -1, 0)
 
         val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS
         val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS

From 25a1a42411e4b45faa8b7feef537618c52e5f6c2 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:24:32 +0800
Subject: [PATCH 21/30] review: executor.shutdownNow() + brief await to avoid
 race in onDestroy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The single-thread executor is what ferries pipeline calls off the
coroutine dispatchers (LiteRT-LM generation, Gecko retrieval), and
scope.cancel() does NOT propagate cancellation into those blocking
native calls. A plain executor.shutdown() then returns immediately and
leaves the worker thread alive, keeping the :benchmark process running
until generation finishes naturally — stopForeground might run with
the worker still busy.

Use shutdownNow() to interrupt the worker, plus a brief 2 s
awaitTermination() so a worker that's tearing down cleanly gets a chance
to do so. If it doesn't, the OS will reclaim the process eventually.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../example/app/BenchmarkForegroundService.kt  | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index d341c58..b880d94 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -151,9 +151,21 @@ class BenchmarkForegroundService : Service() {
         wakeLock = null
         scope.cancel()
         // Shut down the single-thread executor that ferries pipeline calls off
-        // the coroutine dispatchers. Otherwise its worker thread keeps the
-        // :benchmark process alive after the service stops.
-        executor.shutdown()
+        // the coroutine dispatchers. We use shutdownNow() to interrupt the
+        // worker thread: scope.cancel() does not propagate cancellation into
+        // a blocking native call (e.g. mid-flight LiteRT-LM generation),
+        // and a plain shutdown() would return immediately and leave the
+        // thread running until the call finishes naturally — keeping the
+        // :benchmark process alive after stopForeground.
+        executor.shutdownNow()
+        // Brief best-effort await so we don't yank the rug if the worker is
+        // tearing down cleanly. If it doesn't finish in 2 s we move on; the
+        // OS will eventually kill the process anyway.
+        try {
+            executor.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS)
+        } catch (_: InterruptedException) {
+            Thread.currentThread().interrupt()
+        }
         @Suppress("DEPRECATION")
         stopForeground(true)
     }

From 2b5cb9c98f32005a0120a63e15b2f17a60a22d01 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:25:09 +0800
Subject: [PATCH 22/30] review: warn loudly when JSON has no recorded backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

backend_of's fallback was `d["config"].get("backend", "CPU")`, which
silently labels any backend-less JSON as "CPU". Pre-fix files are
handled by the explicit allowlist; the silent default only fires on
"unexpected" JSONs — which is exactly when a future regression in
BenchmarkForegroundService (e.g. metadata accidentally dropped) would
slip past us.

Now: if config.backend is missing AND the file isn't on the pre-fix
allowlist, print a warning to stderr explaining the assumption and
defaulting to "CPU". Post-fix JSONs always carry the field, so this
warning only fires when something is genuinely off.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 32c39df..229c3f1 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -18,6 +18,7 @@
 import json
 import os
 import statistics
+import sys
 from collections import defaultdict
 from pathlib import Path
 
@@ -68,7 +69,22 @@ def load_runs() -> list[dict]:
         k_label = 0 if skip_retrieval else (k_override if k_override is not None else None)
         if k_label is None:
             continue
-        backend = backend_of(os.path.basename(f), d["config"].get("backend", "CPU"))
+        # The metadata fix in commit ef96538 ensures post-fix runs record
+        # config.backend. If it's missing, the JSON predates that fix — only
+        # safe if the filename is on the allowlist; otherwise warn loudly
+        # rather than silently defaulting (which would mask future GPU runs
+        # written by a regressed BenchmarkForegroundService).
+        recorded_backend = d["config"].get("backend")
+        if recorded_backend is None:
+            if os.path.basename(f) not in PRE_FIX_GPU_FILES:
+                print(
+                    f"WARN: {os.path.basename(f)} has no config.backend "
+                    "field and is not on the pre-fix allowlist; defaulting "
+                    "to CPU. If this was actually a GPU run, fix the source.",
+                    file=sys.stderr,
+                )
+            recorded_backend = "CPU"
+        backend = backend_of(os.path.basename(f), recorded_backend)
         runs.append({
             "file": os.path.basename(f),
             "timestamp": ts,

From 291725c8c780743dd36904cca7c4f5f4529ab2ed Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:25:49 +0800
Subject: [PATCH 23/30] review: warn in logcat when both skip_retrieval and
 rag_only are set

The Python wrapper rejects --no-retrieval + --rag-only via parser.error()
before the launch ever fires. But a direct `am start --ez skip_retrieval
true --ez rag_only true ...` bypasses Python entirely and silently runs
in No-RAG mode (skipRetrieval wins) with no indication anything is off.

Add a Log.w at the same priority as other [BENCHMARK] markers so the
mismatch is visible in `adb logcat -s mam-ai-bench:W` output during
debugging.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/com/example/app/BenchmarkForegroundService.kt  | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index b880d94..f971c2f 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -265,7 +265,13 @@ class BenchmarkForegroundService : Service() {
             return
         }
 
-        // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins).
+        // skipRetrieval and ragOnly are mutually exclusive. The Python wrapper
+        // (benchmark_latency.py) rejects this combination upfront via
+        // parser.error(); a direct `am start` could still pass both, so log a
+        // visible warning in logcat instead of silently picking one.
+        if (skipRetrieval && ragOnly) {
+            Log.w(BENCH_TAG, "[BENCHMARK] WARNING: skip_retrieval AND rag_only both set; skip_retrieval wins (No-RAG only).")
+        }
         val retrievalModes = when {
             skipRetrieval -> listOf(false)
             ragOnly -> listOf(true)

From f77effc0b301af3746cfc32fcf0e5bbf9ab26433 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:48:18 +0800
Subject: [PATCH 24/30] review: bump PARTIAL_WAKE_LOCK failsafe from 6h to 24h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 6 h ceiling could expire mid-sweep on long CPU runs — the full GPU
+ CPU k-sweep documented in latency_report_v2.md took ~7 h end-to-end,
and a CPU-only sweep across k ∈ {1, 3, 5, 7, 10, 15} hit similar
totals. When the lock auto-released, the OS could idle the CPU and
the benchmark would silently stall (no failure log, just no progress).

Bump to 24 h, with a comment that anything longer should switch to
periodic re-acquire instead of bumping the constant further.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../kotlin/com/example/app/BenchmarkForegroundService.kt   | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index f971c2f..ddc0208 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -113,7 +113,12 @@ class BenchmarkForegroundService : Service() {
                 "mam-ai:benchmark"
             ).apply {
                 setReferenceCounted(false)
-                acquire(6L * 60L * 60L * 1000L)  // 6 h failsafe
+                // 24 h failsafe. Long CPU sweeps (full series × repeats × all k)
+                // have already run ~7 h end-to-end; pushing to 24 h leaves
+                // plenty of slack so the lock can't silently expire mid-run.
+                // If we ever start running sweeps longer than this, switch
+                // to a periodic re-acquire instead of bumping further.
+                acquire(24L * 60L * 60L * 1000L)
             }
             Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired")
         }

From 882e738ea859afabaf05d3b5ffaecd635e24b7cb Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:49:10 +0800
Subject: [PATCH 25/30] review: guard against double-start in onStartCommand
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scope.launch { runBenchmark(...) } was unconditional, so a re-delivered
intent (another `am start` before stopSelf() completes) would spawn a
second coroutine running on the same single-threaded executor and
writing to the same benchmark_results.json — both timings and the
output JSON would be corrupted.

Add a `benchmarkStarted` volatile flag that's set on first entry. Any
later onStartCommand call returns immediately with a logcat warning,
keeping the in-flight run intact.

START_NOT_STICKY makes this unlikely in practice, but the right belt-
and-braces fix is cheap and removes a race even on edge cases (e.g.
process recreated after low-memory kill before stopSelf returned).

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../com/example/app/BenchmarkForegroundService.kt | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index ddc0208..566728c 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -85,6 +85,11 @@ class BenchmarkForegroundService : Service() {
     private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
     private val executor = Executors.newSingleThreadExecutor()
     private var wakeLock: PowerManager.WakeLock? = null
+    // Set once when the first onStartCommand fires runBenchmark. Subsequent
+    // intent re-deliveries (e.g. another `am start` before stopSelf() runs)
+    // see this true and are no-ops, so we never end up with two concurrent
+    // coroutines sharing the executor and the same output JSON.
+    @Volatile private var benchmarkStarted = false
 
     override fun onBind(intent: Intent?): IBinder? = null
 
@@ -123,6 +128,16 @@ class BenchmarkForegroundService : Service() {
             Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired")
         }
 
+        // Reject re-deliveries before the benchmark coroutine completes. A
+        // second am start while the first is in flight would otherwise spawn
+        // a parallel coroutine and clobber the shared RagPipeline / output
+        // JSON.
+        if (benchmarkStarted) {
+            Log.w(BENCH_TAG, "[BENCHMARK] WARNING: ignoring re-delivery; benchmark is already running.")
+            return START_NOT_STICKY
+        }
+        benchmarkStarted = true
+
         val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS
         val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS
         val skipRetrieval = intent?.getBooleanExtra("skip_retrieval", false) ?: false

From 7c2360feb6127ceab01edd0e0dc302aa963ef581 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:49:34 +0800
Subject: [PATCH 26/30] review: use BENCH_TAG for query-failure log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The catch handler in runQuery() was the only [BENCHMARK] log line in
the file using TAG="mam-ai" instead of BENCH_TAG="mam-ai-bench". This
made `adb logcat -s mam-ai-bench:E` filter out exactly the messages
most worth surfacing — query-level errors.

One-line fix.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../main/kotlin/com/example/app/BenchmarkForegroundService.kt   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 566728c..9fde78d 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -460,7 +460,7 @@ class BenchmarkForegroundService : Service() {
             }
         } catch (e: Exception) {
             error = e.message
-            Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e)
+            Log.e(BENCH_TAG, "[BENCHMARK] Query failed: ${e.message}", e)
         }
 
         val qEnd = System.currentTimeMillis()

From 8cb9712b54a9b292cab6616a056c723cab656d50 Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:49:54 +0800
Subject: [PATCH 27/30] review: log skipped JSON files in aggregate_k_sweep

load_runs() was silently dropping JSONs that didn't match the 54-run
canonical-sweep shape (missing config/results keys, or < 30 results
which is the smoke-test guard). For users running a narrow sweep
(e.g. --filter long_01 --repeats 3 yields 3 results), the file would
silently never appear in the report with no indication why.

Log SKIP lines to stderr with the file name and reason. The output is
still clean for normal runs (only emits when something is actually
dropped).

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 229c3f1..24329bc 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -60,9 +60,19 @@ def load_runs() -> list[dict]:
         except (json.JSONDecodeError, OSError):
             continue
         if "config" not in d or "results" not in d:
+            print(f"SKIP: {os.path.basename(f)} — missing config or results key", file=sys.stderr)
             continue
         if len(d["results"]) < 30:
-            continue  # skip ad-hoc smoke tests; the canonical sweep is 54 runs
+            # Skip ad-hoc smoke tests (the canonical sweep is 54 runs). Log so
+            # that a legitimate narrow sweep (--filter long_01, single-category)
+            # isn't silently dropped from the report.
+            print(
+                f"SKIP: {os.path.basename(f)} — {len(d['results'])} results "
+                "(< 30 threshold for canonical sweeps; pass it through if it "
+                "should appear in the matrix)",
+                file=sys.stderr,
+            )
+            continue
         ts = os.path.basename(f).replace("benchmark_", "").split(".")[0].split("_")[0]
         k_override = d["config"].get("retrieval_top_k_override")
         skip_retrieval = d["config"].get("skip_retrieval", False)

From f971145b9e15771f2f8cea12d3d13641013111dc Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:50:39 +0800
Subject: [PATCH 28/30] review: derive Methodology text from sample run's
 config, not hardcoded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Methodology paragraph hardcoded "18 queries × … 10-second cooldown"
even though the JSONs carry the actual config.repeats, config.cooldown_ms,
and a results count that proves the math. If a future run uses different
defaults (or this script is pointed at a different sweep), the
methodology text would silently lie.

Now reads from the sample run's config: pulls repeats, cooldown_ms,
results count; infers (queries × modes) from results / repeats. Output
for the current data set is unchanged ("18 (query × mode) cells × 3
repeats = 54 timed runs … 10-second cooldown") because that's what the
JSONs actually say.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py         | 28 ++++++++++++++++++++-----
 evaluation/reports/latency_report_v2.md |  7 ++-----
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 24329bc..4c6090b 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -186,16 +186,34 @@ def write_report(runs: list[dict], out_path: Path) -> None:
     md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU")
     md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000")
     md.append("")
+    # Pull the actual values from the sample run's config instead of hard-coding
+    # text that can lie. If different runs used different settings, this won't
+    # catch that — but we'd rather report the sample's truth than fabricate a
+    # round-number claim.
+    sample_cfg = sample["data"].get("config", {})
+    sample_repeats = sample_cfg.get("repeats", "?")
+    sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0
+    sample_n_results = len(sample["data"]["results"])
+    # Infer queries × modes from total runs / repeats. Default to "?" if the
+    # math doesn't divide evenly.
+    queries_x_modes: object = "?"
+    if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0:
+        queries_x_modes = sample_n_results // sample_repeats
     md.append("## Methodology\n")
-    md.append("Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. ")
-    md.append("Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs ")
-    md.append("for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives ")
-    md.append("screen-off and device-lock; OPPO Hans whitelist set manually.")
+    md.append(
+        f"Per backend × k configuration: {queries_x_modes} (query × mode) cells "
+        f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a "
+        f"No-RAG baseline per backend (k=0 via `--no-retrieval`). "
+        f"{sample_cooldown_s:g}-second cooldown between runs for thermal "
+        "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so "
+        "the run survives screen-off and device-lock; OPPO Hans whitelist set "
+        "manually."
+    )
     md.append("")
     md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.")
     md.append("- `decode` is first-token to last-token.")
     md.append("- `total_query` is everything: `retrieval + TTFT + decode`.")
-    md.append("- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).")
+    md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).")
     md.append("")
 
     # ─────────── Headline table: total_query_ms by (backend, k) ───────────
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index 3edee07..80e6348 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,6 +1,6 @@
 # MAM-AI On-Device Latency Sweep — GPU vs CPU
 
-_Generated: 2026-05-15T10:22:29_
+_Generated: 2026-05-15T10:50:22_
 
 
 ## Device & stack
@@ -13,10 +13,7 @@ _Generated: 2026-05-15T10:22:29_
 
 ## Methodology
 
-Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. 
-Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs 
-for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives 
-screen-off and device-lock; OPPO Hans whitelist set manually.
+Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually.
 
 - `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.
 - `decode` is first-token to last-token.

From 5fa5c6eb799e169e2c94ccb75267860e078cc34a Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:51:18 +0800
Subject: [PATCH 29/30] review: use statistics.quantiles for p95 instead of
 int(n*0.95) index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous formula `int(len(s) * 0.95)` picks a single sorted element
with no interpolation, and collapses to max for any sample size
n <= 20. Even for the per-category short bucket with 24 runs,
int(24*0.95) = 22 (the 23rd of 24 sorted values), which is close to but
not actually the 95th percentile. For a hypothetical narrower sample
with n=3 (e.g. single-query small sweep), int(2.85) = 2 = the max, so
p95 == max by construction.

Centralise the calculation in a `_p95(values)` helper that uses
`statistics.quantiles(values, n=20, method="exclusive")[18]` — the
linear-interpolation 95th percentile from a 20-quantile partition.
Falls back to max only when n < 2 (genuinely no quantiles to compute).

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 evaluation/aggregate_k_sweep.py         | 23 +++++++++++++++++++----
 evaluation/reports/latency_report_v2.md |  8 ++++----
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
index 4c6090b..d11e390 100644
--- a/evaluation/aggregate_k_sweep.py
+++ b/evaluation/aggregate_k_sweep.py
@@ -105,6 +105,23 @@ def load_runs() -> list[dict]:
     return runs
 
 
+def _p95(values: list[float]) -> int | None:
+    """95th percentile via linear-interpolation 20-quantile partition.
+
+    `statistics.quantiles(data, n=20)` returns 19 cut points dividing the
+    data into 20 equal-frequency groups; index 18 is the 95th percentile.
+    For a single-value sample (n == 1) there are no cut points to
+    compute, so we return that value — same result as the previous
+    `int(len(s)*0.95)` index formula, which did not interpolate and
+    collapsed to max for any n <= 20.
+    """
+    if not values:
+        return None
+    if len(values) < 2:
+        return int(values[0])
+    return int(statistics.quantiles(values, n=20, method="exclusive")[18])
+
+
 def aggregate_per_category(d: dict, key: str) -> dict[str, dict]:
     """Per-category {median, p95, n} for the given timing field."""
     cat_vals: dict[str, list] = defaultdict(list)
@@ -116,11 +133,10 @@ def aggregate_per_category(d: dict, key: str) -> dict[str, dict]:
     for c, vs in cat_vals.items():
         if not vs:
             continue
-        s = sorted(vs)
         out[c] = {
             "n": len(vs),
             "median": int(statistics.median(vs)),
-            "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]),
+            "p95": _p95(vs),
         }
     return out
 
@@ -129,11 +145,10 @@ def aggregate_overall(d: dict, key: str) -> dict:
     vs = [r[key] for r in d["results"] if not r.get("error")]
     if not vs:
         return {}
-    s = sorted(vs)
     return {
         "n": len(vs),
         "median": int(statistics.median(vs)),
-        "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]),
+        "p95": _p95(vs),
     }
 
 
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
index 80e6348..c6745a6 100644
--- a/evaluation/reports/latency_report_v2.md
+++ b/evaluation/reports/latency_report_v2.md
@@ -1,6 +1,6 @@
 # MAM-AI On-Device Latency Sweep — GPU vs CPU
 
-_Generated: 2026-05-15T10:50:22_
+_Generated: 2026-05-15T10:51:06_
 
 
 ## Device & stack
@@ -68,11 +68,11 @@ the model writing *longer answers* when given more context (more material to dra
 |---:|---:|---:|
 | **0 (no-RAG)** | 26.1 | 38.4 |
 | 1 | 26.1 | 37.1 |
-| 3 | 30.2 | 64.3 |
+| 3 | 30.3 | 64.3 |
 | 5 | 30.7 | 74.6 |
-| 7 | 35.1 | 81.7 |
+| 7 | 35.1 | 81.8 |
 | 10 | 29.0 | 84.5 |
-| 15 | 30.6 | 112.6 |
+| 15 | 30.6 | 112.7 |
 | 20 | 35.3 | 104.9 |
 
 ## Errors and the 4096-token context wall

From 9ecc54eaca29329da98d46bcb00c6706fc18419d Mon Sep 17 00:00:00 2001
From: nmrenyi <nmrenyi@outlook.com>
Date: Fri, 15 May 2026 10:51:45 +0800
Subject: [PATCH 30/30] review: use STOP_FOREGROUND_REMOVE overload on API 24+

stopForeground(boolean) has been deprecated since Android 13. Replace
with the SDK-aware form: STOP_FOREGROUND_REMOVE on API 24+ (where the
int overload was introduced), fall back to the boolean variant only on
older devices (where it isn't deprecated).

Drops the @Suppress("DEPRECATION") on the modern path; we still
suppress on the legacy path because the compiler flags the boolean
variant as deprecated even though it is the only overload available
on pre-API-24 devices.

Per Copilot review on PR #57.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../com/example/app/BenchmarkForegroundService.kt      | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
index 9fde78d..e1ee93c 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -186,8 +186,14 @@ class BenchmarkForegroundService : Service() {
         } catch (_: InterruptedException) {
             Thread.currentThread().interrupt()
         }
-        @Suppress("DEPRECATION")
-        stopForeground(true)
+        // Use the non-deprecated overload on API 24+ (where it was introduced).
+        // The boolean variant has been deprecated since Android 13.
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.N) {
+            stopForeground(STOP_FOREGROUND_REMOVE)
+        } else {
+            @Suppress("DEPRECATION")
+            stopForeground(true)
+        }
     }
 
     // ── Notification plumbing ────────────────────────────────────────────