From fd85cd7ca742a08ee831ca3a6406ab701ea4ebbf Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 14:24:39 +0800 Subject: [PATCH 01/30] feat(benchmark): add --retrieve-k override for per-k latency sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets the latency benchmark vary retrieval top_k without rebuilding the APK or editing runtime_config.json. One build + install up front, then sweep k via the CLI flag — needed to bound k_max on the Snapdragon 8 Elite + GPU stack now that TTFT (~1–2 s at k=3) is no longer the binding constraint. Wiring: - RagPipeline.generateResponse() gains an optional retrieveKOverride: Int? parameter (default null). When non-null it replaces retrievalConfig.top_k for that call only; production callers leave it null. Param added at the end of the list so RagStream's positional call is unaffected. - BenchmarkActivity reads a "retrieve_k" Intent extra (-1 sentinel = no override), threads it through runBenchmark → runQuery → generateResponse, and records "retrieval_top_k_override" in the config block of the results JSON. - benchmark_latency.py adds --retrieve-k N, forwards via am start --ei retrieve_k N, and appends "_kN" to the output filename so a sweep across k values is legible. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/example/app/BenchmarkActivity.kt | 23 +++++++++++++++---- .../kotlin/com/example/app/RagPipeline.kt | 11 +++++++-- evaluation/benchmark_latency.py | 18 +++++++++++---- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt index bce61ea..bbe9fa9 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt @@ -32,6 +32,9 @@ import java.util.concurrent.Executors * Optional extras: * --ez skip_retrieval true Skip RAG retrieval (generation only) * --es query_filter short Filter by category or specific query ID + * --ei retrieve_k N Override retrieval top_k for this session + * (default: use runtime_config.json's value). + * Used by the per-k latency sweep. */ class BenchmarkActivity : Activity() { @@ -68,10 +71,12 @@ class BenchmarkActivity : Activity() { val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false) val queryFilter = intent.getStringExtra("query_filter") + // -1 sentinel = no override; any non-negative value overrides runtime_config's top_k. + val retrieveKOverride: Int? 
= intent.getIntExtra("retrieve_k", -1).takeIf { it >= 0 } scope.launch { try { - runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter) + runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter, retrieveKOverride) } catch (t: Throwable) { Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) Log.w(BENCH_TAG, "[BENCHMARK] FAILED") @@ -96,6 +101,7 @@ class BenchmarkActivity : Activity() { cooldownMs: Long, skipRetrieval: Boolean, queryFilter: String?, + retrieveKOverride: Int?, ) { val benchmarkStart = System.currentTimeMillis() val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date()) @@ -205,7 +211,7 @@ class BenchmarkActivity : Activity() { logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr") val preMemory = collectMemoryInfo() - val result = runQuery(pipeline, query.text, useRetrieval) + val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride) val postMemory = collectMemoryInfo() val decodeTps = if (result.decodeMs > 0) @@ -261,6 +267,9 @@ class BenchmarkActivity : Activity() { put("cooldown_ms", cooldownMs) put("skip_retrieval", skipRetrieval) put("query_filter", queryFilter ?: JSONObject.NULL) + // retrieval_top_k_override is null when the session uses runtime_config.json's + // value; non-null records the override value used for this whole session. 
+ put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) put("model", "gemma-4-E4B-it.litertlm") put("backend", "CPU") put("max_tokens", 32000) @@ -302,7 +311,12 @@ class BenchmarkActivity : Activity() { val error: String?, ) - private suspend fun runQuery(pipeline: RagPipeline, queryText: String, useRetrieval: Boolean): QueryResult { + private suspend fun runQuery( + pipeline: RagPipeline, + queryText: String, + useRetrieval: Boolean, + retrieveKOverride: Int?, + ): QueryResult { var retrievalTimeMs = 0L var numDocs = 0 var firstTokenTime = 0L @@ -328,7 +342,8 @@ class BenchmarkActivity : Activity() { if (firstTokenTime == 0L && partial.isNotEmpty()) { firstTokenTime = System.currentTimeMillis() } - } + }, + retrieveKOverride = retrieveKOverride, ) } } catch (e: Exception) { diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt index 19f582e..e13e391 100644 --- a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt +++ b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt @@ -219,7 +219,12 @@ class RagPipeline(application: Application) { } } - /** Generates the response from the LLM with conversation history support. */ + /** Generates the response from the LLM with conversation history support. + * + * [retrieveKOverride] — when non-null, replaces `retrievalConfig.top_k` + * for this call only. Used by [BenchmarkActivity] for the per-k latency + * sweep; production callers leave it null and inherit the runtime config. + */ suspend fun generateResponse( prompt: String, history: List>, @@ -227,6 +232,7 @@ class RagPipeline(application: Application) { language: String = "en", retrievalListener: (docs: List) -> Unit, generationListener: (partial: String, done: Boolean) -> Unit, + retrieveKOverride: Int? 
= null, ): String = coroutineScope { awaitLlmReady() @@ -235,10 +241,11 @@ class RagPipeline(application: Application) { val qStart = System.currentTimeMillis() val docs = if (useRetrieval) { + val effectiveTopK = retrieveKOverride ?: retrievalConfig.getInt("top_k") val retrievalRequest = RetrievalRequest.create( prompt, RetrievalConfig.create( - retrievalConfig.getInt("top_k"), + effectiveTopK, retrievalConfig.getDouble("similarity_threshold").toFloat(), TaskType.RETRIEVAL_QUERY, ), diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index 5611e21..77f177d 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -12,6 +12,7 @@ python evaluation/benchmark_latency.py --filter long_01 # Single specific query python evaluation/benchmark_latency.py --no-retrieval # Skip RAG retrieval python evaluation/benchmark_latency.py --cooldown 10000 # Longer cooldown (thermal) + python evaluation/benchmark_latency.py --retrieve-k 5 # Override retrieval top_k for this session """ import argparse @@ -103,7 +104,7 @@ def clear_logcat(device_serial=None): def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, - skip_retrieval=False, query_filter=None): + skip_retrieval=False, query_filter=None, retrieve_k=None): """Launch BenchmarkActivity via ADB.""" cmd = _adb(device_serial) + [ "shell", "am", "start", @@ -115,6 +116,8 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, cmd += ["--ez", "skip_retrieval", "true"] if query_filter: cmd += ["--es", "query_filter", query_filter] + if retrieve_k is not None: + cmd += ["--ei", "retrieve_k", str(retrieve_k)] result = subprocess.run(cmd, capture_output=True, text=True) if "Error" in result.stderr: @@ -460,6 +463,10 @@ def main(): help="Skip RAG retrieval (generation only)") parser.add_argument("--filter", type=str, default=None, help="Filter by category (short/medium/long) or query ID (e.g., long_01)") + parser.add_argument("--retrieve-k", type=int, 
default=None, + help="Override retrieval top_k for this session " + "(default: use runtime_config.json's value, currently 3). " + "Used for the per-k latency sweep.") parser.add_argument("--output-dir", type=str, default="evaluation/latency_results", help="Directory for output files") parser.add_argument("--device", type=str, default=None, @@ -494,13 +501,15 @@ def main(): clear_logcat(args.device) # Launch benchmark - print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}") + k_msg = f", retrieve_k={args.retrieve_k}" if args.retrieve_k is not None else "" + print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}{k_msg}") launch_benchmark( device_serial=args.device, repeats=args.repeats, cooldown_ms=args.cooldown, skip_retrieval=args.no_retrieval, query_filter=args.filter, + retrieve_k=args.retrieve_k, ) # Wait for completion @@ -509,8 +518,9 @@ def main(): print("Benchmark did not complete successfully.") sys.exit(1) - # Pull results - json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}.json") + # Pull results — include k in the filename so a sweep across k values is legible. + k_suffix = f"_k{args.retrieve_k}" if args.retrieve_k is not None else "" + json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}{k_suffix}.json") pull_results(args.device, json_path) # Load and analyze From 8848beef46f35b6ffbd52327aa1bd84bd629ca03 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 14:34:31 +0800 Subject: [PATCH 02/30] feat(benchmark): capture retrieved chunks + response text per run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the benchmark recorded only counts/lengths (num_retrieved_docs, response_length_chars). 
For the per-k latency sweep we want to see what the retriever actually surfaced at each k and what the model generated — both for content review and because the total chunk-text length is what drives prefill cost as k grows. New per-run fields in the results JSON: - retrieved_chunks: array of {text, source, page, chars} for every chunk the retriever returned. Lets us inspect what changed as k grew. - retrieved_total_chars: sum of chunk text lengths. The real prompt-length proxy (vs. query_word_count which is static). - response_text: full model response (the generation listener was already accumulating it; we now record the final string). Size: at k=20 with ~1500-3000 chars/chunk + ~3KB response, per-run overhead is ~30-60KB. A full 108-run benchmark file grows from ~50KB to ~4-6MB. Acceptable; can add a --no-content opt-out later if needed. Note: Gemma 4 E4B doesn't emit a separate reasoning channel — any inline reasoning the model writes shows up in response_text. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/example/app/BenchmarkActivity.kt | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt index bbe9fa9..af6c0a8 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt @@ -235,6 +235,21 @@ class BenchmarkActivity : Activity() { put("estimated_tokens", result.estimatedTokens) put("decode_throughput_tps", decodeTps) put("num_retrieved_docs", result.numRetrievedDocs) + // Full content for downstream analysis: chunks the retriever surfaced, + // total chunk-text length (drives prefill cost), and the model's + // generated response. 
+ put("retrieved_chunks", JSONArray().apply { + result.retrievedChunks.forEach { doc -> + put(JSONObject().apply { + put("text", doc.text) + put("source", doc.source) + put("page", doc.page) + put("chars", doc.text.length) + }) + } + }) + put("retrieved_total_chars", result.retrievedTotalChars) + put("response_text", result.responseText) put("error", result.error ?: JSONObject.NULL) put("heap_before_mb", preMemory.getInt("used_mb")) put("heap_after_mb", postMemory.getInt("used_mb")) @@ -308,6 +323,9 @@ class BenchmarkActivity : Activity() { val responseChars: Int, val estimatedTokens: Int, val numRetrievedDocs: Int, + val retrievedChunks: List, + val retrievedTotalChars: Int, + val responseText: String, val error: String?, ) @@ -322,6 +340,7 @@ class BenchmarkActivity : Activity() { var firstTokenTime = 0L var error: String? = null val responseBuilder = StringBuilder() + var retrievedChunks: List = emptyList() val qStart = System.currentTimeMillis() var retrievalDoneTime = 0L @@ -336,6 +355,7 @@ class BenchmarkActivity : Activity() { retrievalDoneTime = System.currentTimeMillis() retrievalTimeMs = retrievalDoneTime - qStart numDocs = docs.size + retrievedChunks = docs }, generationListener = { partial, _ -> responseBuilder.append(partial) @@ -372,6 +392,9 @@ class BenchmarkActivity : Activity() { responseChars = responseChars, estimatedTokens = estimatedTokens, numRetrievedDocs = numDocs, + retrievedChunks = retrievedChunks, + retrievedTotalChars = retrievedChunks.sumOf { it.text.length }, + responseText = responseBuilder.toString(), error = error, ) } From 33604df7305ace807ca405f88dea970f3e2e8014 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 14:52:55 +0800 Subject: [PATCH 03/30] fix(benchmark): update model-file check to match production stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit check_models_downloaded() was looking for gemma-3n-E4B-it-int4.task — left over from the pre-Gemma-4 era. 
config/app_config.json now declares "llm_model": "gemma-4-E4B-it.litertlm", so the script's pre-flight check falsely failed even when the right model was on device. Caught during the smoke test for the --retrieve-k override feature: the script aborted before launching BenchmarkActivity. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/benchmark_latency.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index 77f177d..4c2d152 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -69,9 +69,15 @@ def check_device(device_serial=None): def check_models_downloaded(device_serial=None): - """Check if model files exist on device.""" + """Check if model files exist on device. + + Filenames must match config/app_config.json — the app loads + "llm_model" / "embedding_model" / "tokenizer" from there. Updated + for the Gemma 4 E4B / LiteRT-LM 0.11.0 stack; the old Gemma 3n + .task name is no longer in production. + """ required_files = [ - "gemma-3n-E4B-it-int4.task", + "gemma-4-E4B-it.litertlm", "Gecko_1024_quant.tflite", "sentencepiece.model", "embeddings.sqlite", From 197a7bc439f0fd413e5452030dcc8b377335dde4 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 15:06:11 +0800 Subject: [PATCH 04/30] feat(benchmark): add --rag-only flag to skip No-RAG mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the k-sweep, the No-RAG baseline doesn't change with k (retrieval is disabled, so the override is ignored). Without this flag, running the sweep at 7 k-values would re-run the identical 54 No-RAG measurements seven times — ~1.5 hours of redundant work. The Intent extra "rag_only" (bool, default false) tells BenchmarkActivity to run only the RAG mode. Mutually exclusive with skip_retrieval, which wins if both are set. Python --rag-only forwards via am start --ez. 
Recorded in the session config JSON as "rag_only" so reruns are unambiguous. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkActivity.kt | 15 +++++++++++++-- evaluation/benchmark_latency.py | 10 +++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt index af6c0a8..a805e7e 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt @@ -31,6 +31,9 @@ import java.util.concurrent.Executors * * Optional extras: * --ez skip_retrieval true Skip RAG retrieval (generation only) + * --ez rag_only true Skip the No-RAG mode (useful for k-sweeps + * where the No-RAG baseline only needs to + * be captured once) * --es query_filter short Filter by category or specific query ID * --ei retrieve_k N Override retrieval top_k for this session * (default: use runtime_config.json's value). @@ -70,13 +73,14 @@ class BenchmarkActivity : Activity() { val repeats = intent.getIntExtra("repeats", DEFAULT_REPEATS) val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false) + val ragOnly = intent.getBooleanExtra("rag_only", false) val queryFilter = intent.getStringExtra("query_filter") // -1 sentinel = no override; any non-negative value overrides runtime_config's top_k. val retrieveKOverride: Int? 
= intent.getIntExtra("retrieve_k", -1).takeIf { it >= 0 } scope.launch { try { - runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter, retrieveKOverride) + runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride) } catch (t: Throwable) { Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) Log.w(BENCH_TAG, "[BENCHMARK] FAILED") @@ -100,6 +104,7 @@ class BenchmarkActivity : Activity() { repeats: Int, cooldownMs: Long, skipRetrieval: Boolean, + ragOnly: Boolean, queryFilter: String?, retrieveKOverride: Int?, ) { @@ -183,7 +188,12 @@ class BenchmarkActivity : Activity() { return } - val retrievalModes = if (skipRetrieval) listOf(false) else listOf(true, false) + // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins if both set). + val retrievalModes = when { + skipRetrieval -> listOf(false) + ragOnly -> listOf(true) + else -> listOf(true, false) + } val totalRuns = queries.size * retrievalModes.size * repeats Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs") @@ -281,6 +291,7 @@ class BenchmarkActivity : Activity() { put("repeats", repeats) put("cooldown_ms", cooldownMs) put("skip_retrieval", skipRetrieval) + put("rag_only", ragOnly) put("query_filter", queryFilter ?: JSONObject.NULL) // retrieval_top_k_override is null when the session uses runtime_config.json's // value; non-null records the override value used for this whole session. 
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index 4c2d152..30dec4c 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -110,7 +110,8 @@ def clear_logcat(device_serial=None): def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, - skip_retrieval=False, query_filter=None, retrieve_k=None): + skip_retrieval=False, rag_only=False, + query_filter=None, retrieve_k=None): """Launch BenchmarkActivity via ADB.""" cmd = _adb(device_serial) + [ "shell", "am", "start", @@ -120,6 +121,8 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, ] if skip_retrieval: cmd += ["--ez", "skip_retrieval", "true"] + if rag_only: + cmd += ["--ez", "rag_only", "true"] if query_filter: cmd += ["--es", "query_filter", query_filter] if retrieve_k is not None: @@ -467,6 +470,10 @@ def main(): help="Cooldown between queries in ms (default: 5000)") parser.add_argument("--no-retrieval", action="store_true", help="Skip RAG retrieval (generation only)") + parser.add_argument("--rag-only", action="store_true", + help="Skip the No-RAG mode (only run with retrieval). 
" + "Pair with --retrieve-k to do a k-sweep without " + "re-running the No-RAG baseline at every k.") parser.add_argument("--filter", type=str, default=None, help="Filter by category (short/medium/long) or query ID (e.g., long_01)") parser.add_argument("--retrieve-k", type=int, default=None, @@ -514,6 +521,7 @@ def main(): repeats=args.repeats, cooldown_ms=args.cooldown, skip_retrieval=args.no_retrieval, + rag_only=args.rag_only, query_filter=args.filter, retrieve_k=args.retrieve_k, ) From 795ac8492869bec4fc790a15551473ac5ca0b4f3 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 15:39:39 +0800 Subject: [PATCH 05/30] fix(benchmark): use suspending delay() instead of Thread.sleep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thread.sleep(cooldownMs) was running on the UI thread (BenchmarkActivity is an Activity; the calling coroutine was scope.launch(Dispatchers.Main) by default). With cooldown ≥ 5000 ms, Android's input-dispatching timeout fires and the activity gets killed mid-sweep — exactly what happened on the OPPO Snapdragon 8 Elite at cooldown=10000: AnrInfo{reason='Input Dispatching Timeout', stackTrace='at java.lang.Thread.sleep(...) at BenchmarkActivity.runBenchmark(BenchmarkActivity.kt:279)'} delay() is a suspending function that doesn't block the underlying thread, so the UI stays responsive while the benchmark waits. Both cooldown call sites (post-init and between-runs) are inside the same suspend coroutine, so this is a drop-in replacement. The existing latency_report.md used cooldown=10000 on Pixel 7 without issue, which suggests the older Tensor G2 chipset had laxer ANR enforcement or Athena memory-kill behavior — either way, the correct-by-construction fix is to never block the UI thread. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/kotlin/com/example/app/BenchmarkActivity.kt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt index a805e7e..ed16231 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt @@ -11,6 +11,7 @@ import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.SupervisorJob import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.delay import kotlinx.coroutines.launch import kotlinx.coroutines.withContext import org.json.JSONArray @@ -173,7 +174,7 @@ class BenchmarkActivity : Activity() { // Step 4: Cooldown before timed runs logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") logStatus("Cooldown ${cooldownMs}ms...") - Thread.sleep(cooldownMs) + delay(cooldownMs) // Filter queries val queries = if (queryFilter != null) { @@ -274,9 +275,13 @@ class BenchmarkActivity : Activity() { val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000 logStatus(" [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)") - // Cooldown between queries (skip after last run) + // Cooldown between queries (skip after last run). + // delay() vs Thread.sleep(): the suspending variant doesn't block the + // UI thread, which is essential — cooldowns >5s with Thread.sleep + // trigger an ANR (Input Dispatching Timeout) and Android kills the + // activity mid-benchmark. 
if (runIndex < totalRuns) { - Thread.sleep(cooldownMs) + delay(cooldownMs) } } } From 12fd358b08fc49bfc37b726b261bb18fa1493423 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 16:15:27 +0800 Subject: [PATCH 06/30] fix(benchmark): keep CPU alive through screen-off (PARTIAL_WAKE_LOCK + Default dispatcher) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark coroutine was running on Dispatchers.Main, so as soon as the device screen went off (~10 min on OPPO) the activity backgrounded and the coroutine's delay() never resumed. The process stayed alive but stopped making progress — the Python wrapper hung waiting for [BENCHMARK] COMPLETE that would never come. Observed on the OPPO Snapdragon 8 Elite mid-sweep: ~24 min between the last [BENCHMARK] log line and the most recent Athena heartbeat, with mWakefulness=Asleep. Two changes: 1. Acquire a PARTIAL_WAKE_LOCK in onCreate (released in onDestroy). Keeps the CPU running even when the screen is off; the screen itself can still sleep. 6-hour failsafe timeout. Required permission added to AndroidManifest.xml — used only by BenchmarkActivity. 2. Switch the coroutine scope from Dispatchers.Main to Dispatchers.Default. The benchmark logic doesn't touch the UI directly (logStatus already marshals back to Main via runOnUiThread), so there's no reason to run on Main — doing so just makes the work pause whenever the activity loses focus. Default keeps running in any lifecycle state. These together let the sweep run while the device is screen-off or locked. Screen lifespan and battery thank you. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- app/android/app/src/main/AndroidManifest.xml | 4 +++ .../com/example/app/BenchmarkActivity.kt | 33 ++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml index a44e8c2..4c65729 100644 --- a/app/android/app/src/main/AndroidManifest.xml +++ b/app/android/app/src/main/AndroidManifest.xml @@ -4,6 +4,10 @@ + + Date: Thu, 14 May 2026 17:44:52 +0800 Subject: [PATCH 07/30] refactor(benchmark): move benchmark to a foreground service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OPPO ColorOS aggressively freezes background activities and even force-releases PARTIAL_WAKE_LOCKs held by plain Activities (visible as 'add wakelock … to ForceReleaseWakeLock list' in OplusProxyWakeLock). Foreground services with a sticky notification are respected — once the app is also whitelisted in Settings → Battery → App Battery Management ("Allow background activity"), the benchmark runs cleanly with the screen off and the device locked. Architecture: - BenchmarkForegroundService (NEW) — holds the wake lock, posts a sticky progress notification, and runs the entire benchmark loop. Uses Dispatchers.Default so it isn't tied to a UI thread. Stops itself when done; the OS reclaims the process. - BenchmarkActivity — reduced from ~470 lines to ~60. Now a thin launcher: receives `am start` Intent extras, forwards them to the service via startForegroundService(), and finishes immediately. Existing Python wrapper (benchmark_latency.py) is unchanged — it still launches the Activity and reads progress from logcat. - AndroidManifest — registers the new service with foregroundServiceType="dataSync" (reuses the existing FOREGROUND_SERVICE_DATA_SYNC permission) and android:process= ":benchmark" (same isolated process as BenchmarkActivity). 
Verified end-to-end on OPPO Snapdragon 8 Elite (OPD2413): launched the benchmark, locked the screen 30 s in, watched a full medium_01 RAG run complete with `hans_freeze=0` Athena freeze events and the screen asleep through the entire decode phase. TTFT 1025 ms, total 13.8 s — within the same envelope as foreground runs. Pre-flight on a new device: Settings → Battery → App Battery Management → MAM-AI → "Allow background activity" Without that, OPPO's OplusHansManager freezes the process at the OS level regardless of foreground-service status. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/android/app/src/main/AndroidManifest.xml | 10 + .../com/example/app/BenchmarkActivity.kt | 476 ++--------------- .../example/app/BenchmarkForegroundService.kt | 487 ++++++++++++++++++ 3 files changed, 530 insertions(+), 443 deletions(-) create mode 100644 app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml index 4c65729..bb35442 100644 --- a/app/android/app/src/main/AndroidManifest.xml +++ b/app/android/app/src/main/AndroidManifest.xml @@ -49,6 +49,16 @@ android:foregroundServiceType="dataSync" android:exported="false" /> + + + = 0 } - - scope.launch { - try { - runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride) - } catch (t: Throwable) { - Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) - Log.w(BENCH_TAG, "[BENCHMARK] FAILED") - logStatus("FAILED: ${t.message}") - } finally { - finish() - } - } - } - - override fun onDestroy() { - super.onDestroy() - wakeLock?.let { - if (it.isHeld) { - it.release() - Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK") - } - } - wakeLock = null - } - - private fun logStatus(text: String) { - runOnUiThread { - logView.append(text + "\n") - scrollView.post { scrollView.fullScroll(ScrollView.FOCUS_DOWN) } - } - } - - // ── Main benchmark loop 
────────────────────────────────────────────── - - private suspend fun runBenchmark( - repeats: Int, - cooldownMs: Long, - skipRetrieval: Boolean, - ragOnly: Boolean, - queryFilter: String?, - retrieveKOverride: Int?, - ) { - val benchmarkStart = System.currentTimeMillis() - val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date()) - - Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter") - - // Device info - val deviceInfo = collectDeviceInfo() - Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})") - - // Step 1: Gecko + SQLite init (synchronous part of RagPipeline constructor) - logStatus("Step 1/4: Initializing Gecko embedder + SQLite...") - Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...") - val initStart = System.currentTimeMillis() - val pipeline = withContext(executor.asCoroutineDispatcher()) { - RagPipeline(application) - } - val syncInitMs = System.currentTimeMillis() - initStart - Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms") - logStatus("Step 1/4: Gecko + SQLite done (${syncInitMs}ms)") - - // Step 2: Wait for LLM model load (async, started by RagPipeline constructor) - logStatus("Step 2/4: Loading Gemma 4 LLM model...") - Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...") - val llmWaitStart = System.currentTimeMillis() - withContext(executor.asCoroutineDispatcher()) { - pipeline.awaitLlmReady() - } - val llmInitMs = System.currentTimeMillis() - llmWaitStart - Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)") - logStatus("Step 2/4: LLM loaded (${llmInitMs}ms)") - - // Step 3: 5 warmup queries of varying length — warms JIT / LiteRT-LM / shader caches - val warmupQueries = listOf( - "Normal fetal heart rate", - "Signs of infection after delivery", - "A mother has heavy bleeding after birth. 
What should I do first?", - "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?", - "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?", - ) - logStatus("Step 3/4: Running ${warmupQueries.size} warmup queries...") - Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...") - val warmupStart = System.currentTimeMillis() - warmupQueries.forEachIndexed { i, prompt -> - Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"") - withContext(executor.asCoroutineDispatcher()) { - pipeline.generateResponse( - prompt = prompt, - history = emptyList(), - useRetrieval = false, - retrievalListener = {}, - generationListener = { _, _ -> } - ) - } - Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)") - } - val warmupMs = System.currentTimeMillis() - warmupStart - val totalInitMs = System.currentTimeMillis() - initStart - Log.w(BENCH_TAG, "[BENCHMARK] Warmup complete: ${warmupMs}ms total (${warmupQueries.size} queries)") - Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") - - val postInitMemory = collectMemoryInfo() - - // Step 4: Cooldown before timed runs - logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") - logStatus("Cooldown ${cooldownMs}ms...") - delay(cooldownMs) - - // Filter queries - val queries = if (queryFilter != null) { - BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter } + val serviceIntent = Intent(this, BenchmarkForegroundService::class.java).apply { + // Forward every extra the user might have passed via `am start`. + // Defaults are resolved inside the service. 
+ if (intent.hasExtra("repeats")) + putExtra("repeats", intent.getIntExtra("repeats", 3)) + if (intent.hasExtra("cooldown_ms")) + putExtra("cooldown_ms", intent.getLongExtra("cooldown_ms", 5000L)) + if (intent.hasExtra("skip_retrieval")) + putExtra("skip_retrieval", intent.getBooleanExtra("skip_retrieval", false)) + if (intent.hasExtra("rag_only")) + putExtra("rag_only", intent.getBooleanExtra("rag_only", false)) + if (intent.hasExtra("query_filter")) + putExtra("query_filter", intent.getStringExtra("query_filter")) + if (intent.hasExtra("retrieve_k")) + putExtra("retrieve_k", intent.getIntExtra("retrieve_k", -1)) + } + + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + startForegroundService(serviceIntent) } else { - BenchmarkQueries.ALL - } - - if (queries.isEmpty()) { - Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'") - Log.w(BENCH_TAG, "[BENCHMARK] FAILED") - return + startService(serviceIntent) } - - // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins if both set). 
- val retrievalModes = when { - skipRetrieval -> listOf(false) - ragOnly -> listOf(true) - else -> listOf(true, false) - } - val totalRuns = queries.size * retrievalModes.size * repeats - Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs") - - // Execution loop - val results = mutableListOf() - var runIndex = 0 - val loopStart = System.currentTimeMillis() - - for (query in queries) { - for (useRetrieval in retrievalModes) { - for (rep in 1..repeats) { - runIndex++ - - // Estimate time remaining based on average time per completed run - val etaStr = if (runIndex > 1) { - val elapsedMs = System.currentTimeMillis() - loopStart - val avgPerRun = elapsedMs.toDouble() / (runIndex - 1) - val remainingMs = (avgPerRun * (totalRuns - runIndex + 1)).toLong() - val remainMin = remainingMs / 60000 - val remainSec = (remainingMs % 60000) / 1000 - "ETA: ${remainMin}m ${remainSec}s" - } else "ETA: calculating..." - - Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats") - logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr") - - val preMemory = collectMemoryInfo() - val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride) - val postMemory = collectMemoryInfo() - - val decodeTps = if (result.decodeMs > 0) - round2(result.estimatedTokens / (result.decodeMs / 1000.0)) - else 0.0 - - val entry = JSONObject().apply { - put("query_id", query.id) - put("category", query.category) - put("query_text", query.text) - put("query_word_count", query.wordCount) - put("use_retrieval", useRetrieval) - put("repetition", rep) - put("retrieval_time_ms", result.retrievalTimeMs) - put("ttft_ms", result.ttftMs) - put("prefill_ms", result.prefillMs) - put("decode_ms", result.decodeMs) - put("total_generation_ms", result.generationTotalMs) - put("total_query_ms", result.totalQueryMs) - 
put("response_length_chars", result.responseChars) - put("estimated_tokens", result.estimatedTokens) - put("decode_throughput_tps", decodeTps) - put("num_retrieved_docs", result.numRetrievedDocs) - // Full content for downstream analysis: chunks the retriever surfaced, - // total chunk-text length (drives prefill cost), and the model's - // generated response. - put("retrieved_chunks", JSONArray().apply { - result.retrievedChunks.forEach { doc -> - put(JSONObject().apply { - put("text", doc.text) - put("source", doc.source) - put("page", doc.page) - put("chars", doc.text.length) - }) - } - }) - put("retrieved_total_chars", result.retrievedTotalChars) - put("response_text", result.responseText) - put("error", result.error ?: JSONObject.NULL) - put("heap_before_mb", preMemory.getInt("used_mb")) - put("heap_after_mb", postMemory.getInt("used_mb")) - } - results.add(entry) - - val resultLine = " -> ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms tps=$decodeTps" - Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps") - logStatus(resultLine) - - val pct = (runIndex * 100) / totalRuns - val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000 - logStatus(" [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)") - - // Cooldown between queries (skip after last run). - // delay() vs Thread.sleep(): the suspending variant doesn't block the - // UI thread, which is essential — cooldowns >5s with Thread.sleep - // trigger an ANR (Input Dispatching Timeout) and Android kills the - // activity mid-benchmark. 
- if (runIndex < totalRuns) { - delay(cooldownMs) - } - } - } - } - - // Assemble output JSON - val output = JSONObject().apply { - put("benchmark_version", 1) - put("timestamp", timestamp) - put("device", deviceInfo) - put("config", JSONObject().apply { - put("repeats", repeats) - put("cooldown_ms", cooldownMs) - put("skip_retrieval", skipRetrieval) - put("rag_only", ragOnly) - put("query_filter", queryFilter ?: JSONObject.NULL) - // retrieval_top_k_override is null when the session uses runtime_config.json's - // value; non-null records the override value used for this whole session. - put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) - put("model", "gemma-4-E4B-it.litertlm") - put("backend", "CPU") - put("max_tokens", 32000) - put("temperature", 1.0) - put("top_p", 0.95) - put("top_k", 64) - }) - put("init", JSONObject().apply { - put("gecko_sqlite_ms", syncInitMs) - put("llm_load_ms", llmInitMs) - put("warmup_query_ms", warmupMs) - put("total_init_ms", totalInitMs) - }) - put("memory", postInitMemory) - put("results", JSONArray(results)) - put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart) - } - - // Write to file - val outFile = File(getExternalFilesDir(null), "benchmark_results.json") - outFile.writeText(output.toString(2)) - Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}") - Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE") - logStatus("COMPLETE\nResults written to:\n${outFile.absolutePath}") - } - - // ── Single query execution ─────────────────────────────────────────── - - private data class QueryResult( - val retrievalTimeMs: Long, - val ttftMs: Long, - val prefillMs: Long, - val decodeMs: Long, - val generationTotalMs: Long, - val totalQueryMs: Long, - val responseChars: Int, - val estimatedTokens: Int, - val numRetrievedDocs: Int, - val retrievedChunks: List, - val retrievedTotalChars: Int, - val responseText: String, - val error: String?, - ) - - private suspend fun runQuery( - pipeline: 
RagPipeline, - queryText: String, - useRetrieval: Boolean, - retrieveKOverride: Int?, - ): QueryResult { - var retrievalTimeMs = 0L - var numDocs = 0 - var firstTokenTime = 0L - var error: String? = null - val responseBuilder = StringBuilder() - var retrievedChunks: List = emptyList() - - val qStart = System.currentTimeMillis() - var retrievalDoneTime = 0L - - try { - withContext(executor.asCoroutineDispatcher()) { - pipeline.generateResponse( - prompt = queryText, - history = emptyList(), - useRetrieval = useRetrieval, - retrievalListener = { docs -> - retrievalDoneTime = System.currentTimeMillis() - retrievalTimeMs = retrievalDoneTime - qStart - numDocs = docs.size - retrievedChunks = docs - }, - generationListener = { partial, _ -> - responseBuilder.append(partial) - if (firstTokenTime == 0L && partial.isNotEmpty()) { - firstTokenTime = System.currentTimeMillis() - } - }, - retrieveKOverride = retrieveKOverride, - ) - } - } catch (e: Exception) { - error = e.message - Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e) - } - - val qEnd = System.currentTimeMillis() - val totalQueryMs = qEnd - qStart - val responseChars = responseBuilder.length - - // Generation timing — measure from after retrieval (or query start if no retrieval) - val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart - val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0 - val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0 - val generationTotalMs = qEnd - genStart - val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt() - - return QueryResult( - retrievalTimeMs = retrievalTimeMs, - ttftMs = ttftMs, - prefillMs = ttftMs, - decodeMs = decodeMs, - generationTotalMs = generationTotalMs, - totalQueryMs = totalQueryMs, - responseChars = responseChars, - estimatedTokens = estimatedTokens, - numRetrievedDocs = numDocs, - retrievedChunks = retrievedChunks, - retrievedTotalChars = retrievedChunks.sumOf { it.text.length }, - 
responseText = responseBuilder.toString(), - error = error, - ) - } - - // ── Helpers ────────────────────────────────────────────────────────── - - private fun collectDeviceInfo(): JSONObject = JSONObject().apply { - put("manufacturer", Build.MANUFACTURER) - put("model", Build.MODEL) - put("device", Build.DEVICE) - put("hardware", Build.HARDWARE) - put("board", Build.BOARD) - put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown") - put("android_version", Build.VERSION.RELEASE) - put("sdk_int", Build.VERSION.SDK_INT) - put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown") + Log.w(BENCH_TAG, "[BENCHMARK] BenchmarkActivity → forwarded extras to BenchmarkForegroundService, finishing.") + finish() } - - private fun collectMemoryInfo(): JSONObject { - val rt = Runtime.getRuntime() - return JSONObject().apply { - put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024) - put("free_mb", rt.freeMemory() / 1024 / 1024) - put("total_mb", rt.totalMemory() / 1024 / 1024) - put("max_mb", rt.maxMemory() / 1024 / 1024) - } - } - - private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0 } diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt new file mode 100644 index 0000000..76358d0 --- /dev/null +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -0,0 +1,487 @@ +package com.example.app + +import android.app.Notification +import android.app.NotificationChannel +import android.app.NotificationManager +import android.app.Service +import android.content.Context +import android.content.Intent +import android.content.pm.ServiceInfo +import android.os.Build +import android.os.IBinder +import android.os.PowerManager +import android.util.Log +import androidx.core.app.NotificationCompat +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import 
kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.cancel +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import org.json.JSONArray +import org.json.JSONObject +import java.io.File +import java.text.SimpleDateFormat +import java.util.Date +import java.util.Locale +import java.util.concurrent.Executors + +/** + * Foreground service that runs the on-device latency benchmark. + * + * The service holds a PARTIAL_WAKE_LOCK and posts a sticky notification so + * the OS keeps the process alive — unlike a plain Activity, which the + * vendor power manager (e.g. OPPO's OplusProxyWakeLock) will idle as soon + * as the screen sleeps. This lets multi-hour k-sweeps run while the + * device is locked or the screen is off. + * + * Launched via [BenchmarkActivity] which forwards Intent extras from `am + * start`. All benchmark logic lives here; the Activity is a thin shim. + * + * Intent extras (forwarded from the Activity): + * repeats:Int Repetitions per query + * cooldown_ms:Long Sleep between runs + * skip_retrieval:Boolean Run No-RAG mode only + * rag_only:Boolean Run RAG mode only + * query_filter:String? Category or query ID filter + * retrieve_k:Int (>=0) Override retrieval top_k; -1 = use config + */ +class BenchmarkForegroundService : Service() { + + companion object { + private const val TAG = "mam-ai" + private const val BENCH_TAG = "mam-ai-bench" + private const val NOTIFICATION_ID = 1002 + const val CHANNEL_ID = "mam_ai_benchmark" + private const val DEFAULT_COOLDOWN_MS = 5_000L + private const val DEFAULT_REPEATS = 3 + private const val CHARS_PER_TOKEN_ESTIMATE = 4.0 + } + + // Dispatchers.Default so the long-running coroutine isn't tied to the UI + // thread. The service has no UI anyway, but Default also ensures the work + // continues regardless of any activity lifecycle event. 
+ private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default) + private val executor = Executors.newSingleThreadExecutor() + private var wakeLock: PowerManager.WakeLock? = null + + override fun onBind(intent: Intent?): IBinder? = null + + override fun onCreate() { + super.onCreate() + ensureChannel(this) + + // PARTIAL_WAKE_LOCK lets the CPU keep running through screen-off. + // Vendor power managers (OPPO ColorOS, Xiaomi MIUI, etc.) respect + // wake locks held by foreground services — they aggressively + // release locks held by background activities. + val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager + wakeLock = powerManager.newWakeLock( + PowerManager.PARTIAL_WAKE_LOCK, + "mam-ai:benchmark" + ).apply { + setReferenceCounted(false) + acquire(6L * 60L * 60L * 1000L) // 6 h failsafe + } + Log.w(BENCH_TAG, "[BENCHMARK] Service onCreate, PARTIAL_WAKE_LOCK acquired") + } + + override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { + startForegroundCompat("MAM-AI benchmark starting…", -1, 0) + + val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS + val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS + val skipRetrieval = intent?.getBooleanExtra("skip_retrieval", false) ?: false + val ragOnly = intent?.getBooleanExtra("rag_only", false) ?: false + val queryFilter = intent?.getStringExtra("query_filter") + val retrieveKOverride: Int? = intent?.getIntExtra("retrieve_k", -1)?.takeIf { it >= 0 } + + scope.launch { + try { + runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride) + } catch (t: Throwable) { + Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) + Log.w(BENCH_TAG, "[BENCHMARK] FAILED") + } finally { + stopSelf() + } + } + // START_NOT_STICKY: don't auto-restart on kill — the benchmark is a + // one-shot job; restarting halfway through would corrupt the run. 
+ return START_NOT_STICKY + } + + override fun onDestroy() { + super.onDestroy() + wakeLock?.let { + if (it.isHeld) { + it.release() + Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK") + } + } + wakeLock = null + scope.cancel() + @Suppress("DEPRECATION") + stopForeground(true) + } + + // ── Notification plumbing ──────────────────────────────────────────── + + private fun startForegroundCompat(message: String, progress: Int, max: Int) { + val notification = buildNotification(this, message, progress, max) + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.UPSIDE_DOWN_CAKE) { + startForeground( + NOTIFICATION_ID, + notification, + ServiceInfo.FOREGROUND_SERVICE_TYPE_DATA_SYNC, + ) + } else { + startForeground(NOTIFICATION_ID, notification) + } + } + + private fun updateNotification(message: String, progress: Int, max: Int) { + val nm = getSystemService(NotificationManager::class.java) ?: return + nm.notify(NOTIFICATION_ID, buildNotification(this, message, progress, max)) + } + + // ── Main benchmark loop ────────────────────────────────────────────── + + private suspend fun runBenchmark( + repeats: Int, + cooldownMs: Long, + skipRetrieval: Boolean, + ragOnly: Boolean, + queryFilter: String?, + retrieveKOverride: Int?, + ) { + val benchmarkStart = System.currentTimeMillis() + val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date()) + + Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter retrieve_k=${retrieveKOverride ?: "default"} rag_only=$ragOnly") + + val deviceInfo = collectDeviceInfo() + Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})") + + updateNotification("Initializing pipeline…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...") + val initStart = System.currentTimeMillis() + val pipeline = withContext(executor.asCoroutineDispatcher()) { + RagPipeline(application) + } + val syncInitMs = 
System.currentTimeMillis() - initStart + Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms") + + updateNotification("Loading Gemma 4 LLM…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...") + val llmWaitStart = System.currentTimeMillis() + withContext(executor.asCoroutineDispatcher()) { pipeline.awaitLlmReady() } + val llmInitMs = System.currentTimeMillis() - llmWaitStart + Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)") + + val warmupQueries = listOf( + "Normal fetal heart rate", + "Signs of infection after delivery", + "A mother has heavy bleeding after birth. What should I do first?", + "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?", + "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?", + ) + updateNotification("Warmup queries (${warmupQueries.size})…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...") + val warmupStart = System.currentTimeMillis() + warmupQueries.forEachIndexed { i, prompt -> + Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"") + withContext(executor.asCoroutineDispatcher()) { + pipeline.generateResponse( + prompt = prompt, + history = emptyList(), + useRetrieval = false, + retrievalListener = {}, + generationListener = { _, _ -> } + ) + } + Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)") + } + val warmupMs = System.currentTimeMillis() - warmupStart + val totalInitMs = System.currentTimeMillis() - initStart + Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") + + val postInitMemory = 
collectMemoryInfo() + delay(cooldownMs) + + val queries = if (queryFilter != null) { + BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter } + } else { + BenchmarkQueries.ALL + } + if (queries.isEmpty()) { + Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'") + Log.w(BENCH_TAG, "[BENCHMARK] FAILED") + return + } + + // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins). + val retrievalModes = when { + skipRetrieval -> listOf(false) + ragOnly -> listOf(true) + else -> listOf(true, false) + } + val totalRuns = queries.size * retrievalModes.size * repeats + Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs") + + val results = mutableListOf() + var runIndex = 0 + val loopStart = System.currentTimeMillis() + + for (query in queries) { + for (useRetrieval in retrievalModes) { + for (rep in 1..repeats) { + runIndex++ + + val pct = (runIndex * 100) / totalRuns + updateNotification("[$runIndex/$totalRuns] ${query.id} rep=$rep", runIndex, totalRuns) + + Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats") + + val preMemory = collectMemoryInfo() + val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride) + val postMemory = collectMemoryInfo() + + val decodeTps = if (result.decodeMs > 0) + round2(result.estimatedTokens / (result.decodeMs / 1000.0)) + else 0.0 + + val entry = JSONObject().apply { + put("query_id", query.id) + put("category", query.category) + put("query_text", query.text) + put("query_word_count", query.wordCount) + put("use_retrieval", useRetrieval) + put("repetition", rep) + put("retrieval_time_ms", result.retrievalTimeMs) + put("ttft_ms", result.ttftMs) + put("prefill_ms", result.prefillMs) + put("decode_ms", result.decodeMs) + put("total_generation_ms", result.generationTotalMs) + put("total_query_ms", result.totalQueryMs) + 
put("response_length_chars", result.responseChars) + put("estimated_tokens", result.estimatedTokens) + put("decode_throughput_tps", decodeTps) + put("num_retrieved_docs", result.numRetrievedDocs) + put("retrieved_chunks", JSONArray().apply { + result.retrievedChunks.forEach { doc -> + put(JSONObject().apply { + put("text", doc.text) + put("source", doc.source) + put("page", doc.page) + put("chars", doc.text.length) + }) + } + }) + put("retrieved_total_chars", result.retrievedTotalChars) + put("response_text", result.responseText) + put("error", result.error ?: JSONObject.NULL) + put("heap_before_mb", preMemory.getInt("used_mb")) + put("heap_after_mb", postMemory.getInt("used_mb")) + } + results.add(entry) + + Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps") + + if (runIndex < totalRuns) { + delay(cooldownMs) + } + } + } + } + + val output = JSONObject().apply { + put("benchmark_version", 1) + put("timestamp", timestamp) + put("device", deviceInfo) + put("config", JSONObject().apply { + put("repeats", repeats) + put("cooldown_ms", cooldownMs) + put("skip_retrieval", skipRetrieval) + put("rag_only", ragOnly) + put("query_filter", queryFilter ?: JSONObject.NULL) + put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) + put("model", "gemma-4-E4B-it.litertlm") + put("backend", "CPU") + put("max_tokens", 32000) + put("temperature", 1.0) + put("top_p", 0.95) + put("top_k", 64) + }) + put("init", JSONObject().apply { + put("gecko_sqlite_ms", syncInitMs) + put("llm_load_ms", llmInitMs) + put("warmup_query_ms", warmupMs) + put("total_init_ms", totalInitMs) + }) + put("memory", postInitMemory) + put("results", JSONArray(results)) + put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart) + } + + val outFile = File(getExternalFilesDir(null), "benchmark_results.json") + outFile.writeText(output.toString(2)) + Log.w(BENCH_TAG, 
"[BENCHMARK] Results written to ${outFile.absolutePath}") + Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE") + } + + // ── Single-query execution ─────────────────────────────────────────── + + private data class QueryResult( + val retrievalTimeMs: Long, + val ttftMs: Long, + val prefillMs: Long, + val decodeMs: Long, + val generationTotalMs: Long, + val totalQueryMs: Long, + val responseChars: Int, + val estimatedTokens: Int, + val numRetrievedDocs: Int, + val retrievedChunks: List, + val retrievedTotalChars: Int, + val responseText: String, + val error: String?, + ) + + private suspend fun runQuery( + pipeline: RagPipeline, + queryText: String, + useRetrieval: Boolean, + retrieveKOverride: Int?, + ): QueryResult { + var retrievalTimeMs = 0L + var numDocs = 0 + var firstTokenTime = 0L + var error: String? = null + val responseBuilder = StringBuilder() + var retrievedChunks: List = emptyList() + + val qStart = System.currentTimeMillis() + var retrievalDoneTime = 0L + + try { + withContext(executor.asCoroutineDispatcher()) { + pipeline.generateResponse( + prompt = queryText, + history = emptyList(), + useRetrieval = useRetrieval, + retrievalListener = { docs -> + retrievalDoneTime = System.currentTimeMillis() + retrievalTimeMs = retrievalDoneTime - qStart + numDocs = docs.size + retrievedChunks = docs + }, + generationListener = { partial, _ -> + responseBuilder.append(partial) + if (firstTokenTime == 0L && partial.isNotEmpty()) { + firstTokenTime = System.currentTimeMillis() + } + }, + retrieveKOverride = retrieveKOverride, + ) + } + } catch (e: Exception) { + error = e.message + Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e) + } + + val qEnd = System.currentTimeMillis() + val totalQueryMs = qEnd - qStart + val responseChars = responseBuilder.length + + // TTFT excludes retrieval; we measure from end-of-retrieval to first token. 
+ val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart + val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0 + val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0 + val generationTotalMs = qEnd - genStart + val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt() + + return QueryResult( + retrievalTimeMs = retrievalTimeMs, + ttftMs = ttftMs, + prefillMs = ttftMs, + decodeMs = decodeMs, + generationTotalMs = generationTotalMs, + totalQueryMs = totalQueryMs, + responseChars = responseChars, + estimatedTokens = estimatedTokens, + numRetrievedDocs = numDocs, + retrievedChunks = retrievedChunks, + retrievedTotalChars = retrievedChunks.sumOf { it.text.length }, + responseText = responseBuilder.toString(), + error = error, + ) + } + + // ── Helpers ────────────────────────────────────────────────────────── + + private fun collectDeviceInfo(): JSONObject = JSONObject().apply { + put("manufacturer", Build.MANUFACTURER) + put("model", Build.MODEL) + put("device", Build.DEVICE) + put("hardware", Build.HARDWARE) + put("board", Build.BOARD) + put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown") + put("android_version", Build.VERSION.RELEASE) + put("sdk_int", Build.VERSION.SDK_INT) + put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown") + } + + private fun collectMemoryInfo(): JSONObject { + val rt = Runtime.getRuntime() + return JSONObject().apply { + put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024) + put("free_mb", rt.freeMemory() / 1024 / 1024) + put("total_mb", rt.totalMemory() / 1024 / 1024) + put("max_mb", rt.maxMemory() / 1024 / 1024) + } + } + + private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0 + + private fun ensureChannel(context: Context) { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + val nm = context.getSystemService(NotificationManager::class.java) + if (nm?.getNotificationChannel(CHANNEL_ID) == null) { + val channel = 
NotificationChannel( + CHANNEL_ID, + "MAM-AI Benchmark", + NotificationManager.IMPORTANCE_LOW, + ).apply { + description = "Foreground notification while the on-device latency benchmark runs" + setShowBadge(false) + } + nm?.createNotificationChannel(channel) + } + } + } + + private fun buildNotification( + context: Context, + message: String, + progress: Int, + max: Int, + ): Notification { + val builder = NotificationCompat.Builder(context, CHANNEL_ID) + .setContentTitle("MAM-AI Benchmark") + .setContentText(message) + .setSmallIcon(android.R.drawable.stat_sys_download) + .setOngoing(true) + .setOnlyAlertOnce(true) + .setPriority(NotificationCompat.PRIORITY_LOW) + + if (max > 0 && progress >= 0) { + builder.setProgress(max, progress, false) + } else { + builder.setProgress(0, 0, true) + } + return builder.build() + } +} From ef965381ab3ee4b072a42de4759968166940ea12 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Thu, 14 May 2026 21:33:35 +0800 Subject: [PATCH 08/30] fix(benchmark): record actual backend (GPU/CPU) in config metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The config block dump in benchmark_results.json was hard-coded to "backend":"CPU" — wrong for any build with useGpuForLlm=true. The GPU-sweep JSONs we just ran on the OPPO Snapdragon 8 Elite all carry the incorrect "CPU" label even though they were measured on GPU. Now reads from BuildConfig.USE_GPU_FOR_LLM at compile time and writes "GPU" or "CPU" accordingly. Also adds "mtp_enabled" from BuildConfig.USE_MTP_FOR_LLM for full provenance. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkForegroundService.kt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index 76358d0..9506786 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -309,7 +309,11 @@ class BenchmarkForegroundService : Service() { put("query_filter", queryFilter ?: JSONObject.NULL) put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) put("model", "gemma-4-E4B-it.litertlm") - put("backend", "CPU") + // Read backend from BuildConfig at compile time. Older builds + // hard-coded "CPU" here even when GPU was active — fixed so the + // JSON metadata matches reality. + put("backend", if (BuildConfig.USE_GPU_FOR_LLM) "GPU" else "CPU") + put("mtp_enabled", BuildConfig.USE_MTP_FOR_LLM) put("max_tokens", 32000) put("temperature", 1.0) put("top_p", 0.95) From ede273f5b6045e59f0fa10a54636109f29542540 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 04:36:07 +0800 Subject: [PATCH 09/30] analysis: k-sweep latency report (GPU + CPU on Snapdragon 8 Elite) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aggregates the 15 canonical 54-run benchmark JSONs into a single GPU↔CPU comparison report covering k ∈ {0, 1, 3, 5, 7, 10, 15} (and GPU-only k=20). Produced by a new evaluation/aggregate_k_sweep.py that's re-runnable as more JSONs land. 
Headline numbers (median total query latency on OPPO OPD2413, Snapdragon 8 Elite, Gemma 4 E4B, LiteRT-LM 0.11.0): k=0 (no-RAG): GPU 13–16 s | CPU 27–30 s (1.9× slower on CPU) k=3 : GPU 19–21 s | CPU 37–45 s (2.2× slower) k=10 : GPU 21–22 s | CPU 62–78 s (3.1× slower) k=15 : GPU 22–25 s | CPU 81–90 s (3.5× slower) k=20 : GPU 44% of runs fail at the 4096-token model ceiling Key findings: - GPU is the practical choice for this device tier — TTFT is 13–19× faster than CPU; total latency is 2–3.5× faster. - The model's 4096-token context window is the binding upper limit (k_max ≈ 17–18), not latency. GPU has comfortable headroom below that ceiling. - CPU is unusable past k≈3 for any reasonable UX budget. At k=15, CPU p95 latency hits 113 s. - Decode is memory-bandwidth-bound (GPU/CPU within ~1.4×); the GPU win is entirely in compute-heavy prefill. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 335 ++++++++++++++++++++++++ evaluation/reports/latency_report_v2.md | 171 ++++++++++++ 2 files changed, 506 insertions(+) create mode 100644 evaluation/aggregate_k_sweep.py create mode 100644 evaluation/reports/latency_report_v2.md diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py new file mode 100644 index 0000000..3aa18d4 --- /dev/null +++ b/evaluation/aggregate_k_sweep.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report. + +Reads all benchmark_*.json files produced by benchmark_latency.py, groups them +by (backend, k_override), and writes a markdown report at +evaluation/reports/latency_report_v2.md. + +Notes on backend identification: GPU sweep JSONs from before commit ef96538 +(2026-05-14 ~21:34) have config.backend="CPU" hard-coded (bug fixed later); +we identify them by timestamp instead. Anything before the threshold is GPU. 
+""" +from __future__ import annotations + +import datetime +import glob +import json +import os +import statistics +from collections import defaultdict +from pathlib import Path + +# Timestamp threshold separating GPU runs (before) from CPU runs (after). +# The CPU rebuild + reinstall happened at ~21:34 on 2026-05-14. +THRESHOLD_TS = "20260514T2130" + + +def backend_of(timestamp: str, recorded: str) -> str: + """Override stale GPU-era "CPU" labels using the timestamp.""" + if timestamp < THRESHOLD_TS: + return "GPU" + return recorded + + +def load_runs() -> list[dict]: + files = sorted(glob.glob(os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "latency_results", "benchmark_2026051*.json", + ))) + runs = [] + for f in files: + try: + d = json.load(open(f)) + except (json.JSONDecodeError, OSError): + continue + if "config" not in d or "results" not in d: + continue + if len(d["results"]) < 30: + continue # skip ad-hoc smoke tests; the canonical sweep is 54 runs + ts = os.path.basename(f).replace("benchmark_", "").split(".")[0].split("_")[0] + k_override = d["config"].get("retrieval_top_k_override") + skip_retrieval = d["config"].get("skip_retrieval", False) + k_label = 0 if skip_retrieval else (k_override if k_override is not None else None) + if k_label is None: + continue + backend = backend_of(ts, d["config"].get("backend", "CPU")) + runs.append({ + "file": os.path.basename(f), + "timestamp": ts, + "backend": backend, + "k": k_label, + "data": d, + }) + return runs + + +def aggregate_per_category(d: dict, key: str) -> dict[str, dict]: + """Per-category {median, p95, n} for the given timing field.""" + cat_vals: dict[str, list] = defaultdict(list) + for r in d["results"]: + if r.get("error"): + continue + cat_vals[r["category"]].append(r[key]) + out = {} + for c, vs in cat_vals.items(): + if not vs: + continue + s = sorted(vs) + out[c] = { + "n": len(vs), + "median": int(statistics.median(vs)), + "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]), + } 
+ return out + + +def aggregate_overall(d: dict, key: str) -> dict: + vs = [r[key] for r in d["results"] if not r.get("error")] + if not vs: + return {} + s = sorted(vs) + return { + "n": len(vs), + "median": int(statistics.median(vs)), + "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]), + } + + +def avg_doc_chars(d: dict) -> int: + vs = [r.get("retrieved_total_chars", 0) for r in d["results"] if not r.get("error")] + return int(statistics.median(vs)) if vs else 0 + + +def fmt_ms(v: int | None) -> str: + return f"{v}" if v is not None else "—" + + +def fmt_s(v: int | None) -> str: + return f"{v / 1000:.1f}" if v is not None else "—" + + +def write_report(runs: list[dict], out_path: Path) -> None: + # Build {(backend, k) -> latest canonical run} + matrix: dict[tuple[str, int], dict] = {} + for r in runs: + key = (r["backend"], r["k"]) + if key in matrix: + # Keep the run with most successful entries (resolves duplicates) + ex = matrix[key] + ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) + r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) + if r_ok > ex_ok: + matrix[key] = r + else: + matrix[key] = r + + gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"]) + cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"]) + all_ks = sorted(set(gpu_ks + cpu_ks)) + + # Sample run for device info + sample = next(iter(matrix.values())) + dev = sample["data"]["device"] + + md = [] + md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n") + md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") + md.append("") + md.append("## Device & stack\n") + md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") + md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)") + md.append(f"- **LiteRT-LM**: 0.11.0") + md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") + md.append(f"- 
**Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") + md.append("") + md.append("## Methodology\n") + md.append("Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. ") + md.append("Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs ") + md.append("for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives ") + md.append("screen-off and device-lock; OPPO Hans whitelist set manually.") + md.append("") + md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") + md.append("- `decode` is first-token to last-token.") + md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") + md.append("- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).") + md.append("") + + # ─────────── Headline table: total_query_ms by (backend, k) ─────────── + md.append("## Headline — Median total query latency (seconds)\n") + md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + # doc chars: take from GPU if available, else CPU + doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_cells = "—" + cpu_cells = "—" + ratios = [] + for col, run, key in [("gpu", gpu_run, "gpu"), ("cpu", cpu_run, "cpu")]: + pass + if gpu_run: + g = aggregate_per_category(gpu_run["data"], "total_query_ms") + gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"]) + if cpu_run: + c_ = aggregate_per_category(cpu_run["data"], "total_query_ms") + cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"]) + # ratio + ratio = "" + if gpu_run and cpu_run: + gov = aggregate_overall(gpu_run["data"], 
"total_query_ms")["median"] + cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"] + if gov: + ratio = f"{cov / gov:.2f}×" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") + md.append("") + + # ─────────── TTFT detail ─────────── + md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n") + md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None + ratio = f"{cv / gv:.1f}×" if gv and cv else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + md.append("") + + # ─────────── Decode detail ─────────── + md.append("## Decode (ms, median) — first token to last token\n") + md.append("Decode time mostly tracks output length, not k or doc content. 
Variation across k reflects ") + md.append("the model writing *longer answers* when given more context (more material to draw on).") + md.append("") + md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None + ratio = f"{cv / gv:.2f}×" if gv and cv else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + md.append("") + + # ─────────── p95 totals ─────────── + md.append("## p95 total query latency (s) — tail-latency view\n") + md.append(f"| k | GPU p95 | CPU p95 |") + md.append(f"|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + gv = aggregate_overall(gpu_run["data"], "total_query_ms")["p95"] if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "total_query_ms")["p95"] if cpu_run else None + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |") + md.append("") + + # ─────────── Errors / context limit ─────────── + md.append("## Errors and the 4096-token context wall\n") + md.append(f"| k | GPU errors / 54 | CPU errors / 54 |") + md.append(f"|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None + ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") + md.append("") + md.append("At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. 
Exceeding the maximum ") + md.append("number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B ") + md.append("`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped ") + md.append("for the same reason (would hit identical limit).") + md.append("") + + # ─────────── Wall-clock comparison ─────────── + md.append("## Wall-clock comparison\n") + md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |") + md.append("|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None + cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None + gw_s = f"{gw:.1f}" if gw else "—" + cw_s = f"{cw:.1f}" if cw else "—" + ratio = f"{cw / gw:.2f}×" if gw and cw else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |") + + # Findings / interpretation + md.append("") + md.append("## Key findings\n") + md.append("") + md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite") + md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ") + md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ") + md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.") + md.append("") + md.append("### 2. The model's 4096-token context window is the binding ceiling at high k") + md.append("k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens ") + md.append("with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this ") + md.append("`.litertlm` artifact. 
Latency is *not* the constraint at the upper end; the model's context window is. ") + md.append("CPU k=20 was skipped — same model, same limit.") + md.append("") + md.append("### 3. Latency is not the binding factor on GPU below k=15") + md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ") + md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ") + md.append("not by what fits in the latency budget.") + md.append("") + md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow") + md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ") + md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ") + md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ") + md.append("or **k ≤ 1** if you want sub-40s p95.") + md.append("") + md.append("### 5. Decode time is content-driven, not k-driven") + md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ") + md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ") + md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ") + md.append("not compute-bound on this hardware.") + md.append("") + md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3") + md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ") + md.append("CPU ~3,500–5,000 µs/char. 
The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ") + md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.") + md.append("") + + # File inventory + md.append("## Data inventory (per `(backend, k)`)\n") + md.append("| Backend | k | File | Wall (min) | Runs | Errors |") + md.append("|---|---:|---|---:|---:|---:|") + for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])): + r = matrix[(b, k)] + wall = r["data"]["total_benchmark_time_ms"] / 60000 + n = len(r["data"]["results"]) + e = sum(1 for x in r["data"]["results"] if x.get("error")) + label = "0 (no-RAG)" if k == 0 else str(k) + md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") + md.append("") + md.append("---") + md.append("") + md.append("_Source benchmark JSONs live in `evaluation/latency_results/`. ") + md.append("Aggregation script: `evaluation/aggregate_k_sweep.py`._") + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text("\n".join(md) + "\n") + print(f"Report written to: {out_path}") + + +def main() -> int: + runs = load_runs() + print(f"Loaded {len(runs)} canonical runs") + out = Path(__file__).resolve().parent / "reports" / "latency_report_v2.md" + write_report(runs, out) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md new file mode 100644 index 0000000..80a026a --- /dev/null +++ b/evaluation/reports/latency_report_v2.md @@ -0,0 +1,171 @@ +# MAM-AI On-Device Latency Sweep — GPU vs CPU + +_Generated: 2026-05-15T04:35:52_ + + +## Device & stack + +- **Device**: OnePlus OPD2413 (SM8750P) — Android 15 +- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) +- **LiteRT-LM**: 0.11.0 +- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU +- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000 + +## Methodology + +Per backend × k 
configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. +Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs +for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives +screen-off and device-lock; OPPO Hans whitelist set manually. + +- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token. +- `decode` is first-token to last-token. +- `total_query` is everything: `retrieval + TTFT + decode`. +- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`). + +## Headline — Median total query latency (seconds) + +| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 12.9 / 15.6 / 16.1 | 27.2 / 26.9 / 29.8 | 1.94× | +| 1 | 561 | 13.1 / 12.6 / 17.3 | 29.3 / 31.9 / 30.3 | 2.14× | +| 3 | 2098 | 18.6 / 18.6 / 21.0 | 37.3 / 44.5 / 42.5 | 2.24× | +| 5 | 3547 | 18.2 / 20.0 / 21.4 | 54.8 / 60.7 / 63.0 | 3.07× | +| 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× | +| 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× | +| 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× | +| 20 | 14520 | 23.9 / 20.5 / 18.5 | — | | + +## TTFT (ms, median) — prefill cost grows with retrieved-doc content + +| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 962 | 12633 | 13.1× | +| 1 | 561 | 954 | 12649 | 13.3× | +| 3 | 2098 | 989 | 18356 | 18.6× | +| 5 | 3547 | 1884 | 36424 | 19.3× | +| 7 | 5139 | 1920 | 36444 | 19.0× | +| 10 | 7482 | 2523 | 40013 | 15.9× | +| 15 | 11297 | 3457 | 54748 | 15.8× | +| 20 | 14520 | 3986 | — | | + +## Decode (ms, median) — first token to last token + +Decode time mostly tracks output length, not k or doc content. Variation across k reflects +the model writing *longer answers* when given more context (more material to draw on). 
+ +| k | GPU decode | CPU decode | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 13470 | 15345 | 1.14× | +| 1 | 11415 | 13961 | 1.22× | +| 3 | 16364 | 19110 | 1.17× | +| 5 | 15929 | 21645 | 1.36× | +| 7 | 17215 | 23473 | 1.36× | +| 10 | 18118 | 21699 | 1.20× | +| 15 | 16820 | 22497 | 1.34× | +| 20 | 14688 | — | | + +## p95 total query latency (s) — tail-latency view + +| k | GPU p95 | CPU p95 | +|---:|---:|---:| +| **0 (no-RAG)** | 26.1 | 38.4 | +| 1 | 26.1 | 37.1 | +| 3 | 30.2 | 64.3 | +| 5 | 30.7 | 74.6 | +| 7 | 35.1 | 81.7 | +| 10 | 29.0 | 84.5 | +| 15 | 30.6 | 112.6 | +| 20 | 35.3 | — | + +## Errors and the 4096-token context wall + +| k | GPU errors / 54 | CPU errors / 54 | +|---:|---:|---:| +| **0 (no-RAG)** | 0 | 0 | +| 1 | 0 | 0 | +| 3 | 0 | 0 | +| 5 | 0 | 0 | +| 7 | 0 | 0 | +| 10 | 0 | 0 | +| 15 | 0 | 0 | +| 20 | 24 | — | + +At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum +number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B +`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped +for the same reason (would hit identical limit). + +## Wall-clock comparison + +| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× | +| 1 | 23.0 | 38.7 | 1.68× | +| 3 | 27.3 | 50.2 | 1.84× | +| 5 | 28.2 | 63.0 | 2.23× | +| 7 | 30.0 | 66.5 | 2.22× | +| 10 | 29.1 | 73.2 | 2.51× | +| 15 | 32.4 | 90.8 | 2.80× | +| 20 | 22.8 | — | | + +## Key findings + + +### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite +GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. +That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), +so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency. + +### 2. 
The model's 4096-token context window is the binding ceiling at high k +k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens +with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this +`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. +CPU k=20 was skipped — same model, same limit. + +### 3. Latency is not the binding factor on GPU below k=15 +GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. +Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), +not by what fits in the latency budget. + +### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow +CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. +p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't +available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, +or **k ≤ 1** if you want sub-40s p95. + +### 5. Decode time is content-driven, not k-driven +Decode time tracks output length. As k grows, the model writes *longer* responses — likely because +more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. +Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, +not compute-bound on this hardware. + +### 6. TTFT scales linearly with retrieved-doc content past k=3 +On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, +CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting +the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both. 
+ +## Data inventory (per `(backend, k)`) + +| Backend | k | File | Wall (min) | Runs | Errors | +|---|---:|---|---:|---:|---:| +| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | +| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | +| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | +| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | +| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | +| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | +| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | +| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | +| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | +| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | +| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | +| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | +| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | +| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | +| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | + +--- + +_Source benchmark JSONs live in `evaluation/latency_results/`. 
+Aggregation script: `evaluation/aggregate_k_sweep.py`._ From 4daf6266453c21579e05858e23df044aac7ee00c Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 07:40:55 +0800 Subject: [PATCH 10/30] =?UTF-8?q?analysis:=20add=20CPU=20k=3D20=20?= =?UTF-8?q?=E2=80=94=20confirms=204096-token=20wall=20is=20backend-invaria?= =?UTF-8?q?nt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU k=20 reproduces the GPU k=20 failure pattern exactly: - 24/54 errors on both backends (44% failure rate) - Identical 8 queries fail on both (long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05) - Same 24 (query × rep) pairs across both runs This is direct evidence that the 4096-token context cap is a property of the .litertlm model artifact itself — not a runtime config, not a backend choice. Strengthens finding #2 from "model is the ceiling, GPU specifically hits it" to "model is the ceiling, both backends hit it identically." Successful CPU k=20 runs: TTFT 65–73 s, total 89–96 s — well past any deployment budget even when the request fits the window. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 23 +++++++++------ evaluation/reports/latency_report_v2.md | 38 +++++++++++++++---------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 3aa18d4..e014225 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -242,10 +242,15 @@ def write_report(runs: list[dict], out_path: Path) -> None: label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") md.append("") - md.append("At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum ") - md.append("number of tokens allowed: …>= 4096` errors. 
The 4096-token cap is baked into the Gemma 4 E4B ") - md.append("`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped ") - md.append("for the same reason (would hit identical limit).") + md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ") + md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ") + md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ") + md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ") + md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ") + md.append("The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter.") + md.append("") + md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ") + md.append("deployment budget at this depth even when the request fits in the context window.") md.append("") # ─────────── Wall-clock comparison ─────────── @@ -273,10 +278,12 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.") md.append("") md.append("### 2. The model's 4096-token context window is the binding ceiling at high k") - md.append("k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens ") - md.append("with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this ") - md.append("`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. ") - md.append("CPU k=20 was skipped — same model, same limit.") + md.append("k=15 works cleanly (54/54 on both GPU and CPU). 
k=20 fails identically on **both backends** — ") + md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ") + md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ") + md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ") + md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ") + md.append("Latency is *not* the constraint at the upper end; the model's context window is.") md.append("") md.append("### 3. Latency is not the binding factor on GPU below k=15") md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ") diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 80a026a..61661c5 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — GPU vs CPU -_Generated: 2026-05-15T04:35:52_ +_Generated: 2026-05-15T07:40:25_ ## Device & stack @@ -34,7 +34,7 @@ screen-off and device-lock; OPPO Hans whitelist set manually. | 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× | | 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× | | 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× | -| 20 | 14520 | 23.9 / 20.5 / 18.5 | — | | +| 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× | ## TTFT (ms, median) — prefill cost grows with retrieved-doc content @@ -47,7 +47,7 @@ screen-off and device-lock; OPPO Hans whitelist set manually. 
| 7 | 5139 | 1920 | 36444 | 19.0× | | 10 | 7482 | 2523 | 40013 | 15.9× | | 15 | 11297 | 3457 | 54748 | 15.8× | -| 20 | 14520 | 3986 | — | | +| 20 | 14520 | 3986 | 72881 | 18.3× | ## Decode (ms, median) — first token to last token @@ -63,7 +63,7 @@ the model writing *longer answers* when given more context (more material to dra | 7 | 17215 | 23473 | 1.36× | | 10 | 18118 | 21699 | 1.20× | | 15 | 16820 | 22497 | 1.34× | -| 20 | 14688 | — | | +| 20 | 14688 | 22634 | 1.54× | ## p95 total query latency (s) — tail-latency view @@ -76,7 +76,7 @@ the model writing *longer answers* when given more context (more material to dra | 7 | 35.1 | 81.7 | | 10 | 29.0 | 84.5 | | 15 | 30.6 | 112.6 | -| 20 | 35.3 | — | +| 20 | 35.3 | 104.9 | ## Errors and the 4096-token context wall @@ -89,12 +89,17 @@ the model writing *longer answers* when given more context (more material to dra | 7 | 0 | 0 | | 10 | 0 | 0 | | 15 | 0 | 0 | -| 20 | 24 | — | +| 20 | 24 | 24 | -At k=20 on GPU, 24 of 54 runs failed with `Input token ids are too long. Exceeding the maximum -number of tokens allowed: …>= 4096` errors. The 4096-token cap is baked into the Gemma 4 E4B -`.litertlm` export — it's a model-artifact property, not a runtime config. CPU k=20 was skipped -for the same reason (would hit identical limit). +At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. +Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both +backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — +the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of +the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. +The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter. 
+ +Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any +deployment budget at this depth even when the request fits in the context window. ## Wall-clock comparison @@ -107,7 +112,7 @@ for the same reason (would hit identical limit). | 7 | 30.0 | 66.5 | 2.22× | | 10 | 29.1 | 73.2 | 2.51× | | 15 | 32.4 | 90.8 | 2.80× | -| 20 | 22.8 | — | | +| 20 | 22.8 | 58.6 | 2.57× | ## Key findings @@ -118,10 +123,12 @@ That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invaria so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency. ### 2. The model's 4096-token context window is the binding ceiling at high k -k=15 works (54/54 on both GPU and CPU). k=20 fails on 44% of queries on GPU — input exceeds 4096 tokens -with chunks averaging ~200 tokens and system prompt + query ~500 tokens. **k_max ≈ 17–18** for this -`.litertlm` artifact. Latency is *not* the constraint at the upper end; the model's context window is. -CPU k=20 was skipped — same model, same limit. +k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — +the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. +Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives +the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, +not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. +Latency is *not* the constraint at the upper end; the model's context window is. ### 3. Latency is not the binding factor on GPU below k=15 GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
@@ -156,6 +163,7 @@ the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays | CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | | CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | | CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | +| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | | GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | | GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | | GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | From 2a592d2836e94457addf333646114f5b85b9f19e Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 07:57:58 +0800 Subject: [PATCH 11/30] review: address Copilot feedback on PR #57 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real fixes: - BenchmarkForegroundService: shut down the single-thread executor in onDestroy. Without this its worker thread keeps the :benchmark process alive after the service stops. - BenchmarkForegroundService: remove dead `pct` variable left over from the Activity-era ASCII progress bar. - AndroidManifest: stale comment said BenchmarkActivity holds the wake lock; updated to reflect the foreground-service refactor. - benchmark_latency.py: error out if both --no-retrieval and --rag-only are passed (previously they silently coexisted; on-device skipRetrieval won, but the result was confusing). - aggregate_k_sweep.py: * backend_of() now only overrides recorded="CPU" when the timestamp predates the metadata fix. Future GPU runs (which write backend="GPU" correctly) and future CPU runs are trusted as-is — fixes the silent mislabeling Copilot flagged. * Drop the May-2026-only glob (`benchmark_2026051*`) — use `benchmark_*.json` and rely on the schema/length filters. * Use `with open(...)` context manager — avoid file-handle leak. * Rename `avg_doc_chars` → `median_doc_chars` (function used median despite the name). 
* Remove dead loop `for col, run, key in [...]: pass`. * Update module docstring to describe the new backfill-only logic. Verified: Kotlin still compiles (flutter build apk --release succeeds); aggregate script still loads all 16 canonical runs and regenerates the same report; the new mutual-exclusion error fires when both Python flags are passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/android/app/src/main/AndroidManifest.xml | 8 ++-- .../example/app/BenchmarkForegroundService.kt | 6 ++- evaluation/aggregate_k_sweep.py | 41 ++++++++++++------- evaluation/benchmark_latency.py | 3 ++ evaluation/reports/latency_report_v2.md | 2 +- 5 files changed, 39 insertions(+), 21 deletions(-) diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml index bb35442..c805436 100644 --- a/app/android/app/src/main/AndroidManifest.xml +++ b/app/android/app/src/main/AndroidManifest.xml @@ -4,9 +4,11 @@ - + str: - """Override stale GPU-era "CPU" labels using the timestamp.""" - if timestamp < THRESHOLD_TS: + """Trust the recorded backend, but backfill pre-fix GPU runs. + + Pre-fix files always say "CPU" in metadata even when GPU was active. + We override only when (a) the recorded value is "CPU" AND (b) the + timestamp predates the metadata fix. New GPU runs (which write + backend="GPU" correctly) and any CPU run from any time are trusted + as-is. 
+ """ + if recorded == "CPU" and timestamp < THRESHOLD_TS: return "GPU" return recorded @@ -34,12 +45,13 @@ def backend_of(timestamp: str, recorded: str) -> str: def load_runs() -> list[dict]: files = sorted(glob.glob(os.path.join( os.path.dirname(os.path.abspath(__file__)), - "latency_results", "benchmark_2026051*.json", + "latency_results", "benchmark_*.json", ))) runs = [] for f in files: try: - d = json.load(open(f)) + with open(f) as fp: + d = json.load(fp) except (json.JSONDecodeError, OSError): continue if "config" not in d or "results" not in d: @@ -95,7 +107,9 @@ def aggregate_overall(d: dict, key: str) -> dict: } -def avg_doc_chars(d: dict) -> int: +def median_doc_chars(d: dict) -> int: + """Median retrieved_total_chars across successful runs (the table column + is labeled 'doc_chars med', so this is the median by definition).""" vs = [r.get("retrieved_total_chars", 0) for r in d["results"] if not r.get("error")] return int(statistics.median(vs)) if vs else 0 @@ -162,12 +176,9 @@ def write_report(runs: list[dict], out_path: Path) -> None: gpu_run = matrix.get(("GPU", k)) cpu_run = matrix.get(("CPU", k)) # doc chars: take from GPU if available, else CPU - doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 gpu_cells = "—" cpu_cells = "—" - ratios = [] - for col, run, key in [("gpu", gpu_run, "gpu"), ("cpu", cpu_run, "cpu")]: - pass if gpu_run: g = aggregate_per_category(gpu_run["data"], "total_query_ms") gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"]) @@ -192,7 +203,7 @@ def write_report(runs: list[dict], out_path: Path) -> None: for k in all_ks: gpu_run = matrix.get(("GPU", k)) cpu_run = matrix.get(("CPU", k)) - doc_chars = avg_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + doc_chars = 
median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None ratio = f"{cv / gv:.1f}×" if gv and cv else "" diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index 30dec4c..e5c02a9 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -488,6 +488,9 @@ def main(): help="Timeout in seconds (default: 7200)") args = parser.parse_args() + if args.no_retrieval and args.rag_only: + parser.error("--no-retrieval and --rag-only are mutually exclusive") + print("=" * 60) print("MAM-AI On-Device Latency Benchmark") print("=" * 60) diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 61661c5..25356b3 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — GPU vs CPU -_Generated: 2026-05-15T07:40:25_ +_Generated: 2026-05-15T07:56:55_ ## Device & stack From 659d3f0d51024a1c04d29d92452f856933b7b3c9 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 08:13:17 +0800 Subject: [PATCH 12/30] review: explicit None checks in aggregate report formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace truthy `if gw else "—"` style with `if gw is not None else "—"`, and add `> 0` guards on division ratios. Defensive against a hypothetical 0-valued median from a corrupted/aborted run JSON, which the truthy form would have silently rendered as "—" instead of "0.0". Affects four spots: TTFT ratio, decode ratio, wall-clock ratio + formatting, and the headline-table ratio. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 252a463..11307a3 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -190,7 +190,7 @@ def write_report(runs: list[dict], out_path: Path) -> None: if gpu_run and cpu_run: gov = aggregate_overall(gpu_run["data"], "total_query_ms")["median"] cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"] - if gov: + if gov is not None and gov > 0: ratio = f"{cov / gov:.2f}×" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") @@ -206,7 +206,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None - ratio = f"{cv / gv:.1f}×" if gv and cv else "" + # Explicit None checks; also guard against div-by-zero on a 0 median. 
+ ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else "" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") @@ -223,7 +224,7 @@ def write_report(runs: list[dict], out_path: Path) -> None: cpu_run = matrix.get(("CPU", k)) gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None - ratio = f"{cv / gv:.2f}×" if gv and cv else "" + ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else "" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") md.append("") @@ -273,9 +274,9 @@ def write_report(runs: list[dict], out_path: Path) -> None: cpu_run = matrix.get(("CPU", k)) gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None - gw_s = f"{gw:.1f}" if gw else "—" - cw_s = f"{cw:.1f}" if cw else "—" - ratio = f"{cw / gw:.2f}×" if gw and cw else "" + gw_s = f"{gw:.1f}" if gw is not None else "—" + cw_s = f"{cw:.1f}" if cw is not None else "—" + ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else "" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |") From 497d2fcc0a5ffb2e652a909a8a75d68ab73b2277 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 08:13:59 +0800 Subject: [PATCH 13/30] review: replace THRESHOLD_TS heuristic with explicit allowlist The timestamp threshold (`if timestamp < "20260514T2130": return "GPU"`) silently rewrites any pre-threshold CPU JSON as GPU. Anyone running this aggregator with their own historical genuine-CPU runs in latency_results/ would have those mislabeled as GPU and potentially double-counted via the "most successful entries" tiebreaker in write_report. 
Replace with `PRE_FIX_GPU_FILES`: a frozenset of the exact 8 filenames known to predate the metadata fix in commit ef96538. Any file not in the allowlist uses its recorded backend value. Anyone else's historical files are unaffected. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 11307a3..f34acfe 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -7,8 +7,9 @@ Notes on backend identification: post-fix benchmark JSONs (commit ef96538 onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep -JSONs hard-code `backend="CPU"`; we backfill those using a one-time timestamp -threshold (see `backend_of`). Future runs of any backend are unaffected. +JSONs hard-code `backend="CPU"` even though they were measured on GPU; we +backfill those using an explicit filename allowlist (see `backend_of`). +Future runs of any backend are unaffected. """ from __future__ import annotations @@ -20,24 +21,27 @@ from collections import defaultdict from pathlib import Path -# One-time backfill: GPU sweep JSONs from before commit ef96538 ("fix(benchmark): -# record actual backend (GPU/CPU) in config metadata") have config.backend="CPU" -# hard-coded. For just those files, the timestamp identifies them as the GPU -# sweep we ran before the rebuild at ~21:34 on 2026-05-14. Files with a -# timestamp at or after the threshold have correct metadata and are trusted. -THRESHOLD_TS = "20260514T2130" - - -def backend_of(timestamp: str, recorded: str) -> str: - """Trust the recorded backend, but backfill pre-fix GPU runs. - - Pre-fix files always say "CPU" in metadata even when GPU was active. - We override only when (a) the recorded value is "CPU" AND (b) the - timestamp predates the metadata fix. 
New GPU runs (which write - backend="GPU" correctly) and any CPU run from any time are trusted - as-is. - """ - if recorded == "CPU" and timestamp < THRESHOLD_TS: +# Backfill for the specific historical GPU sweep files that predate the +# metadata-recording fix in commit ef96538. Those JSONs hard-code +# config.backend="CPU" even though they were measured on GPU. We use an +# explicit filename allowlist (rather than a timestamp threshold) so the +# rewrite cannot accidentally fire on anyone else's pre-threshold *genuine +# CPU* JSONs that happen to share latency_results/. +PRE_FIX_GPU_FILES = frozenset({ + "benchmark_20260514T174502_k1.json", + "benchmark_20260514T180830_k3.json", + "benchmark_20260514T183604_k5.json", + "benchmark_20260514T190438_k7.json", + "benchmark_20260514T193453_k10.json", + "benchmark_20260514T200414_k15.json", + "benchmark_20260514T203653_k20.json", + "benchmark_20260514T210522.json", +}) + + +def backend_of(filename: str, recorded: str) -> str: + """Trust the recorded backend except for the listed pre-fix GPU files.""" + if filename in PRE_FIX_GPU_FILES: return "GPU" return recorded @@ -64,7 +68,7 @@ def load_runs() -> list[dict]: k_label = 0 if skip_retrieval else (k_override if k_override is not None else None) if k_label is None: continue - backend = backend_of(ts, d["config"].get("backend", "CPU")) + backend = backend_of(os.path.basename(f), d["config"].get("backend", "CPU")) runs.append({ "file": os.path.basename(f), "timestamp": ts, From f372f8879d0abc401a138644dd3013fd697784ae Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 08:25:47 +0800 Subject: [PATCH 14/30] review: document :benchmark process model in service KDoc The service runs in android:process=":benchmark", separate from the main app process. RagPipeline(application) here therefore constructs an entirely fresh pipeline (Gecko + SQLite + LLM load) in that benchmark process, not the main app's. Worth documenting because: 1. 
Application.onCreate() will run a second time when the benchmark process spawns. 2. If the main app has the LLM loaded simultaneously, two LLM instances may briefly contend for GPU/memory during init. Add a "Process model" section to the class KDoc explaining both. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/example/app/BenchmarkForegroundService.kt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index e6e5401..4056d6f 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -40,6 +40,20 @@ import java.util.concurrent.Executors * Launched via [BenchmarkActivity] which forwards Intent extras from `am * start`. All benchmark logic lives here; the Activity is a thin shim. * + * **Process model.** Both this service and [BenchmarkActivity] declare + * `android:process=":benchmark"` in the manifest, so they run in a + * separate process from the main MAM-AI app. That process is fresh on + * each `am start`: this service constructs its own [RagPipeline] + * (Gecko + SQLite + LLM load) on entry, independent of any pipeline + * already loaded in the main app process. Two consequences worth + * knowing about: + * + * 1. The application's `Application` subclass initializes once per + * process — anything in your custom Application.onCreate() will + * run a second time when the benchmark process spawns. + * 2. If the main app is also running with the LLM loaded, two LLM + * instances may briefly contend for GPU/memory during init. 
+ * * Intent extras (forwarded from the Activity): * repeats:Int Repetitions per query * cooldown_ms:Long Sleep between runs From e205fdf078d11854ed7b68b24c36c63f8caad10d Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 08:26:14 +0800 Subject: [PATCH 15/30] review: document retrieve_k=-1 sentinel and other intent-extra defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "-1 = use config default" semantic for retrieve_k is non-obvious and wasn't documented anywhere. Update both KDoc blocks (BenchmarkActivity and BenchmarkForegroundService) to spell out: - retrieve_k: pass any value >= 0 to override; pass -1 (or omit) to use runtime_config.json's value. The activity normalises -1 → null before forwarding to the service. - repeats default 3, cooldown_ms default 5000 (were missing). - skip_retrieval and rag_only are mutually exclusive; skip_retrieval wins if both are set. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkActivity.kt | 11 ++++++++--- .../com/example/app/BenchmarkForegroundService.kt | 13 +++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt index 887f841..f94cc1b 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt @@ -22,10 +22,15 @@ import android.util.Log * * Optional extras: * --ez skip_retrieval true Skip RAG retrieval (generation only) - * --ez rag_only true Skip the No-RAG mode (k-sweep helper) + * --ez rag_only true Skip the No-RAG mode (k-sweep helper). + * Mutually exclusive with skip_retrieval — + * if both are set, skip_retrieval wins. 
* --es query_filter short Filter by category or specific query ID - * --ei retrieve_k N Override retrieval top_k for this session - * (default: use runtime_config.json's value). + * --ei retrieve_k N Override retrieval top_k for this session. + * Pass any value >= 0 to override; pass -1 + * (or omit) to use runtime_config.json's + * value. The activity normalises -1 to null + * before forwarding to the service. */ class BenchmarkActivity : Activity() { diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index 4056d6f..9a37df3 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -55,12 +55,17 @@ import java.util.concurrent.Executors * instances may briefly contend for GPU/memory during init. * * Intent extras (forwarded from the Activity): - * repeats:Int Repetitions per query - * cooldown_ms:Long Sleep between runs + * repeats:Int Repetitions per query (default 3) + * cooldown_ms:Long Sleep between runs in ms (default 5000) * skip_retrieval:Boolean Run No-RAG mode only * rag_only:Boolean Run RAG mode only - * query_filter:String? Category or query ID filter - * retrieve_k:Int (>=0) Override retrieval top_k; -1 = use config + * (skip_retrieval and rag_only are mutually + * exclusive; skip_retrieval wins if both set) + * query_filter:String? Category or specific query ID filter + * retrieve_k:Int Override retrieval top_k for this session. + * Pass -1 (or omit) to use the value from + * runtime_config.json. Any value >= 0 takes + * effect for every query in this run. 
*/ class BenchmarkForegroundService : Service() { From 574601c99adc2c8e1a8fca5e69ae8637ef008faf Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 08:26:37 +0800 Subject: [PATCH 16/30] review: mark BenchmarkForegroundService dataSync type as dev-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit foregroundServiceType="dataSync" is documented for "transferring data between a device and the cloud or other peers" — clearly a misuse for an on-device latency benchmark. Google Play's FGS-type enforcement would reject this on submission. Acceptable here because BenchmarkForegroundService is only launched via `adb shell am start` for in-house benchmarking and never appears in any user-facing flow. Add an explicit DEV-ONLY comment documenting: - the type is technically wrong, - the correct type to switch to if we ever ship benchmark capabilities (specialUse + the FOREGROUND_SERVICE_SPECIAL_USE permission + PROPERTY_SPECIAL_USE_FGS_SUBTYPE), - this declaration should be stripped from any Play Store build. No runtime change; only a comment to prevent surprises later. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- app/android/app/src/main/AndroidManifest.xml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml index c805436..d19101a 100644 --- a/app/android/app/src/main/AndroidManifest.xml +++ b/app/android/app/src/main/AndroidManifest.xml @@ -54,7 +54,17 @@ + in its own :benchmark process to keep the main app isolated. + DEV-ONLY: foregroundServiceType="dataSync" is technically a + misuse here (no actual data sync) — Google Play would reject + this declaration. 
Acceptable because BenchmarkForegroundService + is launched only via `adb shell am start` for in-house + benchmarking; it never appears in any user-facing flow and + this manifest entry should be stripped from any Play Store + build. If we ever need to ship benchmark capabilities, switch + to foregroundServiceType="specialUse" and add the corresponding + android.permission.FOREGROUND_SERVICE_SPECIAL_USE permission + plus the PROPERTY_SPECIAL_USE_FGS_SUBTYPE property. --> Date: Fri, 15 May 2026 08:28:14 +0800 Subject: [PATCH 17/30] review: guard aggregate_overall subscripts against empty dicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit aggregate_overall returns {} when every run in a (backend, k) cell errored out (all results have an error field). Callers then subscript with ["median"] / ["p95"] and crash with KeyError. Today this only happens partially — e.g. GPU k=20 has 24/54 errors but 30 successful runs, so the dict is populated. But a future sweep where all runs error (entirely possible at higher k once the 4096-token wall is hit broadly, or if the LLM dies during init) would crash the report generation. Switch all four subscript sites in write_report() to .get("median") and .get("p95") so an empty dict propagates as None, which fmt_ms / fmt_s already render as "—". Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index f34acfe..6d43656 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -192,9 +192,9 @@ def write_report(runs: list[dict], out_path: Path) -> None: # ratio ratio = "" if gpu_run and cpu_run: - gov = aggregate_overall(gpu_run["data"], "total_query_ms")["median"] - cov = aggregate_overall(cpu_run["data"], "total_query_ms")["median"] - if gov is not None and gov > 0: + gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median") + cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median") + if gov is not None and cov is not None and gov > 0: ratio = f"{cov / gov:.2f}×" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") @@ -208,8 +208,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: gpu_run = matrix.get(("GPU", k)) cpu_run = matrix.get(("CPU", k)) doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 - gv = aggregate_overall(gpu_run["data"], "ttft_ms")["median"] if gpu_run else None - cv = aggregate_overall(cpu_run["data"], "ttft_ms")["median"] if cpu_run else None + gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None # Explicit None checks; also guard against div-by-zero on a 0 median. 
ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else "" label = "**0 (no-RAG)**" if k == 0 else str(k) @@ -226,8 +226,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: for k in all_ks: gpu_run = matrix.get(("GPU", k)) cpu_run = matrix.get(("CPU", k)) - gv = aggregate_overall(gpu_run["data"], "decode_ms")["median"] if gpu_run else None - cv = aggregate_overall(cpu_run["data"], "decode_ms")["median"] if cpu_run else None + gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else "" label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") @@ -240,8 +240,8 @@ def write_report(runs: list[dict], out_path: Path) -> None: for k in all_ks: gpu_run = matrix.get(("GPU", k)) cpu_run = matrix.get(("CPU", k)) - gv = aggregate_overall(gpu_run["data"], "total_query_ms")["p95"] if gpu_run else None - cv = aggregate_overall(cpu_run["data"], "total_query_ms")["p95"] if cpu_run else None + gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None label = "**0 (no-RAG)**" if k == 0 else str(k) md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |") md.append("") From e29443ec021d127827eda5da34a4a62385d654ac Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:22:29 +0800 Subject: [PATCH 18/30] =?UTF-8?q?review:=20fix=20wrong=20survivor=20count?= =?UTF-8?q?=20in=20k=3D20=20narrative=20(8=20=E2=86=92=2010)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Report claimed "The 8 surviving queries on either side" — but the math doesn't check out: 18 queries × 3 reps = 54 runs, 24 errored = 8 unique queries failed × 3 reps. 
So 18 − 8 = 10 unique queries survived, not 8. Corrected to "The other 10 queries (10 × 3 reps = 30 successful runs)". Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 2 +- evaluation/reports/latency_report_v2.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 6d43656..32c39df 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -263,7 +263,7 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ") md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ") md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ") - md.append("The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter.") + md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.") md.append("") md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ") md.append("deployment budget at this depth even when the request fits in the context window.") diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 25356b3..3edee07 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — GPU vs CPU -_Generated: 2026-05-15T07:56:55_ +_Generated: 2026-05-15T10:22:29_ ## Device & stack @@ -96,7 +96,7 @@ Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — the same 24 (query × rep) pairs. 
This is direct evidence that the 4096-token cap is a property of the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. -The 8 surviving queries on either side were the ones whose retrieved chunks happened to be shorter. +The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter. Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any deployment budget at this depth even when the request fits in the context window. From 57722db6b15f5350ff4a757e56e1e90410b93086 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:23:03 +0800 Subject: [PATCH 19/30] review: reject --retrieve-k 0 (footgun) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The service normalises -1 to null but treats any value >= 0 as an explicit override, so passing --retrieve-k 0 would silently call RetrievalConfig.create(0, ...) — a confusing footgun. If you want to disable retrieval entirely, --no-retrieval is the proper flag. argparse now rejects --retrieve-k < 1 with a clear error pointing to --no-retrieval. Negative values were already filtered by the service's takeIf { it >= 0 } but a 0 was slipping through. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/benchmark_latency.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index e5c02a9..7486f96 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -490,6 +490,11 @@ def main(): if args.no_retrieval and args.rag_only: parser.error("--no-retrieval and --rag-only are mutually exclusive") + if args.retrieve_k is not None and args.retrieve_k < 1: + # The service treats any value >= 0 as an explicit override. 
Passing 0 + # would call RetrievalConfig.create(0, …), which is a silent footgun + # — use --no-retrieval if you actually want to disable retrieval. + parser.error("--retrieve-k must be >= 1; use --no-retrieval to disable retrieval entirely") print("=" * 60) print("MAM-AI On-Device Latency Benchmark") From 42326ee491cb486fcb57b4fd2698ed9b64d9cf5e Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:23:52 +0800 Subject: [PATCH 20/30] review: acquire wake lock in onStartCommand after startForeground MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the PARTIAL_WAKE_LOCK was acquired in onCreate(), before onStartCommand calls startForegroundCompat. Two problems with that: 1. If the system creates the service but onStartCommand is delayed or never invoked (bind-only path, framework deferral), the wake lock is held without a foreground notification — and Android 12+ can trip the foreground-service-start-while-in-background restriction in that state. 2. Even on the normal path, there is a brief window where the CPU is pinned awake without the user-visible notification that justifies it. Move the wake-lock acquisition into onStartCommand, immediately AFTER startForegroundCompat. The lock is now strictly paired with the foreground notification's lifetime. Guarded with `wakeLock == null` so duplicate onStartCommand invocations on the same service instance (e.g. a second `am start` while the service is still running — START_NOT_STICKY does not by itself cause restarts) don't try to re-acquire. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../example/app/BenchmarkForegroundService.kt | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index 9a37df3..d341c58 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -91,24 +91,32 @@ class BenchmarkForegroundService : Service() { override fun onCreate() { super.onCreate() ensureChannel(this) + } + + override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { + // Promote to foreground FIRST so the wake lock is always paired with + // a visible notification (Android 12+ enforces this pairing for new + // foreground-service starts). Acquiring the wake lock in onCreate + // before startForeground would briefly hold the CPU awake without a + // notification — and would leak if onStartCommand never ran (e.g. + // bind-only path or framework deferral). + startForegroundCompat("MAM-AI benchmark starting…", -1, 0) // PARTIAL_WAKE_LOCK lets the CPU keep running through screen-off. // Vendor power managers (OPPO ColorOS, Xiaomi MIUI, etc.) respect // wake locks held by foreground services — they aggressively // release locks held by background activities. 
- val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager - wakeLock = powerManager.newWakeLock( - PowerManager.PARTIAL_WAKE_LOCK, - "mam-ai:benchmark" - ).apply { - setReferenceCounted(false) - acquire(6L * 60L * 60L * 1000L) // 6 h failsafe + if (wakeLock == null) { + val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager + wakeLock = powerManager.newWakeLock( + PowerManager.PARTIAL_WAKE_LOCK, + "mam-ai:benchmark" + ).apply { + setReferenceCounted(false) + acquire(6L * 60L * 60L * 1000L) // 6 h failsafe + } + Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired") } - Log.w(BENCH_TAG, "[BENCHMARK] Service onCreate, PARTIAL_WAKE_LOCK acquired") - } - - override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { - startForegroundCompat("MAM-AI benchmark starting…", -1, 0) val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS From 25a1a42411e4b45faa8b7feef537618c52e5f6c2 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:24:32 +0800 Subject: [PATCH 21/30] review: executor.shutdownNow() + brief await to avoid race in onDestroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The single-thread executor is what ferries pipeline calls off the coroutine dispatchers (LiteRT-LM generation, Gecko retrieval), and scope.cancel() does NOT propagate cancellation into those blocking native calls. A plain executor.shutdown() then returns immediately and leaves the worker thread alive, keeping the :benchmark process running until generation finishes naturally — stopForeground might run with the worker still busy. Use shutdownNow() to interrupt the worker, plus a brief 2 s awaitTermination() so a worker that's tearing down cleanly gets a chance to do so. If it doesn't, the OS will reclaim the process eventually. 
Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../example/app/BenchmarkForegroundService.kt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index d341c58..b880d94 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -151,9 +151,21 @@ class BenchmarkForegroundService : Service() { wakeLock = null scope.cancel() // Shut down the single-thread executor that ferries pipeline calls off - // the coroutine dispatchers. Otherwise its worker thread keeps the - // :benchmark process alive after the service stops. - executor.shutdown() + // the coroutine dispatchers. We use shutdownNow() to interrupt the + // worker thread: scope.cancel() does not propagate cancellation into + // a blocking native call (e.g. mid-flight LiteRT-LM generation), + // and a plain shutdown() would return immediately and leave the + // thread running until the call finishes naturally — keeping the + // :benchmark process alive after stopForeground. + executor.shutdownNow() + // Brief best-effort await so we don't yank the rug if the worker is + // tearing down cleanly. If it doesn't finish in 2 s we move on; the + // OS will eventually kill the process anyway. 
+ try { + executor.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS) + } catch (_: InterruptedException) { + Thread.currentThread().interrupt() + } @Suppress("DEPRECATION") stopForeground(true) } From 2b5cb9c98f32005a0120a63e15b2f17a60a22d01 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:25:09 +0800 Subject: [PATCH 22/30] review: warn loudly when JSON has no recorded backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backend_of's fallback was `d["config"].get("backend", "CPU")`, which silently labels any backend-less JSON as "CPU". Pre-fix files are handled by the explicit allowlist; the silent default only fires on "unexpected" JSONs — which is exactly when a future regression in BenchmarkForegroundService (e.g. metadata accidentally dropped) would slip past us. Now: if config.backend is missing AND the file isn't on the pre-fix allowlist, print a warning to stderr explaining the assumption and defaulting to "CPU". Post-fix JSONs always carry the field, so this warning only fires when something is genuinely off. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 32c39df..229c3f1 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -18,6 +18,7 @@ import json import os import statistics +import sys from collections import defaultdict from pathlib import Path @@ -68,7 +69,22 @@ def load_runs() -> list[dict]: k_label = 0 if skip_retrieval else (k_override if k_override is not None else None) if k_label is None: continue - backend = backend_of(os.path.basename(f), d["config"].get("backend", "CPU")) + # The metadata fix in commit ef96538 ensures post-fix runs record + # config.backend. 
If it's missing, the JSON predates that fix — only + # safe if the filename is on the allowlist; otherwise warn loudly + # rather than silently defaulting (which would mask future GPU runs + # written by a regressed BenchmarkForegroundService). + recorded_backend = d["config"].get("backend") + if recorded_backend is None: + if os.path.basename(f) not in PRE_FIX_GPU_FILES: + print( + f"WARN: {os.path.basename(f)} has no config.backend " + "field and is not on the pre-fix allowlist; defaulting " + "to CPU. If this was actually a GPU run, fix the source.", + file=sys.stderr, + ) + recorded_backend = "CPU" + backend = backend_of(os.path.basename(f), recorded_backend) runs.append({ "file": os.path.basename(f), "timestamp": ts, From 291725c8c780743dd36904cca7c4f5f4529ab2ed Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:25:49 +0800 Subject: [PATCH 23/30] review: warn in logcat when both skip_retrieval and rag_only are set The Python wrapper rejects --no-retrieval + --rag-only via parser.error() before the launch ever fires. But a direct `am start --ez skip_retrieval true --ez rag_only true ...` bypasses Python entirely and silently runs in No-RAG mode (skipRetrieval wins) with no indication anything is off. Add a Log.w at the same priority as other [BENCHMARK] markers so the mismatch is visible in `adb logcat -s mam-ai-bench:W` output during debugging. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkForegroundService.kt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index b880d94..f971c2f 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -265,7 +265,13 @@ class BenchmarkForegroundService : Service() { return } - // skipRetrieval and ragOnly are mutually exclusive (skipRetrieval wins). + // skipRetrieval and ragOnly are mutually exclusive. The Python wrapper + // (benchmark_latency.py) rejects this combination upfront via + // parser.error(); a direct `am start` could still pass both, so log a + // visible warning in logcat instead of silently picking one. + if (skipRetrieval && ragOnly) { + Log.w(BENCH_TAG, "[BENCHMARK] WARNING: skip_retrieval AND rag_only both set; skip_retrieval wins (No-RAG only).") + } val retrievalModes = when { skipRetrieval -> listOf(false) ragOnly -> listOf(true) From f77effc0b301af3746cfc32fcf0e5bbf9ab26433 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:48:18 +0800 Subject: [PATCH 24/30] review: bump PARTIAL_WAKE_LOCK failsafe from 6h to 24h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 6 h ceiling could expire mid-sweep on long CPU runs — the full GPU + CPU k-sweep documented in latency_report_v2.md took ~7 h end-to-end, and a CPU-only sweep across k ∈ {1, 3, 5, 7, 10, 15} hit similar totals. When the lock auto-released, the OS could idle the CPU and the benchmark would silently stall (no failure log, just no progress). Bump to 24 h, with a comment that anything longer should switch to periodic re-acquire instead of bumping the constant further. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../kotlin/com/example/app/BenchmarkForegroundService.kt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index f971c2f..ddc0208 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -113,7 +113,12 @@ class BenchmarkForegroundService : Service() { "mam-ai:benchmark" ).apply { setReferenceCounted(false) - acquire(6L * 60L * 60L * 1000L) // 6 h failsafe + // 24 h failsafe. Long CPU sweeps (full series × repeats × all k) + // have already run ~7 h end-to-end; pushing to 24 h leaves + // plenty of slack so the lock can't silently expire mid-run. + // If we ever start running sweeps longer than this, switch + // to a periodic re-acquire instead of bumping further. + acquire(24L * 60L * 60L * 1000L) } Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired") } From 882e738ea859afabaf05d3b5ffaecd635e24b7cb Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:49:10 +0800 Subject: [PATCH 25/30] review: guard against double-start in onStartCommand MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scope.launch { runBenchmark(...) } was unconditional, so a re-delivered intent (another `am start` before stopSelf() completes) would spawn a second coroutine running on the same single-threaded executor and writing to the same benchmark_results.json — both timings and the output JSON would be corrupted. Add a `benchmarkStarted` volatile flag that's set on first entry. Any later onStartCommand call returns immediately with a logcat warning, keeping the in-flight run intact. 
START_NOT_STICKY makes this unlikely in practice, but the right belt-and-braces fix is cheap and removes a race even on edge cases (e.g. process recreated after low-memory kill before stopSelf returned). Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/example/app/BenchmarkForegroundService.kt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index ddc0208..566728c 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -85,6 +85,11 @@ class BenchmarkForegroundService : Service() { private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default) private val executor = Executors.newSingleThreadExecutor() private var wakeLock: PowerManager.WakeLock? = null + // Set once when the first onStartCommand fires runBenchmark. Subsequent + // intent re-deliveries (e.g. another `am start` before stopSelf() runs) + // see this true and are no-ops, so we never end up with two concurrent + // coroutines sharing the executor and the same output JSON. + @Volatile private var benchmarkStarted = false override fun onBind(intent: Intent?): IBinder? = null @@ -123,6 +128,16 @@ class BenchmarkForegroundService : Service() { Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired") } + // Reject re-deliveries before the benchmark coroutine completes. A + // second am start while the first is in flight would otherwise spawn + // a parallel coroutine and clobber the shared RagPipeline / output + // JSON.
+ if (benchmarkStarted) { + Log.w(BENCH_TAG, "[BENCHMARK] WARNING: ignoring re-delivery; benchmark is already running.") + return START_NOT_STICKY + } + benchmarkStarted = true + val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS val skipRetrieval = intent?.getBooleanExtra("skip_retrieval", false) ?: false From 7c2360feb6127ceab01edd0e0dc302aa963ef581 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:49:34 +0800 Subject: [PATCH 26/30] review: use BENCH_TAG for query-failure log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The catch handler in runQuery() was the only [BENCHMARK] log line in the file using TAG="mam-ai" instead of BENCH_TAG="mam-ai-bench". This made `adb logcat -s mam-ai-bench:E` filter out exactly the messages most worth surfacing — query-level errors. One-line fix. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/kotlin/com/example/app/BenchmarkForegroundService.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index 566728c..9fde78d 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -460,7 +460,7 @@ class BenchmarkForegroundService : Service() { } } catch (e: Exception) { error = e.message - Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e) + Log.e(BENCH_TAG, "[BENCHMARK] Query failed: ${e.message}", e) } val qEnd = System.currentTimeMillis() From 8cb9712b54a9b292cab6616a056c723cab656d50 Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:49:54 +0800 Subject: [PATCH 27/30] review: log skipped JSON files in aggregate_k_sweep load_runs() was silently dropping JSONs that didn't match the 54-run canonical-sweep shape (missing config/results keys, or < 30 results which is the smoke-test guard). For users running a narrow sweep (e.g. --filter long_01 --repeats 3 yields 3 results), the file would silently never appear in the report with no indication why. Log SKIP lines to stderr with the file name and reason. The output is still clean for normal runs (only emits when something is actually dropped). Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 229c3f1..24329bc 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -60,9 +60,19 @@ def load_runs() -> list[dict]: except (json.JSONDecodeError, OSError): continue if "config" not in d or "results" not in d: + print(f"SKIP: {os.path.basename(f)} — missing config or results key", file=sys.stderr) continue if len(d["results"]) < 30: - continue # skip ad-hoc smoke tests; the canonical sweep is 54 runs + # Skip ad-hoc smoke tests (the canonical sweep is 54 runs). Log so + # that a legitimate narrow sweep (--filter long_01, single-category) + # isn't silently dropped from the report. + print( + f"SKIP: {os.path.basename(f)} — {len(d['results'])} results " + "(< 30 threshold for canonical sweeps; pass it through if it " + "should appear in the matrix)", + file=sys.stderr, + ) + continue ts = os.path.basename(f).replace("benchmark_", "").split(".")[0].split("_")[0] k_override = d["config"].get("retrieval_top_k_override") skip_retrieval = d["config"].get("skip_retrieval", False) From f971145b9e15771f2f8cea12d3d13641013111dc Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:50:39 +0800 Subject: [PATCH 28/30] review: derive Methodology text from sample run's config, not hardcoded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Methodology paragraph hardcoded "18 queries × … 10-second cooldown" even though the JSONs carry the actual config.repeats, config.cooldown_ms, and a results count that proves the math. If a future run uses different defaults (or this script is pointed at a different sweep), the methodology text would silently lie. 
Now reads from the sample run's config: pulls repeats, cooldown_ms, results count; infers (queries × modes) from results / repeats. Output for the current data set is unchanged ("18 (query × mode) cells × 3 repeats = 54 timed runs … 10-second cooldown") because that's what the JSONs actually say. Per Copilot review on PR #57. Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 28 ++++++++++++++++++++----- evaluation/reports/latency_report_v2.md | 7 ++----- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 24329bc..4c6090b 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -186,16 +186,34 @@ def write_report(runs: list[dict], out_path: Path) -> None: md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") md.append("") + # Pull the actual values from the sample run's config instead of hard-coding + # text that can lie. If different runs used different settings, this won't + # catch that — but we'd rather report the sample's truth than fabricate a + # round-number claim. + sample_cfg = sample["data"].get("config", {}) + sample_repeats = sample_cfg.get("repeats", "?") + sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 + sample_n_results = len(sample["data"]["results"]) + # Infer queries × modes from total runs / repeats. Default to "?" if the + # math doesn't divide evenly. + queries_x_modes: object = "?" + if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: + queries_x_modes = sample_n_results // sample_repeats md.append("## Methodology\n") - md.append("Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. ") - md.append("Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 
10-second cooldown between runs ") - md.append("for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives ") - md.append("screen-off and device-lock; OPPO Hans whitelist set manually.") + md.append( + f"Per backend × k configuration: {queries_x_modes} (query × mode) cells " + f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " + f"No-RAG baseline per backend (k=0 via `--no-retrieval`). " + f"{sample_cooldown_s:g}-second cooldown between runs for thermal " + "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so " + "the run survives screen-off and device-lock; OPPO Hans whitelist set " + "manually." + ) md.append("") md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") md.append("- `decode` is first-token to last-token.") md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") - md.append("- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).") + md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") md.append("") # ─────────── Headline table: total_query_ms by (backend, k) ─────────── diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md index 3edee07..80e6348 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — GPU vs CPU -_Generated: 2026-05-15T10:22:29_ +_Generated: 2026-05-15T10:50:22_ ## Device & stack @@ -13,10 +13,7 @@ _Generated: 2026-05-15T10:22:29_ ## Methodology -Per backend × k configuration: 18 queries × 1 mode (RAG-only) × 3 repeats = 54 timed runs. -Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs -for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives -screen-off and device-lock; OPPO Hans whitelist set manually. 
+Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. - `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token. - `decode` is first-token to last-token. From 5fa5c6eb799e169e2c94ccb75267860e078cc34a Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:51:18 +0800 Subject: [PATCH 29/30] review: use statistics.quantiles for p95 instead of int(n*0.95) index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous formula `int(len(s) * 0.95)` has two problems. First, it collapses to max for any sample size n ≤ 20: for a hypothetical narrow sample with n=3 (e.g. single-query small sweep), int(2.85) = 2 = the max, so p95 == max by construction. Second, even for larger samples it picks a single order statistic without interpolating — e.g. for the per-category short bucket with 24 runs, int(24*0.95) = 22 (the 23rd of 24 sorted values), which is close to but not actually the 95th percentile. Centralise the calculation in a `_p95(values)` helper that uses `statistics.quantiles(values, n=20, method="exclusive")[18]` — the linear-interpolation 95th percentile from a 20-quantile partition. Falls back to max only when n < 2 (genuinely no quantiles to compute). Per Copilot review on PR #57.
Co-Authored-By: Claude Opus 4.7 (1M context) --- evaluation/aggregate_k_sweep.py | 23 +++++++++++++++++++---- evaluation/reports/latency_report_v2.md | 8 ++++---- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py index 4c6090b..d11e390 100644 --- a/evaluation/aggregate_k_sweep.py +++ b/evaluation/aggregate_k_sweep.py @@ -105,6 +105,23 @@ def load_runs() -> list[dict]: return runs +def _p95(values: list[float]) -> int | None: + """95th percentile via linear-interpolation 20-quantile partition. + + `statistics.quantiles(data, n=20)` returns 19 cut points dividing the + data into 20 equal-frequency groups; index 18 is the 95th percentile. + For very small samples (n < 2), there are no cut points to compute, + so we fall back to max — same behaviour as the previous + `int(len(s)*0.95)` formula but without the off-by-one that made p95 + collapse to max for any n < 20. + """ + if not values: + return None + if len(values) < 2: + return int(values[0]) + return int(statistics.quantiles(values, n=20, method="exclusive")[18]) + + def aggregate_per_category(d: dict, key: str) -> dict[str, dict]: """Per-category {median, p95, n} for the given timing field.""" cat_vals: dict[str, list] = defaultdict(list) @@ -116,11 +133,10 @@ def aggregate_per_category(d: dict, key: str) -> dict[str, dict]: for c, vs in cat_vals.items(): if not vs: continue - s = sorted(vs) out[c] = { "n": len(vs), "median": int(statistics.median(vs)), - "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]), + "p95": _p95(vs), } return out @@ -129,11 +145,10 @@ def aggregate_overall(d: dict, key: str) -> dict: vs = [r[key] for r in d["results"] if not r.get("error")] if not vs: return {} - s = sorted(vs) return { "n": len(vs), "median": int(statistics.median(vs)), - "p95": int(s[min(len(s) - 1, int(len(s) * 0.95))]), + "p95": _p95(vs), } diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md 
index 80e6348..c6745a6 100644 --- a/evaluation/reports/latency_report_v2.md +++ b/evaluation/reports/latency_report_v2.md @@ -1,6 +1,6 @@ # MAM-AI On-Device Latency Sweep — GPU vs CPU -_Generated: 2026-05-15T10:50:22_ +_Generated: 2026-05-15T10:51:06_ ## Device & stack @@ -68,11 +68,11 @@ the model writing *longer answers* when given more context (more material to dra |---:|---:|---:| | **0 (no-RAG)** | 26.1 | 38.4 | | 1 | 26.1 | 37.1 | -| 3 | 30.2 | 64.3 | +| 3 | 30.3 | 64.3 | | 5 | 30.7 | 74.6 | -| 7 | 35.1 | 81.7 | +| 7 | 35.1 | 81.8 | | 10 | 29.0 | 84.5 | -| 15 | 30.6 | 112.6 | +| 15 | 30.6 | 112.7 | | 20 | 35.3 | 104.9 | ## Errors and the 4096-token context wall From 9ecc54eaca29329da98d46bcb00c6706fc18419d Mon Sep 17 00:00:00 2001 From: nmrenyi Date: Fri, 15 May 2026 10:51:45 +0800 Subject: [PATCH 30/30] review: use STOP_FOREGROUND_REMOVE overload on API 24+ stopForeground(boolean) has been deprecated since Android 13. Replace with the SDK-aware form: STOP_FOREGROUND_REMOVE on API 24+ (where the int overload was introduced), fall back to the boolean variant only on older devices (where it isn't deprecated). Drops the @Suppress("DEPRECATION") on the modern path; we still suppress on the legacy path because the boolean variant *is* the non-deprecated API there. Per Copilot review on PR #57. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../com/example/app/BenchmarkForegroundService.kt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt index 9fde78d..e1ee93c 100644 --- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -186,8 +186,14 @@ class BenchmarkForegroundService : Service() { } catch (_: InterruptedException) { Thread.currentThread().interrupt() } - @Suppress("DEPRECATION") - stopForeground(true) + // Use the non-deprecated overload on API 24+ (where it was introduced). + // The boolean variant has been deprecated since Android 13. + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.N) { + stopForeground(STOP_FOREGROUND_REMOVE) + } else { + @Suppress("DEPRECATION") + stopForeground(true) + } } // ── Notification plumbing ────────────────────────────────────────────