diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml
index a44e8c2..d19101a 100644
--- a/app/android/app/src/main/AndroidManifest.xml
+++ b/app/android/app/src/main/AndroidManifest.xml
@@ -4,6 +4,12 @@
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE"/>
     <!-- Required on Android 14+ for network-data foreground services -->
     <uses-permission android:name="android.permission.FOREGROUND_SERVICE_DATA_SYNC"/>
+    <!-- BenchmarkForegroundService acquires a PARTIAL_WAKE_LOCK so the CPU
+         keeps running when the screen is off or locked. Without this,
+         multi-hour benchmarks stall silently when the device idles.
+         (BenchmarkActivity is now a thin launcher that just starts the
+         service; the wake lock lives in the service.) -->
+    <uses-permission android:name="android.permission.WAKE_LOCK"/>
 
     <application
         android:label="MAM-AI"
@@ -45,6 +51,26 @@
             android:foregroundServiceType="dataSync"
             android:exported="false" />
 
+        <!-- Foreground service for the on-device latency benchmark. Holds a
+             PARTIAL_WAKE_LOCK + sticky notification so the work survives
+             screen-off and device-lock through hours-long k-sweeps. Runs
+             in its own :benchmark process to keep the main app isolated.
+             DEV-ONLY: foregroundServiceType="dataSync" is technically a
+             misuse here (no actual data sync) — Google Play would reject
+             this declaration. Acceptable because BenchmarkForegroundService
+             is launched only via `adb shell am start` for in-house
+             benchmarking; it never appears in any user-facing flow and
+             this manifest entry should be stripped from any Play Store
+             build. If we ever need to ship benchmark capabilities, switch
+             to foregroundServiceType="specialUse" and add the corresponding
+             android.permission.FOREGROUND_SERVICE_SPECIAL_USE permission
+             plus the PROPERTY_SPECIAL_USE_FGS_SUBTYPE property. -->
+        <service
+            android:name=".BenchmarkForegroundService"
+            android:foregroundServiceType="dataSync"
+            android:exported="false"
+            android:process=":benchmark" />
+
         <!-- FileProvider for sharing PDF files from getExternalFilesDir with viewer apps -->
         <provider
             android:name="androidx.core.content.FileProvider"
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
index bce61ea..f94cc1b 100644
--- a/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkActivity.kt
@@ -1,389 +1,69 @@
 package com.example.app
 
 import android.app.Activity
+import android.content.Intent
 import android.os.Build
 import android.os.Bundle
 import android.util.Log
-import android.widget.LinearLayout
-import android.widget.ScrollView
-import android.widget.TextView
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.SupervisorJob
-import kotlinx.coroutines.asCoroutineDispatcher
-import kotlinx.coroutines.launch
-import kotlinx.coroutines.withContext
-import org.json.JSONArray
-import org.json.JSONObject
-import java.io.File
-import java.text.SimpleDateFormat
-import java.util.Date
-import java.util.Locale
-import java.util.concurrent.Executors
 
 /**
- * Benchmark activity that runs predefined queries through [RagPipeline]
- * and writes structured timing results to a JSON file on device storage.
+ * Thin launcher for [BenchmarkForegroundService].
+ *
+ * All benchmark logic lives in the service so it survives screen-off and
+ * device-lock — vendor power managers (OPPO, Xiaomi, Huawei) will idle a
+ * plain Activity but respect a foreground service with a sticky
+ * notification.
+ *
+ * Launch via ADB exactly as before — the Activity forwards all extras
+ * straight to the service, then finishes immediately:
  *
- * Launch via ADB:
  *   adb shell am start -n com.example.app/.BenchmarkActivity \
  *       --ei repeats 3 --el cooldown_ms 5000
  *
  * Optional extras:
  *   --ez skip_retrieval true     Skip RAG retrieval (generation only)
+ *   --ez rag_only true           Skip the No-RAG mode (k-sweep helper).
+ *                                Mutually exclusive with skip_retrieval —
+ *                                if both are set, skip_retrieval wins.
  *   --es query_filter short      Filter by category or specific query ID
+ *   --ei retrieve_k N            Override retrieval top_k for this session.
+ *                                Pass any value >= 0 to override; pass -1
+ *                                (or omit) to use runtime_config.json's
+ *                                value. The activity forwards it unchanged;
+ *                                the service maps negative values to null.
  */
 class BenchmarkActivity : Activity() {
 
     companion object {
-        private const val TAG = "mam-ai"
         private const val BENCH_TAG = "mam-ai-bench"
-        private const val DEFAULT_COOLDOWN_MS = 5_000L
-        private const val DEFAULT_REPEATS = 3
-        private const val CHARS_PER_TOKEN_ESTIMATE = 4.0
     }
 
-    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Main)
-    private val executor = Executors.newSingleThreadExecutor()
-    private lateinit var logView: TextView
-    private lateinit var scrollView: ScrollView
-
     override fun onCreate(savedInstanceState: Bundle?) {
         super.onCreate(savedInstanceState)
 
-        // Scrollable log console UI
-        scrollView = ScrollView(this).apply {
-            setBackgroundColor(0xFF000000.toInt())
-        }
-        logView = TextView(this).apply {
-            setTextColor(0xFF00FF00.toInt())
-            textSize = 13f
-            setPadding(32, 48, 32, 48)
-            text = "=== MAM-AI Benchmark ===\n"
-        }
-        scrollView.addView(logView)
-        setContentView(scrollView)
-
-        val repeats = intent.getIntExtra("repeats", DEFAULT_REPEATS)
-        val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS)
-        val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false)
-        val queryFilter = intent.getStringExtra("query_filter")
-
-        scope.launch {
-            try {
-                runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter)
-            } catch (t: Throwable) {
-                Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t)
-                Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
-                logStatus("FAILED: ${t.message}")
-            } finally {
-                finish()
-            }
+        val serviceIntent = Intent(this, BenchmarkForegroundService::class.java).apply {
+            // Copy only the extras present on the launching intent; absent ones
+            // stay unset so the service falls back to its own defaults.
+            if (intent.hasExtra("repeats"))
+                putExtra("repeats", intent.getIntExtra("repeats", 3))
+            if (intent.hasExtra("cooldown_ms"))
+                putExtra("cooldown_ms", intent.getLongExtra("cooldown_ms", 5000L))
+            if (intent.hasExtra("skip_retrieval"))
+                putExtra("skip_retrieval", intent.getBooleanExtra("skip_retrieval", false))
+            if (intent.hasExtra("rag_only"))
+                putExtra("rag_only", intent.getBooleanExtra("rag_only", false))
+            if (intent.hasExtra("query_filter"))
+                putExtra("query_filter", intent.getStringExtra("query_filter"))
+            if (intent.hasExtra("retrieve_k"))
+                putExtra("retrieve_k", intent.getIntExtra("retrieve_k", -1)) // forwarded as-is; the service treats negatives as "no override"
+        }
-    }
-
-    private fun logStatus(text: String) {
-        runOnUiThread {
-            logView.append(text + "\n")
-            scrollView.post { scrollView.fullScroll(ScrollView.FOCUS_DOWN) }
-        }
-    }
-
-    // ── Main benchmark loop ──────────────────────────────────────────────
-
-    private suspend fun runBenchmark(
-        repeats: Int,
-        cooldownMs: Long,
-        skipRetrieval: Boolean,
-        queryFilter: String?,
-    ) {
-        val benchmarkStart = System.currentTimeMillis()
-        val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date())
-
-        Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter")
-
-        // Device info
-        val deviceInfo = collectDeviceInfo()
-        Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})")
 
-        // Step 1: Gecko + SQLite init (synchronous part of RagPipeline constructor)
-        logStatus("Step 1/4: Initializing Gecko embedder + SQLite...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...")
-        val initStart = System.currentTimeMillis()
-        val pipeline = withContext(executor.asCoroutineDispatcher()) {
-            RagPipeline(application)
-        }
-        val syncInitMs = System.currentTimeMillis() - initStart
-        Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms")
-        logStatus("Step 1/4: Gecko + SQLite done (${syncInitMs}ms)")
-
-        // Step 2: Wait for LLM model load (async, started by RagPipeline constructor)
-        logStatus("Step 2/4: Loading Gemma 4 LLM model...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...")
-        val llmWaitStart = System.currentTimeMillis()
-        withContext(executor.asCoroutineDispatcher()) {
-            pipeline.awaitLlmReady()
-        }
-        val llmInitMs = System.currentTimeMillis() - llmWaitStart
-        Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)")
-        logStatus("Step 2/4: LLM loaded (${llmInitMs}ms)")
-
-        // Step 3: 5 warmup queries of varying length — warms JIT / LiteRT-LM / shader caches
-        val warmupQueries = listOf(
-            "Normal fetal heart rate",
-            "Signs of infection after delivery",
-            "A mother has heavy bleeding after birth. What should I do first?",
-            "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?",
-            "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?",
-        )
-        logStatus("Step 3/4: Running ${warmupQueries.size} warmup queries...")
-        Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...")
-        val warmupStart = System.currentTimeMillis()
-        warmupQueries.forEachIndexed { i, prompt ->
-            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"")
-            withContext(executor.asCoroutineDispatcher()) {
-                pipeline.generateResponse(
-                    prompt = prompt,
-                    history = emptyList(),
-                    useRetrieval = false,
-                    retrievalListener = {},
-                    generationListener = { _, _ -> }
-                )
-            }
-            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)")
-        }
-        val warmupMs = System.currentTimeMillis() - warmupStart
-        val totalInitMs = System.currentTimeMillis() - initStart
-        Log.w(BENCH_TAG, "[BENCHMARK] Warmup complete: ${warmupMs}ms total (${warmupQueries.size} queries)")
-        Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
-
-        val postInitMemory = collectMemoryInfo()
-
-        // Step 4: Cooldown before timed runs
-        logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
-        logStatus("Cooldown ${cooldownMs}ms...")
-        Thread.sleep(cooldownMs)
-
-        // Filter queries
-        val queries = if (queryFilter != null) {
-            BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter }
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
+            startForegroundService(serviceIntent)
         } else {
-            BenchmarkQueries.ALL
-        }
-
-        if (queries.isEmpty()) {
-            Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'")
-            Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
-            return
+            startService(serviceIntent)
         }
-
-        val retrievalModes = if (skipRetrieval) listOf(false) else listOf(true, false)
-        val totalRuns = queries.size * retrievalModes.size * repeats
-        Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs")
-
-        // Execution loop
-        val results = mutableListOf<JSONObject>()
-        var runIndex = 0
-        val loopStart = System.currentTimeMillis()
-
-        for (query in queries) {
-            for (useRetrieval in retrievalModes) {
-                for (rep in 1..repeats) {
-                    runIndex++
-
-                    // Estimate time remaining based on average time per completed run
-                    val etaStr = if (runIndex > 1) {
-                        val elapsedMs = System.currentTimeMillis() - loopStart
-                        val avgPerRun = elapsedMs.toDouble() / (runIndex - 1)
-                        val remainingMs = (avgPerRun * (totalRuns - runIndex + 1)).toLong()
-                        val remainMin = remainingMs / 60000
-                        val remainSec = (remainingMs % 60000) / 1000
-                        "ETA: ${remainMin}m ${remainSec}s"
-                    } else "ETA: calculating..."
-
-                    Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats")
-                    logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr")
-
-                    val preMemory = collectMemoryInfo()
-                    val result = runQuery(pipeline, query.text, useRetrieval)
-                    val postMemory = collectMemoryInfo()
-
-                    val decodeTps = if (result.decodeMs > 0)
-                        round2(result.estimatedTokens / (result.decodeMs / 1000.0))
-                    else 0.0
-
-                    val entry = JSONObject().apply {
-                        put("query_id", query.id)
-                        put("category", query.category)
-                        put("query_text", query.text)
-                        put("query_word_count", query.wordCount)
-                        put("use_retrieval", useRetrieval)
-                        put("repetition", rep)
-                        put("retrieval_time_ms", result.retrievalTimeMs)
-                        put("ttft_ms", result.ttftMs)
-                        put("prefill_ms", result.prefillMs)
-                        put("decode_ms", result.decodeMs)
-                        put("total_generation_ms", result.generationTotalMs)
-                        put("total_query_ms", result.totalQueryMs)
-                        put("response_length_chars", result.responseChars)
-                        put("estimated_tokens", result.estimatedTokens)
-                        put("decode_throughput_tps", decodeTps)
-                        put("num_retrieved_docs", result.numRetrievedDocs)
-                        put("error", result.error ?: JSONObject.NULL)
-                        put("heap_before_mb", preMemory.getInt("used_mb"))
-                        put("heap_after_mb", postMemory.getInt("used_mb"))
-                    }
-                    results.add(entry)
-
-                    val resultLine = "  -> ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms tps=$decodeTps"
-                    Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps")
-                    logStatus(resultLine)
-
-                    val pct = (runIndex * 100) / totalRuns
-                    val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000
-                    logStatus("  [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)")
-
-                    // Cooldown between queries (skip after last run)
-                    if (runIndex < totalRuns) {
-                        Thread.sleep(cooldownMs)
-                    }
-                }
-            }
-        }
-
-        // Assemble output JSON
-        val output = JSONObject().apply {
-            put("benchmark_version", 1)
-            put("timestamp", timestamp)
-            put("device", deviceInfo)
-            put("config", JSONObject().apply {
-                put("repeats", repeats)
-                put("cooldown_ms", cooldownMs)
-                put("skip_retrieval", skipRetrieval)
-                put("query_filter", queryFilter ?: JSONObject.NULL)
-                put("model", "gemma-4-E4B-it.litertlm")
-                put("backend", "CPU")
-                put("max_tokens", 32000)
-                put("temperature", 1.0)
-                put("top_p", 0.95)
-                put("top_k", 64)
-            })
-            put("init", JSONObject().apply {
-                put("gecko_sqlite_ms", syncInitMs)
-                put("llm_load_ms", llmInitMs)
-                put("warmup_query_ms", warmupMs)
-                put("total_init_ms", totalInitMs)
-            })
-            put("memory", postInitMemory)
-            put("results", JSONArray(results))
-            put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart)
-        }
-
-        // Write to file
-        val outFile = File(getExternalFilesDir(null), "benchmark_results.json")
-        outFile.writeText(output.toString(2))
-        Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}")
-        Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE")
-        logStatus("COMPLETE\nResults written to:\n${outFile.absolutePath}")
-    }
-
-    // ── Single query execution ───────────────────────────────────────────
-
-    private data class QueryResult(
-        val retrievalTimeMs: Long,
-        val ttftMs: Long,
-        val prefillMs: Long,
-        val decodeMs: Long,
-        val generationTotalMs: Long,
-        val totalQueryMs: Long,
-        val responseChars: Int,
-        val estimatedTokens: Int,
-        val numRetrievedDocs: Int,
-        val error: String?,
-    )
-
-    private suspend fun runQuery(pipeline: RagPipeline, queryText: String, useRetrieval: Boolean): QueryResult {
-        var retrievalTimeMs = 0L
-        var numDocs = 0
-        var firstTokenTime = 0L
-        var error: String? = null
-        val responseBuilder = StringBuilder()
-
-        val qStart = System.currentTimeMillis()
-        var retrievalDoneTime = 0L
-
-        try {
-            withContext(executor.asCoroutineDispatcher()) {
-                pipeline.generateResponse(
-                    prompt = queryText,
-                    history = emptyList(),
-                    useRetrieval = useRetrieval,
-                    retrievalListener = { docs ->
-                        retrievalDoneTime = System.currentTimeMillis()
-                        retrievalTimeMs = retrievalDoneTime - qStart
-                        numDocs = docs.size
-                    },
-                    generationListener = { partial, _ ->
-                        responseBuilder.append(partial)
-                        if (firstTokenTime == 0L && partial.isNotEmpty()) {
-                            firstTokenTime = System.currentTimeMillis()
-                        }
-                    }
-                )
-            }
-        } catch (e: Exception) {
-            error = e.message
-            Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e)
-        }
-
-        val qEnd = System.currentTimeMillis()
-        val totalQueryMs = qEnd - qStart
-        val responseChars = responseBuilder.length
-
-        // Generation timing — measure from after retrieval (or query start if no retrieval)
-        val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart
-        val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0
-        val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0
-        val generationTotalMs = qEnd - genStart
-        val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt()
-
-        return QueryResult(
-            retrievalTimeMs = retrievalTimeMs,
-            ttftMs = ttftMs,
-            prefillMs = ttftMs,
-            decodeMs = decodeMs,
-            generationTotalMs = generationTotalMs,
-            totalQueryMs = totalQueryMs,
-            responseChars = responseChars,
-            estimatedTokens = estimatedTokens,
-            numRetrievedDocs = numDocs,
-            error = error,
-        )
+        Log.w(BENCH_TAG, "[BENCHMARK] BenchmarkActivity → forwarded extras to BenchmarkForegroundService, finishing.")
+        finish()
     }
-
-    // ── Helpers ──────────────────────────────────────────────────────────
-
-    private fun collectDeviceInfo(): JSONObject = JSONObject().apply {
-        put("manufacturer", Build.MANUFACTURER)
-        put("model", Build.MODEL)
-        put("device", Build.DEVICE)
-        put("hardware", Build.HARDWARE)
-        put("board", Build.BOARD)
-        put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown")
-        put("android_version", Build.VERSION.RELEASE)
-        put("sdk_int", Build.VERSION.SDK_INT)
-        put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown")
-    }
-
-    private fun collectMemoryInfo(): JSONObject {
-        val rt = Runtime.getRuntime()
-        return JSONObject().apply {
-            put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024)
-            put("free_mb", rt.freeMemory() / 1024 / 1024)
-            put("total_mb", rt.totalMemory() / 1024 / 1024)
-            put("max_mb", rt.maxMemory() / 1024 / 1024)
-        }
-    }
-
-    private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0
 }
diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
new file mode 100644
index 0000000..e1ee93c
--- /dev/null
+++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt
@@ -0,0 +1,564 @@
+package com.example.app
+
+import android.app.Notification
+import android.app.NotificationChannel
+import android.app.NotificationManager
+import android.app.Service
+import android.content.Context
+import android.content.Intent
+import android.content.pm.ServiceInfo
+import android.os.Build
+import android.os.IBinder
+import android.os.PowerManager
+import android.util.Log
+import androidx.core.app.NotificationCompat
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.SupervisorJob
+import kotlinx.coroutines.asCoroutineDispatcher
+import kotlinx.coroutines.cancel
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.launch
+import kotlinx.coroutines.withContext
+import org.json.JSONArray
+import org.json.JSONObject
+import java.io.File
+import java.text.SimpleDateFormat
+import java.util.Date
+import java.util.Locale
+import java.util.concurrent.Executors
+
+/**
+ * Foreground service that runs the on-device latency benchmark.
+ *
+ * The service holds a PARTIAL_WAKE_LOCK and posts a sticky notification so
+ * the OS keeps the process alive — unlike a plain Activity, which the
+ * vendor power manager (e.g. OPPO's OplusProxyWakeLock) will idle as soon
+ * as the screen sleeps. This lets multi-hour k-sweeps run while the
+ * device is locked or the screen is off.
+ *
+ * Launched via [BenchmarkActivity] which forwards Intent extras from `am
+ * start`. All benchmark logic lives here; the Activity is a thin shim.
+ *
+ * **Process model.** Both this service and [BenchmarkActivity] declare
+ * `android:process=":benchmark"` in the manifest, so they run in a
+ * separate process from the main MAM-AI app. That process is fresh on
+ * each `am start`: this service constructs its own [RagPipeline]
+ * (Gecko + SQLite + LLM load) on entry, independent of any pipeline
+ * already loaded in the main app process. Two consequences worth
+ * knowing about:
+ *
+ *  1. The application's `Application` subclass initializes once per
+ *     process — anything in your custom Application.onCreate() will
+ *     run a second time when the benchmark process spawns.
+ *  2. If the main app is also running with the LLM loaded, two LLM
+ *     instances may briefly contend for GPU/memory during init.
+ *
+ * Intent extras (forwarded from the Activity):
+ *   repeats:Int                Repetitions per query (default 3)
+ *   cooldown_ms:Long           Sleep between runs in ms (default 5000)
+ *   skip_retrieval:Boolean     Run No-RAG mode only
+ *   rag_only:Boolean           Run RAG mode only
+ *                              (skip_retrieval and rag_only are mutually
+ *                              exclusive; skip_retrieval wins if both set)
+ *   query_filter:String?       Category or specific query ID filter
+ *   retrieve_k:Int             Override retrieval top_k for this session.
+ *                              Pass -1 (or omit) to use the value from
+ *                              runtime_config.json. Any value >= 0 takes
+ *                              effect for every query in this run.
+ */
+class BenchmarkForegroundService : Service() {
+
+    companion object {
+        private const val TAG = "mam-ai" // tag used for Log.e error output
+        private const val BENCH_TAG = "mam-ai-bench" // tag used for the [BENCHMARK] progress lines
+        private const val NOTIFICATION_ID = 1002 // id shared by startForeground() and NotificationManager.notify()
+        const val CHANNEL_ID = "mam_ai_benchmark" // NOTE(review): presumably the notification channel id; its consumers are outside this view
+        private const val DEFAULT_COOLDOWN_MS = 5_000L // used when the "cooldown_ms" extra is absent
+        private const val DEFAULT_REPEATS = 3 // used when the "repeats" extra is absent
+        private const val CHARS_PER_TOKEN_ESTIMATE = 4.0 // NOTE(review): presumably a chars-per-token heuristic; usage not visible here
+    }
+
+    // Dispatchers.Default keeps the long-running benchmark coroutine off the
+    // main thread; the scope is cancelled in onDestroy(), not by any activity
+    // lifecycle event.
+    private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default)
+    private val executor = Executors.newSingleThreadExecutor() // shut down in onDestroy()
+    private var wakeLock: PowerManager.WakeLock? = null // acquired in onStartCommand(), released in onDestroy()
+    // Set when the first onStartCommand() launches runBenchmark(). Later
+    // deliveries (e.g. another `am start` before stopSelf() runs) see it true
+    // and skip launching a second run — they only re-post the "starting…"
+    // notification — so two coroutines never share the executor or output JSON.
+    @Volatile private var benchmarkStarted = false
+
+    override fun onBind(intent: Intent?): IBinder? = null // started-only service: no binder is exposed
+
+    override fun onCreate() {
+        super.onCreate()
+        ensureChannel(this) // NOTE(review): helper defined outside this view; presumably registers CHANNEL_ID before the first notification is posted
+    }
+
+    // Entry point for every start request. In order: enter the foreground
+    // state, take the wake lock, drop the request if a run is already in
+    // flight, otherwise parse the extras and launch runBenchmark() on the
+    // service scope. The service stops itself once that coroutine ends,
+    // whether it completed or threw.
+    override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int {
+        // Step 1: enter the foreground state before doing anything else, so the
+        // wake lock taken below is always accompanied by a visible
+        // notification. Taking the lock in onCreate instead would hold the
+        // CPU awake with no notification for a moment, and would leak the
+        // lock if onStartCommand were never delivered.
+        startForegroundCompat("MAM-AI benchmark starting…", -1, 0)
+
+        // Step 2: take the PARTIAL_WAKE_LOCK (once per service instance) so
+        // the CPU keeps running while the screen is off. Vendor power
+        // managers (OPPO ColorOS, Xiaomi MIUI, etc.) honour locks held by a
+        // foreground service but aggressively release those held by
+        // background activities.
+        if (wakeLock == null) {
+            val pm = getSystemService(Context.POWER_SERVICE) as PowerManager
+            val lock = pm.newWakeLock(PowerManager.PARTIAL_WAKE_LOCK, "mam-ai:benchmark")
+            lock.setReferenceCounted(false)
+            // The 24 h timeout is a failsafe only. Full CPU sweeps have taken
+            // about 7 h end-to-end, so this leaves ample headroom; for runs
+            // beyond 24 h, re-acquire periodically rather than raising it.
+            lock.acquire(24L * 60L * 60L * 1000L)
+            wakeLock = lock
+            Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired")
+        }
+
+        // Step 3: only the first delivery launches a run. A second `am start`
+        // arriving mid-run would otherwise start a parallel coroutine on the
+        // same executor and overwrite the output JSON.
+        if (benchmarkStarted) {
+            Log.w(BENCH_TAG, "[BENCHMARK] WARNING: ignoring re-delivery; benchmark is already running.")
+            return START_NOT_STICKY
+        }
+        benchmarkStarted = true
+
+        // Step 4: read the run parameters; a null intent yields all defaults.
+        val runRepeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS
+        val pauseMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS
+        val noRag = intent?.getBooleanExtra("skip_retrieval", false) == true
+        val ragOnlyMode = intent?.getBooleanExtra("rag_only", false) == true
+        val filter = intent?.getStringExtra("query_filter")
+        val kOverride: Int? = (intent?.getIntExtra("retrieve_k", -1) ?: -1).takeIf { it >= 0 }
+
+        scope.launch {
+            try {
+                runBenchmark(runRepeats, pauseMs, noRag, ragOnlyMode, filter, kOverride)
+            } catch (failure: Throwable) {
+                Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${failure.message}", failure)
+                Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
+            } finally {
+                stopSelf()
+            }
+        }
+        // One-shot job: START_NOT_STICKY stops the system from restarting the
+        // service after a kill, since resuming halfway would corrupt the run.
+        return START_NOT_STICKY
+    }
+
+    // Tears everything down in order: wake lock, coroutine scope, pipeline
+    // executor, and finally the foreground notification. Reached after the
+    // benchmark coroutine calls stopSelf() or when the service is stopped.
+    override fun onDestroy() {
+        super.onDestroy()
+        // Give the wake lock back if it is still held, then forget it.
+        val heldLock = wakeLock?.takeIf { it.isHeld }
+        if (heldLock != null) {
+            heldLock.release()
+            Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK")
+        }
+        wakeLock = null
+        scope.cancel()
+        // Cancelling the scope does not reach into a blocking native call
+        // (e.g. a LiteRT-LM generation in progress) on the pipeline worker
+        // thread, and shutdown() alone would let that call run to completion,
+        // keeping the :benchmark process alive. shutdownNow() interrupts the
+        // worker instead.
+        executor.shutdownNow()
+        // Best effort: allow the worker up to 2 s to wind down. If it has not
+        // finished by then we carry on regardless.
+        try {
+            executor.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS)
+        } catch (_: InterruptedException) {
+            Thread.currentThread().interrupt()
+        }
+        // stopForeground(int) exists from API 24; the boolean overload it
+        // replaces is deprecated as of Android 13, hence the suppression.
+        if (Build.VERSION.SDK_INT < Build.VERSION_CODES.N) {
+            @Suppress("DEPRECATION")
+            stopForeground(true)
+        } else {
+            stopForeground(STOP_FOREGROUND_REMOVE)
+        }
+    }
+
+    // ── Notification plumbing ────────────────────────────────────────────
+
+    /**
+     * Promotes the service to the foreground with a benchmark notification.
+     * On Android 14+ (API 34) the `dataSync` service type is passed explicitly.
+     */
+    private fun startForegroundCompat(message: String, progress: Int, max: Int) {
+        val ongoing = buildNotification(this, message, progress, max)
+        if (Build.VERSION.SDK_INT < Build.VERSION_CODES.UPSIDE_DOWN_CAKE) {
+            startForeground(NOTIFICATION_ID, ongoing)
+            return
+        }
+        startForeground(NOTIFICATION_ID, ongoing, ServiceInfo.FOREGROUND_SERVICE_TYPE_DATA_SYNC)
+    }
+
+    /** Re-posts the ongoing notification under NOTIFICATION_ID; does nothing if no NotificationManager exists. */
+    private fun updateNotification(message: String, progress: Int, max: Int) {
+        getSystemService(NotificationManager::class.java)?.notify(NOTIFICATION_ID, buildNotification(this, message, progress, max))
+    }
+
+    // ── Main benchmark loop ──────────────────────────────────────────────
+
+    /**
+     * Runs one full benchmark pass: pipeline init, LLM load, warmup, then every
+     * selected query × retrieval mode × repeat; results go to benchmark_results.json.
+     * @param cooldownMs pause after warmup and between timed runs
+     * @param skipRetrieval No-RAG runs only; takes precedence over [ragOnly]
+     * @param queryFilter category or query id to keep; null keeps every query
+     * @param retrieveKOverride top-k passed to each timed query; null = pipeline default
+     */
+    private suspend fun runBenchmark(
+        repeats: Int,
+        cooldownMs: Long,
+        skipRetrieval: Boolean,
+        ragOnly: Boolean,
+        queryFilter: String?,
+        retrieveKOverride: Int?,
+    ) {
+        val benchmarkStart = System.currentTimeMillis()
+        val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date())
+
+        Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter retrieve_k=${retrieveKOverride ?: "default"} rag_only=$ragOnly")
+
+        val deviceInfo = collectDeviceInfo()
+        Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})")
+
+        updateNotification("Initializing pipeline…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...")
+        val initStart = System.currentTimeMillis()
+        val pipeline = withContext(executor.asCoroutineDispatcher()) {
+            RagPipeline(application)
+        }
+        val syncInitMs = System.currentTimeMillis() - initStart
+        Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms")
+
+        updateNotification("Loading Gemma 4 LLM…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...")
+        val llmWaitStart = System.currentTimeMillis()
+        withContext(executor.asCoroutineDispatcher()) { pipeline.awaitLlmReady() }
+        val llmInitMs = System.currentTimeMillis() - llmWaitStart
+        Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)")
+
+        val warmupQueries = listOf(
+            "Normal fetal heart rate",
+            "Signs of infection after delivery",
+            "A mother has heavy bleeding after birth. What should I do first?",
+            "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?",
+            "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. What should I do immediately while waiting for transport?",
+        )
+        updateNotification("Warmup queries (${warmupQueries.size})…", -1, 0)
+        Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...")
+        val warmupStart = System.currentTimeMillis()
+        warmupQueries.forEachIndexed { i, prompt ->
+            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"")
+            withContext(executor.asCoroutineDispatcher()) {
+                pipeline.generateResponse(
+                    prompt = prompt,
+                    history = emptyList(),
+                    useRetrieval = false,
+                    retrievalListener = {},
+                    generationListener = { _, _ -> }
+                )
+            }
+            Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)")
+        }
+        val warmupMs = System.currentTimeMillis() - warmupStart
+        val totalInitMs = System.currentTimeMillis() - initStart
+        Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms")
+
+        val postInitMemory = collectMemoryInfo()
+        delay(cooldownMs)
+
+        val queries = BenchmarkQueries.ALL.filter { q ->
+            queryFilter == null || q.category == queryFilter || q.id == queryFilter
+        }
+        if (queries.isEmpty()) {
+            Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'")
+            Log.w(BENCH_TAG, "[BENCHMARK] FAILED")
+            return
+        }
+
+        // skipRetrieval and ragOnly are mutually exclusive, but a direct `am start`
+        // can still pass both: warn visibly in logcat instead of silently picking one.
+        if (skipRetrieval && ragOnly) {
+            Log.w(BENCH_TAG, "[BENCHMARK] WARNING: skip_retrieval AND rag_only both set; skip_retrieval wins (No-RAG only).")
+        }
+        val retrievalModes = when {
+            skipRetrieval -> listOf(false)
+            ragOnly -> listOf(true)
+            else -> listOf(true, false)
+        }
+        val totalRuns = queries.size * retrievalModes.size * repeats
+        Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs")
+
+        val results = mutableListOf<JSONObject>()
+        var runIndex = 0
+
+        for (query in queries) {
+            for (useRetrieval in retrievalModes) {
+                for (rep in 1..repeats) {
+                    runIndex++
+                    updateNotification("[$runIndex/$totalRuns] ${query.id} rep=$rep", runIndex, totalRuns)
+                    Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats")
+
+                    val preMemory = collectMemoryInfo()
+                    val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride)
+                    val postMemory = collectMemoryInfo()
+                    val decodeTps = if (result.decodeMs > 0) round2(result.estimatedTokens / (result.decodeMs / 1000.0)) else 0.0
+
+                    val entry = JSONObject().apply {
+                        put("query_id", query.id)
+                        put("category", query.category)
+                        put("query_text", query.text)
+                        put("query_word_count", query.wordCount)
+                        put("use_retrieval", useRetrieval)
+                        put("repetition", rep)
+                        put("retrieval_time_ms", result.retrievalTimeMs)
+                        put("ttft_ms", result.ttftMs)
+                        put("prefill_ms", result.prefillMs)
+                        put("decode_ms", result.decodeMs)
+                        put("total_generation_ms", result.generationTotalMs)
+                        put("total_query_ms", result.totalQueryMs)
+                        put("response_length_chars", result.responseChars)
+                        put("estimated_tokens", result.estimatedTokens)
+                        put("decode_throughput_tps", decodeTps)
+                        put("num_retrieved_docs", result.numRetrievedDocs)
+                        put("retrieved_chunks", JSONArray().apply {
+                            result.retrievedChunks.forEach { doc ->
+                                put(JSONObject().apply {
+                                    put("text", doc.text)
+                                    put("source", doc.source)
+                                    put("page", doc.page)
+                                    put("chars", doc.text.length)
+                                })
+                            }
+                        })
+                        put("retrieved_total_chars", result.retrievedTotalChars)
+                        put("response_text", result.responseText)
+                        put("error", result.error ?: JSONObject.NULL)
+                        put("heap_before_mb", preMemory.getInt("used_mb"))
+                        put("heap_after_mb", postMemory.getInt("used_mb"))
+                    }
+                    results.add(entry)
+                    Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps")
+
+                    // No cooldown needed after the very last run.
+                    if (runIndex < totalRuns) delay(cooldownMs)
+                }
+            }
+        }
+
+        val output = JSONObject().apply {
+            put("benchmark_version", 1)
+            put("timestamp", timestamp)
+            put("device", deviceInfo)
+            put("config", JSONObject().apply {
+                put("repeats", repeats)
+                put("cooldown_ms", cooldownMs)
+                put("skip_retrieval", skipRetrieval)
+                put("rag_only", ragOnly)
+                put("query_filter", queryFilter ?: JSONObject.NULL)
+                put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL)
+                put("model", "gemma-4-E4B-it.litertlm")
+                // Backend comes from BuildConfig (older builds hard-coded "CPU" even when GPU was active).
+                put("backend", if (BuildConfig.USE_GPU_FOR_LLM) "GPU" else "CPU")
+                put("mtp_enabled", BuildConfig.USE_MTP_FOR_LLM)
+                put("max_tokens", 32000)
+                put("temperature", 1.0)
+                put("top_p", 0.95)
+                put("top_k", 64)
+            })
+            put("init", JSONObject().apply {
+                put("gecko_sqlite_ms", syncInitMs)
+                put("llm_load_ms", llmInitMs)
+                put("warmup_query_ms", warmupMs)
+                put("total_init_ms", totalInitMs)
+            })
+            put("memory", postInitMemory)
+            put("results", JSONArray(results))
+            put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart)
+        }
+
+        // getExternalFilesDir() returns null when shared storage is unavailable;
+        // fall back to internal storage rather than losing the whole run.
+        val outDir = getExternalFilesDir(null) ?: filesDir.also {
+            Log.w(BENCH_TAG, "[BENCHMARK] External files dir unavailable; writing results to ${it.absolutePath}")
+        }
+        val outFile = File(outDir, "benchmark_results.json")
+        outFile.writeText(output.toString(2))
+        Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}")
+        Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE")
+    }
+
+    // ── Single-query execution ───────────────────────────────────────────
+
+    private data class QueryResult(
+        val retrievalTimeMs: Long, // query start → retrievalListener callback; 0 if it never fired
+        val ttftMs: Long, // end of retrieval (or query start) → first non-empty partial; 0 if none
+        val prefillMs: Long, // currently always the same value as ttftMs
+        val decodeMs: Long, // first non-empty partial → end of query; 0 if none arrived
+        val generationTotalMs: Long, // end of retrieval (or query start) → end of query
+        val totalQueryMs: Long, // query start → end of query
+        val responseChars: Int, // length of the accumulated response text
+        val estimatedTokens: Int, // responseChars / CHARS_PER_TOKEN_ESTIMATE, truncated
+        val numRetrievedDocs: Int, // size of the list handed to retrievalListener
+        val retrievedChunks: List<RetrievedDoc>, // docs handed to retrievalListener; empty if it never fired
+        val retrievedTotalChars: Int, // sum of text lengths over retrievedChunks
+        val responseText: String, // all generation partials appended in arrival order
+        val error: String?, // message of the exception that ended the query, else null
+    )
+
+    /**
+     * Runs one prompt through [pipeline] on the pipeline executor and derives
+     * wall-clock timings from the listener callbacks. A pipeline failure is
+     * captured in [QueryResult.error]; cancellation of the benchmark is rethrown.
+     */
+    private suspend fun runQuery(
+        pipeline: RagPipeline,
+        queryText: String,
+        useRetrieval: Boolean,
+        retrieveKOverride: Int?,
+    ): QueryResult {
+        var numDocs = 0
+        var retrievalDoneTime = 0L
+        var firstTokenTime = 0L
+        var error: String? = null
+        var retrievedChunks: List<RetrievedDoc> = emptyList()
+        val responseBuilder = StringBuilder()
+
+        val qStart = System.currentTimeMillis()
+        try {
+            withContext(executor.asCoroutineDispatcher()) {
+                pipeline.generateResponse(
+                    prompt = queryText,
+                    history = emptyList(),
+                    useRetrieval = useRetrieval,
+                    retrievalListener = { docs ->
+                        retrievalDoneTime = System.currentTimeMillis()
+                        numDocs = docs.size
+                        retrievedChunks = docs
+                    },
+                    generationListener = { partial, _ ->
+                        responseBuilder.append(partial)
+                        if (firstTokenTime == 0L && partial.isNotEmpty()) {
+                            firstTokenTime = System.currentTimeMillis()
+                        }
+                    },
+                    retrieveKOverride = retrieveKOverride,
+                )
+            }
+        } catch (e: Exception) {
+            // If the benchmark coroutine itself was cancelled (e.g. onDestroy →
+            // scope.cancel()), propagate instead of recording a bogus failed query.
+            if (kotlin.coroutines.coroutineContext[kotlinx.coroutines.Job]?.isActive == false) throw e
+            error = e.message
+            Log.e(BENCH_TAG, "[BENCHMARK] Query failed: ${e.message}", e)
+        }
+        val qEnd = System.currentTimeMillis()
+
+        // TTFT excludes retrieval; we measure from end-of-retrieval to first token.
+        val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart
+        val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0L
+        val responseChars = responseBuilder.length
+
+        return QueryResult(
+            retrievalTimeMs = if (retrievalDoneTime > 0) retrievalDoneTime - qStart else 0L,
+            ttftMs = ttftMs,
+            prefillMs = ttftMs,
+            decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0L,
+            generationTotalMs = qEnd - genStart,
+            totalQueryMs = qEnd - qStart,
+            responseChars = responseChars,
+            estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt(),
+            numRetrievedDocs = numDocs,
+            retrievedChunks = retrievedChunks,
+            retrievedTotalChars = retrievedChunks.sumOf { it.text.length },
+            responseText = responseBuilder.toString(),
+            error = error,
+        )
+    }
+
+    // ── Helpers ──────────────────────────────────────────────────────────
+
+    /** Hardware/OS identifiers read from android.os.Build; "soc" is "unknown" below API 31. */
+    private fun collectDeviceInfo(): JSONObject = JSONObject()
+        .put("manufacturer", Build.MANUFACTURER)
+        .put("model", Build.MODEL)
+        .put("device", Build.DEVICE)
+        .put("hardware", Build.HARDWARE)
+        .put("board", Build.BOARD)
+        .put("soc", if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) Build.SOC_MODEL else "unknown")
+        .put("android_version", Build.VERSION.RELEASE)
+        .put("sdk_int", Build.VERSION.SDK_INT)
+        .put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown")
+
+    /** JVM heap snapshot in MiB; total/free are each read once so the reported fields agree. */
+    private fun collectMemoryInfo(): JSONObject {
+        val rt = Runtime.getRuntime()
+        val (total, free) = rt.totalMemory() to rt.freeMemory()
+        return JSONObject().put("used_mb", (total - free) / 1024 / 1024)
+            .put("free_mb", free / 1024 / 1024)
+            .put("total_mb", total / 1024 / 1024)
+            .put("max_mb", rt.maxMemory() / 1024 / 1024)
+    }
+
+    private fun round2(v: Double): Double = Math.round(v * 100.0).toDouble() / 100.0 // nearest hundredth; ties round toward +∞
+
+    /** Registers the low-importance benchmark channel on API 26+ unless it already exists. */
+    private fun ensureChannel(context: Context) {
+        if (Build.VERSION.SDK_INT < Build.VERSION_CODES.O) return
+        val manager = context.getSystemService(NotificationManager::class.java) ?: return
+        if (manager.getNotificationChannel(CHANNEL_ID) != null) return
+
+        val benchmarkChannel = NotificationChannel(
+            CHANNEL_ID,
+            "MAM-AI Benchmark",
+            NotificationManager.IMPORTANCE_LOW,
+        )
+        benchmarkChannel.description =
+            "Foreground notification while the on-device latency benchmark runs"
+        benchmarkChannel.setShowBadge(false)
+        manager.createNotificationChannel(benchmarkChannel)
+    }
+
+    /**
+     * Builds the ongoing, low-priority benchmark notification. The progress bar
+     * is determinate only when max > 0 and progress >= 0; otherwise indeterminate.
+     */
+    private fun buildNotification(
+        context: Context,
+        message: String,
+        progress: Int,
+        max: Int,
+    ): Notification {
+        val determinate = max > 0 && progress >= 0
+        return NotificationCompat.Builder(context, CHANNEL_ID)
+            .setContentTitle("MAM-AI Benchmark")
+            .setContentText(message)
+            .setSmallIcon(android.R.drawable.stat_sys_download)
+            .setOngoing(true)
+            .setOnlyAlertOnce(true)
+            .setPriority(NotificationCompat.PRIORITY_LOW)
+            .setProgress(if (determinate) max else 0, if (determinate) progress else 0, !determinate)
+            .build()
+    }
+}
diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
index 19f582e..e13e391 100644
--- a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
+++ b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt
@@ -219,7 +219,12 @@ class RagPipeline(application: Application) {
         }
     }
 
-    /** Generates the response from the LLM with conversation history support. */
+    /** Generates the response from the LLM with conversation history support.
+     *
+     *  [retrieveKOverride] — when non-null, replaces `retrievalConfig.top_k`
+     *  for this call only. Used by [BenchmarkForegroundService] for the per-k
+     *  latency sweep; production callers leave it null and inherit the runtime config.
+     */
     suspend fun generateResponse(
         prompt: String,
         history: List<Map<String, String>>,
@@ -227,6 +232,7 @@ class RagPipeline(application: Application) {
         language: String = "en",
         retrievalListener: (docs: List<RetrievedDoc>) -> Unit,
         generationListener: (partial: String, done: Boolean) -> Unit,
+        retrieveKOverride: Int? = null,
     ): String =
         coroutineScope {
             awaitLlmReady()
@@ -235,10 +241,11 @@ class RagPipeline(application: Application) {
             val qStart = System.currentTimeMillis()
 
             val docs = if (useRetrieval) {
+                val effectiveTopK = retrieveKOverride ?: retrievalConfig.getInt("top_k")
                 val retrievalRequest = RetrievalRequest.create(
                     prompt,
                     RetrievalConfig.create(
-                        retrievalConfig.getInt("top_k"),
+                        effectiveTopK,
                         retrievalConfig.getDouble("similarity_threshold").toFloat(),
                         TaskType.RETRIEVAL_QUERY,
                     ),
diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py
new file mode 100644
index 0000000..d11e390
--- /dev/null
+++ b/evaluation/aggregate_k_sweep.py
@@ -0,0 +1,417 @@
+#!/usr/bin/env python3
+"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report.
+
+Reads all benchmark_*.json files produced by benchmark_latency.py, groups them
+by (backend, k_override), and writes a markdown report at
+evaluation/reports/latency_report_v2.md.
+
+Notes on backend identification: post-fix benchmark JSONs (commit ef96538
+onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep
+JSONs hard-code `backend="CPU"` even though they were measured on GPU; we
+backfill those using an explicit filename allowlist (see `backend_of`).
+Future runs of any backend are unaffected.
+"""
+from __future__ import annotations
+
+import datetime
+import glob
+import json
+import os
+import statistics
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# Backfill for the specific historical GPU sweep files that predate the
+# metadata-recording fix in commit ef96538. Those JSONs hard-code
+# config.backend="CPU" even though they were measured on GPU. We use an
+# explicit filename allowlist (rather than a timestamp threshold) so the
+# rewrite cannot accidentally fire on anyone else's pre-threshold *genuine
+# CPU* JSONs that happen to share latency_results/.
+PRE_FIX_GPU_FILES = frozenset({
+    "benchmark_20260514T174502_k1.json",
+    "benchmark_20260514T180830_k3.json",
+    "benchmark_20260514T183604_k5.json",
+    "benchmark_20260514T190438_k7.json",
+    "benchmark_20260514T193453_k10.json",
+    "benchmark_20260514T200414_k15.json",
+    "benchmark_20260514T203653_k20.json",
+    "benchmark_20260514T210522.json",
+})
+
+
+def backend_of(filename: str, recorded: str) -> str:
+    """Return the backend a result file should be attributed to.
+
+    Allowlisted pre-fix files are forced to "GPU"; any other file keeps `recorded`."""
+    return "GPU" if filename in PRE_FIX_GPU_FILES else recorded
+
+
+def load_runs(results_dir: str | None = None) -> list[dict]:
+    """Load canonical sweep JSONs, tagging each with its backend and k.
+
+    Reads `benchmark_*.json` from `results_dir` (default: `latency_results/`
+    next to this script). Files that are unreadable, lack `config`/`results`,
+    have < 30 results, or carry no k are skipped with a note on stderr.
+    Returns dicts with keys file, timestamp, backend, k (0 = no-RAG), data.
+    """
+    if results_dir is None:
+        results_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "latency_results")
+    runs = []
+    for path in sorted(glob.glob(os.path.join(results_dir, "benchmark_*.json"))):
+        name = os.path.basename(path)
+        try:
+            with open(path, encoding="utf-8") as fp:  # app writes UTF-8; ignore host locale
+                d = json.load(fp)
+        except (ValueError, OSError) as exc:  # ValueError: invalid JSON or invalid UTF-8
+            print(f"SKIP: {name} — unreadable ({exc})", file=sys.stderr)
+            continue
+        if "config" not in d or "results" not in d:
+            print(f"SKIP: {name} — missing config or results key", file=sys.stderr)
+            continue
+        if len(d["results"]) < 30:
+            # Ad-hoc smoke test (canonical sweep is 54 runs); logged so a narrow sweep isn't lost silently.
+            print(
+                f"SKIP: {name} — {len(d['results'])} results "
+                "(< 30 threshold for canonical sweeps; pass it through if it "
+                "should appear in the matrix)",
+                file=sys.stderr,
+            )
+            continue
+        config = d["config"]
+        # k=0 marks the no-RAG baseline; otherwise k is the explicit override.
+        k_label = 0 if config.get("skip_retrieval", False) else config.get("retrieval_top_k_override")
+        if k_label is None:
+            print(f"SKIP: {name} — no retrieval_top_k_override and not a no-RAG run", file=sys.stderr)
+            continue
+        # Missing config.backend means a pre-fix JSON: expected only for allowlisted files, so warn otherwise.
+        recorded_backend = config.get("backend")
+        if recorded_backend is None:
+            if name not in PRE_FIX_GPU_FILES:
+                print(
+                    f"WARN: {name} has no config.backend "
+                    "field and is not on the pre-fix allowlist; defaulting "
+                    "to CPU. If this was actually a GPU run, fix the source.",
+                    file=sys.stderr,
+                )
+            recorded_backend = "CPU"
+        runs.append({
+            "file": name,
+            "timestamp": name.replace("benchmark_", "").split(".")[0].split("_")[0],
+            "backend": backend_of(name, recorded_backend),
+            "k": k_label,
+            "data": d,
+        })
+    return runs
+
+
+def _p95(values: list[float]) -> int | None:
+    """95th percentile, truncated to int and never above the observed maximum.
+
+    `statistics.quantiles(data, n=20)` returns 19 cut points; index 18 is the
+    95th percentile. With method="exclusive" that cut point can be
+    extrapolated past the largest sample when fewer than 19 values are given
+    (e.g. [100, 200] -> 285), so the result is clamped to max(values).
+    Returns None for empty input and the sole value for a single sample.
+    """
+    if not values:
+        return None
+    if len(values) < 2:
+        return int(values[0])
+    cut = statistics.quantiles(values, n=20, method="exclusive")[18]
+    return int(min(cut, max(values)))
+
+
+def aggregate_per_category(d: dict, key: str) -> dict[str, dict]:
+    """Per-category {"n", "median", "p95"} for one timing field.
+
+    Results with a truthy `error` are ignored; median is truncated to int.
+    """
+    by_category: dict[str, list] = {}
+    for row in d["results"]:
+        if not row.get("error"):
+            by_category.setdefault(row["category"], []).append(row[key])
+    return {
+        category: {
+            "n": len(samples),
+            "median": int(statistics.median(samples)),
+            "p95": _p95(samples),
+        }
+        for category, samples in by_category.items()
+    }
+
+
+def aggregate_overall(d: dict, key: str) -> dict:
+    """{"n", "median", "p95"} of `key` over all error-free results; {} if there are none."""
+    samples = [row[key] for row in d["results"] if not row.get("error")]
+    summary: dict = {}
+    if samples:
+        summary["n"] = len(samples)
+        summary["median"] = int(statistics.median(samples))
+        summary["p95"] = _p95(samples)
+    return summary
+
+
+def median_doc_chars(d: dict) -> int:
+    """Median `retrieved_total_chars` of the error-free results, truncated to
+    int; a result without that field counts as 0, and no results yields 0."""
+    ok_rows = (row for row in d["results"] if not row.get("error"))
+    return int(statistics.median([row.get("retrieved_total_chars", 0) for row in ok_rows] or [0]))
+
+
+def fmt_ms(v: int | None) -> str:  # value shown as-is; em dash when missing
+    return "—" if v is None else str(v)
+
+
+def fmt_s(v: int | None) -> str:  # milliseconds -> seconds with one decimal; em dash when missing
+    return "—" if v is None else format(v / 1000, ".1f")
+
+
+def write_report(runs: list[dict], out_path: Path) -> None:
+    # Build {(backend, k) -> canonical run}; duplicates keep the run with the most successful entries
+    matrix: dict[tuple[str, int], dict] = {}
+    for r in runs:
+        key = (r["backend"], r["k"])
+        if key in matrix:
+            # Keep the run with most successful entries (resolves duplicates)
+            ex = matrix[key]
+            ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error"))
+            r_ok = sum(1 for x in r["data"]["results"] if not x.get("error"))
+            if r_ok > ex_ok:
+                matrix[key] = r
+        else:
+            matrix[key] = r
+
+    gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"])
+    cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"])
+    all_ks = sorted(set(gpu_ks + cpu_ks))
+
+    # Sample run for device info
+    sample = next(iter(matrix.values()))
+    dev = sample["data"]["device"]
+
+    md = []
+    md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n")
+    md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n")
+    md.append("")
+    md.append("## Device & stack\n")
+    md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}")
+    md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)")
+    md.append(f"- **LiteRT-LM**: 0.11.0")
+    md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU")
+    md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000")
+    md.append("")
+    # Pull the actual values from the sample run's config instead of hard-coding
+    # text that can lie. If different runs used different settings, this won't
+    # catch that — but we'd rather report the sample's truth than fabricate a
+    # round-number claim.
+    sample_cfg = sample["data"].get("config", {})
+    sample_repeats = sample_cfg.get("repeats", "?")
+    sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0
+    sample_n_results = len(sample["data"]["results"])
+    # Infer queries × modes from total runs / repeats. Default to "?" if the
+    # math doesn't divide evenly.
+    queries_x_modes: object = "?"
+    if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0:
+        queries_x_modes = sample_n_results // sample_repeats
+    md.append("## Methodology\n")
+    md.append(
+        f"Per backend × k configuration: {queries_x_modes} (query × mode) cells "
+        f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a "
+        f"No-RAG baseline per backend (k=0 via `--no-retrieval`). "
+        f"{sample_cooldown_s:g}-second cooldown between runs for thermal "
+        "stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so "
+        "the run survives screen-off and device-lock; OPPO Hans whitelist set "
+        "manually."
+    )
+    md.append("")
+    md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.")
+    md.append("- `decode` is first-token to last-token.")
+    md.append("- `total_query` is everything: `retrieval + TTFT + decode`.")
+    md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).")
+    md.append("")
+
+    # ─────────── Headline table: total_query_ms by (backend, k) ───────────
+    md.append("## Headline — Median total query latency (seconds)\n")
+    md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        # doc chars: take from GPU if available, else CPU
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gpu_cells = "—"
+        cpu_cells = "—"
+        if gpu_run:
+            g = aggregate_per_category(gpu_run["data"], "total_query_ms")
+            gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"])
+        if cpu_run:
+            c_ = aggregate_per_category(cpu_run["data"], "total_query_ms")
+            cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"])
+        # ratio
+        ratio = ""
+        if gpu_run and cpu_run:
+            gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median")
+            cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median")
+            if gov is not None and cov is not None and gov > 0:
+                ratio = f"{cov / gov:.2f}×"
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |")
+    md.append("")
+
+    # ─────────── TTFT detail ───────────
+    md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc content\n")
+    md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0
+        gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None
+        # Explicit None checks; also guard against div-by-zero on a 0 median.
+        ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+    md.append("")
+
+    # ─────────── Decode detail ───────────
+    md.append("## Decode (ms, median) — first token to last token\n")
+    md.append("Decode time mostly tracks output length, not k or doc content. Variation across k reflects ")
+    md.append("the model writing *longer answers* when given more context (more material to draw on).")
+    md.append("")
+    md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |")
+    md.append(f"|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None
+        ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |")
+    md.append("")
+
+    # ─────────── p95 totals ───────────
+    md.append("## p95 total query latency (s) — tail-latency view\n")
+    md.append(f"| k | GPU p95 | CPU p95 |")
+    md.append(f"|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None
+        cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |")
+    md.append("")
+
+    # ─────────── Errors / context limit ───────────
+    md.append("## Errors and the 4096-token context wall\n")
+    md.append(f"| k | GPU errors / 54 | CPU errors / 54 |")
+    md.append(f"|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None
+        ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |")
+    md.append("")
+    md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ")
+    md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ")
+    md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ")
+    md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ")
+    md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. ")
+    md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.")
+    md.append("")
+    md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ")
+    md.append("deployment budget at this depth even when the request fits in the context window.")
+    md.append("")
+
+    # ─────────── Wall-clock comparison ───────────
+    md.append("## Wall-clock comparison\n")
+    md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |")
+    md.append("|---:|---:|---:|---:|")
+    for k in all_ks:
+        gpu_run = matrix.get(("GPU", k))
+        cpu_run = matrix.get(("CPU", k))
+        gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None
+        cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None
+        gw_s = f"{gw:.1f}" if gw is not None else "—"
+        cw_s = f"{cw:.1f}" if cw is not None else "—"
+        ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else ""
+        label = "**0 (no-RAG)**" if k == 0 else str(k)
+        md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |")
+
+    # Findings / interpretation
+    md.append("")
+    md.append("## Key findings\n")
+    md.append("")
+    md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite")
+    md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ")
+    md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ")
+    md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.")
+    md.append("")
+    md.append("### 2. The model's 4096-token context window is the binding ceiling at high k")
+    md.append("k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — ")
+    md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ")
+    md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ")
+    md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ")
+    md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ")
+    md.append("Latency is *not* the constraint at the upper end; the model's context window is.")
+    md.append("")
+    md.append("### 3. Latency is not the binding factor on GPU below k=15")
+    md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ")
+    md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ")
+    md.append("not by what fits in the latency budget.")
+    md.append("")
+    md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow")
+    md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ")
+    md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ")
+    md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ")
+    md.append("or **k ≤ 1** if you want sub-40s p95.")
+    md.append("")
+    md.append("### 5. Decode time is content-driven, not k-driven")
+    md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ")
+    md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. ")
+    md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ")
+    md.append("not compute-bound on this hardware.")
+    md.append("")
+    md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3")
+    md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ")
+    md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ")
+    md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.")
+    md.append("")
+
+    # File inventory
+    md.append("## Data inventory (per `(backend, k)`)\n")
+    md.append("| Backend | k | File | Wall (min) | Runs | Errors |")
+    md.append("|---|---:|---|---:|---:|---:|")
+    for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])):
+        r = matrix[(b, k)]
+        wall = r["data"]["total_benchmark_time_ms"] / 60000
+        n = len(r["data"]["results"])
+        e = sum(1 for x in r["data"]["results"] if x.get("error"))
+        label = "0 (no-RAG)" if k == 0 else str(k)
+        md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |")
+    md.append("")
+    md.append("---")
+    md.append("")
+    md.append("_Source benchmark JSONs live in `evaluation/latency_results/`. ")
+    md.append("Aggregation script: `evaluation/aggregate_k_sweep.py`._")
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text("\n".join(md) + "\n")
+    print(f"Report written to: {out_path}")
+
+
+def main() -> int:  # Script entry point: load the runs, write the Markdown report, return exit status 0.
+    runs = load_runs()
+    print(f"Loaded {len(runs)} canonical runs")
+    out = Path(__file__).resolve().parent / "reports" / "latency_report_v2.md"  # reports/ directory beside this script
+    write_report(runs, out)  # write_report creates out's parent directory before writing
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the process exit status
diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py
index 5611e21..7486f96 100644
--- a/evaluation/benchmark_latency.py
+++ b/evaluation/benchmark_latency.py
@@ -12,6 +12,7 @@
     python evaluation/benchmark_latency.py --filter long_01         # Single specific query
     python evaluation/benchmark_latency.py --no-retrieval           # Skip RAG retrieval
     python evaluation/benchmark_latency.py --cooldown 10000         # Longer cooldown (thermal)
+    python evaluation/benchmark_latency.py --retrieve-k 5           # Override retrieval top_k for this session
 """
 
 import argparse
@@ -68,9 +69,15 @@ def check_device(device_serial=None):
 
 
 def check_models_downloaded(device_serial=None):
-    """Check if model files exist on device."""
+    """Check if model files exist on device.
+
+    Filenames must match config/app_config.json — the app loads
+    "llm_model" / "embedding_model" / "tokenizer" from there. Updated
+    for the Gemma 4 E4B / LiteRT-LM 0.11.0 stack; the old Gemma 3n
+    .task name is no longer in production.
+    """
     required_files = [
-        "gemma-3n-E4B-it-int4.task",
+        "gemma-4-E4B-it.litertlm",
         "Gecko_1024_quant.tflite",
         "sentencepiece.model",
         "embeddings.sqlite",
@@ -103,7 +110,8 @@ def clear_logcat(device_serial=None):
 
 
 def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
-                     skip_retrieval=False, query_filter=None):
+                     skip_retrieval=False, rag_only=False,
+                     query_filter=None, retrieve_k=None):
     """Launch BenchmarkActivity via ADB."""
     cmd = _adb(device_serial) + [
         "shell", "am", "start",
@@ -113,8 +121,12 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000,
     ]
     if skip_retrieval:
         cmd += ["--ez", "skip_retrieval", "true"]
+    if rag_only:
+        cmd += ["--ez", "rag_only", "true"]
     if query_filter:
         cmd += ["--es", "query_filter", query_filter]
+    if retrieve_k is not None:
+        cmd += ["--ei", "retrieve_k", str(retrieve_k)]
 
     result = subprocess.run(cmd, capture_output=True, text=True)
     if "Error" in result.stderr:
@@ -458,8 +470,16 @@ def main():
                         help="Cooldown between queries in ms (default: 5000)")
     parser.add_argument("--no-retrieval", action="store_true",
                         help="Skip RAG retrieval (generation only)")
+    parser.add_argument("--rag-only", action="store_true",
+                        help="Skip the No-RAG mode (only run with retrieval). "
+                             "Pair with --retrieve-k to do a k-sweep without "
+                             "re-running the No-RAG baseline at every k.")
     parser.add_argument("--filter", type=str, default=None,
                         help="Filter by category (short/medium/long) or query ID (e.g., long_01)")
+    parser.add_argument("--retrieve-k", type=int, default=None,
+                        help="Override retrieval top_k for this session "
+                             "(default: use runtime_config.json's value, currently 3). "
+                             "Used for the per-k latency sweep.")
     parser.add_argument("--output-dir", type=str, default="evaluation/latency_results",
                         help="Directory for output files")
     parser.add_argument("--device", type=str, default=None,
@@ -468,6 +488,14 @@ def main():
                         help="Timeout in seconds (default: 7200)")
     args = parser.parse_args()
 
+    if args.no_retrieval and args.rag_only:
+        parser.error("--no-retrieval and --rag-only are mutually exclusive")
+    if args.retrieve_k is not None and args.retrieve_k < 1:
+        # The service treats any value >= 0 as an explicit override. Passing 0
+        # would call RetrievalConfig.create(0, …), which is a silent footgun
+        # — use --no-retrieval if you actually want to disable retrieval.
+        parser.error("--retrieve-k must be >= 1; use --no-retrieval to disable retrieval entirely")
+
     print("=" * 60)
     print("MAM-AI On-Device Latency Benchmark")
     print("=" * 60)
@@ -494,13 +522,16 @@ def main():
         clear_logcat(args.device)
 
         # Launch benchmark
-        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}")
+        k_msg = f", retrieve_k={args.retrieve_k}" if args.retrieve_k is not None else ""
+        print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}{k_msg}")
         launch_benchmark(
             device_serial=args.device,
             repeats=args.repeats,
             cooldown_ms=args.cooldown,
             skip_retrieval=args.no_retrieval,
+            rag_only=args.rag_only,
             query_filter=args.filter,
+            retrieve_k=args.retrieve_k,
         )
 
         # Wait for completion
@@ -509,8 +540,9 @@ def main():
             print("Benchmark did not complete successfully.")
             sys.exit(1)
 
-        # Pull results
-        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}.json")
+        # Pull results — include k in the filename so a sweep across k values is legible.
+        k_suffix = f"_k{args.retrieve_k}" if args.retrieve_k is not None else ""
+        json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}{k_suffix}.json")
         pull_results(args.device, json_path)
 
     # Load and analyze
diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md
new file mode 100644
index 0000000..c6745a6
--- /dev/null
+++ b/evaluation/reports/latency_report_v2.md
@@ -0,0 +1,176 @@
+# MAM-AI On-Device Latency Sweep — GPU vs CPU
+
+_Generated: 2026-05-15T10:51:06_
+
+
+## Device & stack
+
+- **Device**: OnePlus OPD2413 (SM8750P) — Android 15
+- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)
+- **LiteRT-LM**: 0.11.0
+- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU
+- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000
+
+## Methodology
+
+Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually.
+
+- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.
+- `decode` is first-token to last-token.
+- `total_query` is everything: `retrieval + TTFT + decode`.
+- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`).
+
+## Headline — Median total query latency (seconds)
+
+| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 12.9 / 15.6 / 16.1 | 27.2 / 26.9 / 29.8 | 1.94× |
+| 1 | 561 | 13.1 / 12.6 / 17.3 | 29.3 / 31.9 / 30.3 | 2.14× |
+| 3 | 2098 | 18.6 / 18.6 / 21.0 | 37.3 / 44.5 / 42.5 | 2.24× |
+| 5 | 3547 | 18.2 / 20.0 / 21.4 | 54.8 / 60.7 / 63.0 | 3.07× |
+| 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× |
+| 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× |
+| 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× |
+| 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× |
+
+## TTFT (ms, median) — prefill cost grows with retrieved-doc content
+
+| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |
+|---:|---:|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 962 | 12633 | 13.1× |
+| 1 | 561 | 954 | 12649 | 13.3× |
+| 3 | 2098 | 989 | 18356 | 18.6× |
+| 5 | 3547 | 1884 | 36424 | 19.3× |
+| 7 | 5139 | 1920 | 36444 | 19.0× |
+| 10 | 7482 | 2523 | 40013 | 15.9× |
+| 15 | 11297 | 3457 | 54748 | 15.8× |
+| 20 | 14520 | 3986 | 72881 | 18.3× |
+
+## Decode (ms, median) — first token to last token
+
+Decode time mostly tracks output length, not k or doc content. Variation across k reflects 
+the model writing *longer answers* when given more context (more material to draw on).
+
+| k | GPU decode | CPU decode | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 13470 | 15345 | 1.14× |
+| 1 | 11415 | 13961 | 1.22× |
+| 3 | 16364 | 19110 | 1.17× |
+| 5 | 15929 | 21645 | 1.36× |
+| 7 | 17215 | 23473 | 1.36× |
+| 10 | 18118 | 21699 | 1.20× |
+| 15 | 16820 | 22497 | 1.34× |
+| 20 | 14688 | 22634 | 1.54× |
+
+## p95 total query latency (s) — tail-latency view
+
+| k | GPU p95 | CPU p95 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 26.1 | 38.4 |
+| 1 | 26.1 | 37.1 |
+| 3 | 30.3 | 64.3 |
+| 5 | 30.7 | 74.6 |
+| 7 | 35.1 | 81.8 |
+| 10 | 29.0 | 84.5 |
+| 15 | 30.6 | 112.7 |
+| 20 | 35.3 | 104.9 |
+
+## Errors and the 4096-token context wall
+
+| k | GPU errors / 54 | CPU errors / 54 |
+|---:|---:|---:|
+| **0 (no-RAG)** | 0 | 0 |
+| 1 | 0 | 0 |
+| 3 | 0 | 0 |
+| 5 | 0 | 0 |
+| 7 | 0 | 0 |
+| 10 | 0 | 0 |
+| 15 | 0 | 0 |
+| 20 | 24 | 24 |
+
+At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. 
+Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both 
+backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — 
+the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of 
+the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
+The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.
+
+Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any 
+deployment budget at this depth even when the request fits in the context window.
+
+## Wall-clock comparison
+
+| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |
+|---:|---:|---:|---:|
+| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× |
+| 1 | 23.0 | 38.7 | 1.68× |
+| 3 | 27.3 | 50.2 | 1.84× |
+| 5 | 28.2 | 63.0 | 2.23× |
+| 7 | 30.0 | 66.5 | 2.22× |
+| 10 | 29.1 | 73.2 | 2.51× |
+| 15 | 32.4 | 90.8 | 2.80× |
+| 20 | 22.8 | 58.6 | 2.57× |
+
+## Key findings
+
+
+### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite
+GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. 
+That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), 
+so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.
+
+### 2. The model's 4096-token context window is the binding ceiling at high k
+k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — 
+the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. 
+Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives 
+the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, 
+not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. 
+Latency is *not* the constraint at the upper end; the model's context window is.
+
+### 3. Latency is not the binding factor on GPU below k=15
+GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. 
+Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), 
+not by what fits in the latency budget.
+
+### 4. CPU at k≥5 exceeds any reasonable UX budget; at k=15 it's prohibitively slow
+CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. 
+p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't 
+available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s median budget, 
+or **k ≤ 1** if you want sub-40s p95.
+
+### 5. Decode time is content-driven, not k-driven
+Decode time tracks output length. As k grows, the model writes *longer* responses — likely because 
+more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. 
+Decode-time difference between GPU and CPU is only ~1.1–1.5× across all k, since decode is memory-bandwidth-bound, 
+not compute-bound on this hardware.
+
+### 6. TTFT scales linearly with retrieved-doc content past k=3
+On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, 
+CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting 
+the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.
+
+## Data inventory (per `(backend, k)`)
+
+| Backend | k | File | Wall (min) | Runs | Errors |
+|---|---:|---|---:|---:|---:|
+| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 |
+| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 |
+| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 |
+| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 |
+| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 |
+| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 |
+| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 |
+| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 |
+| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 |
+| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 |
+| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 |
+| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 |
+| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 |
+| GPU | 10 | `benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 |
+| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 |
+| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 |
+
+---
+
+_Source benchmark JSONs live in `evaluation/latency_results/`. 
+Aggregation script: `evaluation/aggregate_k_sweep.py`._