diff --git a/app/android/app/src/main/AndroidManifest.xml b/app/android/app/src/main/AndroidManifest.xml index a44e8c2..d19101a 100644 --- a/app/android/app/src/main/AndroidManifest.xml +++ b/app/android/app/src/main/AndroidManifest.xml @@ -4,6 +4,12 @@ + + + + + = 0 to override; pass -1 + * (or omit) to use runtime_config.json's + * value. The activity normalises -1 to null + * before forwarding to the service. */ class BenchmarkActivity : Activity() { companion object { - private const val TAG = "mam-ai" private const val BENCH_TAG = "mam-ai-bench" - private const val DEFAULT_COOLDOWN_MS = 5_000L - private const val DEFAULT_REPEATS = 3 - private const val CHARS_PER_TOKEN_ESTIMATE = 4.0 } - private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Main) - private val executor = Executors.newSingleThreadExecutor() - private lateinit var logView: TextView - private lateinit var scrollView: ScrollView - override fun onCreate(savedInstanceState: Bundle?) { super.onCreate(savedInstanceState) - // Scrollable log console UI - scrollView = ScrollView(this).apply { - setBackgroundColor(0xFF000000.toInt()) - } - logView = TextView(this).apply { - setTextColor(0xFF00FF00.toInt()) - textSize = 13f - setPadding(32, 48, 32, 48) - text = "=== MAM-AI Benchmark ===\n" - } - scrollView.addView(logView) - setContentView(scrollView) - - val repeats = intent.getIntExtra("repeats", DEFAULT_REPEATS) - val cooldownMs = intent.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) - val skipRetrieval = intent.getBooleanExtra("skip_retrieval", false) - val queryFilter = intent.getStringExtra("query_filter") - - scope.launch { - try { - runBenchmark(repeats, cooldownMs, skipRetrieval, queryFilter) - } catch (t: Throwable) { - Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) - Log.w(BENCH_TAG, "[BENCHMARK] FAILED") - logStatus("FAILED: ${t.message}") - } finally { - finish() - } + val serviceIntent = Intent(this, BenchmarkForegroundService::class.java).apply { + // Forward every extra 
the user might have passed via `am start`. + // Defaults are resolved inside the service. + if (intent.hasExtra("repeats")) + putExtra("repeats", intent.getIntExtra("repeats", 3)) + if (intent.hasExtra("cooldown_ms")) + putExtra("cooldown_ms", intent.getLongExtra("cooldown_ms", 5000L)) + if (intent.hasExtra("skip_retrieval")) + putExtra("skip_retrieval", intent.getBooleanExtra("skip_retrieval", false)) + if (intent.hasExtra("rag_only")) + putExtra("rag_only", intent.getBooleanExtra("rag_only", false)) + if (intent.hasExtra("query_filter")) + putExtra("query_filter", intent.getStringExtra("query_filter")) + if (intent.hasExtra("retrieve_k")) + putExtra("retrieve_k", intent.getIntExtra("retrieve_k", -1)) } - } - - private fun logStatus(text: String) { - runOnUiThread { - logView.append(text + "\n") - scrollView.post { scrollView.fullScroll(ScrollView.FOCUS_DOWN) } - } - } - - // ── Main benchmark loop ────────────────────────────────────────────── - - private suspend fun runBenchmark( - repeats: Int, - cooldownMs: Long, - skipRetrieval: Boolean, - queryFilter: String?, - ) { - val benchmarkStart = System.currentTimeMillis() - val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date()) - - Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter") - - // Device info - val deviceInfo = collectDeviceInfo() - Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})") - // Step 1: Gecko + SQLite init (synchronous part of RagPipeline constructor) - logStatus("Step 1/4: Initializing Gecko embedder + SQLite...") - Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...") - val initStart = System.currentTimeMillis() - val pipeline = withContext(executor.asCoroutineDispatcher()) { - RagPipeline(application) - } - val syncInitMs = System.currentTimeMillis() - initStart - Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms") - 
logStatus("Step 1/4: Gecko + SQLite done (${syncInitMs}ms)") - - // Step 2: Wait for LLM model load (async, started by RagPipeline constructor) - logStatus("Step 2/4: Loading Gemma 4 LLM model...") - Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...") - val llmWaitStart = System.currentTimeMillis() - withContext(executor.asCoroutineDispatcher()) { - pipeline.awaitLlmReady() - } - val llmInitMs = System.currentTimeMillis() - llmWaitStart - Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)") - logStatus("Step 2/4: LLM loaded (${llmInitMs}ms)") - - // Step 3: 5 warmup queries of varying length — warms JIT / LiteRT-LM / shader caches - val warmupQueries = listOf( - "Normal fetal heart rate", - "Signs of infection after delivery", - "A mother has heavy bleeding after birth. What should I do first?", - "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?", - "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. 
What should I do immediately while waiting for transport?", - ) - logStatus("Step 3/4: Running ${warmupQueries.size} warmup queries...") - Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...") - val warmupStart = System.currentTimeMillis() - warmupQueries.forEachIndexed { i, prompt -> - Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"") - withContext(executor.asCoroutineDispatcher()) { - pipeline.generateResponse( - prompt = prompt, - history = emptyList(), - useRetrieval = false, - retrievalListener = {}, - generationListener = { _, _ -> } - ) - } - Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)") - } - val warmupMs = System.currentTimeMillis() - warmupStart - val totalInitMs = System.currentTimeMillis() - initStart - Log.w(BENCH_TAG, "[BENCHMARK] Warmup complete: ${warmupMs}ms total (${warmupQueries.size} queries)") - Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") - - val postInitMemory = collectMemoryInfo() - - // Step 4: Cooldown before timed runs - logStatus("--- Init summary: gecko=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") - logStatus("Cooldown ${cooldownMs}ms...") - Thread.sleep(cooldownMs) - - // Filter queries - val queries = if (queryFilter != null) { - BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter } + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + startForegroundService(serviceIntent) } else { - BenchmarkQueries.ALL - } - - if (queries.isEmpty()) { - Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'") - Log.w(BENCH_TAG, "[BENCHMARK] FAILED") - return + startService(serviceIntent) } - - val retrievalModes = if (skipRetrieval) listOf(false) else listOf(true, false) - val totalRuns = queries.size * retrievalModes.size * repeats - 
Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs") - - // Execution loop - val results = mutableListOf() - var runIndex = 0 - val loopStart = System.currentTimeMillis() - - for (query in queries) { - for (useRetrieval in retrievalModes) { - for (rep in 1..repeats) { - runIndex++ - - // Estimate time remaining based on average time per completed run - val etaStr = if (runIndex > 1) { - val elapsedMs = System.currentTimeMillis() - loopStart - val avgPerRun = elapsedMs.toDouble() / (runIndex - 1) - val remainingMs = (avgPerRun * (totalRuns - runIndex + 1)).toLong() - val remainMin = remainingMs / 60000 - val remainSec = (remainingMs % 60000) / 1000 - "ETA: ${remainMin}m ${remainSec}s" - } else "ETA: calculating..." - - Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats") - logStatus("[$runIndex/$totalRuns] ${query.id} | retrieval=$useRetrieval rep=$rep | $etaStr") - - val preMemory = collectMemoryInfo() - val result = runQuery(pipeline, query.text, useRetrieval) - val postMemory = collectMemoryInfo() - - val decodeTps = if (result.decodeMs > 0) - round2(result.estimatedTokens / (result.decodeMs / 1000.0)) - else 0.0 - - val entry = JSONObject().apply { - put("query_id", query.id) - put("category", query.category) - put("query_text", query.text) - put("query_word_count", query.wordCount) - put("use_retrieval", useRetrieval) - put("repetition", rep) - put("retrieval_time_ms", result.retrievalTimeMs) - put("ttft_ms", result.ttftMs) - put("prefill_ms", result.prefillMs) - put("decode_ms", result.decodeMs) - put("total_generation_ms", result.generationTotalMs) - put("total_query_ms", result.totalQueryMs) - put("response_length_chars", result.responseChars) - put("estimated_tokens", result.estimatedTokens) - put("decode_throughput_tps", decodeTps) - put("num_retrieved_docs", result.numRetrievedDocs) - put("error", result.error 
?: JSONObject.NULL) - put("heap_before_mb", preMemory.getInt("used_mb")) - put("heap_after_mb", postMemory.getInt("used_mb")) - } - results.add(entry) - - val resultLine = " -> ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms tps=$decodeTps" - Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps") - logStatus(resultLine) - - val pct = (runIndex * 100) / totalRuns - val elapsedMin = (System.currentTimeMillis() - loopStart) / 60000 - logStatus(" [${"█".repeat(pct / 5)}${"░".repeat(20 - pct / 5)}] $pct% ($elapsedMin min elapsed)") - - // Cooldown between queries (skip after last run) - if (runIndex < totalRuns) { - Thread.sleep(cooldownMs) - } - } - } - } - - // Assemble output JSON - val output = JSONObject().apply { - put("benchmark_version", 1) - put("timestamp", timestamp) - put("device", deviceInfo) - put("config", JSONObject().apply { - put("repeats", repeats) - put("cooldown_ms", cooldownMs) - put("skip_retrieval", skipRetrieval) - put("query_filter", queryFilter ?: JSONObject.NULL) - put("model", "gemma-4-E4B-it.litertlm") - put("backend", "CPU") - put("max_tokens", 32000) - put("temperature", 1.0) - put("top_p", 0.95) - put("top_k", 64) - }) - put("init", JSONObject().apply { - put("gecko_sqlite_ms", syncInitMs) - put("llm_load_ms", llmInitMs) - put("warmup_query_ms", warmupMs) - put("total_init_ms", totalInitMs) - }) - put("memory", postInitMemory) - put("results", JSONArray(results)) - put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart) - } - - // Write to file - val outFile = File(getExternalFilesDir(null), "benchmark_results.json") - outFile.writeText(output.toString(2)) - Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}") - Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE") - logStatus("COMPLETE\nResults written to:\n${outFile.absolutePath}") - } - - // ── Single query 
execution ─────────────────────────────────────────── - - private data class QueryResult( - val retrievalTimeMs: Long, - val ttftMs: Long, - val prefillMs: Long, - val decodeMs: Long, - val generationTotalMs: Long, - val totalQueryMs: Long, - val responseChars: Int, - val estimatedTokens: Int, - val numRetrievedDocs: Int, - val error: String?, - ) - - private suspend fun runQuery(pipeline: RagPipeline, queryText: String, useRetrieval: Boolean): QueryResult { - var retrievalTimeMs = 0L - var numDocs = 0 - var firstTokenTime = 0L - var error: String? = null - val responseBuilder = StringBuilder() - - val qStart = System.currentTimeMillis() - var retrievalDoneTime = 0L - - try { - withContext(executor.asCoroutineDispatcher()) { - pipeline.generateResponse( - prompt = queryText, - history = emptyList(), - useRetrieval = useRetrieval, - retrievalListener = { docs -> - retrievalDoneTime = System.currentTimeMillis() - retrievalTimeMs = retrievalDoneTime - qStart - numDocs = docs.size - }, - generationListener = { partial, _ -> - responseBuilder.append(partial) - if (firstTokenTime == 0L && partial.isNotEmpty()) { - firstTokenTime = System.currentTimeMillis() - } - } - ) - } - } catch (e: Exception) { - error = e.message - Log.e(TAG, "[BENCHMARK] Query failed: ${e.message}", e) - } - - val qEnd = System.currentTimeMillis() - val totalQueryMs = qEnd - qStart - val responseChars = responseBuilder.length - - // Generation timing — measure from after retrieval (or query start if no retrieval) - val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart - val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0 - val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0 - val generationTotalMs = qEnd - genStart - val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt() - - return QueryResult( - retrievalTimeMs = retrievalTimeMs, - ttftMs = ttftMs, - prefillMs = ttftMs, - decodeMs = decodeMs, - generationTotalMs = 
generationTotalMs, - totalQueryMs = totalQueryMs, - responseChars = responseChars, - estimatedTokens = estimatedTokens, - numRetrievedDocs = numDocs, - error = error, - ) + Log.w(BENCH_TAG, "[BENCHMARK] BenchmarkActivity → forwarded extras to BenchmarkForegroundService, finishing.") + finish() } - - // ── Helpers ────────────────────────────────────────────────────────── - - private fun collectDeviceInfo(): JSONObject = JSONObject().apply { - put("manufacturer", Build.MANUFACTURER) - put("model", Build.MODEL) - put("device", Build.DEVICE) - put("hardware", Build.HARDWARE) - put("board", Build.BOARD) - put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown") - put("android_version", Build.VERSION.RELEASE) - put("sdk_int", Build.VERSION.SDK_INT) - put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown") - } - - private fun collectMemoryInfo(): JSONObject { - val rt = Runtime.getRuntime() - return JSONObject().apply { - put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024) - put("free_mb", rt.freeMemory() / 1024 / 1024) - put("total_mb", rt.totalMemory() / 1024 / 1024) - put("max_mb", rt.maxMemory() / 1024 / 1024) - } - } - - private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0 } diff --git a/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt new file mode 100644 index 0000000..e1ee93c --- /dev/null +++ b/app/android/app/src/main/kotlin/com/example/app/BenchmarkForegroundService.kt @@ -0,0 +1,564 @@ +package com.example.app + +import android.app.Notification +import android.app.NotificationChannel +import android.app.NotificationManager +import android.app.Service +import android.content.Context +import android.content.Intent +import android.content.pm.ServiceInfo +import android.os.Build +import android.os.IBinder +import android.os.PowerManager +import android.util.Log +import 
androidx.core.app.NotificationCompat +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.cancel +import kotlinx.coroutines.delay +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import org.json.JSONArray +import org.json.JSONObject +import java.io.File +import java.text.SimpleDateFormat +import java.util.Date +import java.util.Locale +import java.util.concurrent.Executors + +/** + * Foreground service that runs the on-device latency benchmark. + * + * The service holds a PARTIAL_WAKE_LOCK and posts a sticky notification so + * the OS keeps the process alive — unlike a plain Activity, which the + * vendor power manager (e.g. OPPO's OplusProxyWakeLock) will idle as soon + * as the screen sleeps. This lets multi-hour k-sweeps run while the + * device is locked or the screen is off. + * + * Launched via [BenchmarkActivity] which forwards Intent extras from `am + * start`. All benchmark logic lives here; the Activity is a thin shim. + * + * **Process model.** Both this service and [BenchmarkActivity] declare + * `android:process=":benchmark"` in the manifest, so they run in a + * separate process from the main MAM-AI app. That process is fresh on + * each `am start`: this service constructs its own [RagPipeline] + * (Gecko + SQLite + LLM load) on entry, independent of any pipeline + * already loaded in the main app process. Two consequences worth + * knowing about: + * + * 1. The application's `Application` subclass initializes once per + * process — anything in your custom Application.onCreate() will + * run a second time when the benchmark process spawns. + * 2. If the main app is also running with the LLM loaded, two LLM + * instances may briefly contend for GPU/memory during init. 
+ * + * Intent extras (forwarded from the Activity): + * repeats:Int Repetitions per query (default 3) + * cooldown_ms:Long Sleep between runs in ms (default 5000) + * skip_retrieval:Boolean Run No-RAG mode only + * rag_only:Boolean Run RAG mode only + * (skip_retrieval and rag_only are mutually + * exclusive; skip_retrieval wins if both set) + * query_filter:String? Category or specific query ID filter + * retrieve_k:Int Override retrieval top_k for this session. + * Pass -1 (or omit) to use the value from + * runtime_config.json. Any value >= 0 takes + * effect for every query in this run. + */ +class BenchmarkForegroundService : Service() { + + companion object { + private const val TAG = "mam-ai" + private const val BENCH_TAG = "mam-ai-bench" + private const val NOTIFICATION_ID = 1002 + const val CHANNEL_ID = "mam_ai_benchmark" + private const val DEFAULT_COOLDOWN_MS = 5_000L + private const val DEFAULT_REPEATS = 3 + private const val CHARS_PER_TOKEN_ESTIMATE = 4.0 + } + + // Dispatchers.Default so the long-running coroutine isn't tied to the UI + // thread. The service has no UI anyway, but Default also ensures the work + // continues regardless of any activity lifecycle event. + private val scope = CoroutineScope(SupervisorJob() + Dispatchers.Default) + private val executor = Executors.newSingleThreadExecutor() + private var wakeLock: PowerManager.WakeLock? = null + // Set once when the first onStartCommand fires runBenchmark. Subsequent + // intent re-deliveries (e.g. another `am start` before stopSelf() runs) + // see this true and are no-ops, so we never end up with two concurrent + // coroutines sharing the executor and the same output JSON. + @Volatile private var benchmarkStarted = false + + override fun onBind(intent: Intent?): IBinder? 
= null + + override fun onCreate() { + super.onCreate() + ensureChannel(this) + } + + override fun onStartCommand(intent: Intent?, flags: Int, startId: Int): Int { + // Promote to foreground FIRST so the wake lock is always paired with + // a visible notification (Android 12+ enforces this pairing for new + // foreground-service starts). Acquiring the wake lock in onCreate + // before startForeground would briefly hold the CPU awake without a + // notification — and would leak if onStartCommand never ran (e.g. + // bind-only path or framework deferral). + startForegroundCompat("MAM-AI benchmark starting…", -1, 0) + + // PARTIAL_WAKE_LOCK lets the CPU keep running through screen-off. + // Vendor power managers (OPPO ColorOS, Xiaomi MIUI, etc.) respect + // wake locks held by foreground services — they aggressively + // release locks held by background activities. + if (wakeLock == null) { + val powerManager = getSystemService(Context.POWER_SERVICE) as PowerManager + wakeLock = powerManager.newWakeLock( + PowerManager.PARTIAL_WAKE_LOCK, + "mam-ai:benchmark" + ).apply { + setReferenceCounted(false) + // 24 h failsafe. Long CPU sweeps (full series × repeats × all k) + // have already run ~7 h end-to-end; pushing to 24 h leaves + // plenty of slack so the lock can't silently expire mid-run. + // If we ever start running sweeps longer than this, switch + // to a periodic re-acquire instead of bumping further. + acquire(24L * 60L * 60L * 1000L) + } + Log.w(BENCH_TAG, "[BENCHMARK] Foreground started, PARTIAL_WAKE_LOCK acquired") + } + + // Reject re-deliveries before the benchmark coroutine completes. A + // second am start while the first is in flight would otherwise spawn + // a parallel coroutine and clobber the shared RagPipeline / output + // JSON. 
+ if (benchmarkStarted) { + Log.w(BENCH_TAG, "[BENCHMARK] WARNING: ignoring re-delivery; benchmark is already running.") + return START_NOT_STICKY + } + benchmarkStarted = true + + val repeats = intent?.getIntExtra("repeats", DEFAULT_REPEATS) ?: DEFAULT_REPEATS + val cooldownMs = intent?.getLongExtra("cooldown_ms", DEFAULT_COOLDOWN_MS) ?: DEFAULT_COOLDOWN_MS + val skipRetrieval = intent?.getBooleanExtra("skip_retrieval", false) ?: false + val ragOnly = intent?.getBooleanExtra("rag_only", false) ?: false + val queryFilter = intent?.getStringExtra("query_filter") + val retrieveKOverride: Int? = intent?.getIntExtra("retrieve_k", -1)?.takeIf { it >= 0 } + + scope.launch { + try { + runBenchmark(repeats, cooldownMs, skipRetrieval, ragOnly, queryFilter, retrieveKOverride) + } catch (t: Throwable) { + Log.e(TAG, "[BENCHMARK] FATAL ERROR: ${t.message}", t) + Log.w(BENCH_TAG, "[BENCHMARK] FAILED") + } finally { + stopSelf() + } + } + // START_NOT_STICKY: don't auto-restart on kill — the benchmark is a + // one-shot job; restarting halfway through would corrupt the run. + return START_NOT_STICKY + } + + override fun onDestroy() { + super.onDestroy() + wakeLock?.let { + if (it.isHeld) { + it.release() + Log.w(BENCH_TAG, "[BENCHMARK] Released PARTIAL_WAKE_LOCK") + } + } + wakeLock = null + scope.cancel() + // Shut down the single-thread executor that ferries pipeline calls off + // the coroutine dispatchers. We use shutdownNow() to interrupt the + // worker thread: scope.cancel() does not propagate cancellation into + // a blocking native call (e.g. mid-flight LiteRT-LM generation), + // and a plain shutdown() would return immediately and leave the + // thread running until the call finishes naturally — keeping the + // :benchmark process alive after stopForeground. + executor.shutdownNow() + // Brief best-effort await so we don't yank the rug if the worker is + // tearing down cleanly. 
If it doesn't finish in 2 s we move on; the + // OS will eventually kill the process anyway. + try { + executor.awaitTermination(2, java.util.concurrent.TimeUnit.SECONDS) + } catch (_: InterruptedException) { + Thread.currentThread().interrupt() + } + // Use the non-deprecated overload on API 24+ (where it was introduced). + // The boolean variant has been deprecated since Android 13. + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.N) { + stopForeground(STOP_FOREGROUND_REMOVE) + } else { + @Suppress("DEPRECATION") + stopForeground(true) + } + } + + // ── Notification plumbing ──────────────────────────────────────────── + + private fun startForegroundCompat(message: String, progress: Int, max: Int) { + val notification = buildNotification(this, message, progress, max) + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.UPSIDE_DOWN_CAKE) { + startForeground( + NOTIFICATION_ID, + notification, + ServiceInfo.FOREGROUND_SERVICE_TYPE_DATA_SYNC, + ) + } else { + startForeground(NOTIFICATION_ID, notification) + } + } + + private fun updateNotification(message: String, progress: Int, max: Int) { + val nm = getSystemService(NotificationManager::class.java) ?: return + nm.notify(NOTIFICATION_ID, buildNotification(this, message, progress, max)) + } + + // ── Main benchmark loop ────────────────────────────────────────────── + + private suspend fun runBenchmark( + repeats: Int, + cooldownMs: Long, + skipRetrieval: Boolean, + ragOnly: Boolean, + queryFilter: String?, + retrieveKOverride: Int?, + ) { + val benchmarkStart = System.currentTimeMillis() + val timestamp = SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.US).format(Date()) + + Log.w(BENCH_TAG, "[BENCHMARK] START repeats=$repeats cooldown=${cooldownMs}ms filter=$queryFilter retrieve_k=${retrieveKOverride ?: "default"} rag_only=$ragOnly") + + val deviceInfo = collectDeviceInfo() + Log.w(BENCH_TAG, "[BENCHMARK] device=${deviceInfo.getString("model")} (${deviceInfo.optString("soc", "?")})") + + 
updateNotification("Initializing pipeline…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Initializing pipeline (Gecko + SQLite)...") + val initStart = System.currentTimeMillis() + val pipeline = withContext(executor.asCoroutineDispatcher()) { + RagPipeline(application) + } + val syncInitMs = System.currentTimeMillis() - initStart + Log.w(BENCH_TAG, "[BENCHMARK] Gecko + SQLite init: ${syncInitMs}ms") + + updateNotification("Loading Gemma 4 LLM…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Waiting for LLM model load...") + val llmWaitStart = System.currentTimeMillis() + withContext(executor.asCoroutineDispatcher()) { pipeline.awaitLlmReady() } + val llmInitMs = System.currentTimeMillis() - llmWaitStart + Log.w(BENCH_TAG, "[BENCHMARK] LLM model loaded: ${llmInitMs}ms (total init: ${System.currentTimeMillis() - initStart}ms)") + + val warmupQueries = listOf( + "Normal fetal heart rate", + "Signs of infection after delivery", + "A mother has heavy bleeding after birth. What should I do first?", + "A newborn is not breathing after delivery and has a heart rate below 100. What are the first steps to take?", + "A pregnant woman at 34 weeks has a severe headache, blurred vision, and blood pressure of 160 over 110. The nearest hospital is 45 minutes away. 
What should I do immediately while waiting for transport?", + ) + updateNotification("Warmup queries (${warmupQueries.size})…", -1, 0) + Log.w(BENCH_TAG, "[BENCHMARK] Running ${warmupQueries.size} warmup queries...") + val warmupStart = System.currentTimeMillis() + warmupQueries.forEachIndexed { i, prompt -> + Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1}/${warmupQueries.size}: \"${prompt.take(40)}...\"") + withContext(executor.asCoroutineDispatcher()) { + pipeline.generateResponse( + prompt = prompt, + history = emptyList(), + useRetrieval = false, + retrievalListener = {}, + generationListener = { _, _ -> } + ) + } + Log.w(BENCH_TAG, "[BENCHMARK] Warmup ${i + 1} done (${System.currentTimeMillis() - warmupStart}ms elapsed)") + } + val warmupMs = System.currentTimeMillis() - warmupStart + val totalInitMs = System.currentTimeMillis() - initStart + Log.w(BENCH_TAG, "[BENCHMARK] Init complete: sync=${syncInitMs}ms llm=${llmInitMs}ms warmup=${warmupMs}ms total=${totalInitMs}ms") + + val postInitMemory = collectMemoryInfo() + delay(cooldownMs) + + val queries = if (queryFilter != null) { + BenchmarkQueries.ALL.filter { it.category == queryFilter || it.id == queryFilter } + } else { + BenchmarkQueries.ALL + } + if (queries.isEmpty()) { + Log.e(BENCH_TAG, "[BENCHMARK] No queries matched filter '$queryFilter'") + Log.w(BENCH_TAG, "[BENCHMARK] FAILED") + return + } + + // skipRetrieval and ragOnly are mutually exclusive. The Python wrapper + // (benchmark_latency.py) rejects this combination upfront via + // parser.error(); a direct `am start` could still pass both, so log a + // visible warning in logcat instead of silently picking one. 
+ if (skipRetrieval && ragOnly) { + Log.w(BENCH_TAG, "[BENCHMARK] WARNING: skip_retrieval AND rag_only both set; skip_retrieval wins (No-RAG only).") + } + val retrievalModes = when { + skipRetrieval -> listOf(false) + ragOnly -> listOf(true) + else -> listOf(true, false) + } + val totalRuns = queries.size * retrievalModes.size * repeats + Log.w(BENCH_TAG, "[BENCHMARK] Running ${queries.size} queries x ${retrievalModes.size} modes x $repeats repeats = $totalRuns total runs") + + val results = mutableListOf() + var runIndex = 0 + val loopStart = System.currentTimeMillis() + + for (query in queries) { + for (useRetrieval in retrievalModes) { + for (rep in 1..repeats) { + runIndex++ + updateNotification("[$runIndex/$totalRuns] ${query.id} rep=$rep", runIndex, totalRuns) + + Log.w(BENCH_TAG, "[BENCHMARK] [$runIndex/$totalRuns] query=${query.id} retrieval=$useRetrieval rep=$rep/$repeats") + + val preMemory = collectMemoryInfo() + val result = runQuery(pipeline, query.text, useRetrieval, retrieveKOverride) + val postMemory = collectMemoryInfo() + + val decodeTps = if (result.decodeMs > 0) + round2(result.estimatedTokens / (result.decodeMs / 1000.0)) + else 0.0 + + val entry = JSONObject().apply { + put("query_id", query.id) + put("category", query.category) + put("query_text", query.text) + put("query_word_count", query.wordCount) + put("use_retrieval", useRetrieval) + put("repetition", rep) + put("retrieval_time_ms", result.retrievalTimeMs) + put("ttft_ms", result.ttftMs) + put("prefill_ms", result.prefillMs) + put("decode_ms", result.decodeMs) + put("total_generation_ms", result.generationTotalMs) + put("total_query_ms", result.totalQueryMs) + put("response_length_chars", result.responseChars) + put("estimated_tokens", result.estimatedTokens) + put("decode_throughput_tps", decodeTps) + put("num_retrieved_docs", result.numRetrievedDocs) + put("retrieved_chunks", JSONArray().apply { + result.retrievedChunks.forEach { doc -> + put(JSONObject().apply { + put("text", 
doc.text) + put("source", doc.source) + put("page", doc.page) + put("chars", doc.text.length) + }) + } + }) + put("retrieved_total_chars", result.retrievedTotalChars) + put("response_text", result.responseText) + put("error", result.error ?: JSONObject.NULL) + put("heap_before_mb", preMemory.getInt("used_mb")) + put("heap_after_mb", postMemory.getInt("used_mb")) + } + results.add(entry) + + Log.w(BENCH_TAG, "[BENCHMARK] result: ttft=${result.ttftMs}ms decode=${result.decodeMs}ms total=${result.totalQueryMs}ms chars=${result.responseChars} tps=$decodeTps") + + if (runIndex < totalRuns) { + delay(cooldownMs) + } + } + } + } + + val output = JSONObject().apply { + put("benchmark_version", 1) + put("timestamp", timestamp) + put("device", deviceInfo) + put("config", JSONObject().apply { + put("repeats", repeats) + put("cooldown_ms", cooldownMs) + put("skip_retrieval", skipRetrieval) + put("rag_only", ragOnly) + put("query_filter", queryFilter ?: JSONObject.NULL) + put("retrieval_top_k_override", retrieveKOverride ?: JSONObject.NULL) + put("model", "gemma-4-E4B-it.litertlm") + // Read backend from BuildConfig at compile time. Older builds + // hard-coded "CPU" here even when GPU was active — fixed so the + // JSON metadata matches reality. 
+ put("backend", if (BuildConfig.USE_GPU_FOR_LLM) "GPU" else "CPU") + put("mtp_enabled", BuildConfig.USE_MTP_FOR_LLM) + put("max_tokens", 32000) + put("temperature", 1.0) + put("top_p", 0.95) + put("top_k", 64) + }) + put("init", JSONObject().apply { + put("gecko_sqlite_ms", syncInitMs) + put("llm_load_ms", llmInitMs) + put("warmup_query_ms", warmupMs) + put("total_init_ms", totalInitMs) + }) + put("memory", postInitMemory) + put("results", JSONArray(results)) + put("total_benchmark_time_ms", System.currentTimeMillis() - benchmarkStart) + } + + val outFile = File(getExternalFilesDir(null), "benchmark_results.json") + outFile.writeText(output.toString(2)) + Log.w(BENCH_TAG, "[BENCHMARK] Results written to ${outFile.absolutePath}") + Log.w(BENCH_TAG, "[BENCHMARK] COMPLETE") + } + + // ── Single-query execution ─────────────────────────────────────────── + + private data class QueryResult( + val retrievalTimeMs: Long, + val ttftMs: Long, + val prefillMs: Long, + val decodeMs: Long, + val generationTotalMs: Long, + val totalQueryMs: Long, + val responseChars: Int, + val estimatedTokens: Int, + val numRetrievedDocs: Int, + val retrievedChunks: List, + val retrievedTotalChars: Int, + val responseText: String, + val error: String?, + ) + + private suspend fun runQuery( + pipeline: RagPipeline, + queryText: String, + useRetrieval: Boolean, + retrieveKOverride: Int?, + ): QueryResult { + var retrievalTimeMs = 0L + var numDocs = 0 + var firstTokenTime = 0L + var error: String? 
= null + val responseBuilder = StringBuilder() + var retrievedChunks: List = emptyList() + + val qStart = System.currentTimeMillis() + var retrievalDoneTime = 0L + + try { + withContext(executor.asCoroutineDispatcher()) { + pipeline.generateResponse( + prompt = queryText, + history = emptyList(), + useRetrieval = useRetrieval, + retrievalListener = { docs -> + retrievalDoneTime = System.currentTimeMillis() + retrievalTimeMs = retrievalDoneTime - qStart + numDocs = docs.size + retrievedChunks = docs + }, + generationListener = { partial, _ -> + responseBuilder.append(partial) + if (firstTokenTime == 0L && partial.isNotEmpty()) { + firstTokenTime = System.currentTimeMillis() + } + }, + retrieveKOverride = retrieveKOverride, + ) + } + } catch (e: Exception) { + error = e.message + Log.e(BENCH_TAG, "[BENCHMARK] Query failed: ${e.message}", e) + } + + val qEnd = System.currentTimeMillis() + val totalQueryMs = qEnd - qStart + val responseChars = responseBuilder.length + + // TTFT excludes retrieval; we measure from end-of-retrieval to first token. 
+ val genStart = if (retrievalDoneTime > 0) retrievalDoneTime else qStart + val ttftMs = if (firstTokenTime > 0) firstTokenTime - genStart else 0 + val decodeMs = if (firstTokenTime > 0) qEnd - firstTokenTime else 0 + val generationTotalMs = qEnd - genStart + val estimatedTokens = (responseChars / CHARS_PER_TOKEN_ESTIMATE).toInt() + + return QueryResult( + retrievalTimeMs = retrievalTimeMs, + ttftMs = ttftMs, + prefillMs = ttftMs, + decodeMs = decodeMs, + generationTotalMs = generationTotalMs, + totalQueryMs = totalQueryMs, + responseChars = responseChars, + estimatedTokens = estimatedTokens, + numRetrievedDocs = numDocs, + retrievedChunks = retrievedChunks, + retrievedTotalChars = retrievedChunks.sumOf { it.text.length }, + responseText = responseBuilder.toString(), + error = error, + ) + } + + // ── Helpers ────────────────────────────────────────────────────────── + + private fun collectDeviceInfo(): JSONObject = JSONObject().apply { + put("manufacturer", Build.MANUFACTURER) + put("model", Build.MODEL) + put("device", Build.DEVICE) + put("hardware", Build.HARDWARE) + put("board", Build.BOARD) + put("soc", if (Build.VERSION.SDK_INT >= 31) Build.SOC_MODEL else "unknown") + put("android_version", Build.VERSION.RELEASE) + put("sdk_int", Build.VERSION.SDK_INT) + put("abi", Build.SUPPORTED_ABIS.firstOrNull() ?: "unknown") + } + + private fun collectMemoryInfo(): JSONObject { + val rt = Runtime.getRuntime() + return JSONObject().apply { + put("used_mb", (rt.totalMemory() - rt.freeMemory()) / 1024 / 1024) + put("free_mb", rt.freeMemory() / 1024 / 1024) + put("total_mb", rt.totalMemory() / 1024 / 1024) + put("max_mb", rt.maxMemory() / 1024 / 1024) + } + } + + private fun round2(v: Double): Double = Math.round(v * 100.0) / 100.0 + + private fun ensureChannel(context: Context) { + if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) { + val nm = context.getSystemService(NotificationManager::class.java) + if (nm?.getNotificationChannel(CHANNEL_ID) == null) { + val channel = 
NotificationChannel( + CHANNEL_ID, + "MAM-AI Benchmark", + NotificationManager.IMPORTANCE_LOW, + ).apply { + description = "Foreground notification while the on-device latency benchmark runs" + setShowBadge(false) + } + nm?.createNotificationChannel(channel) + } + } + } + + private fun buildNotification( + context: Context, + message: String, + progress: Int, + max: Int, + ): Notification { + val builder = NotificationCompat.Builder(context, CHANNEL_ID) + .setContentTitle("MAM-AI Benchmark") + .setContentText(message) + .setSmallIcon(android.R.drawable.stat_sys_download) + .setOngoing(true) + .setOnlyAlertOnce(true) + .setPriority(NotificationCompat.PRIORITY_LOW) + + if (max > 0 && progress >= 0) { + builder.setProgress(max, progress, false) + } else { + builder.setProgress(0, 0, true) + } + return builder.build() + } +} diff --git a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt index 19f582e..e13e391 100644 --- a/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt +++ b/app/android/app/src/main/kotlin/com/example/app/RagPipeline.kt @@ -219,7 +219,12 @@ class RagPipeline(application: Application) { } } - /** Generates the response from the LLM with conversation history support. */ + /** Generates the response from the LLM with conversation history support. + * + * [retrieveKOverride] — when non-null, replaces `retrievalConfig.top_k` + * for this call only. Used by [BenchmarkActivity] for the per-k latency + * sweep; production callers leave it null and inherit the runtime config. + */ suspend fun generateResponse( prompt: String, history: List>, @@ -227,6 +232,7 @@ class RagPipeline(application: Application) { language: String = "en", retrievalListener: (docs: List) -> Unit, generationListener: (partial: String, done: Boolean) -> Unit, + retrieveKOverride: Int? 
= null, ): String = coroutineScope { awaitLlmReady() @@ -235,10 +241,11 @@ class RagPipeline(application: Application) { val qStart = System.currentTimeMillis() val docs = if (useRetrieval) { + val effectiveTopK = retrieveKOverride ?: retrievalConfig.getInt("top_k") val retrievalRequest = RetrievalRequest.create( prompt, RetrievalConfig.create( - retrievalConfig.getInt("top_k"), + effectiveTopK, retrievalConfig.getDouble("similarity_threshold").toFloat(), TaskType.RETRIEVAL_QUERY, ), diff --git a/evaluation/aggregate_k_sweep.py b/evaluation/aggregate_k_sweep.py new file mode 100644 index 0000000..d11e390 --- /dev/null +++ b/evaluation/aggregate_k_sweep.py @@ -0,0 +1,417 @@ +#!/usr/bin/env python3 +"""Aggregate per-k latency-sweep JSONs into a single GPU↔CPU comparison report. + +Reads all benchmark_*.json files produced by benchmark_latency.py, groups them +by (backend, k_override), and writes a markdown report at +evaluation/reports/latency_report_v2.md. + +Notes on backend identification: post-fix benchmark JSONs (commit ef96538 +onward) record `backend` correctly and are trusted as-is. Pre-fix GPU sweep +JSONs hard-code `backend="CPU"` even though they were measured on GPU; we +backfill those using an explicit filename allowlist (see `backend_of`). +Future runs of any backend are unaffected. +""" +from __future__ import annotations + +import datetime +import glob +import json +import os +import statistics +import sys +from collections import defaultdict +from pathlib import Path + +# Backfill for the specific historical GPU sweep files that predate the +# metadata-recording fix in commit ef96538. Those JSONs hard-code +# config.backend="CPU" even though they were measured on GPU. We use an +# explicit filename allowlist (rather than a timestamp threshold) so the +# rewrite cannot accidentally fire on anyone else's pre-threshold *genuine +# CPU* JSONs that happen to share latency_results/. 
+PRE_FIX_GPU_FILES = frozenset({ + "benchmark_20260514T174502_k1.json", + "benchmark_20260514T180830_k3.json", + "benchmark_20260514T183604_k5.json", + "benchmark_20260514T190438_k7.json", + "benchmark_20260514T193453_k10.json", + "benchmark_20260514T200414_k15.json", + "benchmark_20260514T203653_k20.json", + "benchmark_20260514T210522.json", +}) + + +def backend_of(filename: str, recorded: str) -> str: + """Trust the recorded backend except for the listed pre-fix GPU files.""" + if filename in PRE_FIX_GPU_FILES: + return "GPU" + return recorded + + +def load_runs() -> list[dict]: + files = sorted(glob.glob(os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "latency_results", "benchmark_*.json", + ))) + runs = [] + for f in files: + try: + with open(f) as fp: + d = json.load(fp) + except (json.JSONDecodeError, OSError): + continue + if "config" not in d or "results" not in d: + print(f"SKIP: {os.path.basename(f)} — missing config or results key", file=sys.stderr) + continue + if len(d["results"]) < 30: + # Skip ad-hoc smoke tests (the canonical sweep is 54 runs). Log so + # that a legitimate narrow sweep (--filter long_01, single-category) + # isn't silently dropped from the report. + print( + f"SKIP: {os.path.basename(f)} — {len(d['results'])} results " + "(< 30 threshold for canonical sweeps; pass it through if it " + "should appear in the matrix)", + file=sys.stderr, + ) + continue + ts = os.path.basename(f).replace("benchmark_", "").split(".")[0].split("_")[0] + k_override = d["config"].get("retrieval_top_k_override") + skip_retrieval = d["config"].get("skip_retrieval", False) + k_label = 0 if skip_retrieval else (k_override if k_override is not None else None) + if k_label is None: + continue + # The metadata fix in commit ef96538 ensures post-fix runs record + # config.backend. 
If it's missing, the JSON predates that fix — only + # safe if the filename is on the allowlist; otherwise warn loudly + # rather than silently defaulting (which would mask future GPU runs + # written by a regressed BenchmarkForegroundService). + recorded_backend = d["config"].get("backend") + if recorded_backend is None: + if os.path.basename(f) not in PRE_FIX_GPU_FILES: + print( + f"WARN: {os.path.basename(f)} has no config.backend " + "field and is not on the pre-fix allowlist; defaulting " + "to CPU. If this was actually a GPU run, fix the source.", + file=sys.stderr, + ) + recorded_backend = "CPU" + backend = backend_of(os.path.basename(f), recorded_backend) + runs.append({ + "file": os.path.basename(f), + "timestamp": ts, + "backend": backend, + "k": k_label, + "data": d, + }) + return runs + + +def _p95(values: list[float]) -> int | None: + """95th percentile via linear-interpolation 20-quantile partition. + + `statistics.quantiles(data, n=20)` returns 19 cut points dividing the + data into 20 equal-frequency groups; index 18 is the 95th percentile. + For very small samples (n < 2), there are no cut points to compute, + so we fall back to max — same behaviour as the previous + `int(len(s)*0.95)` formula but without the off-by-one that made p95 + collapse to max for any n < 20. 
+ """ + if not values: + return None + if len(values) < 2: + return int(values[0]) + return int(statistics.quantiles(values, n=20, method="exclusive")[18]) + + +def aggregate_per_category(d: dict, key: str) -> dict[str, dict]: + """Per-category {median, p95, n} for the given timing field.""" + cat_vals: dict[str, list] = defaultdict(list) + for r in d["results"]: + if r.get("error"): + continue + cat_vals[r["category"]].append(r[key]) + out = {} + for c, vs in cat_vals.items(): + if not vs: + continue + out[c] = { + "n": len(vs), + "median": int(statistics.median(vs)), + "p95": _p95(vs), + } + return out + + +def aggregate_overall(d: dict, key: str) -> dict: + vs = [r[key] for r in d["results"] if not r.get("error")] + if not vs: + return {} + return { + "n": len(vs), + "median": int(statistics.median(vs)), + "p95": _p95(vs), + } + + +def median_doc_chars(d: dict) -> int: + """Median retrieved_total_chars across successful runs (the table column + is labeled 'doc_chars med', so this is the median by definition).""" + vs = [r.get("retrieved_total_chars", 0) for r in d["results"] if not r.get("error")] + return int(statistics.median(vs)) if vs else 0 + + +def fmt_ms(v: int | None) -> str: + return f"{v}" if v is not None else "—" + + +def fmt_s(v: int | None) -> str: + return f"{v / 1000:.1f}" if v is not None else "—" + + +def write_report(runs: list[dict], out_path: Path) -> None: + # Build {(backend, k) -> latest canonical run} + matrix: dict[tuple[str, int], dict] = {} + for r in runs: + key = (r["backend"], r["k"]) + if key in matrix: + # Keep the run with most successful entries (resolves duplicates) + ex = matrix[key] + ex_ok = sum(1 for x in ex["data"]["results"] if not x.get("error")) + r_ok = sum(1 for x in r["data"]["results"] if not x.get("error")) + if r_ok > ex_ok: + matrix[key] = r + else: + matrix[key] = r + + gpu_ks = sorted([k for (b, k) in matrix if b == "GPU"]) + cpu_ks = sorted([k for (b, k) in matrix if b == "CPU"]) + all_ks = sorted(set(gpu_ks 
+ cpu_ks)) + + # Sample run for device info + sample = next(iter(matrix.values())) + dev = sample["data"]["device"] + + md = [] + md.append("# MAM-AI On-Device Latency Sweep — GPU vs CPU\n") + md.append(f"_Generated: {datetime.datetime.now().isoformat(timespec='seconds')}_\n") + md.append("") + md.append("## Device & stack\n") + md.append(f"- **Device**: {dev.get('manufacturer', '?')} {dev.get('model', '?')} ({dev.get('soc', '?')}) — Android {dev.get('android_version', '?')}") + md.append(f"- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`)") + md.append(f"- **LiteRT-LM**: 0.11.0") + md.append(f"- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU") + md.append(f"- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000") + md.append("") + # Pull the actual values from the sample run's config instead of hard-coding + # text that can lie. If different runs used different settings, this won't + # catch that — but we'd rather report the sample's truth than fabricate a + # round-number claim. + sample_cfg = sample["data"].get("config", {}) + sample_repeats = sample_cfg.get("repeats", "?") + sample_cooldown_s = (sample_cfg.get("cooldown_ms") or 0) / 1000.0 + sample_n_results = len(sample["data"]["results"]) + # Infer queries × modes from total runs / repeats. Default to "?" if the + # math doesn't divide evenly. + queries_x_modes: object = "?" + if isinstance(sample_repeats, int) and sample_repeats > 0 and sample_n_results % sample_repeats == 0: + queries_x_modes = sample_n_results // sample_repeats + md.append("## Methodology\n") + md.append( + f"Per backend × k configuration: {queries_x_modes} (query × mode) cells " + f"× {sample_repeats} repeats = {sample_n_results} timed runs. Plus a " + f"No-RAG baseline per backend (k=0 via `--no-retrieval`). " + f"{sample_cooldown_s:g}-second cooldown between runs for thermal " + "stability. 
Activity → ForegroundService with PARTIAL_WAKE_LOCK so " + "the run survives screen-off and device-lock; OPPO Hans whitelist set " + "manually." + ) + md.append("") + md.append("- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token.") + md.append("- `decode` is first-token to last-token.") + md.append("- `total_query` is everything: `retrieval + TTFT + decode`.") + md.append(f"- Reported as median across the {sample_n_results} runs unless noted (p95 in tables marked `p95`).") + md.append("") + + # ─────────── Headline table: total_query_ms by (backend, k) ─────────── + md.append("## Headline — Median total query latency (seconds)\n") + md.append(f"| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + # doc chars: take from GPU if available, else CPU + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gpu_cells = "—" + cpu_cells = "—" + if gpu_run: + g = aggregate_per_category(gpu_run["data"], "total_query_ms") + gpu_cells = " / ".join(fmt_s(g.get(c, {}).get("median")) for c in ["short", "medium", "long"]) + if cpu_run: + c_ = aggregate_per_category(cpu_run["data"], "total_query_ms") + cpu_cells = " / ".join(fmt_s(c_.get(c, {}).get("median")) for c in ["short", "medium", "long"]) + # ratio + ratio = "" + if gpu_run and cpu_run: + gov = aggregate_overall(gpu_run["data"], "total_query_ms").get("median") + cov = aggregate_overall(cpu_run["data"], "total_query_ms").get("median") + if gov is not None and cov is not None and gov > 0: + ratio = f"{cov / gov:.2f}×" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {doc_chars} | {gpu_cells} | {cpu_cells} | {ratio} |") + md.append("") + + # ─────────── TTFT detail ─────────── + md.append("## TTFT (ms, median) — prefill cost grows with retrieved-doc 
content\n") + md.append(f"| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + doc_chars = median_doc_chars(gpu_run["data"] if gpu_run else cpu_run["data"]) if (gpu_run or cpu_run) else 0 + gv = aggregate_overall(gpu_run["data"], "ttft_ms").get("median") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "ttft_ms").get("median") if cpu_run else None + # Explicit None checks; also guard against div-by-zero on a 0 median. + ratio = f"{cv / gv:.1f}×" if (gv is not None and cv is not None and gv > 0) else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {doc_chars} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + md.append("") + + # ─────────── Decode detail ─────────── + md.append("## Decode (ms, median) — first token to last token\n") + md.append("Decode time mostly tracks output length, not k or doc content. Variation across k reflects ") + md.append("the model writing *longer answers* when given more context (more material to draw on).") + md.append("") + md.append(f"| k | GPU decode | CPU decode | CPU÷GPU |") + md.append(f"|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + gv = aggregate_overall(gpu_run["data"], "decode_ms").get("median") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "decode_ms").get("median") if cpu_run else None + ratio = f"{cv / gv:.2f}×" if (gv is not None and cv is not None and gv > 0) else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_ms(gv)} | {fmt_ms(cv)} | {ratio} |") + md.append("") + + # ─────────── p95 totals ─────────── + md.append("## p95 total query latency (s) — tail-latency view\n") + md.append(f"| k | GPU p95 | CPU p95 |") + md.append(f"|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) 
+ gv = aggregate_overall(gpu_run["data"], "total_query_ms").get("p95") if gpu_run else None + cv = aggregate_overall(cpu_run["data"], "total_query_ms").get("p95") if cpu_run else None + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_s(gv)} | {fmt_s(cv)} |") + md.append("") + + # ─────────── Errors / context limit ─────────── + md.append("## Errors and the 4096-token context wall\n") + md.append(f"| k | GPU errors / 54 | CPU errors / 54 |") + md.append(f"|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + ge = sum(1 for r in gpu_run["data"]["results"] if r.get("error")) if gpu_run else None + ce = sum(1 for r in cpu_run["data"]["results"] if r.get("error")) if cpu_run else None + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {fmt_ms(ge)} | {fmt_ms(ce)} |") + md.append("") + md.append("At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. ") + md.append("Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both ") + md.append("backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — ") + md.append("the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of ") + md.append("the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. 
") + md.append("The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter.") + md.append("") + md.append("Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any ") + md.append("deployment budget at this depth even when the request fits in the context window.") + md.append("") + + # ─────────── Wall-clock comparison ─────────── + md.append("## Wall-clock comparison\n") + md.append("| k | GPU wall (min) | CPU wall (min) | CPU÷GPU |") + md.append("|---:|---:|---:|---:|") + for k in all_ks: + gpu_run = matrix.get(("GPU", k)) + cpu_run = matrix.get(("CPU", k)) + gw = gpu_run["data"]["total_benchmark_time_ms"] / 60000 if gpu_run else None + cw = cpu_run["data"]["total_benchmark_time_ms"] / 60000 if cpu_run else None + gw_s = f"{gw:.1f}" if gw is not None else "—" + cw_s = f"{cw:.1f}" if cw is not None else "—" + ratio = f"{cw / gw:.2f}×" if (gw is not None and cw is not None and gw > 0) else "" + label = "**0 (no-RAG)**" if k == 0 else str(k) + md.append(f"| {label} | {gw_s} | {cw_s} | {ratio} |") + + # Findings / interpretation + md.append("") + md.append("## Key findings\n") + md.append("") + md.append("### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite") + md.append("GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. ") + md.append("That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), ") + md.append("so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency.") + md.append("") + md.append("### 2. The model's 4096-token context window is the binding ceiling at high k") + md.append("k=15 works cleanly (54/54 on both GPU and CPU). 
k=20 fails identically on **both backends** — ") + md.append("the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. ") + md.append("Same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives ") + md.append("the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, ") + md.append("not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. ") + md.append("Latency is *not* the constraint at the upper end; the model's context window is.") + md.append("") + md.append("### 3. Latency is not the binding factor on GPU below k=15") + md.append("GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. ") + md.append("Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), ") + md.append("not by what fits in the latency budget.") + md.append("") + md.append("### 4. CPU at k≥5 hits any reasonable UX budget; at k=15 it's prohibitively slow") + md.append("CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. ") + md.append("p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. If GPU isn't ") + md.append("available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60s budget, ") + md.append("or **k ≤ 1** if you want sub-40s p95.") + md.append("") + md.append("### 5. Decode time is content-driven, not k-driven") + md.append("Decode time tracks output length. As k grows, the model writes *longer* responses — likely because ") + md.append("more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. 
") + md.append("Decode-time difference between GPU and CPU is only ~1.1–1.4× across all k, since decode is memory-bandwidth-bound, ") + md.append("not compute-bound on this hardware.") + md.append("") + md.append("### 6. TTFT scales linearly with retrieved-doc content past k=3") + md.append("On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, ") + md.append("CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting ") + md.append("the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both.") + md.append("") + + # File inventory + md.append("## Data inventory (per `(backend, k)`)\n") + md.append("| Backend | k | File | Wall (min) | Runs | Errors |") + md.append("|---|---:|---|---:|---:|---:|") + for (b, k) in sorted(matrix.keys(), key=lambda x: (x[0], x[1])): + r = matrix[(b, k)] + wall = r["data"]["total_benchmark_time_ms"] / 60000 + n = len(r["data"]["results"]) + e = sum(1 for x in r["data"]["results"] if x.get("error")) + label = "0 (no-RAG)" if k == 0 else str(k) + md.append(f"| {b} | {label} | `{r['file']}` | {wall:.1f} | {n} | {e} |") + md.append("") + md.append("---") + md.append("") + md.append("_Source benchmark JSONs live in `evaluation/latency_results/`. 
") + md.append("Aggregation script: `evaluation/aggregate_k_sweep.py`._") + + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text("\n".join(md) + "\n") + print(f"Report written to: {out_path}") + + +def main() -> int: + runs = load_runs() + print(f"Loaded {len(runs)} canonical runs") + out = Path(__file__).resolve().parent / "reports" / "latency_report_v2.md" + write_report(runs, out) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/evaluation/benchmark_latency.py b/evaluation/benchmark_latency.py index 5611e21..7486f96 100644 --- a/evaluation/benchmark_latency.py +++ b/evaluation/benchmark_latency.py @@ -12,6 +12,7 @@ python evaluation/benchmark_latency.py --filter long_01 # Single specific query python evaluation/benchmark_latency.py --no-retrieval # Skip RAG retrieval python evaluation/benchmark_latency.py --cooldown 10000 # Longer cooldown (thermal) + python evaluation/benchmark_latency.py --retrieve-k 5 # Override retrieval top_k for this session """ import argparse @@ -68,9 +69,15 @@ def check_device(device_serial=None): def check_models_downloaded(device_serial=None): - """Check if model files exist on device.""" + """Check if model files exist on device. + + Filenames must match config/app_config.json — the app loads + "llm_model" / "embedding_model" / "tokenizer" from there. Updated + for the Gemma 4 E4B / LiteRT-LM 0.11.0 stack; the old Gemma 3n + .task name is no longer in production. 
+ """ required_files = [ - "gemma-3n-E4B-it-int4.task", + "gemma-4-E4B-it.litertlm", "Gecko_1024_quant.tflite", "sentencepiece.model", "embeddings.sqlite", @@ -103,7 +110,8 @@ def clear_logcat(device_serial=None): def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, - skip_retrieval=False, query_filter=None): + skip_retrieval=False, rag_only=False, + query_filter=None, retrieve_k=None): """Launch BenchmarkActivity via ADB.""" cmd = _adb(device_serial) + [ "shell", "am", "start", @@ -113,8 +121,12 @@ def launch_benchmark(device_serial=None, repeats=3, cooldown_ms=5000, ] if skip_retrieval: cmd += ["--ez", "skip_retrieval", "true"] + if rag_only: + cmd += ["--ez", "rag_only", "true"] if query_filter: cmd += ["--es", "query_filter", query_filter] + if retrieve_k is not None: + cmd += ["--ei", "retrieve_k", str(retrieve_k)] result = subprocess.run(cmd, capture_output=True, text=True) if "Error" in result.stderr: @@ -458,8 +470,16 @@ def main(): help="Cooldown between queries in ms (default: 5000)") parser.add_argument("--no-retrieval", action="store_true", help="Skip RAG retrieval (generation only)") + parser.add_argument("--rag-only", action="store_true", + help="Skip the No-RAG mode (only run with retrieval). " + "Pair with --retrieve-k to do a k-sweep without " + "re-running the No-RAG baseline at every k.") parser.add_argument("--filter", type=str, default=None, help="Filter by category (short/medium/long) or query ID (e.g., long_01)") + parser.add_argument("--retrieve-k", type=int, default=None, + help="Override retrieval top_k for this session " + "(default: use runtime_config.json's value, currently 3). 
" + "Used for the per-k latency sweep.") parser.add_argument("--output-dir", type=str, default="evaluation/latency_results", help="Directory for output files") parser.add_argument("--device", type=str, default=None, @@ -468,6 +488,14 @@ def main(): help="Timeout in seconds (default: 7200)") args = parser.parse_args() + if args.no_retrieval and args.rag_only: + parser.error("--no-retrieval and --rag-only are mutually exclusive") + if args.retrieve_k is not None and args.retrieve_k < 1: + # The service treats any value >= 0 as an explicit override. Passing 0 + # would call RetrievalConfig.create(0, …), which is a silent footgun + # — use --no-retrieval if you actually want to disable retrieval. + parser.error("--retrieve-k must be >= 1; use --no-retrieval to disable retrieval entirely") + print("=" * 60) print("MAM-AI On-Device Latency Benchmark") print("=" * 60) @@ -494,13 +522,16 @@ def main(): clear_logcat(args.device) # Launch benchmark - print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}") + k_msg = f", retrieve_k={args.retrieve_k}" if args.retrieve_k is not None else "" + print(f"Launching: {args.repeats} repeats, {args.cooldown}ms cooldown, filter={args.filter}{k_msg}") launch_benchmark( device_serial=args.device, repeats=args.repeats, cooldown_ms=args.cooldown, skip_retrieval=args.no_retrieval, + rag_only=args.rag_only, query_filter=args.filter, + retrieve_k=args.retrieve_k, ) # Wait for completion @@ -509,8 +540,9 @@ def main(): print("Benchmark did not complete successfully.") sys.exit(1) - # Pull results - json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}.json") + # Pull results — include k in the filename so a sweep across k values is legible. 
+ k_suffix = f"_k{args.retrieve_k}" if args.retrieve_k is not None else "" + json_path = os.path.join(args.output_dir, f"benchmark_{timestamp}{k_suffix}.json") pull_results(args.device, json_path) # Load and analyze diff --git a/evaluation/reports/latency_report_v2.md b/evaluation/reports/latency_report_v2.md new file mode 100644 index 0000000..c6745a6 --- /dev/null +++ b/evaluation/reports/latency_report_v2.md @@ -0,0 +1,176 @@ +# MAM-AI On-Device Latency Sweep — GPU vs CPU + +_Generated: 2026-05-15T10:51:06_ + + +## Device & stack + +- **Device**: OnePlus OPD2413 (SM8750P) — Android 15 +- **Model**: Gemma 4 E4B (`gemma-4-E4B-it.litertlm`) +- **LiteRT-LM**: 0.11.0 +- **Backends tested**: GPU (OpenCL, via `useGpuForLlm=true`) and CPU +- **Sampling**: temp=1.0, top_p=0.95, top_k=64, max_tokens=32000 + +## Methodology + +Per backend × k configuration: 18 (query × mode) cells × 3 repeats = 54 timed runs. Plus a No-RAG baseline per backend (k=0 via `--no-retrieval`). 10-second cooldown between runs for thermal stability. Activity → ForegroundService with PARTIAL_WAKE_LOCK so the run survives screen-off and device-lock; OPPO Hans whitelist set manually. + +- `TTFT` excludes retrieval — measured from end-of-retrieval to first generated token. +- `decode` is first-token to last-token. +- `total_query` is everything: `retrieval + TTFT + decode`. +- Reported as median across the 54 runs unless noted (p95 in tables marked `p95`). 
+ +## Headline — Median total query latency (seconds) + +| k | doc_chars med | GPU short / med / long | CPU short / med / long | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 12.9 / 15.6 / 16.1 | 27.2 / 26.9 / 29.8 | 1.94× | +| 1 | 561 | 13.1 / 12.6 / 17.3 | 29.3 / 31.9 / 30.3 | 2.14× | +| 3 | 2098 | 18.6 / 18.6 / 21.0 | 37.3 / 44.5 / 42.5 | 2.24× | +| 5 | 3547 | 18.2 / 20.0 / 21.4 | 54.8 / 60.7 / 63.0 | 3.07× | +| 7 | 5139 | 21.3 / 23.2 / 22.8 | 61.4 / 62.3 / 60.4 | 2.72× | +| 10 | 7482 | 22.5 / 20.5 / 20.4 | 61.8 / 70.6 / 77.9 | 3.10× | +| 15 | 11297 | 25.3 / 24.0 / 22.4 | 84.8 / 80.8 / 89.7 | 3.48× | +| 20 | 14520 | 23.9 / 20.5 / 18.5 | 88.7 / 95.6 / 95.6 | 4.46× | + +## TTFT (ms, median) — prefill cost grows with retrieved-doc content + +| k | doc_chars med | GPU TTFT | CPU TTFT | CPU÷GPU | +|---:|---:|---:|---:|---:| +| **0 (no-RAG)** | 0 | 962 | 12633 | 13.1× | +| 1 | 561 | 954 | 12649 | 13.3× | +| 3 | 2098 | 989 | 18356 | 18.6× | +| 5 | 3547 | 1884 | 36424 | 19.3× | +| 7 | 5139 | 1920 | 36444 | 19.0× | +| 10 | 7482 | 2523 | 40013 | 15.9× | +| 15 | 11297 | 3457 | 54748 | 15.8× | +| 20 | 14520 | 3986 | 72881 | 18.3× | + +## Decode (ms, median) — first token to last token + +Decode time mostly tracks output length, not k or doc content. Variation across k reflects +the model writing *longer answers* when given more context (more material to draw on). 
+ +| k | GPU decode | CPU decode | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 13470 | 15345 | 1.14× | +| 1 | 11415 | 13961 | 1.22× | +| 3 | 16364 | 19110 | 1.17× | +| 5 | 15929 | 21645 | 1.36× | +| 7 | 17215 | 23473 | 1.36× | +| 10 | 18118 | 21699 | 1.20× | +| 15 | 16820 | 22497 | 1.34× | +| 20 | 14688 | 22634 | 1.54× | + +## p95 total query latency (s) — tail-latency view + +| k | GPU p95 | CPU p95 | +|---:|---:|---:| +| **0 (no-RAG)** | 26.1 | 38.4 | +| 1 | 26.1 | 37.1 | +| 3 | 30.3 | 64.3 | +| 5 | 30.7 | 74.6 | +| 7 | 35.1 | 81.8 | +| 10 | 29.0 | 84.5 | +| 15 | 30.6 | 112.7 | +| 20 | 35.3 | 104.9 | + +## Errors and the 4096-token context wall + +| k | GPU errors / 54 | CPU errors / 54 | +|---:|---:|---:| +| **0 (no-RAG)** | 0 | 0 | +| 1 | 0 | 0 | +| 3 | 0 | 0 | +| 5 | 0 | 0 | +| 7 | 0 | 0 | +| 10 | 0 | 0 | +| 15 | 0 | 0 | +| 20 | 24 | 24 | + +At k=20, **24 of 54 runs failed on both GPU and CPU** with `Input token ids are too long. +Exceeding the maximum number of tokens allowed: …>= 4096`. The **exact same 8 queries failed on both +backends** (`long_01, long_03, medium_02, medium_04, short_01, short_03, short_04, short_05`) — +the same 24 (query × rep) pairs. This is direct evidence that the 4096-token cap is a property of +the Gemma 4 E4B `.litertlm` artifact itself, not a runtime configuration, not a backend choice. +The other 10 queries (10 × 3 reps = 30 successful runs) were the ones whose retrieved chunks happened to be shorter. + +Successful-run timing at CPU k=20: TTFT 65–73 s, total 89–96 s — confirming CPU is well past any +deployment budget at this depth even when the request fits in the context window. 
+ +## Wall-clock comparison + +| k | GPU wall (min) | CPU wall (min) | CPU÷GPU | +|---:|---:|---:|---:| +| **0 (no-RAG)** | 23.5 | 36.9 | 1.57× | +| 1 | 23.0 | 38.7 | 1.68× | +| 3 | 27.3 | 50.2 | 1.84× | +| 5 | 28.2 | 63.0 | 2.23× | +| 7 | 30.0 | 66.5 | 2.22× | +| 10 | 29.1 | 73.2 | 2.51× | +| 15 | 32.4 | 90.8 | 2.80× | +| 20 | 22.8 | 58.6 | 2.57× | + +## Key findings + + +### 1. GPU is the practical choice for this workload on Snapdragon 8 Elite +GPU TTFT runs around **1–3.5 s** across k=0–15. CPU TTFT runs around **12.6 s (no-RAG) → 55 s (k=15)**. +That's a 13–19× TTFT speedup from GPU. Decode time is largely backend-invariant (memory-bandwidth-bound), +so the *total* speedup is closer to 2–3.5× — but those seconds of TTFT translate directly to perceived UX latency. + +### 2. The model's 4096-token context window is the binding ceiling at high k +k=15 works cleanly (54/54 on both GPU and CPU). k=20 fails identically on **both backends** — +the **exact same 24 of 54 runs (8 queries × 3 reps)** error with `Input token ids are too long … >= 4096`. +The same queries fail on both because the chunks retrieved are deterministic and chunk length × k drives +the prompt past the window. The 4096-token cap is a property of the `.litertlm` model artifact, +not a runtime config and not a backend choice. **k_max ≈ 17–18** for this artifact. +Latency is *not* the constraint at the upper end; the model's context window is. + +### 3. Latency is not the binding factor on GPU up to k=15 +GPU total medians stay between 13 s (no-RAG) and 25 s (k=15) — all well under any reasonable UX budget. +Picking k* should be driven by **answer quality** (do more chunks help or hurt the small generator?), +not by what fits in the latency budget. + +### 4. CPU at k≥5 exceeds any reasonable UX budget; at k=15 it's prohibitively slow +CPU totals: k=3 → 37–44 s, k=5 → 55–63 s, k=7 → 60–62 s, k=10 → 62–78 s, k=15 → 81–90 s. +p95 at CPU k=15 hits **113 s** — almost two minutes for the slowest 5% of queries. 
If GPU isn't +available (lower-tier devices), the practical CPU operating point is **k ≤ 3** for a sub-60 s median budget (p95 at k=3 is already ~64 s), +or **k ≤ 1** if you want sub-40s p95. + +### 5. Decode time is content-driven, not k-driven +Decode time tracks output length. As k grows, the model writes *longer* responses — likely because +more context = more material to weave in. This is a quality-coupled latency effect, not a prefill effect. +Decode-time difference between GPU and CPU is only ~1.1–1.5× across all k, since decode is memory-bandwidth-bound, +not compute-bound on this hardware. + +### 6. TTFT scales linearly with retrieved-doc content past k=3 +On both backends, TTFT per added doc-char is roughly constant past k=3: GPU ~100–250 µs/char, +CPU ~3,500–5,000 µs/char. The GPU↔CPU ratio is stable at ~13–19× across the prefill range, suggesting +the GPU primarily speeds up the *compute-heavy* prefill phase while decode stays bandwidth-bound on both. + +## Data inventory (per `(backend, k)`) + +| Backend | k | File | Wall (min) | Runs | Errors | +|---|---:|---|---:|---:|---:| +| CPU | 0 (no-RAG) | `benchmark_20260515T022647.json` | 36.9 | 54 | 0 | +| CPU | 1 | `benchmark_20260514T213337_k1.json` | 38.7 | 54 | 0 | +| CPU | 3 | `benchmark_20260514T221238_k3.json` | 50.2 | 54 | 0 | +| CPU | 5 | `benchmark_20260514T230309_k5.json` | 63.0 | 54 | 0 | +| CPU | 7 | `benchmark_20260515T000622_k7.json` | 66.5 | 54 | 0 | +| CPU | 10 | `benchmark_20260515T011307_k10.json` | 73.2 | 54 | 0 | +| CPU | 15 | `benchmark_20260515T030401_k15.json` | 90.8 | 54 | 0 | +| CPU | 20 | `benchmark_20260515T064042_k20.json` | 58.6 | 54 | 24 | +| GPU | 0 (no-RAG) | `benchmark_20260514T210522.json` | 23.5 | 54 | 0 | +| GPU | 1 | `benchmark_20260514T174502_k1.json` | 23.0 | 54 | 0 | +| GPU | 3 | `benchmark_20260514T180830_k3.json` | 27.3 | 54 | 0 | +| GPU | 5 | `benchmark_20260514T183604_k5.json` | 28.2 | 54 | 0 | +| GPU | 7 | `benchmark_20260514T190438_k7.json` | 30.0 | 54 | 0 | +| GPU | 10 | 
`benchmark_20260514T193453_k10.json` | 29.1 | 54 | 0 | +| GPU | 15 | `benchmark_20260514T200414_k15.json` | 32.4 | 54 | 0 | +| GPU | 20 | `benchmark_20260514T203653_k20.json` | 22.8 | 54 | 24 | + +--- + +_Source benchmark JSONs live in `evaluation/latency_results/`. +Aggregation script: `evaluation/aggregate_k_sweep.py`._