whyisitworking
diff --git a/app/src/main/java/com/suhel/llamabro/demo/ui/screens/chat/ChatViewModel.kt b/app/src/main/java/com/suhel/llamabro/demo/ui/screens/chat/ChatViewModel.kt
Lines changed: 58 additions & 41 deletions
diff --git a/sdk/src/main/cpp/jni/llama_session_jni.cpp b/sdk/src/main/cpp/jni/llama_session_jni.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/sdk/src/main/cpp/session.cpp b/sdk/src/main/cpp/session.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/sdk/src/main/cpp/session.h b/sdk/src/main/cpp/session.h
Lines changed: 1 addition & 1 deletion
diff --git a/sdk/src/main/java/com/suhel/llamabro/sdk/LlamaChatSession.kt b/sdk/src/main/java/com/suhel/llamabro/sdk/LlamaChatSession.kt
Lines changed: 2 additions & 2 deletions
diff --git a/sdk/src/main/java/com/suhel/llamabro/sdk/LlamaSession.kt b/sdk/src/main/java/com/suhel/llamabro/sdk/LlamaSession.kt
Lines changed: 3 additions & 3 deletions
diff --git a/sdk/src/main/java/com/suhel/llamabro/sdk/internal/LlamaChatSessionImpl.kt b/sdk/src/main/java/com/suhel/llamabro/sdk/internal/LlamaChatSessionImpl.kt
Lines changed: 53 additions & 62 deletions
diff --git a/sdk/src/main/java/com/suhel/llamabro/sdk/internal/LlamaSessionImpl.kt b/sdk/src/main/java/com/suhel/llamabro/sdk/internal/LlamaSessionImpl.kt
Lines changed: 3 additions & 3 deletions
@@ -24,10 +24,10 @@ import kotlinx.coroutines.ExperimentalCoroutinesApi
 import kotlinx.coroutines.FlowPreview
 import kotlinx.coroutines.flow.MutableSharedFlow
 import kotlinx.coroutines.flow.SharingStarted
+import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.flow.distinctUntilChanged
 import kotlinx.coroutines.flow.emitAll
 import kotlinx.coroutines.flow.filterNotNull
-import kotlinx.coroutines.flow.first
 import kotlinx.coroutines.flow.flatMapLatest
 import kotlinx.coroutines.flow.flow
 import kotlinx.coroutines.flow.flowOf
@@ -78,7 +78,10 @@ class ChatViewModel @Inject constructor(
             val history = chatRepository.getMessages(args.conversationId)
                 .map { chatMessage ->
                     when (chatMessage.role) {
-                        MessageRole.User -> Message.User(chatMessage.content)
+                        MessageRole.User -> Message.User(
+                            content = chatMessage.content
+                        )
+
                         MessageRole.Assistant -> Message.Assistant(
                             content = chatMessage.content,
                             thinking = chatMessage.thinking
@@ -116,50 +119,64 @@ class ChatViewModel @Inject constructor(
     val incomingMessage = sendMessageTrigger
         .distinctUntilChanged()
         .flatMapLatest { message ->
-            if (message != null) {
-                flow {
-                    emit(
-                        UiChatMessage(
-                            id = "streaming",
-                            role = MessageRole.Assistant,
-                            isProcessing = true,
-                        )
-                    )
+            if (message == null) {
+                return@flatMapLatest flowOf(null)
+            }
 
-                    chatRepository.addMessage(
-                        conversationId = args.conversationId,
-                        role = MessageRole.User,
-                        content = message
+            flow<UiChatMessage?> {
+                emit(
+                    UiChatMessage(
+                        id = "streaming",
+                        role = MessageRole.Assistant,
+                        isProcessing = true,
                     )
+                )
 
-                    val session = chatSessionFlow.filterNotNull().first()
-
-                    emitAll(
-                        session.completion(message)
-                            .map { chunk ->
-                                if (chunk.isComplete && chunk.contentText != null) {
-                                    chatRepository.addMessage(
-                                        conversationId = args.conversationId,
-                                        role = MessageRole.Assistant,
-                                        content = chunk.contentText!!,
-                                        thinking = chunk.thinkingText,
-                                        tokensPerSecond = chunk.tokensPerSecond
-                                    )
+                chatRepository.addMessage(
+                    conversationId = args.conversationId,
+                    role = MessageRole.User,
+                    content = message
+                )
 
-                                    null
-                                } else {
-                                    UiChatMessage(
-                                        id = "streaming",
-                                        role = MessageRole.Assistant,
-                                        content = chunk.contentText,
-                                        thinking = chunk.thinkingText
-                                    )
-                                }
+                emitAll(
+                    chatSessionFlow
+                        .filterNotNull()
+                        .flatMapLatest { chatSession ->
+                            chatSession.completion(message)
+                        }
+                        .onEach { chunk ->
+                            if (chunk.isComplete && chunk.contentText != null) {
+                                chatRepository.addMessage(
+                                    conversationId = args.conversationId,
+                                    role = MessageRole.Assistant,
+                                    content = chunk.contentText!!,
+                                    thinking = chunk.thinkingText,
+                                    tokensPerSecond = chunk.tokensPerSecond
+                                )
                             }
-                    )
-                }
-            } else {
-                flowOf(null as UiChatMessage?)
+                        }
+                        .map { chunk ->
+                            if (chunk.isComplete) {
+                                null
+                            } else {
+                                UiChatMessage(
+                                    id = "streaming",
+                                    role = MessageRole.Assistant,
+                                    content = chunk.contentText,
+                                    thinking = chunk.thinkingText
+                                )
+                            }
+                        }
+                        .catch { e ->
+                            emit(
+                                UiChatMessage(
+                                    id = "streaming",
+                                    role = MessageRole.Assistant,
+                                    error = e.message
+                                )
+                            )
+                        }
+                )
             }
         }
         .stateIn(viewModelScope, SharingStarted.Eagerly, null)
 
@@ -91,7 +91,7 @@ Java_com_suhel_llamabro_sdk_internal_LlamaSessionImpl_00024Jni_setSystemPrompt(J
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_suhel_llamabro_sdk_internal_LlamaSessionImpl_00024Jni_injectPrompt(JNIEnv *env, jclass,
+Java_com_suhel_llamabro_sdk_internal_LlamaSessionImpl_00024Jni_ingestPrompt(JNIEnv *env, jclass,
                                                                             jlong jSessionPtr,
                                                                             jstring jText,
                                                                             jboolean jAddSpecial) {
@@ -101,7 +101,7 @@ Java_com_suhel_llamabro_sdk_internal_LlamaSessionImpl_00024Jni_injectPrompt(JNIE
     env->ReleaseStringUTFChars(jText, text);
 
     try {
-        session->injectPrompt(textStr, jAddSpecial);
+        session->ingestPrompt(textStr, jAddSpecial);
     } catch (const LlamaException &ex) {
         throwLlamaError(env, ex);
     }
 
@@ -217,8 +217,8 @@ void LlamaSession::setSystemPrompt(const std::string &prompt, bool add_special)
     ingest_prompt(prompt, true, add_special);
 }
 
-void LlamaSession::injectPrompt(const std::string &user_message, bool add_special) {
-    ingest_prompt(user_message, false, add_special);
+void LlamaSession::ingestPrompt(const std::string &prompt, bool add_special) {
+    ingest_prompt(prompt, false, add_special);
 }
 
 Generation LlamaSession::generate() {
 
@@ -84,7 +84,7 @@ class LlamaSession {
 
     void setSystemPrompt(const std::string &prompt, bool add_special);
 
-    void injectPrompt(const std::string &prompt, bool add_special);
+    void ingestPrompt(const std::string &prompt, bool add_special);
 
     Generation generate();
 
 
@@ -35,10 +35,10 @@ interface LlamaChatSession {
      * If the collector's coroutine is cancelled, the underlying native generation
      * is automatically aborted.
      *
-     * @param message The user's input text.
+     * @param prompt The user's input text.
      * @return A flow of [Completion] updates.
      */
-    fun completion(message: String): Flow<Completion>
+    fun completion(prompt: String): Flow<Completion>
 
     /**
      * Clears the current conversation history while retaining the system prompt.
 
@@ -46,11 +46,11 @@ interface LlamaSession : AutoCloseable {
      * It is cancellable; if the coroutine is cancelled, the native pre-fill
      * loop will be interrupted.
      *
-     * @param text       Raw text to add to the context.
+     * @param prompt     Raw text to add to the context.
      * @param addSpecial If true, prepends the model's default BOS token.
      * @throws LlamaError.ContextOverflow if the context is full and cannot be recovered.
      */
-    suspend fun prompt(text: String, addSpecial: Boolean = false)
+    suspend fun ingestPrompt(prompt: String, addSpecial: Boolean = false)
 
     /**
      * Samples the next token from the model based on the current context.
@@ -73,7 +73,7 @@ interface LlamaSession : AutoCloseable {
     /**
      * Asynchronously signals the native engine to stop any active computation.
      *
-     * Use this to immediately halt a long-running [prompt] or [generate] call
+     * Use this to immediately halt a long-running [ingestPrompt] or [generate] call
      * from another thread or UI action.
      */
     fun abort()
 
@@ -14,66 +14,78 @@ import kotlinx.coroutines.flow.onCompletion
 import kotlinx.coroutines.isActive
 import kotlinx.coroutines.withContext
 
-/**
- * High-level implementation of [LlamaChatSession].
- *
- * This class coordinates between the raw [LlamaSession], the [PromptFormatter],
- * and the [TokenStreamParser] to provide a conversational experience.
- * It manages the token generation loop and transforms raw tokens into
- * structured [Completion] snapshots.
- *
- * ### Turn lifecycle
- * The C++ layer decodes EOG tokens into the KV cache, so every assistant
- * turn is automatically closed at the native level. This class does not
- * need to track or inject turn-closing tokens.
- */
 internal class LlamaChatSessionImpl(
     private val session: LlamaSession,
     private val systemPrompt: String
 ) : LlamaChatSession {
-
+    private val parser = TokenStreamParser()
     private val promptFormatter = PromptFormatter(session.modelConfig.promptFormat)
 
-    override fun completion(message: String): Flow<Completion> = flow {
-        val parser = TokenStreamParser(session.modelConfig.promptFormat.assistantSuffix)
+    override fun completion(prompt: String): Flow<Completion> = flow {
         var completionState = Completion()
         var tokenCount = 0
+        val contentBuilder = StringBuilder()
+        val thinkingBuilder = StringBuilder()
+
+        parser.reset()
+        session.ingestPrompt(promptFormatter.user(prompt) + promptFormatter.assistantStart())
 
-        // Inject user turn + assistant turn prefix
-        session.prompt(promptFormatter.user(message) + promptFormatter.assistantStart())
         val startTime = System.nanoTime()
 
         while (currentCoroutineContext().isActive) {
             val generation = try {
                 session.generate()
             } catch (_: LlamaError.Cancelled) {
-                emit(completionState.finalize(tokenCount, startTime, true))
+                emit(
+                    completionState.finalize(
+                        tokenCount = tokenCount,
+                        startTime = startTime,
+                        isInterrupted = true,
+                        contentBuilder = contentBuilder,
+                        thinkingBuilder = thinkingBuilder
+                    )
+                )
                 return@flow
             } catch (e: LlamaError) {
                 throw e
             }
 
-            if (generation.isComplete) {
-                completionState = completionState.applyActions(parser.flush())
-                emit(completionState.finalize(tokenCount, startTime))
-                break
-            }
-
-            generation.token?.let { generatedToken ->
+            generation.token?.let { token ->
                 tokenCount++
-                val actions = parser.process(generatedToken)
 
-                if (actions.isNotEmpty()) {
-                    completionState = completionState.applyActions(actions)
+                val contentLenBefore = contentBuilder.length
+                val thinkingLenBefore = thinkingBuilder.length
+                val stateBefore = parser.isThinking
+
+                // The parser appends directly to the builders rather than returning a list of actions.
+                parser.process(token, contentBuilder, thinkingBuilder)
+
+                // Only emit a new state if the parser actually appended text or flipped state
+                if (
+                    contentBuilder.length > contentLenBefore ||
+                    thinkingBuilder.length > thinkingLenBefore ||
+                    parser.isThinking != stateBefore
+                ) {
+                    completionState = completionState.copy(
+                        contentText = if (contentBuilder.isEmpty()) null else contentBuilder.toString(),
+                        thinkingText = if (thinkingBuilder.isEmpty()) null else thinkingBuilder.toString()
+                    )
                     emit(completionState)
                 }
+            }
 
-                // Stop if the parser intercepted a configured stop sequence
-                // (e.g. assistant suffix for custom formats where suffix ≠ EOG).
-                if (actions.any { it is StreamAction.Stop }) {
-                    emit(completionState.finalize(tokenCount, startTime))
-                    break
-                }
+            if (generation.isComplete) {
+                parser.flush(contentBuilder, thinkingBuilder)
+                emit(
+                    completionState.finalize(
+                        tokenCount = tokenCount,
+                        startTime = startTime,
+                        isInterrupted = false,
+                        contentBuilder = contentBuilder,
+                        thinkingBuilder = thinkingBuilder
+                    )
+                )
+                break
             }
         }
     }
@@ -84,42 +96,21 @@ internal class LlamaChatSessionImpl(
         }
         .flowOn(Dispatchers.IO)
 
-    /** Appends parser actions to the current completion snapshot. */
-    private fun Completion.applyActions(actions: List<StreamAction>): Completion {
-        var newContent = this.contentText
-        var newThinking = this.thinkingText
-
-        for (action in actions) {
-            when (action) {
-                is StreamAction.Content -> {
-                    newContent = (newContent ?: "") + action.text
-                }
-
-                is StreamAction.Thinking -> {
-                    newThinking = (newThinking ?: "") + action.text
-                }
-
-                is StreamAction.Stop -> {
-                }
-            }
-        }
-
-        return this.copy(contentText = newContent, thinkingText = newThinking)
-    }
-
     /** Finalizes completion state with performance metrics and trimming. */
     private fun Completion.finalize(
         tokenCount: Int,
         startTime: Long,
-        isInterrupted: Boolean = false
+        isInterrupted: Boolean,
+        contentBuilder: StringBuilder,
+        thinkingBuilder: StringBuilder
     ): Completion {
         val endTime = System.nanoTime()
         val durationNs = (endTime - startTime).coerceAtLeast(1)
         val tps = (tokenCount.toDouble() / durationNs * 1e9).toFloat()
 
         return this.copy(
-            thinkingText = if (this.thinkingText.isNullOrBlank()) null else this.thinkingText.trim(),
-            contentText = if (this.contentText.isNullOrBlank()) null else this.contentText.trim(),
+            thinkingText = thinkingBuilder.ifBlank { null }?.toString()?.trim(),
+            contentText = contentBuilder.ifBlank { null }?.toString()?.trim(),
             tokensPerSecond = tps,
             isComplete = true,
             isInterrupted = isInterrupted,
@@ -134,7 +125,7 @@ internal class LlamaChatSessionImpl(
     override suspend fun loadHistory(messages: List<Message>) =
         withContext(Dispatchers.IO) {
             messages.forEach { msg ->
-                session.prompt(promptFormatter.format(msg))
+                session.ingestPrompt(promptFormatter.format(msg))
             }
         }
 
 
@@ -75,12 +75,12 @@ internal class LlamaSessionImpl(
             }
         }
 
-    override suspend fun prompt(text: String, addSpecial: Boolean) =
+    override suspend fun ingestPrompt(prompt: String, addSpecial: Boolean) =
         withContext(Dispatchers.IO) {
             mutex.withLock {
                 try {
                     runInterruptible {
-                        Jni.injectPrompt(ptr, text, addSpecial)
+                        Jni.ingestPrompt(ptr, prompt, addSpecial)
                     }
                 } catch (e: RuntimeException) {
                     throw mapNativeError(e)
@@ -182,7 +182,7 @@ internal class LlamaSessionImpl(
         external fun setSystemPrompt(sessionPtr: Long, text: String, addSpecial: Boolean)
 
         @JvmStatic
-        external fun injectPrompt(sessionPtr: Long, text: String, addSpecial: Boolean)
+        external fun ingestPrompt(sessionPtr: Long, text: String, addSpecial: Boolean)
 
         @JvmStatic
         external fun clear(sessionPtr: Long)
Original file line number	Diff line number	Diff line change
`@@ -217,8 +217,8 @@ void LlamaSession::setSystemPrompt(const std::string &prompt, bool add_special)`
`217`	`217`	`ingest_prompt(prompt, true, add_special);`
`218`	`218`	`}`
`219`	`219`
`220`		`-void LlamaSession::injectPrompt(const std::string &user_message, bool add_special) {`
`221`		`- ingest_prompt(user_message, false, add_special);`
	`220`	`+void LlamaSession::ingestPrompt(const std::string &prompt, bool add_special) {`
	`221`	`+ ingest_prompt(prompt, false, add_special);`
`222`	`222`	`}`
`223`	`223`
`224`	`224`	`Generation LlamaSession::generate() {`