From 7f20a80dd3afd648d00fe1c63cf04a7ac3b0edc9 Mon Sep 17 00:00:00 2001
From: sborisov88
Date: Sun, 22 Feb 2026 03:10:14 +0300
Subject: [PATCH] Fix prompt tokens causing empty transcription output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When promptTokens are provided in DecodingOptions, the prefill cache is
disabled (known limitation). This causes the decoding loop to start at
tokenIndex=0, where startOfPreviousToken is fed to the model. The model
then predicts EOT or produces a low-confidence prediction, triggering
early termination checks (sampleResult.completed or
firstTokenLogProbThreshold) and breaking the loop immediately — resulting
in empty transcription text.

Two fixes:
1. isFirstToken now points to the first actually decoded token after the
   prompt (max(prefilledIndex, initialPromptIndex)) instead of tokenIndex 0
   during prompt prefill.
2. sampleResult.completed (EOT) is ignored during the prefill phase, since
   the model is being force-fed prompt tokens and its predictions are not
   meaningful for early stopping.
Co-Authored-By: Claude Opus 4.6 --- Sources/WhisperKit/Core/TextDecoder.swift | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Sources/WhisperKit/Core/TextDecoder.swift b/Sources/WhisperKit/Core/TextDecoder.swift index 4d717b6e..9d7c9af8 100644 --- a/Sources/WhisperKit/Core/TextDecoder.swift +++ b/Sources/WhisperKit/Core/TextDecoder.swift @@ -764,7 +764,8 @@ open class TextDecoder: TextDecoding, WhisperMLModel { let isPrefill = tokenIndex < initialPromptIndex - 1 // Prefill stops at the last token of the initial prompt let isLastPrefillToken = tokenIndex == initialPromptIndex - 1 - let isFirstToken = tokenIndex == prefilledIndex + let isInPrefillPhase = isPrefill || isLastPrefillToken // tokenIndex < initialPromptIndex + let isFirstToken = tokenIndex == max(prefilledIndex, initialPromptIndex) // First actually decoded token (after prompt) // Check if current index is part of the initial prompt if tokenIndex < initialPromptIndex { @@ -854,8 +855,11 @@ open class TextDecoder: TextDecoding, WhisperMLModel { } else { false } + // During prefill phase (processing prompt tokens), skip early termination checks: + // - The model is being force-fed prompt tokens, so EOT predictions and low log probs are expected + // - Early stopping should only apply to actually decoded tokens after the prompt let isSegmentCompleted = - sampleResult.completed || + (!isInPrefillPhase && sampleResult.completed) || currentTokens.count >= Constants.maxTokenContext - 1 || isFirstTokenLogProbTooLow