@@ -32,11 +32,60 @@ impl Tokenizer {
         Ok(())
     }
 
+    /// The complexity here: we need to crop the assistant messages so
+    /// that they end on a block boundary. We cannot pad them out,
+    /// because this is not what the model server will do when
+    /// generating the assistant message. But it will cache the prefix
+    /// of full blocks. Hence the need to crop. However, we cannot
+    /// just crop at the end, as this would also crop off any "end of
+    /// text" special tokens that the chat template adds to the end of
+    /// the `self.assistant(m)` token sequence. Therefore, we need to
+    /// crop just the message part. This logic tries to do all of that
+    /// in a way that is agnostic to the chat template. However, the
+    /// logic does currently assume that the chat template will never
+    /// add special tokens *in the middle* of the given message `m`;
+    /// it assumes special tokens are only ever added (if at all) to
+    /// the beginning or end.
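+    ///
+    /// For intuition, a worked example with purely illustrative
+    /// numbers (not taken from a real tokenizer): suppose
+    /// `block_size = 4`, the chat template adds 3 prefix tokens and
+    /// 2 suffix tokens, and the message is 9 tokens. The full
+    /// sequence is 14 tokens; the nearest block boundary at or below
+    /// 14 is 12, so 2 tokens are cropped from the *end of the
+    /// message*, not from the suffix, leaving 3 + 7 + 2 = 12 tokens.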
     fn assistanttok(&self, m: &str, tokens: &mut Vec<u32>) -> tokenizers::tokenizer::Result<()> {
-        self.extend_crop(
-            self.tok.encode_fast(self.assistant(m), false)?.get_ids(),
-            tokens,
-        );
+        let encoded_with_template = self.tok.encode_fast(self.assistant(m), false)?;
+        let encoded_message = self.tok.encode_fast(m, false)?;
+        let with_chat_template = encoded_with_template.get_ids();
+        let without_chat_template = encoded_message.get_ids();
+
+        // TODO this is imperfect: we anchor on the first occurrence of
+        // the message's first token, which could falsely match a token
+        // inside the chat template's prefix.
+        let start_of_message_idx = with_chat_template
+            .iter()
+            .position(|t| *t == without_chat_template[0]);
+        let end_of_message_idx = start_of_message_idx
+            .map(|start_of_message_idx| start_of_message_idx + without_chat_template.len());
+        // [pppppmmmmmmmmmss] <- ppppp are the prefix special tokens added by the chat template; ss the suffix special tokens
+        //       ^ start_of_message_idx
+        //                ^ end_of_message_idx
+
+        if with_chat_template.len() > self.block_size {
+            eprintln!(
+                "Warning (spnl): assistant message cannot be cropped due to length of chat template"
+            )
+        }
+
+        if without_chat_template.is_empty() {
+            self.extend(with_chat_template, tokens);
+        } else if let Some(start_of_message_idx) = start_of_message_idx
+            && let Some(end_of_message_idx) = end_of_message_idx
+        {
+            self.extend_crop(
+                with_chat_template,
+                start_of_message_idx,
+                end_of_message_idx,
+                tokens,
+            );
+        } else {
+            eprintln!(
+                "Warning (spnl): assistant message could not be cropped because the message was not found within the chat template"
+            );
+            self.extend(with_chat_template, tokens);
+        }
+
         Ok(())
     }
 
@@ -67,18 +116,39 @@ impl Tokenizer {
         tokens.extend(extra);
     }
 
-    /// Extend with tokens, cropping to a block boundary
-    fn extend_crop(&self, extra: &[u32], tokens: &mut Vec<u32>) {
+    /// Extend with tokens, cropping to a block boundary, but only the `mmm` part in the middle, as follows:
+    /// [pppppmmmmmmmmmss] <- msg_with_chat_template
+    ///       ^ start_of_message_idx
+    ///                ^ end_of_message_idx
+    fn extend_crop(
+        &self,
+        msg_with_chat_template: &[u32],
+        start_of_message_idx: usize,
+        end_of_message_idx: usize,
+        tokens: &mut Vec<u32>,
+    ) {
         // Round down to nearest block boundary. Note: for future
         // reference, if we need to round up to nearest block
         // boundary, replace `tokens.len()` with
         // `tokens.len()+self.block_size-1`.
-        let end = extra.len() + tokens.len();
+        let end = msg_with_chat_template.len() + tokens.len();
         let nearest_block_boundary = end / self.block_size * self.block_size;
         let amount_to_crop = end - nearest_block_boundary;
-        let extra_end = extra.len() - amount_to_crop;
-
-        self.extend(&extra[0..extra_end], tokens);
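+        // If more must be cropped than the message itself contains,
+        // drop the whole message and keep only the template's special
+        // tokens (the `assistanttok_fully_cropped` test below
+        // exercises exactly this case).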
+        let end_of_crop = if amount_to_crop > (end_of_message_idx - start_of_message_idx) {
+            start_of_message_idx
+        } else {
+            end_of_message_idx - amount_to_crop
+        };
+
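+        // Reassemble: template prefix, the (possibly cropped) message,
+        // then the template suffix, so that trailing special tokens
+        // such as end-of-text survive the crop.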
+        let m = msg_with_chat_template;
+        let cropped = [
+            &m[0..start_of_message_idx],
+            &m[start_of_message_idx..end_of_crop],
+            &m[end_of_message_idx..],
+        ]
+        .concat();
+
+        self.extend(&cropped, tokens);
     }
 
     /// Pad to block boundary, then push
@@ -460,3 +530,93 @@ pub fn tokenize_prepare(
         _ => todo!(),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use itertools::Itertools;
+
+    const PAD_TOKEN: u32 = 27;
+    const BLOCK_SIZE: usize = 16;
+
+    const MODEL: &str = "ibm-granite/granite-3.3-2b-instruct"; // TODO find smaller model with public tokenizers.json
+    const START_OF_ROLE: u32 = 49152;
+    const END_OF_ROLE: u32 = 49153;
+    const END_OF_TEXT: u32 = 0;
+    const USER: u32 = 496;
+    const ASSISTANT: u32 = 17594;
+    const HELLO: u32 = 7656;
+    const LONGER: u32 = 8928;
+
+    fn tok() -> Result<::std::sync::Arc<Tokenizer>, ::std::sync::Arc<tokenizers::tokenizer::Error>>
+    {
+        init(2).get_or_create(&MODEL.into(), PAD_TOKEN, None, None, BLOCK_SIZE)
+    }
+
+    #[test]
+    fn create_tokenizer() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
+        tok().map(|_| ())
+    }
+
+    #[test]
+    fn user() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
+        assert_eq!(
+            tok().map(|tok| tok.user("hello"))?,
+            "<|start_of_role|>user<|end_of_role|>hello<|end_of_text|>"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn usertok() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
+        let mut tokens = vec![];
+        tok()?.usertok("hello", &mut tokens)?;
+        assert_eq!(
+            tokens,
+            [START_OF_ROLE, USER, END_OF_ROLE, HELLO, END_OF_TEXT]
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn assistant() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
+        assert_eq!(
+            tok().map(|tok| tok.assistant("hello"))?,
+            "<|start_of_role|>assistant<|end_of_role|>hello<|end_of_text|>"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn assistanttok_fully_cropped() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>> {
+        let mut tokens = vec![];
+        tok()?.assistanttok("hello", &mut tokens)?;
+        assert_eq!(tokens, [START_OF_ROLE, ASSISTANT, END_OF_ROLE, END_OF_TEXT]);
+        Ok(())
+    }
+
+    #[test]
+    fn assistanttok_partially_cropped() -> Result<(), ::std::sync::Arc<tokenizers::tokenizer::Error>>
+    {
+        let repeat_input = 17; // repeat this many times for the input message
+        let repeat_output = 11; // expect this many repetitions after cropping
+        let mut tokens = vec![];
+        tok()?.assistanttok(
+            format!(
+                "hello {}",
+                ::std::iter::repeat_n("longer", repeat_input).join(" ")
+            )
+            .as_str(),
+            &mut tokens,
+        )?;
+        assert_eq!(
+            tokens,
+            [START_OF_ROLE, ASSISTANT, END_OF_ROLE, HELLO]
+                .into_iter()
+                .chain(::std::iter::repeat_n(LONGER, repeat_output))
+                .chain([END_OF_TEXT])
+                .collect::<Vec<u32>>(),
+        );
+        Ok(())
+    }
+}