From d917086c3fe132af16a0abf9359520467b6b1933 Mon Sep 17 00:00:00 2001
From: 2561056571 <112464849+2561056571@users.noreply.github.com>
Date: Mon, 9 Mar 2026 18:23:11 +0800
Subject: [PATCH] fix(chat): prevent prompt overflow when referencing
 knowledge documents

The notebook chat page throws a "prompt is too long" error when
referencing knowledge documents, for two reasons:

1. Token estimation uses the naive `len // 4` heuristic, which
   dramatically underestimates tokens for CJK content (Chinese, Japanese,
   and Korean characters cost roughly 1.5 tokens each, not 0.25). Large
   documents therefore pass the injection threshold when they shouldn't.

2. The compression system's AttachmentTruncationStrategy doesn't
   recognize `<reference_documents>` XML tags, so injected document
   content is invisible to compression strategies and cannot be
   truncated when the prompt exceeds context limits.

Changes:

- Add content-aware token estimation that handles CJK text properly
- Reduce MAX_INJECTION_RATIO from 0.5 to 0.3 to reserve room for the
  system prompt, chat history, and dynamic context
- Add `<reference_documents>` to the compression strategy's document
  markers so the compressor can detect and truncate notebook document
  content
- Update the regex pattern and header extraction to handle both
  bracket-style and XML-style document formats
---
 .../chat/preprocessing/selected_documents.py | 68 +++++++++++++++++--
 .../chat_shell/compression/strategies.py     | 27 ++++++--
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/backend/app/services/chat/preprocessing/selected_documents.py b/backend/app/services/chat/preprocessing/selected_documents.py
index f45157965..fdd835a99 100644
--- a/backend/app/services/chat/preprocessing/selected_documents.py
+++ b/backend/app/services/chat/preprocessing/selected_documents.py
@@ -10,12 +10,17 @@
 2. Estimates token count to determine if direct injection is feasible
 3. Either injects content directly into the message or falls back to RAG retrieval
 
-The injection strategy follows the same threshold as KnowledgeBaseTool:
+The injection strategy uses conservative token estimation:
 - If estimated tokens <= 30% of context window, inject directly
 - Otherwise, create KnowledgeBaseTool with document_ids filter for RAG retrieval
+
+Token estimation uses a conservative multiplier to handle multilingual content
+(e.g., Chinese characters, where 1 char ≈ 1-2 tokens) and avoid exceeding the
+model's context window limit.
 """
 
 import logging
+import re
 from typing import Any, Dict, List, Optional, Tuple
 
 from langchain_core.tools import BaseTool
@@ -29,8 +34,17 @@
 # Default context window size (same as InjectionStrategy.DEFAULT_CONTEXT_WINDOW)
 DEFAULT_CONTEXT_WINDOW = 128000
 
-# Maximum ratio of context window that can be used for document injection
-MAX_INJECTION_RATIO = 0.5
+# Maximum ratio of the context window that can be used for document injection.
+# Set to 0.3 (30%) to leave sufficient room for the system prompt, chat history,
+# KB metadata, and dynamic context that are added separately.
+MAX_INJECTION_RATIO = 0.3
+
+# Regex pattern to detect CJK (Chinese, Japanese, Korean) characters
+_CJK_PATTERN = re.compile(
+    r"[\u4e00-\u9fff\u3400-\u4dbf\u2e80-\u2eff\u3000-\u303f"
+    r"\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff\uac00-\ud7af"
+    r"\uf900-\ufaff\ufe30-\ufe4f]"
+)
 
 # Prompt template for selected documents context (direct injection mode)
 SELECTED_DOCUMENTS_PROMPT_TEMPLATE = """
@@ -143,10 +157,11 @@ def process_selected_documents_contexts(
         user_subtask_id=user_subtask_id,
     )
 
-    # Estimate token count
+    # Estimate token count with content-aware heuristics
     total_chars = sum(len(doc["content"]) for doc in documents_content)
-    # Estimate tokens: approximately 4 characters per token
-    estimated_tokens = total_chars // 4
+    estimated_tokens = _estimate_tokens(
+        "".join(doc["content"] for doc in documents_content)
+    )
 
     # Calculate threshold
     max_tokens_for_injection = int(context_window * MAX_INJECTION_RATIO)
@@ -154,7 +169,8 @@
     logger.info(
         f"[process_selected_documents_contexts] Token estimation: "
         f"total_chars={total_chars}, estimated_tokens={estimated_tokens}, "
-        f"threshold={max_tokens_for_injection}, context_window={context_window}"
+        f"threshold={max_tokens_for_injection}, context_window={context_window}, "
+        f"injection_ratio={MAX_INJECTION_RATIO}"
     )
 
     if estimated_tokens <= max_tokens_for_injection:
@@ -187,6 +203,44 @@
     )
 
 
+def _estimate_tokens(text: str) -> int:
+    """Estimate token count for text with content-aware heuristics.
+
+    Uses different estimation ratios depending on character composition:
+    - CJK characters (Chinese/Japanese/Korean): ~1.5 tokens per character
+    - Latin/ASCII characters: ~0.25 tokens per character (4 chars/token)
+
+    This is more conservative than the naive `len // 4` approach, which
+    dramatically underestimates tokens for CJK-heavy text and can cause
+    the model's context window to overflow.
+
+    Args:
+        text: Input text to estimate tokens for
+
+    Returns:
+        Estimated token count (conservative upper bound)
+    """
+    if not text:
+        return 0
+
+    cjk_chars = len(_CJK_PATTERN.findall(text))
+    non_cjk_chars = len(text) - cjk_chars
+
+    # CJK characters: ~1.5 tokens per character (conservative estimate)
+    # Non-CJK characters: ~0.25 tokens per character (4 chars/token)
+    estimated = int(cjk_chars * 1.5 + non_cjk_chars * 0.25)
+
+    # Apply a 10% safety margin to account for tokenizer overhead
+    estimated = int(estimated * 1.1)
+
+    logger.debug(
+        f"[_estimate_tokens] chars={len(text)}, cjk={cjk_chars}, "
+        f"non_cjk={non_cjk_chars}, estimated_tokens={estimated}"
+    )
+
+    return estimated
+
+
 def _load_documents_content(
     db: Session,
     document_ids: List[int],

diff --git a/chat_shell/chat_shell/compression/strategies.py b/chat_shell/chat_shell/compression/strategies.py
index 78496c136..af5ac53a5 100644
--- a/chat_shell/chat_shell/compression/strategies.py
+++ b/chat_shell/chat_shell/compression/strategies.py
@@ -160,16 +160,21 @@ class AttachmentTruncationStrategy(CompressionStrategy):
     def name(self) -> str:
         return "attachment_truncation"
 
-    # Pattern to match attachment content blocks wrapped in <attachment_content> XML tags
-    # or legacy format without tags
+    # Pattern to match attachment content blocks wrapped in <attachment_content>
+    # XML tags, <reference_documents> XML tags, or legacy format without tags
     ATTACHMENT_PATTERN = re.compile(
-        r"(?:<attachment_content>)?\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\](.*?)(?:</attachment_content>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment_content>|$))",
+        r"(?:<attachment_content>|<reference_documents>)?"
+ r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|" + r"# Reference Documents\s*\n\s*[^\n]*\n\s*)" + r"(.*?)" + r"(?:||(?=\[(?:Attachment \d+|File Content|Document)|||$))", re.DOTALL, ) # Pattern for document content markers (including XML tag) DOCUMENT_MARKERS = [ "", # New XML tag format + "", # Notebook mode selected documents "[Attachment", "[File Content", # New format for file attachments "[Document:", # History loader format @@ -484,9 +489,19 @@ def truncate_match(match: re.Match) -> str: full_match = match.group(0) attachment_content = match.group(1) if match.lastindex else "" - # Get the header (e.g., "[Attachment 1 - document.pdf]") - header_end = full_match.find("]") + 1 - header = full_match[:header_end] + # Extract the header portion before the main content body + # Handles both bracket-style headers (e.g., "[Attachment 1 - doc.pdf]") + # and XML-style headers (e.g., "# Reference Documents\n...") + bracket_pos = full_match.find("]") + content_start = ( + full_match.find(attachment_content) if attachment_content else -1 + ) + if bracket_pos >= 0 and (content_start < 0 or bracket_pos < content_start): + header = full_match[: bracket_pos + 1] + elif content_start > 0: + header = full_match[:content_start] + else: + header = "" original_length = len(attachment_content)