68 changes: 61 additions & 7 deletions backend/app/services/chat/preprocessing/selected_documents.py
@@ -10,12 +10,17 @@
2. Estimates token count to determine if direct injection is feasible
3. Either injects content directly into the message or falls back to RAG retrieval

The injection strategy follows the same threshold as KnowledgeBaseTool:
The injection strategy uses conservative token estimation:
- If estimated tokens <= 30% of context window, inject directly
- Otherwise, create KnowledgeBaseTool with document_ids filter for RAG retrieval

Token estimation uses a conservative multiplier to handle multilingual content
(e.g., Chinese characters where 1 char ≈ 1-2 tokens) and avoid exceeding the
model's context window limit.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.tools import BaseTool
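
For orientation, here is a minimal sketch of the inject-or-retrieve decision the docstring above describes. It is illustrative only: `choose_strategy` is a hypothetical helper, not a function in this module; the real branch lives in `process_selected_documents_contexts` further down in this diff.

```python
# Hypothetical sketch of the threshold decision described in the docstring.
MAX_INJECTION_RATIO = 0.3
DEFAULT_CONTEXT_WINDOW = 128000

def choose_strategy(estimated_tokens: int,
                    context_window: int = DEFAULT_CONTEXT_WINDOW) -> str:
    threshold = int(context_window * MAX_INJECTION_RATIO)  # 38,400 for a 128k window
    if estimated_tokens <= threshold:
        return "inject"  # paste document content directly into the message
    return "rag"         # fall back to KnowledgeBaseTool with a document_ids filter

# choose_strategy(20_000) -> "inject"; choose_strategy(60_000) -> "rag"
```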
@@ -29,8 +34,17 @@
# Default context window size (same as InjectionStrategy.DEFAULT_CONTEXT_WINDOW)
DEFAULT_CONTEXT_WINDOW = 128000

# Maximum ratio of context window that can be used for document injection
MAX_INJECTION_RATIO = 0.5
# Maximum ratio of context window that can be used for document injection.
# Set to 0.3 (30%) to leave sufficient room for system prompt, chat history,
# KB metadata, and dynamic context that are added separately.
MAX_INJECTION_RATIO = 0.3

# Regex pattern to detect CJK (Chinese, Japanese, Korean) characters
_CJK_PATTERN = re.compile(
r"[\u4e00-\u9fff\u3400-\u4dbf\u2e80-\u2eff\u3000-\u303f"
r"\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff\uac00-\ud7af"
r"\uf900-\ufaff\ufe30-\ufe4f]"
)

# Prompt template for selected documents context (direct injection mode)
SELECTED_DOCUMENTS_PROMPT_TEMPLATE = """
@@ -143,18 +157,20 @@ def process_selected_documents_contexts(
user_subtask_id=user_subtask_id,
)

# Estimate token count
# Estimate token count using content-aware estimation
total_chars = sum(len(doc["content"]) for doc in documents_content)
# Estimate tokens: approximately 4 characters per token
estimated_tokens = total_chars // 4
estimated_tokens = _estimate_tokens(
"".join(doc["content"] for doc in documents_content)
)

# Calculate threshold
max_tokens_for_injection = int(context_window * MAX_INJECTION_RATIO)

logger.info(
f"[process_selected_documents_contexts] Token estimation: "
f"total_chars={total_chars}, estimated_tokens={estimated_tokens}, "
f"threshold={max_tokens_for_injection}, context_window={context_window}"
f"threshold={max_tokens_for_injection}, context_window={context_window}, "
f"injection_ratio={MAX_INJECTION_RATIO}"
)

if estimated_tokens <= max_tokens_for_injection:
@@ -187,6 +203,44 @@ def process_selected_documents_contexts(
)


def _estimate_tokens(text: str) -> int:
"""Estimate token count for text with content-aware heuristics.

Uses different estimation ratios depending on character composition:
- CJK characters (Chinese/Japanese/Korean): ~1.5 tokens per character
- Latin/ASCII characters: ~0.25 tokens per character (4 chars/token)

This is more conservative than the naive `len // 4` approach, which
dramatically underestimates tokens for CJK-heavy text and can cause
the model's context window to overflow.

Args:
text: Input text to estimate tokens for

Returns:
Estimated token count (conservative upper bound)
"""
if not text:
return 0

cjk_chars = len(_CJK_PATTERN.findall(text))
non_cjk_chars = len(text) - cjk_chars

# CJK characters: ~1.5 tokens per character (conservative estimate)
# Non-CJK characters: ~0.25 tokens per character (4 chars/token)
estimated = int(cjk_chars * 1.5 + non_cjk_chars * 0.25)

# Apply a safety margin of 10% to account for tokenizer overhead
estimated = int(estimated * 1.1)

logger.debug(
f"[_estimate_tokens] chars={len(text)}, cjk={cjk_chars}, "
f"non_cjk={non_cjk_chars}, estimated_tokens={estimated}"
)

return estimated


def _load_documents_content(
db: Session,
document_ids: List[int],
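As a quick sanity check on the new estimator, the arithmetic below (derived from the multipliers above, not from a real tokenizer) shows why it is deliberately more conservative than the naive `len(text) // 4` heuristic for CJK-heavy input. It assumes `_estimate_tokens` from this module is in scope; the sample text is made up.

```python
# Hypothetical mixed-language document: 2,000 Latin characters + 1,000 CJK characters.
text = "a" * 2000 + "文" * 1000

naive = len(text) // 4          # 3000 // 4 = 750 tokens
aware = _estimate_tokens(text)  # int((1000 * 1.5 + 2000 * 0.25) * 1.1) = 2200 tokens

# Against the 30% threshold (38,400 tokens for a 128k window), the naive estimate
# would admit roughly three times as much CJK-heavy text as the content-aware one,
# which is exactly the overflow risk the docstring warns about.
```
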
27 changes: 21 additions & 6 deletions chat_shell/chat_shell/compression/strategies.py
@@ -160,16 +160,21 @@ class AttachmentTruncationStrategy(CompressionStrategy):
def name(self) -> str:
return "attachment_truncation"

# Pattern to match attachment content blocks wrapped in <attachment> XML tags
# or legacy format without tags
# Pattern to match attachment content blocks wrapped in <attachment> XML tags,
# <selected_documents> XML tags, or legacy format without tags
ATTACHMENT_PATTERN = re.compile(
r"(?:<attachment>)?\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\](.*?)(?:</attachment>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|$))",
r"(?:<attachment>|<selected_documents>)?"
r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
r"(.*?)"
r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
re.DOTALL,
)
Comment on lines +163 to 172
⚠️ Potential issue | 🟠 Major

Closing tag is lost during truncation.

The regex correctly matches <selected_documents> opening and closing tags, but the closing tag is consumed by the match (not a lookahead) and included in full_match.group(0). However, when _truncate_attachments_proportionally reconstructs the truncated content at line 531, it only returns header + begin_part + truncation_notice + end_part — the closing tag is dropped.

This produces malformed XML output like:

<selected_documents>
# Reference Documents
...truncated content...
[... Middle content truncated ...]
...end portion...

Missing </selected_documents>.

🐛 Proposed fix: preserve closing tag
 ATTACHMENT_PATTERN = re.compile(
-    r"(?:<attachment>|<selected_documents>)?"
+    r"(?P<open_tag><attachment>|<selected_documents>)?"
     r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
     r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
     r"(.*?)"
-    r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
+    r"(?P<close_tag></attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
     re.DOTALL,
 )

Then in _truncate_attachments_proportionally, extract and append the closing tag:

# At end of truncate_match function
close_tag = match.group("close_tag") or ""
# Only include if it's an actual tag, not lookahead match
if close_tag.startswith("</"):
    return header + begin_part + truncation_notice + end_part + close_tag
return header + begin_part + truncation_notice + end_part
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@chat_shell/chat_shell/compression/strategies.py` around lines 163-172, the
attachment-matching regex (ATTACHMENT_PATTERN) consumes the closing tag which
then gets dropped when rebuilding content in
_truncate_attachments_proportionally; fix by making the regex capture the
closing tag (e.g., as a named group "close_tag") or otherwise expose it from the
match, and then in _truncate_attachments_proportionally append that captured
close_tag (but only if it is an actual closing tag, e.g., startswith("</")) to
the reconstructed string (header + begin_part + truncation_notice + end_part +
close_tag) so the XML stays well-formed.
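
For readers following the thread, here is a small self-contained reproduction of the behavior described above. The pattern is copied verbatim from this diff; the sample message format is assumed for illustration only.

```python
import re

ATTACHMENT_PATTERN = re.compile(
    r"(?:<attachment>|<selected_documents>)?"
    r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
    r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
    r"(.*?)"
    r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
    re.DOTALL,
)

# Made-up notebook-mode message wrapped in <selected_documents> tags.
text = (
    "<selected_documents># Reference Documents\n"
    "  Doc 1: spec.md\n"
    "  long document body that would be truncated"
    "</selected_documents>"
)

m = ATTACHMENT_PATTERN.search(text)
print(m.group(0).endswith("</selected_documents>"))  # True: the closing tag is consumed
print(m.group(1))  # "long document body that would be truncated", with no closing tag

# Any reconstruction built only from the header plus slices of group(1),
# as the comment describes, therefore emits no closing </selected_documents> tag.
```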


# Pattern for document content markers (including XML tag)
DOCUMENT_MARKERS = [
"<attachment>", # New XML tag format
"<selected_documents>", # Notebook mode selected documents
"[Attachment",
"[File Content", # New format for file attachments
"[Document:", # History loader format
@@ -484,9 +489,19 @@ def truncate_match(match: re.Match) -> str:
full_match = match.group(0)
attachment_content = match.group(1) if match.lastindex else ""

# Get the header (e.g., "[Attachment 1 - document.pdf]")
header_end = full_match.find("]") + 1
header = full_match[:header_end]
# Extract the header portion before the main content body
# Handles both bracket-style headers (e.g., "[Attachment 1 - doc.pdf]")
# and XML-style headers (e.g., "<selected_documents># Reference Documents\n...")
bracket_pos = full_match.find("]")
content_start = (
full_match.find(attachment_content) if attachment_content else -1
)
if bracket_pos >= 0 and (content_start < 0 or bracket_pos < content_start):
header = full_match[: bracket_pos + 1]
elif content_start > 0:
header = full_match[:content_start]
else:
header = ""

original_length = len(attachment_content)
