From d917086c3fe132af16a0abf9359520467b6b1933 Mon Sep 17 00:00:00 2001
From: 2561056571 <112464849+2561056571@users.noreply.github.com>
Date: Mon, 9 Mar 2026 18:23:11 +0800
Subject: [PATCH] fix(chat): prevent prompt overflow when referencing
 knowledge documents

The notebook chat page throws a "prompt is too long" error when
referencing knowledge documents, for two reasons:

1. Token estimation uses the naive `len // 4` heuristic, which
   dramatically underestimates tokens for CJK content (Chinese, Japanese,
   and Korean characters cost roughly 1.5 tokens each, not 0.25). Large
   documents therefore pass the injection threshold when they shouldn't.

2. The compression system's AttachmentTruncationStrategy doesn't
   recognize `<reference_documents>` XML tags, so injected document
   content is invisible to compression strategies and cannot be
   truncated when the prompt exceeds context limits.

Changes:

- Add content-aware token estimation that handles CJK text properly
- Reduce MAX_INJECTION_RATIO from 0.5 to 0.3 to reserve room for the
  system prompt, chat history, and dynamic context
- Add `<reference_documents>` to the compression strategy's document
  markers so the compressor can detect and truncate notebook document
  content
- Update the regex pattern and header extraction to handle both
  bracket-style and XML-style document formats
---
 .../chat/preprocessing/selected_documents.py | 68 +++++++++++++++++--
 .../chat_shell/compression/strategies.py     | 27 ++++++--
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/backend/app/services/chat/preprocessing/selected_documents.py b/backend/app/services/chat/preprocessing/selected_documents.py
index f45157965..fdd835a99 100644
--- a/backend/app/services/chat/preprocessing/selected_documents.py
+++ b/backend/app/services/chat/preprocessing/selected_documents.py
@@ -10,12 +10,17 @@
 2. Estimates token count to determine if direct injection is feasible
 3. Either injects content directly into the message or falls back to RAG retrieval
 
-The injection strategy follows the same threshold as KnowledgeBaseTool:
+The injection strategy uses conservative token estimation:
 - If estimated tokens <= 30% of context window, inject directly
 - Otherwise, create KnowledgeBaseTool with document_ids filter for RAG retrieval
+
+Token estimation uses a conservative multiplier to handle multilingual content
+(e.g., Chinese characters, where 1 char ≈ 1-2 tokens) and avoid exceeding the
+model's context window limit.
 """
 
 import logging
+import re
 from typing import Any, Dict, List, Optional, Tuple
 
 from langchain_core.tools import BaseTool
@@ -29,8 +34,17 @@
 # Default context window size (same as InjectionStrategy.DEFAULT_CONTEXT_WINDOW)
 DEFAULT_CONTEXT_WINDOW = 128000
 
-# Maximum ratio of context window that can be used for document injection
-MAX_INJECTION_RATIO = 0.5
+# Maximum ratio of the context window that can be used for document injection.
+# Set to 0.3 (30%) to leave sufficient room for the system prompt, chat history,
+# KB metadata, and dynamic context that are added separately.
+MAX_INJECTION_RATIO = 0.3
+
+# Regex pattern to detect CJK (Chinese, Japanese, Korean) characters
+_CJK_PATTERN = re.compile(
+    r"[\u4e00-\u9fff\u3400-\u4dbf\u2e80-\u2eff\u3000-\u303f"
+    r"\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff\uac00-\ud7af"
+    r"\uf900-\ufaff\ufe30-\ufe4f]"
+)
 
 # Prompt template for selected documents context (direct injection mode)
 SELECTED_DOCUMENTS_PROMPT_TEMPLATE = """
@@ -143,10 +157,11 @@ def process_selected_documents_contexts(
         user_subtask_id=user_subtask_id,
     )
 
-    # Estimate token count
+    # Estimate token count with content-aware heuristics
     total_chars = sum(len(doc["content"]) for doc in documents_content)
-    # Estimate tokens: approximately 4 characters per token
-    estimated_tokens = total_chars // 4
+    estimated_tokens = _estimate_tokens(
+        "".join(doc["content"] for doc in documents_content)
+    )
 
     # Calculate threshold
     max_tokens_for_injection = int(context_window * MAX_INJECTION_RATIO)
@@ -154,7 +169,8 @@
     logger.info(
         f"[process_selected_documents_contexts] Token estimation: "
         f"total_chars={total_chars}, estimated_tokens={estimated_tokens}, "
-        f"threshold={max_tokens_for_injection}, context_window={context_window}"
+        f"threshold={max_tokens_for_injection}, context_window={context_window}, "
+        f"injection_ratio={MAX_INJECTION_RATIO}"
     )
 
     if estimated_tokens <= max_tokens_for_injection:
@@ -187,6 +203,44 @@
     )
 
 
+def _estimate_tokens(text: str) -> int:
+    """Estimate token count for text with content-aware heuristics.
+
+    Uses different estimation ratios depending on character composition:
+    - CJK characters (Chinese/Japanese/Korean): ~1.5 tokens per character
+    - Latin/ASCII characters: ~0.25 tokens per character (4 chars/token)
+
+    This is more conservative than the naive `len // 4` approach, which
+    dramatically underestimates tokens for CJK-heavy text and can cause
+    the model's context window to overflow.
+
+    Args:
+        text: Input text to estimate tokens for
+
+    Returns:
+        Estimated token count (conservative upper bound)
+    """
+    if not text:
+        return 0
+
+    cjk_chars = len(_CJK_PATTERN.findall(text))
+    non_cjk_chars = len(text) - cjk_chars
+
+    # CJK characters: ~1.5 tokens per character (conservative estimate)
+    # Non-CJK characters: ~0.25 tokens per character (4 chars/token)
+    estimated = int(cjk_chars * 1.5 + non_cjk_chars * 0.25)
+
+    # Apply a 10% safety margin to account for tokenizer overhead
+    estimated = int(estimated * 1.1)
+
+    logger.debug(
+        f"[_estimate_tokens] chars={len(text)}, cjk={cjk_chars}, "
+        f"non_cjk={non_cjk_chars}, estimated_tokens={estimated}"
+    )
+
+    return estimated
+
+
 def _load_documents_content(
     db: Session,
     document_ids: List[int],

diff --git a/chat_shell/chat_shell/compression/strategies.py b/chat_shell/chat_shell/compression/strategies.py
index 78496c136..af5ac53a5 100644
--- a/chat_shell/chat_shell/compression/strategies.py
+++ b/chat_shell/chat_shell/compression/strategies.py
@@ -160,16 +160,21 @@ class AttachmentTruncationStrategy(CompressionStrategy):
     def name(self) -> str:
         return "attachment_truncation"
 
-    # Pattern to match attachment content blocks wrapped in <attachment_content> XML tags
-    # or legacy format without tags
+    # Pattern to match attachment content blocks wrapped in <attachment_content>
+    # XML tags, <reference_documents> XML tags, or legacy format without tags
     ATTACHMENT_PATTERN = re.compile(
-        r"(?:<attachment_content>)?\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\](.*?)(?:</attachment_content>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment_content>|$))",
+        r"(?:<attachment_content>|<reference_documents>)?"
+ r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|" + r"# Reference Documents\s*\n\s*[^\n]*\n\s*)" + r"(.*?)" + r"(?:||(?=\[(?:Attachment \d+|File Content|Document)|||$))", re.DOTALL, ) # Pattern for document content markers (including XML tag) DOCUMENT_MARKERS = [ "", # New XML tag format + "", # Notebook mode selected documents "[Attachment", "[File Content", # New format for file attachments "[Document:", # History loader format @@ -484,9 +489,19 @@ def truncate_match(match: re.Match) -> str: full_match = match.group(0) attachment_content = match.group(1) if match.lastindex else "" - # Get the header (e.g., "[Attachment 1 - document.pdf]") - header_end = full_match.find("]") + 1 - header = full_match[:header_end] + # Extract the header portion before the main content body + # Handles both bracket-style headers (e.g., "[Attachment 1 - doc.pdf]") + # and XML-style headers (e.g., "# Reference Documents\n...") + bracket_pos = full_match.find("]") + content_start = ( + full_match.find(attachment_content) if attachment_content else -1 + ) + if bracket_pos >= 0 and (content_start < 0 or bracket_pos < content_start): + header = full_match[: bracket_pos + 1] + elif content_start > 0: + header = full_match[:content_start] + else: + header = "" original_length = len(attachment_content)