68 changes: 61 additions & 7 deletions backend/app/services/chat/preprocessing/selected_documents.py
@@ -10,12 +10,17 @@
2. Estimates token count to determine if direct injection is feasible
3. Either injects content directly into the message or falls back to RAG retrieval

The injection strategy follows the same threshold as KnowledgeBaseTool:
The injection strategy uses conservative token estimation:
- If estimated tokens <= 30% of context window, inject directly
- Otherwise, create KnowledgeBaseTool with document_ids filter for RAG retrieval

Token estimation uses a conservative multiplier to handle multilingual content
(e.g., Chinese characters where 1 char ≈ 1-2 tokens) and avoid exceeding the
model's context window limit.
"""

import logging
import re
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.tools import BaseTool
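
For orientation, here is a minimal sketch of the inject-or-retrieve decision the docstring above describes. It is illustrative only: `choose_strategy` is a hypothetical helper, not a function in this module; the real branch lives in `process_selected_documents_contexts` further down in this diff.

```python
# Hypothetical sketch of the threshold decision described in the docstring.
MAX_INJECTION_RATIO = 0.3
DEFAULT_CONTEXT_WINDOW = 128000

def choose_strategy(estimated_tokens: int,
                    context_window: int = DEFAULT_CONTEXT_WINDOW) -> str:
    threshold = int(context_window * MAX_INJECTION_RATIO)  # 38,400 for a 128k window
    if estimated_tokens <= threshold:
        return "inject"  # paste document content directly into the message
    return "rag"         # fall back to KnowledgeBaseTool with a document_ids filter

# choose_strategy(20_000) -> "inject"; choose_strategy(60_000) -> "rag"
```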
@@ -29,8 +34,17 @@
# Default context window size (same as InjectionStrategy.DEFAULT_CONTEXT_WINDOW)
DEFAULT_CONTEXT_WINDOW = 128000

# Maximum ratio of context window that can be used for document injection
MAX_INJECTION_RATIO = 0.5
# Maximum ratio of context window that can be used for document injection.
# Set to 0.3 (30%) to leave sufficient room for system prompt, chat history,
# KB metadata, and dynamic context that are added separately.
MAX_INJECTION_RATIO = 0.3

# Regex pattern to detect CJK (Chinese, Japanese, Korean) characters
_CJK_PATTERN = re.compile(
r"[\u4e00-\u9fff\u3400-\u4dbf\u2e80-\u2eff\u3000-\u303f"
r"\u3040-\u309f\u30a0-\u30ff\u31f0-\u31ff\uac00-\ud7af"
r"\uf900-\ufaff\ufe30-\ufe4f]"
)

# Prompt template for selected documents context (direct injection mode)
SELECTED_DOCUMENTS_PROMPT_TEMPLATE = """
@@ -143,18 +157,20 @@ def process_selected_documents_contexts(
user_subtask_id=user_subtask_id,
)

# Estimate token count
# Estimate token count using content-aware estimation
total_chars = sum(len(doc["content"]) for doc in documents_content)
# Estimate tokens: approximately 4 characters per token
estimated_tokens = total_chars // 4
estimated_tokens = _estimate_tokens(
"".join(doc["content"] for doc in documents_content)
)

# Calculate threshold
max_tokens_for_injection = int(context_window * MAX_INJECTION_RATIO)

logger.info(
f"[process_selected_documents_contexts] Token estimation: "
f"total_chars={total_chars}, estimated_tokens={estimated_tokens}, "
f"threshold={max_tokens_for_injection}, context_window={context_window}"
f"threshold={max_tokens_for_injection}, context_window={context_window}, "
f"injection_ratio={MAX_INJECTION_RATIO}"
)

if estimated_tokens <= max_tokens_for_injection:
@@ -187,6 +203,44 @@ def process_selected_documents_contexts(
)


def _estimate_tokens(text: str) -> int:
"""Estimate token count for text with content-aware heuristics.

Uses different estimation ratios depending on character composition:
- CJK characters (Chinese/Japanese/Korean): ~1.5 tokens per character
- Latin/ASCII characters: ~0.25 tokens per character (4 chars/token)

This is more conservative than the naive `len // 4` approach, which
dramatically underestimates tokens for CJK-heavy text and can cause
the model's context window to overflow.

Args:
text: Input text to estimate tokens for

Returns:
Estimated token count (conservative upper bound)
"""
if not text:
return 0

cjk_chars = len(_CJK_PATTERN.findall(text))
non_cjk_chars = len(text) - cjk_chars

# CJK characters: ~1.5 tokens per character (conservative estimate)
# Non-CJK characters: ~0.25 tokens per character (4 chars/token)
estimated = int(cjk_chars * 1.5 + non_cjk_chars * 0.25)

# Apply a safety margin of 10% to account for tokenizer overhead
estimated = int(estimated * 1.1)

logger.debug(
f"[_estimate_tokens] chars={len(text)}, cjk={cjk_chars}, "
f"non_cjk={non_cjk_chars}, estimated_tokens={estimated}"
)

return estimated


def _load_documents_content(
db: Session,
document_ids: List[int],
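As a quick sanity check on the new estimator, the arithmetic below (derived from the multipliers above, not from a real tokenizer) shows why it is deliberately more conservative than the naive `len(text) // 4` heuristic for CJK-heavy input. It assumes `_estimate_tokens` from this module is in scope; the sample text is made up.

```python
# Hypothetical mixed-language document: 2,000 Latin characters + 1,000 CJK characters.
text = "a" * 2000 + "文" * 1000

naive = len(text) // 4          # 3000 // 4 = 750 tokens
aware = _estimate_tokens(text)  # int((1000 * 1.5 + 2000 * 0.25) * 1.1) = 2200 tokens

# Against the 30% threshold (38,400 tokens for a 128k window), the naive estimate
# would admit roughly three times as much CJK-heavy text as the content-aware one,
# which is exactly the overflow risk the docstring warns about.
```
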
27 changes: 21 additions & 6 deletions chat_shell/chat_shell/compression/strategies.py
@@ -160,16 +160,21 @@ class AttachmentTruncationStrategy(CompressionStrategy):
def name(self) -> str:
return "attachment_truncation"

# Pattern to match attachment content blocks wrapped in <attachment> XML tags
# or legacy format without tags
# Pattern to match attachment content blocks wrapped in <attachment> XML tags,
# <selected_documents> XML tags, or legacy format without tags
ATTACHMENT_PATTERN = re.compile(
r"(?:<attachment>)?\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\](.*?)(?:</attachment>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|$))",
r"(?:<attachment>|<selected_documents>)?"
r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
r"(.*?)"
r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
re.DOTALL,
)
Comment on lines +163 to 172
⚠️ Potential issue | 🟠 Major

Closing tag is lost during truncation.

The regex correctly matches <selected_documents> opening and closing tags, but the closing tag is consumed by the match (not a lookahead) and included in full_match.group(0). However, when _truncate_attachments_proportionally reconstructs the truncated content at line 531, it only returns header + begin_part + truncation_notice + end_part — the closing tag is dropped.

This produces malformed XML output like:

<selected_documents>
# Reference Documents
...truncated content...
[... Middle content truncated ...]
...end portion...

Missing </selected_documents>.

🐛 Proposed fix: preserve closing tag
 ATTACHMENT_PATTERN = re.compile(
-    r"(?:<attachment>|<selected_documents>)?"
+    r"(?P<open_tag><attachment>|<selected_documents>)?"
     r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
     r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
     r"(.*?)"
-    r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
+    r"(?P<close_tag></attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
     re.DOTALL,
 )

Then in _truncate_attachments_proportionally, extract and append the closing tag:

# At end of truncate_match function
close_tag = match.group("close_tag") or ""
# Only include if it's an actual tag, not lookahead match
if close_tag.startswith("</"):
    return header + begin_part + truncation_notice + end_part + close_tag
return header + begin_part + truncation_notice + end_part
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@chat_shell/chat_shell/compression/strategies.py` around lines 163-172, the
attachment-matching regex (ATTACHMENT_PATTERN) consumes the closing tag which
then gets dropped when rebuilding content in
_truncate_attachments_proportionally; fix by making the regex capture the
closing tag (e.g., as a named group "close_tag") or otherwise expose it from the
match, and then in _truncate_attachments_proportionally append that captured
close_tag (but only if it is an actual closing tag, e.g., startswith("</")) to
the reconstructed string (header + begin_part + truncation_notice + end_part +
close_tag) so the XML stays well-formed.
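
For readers following the thread, here is a small self-contained reproduction of the behavior described above. The pattern is copied verbatim from this diff; the sample message format is assumed for illustration only.

```python
import re

ATTACHMENT_PATTERN = re.compile(
    r"(?:<attachment>|<selected_documents>)?"
    r"(?:\[(?:Attachment \d+|File Content|Document)(?:\s*[:-]\s*[^\]]+)?\]|"
    r"# Reference Documents\s*\n\s*[^\n]*\n\s*)"
    r"(.*?)"
    r"(?:</attachment>|</selected_documents>|(?=\[(?:Attachment \d+|File Content|Document)|<attachment>|<selected_documents>|$))",
    re.DOTALL,
)

# Made-up notebook-mode message wrapped in <selected_documents> tags.
text = (
    "<selected_documents># Reference Documents\n"
    "  Doc 1: spec.md\n"
    "  long document body that would be truncated"
    "</selected_documents>"
)

m = ATTACHMENT_PATTERN.search(text)
print(m.group(0).endswith("</selected_documents>"))  # True: the closing tag is consumed
print(m.group(1))  # "long document body that would be truncated", with no closing tag

# Any reconstruction built only from the header plus slices of group(1),
# as the comment describes, therefore emits no closing </selected_documents> tag.
```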


# Pattern for document content markers (including XML tag)
DOCUMENT_MARKERS = [
"<attachment>", # New XML tag format
"<selected_documents>", # Notebook mode selected documents
"[Attachment",
"[File Content", # New format for file attachments
"[Document:", # History loader format
@@ -484,9 +489,19 @@ def truncate_match(match: re.Match) -> str:
full_match = match.group(0)
attachment_content = match.group(1) if match.lastindex else ""

# Get the header (e.g., "[Attachment 1 - document.pdf]")
header_end = full_match.find("]") + 1
header = full_match[:header_end]
# Extract the header portion before the main content body
# Handles both bracket-style headers (e.g., "[Attachment 1 - doc.pdf]")
# and XML-style headers (e.g., "<selected_documents># Reference Documents\n...")
bracket_pos = full_match.find("]")
content_start = (
full_match.find(attachment_content) if attachment_content else -1
)
if bracket_pos >= 0 and (content_start < 0 or bracket_pos < content_start):
header = full_match[: bracket_pos + 1]
elif content_start > 0:
header = full_match[:content_start]
else:
header = ""

original_length = len(attachment_content)
