diff --git a/agent/knowledge/service.py b/agent/knowledge/service.py index fb564afc7..52a712ccc 100644 --- a/agent/knowledge/service.py +++ b/agent/knowledge/service.py @@ -1,89 +1,62 @@ """ Knowledge service for handling knowledge base operations. -Provides a unified interface for listing, reading, and graphing knowledge files, -callable from the web console, API, or CLI. - -Knowledge file layout (under workspace_root): - knowledge/index.md - knowledge/log.md - knowledge//.md +Provides a unified interface for listing, reading, graphing, chunk inspection, +and import reindexing for files under the workspace knowledge directory. """ +from __future__ import annotations + +import asyncio import os import re from pathlib import Path from typing import Optional +from agent.memory.config import MemoryConfig +from agent.memory.document_parser import SUPPORTED_DOCUMENT_EXTENSIONS +from agent.memory.manager import MemoryManager from common.log import logger from config import conf class KnowledgeService: - """ - High-level service for knowledge base queries. - Operates directly on the filesystem. - """ + """High-level service for knowledge base queries.""" def __init__(self, workspace_root: str): self.workspace_root = workspace_root self.knowledge_dir = os.path.join(workspace_root, "knowledge") + self._memory_manager: Optional[MemoryManager] = None + + @property + def memory_manager(self) -> MemoryManager: + """Lazy-init memory manager for parser/chunker/reindex reuse.""" + if self._memory_manager is None: + self._memory_manager = MemoryManager(MemoryConfig(workspace_root=self.workspace_root)) + return self._memory_manager - # ------------------------------------------------------------------ - # list — directory tree with stats - # ------------------------------------------------------------------ def list_tree(self) -> dict: - """ - Return the knowledge directory tree grouped by category, - supporting arbitrarily nested sub-directories. - - Returns:: - - { - "tree": [ - { - "dir": "concepts", - "files": [ - {"name": "moe.md", "title": "MoE", "size": 1234}, - ], - "children": [] - }, - { - "dir": "platform", - "files": [], - "children": [ - { - "dir": "analysis", - "files": [{"name": "perf.md", ...}], - "children": [] - } - ] - }, - ], - "stats": {"pages": 15, "size": 32768}, - "enabled": true - } - """ + """Return the knowledge directory tree grouped by category.""" if not os.path.isdir(self.knowledge_dir): - return {"tree": [], "stats": {"pages": 0, "size": 0}, "enabled": conf().get("knowledge", True)} + return { + "tree": [], + "stats": {"pages": 0, "size": 0}, + "enabled": conf().get("knowledge", True), + "imports_dir": "knowledge/imports", + } stats = {"pages": 0, "size": 0} root_files, tree = self._scan_dir(self.knowledge_dir, stats, is_root=True) - return { "root_files": root_files, "tree": tree, "stats": stats, "enabled": conf().get("knowledge", True), + "imports_dir": "knowledge/imports", } def _scan_dir(self, dir_path: str, stats: dict, is_root: bool = False) -> tuple: - """ - Recursively scan a directory. - - :return: (files, children) where files is a list of .md file dicts - in this directory and children is a list of sub-directory nodes. 
- """ + """Recursively scan a directory and return files + child directories.""" files = [] children = [] for name in sorted(os.listdir(dir_path)): @@ -93,79 +66,121 @@ def _scan_dir(self, dir_path: str, stats: dict, is_root: bool = False) -> tuple: if os.path.isdir(full): sub_files, sub_children = self._scan_dir(full, stats) children.append({"dir": name, "files": sub_files, "children": sub_children}) - elif name.endswith(".md"): - size = os.path.getsize(full) - if not is_root: - stats["pages"] += 1 - stats["size"] += size - title = name.replace(".md", "") - try: - with open(full, "r", encoding="utf-8") as f: - first_line = f.readline().strip() - if first_line.startswith("# "): - title = first_line[2:].strip() - except Exception: - pass - files.append({"name": name, "title": title, "size": size}) + continue + + suffix = Path(full).suffix.lower() + if suffix not in SUPPORTED_DOCUMENT_EXTENSIONS: + continue + + size = os.path.getsize(full) + if not is_root: + stats["pages"] += 1 + stats["size"] += size + + rel_path = str(Path(full).relative_to(self.knowledge_dir)).replace("\\", "/") + title = Path(full).stem.replace("-", " ").replace("_", " ").strip() or name + try: + if suffix in {".md", ".markdown", ".txt", ".rst"}: + parsed = self.memory_manager.document_parser.parse(Path(full), f"knowledge/{rel_path}") + title = parsed.metadata.get("title") or title + except Exception: + pass + + files.append( + { + "name": name, + "title": title, + "size": size, + "doc_type": suffix.lstrip("."), + } + ) return files, children - # ------------------------------------------------------------------ - # read — single file content - # ------------------------------------------------------------------ def read_file(self, rel_path: str) -> dict: - """ - Read a single knowledge markdown file. - - :param rel_path: Relative path within knowledge/, e.g. ``concepts/moe.md`` - :return: dict with ``content`` and ``path`` - :raises ValueError: if path is invalid or escapes knowledge dir - :raises FileNotFoundError: if file does not exist - """ - if not rel_path or ".." in rel_path: + """Read a single knowledge file and return normalized text content.""" + full_path = self._resolve_knowledge_path(rel_path) + result = self.memory_manager.read_document_range(f"knowledge/{rel_path}", start_line=1, num_lines=None) + return { + "content": result["content"], + "path": rel_path, + "title": result.get("title"), + "doc_type": result.get("doc_type"), + "chunk_mode": result.get("chunk_mode"), + "citation": result.get("citation"), + "total_lines": result.get("total_lines"), + "file_size": os.path.getsize(full_path), + } + + def get_chunks(self, rel_path: str) -> dict: + """Inspect live RAG chunks for a knowledge document.""" + self._resolve_knowledge_path(rel_path) + chunk_info = self.memory_manager.inspect_document_chunks(f"knowledge/{rel_path}") + chunk_info["path"] = rel_path + return chunk_info + + def create_file( + self, + title: str, + rel_path: Optional[str] = None, + content: str = "", + ) -> dict: + """Create a new markdown knowledge page and index it.""" + title = (title or "").strip() + if not title and not rel_path: + raise ValueError("title is required") + + rel_path = (rel_path or "").strip().replace("\\", "/").strip("/") + if rel_path and ".." 
in rel_path: raise ValueError("invalid path") + if not rel_path: + slug = re.sub(r"[^0-9A-Za-z_\-\u4e00-\u9fff]+", "-", title).strip("-").lower() or "untitled" + rel_path = f"notes/{slug}.md" + elif not rel_path.endswith(".md"): + rel_path += ".md" + full_path = os.path.normpath(os.path.join(self.knowledge_dir, rel_path)) allowed = os.path.normpath(self.knowledge_dir) - if not full_path.startswith(allowed + os.sep) and full_path != allowed: + if not full_path.startswith(allowed + os.sep): raise ValueError("path outside knowledge dir") + if os.path.exists(full_path): + raise ValueError(f"file already exists: {rel_path}") + + os.makedirs(os.path.dirname(full_path), exist_ok=True) + body = content.strip() + markdown = f"# {title or Path(rel_path).stem}\n\n{body}\n" if body else f"# {title or Path(rel_path).stem}\n" + with open(full_path, "w", encoding="utf-8") as f: + f.write(markdown) + + asyncio.run(self.memory_manager.sync(force=False)) + return {"path": rel_path, "title": title or Path(rel_path).stem} + + def delete_file(self, rel_path: str) -> dict: + """Delete a knowledge file and purge its index entries.""" + full_path = self._resolve_knowledge_path(rel_path) + os.remove(full_path) + indexed_path = f"knowledge/{rel_path}".replace("\\", "/") + self.memory_manager.storage.delete_by_path(indexed_path) + self.memory_manager.storage.delete_file_record(indexed_path) + return {"path": rel_path, "deleted": True} + + def reindex_imports(self, force: bool = True) -> dict: + """Trigger a manual reindex for knowledge/imports.""" + result = asyncio.run(self.memory_manager.sync_imports(force=force)) + return result - if not os.path.isfile(full_path): - raise FileNotFoundError(f"file not found: {rel_path}") - - with open(full_path, "r", encoding="utf-8") as f: - content = f.read() - return {"content": content, "path": rel_path} - - # ------------------------------------------------------------------ - # graph — nodes and links for visualization - # ------------------------------------------------------------------ def build_graph(self) -> dict: - """ - Parse all knowledge pages and extract cross-reference links. - - Returns:: - - { - "nodes": [ - {"id": "concepts/moe.md", "label": "MoE", "category": "concepts"}, - ... - ], - "links": [ - {"source": "concepts/moe.md", "target": "entities/deepseek.md"}, - ... 
- ] - } - """ + """Parse markdown knowledge pages and extract cross-reference links.""" knowledge_path = Path(self.knowledge_dir) if not knowledge_path.is_dir(): return {"nodes": [], "links": []} nodes = {} links = [] - link_re = re.compile(r'\[([^\]]*)\]\(([^)]+\.md)\)') + link_re = re.compile(r"\[([^\]]*)\]\(([^)]+\.md)\)") for md_file in knowledge_path.rglob("*.md"): - rel = str(md_file.relative_to(knowledge_path)) + rel = str(md_file.relative_to(knowledge_path)).replace("\\", "/") if rel in ("index.md", "log.md"): continue parts = rel.split("/") @@ -179,7 +194,7 @@ def build_graph(self) -> dict: for _, link_target in link_re.findall(content): resolved = (md_file.parent / link_target).resolve() try: - target_rel = str(resolved.relative_to(knowledge_path)) + target_rel = str(resolved.relative_to(knowledge_path)).replace("\\", "/") except ValueError: continue if target_rel != rel: @@ -189,47 +204,65 @@ def build_graph(self) -> dict: nodes[rel] = {"id": rel, "label": title, "category": category} valid_ids = set(nodes.keys()) - links = [l for l in links if l["source"] in valid_ids and l["target"] in valid_ids] - seen = set() deduped = [] - for l in links: - key = tuple(sorted([l["source"], l["target"]])) - if key not in seen: - seen.add(key) - deduped.append(l) + seen = set() + for link in links: + if link["source"] not in valid_ids or link["target"] not in valid_ids: + continue + key = tuple(sorted((link["source"], link["target"]))) + if key in seen: + continue + seen.add(key) + deduped.append(link) return {"nodes": list(nodes.values()), "links": deduped} - # ------------------------------------------------------------------ - # dispatch — single entry point for protocol messages - # ------------------------------------------------------------------ def dispatch(self, action: str, payload: Optional[dict] = None) -> dict: - """ - Dispatch a knowledge management action. 
- - :param action: ``list``, ``read``, or ``graph`` - :param payload: action-specific payload - :return: protocol-compatible response dict - """ + """Dispatch a knowledge management action.""" payload = payload or {} try: if action == "list": result = self.list_tree() return {"action": action, "code": 200, "message": "success", "payload": result} - elif action == "read": + if action == "read": path = payload.get("path") if not path: return {"action": action, "code": 400, "message": "path is required", "payload": None} result = self.read_file(path) return {"action": action, "code": 200, "message": "success", "payload": result} - elif action == "graph": + if action == "chunks": + path = payload.get("path") + if not path: + return {"action": action, "code": 400, "message": "path is required", "payload": None} + result = self.get_chunks(path) + return {"action": action, "code": 200, "message": "success", "payload": result} + + if action == "create": + result = self.create_file( + title=payload.get("title", ""), + rel_path=payload.get("path"), + content=payload.get("content", ""), + ) + return {"action": action, "code": 200, "message": "success", "payload": result} + + if action == "delete": + path = payload.get("path") + if not path: + return {"action": action, "code": 400, "message": "path is required", "payload": None} + result = self.delete_file(path) + return {"action": action, "code": 200, "message": "success", "payload": result} + + if action == "reindex_imports": + result = self.reindex_imports(force=bool(payload.get("force", True))) + return {"action": action, "code": 200, "message": "success", "payload": result} + + if action == "graph": result = self.build_graph() return {"action": action, "code": 200, "message": "success", "payload": result} - else: - return {"action": action, "code": 400, "message": f"unknown action: {action}", "payload": None} + return {"action": action, "code": 400, "message": f"unknown action: {action}", "payload": None} except ValueError as e: return {"action": action, "code": 403, "message": str(e), "payload": None} @@ -238,3 +271,16 @@ def dispatch(self, action: str, payload: Optional[dict] = None) -> dict: except Exception as e: logger.error(f"[KnowledgeService] dispatch error: action={action}, error={e}") return {"action": action, "code": 500, "message": str(e), "payload": None} + + def _resolve_knowledge_path(self, rel_path: str) -> str: + """Resolve and validate a path inside knowledge/.""" + if not rel_path or ".." in rel_path: + raise ValueError("invalid path") + + full_path = os.path.normpath(os.path.join(self.knowledge_dir, rel_path)) + allowed = os.path.normpath(self.knowledge_dir) + if not full_path.startswith(allowed + os.sep) and full_path != allowed: + raise ValueError("path outside knowledge dir") + if not os.path.isfile(full_path): + raise FileNotFoundError(f"file not found: {rel_path}") + return full_path diff --git a/agent/memory/chunker.py b/agent/memory/chunker.py index 8dfd062d6..a6ba677e9 100644 --- a/agent/memory/chunker.py +++ b/agent/memory/chunker.py @@ -1,29 +1,34 @@ """ -Text chunking utilities for memory +Text chunking utilities for memory and knowledge documents. -Splits text into chunks with token limits and overlap +Provides both a generic line-based chunker and a lightweight Markdown-aware +chunker that preserves heading structure for better retrieval quality. 
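+
+A minimal usage sketch (illustrative; ``long_text`` and ``md_text`` are
+placeholders, not names from this codebase):
+
+    chunker = TextChunker(max_tokens=500, overlap_tokens=50)
+    chunks = chunker.chunk_text(long_text)        # generic overlap chunking
+    md_chunks = chunker.chunk_markdown(md_text)   # heading-aware chunking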
""" from __future__ import annotations -from typing import List, Tuple -from dataclasses import dataclass + +import re +from typing import Dict, List, Optional +from dataclasses import dataclass, field @dataclass class TextChunk: - """Represents a text chunk with line numbers""" + """Represents a text chunk with line numbers and retrieval metadata.""" + text: str start_line: int end_line: int + metadata: Dict[str, object] = field(default_factory=dict) class TextChunker: - """Chunks text by line count with token estimation""" - + """Chunks text by line count with optional Markdown structure awareness.""" + def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50): """ - Initialize chunker - + Initialize chunker. + Args: max_tokens: Maximum tokens per chunk overlap_tokens: Overlap tokens between chunks @@ -32,109 +37,268 @@ def __init__(self, max_tokens: int = 500, overlap_tokens: int = 50): self.overlap_tokens = overlap_tokens # Rough estimation: ~4 chars per token for English/Chinese mixed self.chars_per_token = 4 - - def chunk_text(self, text: str) -> List[TextChunk]: + + def chunk_text( + self, + text: str, + start_line: int = 1, + metadata: Optional[Dict[str, object]] = None, + ) -> List[TextChunk]: """ - Chunk text into overlapping segments - + Chunk plain text into overlapping segments. + Args: text: Input text to chunk - - Returns: - List of TextChunk objects + start_line: Starting line number of the given text in the source doc + metadata: Metadata attached to every emitted chunk + """ + return self._chunk_lines(text.split("\n"), start_line=start_line, metadata=metadata) + + def chunk_markdown( + self, + text: str, + metadata: Optional[Dict[str, object]] = None, + ) -> List[TextChunk]: + """ + Chunk Markdown text while respecting heading structure. + + The algorithm keeps section blocks together when possible and propagates + section path metadata for better ranking and citation quality. 
""" if not text.strip(): return [] - - lines = text.split('\n') - chunks = [] - + + lines = text.split("\n") + base_metadata = dict(metadata or {}) + + heading_re = re.compile(r"^(#{1,6})\s+(.*)$") + sections = [] + heading_stack: List[tuple[int, str]] = [] + current_lines: List[str] = [] + current_start_line = 1 + current_path: List[str] = [] + + def flush_current(end_line: int): + if not current_lines: + return + + section_metadata = dict(base_metadata) + section_title = current_path[-1] if current_path else section_metadata.get("title", "") + parent_titles = current_path[:-1] if len(current_path) > 1 else [] + section_metadata.update( + { + "section_title": section_title, + "parent_titles": parent_titles, + "heading_path": list(current_path), + "chunk_type": "markdown_section" if current_path else "markdown_root", + } + ) + sections.append( + (current_lines[:], current_start_line, end_line, section_metadata) + ) + + for lineno, line in enumerate(lines, start=1): + match = heading_re.match(line) + if match: + flush_current(lineno - 1) + + level = len(match.group(1)) + heading_text = match.group(2).strip() + while heading_stack and heading_stack[-1][0] >= level: + heading_stack.pop() + heading_stack.append((level, heading_text)) + + current_path = [item[1] for item in heading_stack] + current_lines = [line] + current_start_line = lineno + else: + if not current_lines: + current_lines = [line] + current_start_line = lineno + else: + current_lines.append(line) + + flush_current(len(lines)) + + chunks: List[TextChunk] = [] + for section_lines, section_start, _section_end, section_metadata in sections: + chunks.extend( + self._chunk_lines( + section_lines, + start_line=section_start, + metadata=section_metadata, + ) + ) + + if chunks: + return chunks + + return self.chunk_text(text, metadata=base_metadata) + + def chunk_document( + self, + text: str, + chunk_mode: str = "plain_text", + metadata: Optional[Dict[str, object]] = None, + ) -> List[TextChunk]: + """ + Chunk a normalized document according to its format strategy. 
+ + Supported modes: + - markdown: preserve heading hierarchy + - pdf_pages: each `# Page N` section becomes a primary unit + - word_sections: heading-aware paragraph chunking + - spreadsheet: chunk per sheet/row windows + - plain_text: generic overlap chunking + """ + if chunk_mode in {"markdown", "pdf_pages", "word_sections"}: + return self.chunk_markdown(text, metadata=metadata) + if chunk_mode == "spreadsheet": + return self._chunk_spreadsheet(text, metadata=metadata) + return self.chunk_text(text, metadata=metadata) + + def _chunk_lines( + self, + lines: List[str], + start_line: int = 1, + metadata: Optional[Dict[str, object]] = None, + ) -> List[TextChunk]: + """Internal line-aware chunking with overlap and metadata propagation.""" + if not any(line.strip() for line in lines): + return [] + + metadata = dict(metadata or {}) + chunks: List[TextChunk] = [] + max_chars = self.max_tokens * self.chars_per_token overlap_chars = self.overlap_tokens * self.chars_per_token - - current_chunk = [] + + current_chunk: List[str] = [] current_chars = 0 - start_line = 1 - - for i, line in enumerate(lines, start=1): + chunk_start_line = start_line + + for offset, line in enumerate(lines): + absolute_line = start_line + offset line_chars = len(line) - - # If single line exceeds max, split it + if line_chars > max_chars: - # Save current chunk if exists if current_chunk: - chunks.append(TextChunk( - text='\n'.join(current_chunk), - start_line=start_line, - end_line=i - 1 - )) + chunks.append( + TextChunk( + text="\n".join(current_chunk), + start_line=chunk_start_line, + end_line=absolute_line - 1, + metadata=dict(metadata), + ) + ) current_chunk = [] current_chars = 0 - - # Split long line into multiple chunks - for sub_chunk in self._split_long_line(line, max_chars): - chunks.append(TextChunk( - text=sub_chunk, - start_line=i, - end_line=i - )) - - start_line = i + 1 + + for sub_index, sub_chunk in enumerate(self._split_long_line(line, max_chars)): + sub_metadata = dict(metadata) + sub_metadata["chunk_type"] = sub_metadata.get("chunk_type", "text_chunk") + sub_metadata["split_part"] = sub_index + 1 + chunks.append( + TextChunk( + text=sub_chunk, + start_line=absolute_line, + end_line=absolute_line, + metadata=sub_metadata, + ) + ) + chunk_start_line = absolute_line + 1 continue - - # Check if adding this line would exceed limit + if current_chars + line_chars > max_chars and current_chunk: - # Save current chunk - chunks.append(TextChunk( - text='\n'.join(current_chunk), - start_line=start_line, - end_line=i - 1 - )) - - # Start new chunk with overlap + chunks.append( + TextChunk( + text="\n".join(current_chunk), + start_line=chunk_start_line, + end_line=absolute_line - 1, + metadata=dict(metadata), + ) + ) + overlap_lines = self._get_overlap_lines(current_chunk, overlap_chars) current_chunk = overlap_lines + [line] - current_chars = sum(len(l) for l in current_chunk) - start_line = i - len(overlap_lines) + current_chars = sum(len(item) for item in current_chunk) + chunk_start_line = absolute_line - len(overlap_lines) else: - # Add line to current chunk current_chunk.append(line) current_chars += line_chars - - # Save last chunk + if current_chunk: - chunks.append(TextChunk( - text='\n'.join(current_chunk), - start_line=start_line, - end_line=len(lines) - )) - + chunks.append( + TextChunk( + text="\n".join(current_chunk), + start_line=chunk_start_line, + end_line=start_line + len(lines) - 1, + metadata=dict(metadata), + ) + ) + + return chunks + + def _chunk_spreadsheet( + self, + text: str, + 
metadata: Optional[Dict[str, object]] = None, + ) -> List[TextChunk]: + """Chunk spreadsheet text by sheet boundaries, then by row windows.""" + if not text.strip(): + return [] + + lines = text.split("\n") + sheet_ranges = [] + current_start = 1 + current_sheet = "" + + for lineno, line in enumerate(lines, start=1): + if line.startswith("# Sheet:"): + if lineno > current_start: + sheet_ranges.append((current_sheet, current_start, lineno - 1)) + current_sheet = line.replace("# Sheet:", "", 1).strip() + current_start = lineno + + sheet_ranges.append((current_sheet, current_start, len(lines))) + + chunks: List[TextChunk] = [] + for sheet_name, start_line, end_line in sheet_ranges: + section_lines = lines[start_line - 1 : end_line] + section_metadata = dict(metadata or {}) + section_metadata.update( + { + "section_title": sheet_name or section_metadata.get("section_title", ""), + "chunk_type": "spreadsheet_sheet", + } + ) + chunks.extend( + self._chunk_lines( + section_lines, + start_line=start_line, + metadata=section_metadata, + ) + ) + return chunks - + def _split_long_line(self, line: str, max_chars: int) -> List[str]: - """Split a single long line into multiple chunks""" + """Split a single long line into multiple chunks.""" chunks = [] - for i in range(0, len(line), max_chars): - chunks.append(line[i:i + max_chars]) + for index in range(0, len(line), max_chars): + chunks.append(line[index:index + max_chars]) return chunks - + def _get_overlap_lines(self, lines: List[str], target_chars: int) -> List[str]: - """Get last few lines that fit within target_chars for overlap""" + """Get last few lines that fit within target_chars for overlap.""" overlap = [] chars = 0 - + for line in reversed(lines): line_chars = len(line) if chars + line_chars > target_chars: break overlap.insert(0, line) chars += line_chars - + return overlap - - def chunk_markdown(self, text: str) -> List[TextChunk]: - """ - Chunk markdown text while respecting structure - (For future enhancement: respect markdown sections) - """ - return self.chunk_text(text) diff --git a/agent/memory/config.py b/agent/memory/config.py index 70d303dc5..e1bf26015 100644 --- a/agent/memory/config.py +++ b/agent/memory/config.py @@ -47,6 +47,8 @@ class MemoryConfig: # Sync config enable_auto_sync: bool = True sync_on_search: bool = True + imports_watch_enabled: bool = True + imports_watch_interval_sec: int = 10 def get_workspace(self) -> Path: diff --git a/agent/memory/document_parser.py b/agent/memory/document_parser.py new file mode 100644 index 000000000..4bc19bedf --- /dev/null +++ b/agent/memory/document_parser.py @@ -0,0 +1,261 @@ +""" +Document parsing utilities for memory/knowledge indexing. + +This module normalizes multiple file formats into a unified text-plus-metadata +representation so downstream chunking can apply format-specific strategies. 
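+
+A rough usage sketch (illustrative; the path is hypothetical):
+
+    parser = DocumentParser()
+    doc = parser.parse(Path("knowledge/imports/manual.pdf"), "knowledge/imports/manual.pdf")
+    # doc.chunk_mode == "pdf_pages"; doc.metadata carries "title" and "page_count"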
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Optional + + +SUPPORTED_DOCUMENT_EXTENSIONS = { + ".md", + ".markdown", + ".txt", + ".rst", + ".pdf", + ".docx", + ".xlsx", +} + + +@dataclass +class ParsedDocument: + """Normalized document representation for indexing.""" + + content: str + doc_type: str + chunk_mode: str + metadata: Dict[str, object] = field(default_factory=dict) + + +class DocumentParser: + """Parse different office/document formats into normalized text.""" + + def parse(self, file_path: Path, rel_path: str) -> ParsedDocument: + suffix = file_path.suffix.lower() + + if suffix in {".md", ".markdown"}: + return self._parse_markdown(file_path, rel_path) + if suffix in {".txt", ".rst"}: + return self._parse_text(file_path, rel_path) + if suffix == ".pdf": + return self._parse_pdf(file_path, rel_path) + if suffix == ".docx": + return self._parse_word(file_path, rel_path) + if suffix == ".xlsx": + return self._parse_excel(file_path, rel_path) + + raise ValueError(f"Unsupported document type for indexing: {suffix}") + + @staticmethod + def is_supported(file_path: Path) -> bool: + return file_path.suffix.lower() in SUPPORTED_DOCUMENT_EXTENSIONS + + def _parse_markdown(self, file_path: Path, rel_path: str) -> ParsedDocument: + content = file_path.read_text(encoding="utf-8") + title = self._extract_heading_title(content, fallback=file_path.stem) + return ParsedDocument( + content=content, + doc_type="markdown", + chunk_mode="markdown", + metadata={ + "title": title, + "parser": "markdown", + "file_extension": file_path.suffix.lower(), + "source_path": rel_path, + }, + ) + + def _parse_text(self, file_path: Path, rel_path: str) -> ParsedDocument: + content = self._read_text_with_fallbacks(file_path) + title = file_path.stem.replace("-", " ").replace("_", " ").strip() or file_path.name + return ParsedDocument( + content=content, + doc_type="text", + chunk_mode="plain_text", + metadata={ + "title": title, + "parser": "text", + "file_extension": file_path.suffix.lower(), + "source_path": rel_path, + }, + ) + + def _parse_pdf(self, file_path: Path, rel_path: str) -> ParsedDocument: + try: + from pypdf import PdfReader + except ImportError: + raise ImportError("pypdf library not installed. Install with: pip install pypdf") + + reader = PdfReader(str(file_path)) + parts: List[str] = [] + for page_num, page in enumerate(reader.pages, start=1): + page_text = (page.extract_text() or "").strip() + if page_text: + parts.append(f"# Page {page_num}\n\n{page_text}") + + content = "\n\n".join(parts).strip() + title = file_path.stem.replace("-", " ").replace("_", " ").strip() or file_path.name + return ParsedDocument( + content=content, + doc_type="pdf", + chunk_mode="pdf_pages", + metadata={ + "title": title, + "parser": "pdf", + "page_count": len(reader.pages), + "file_extension": file_path.suffix.lower(), + "source_path": rel_path, + }, + ) + + def _parse_word(self, file_path: Path, rel_path: str) -> ParsedDocument: + try: + from docx import Document + except ImportError: + raise ImportError("python-docx library not installed. 
Install with: pip install python-docx") + + doc = Document(str(file_path)) + parts: List[str] = [] + title: Optional[str] = None + + for para in doc.paragraphs: + text = para.text.strip() + if not text: + continue + + style_name = "" + try: + style_name = para.style.name or "" + except Exception: + style_name = "" + + normalized_style = style_name.lower() + if normalized_style.startswith("title") and not title: + title = text + parts.append(f"# {text}") + elif normalized_style.startswith("heading"): + level = self._extract_heading_level(style_name) + parts.append(f"{'#' * level} {text}") + else: + parts.append(text) + + for table_index, table in enumerate(doc.tables, start=1): + parts.append(f"## Table {table_index}") + for row in table.rows: + cells = [cell.text.strip().replace("\n", " / ") for cell in row.cells] + if any(cells): + parts.append(" | ".join(cells)) + + normalized = "\n\n".join(part for part in parts if part).strip() + if not title: + title = file_path.stem.replace("-", " ").replace("_", " ").strip() or file_path.name + + return ParsedDocument( + content=normalized, + doc_type="word", + chunk_mode="word_sections", + metadata={ + "title": title, + "parser": "word", + "file_extension": file_path.suffix.lower(), + "source_path": rel_path, + }, + ) + + def _parse_excel(self, file_path: Path, rel_path: str) -> ParsedDocument: + try: + from openpyxl import load_workbook + except ImportError: + raise ImportError("openpyxl library not installed. Install with: pip install openpyxl") + + workbook = load_workbook(str(file_path), read_only=True, data_only=True) + parts: List[str] = [] + non_empty_sheets = 0 + + for sheet_name in workbook.sheetnames: + sheet = workbook[sheet_name] + parts.append(f"# Sheet: {sheet_name}") + + rows = list(sheet.iter_rows(values_only=True)) + if not rows: + continue + + header = None + header_index = None + for idx, row in enumerate(rows): + cells = [str(cell).strip() if cell is not None else "" for cell in row] + if any(cells): + header = cells + header_index = idx + break + + if header is None: + continue + + non_empty_sheets += 1 + parts.append(" | ".join(header)) + + for row_num, row in enumerate(rows[header_index + 1 :], start=header_index + 2): + cells = [str(cell).strip() if cell is not None else "" for cell in row] + if not any(cells): + continue + row_pairs = [] + for col_index, cell_value in enumerate(cells): + column_name = header[col_index] if col_index < len(header) and header[col_index] else f"Column {col_index + 1}" + if cell_value: + row_pairs.append(f"{column_name}: {cell_value}") + if row_pairs: + parts.append(f"- Row {row_num}: " + " | ".join(row_pairs)) + + workbook.close() + + content = "\n\n".join(parts).strip() + title = file_path.stem.replace("-", " ").replace("_", " ").strip() or file_path.name + return ParsedDocument( + content=content, + doc_type="excel", + chunk_mode="spreadsheet", + metadata={ + "title": title, + "parser": "excel", + "sheet_count": len(workbook.sheetnames), + "non_empty_sheets": non_empty_sheets, + "file_extension": file_path.suffix.lower(), + "source_path": rel_path, + }, + ) + + @staticmethod + def _extract_heading_title(content: str, fallback: str) -> str: + for line in content.splitlines(): + stripped = line.strip() + if stripped.startswith("# "): + return stripped[2:].strip() + return fallback.replace("-", " ").replace("_", " ").strip() or fallback + + @staticmethod + def _read_text_with_fallbacks(file_path: Path) -> str: + encodings = ["utf-8", "utf-8-sig", "gbk", "gb2312", "latin-1"] + for encoding in 
encodings: + try: + return file_path.read_text(encoding=encoding) + except (UnicodeDecodeError, UnicodeError): + continue + raise ValueError(f"Unable to decode text file: {file_path}") + + @staticmethod + def _extract_heading_level(style_name: str) -> int: + digits = "".join(ch for ch in style_name if ch.isdigit()) + if digits: + try: + level = int(digits) + return min(max(level, 1), 6) + except ValueError: + return 2 + return 2 diff --git a/agent/memory/manager.py b/agent/memory/manager.py index 259742f05..8f729e33a 100644 --- a/agent/memory/manager.py +++ b/agent/memory/manager.py @@ -4,15 +4,19 @@ Provides high-level interface for memory operations """ +import asyncio import os +import re +import threading from typing import List, Optional, Dict, Any from pathlib import Path import hashlib -from datetime import datetime, timedelta +from datetime import datetime from agent.memory.config import MemoryConfig, get_default_memory_config from agent.memory.storage import MemoryStorage, MemoryChunk, SearchResult from agent.memory.chunker import TextChunker +from agent.memory.document_parser import DocumentParser, SUPPORTED_DOCUMENT_EXTENSIONS from agent.memory.embedding import create_embedding_provider, EmbeddingProvider from agent.memory.summarizer import MemoryFlushManager, create_memory_files_if_needed @@ -49,6 +53,7 @@ def __init__( max_tokens=self.config.chunk_max_tokens, overlap_tokens=self.config.chunk_overlap_tokens ) + self.document_parser = DocumentParser() # Initialize embedding provider (optional, prefer OpenAI, fallback to LinkAI) self.embedding_provider = None @@ -105,11 +110,14 @@ def __init__( self._init_workspace() self._dirty = False + self._imports_watch_thread: Optional[threading.Thread] = None + self._imports_watch_stop = threading.Event() def _init_workspace(self): """Initialize workspace directories""" memory_dir = self.config.get_memory_dir() memory_dir.mkdir(parents=True, exist_ok=True) + (self.config.get_workspace() / "knowledge" / "imports").mkdir(parents=True, exist_ok=True) # Create default memory files workspace_dir = self.config.get_workspace() @@ -138,6 +146,7 @@ async def search( """ max_results = max_results or self.config.max_results min_score = min_score or self.config.min_score + retrieval_plan = self._build_retrieval_plan(query, max_results) # Determine scopes scopes = [] @@ -163,7 +172,7 @@ async def search( query_embedding=query_embedding, user_id=user_id, scopes=scopes, - limit=max_results * 2 # Get more candidates for merging + limit=retrieval_plan["vector_limit"], ) logger.info(f"[MemoryManager] Vector search found {len(vector_results)} results for query: {query}") except Exception as e: @@ -175,7 +184,7 @@ async def search( query=query, user_id=user_id, scopes=scopes, - limit=max_results * 2 + limit=retrieval_plan["keyword_limit"], ) from common.log import logger logger.info(f"[MemoryManager] Keyword search found {len(keyword_results)} results for query: {query}") @@ -184,13 +193,21 @@ async def search( merged = self._merge_results( vector_results, keyword_results, - self.config.vector_weight, - self.config.keyword_weight + retrieval_plan["vector_weight"], + retrieval_plan["keyword_weight"], + query=query, + query_type=retrieval_plan["query_type"], ) - # Filter by min score and limit + # Filter by min score and diversify across documents so one file + # does not crowd out other highly relevant sources. 
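+        # Illustrative numbers: with max_results=5 and a "general" query,
+        # _diversify_results first keeps the best chunk from up to 3 distinct
+        # paths, then tops up remaining slots with at most 2 chunks per path.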
filtered = [r for r in merged if r.score >= min_score] - return filtered[:max_results] + diversified = self._diversify_results( + filtered, + max_results=max_results, + query_type=retrieval_plan["query_type"], + ) + return diversified[:max_results] async def add_memory( self, @@ -224,7 +241,7 @@ async def add_memory( path = f"memory/shared/memory_{content_hash}.md" # Chunk content - chunks = self.chunker.chunk_text(content) + chunks = self._chunk_document(content, path) # Generate embeddings (if provider available) texts = [chunk.text for chunk in chunks] @@ -240,6 +257,9 @@ async def add_memory( chunk_id = self._generate_chunk_id(path, chunk.start_line, chunk.end_line) chunk_hash = MemoryStorage.compute_hash(chunk.text) + chunk_metadata = dict(metadata or {}) + chunk_metadata.update(chunk.metadata) + memory_chunks.append(MemoryChunk( id=chunk_id, user_id=user_id, @@ -251,7 +271,7 @@ async def add_memory( text=chunk.text, embedding=embedding, hash=chunk_hash, - metadata=metadata + metadata=chunk_metadata )) # Save to storage @@ -280,11 +300,15 @@ async def sync(self, force: bool = False): # Scan MEMORY.md (workspace root) memory_file = Path(workspace_dir) / "MEMORY.md" if memory_file.exists(): - await self._sync_file(memory_file, "memory", "shared", None) + await self._sync_file(memory_file, "memory", "shared", None, force=force) # Scan memory directory (including daily summaries) if memory_dir.exists(): - for file_path in memory_dir.rglob("*.md"): + for file_path in memory_dir.rglob("*"): + if not file_path.is_file(): + continue + if file_path.suffix.lower() != ".md": + continue # Skip hidden directories (e.g. .dreams/) if any(part.startswith('.') for part in file_path.relative_to(workspace_dir).parts): continue @@ -315,29 +339,80 @@ async def sync(self, force: bool = False): user_id = None scope = "shared" - await self._sync_file(file_path, "memory", scope, user_id) + await self._sync_file(file_path, "memory", scope, user_id, force=force) - # Scan knowledge directory (structured knowledge wiki) + # Scan knowledge directory (structured knowledge wiki + imported files) from config import conf if conf().get("knowledge", True): knowledge_dir = Path(workspace_dir) / "knowledge" if knowledge_dir.exists(): - for file_path in knowledge_dir.rglob("*.md"): - await self._sync_file(file_path, "knowledge", "shared", None) + for file_path in knowledge_dir.rglob("*"): + if not file_path.is_file(): + continue + if file_path.suffix.lower() not in SUPPORTED_DOCUMENT_EXTENSIONS: + continue + await self._sync_file(file_path, "knowledge", "shared", None, force=force) self._dirty = False + + async def sync_imports(self, force: bool = False) -> Dict[str, Any]: + """Incrementally sync files under knowledge/imports and prune deleted files.""" + workspace_dir = self.config.get_workspace() + imports_dir = workspace_dir / "knowledge" / "imports" + imports_dir.mkdir(parents=True, exist_ok=True) + + indexed = 0 + current_paths = set() + for file_path in imports_dir.rglob("*"): + if not file_path.is_file(): + continue + if file_path.suffix.lower() not in SUPPORTED_DOCUMENT_EXTENSIONS: + continue + rel_path = str(file_path.relative_to(workspace_dir)).replace("\\", "/") + current_paths.add(rel_path) + await self._sync_file(file_path, "knowledge", "shared", None, force=force) + indexed += 1 + + indexed_paths = set(self.storage.list_indexed_paths(prefix="knowledge/imports/", source="knowledge")) + removed_paths = sorted(indexed_paths - current_paths) + for rel_path in removed_paths: + self.storage.delete_by_path(rel_path) 
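+            # Also drop the stored file-hash record so a re-created file at the
+            # same path is treated as new on the next sync pass.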
+ self.storage.delete_file_record(rel_path) + + self._dirty = False + return { + "imports_dir": "knowledge/imports", + "indexed_files": indexed, + "removed_files": removed_paths, + "watching": self._imports_watch_thread is not None and self._imports_watch_thread.is_alive(), + } async def _sync_file( self, file_path: Path, source: str, scope: str, - user_id: Optional[str] + user_id: Optional[str], + force: bool = False, ): """Sync a single file""" - # Compute file hash - content = file_path.read_text(encoding='utf-8') - file_hash = MemoryStorage.compute_hash(content) + try: + parsed_doc = self._parse_document_for_indexing(file_path, source) + except ImportError as e: + from common.log import logger + logger.warning(f"[MemoryManager] Skip indexing {file_path.name}: {e}") + return + except Exception as e: + from common.log import logger + logger.warning(f"[MemoryManager] Failed to parse {file_path.name}: {e}") + return + content = parsed_doc.content + if not content.strip(): + return + + # Compute file hash from raw bytes to reflect source file changes even if + # parser normalization keeps extracted text stable. + file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest() # Get relative path workspace_dir = self.config.get_workspace() @@ -345,14 +420,14 @@ async def _sync_file( # Check if file changed stored_hash = self.storage.get_file_hash(rel_path) - if stored_hash == file_hash: + if not force and stored_hash == file_hash: return # No changes # Delete old chunks self.storage.delete_by_path(rel_path) # Chunk and embed - chunks = self.chunker.chunk_text(content) + chunks = self._chunk_document(content, rel_path, parsed_doc) if not chunks: return @@ -368,6 +443,14 @@ async def _sync_file( chunk_id = self._generate_chunk_id(rel_path, chunk.start_line, chunk.end_line) chunk_hash = MemoryStorage.compute_hash(chunk.text) + chunk_metadata = self._build_chunk_metadata( + rel_path=rel_path, + source=source, + chunk=chunk, + content=content, + parsed_doc=parsed_doc, + ) + memory_chunks.append(MemoryChunk( id=chunk_id, user_id=user_id, @@ -379,7 +462,7 @@ async def _sync_file( text=chunk.text, embedding=embedding, hash=chunk_hash, - metadata=None + metadata=chunk_metadata )) # Save @@ -436,19 +519,170 @@ def get_status(self) -> Dict[str, Any]: 'files': stats['files'], 'workspace': str(self.config.get_workspace()), 'dirty': self._dirty, + 'imports_dir': 'knowledge/imports', + 'imports_watching': self._imports_watch_thread is not None and self._imports_watch_thread.is_alive(), 'embedding_enabled': self.embedding_provider is not None, 'embedding_provider': self.config.embedding_provider if self.embedding_provider else 'disabled', 'embedding_model': self.config.embedding_model if self.embedding_provider else 'N/A', - 'search_mode': 'hybrid (vector + keyword)' if self.embedding_provider else 'keyword only (FTS5)' + 'search_mode': 'hybrid (vector + keyword)' if self.embedding_provider else 'keyword only (FTS5)', + 'knowledge_supported_formats': sorted(SUPPORTED_DOCUMENT_EXTENSIONS), } def mark_dirty(self): """Mark memory as dirty (needs sync)""" self._dirty = True + + def start_import_watcher(self): + """Start a background poller for knowledge/imports incremental indexing.""" + if not self.config.imports_watch_enabled: + return + if self._imports_watch_thread and self._imports_watch_thread.is_alive(): + return + + self._imports_watch_stop.clear() + + def _watch_loop(): + from common.log import logger + + logger.info("[MemoryManager] knowledge/imports watcher started") + while not 
self._imports_watch_stop.wait(self.config.imports_watch_interval_sec): + try: + asyncio.run(self.sync_imports(force=False)) + except Exception as e: + logger.warning(f"[MemoryManager] knowledge/imports watcher error: {e}") + logger.info("[MemoryManager] knowledge/imports watcher stopped") + + self._imports_watch_thread = threading.Thread( + target=_watch_loop, + name="knowledge-imports-watcher", + daemon=True, + ) + self._imports_watch_thread.start() + + def stop_import_watcher(self): + """Stop the knowledge/imports watcher thread.""" + self._imports_watch_stop.set() + if self._imports_watch_thread and self._imports_watch_thread.is_alive(): + self._imports_watch_thread.join(timeout=1.0) + self._imports_watch_thread = None def close(self): """Close memory manager and release resources""" + self.stop_import_watcher() self.storage.close() + + def normalize_requested_path(self, path: str) -> str: + """Normalize a user-supplied memory/knowledge path inside the workspace.""" + if not path: + raise ValueError("path is required") + + normalized = path.replace("\\", "/").strip() + if ( + not normalized.startswith("memory/") + and not normalized.startswith("knowledge/") + and not normalized.startswith("/") + and normalized != "MEMORY.md" + ): + normalized = f"memory/{normalized}" + return normalized + + def resolve_workspace_path(self, path: str) -> tuple[str, Path]: + """Resolve a normalized path and ensure it stays inside the workspace.""" + normalized = self.normalize_requested_path(path) + workspace_dir = self.config.get_workspace().resolve() + file_path = (workspace_dir / normalized).resolve() + if file_path != workspace_dir and workspace_dir not in file_path.parents: + raise ValueError("Access denied: path outside workspace") + return normalized, file_path + + def inspect_document_chunks(self, path: str) -> Dict[str, Any]: + """Parse and chunk a document for UI inspection and citation generation.""" + rel_path, file_path = self.resolve_workspace_path(path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {rel_path}") + + parsed_doc = self._parse_document_for_indexing(file_path, "knowledge" if rel_path.startswith("knowledge/") else "memory") + chunks = self._chunk_document(parsed_doc.content, rel_path, parsed_doc=parsed_doc) + chunk_items = [] + for index, chunk in enumerate(chunks, start=1): + chunk_metadata = self._build_chunk_metadata( + rel_path=rel_path, + source="knowledge" if rel_path.startswith("knowledge/") else "memory", + chunk=chunk, + content=parsed_doc.content, + parsed_doc=parsed_doc, + ) + chunk_items.append( + { + "index": index, + "path": rel_path, + "start_line": chunk.start_line, + "end_line": chunk.end_line, + "citation": chunk_metadata.get("citation", ""), + "page_number": chunk_metadata.get("page_number"), + "section_title": chunk_metadata.get("section_title", ""), + "section_path": chunk_metadata.get("section_path", ""), + "title": chunk_metadata.get("title", ""), + "category": chunk_metadata.get("category", ""), + "doc_type": chunk_metadata.get("doc_type", parsed_doc.doc_type), + "chunk_mode": chunk_metadata.get("chunk_mode", parsed_doc.chunk_mode), + "metadata": chunk_metadata, + "content": chunk.text, + "preview": chunk.text[:320] + ("..." 
if len(chunk.text) > 320 else ""), + } + ) + + return { + "path": rel_path, + "doc_type": parsed_doc.doc_type, + "chunk_mode": parsed_doc.chunk_mode, + "title": parsed_doc.metadata.get("title"), + "total_chunks": len(chunk_items), + "chunks": chunk_items, + } + + def read_document_range(self, path: str, start_line: int = 1, num_lines: Optional[int] = None) -> Dict[str, Any]: + """Read a normalized document slice and attach stable citation metadata.""" + rel_path, file_path = self.resolve_workspace_path(path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {rel_path}") + + parsed_doc = self._parse_document_for_indexing(file_path, "knowledge" if rel_path.startswith("knowledge/") else "memory") + lines = parsed_doc.content.split("\n") + total_lines = len(lines) + if start_line < 1: + start_line = 1 + start_idx = start_line - 1 + end_idx = total_lines if not num_lines else min(total_lines, start_idx + max(num_lines, 0)) + selected_lines = lines[start_idx:end_idx] + end_line = start_line + max(len(selected_lines) - 1, 0) + + chunk_info = self.inspect_document_chunks(rel_path) + selected_chunk = None + best_overlap = -1 + for chunk in chunk_info["chunks"]: + overlap = min(end_line, chunk["end_line"]) - max(start_line, chunk["start_line"]) + 1 + if overlap > best_overlap: + best_overlap = overlap + selected_chunk = chunk + + metadata = dict((selected_chunk or {}).get("metadata") or {}) + if "citation" not in metadata: + metadata["citation"] = self.build_citation(rel_path, start_line, end_line, metadata) + + return { + "path": rel_path, + "doc_type": parsed_doc.doc_type, + "chunk_mode": parsed_doc.chunk_mode, + "title": parsed_doc.metadata.get("title"), + "content": "\n".join(selected_lines), + "start_line": start_line, + "end_line": end_line, + "shown_lines": len(selected_lines), + "total_lines": total_lines, + "citation": metadata.get("citation", ""), + "metadata": metadata, + } # Helper methods @@ -493,18 +727,20 @@ def _merge_results( vector_results: List[SearchResult], keyword_results: List[SearchResult], vector_weight: float, - keyword_weight: float + keyword_weight: float, + query: str = "", + query_type: str = "general", ) -> List[SearchResult]: - """Merge vector and keyword search results with temporal decay for dated files""" + """Merge and rerank search results with temporal and source-aware scoring.""" merged_map = {} for result in vector_results: key = (result.path, result.start_line, result.end_line) merged_map[key] = { - 'result': result, - 'vector_score': result.score, - 'keyword_score': 0.0 - } + 'result': result, + 'vector_score': result.score, + 'keyword_score': 0.0 + } for result in keyword_results: key = (result.path, result.start_line, result.end_line) @@ -516,7 +752,7 @@ def _merge_results( 'vector_score': 0.0, 'keyword_score': result.score } - + merged_results = [] for entry in merged_map.values(): combined_score = ( @@ -528,16 +764,332 @@ def _merge_results( result = entry['result'] decay = self._compute_temporal_decay(result.path) combined_score *= decay + + rerank_bonus = self._compute_rerank_bonus( + result=result, + query=query, + query_type=query_type, + vector_score=entry['vector_score'], + keyword_score=entry['keyword_score'], + ) + final_score = combined_score + rerank_bonus merged_results.append(SearchResult( path=result.path, start_line=result.start_line, end_line=result.end_line, - score=combined_score, + score=final_score, snippet=result.snippet, source=result.source, - user_id=result.user_id + user_id=result.user_id, + 
metadata=result.metadata, )) merged_results.sort(key=lambda r: r.score, reverse=True) return merged_results + + @staticmethod + def _target_unique_paths(max_results: int, query_type: str) -> int: + """Determine how many distinct documents to prioritize in top hits.""" + if max_results <= 1: + return 1 + if query_type in {"location", "exact"}: + return min(2, max_results) + return min(3, max_results) + + def _diversify_results( + self, + results: List[SearchResult], + max_results: int, + query_type: str, + ) -> List[SearchResult]: + """ + Prefer multiple distinct files in top results before adding extra + chunks from the same document. This helps the agent inspect parallel + manuals/checklists instead of stopping at the first matching file. + """ + if not results or max_results <= 1: + return results[:max_results] + + target_unique_paths = self._target_unique_paths(max_results, query_type) + per_path_cap = 2 if max_results > 2 else 1 + + chosen: List[SearchResult] = [] + path_counts: Dict[str, int] = {} + + # Pass 1: collect the best hit from as many different paths as possible. + for result in results: + if path_counts.get(result.path, 0) > 0: + continue + chosen.append(result) + path_counts[result.path] = 1 + if len(chosen) >= min(target_unique_paths, max_results): + break + + # Pass 2: fill remaining slots, but cap repeated chunks per path. + for result in results: + if len(chosen) >= max_results: + break + if path_counts.get(result.path, 0) >= per_path_cap: + continue + if any( + existing.path == result.path + and existing.start_line == result.start_line + and existing.end_line == result.end_line + for existing in chosen + ): + continue + chosen.append(result) + path_counts[result.path] = path_counts.get(result.path, 0) + 1 + + return chosen + + def _chunk_document(self, content: str, rel_path: str, parsed_doc=None): + """Chunk a document with source-aware metadata and parser-specific strategy.""" + if parsed_doc is None: + base_metadata = self._build_document_metadata(rel_path, content) + chunk_mode = "markdown" if rel_path.endswith(".md") else "plain_text" + else: + base_metadata = self._build_document_metadata(rel_path, content, parsed_doc=parsed_doc) + chunk_mode = parsed_doc.chunk_mode + return self.chunker.chunk_document(content, chunk_mode=chunk_mode, metadata=base_metadata) + + def _build_document_metadata(self, rel_path: str, content: str, parsed_doc=None) -> Dict[str, Any]: + """Build document-level metadata used for chunk indexing and reranking.""" + first_heading = "" + for line in content.splitlines(): + stripped = line.strip() + if stripped.startswith("# "): + first_heading = stripped[2:].strip() + break + + path_obj = Path(rel_path) + source_type = self._detect_source_type(rel_path) + category = path_obj.parent.name if len(path_obj.parts) > 1 else source_type + title = None + if parsed_doc: + title = parsed_doc.metadata.get("title") + title = title or first_heading or path_obj.stem.replace("-", " ").replace("_", " ").strip() or path_obj.name + + metadata = { + "title": title, + "source_type": source_type, + "category": category, + "is_evergreen": source_type in {"memory_core", "knowledge_page"}, + } + if parsed_doc: + metadata.update(parsed_doc.metadata) + metadata["doc_type"] = parsed_doc.doc_type + metadata["chunk_mode"] = parsed_doc.chunk_mode + return metadata + + def _build_chunk_metadata( + self, + rel_path: str, + source: str, + chunk, + content: str, + parsed_doc=None, + ) -> Dict[str, Any]: + """Merge document metadata with chunk-level metadata.""" + doc_metadata 
= self._build_document_metadata(rel_path, content, parsed_doc=parsed_doc) + chunk_metadata = dict(doc_metadata) + chunk_metadata.update(chunk.metadata or {}) + chunk_metadata["source"] = source + return self._attach_reference_metadata( + rel_path=rel_path, + start_line=chunk.start_line, + end_line=chunk.end_line, + metadata=chunk_metadata, + ) + + @staticmethod + def _extract_page_number(metadata: Dict[str, Any]) -> Optional[int]: + """Infer page number from parsed metadata and section titles.""" + page_number = metadata.get("page_number") + if isinstance(page_number, int): + return page_number + + for candidate in (metadata.get("section_title"), metadata.get("title")): + if not candidate: + continue + match = re.search(r"\bpage\s+(\d+)\b", str(candidate), re.IGNORECASE) + if match: + return int(match.group(1)) + return None + + def _attach_reference_metadata( + self, + rel_path: str, + start_line: int, + end_line: int, + metadata: Dict[str, Any], + ) -> Dict[str, Any]: + """Normalize section/page metadata and attach a stable citation string.""" + normalized = dict(metadata or {}) + page_number = self._extract_page_number(normalized) + if page_number is not None: + normalized["page_number"] = page_number + + parent_titles = normalized.get("parent_titles") or [] + if isinstance(parent_titles, list): + section_bits = [str(item).strip() for item in parent_titles if str(item).strip()] + else: + section_bits = [str(parent_titles).strip()] if str(parent_titles).strip() else [] + section_title = str(normalized.get("section_title", "")).strip() + if section_title: + section_bits.append(section_title) + section_path = " > ".join(section_bits) + if section_path: + normalized["section_path"] = section_path + + normalized["citation"] = self.build_citation( + path=rel_path, + start_line=start_line, + end_line=end_line, + metadata=normalized, + ) + return normalized + + def build_citation( + self, + path: str, + start_line: int, + end_line: int, + metadata: Optional[Dict[str, Any]] = None, + ) -> str: + """Build a stable inline citation string for final answers and tools.""" + metadata = metadata or {} + anchors = [] + page_number = self._extract_page_number(metadata) + if page_number is not None: + anchors.append(f"page={page_number}") + + section_path = str(metadata.get("section_path") or metadata.get("section_title") or "").strip() + if section_path: + safe_section = re.sub(r"\s+", "-", section_path) + safe_section = re.sub(r"[^0-9A-Za-z_\-\u4e00-\u9fff>]", "", safe_section) + if safe_section: + anchors.append(f"section={safe_section}") + + anchors.append(f"L{start_line}-L{end_line}") + return f"[{path}#{'#'.join(anchors)}]" + + @staticmethod + def _detect_source_type(rel_path: str) -> str: + """Infer source type from path.""" + normalized = rel_path.replace("\\", "/") + if normalized == "MEMORY.md" or normalized.endswith("/MEMORY.md"): + return "memory_core" + if normalized.startswith("knowledge/"): + return "knowledge_page" + if "/dreams/" in normalized or normalized.startswith("memory/dreams/"): + return "memory_dream" + if re.search(r"\d{4}-\d{2}-\d{2}\.md$", normalized): + return "memory_daily" + if normalized.startswith("memory/users/"): + return "memory_user" + return "memory_note" + + def _parse_document_for_indexing(self, file_path: Path, source: str): + """Parse a source file into normalized content for indexing.""" + workspace_dir = self.config.get_workspace() + rel_path = str(file_path.relative_to(workspace_dir)).replace("\\", "/") + return self.document_parser.parse(file_path, rel_path) 
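+
+    # Shape of the citation string produced by build_citation() above, with
+    # hypothetical values for illustration:
+    #   [knowledge/imports/manual.pdf#page=3#section=Setup#L42-L57]
+    # The page anchor appears only when a page number can be inferred; the
+    # section anchor is the heading path with whitespace collapsed to "-".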
+ + def _build_retrieval_plan(self, query: str, max_results: int) -> Dict[str, Any]: + """Classify the query and derive retrieval weights/limits.""" + query_type = self._classify_query(query) + vector_weight = self.config.vector_weight + keyword_weight = self.config.keyword_weight + + if query_type == "recent_memory": + vector_weight = 0.55 + keyword_weight = 0.45 + elif query_type == "core_memory": + vector_weight = 0.50 + keyword_weight = 0.50 + elif query_type == "knowledge": + vector_weight = 0.75 + keyword_weight = 0.25 + elif query_type == "exact_lookup": + vector_weight = 0.35 + keyword_weight = 0.65 + + candidate_limit = max(max_results * 3, 12) + return { + "query_type": query_type, + "vector_weight": vector_weight, + "keyword_weight": keyword_weight, + "vector_limit": candidate_limit, + "keyword_limit": candidate_limit, + } + + @staticmethod + def _classify_query(query: str) -> str: + """Lightweight query classifier for source-aware retrieval.""" + normalized = query.lower() + + recent_patterns = ["昨天", "刚才", "最近", "前几天", "上次", "today", "yesterday", "recent", "earlier"] + core_patterns = ["偏好", "习惯", "记住", "总是", "不要", "preference", "prefer", "always", "never"] + knowledge_patterns = ["什么是", "原理", "概念", "总结", "架构", "how", "what is", "concept", "architecture"] + exact_patterns = ["文件", "文档", "标题", ".md", "path", "file", "document", "title"] + + if any(token in normalized for token in recent_patterns): + return "recent_memory" + if any(token in normalized for token in core_patterns): + return "core_memory" + if any(token in normalized for token in knowledge_patterns): + return "knowledge" + if any(token in normalized for token in exact_patterns) or "/" in normalized: + return "exact_lookup" + return "general" + + def _compute_rerank_bonus( + self, + result: SearchResult, + query: str, + query_type: str, + vector_score: float, + keyword_score: float, + ) -> float: + """Apply lightweight source-aware reranking.""" + metadata = result.metadata or {} + source_type = metadata.get("source_type", "") + title = str(metadata.get("title", "")).lower() + section_title = str(metadata.get("section_title", "")).lower() + category = str(metadata.get("category", "")).lower() + normalized_query = query.lower() + query_terms = [term for term in re.findall(r"[\w\u4e00-\u9fff]{2,}", normalized_query) if term] + + bonus = 0.0 + + if source_type == "memory_core": + bonus += 0.14 if query_type == "core_memory" else 0.03 + elif source_type == "memory_daily": + bonus += 0.14 if query_type == "recent_memory" else 0.02 + elif source_type == "knowledge_page": + bonus += 0.16 if query_type == "knowledge" else 0.04 + + title_hits = sum(1 for term in query_terms if term in title) + section_hits = sum(1 for term in query_terms if term in section_title) + category_hits = sum(1 for term in query_terms if term in category) + + bonus += min(title_hits * 0.06, 0.18) + bonus += min(section_hits * 0.04, 0.12) + bonus += min(category_hits * 0.03, 0.06) + + if result.path.lower() in normalized_query or title and title in normalized_query: + bonus += 0.08 + + if vector_score > 0 and keyword_score > 0: + bonus += 0.05 + + parent_titles = metadata.get("parent_titles") or [] + if isinstance(parent_titles, list): + for parent in parent_titles: + if any(term in str(parent).lower() for term in query_terms): + bonus += 0.02 + break + + return bonus diff --git a/agent/memory/service.py b/agent/memory/service.py index eb565b138..9a3c927a7 100644 --- a/agent/memory/service.py +++ b/agent/memory/service.py @@ -13,6 +13,8 @@ from 
datetime import datetime from typing import Dict, List, Optional from pathlib import Path +from agent.memory.config import MemoryConfig +from agent.memory.manager import MemoryManager from common.log import logger @@ -28,6 +30,14 @@ def __init__(self, workspace_root: str): """ self.workspace_root = workspace_root self.memory_dir = os.path.join(workspace_root, "memory") + self._memory_manager: Optional[MemoryManager] = None + + @property + def memory_manager(self) -> MemoryManager: + """Lazy-init memory manager for index maintenance.""" + if self._memory_manager is None: + self._memory_manager = MemoryManager(MemoryConfig(workspace_root=self.workspace_root)) + return self._memory_manager # ------------------------------------------------------------------ # list — paginated file metadata @@ -117,6 +127,32 @@ def get_content(self, filename: str, category: str = "memory") -> dict: "content": content, } + def delete_file(self, filename: str, category: str = "memory") -> dict: + """ + Delete a memory/dream file. MEMORY.md is truncated instead of removed. + """ + path = self._resolve_path(filename, category) + if not os.path.isfile(path): + raise FileNotFoundError(f"Memory file not found: {filename}") + + rel_path = self._to_workspace_relative(path) + if filename == "MEMORY.md": + with open(path, "w", encoding="utf-8") as f: + f.write("") + action = "cleared" + else: + os.remove(path) + action = "deleted" + + self.memory_manager.storage.delete_by_path(rel_path) + self.memory_manager.storage.delete_file_record(rel_path) + + return { + "filename": filename, + "path": rel_path, + "action": action, + } + # ------------------------------------------------------------------ # dispatch — single entry point for protocol messages # ------------------------------------------------------------------ @@ -145,6 +181,14 @@ def dispatch(self, action: str, payload: Optional[dict] = None) -> dict: result_payload = self.get_content(filename, category=category) return {"action": action, "code": 200, "message": "success", "payload": result_payload} + elif action == "delete": + filename = payload.get("filename") + if not filename: + return {"action": action, "code": 400, "message": "filename is required", "payload": None} + category = payload.get("category", "memory") + result_payload = self.delete_file(filename, category=category) + return {"action": action, "code": 200, "message": "success", "payload": result_payload} + else: return {"action": action, "code": 400, "message": f"unknown action: {action}", "payload": None} @@ -195,3 +239,6 @@ def _file_info(path: str, filename: str, file_type: str) -> dict: "size": stat.st_size, "updated_at": updated_at, } + + def _to_workspace_relative(self, path: str) -> str: + return str(Path(path).resolve().relative_to(Path(self.workspace_root).resolve())).replace("\\", "/") diff --git a/agent/memory/storage.py b/agent/memory/storage.py index 8ff0504af..54d4fd0fe 100644 --- a/agent/memory/storage.py +++ b/agent/memory/storage.py @@ -8,6 +8,7 @@ import sqlite3 import json import hashlib +import re from typing import List, Dict, Optional, Any from pathlib import Path from dataclasses import dataclass @@ -39,6 +40,7 @@ class SearchResult: snippet: str source: str user_id: Optional[str] = None + metadata: Optional[Dict[str, Any]] = None class MemoryStorage: @@ -235,6 +237,17 @@ def save_chunks_batch(self, chunks: List[MemoryChunk]): for c in chunks ]) self.conn.commit() + + def rebuild_fts(self): + """Rebuild FTS index from the content table after schema-compatible updates.""" + if 
not self.fts5_available: + return + try: + self.conn.execute("INSERT INTO chunks_fts(chunks_fts) VALUES('rebuild')") + self.conn.commit() + except Exception: + # Best-effort rebuild. Search still has LIKE fallback. + pass def get_chunk(self, chunk_id: str) -> Optional[MemoryChunk]: """Get a chunk by ID""" @@ -305,7 +318,8 @@ def search_vector( score=score, snippet=self._truncate_text(row['text'], 500), source=row['source'], - user_id=row['user_id'] + user_id=row['user_id'], + metadata=json.loads(row['metadata']) if row['metadata'] else None, ) for score, row in results ] @@ -328,18 +342,21 @@ def search_keyword( scopes = ["shared"] if user_id: scopes.append("user") + + metadata_results = self._search_metadata_path(query, user_id, scopes, limit) # Try FTS5 search first (if available) if self.fts5_available: fts_results = self._search_fts5(query, user_id, scopes, limit) if fts_results: - return fts_results + return self._merge_keyword_result_sets(metadata_results, fts_results, limit) # Fallback to LIKE search (always for CJK, or if FTS5 not available) if not self.fts5_available or MemoryStorage._contains_cjk(query): - return self._search_like(query, user_id, scopes, limit) + like_results = self._search_like(query, user_id, scopes, limit) + return self._merge_keyword_result_sets(metadata_results, like_results, limit) - return [] + return metadata_results[:limit] def _search_fts5( self, @@ -390,7 +407,8 @@ def _search_fts5( score=self._bm25_rank_to_score(row['rank']), snippet=self._truncate_text(row['text'], 500), source=row['source'], - user_id=row['user_id'] + user_id=row['user_id'], + metadata=json.loads(row['metadata']) if row['metadata'] else None, ) for row in rows ] @@ -451,12 +469,90 @@ def _search_like( score=0.5, # Fixed score for LIKE search snippet=self._truncate_text(row['text'], 500), source=row['source'], - user_id=row['user_id'] + user_id=row['user_id'], + metadata=json.loads(row['metadata']) if row['metadata'] else None, + ) + for row in rows + ] + except Exception: + return [] + + def _search_metadata_path( + self, + query: str, + user_id: Optional[str], + scopes: List[str], + limit: int, + ) -> List[SearchResult]: + """Search chunk metadata and path for exact-lookups and title hits.""" + terms = [term for term in re.findall(r"[\w\u4e00-\u9fff\-.\/]{2,}", query.lower()) if term] + if not terms: + return [] + + scope_placeholders = ",".join("?" * len(scopes)) + conditions = [] + params: List[Any] = [] + for term in terms[:6]: + conditions.append("(LOWER(path) LIKE ? OR LOWER(COALESCE(metadata, '')) LIKE ?)") + like = f"%{term}%" + params.extend([like, like]) + + if not conditions: + return [] + + params.extend(scopes) + where_clause = " OR ".join(conditions) + if user_id: + sql_query = f""" + SELECT * FROM chunks + WHERE ({where_clause}) + AND scope IN ({scope_placeholders}) + AND (scope = 'shared' OR user_id = ?) + LIMIT ? + """ + params.extend([user_id, limit]) + else: + sql_query = f""" + SELECT * FROM chunks + WHERE ({where_clause}) + AND scope IN ({scope_placeholders}) + LIMIT ? 
+ """ + params.append(limit) + + try: + rows = self.conn.execute(sql_query, params).fetchall() + return [ + SearchResult( + path=row["path"], + start_line=row["start_line"], + end_line=row["end_line"], + score=0.65, + snippet=self._truncate_text(row["text"], 500), + source=row["source"], + user_id=row["user_id"], + metadata=json.loads(row["metadata"]) if row["metadata"] else None, ) for row in rows ] except Exception: return [] + + @staticmethod + def _merge_keyword_result_sets( + primary: List[SearchResult], + secondary: List[SearchResult], + limit: int, + ) -> List[SearchResult]: + """Deduplicate multiple keyword recall sets while preserving score ordering.""" + merged: Dict[tuple, SearchResult] = {} + for result in primary + secondary: + key = (result.path, result.start_line, result.end_line) + existing = merged.get(key) + if existing is None or result.score > existing.score: + merged[key] = result + ordered = sorted(merged.values(), key=lambda item: item.score, reverse=True) + return ordered[:limit] def delete_by_path(self, path: str): """Delete all chunks from a file""" @@ -464,6 +560,40 @@ def delete_by_path(self, path: str): DELETE FROM chunks WHERE path = ? """, (path,)) self.conn.commit() + + def delete_file_record(self, path: str): + """Delete stored file metadata for a path.""" + self.conn.execute(""" + DELETE FROM files WHERE path = ? + """, (path,)) + self.conn.commit() + + def list_indexed_paths(self, prefix: Optional[str] = None, source: Optional[str] = None) -> List[str]: + """List indexed file paths, optionally filtered by prefix/source.""" + conditions = [] + params: List[Any] = [] + if prefix: + conditions.append("path LIKE ?") + params.append(f"{prefix}%") + if source: + conditions.append("source = ?") + params.append(source) + + where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else "" + rows = self.conn.execute( + f"SELECT path FROM files {where_clause} ORDER BY path ASC", + params, + ).fetchall() + return [row["path"] for row in rows] + + def list_chunks_by_path(self, path: str) -> List[MemoryChunk]: + """List all chunks for a file ordered by line range.""" + rows = self.conn.execute(""" + SELECT * FROM chunks + WHERE path = ? + ORDER BY start_line ASC, end_line ASC + """, (path,)).fetchall() + return [self._row_to_chunk(row) for row in rows] def get_file_hash(self, path: str) -> Optional[str]: """Get stored file hash""" @@ -548,7 +678,6 @@ def _cosine_similarity(vec1: List[float], vec2: List[float]) -> float: @staticmethod def _contains_cjk(text: str) -> bool: """Check if text contains CJK (Chinese/Japanese/Korean) characters""" - import re return bool(re.search(r'[\u4e00-\u9fff]', text)) @staticmethod @@ -559,7 +688,6 @@ def _build_fts_query(raw_query: str) -> Optional[str]: Works best for English and word-based languages. For CJK characters, LIKE search will be used as fallback. 
""" - import re # Extract words (primarily English words and numbers) tokens = re.findall(r'[A-Za-z0-9_]+', raw_query) if not tokens: diff --git a/agent/prompt/builder.py b/agent/prompt/builder.py index 0856db70f..79f61929c 100644 --- a/agent/prompt/builder.py +++ b/agent/prompt/builder.py @@ -220,6 +220,16 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]: "", ] + lines.extend([ + "### RAG Retrieval Rules", + "", + "- For knowledge-base questions, do not stop after reading the first matching document.", + "- If `memory_search` returns multiple relevant knowledge files, you must inspect at least the top 2 distinct files with `memory_get` before answering.", + "- If the result list includes complementary documents such as a manual, checklist, FAQ, issue list, troubleshooting guide, or spec, compare them and synthesize the answer across sources.", + "- In the final answer, cite every source you actually relied on. Do not claim coverage from a single file when multiple relevant files were found.", + "", + ]) + return lines diff --git a/agent/tools/__init__.py b/agent/tools/__init__.py index 8bfc6de92..8d2cc7ded 100644 --- a/agent/tools/__init__.py +++ b/agent/tools/__init__.py @@ -9,6 +9,7 @@ from agent.tools.bash.bash import Bash from agent.tools.ls.ls import Ls from agent.tools.send.send import Send +from agent.tools.knowledge_import.knowledge_import import KnowledgeImportTool # Import memory tools from agent.tools.memory.memory_search import MemorySearchTool @@ -117,6 +118,7 @@ def _import_browser_tool(): 'Bash', 'Ls', 'Send', + 'KnowledgeImportTool', 'MemorySearchTool', 'MemoryGetTool', 'EnvConfig', diff --git a/agent/tools/knowledge_import/__init__.py b/agent/tools/knowledge_import/__init__.py new file mode 100644 index 000000000..a614e0641 --- /dev/null +++ b/agent/tools/knowledge_import/__init__.py @@ -0,0 +1,3 @@ +from agent.tools.knowledge_import.knowledge_import import KnowledgeImportTool + +__all__ = ["KnowledgeImportTool"] diff --git a/agent/tools/knowledge_import/knowledge_import.py b/agent/tools/knowledge_import/knowledge_import.py new file mode 100644 index 000000000..e1649c054 --- /dev/null +++ b/agent/tools/knowledge_import/knowledge_import.py @@ -0,0 +1,184 @@ +""" +Knowledge import tool. + +Copies or moves uploaded files into knowledge/imports and triggers incremental +RAG indexing so the file becomes visible in the knowledge view. +""" + +import asyncio +import os +import re +import shutil +import threading +from pathlib import Path +from typing import Any, Dict + +from agent.memory.document_parser import SUPPORTED_DOCUMENT_EXTENSIONS +from agent.tools.base_tool import BaseTool, ToolResult +from common.utils import expand_path + + +class KnowledgeImportTool(BaseTool): + """Import an existing local file into knowledge/imports.""" + + name: str = "knowledge_import" + description: str = ( + "Import a local file into knowledge/imports so it becomes part of the " + "knowledge base and RAG index. Use this for uploaded PDF/Word/Excel/" + "Markdown/Text files that should be stored in the knowledge library." + ) + params: dict = { + "type": "object", + "properties": { + "source_path": { + "type": "string", + "description": "Existing local file path, typically an uploaded file under tmp/" + }, + "target_subdir": { + "type": "string", + "description": "Optional subdirectory under knowledge/imports/ (e.g. 'sales/2026-q2')" + }, + "filename": { + "type": "string", + "description": "Optional target filename. Keeps original extension if omitted." 
+ }, + "move": { + "type": "boolean", + "description": "Move the source file instead of copying it. Default: false", + "default": False + } + }, + "required": ["source_path"] + } + + def __init__(self, config: dict = None): + self.config = config or {} + self.memory_manager = self.config.get("memory_manager") + + def execute(self, args: Dict[str, Any]) -> ToolResult: + source_path = str(args.get("source_path", "")).strip() + target_subdir = str(args.get("target_subdir", "")).strip().strip("/\\") + filename = str(args.get("filename", "")).strip() + move = bool(args.get("move", False)) + + if not source_path: + return ToolResult.fail("Error: source_path parameter is required") + + absolute_source = Path(expand_path(source_path)).resolve() + if not absolute_source.exists() or not absolute_source.is_file(): + return ToolResult.fail(f"Error: source file not found: {source_path}") + + suffix = absolute_source.suffix.lower() + if suffix not in SUPPORTED_DOCUMENT_EXTENSIONS: + return ToolResult.fail( + f"Error: unsupported file type for knowledge import: {suffix}. " + f"Supported: {', '.join(sorted(SUPPORTED_DOCUMENT_EXTENSIONS))}" + ) + + workspace_root = self._get_workspace_root() + if not workspace_root: + return ToolResult.fail("Error: workspace root is not configured") + + safe_subdir = self._sanitize_subdir(target_subdir) + target_dir = workspace_root / "knowledge" / "imports" + if safe_subdir: + target_dir = target_dir / safe_subdir + target_dir.mkdir(parents=True, exist_ok=True) + + target_name = self._sanitize_filename(filename) if filename else absolute_source.name + if not Path(target_name).suffix: + target_name += suffix + if Path(target_name).suffix.lower() != suffix: + target_name = f"{Path(target_name).stem}{suffix}" + + target_path = self._dedupe_target_path(target_dir / target_name) + + try: + if move: + shutil.move(str(absolute_source), str(target_path)) + else: + shutil.copy2(str(absolute_source), str(target_path)) + + reindex_result = None + if self.memory_manager: + reindex_result = self._run_sync_imports() + + rel_path = target_path.relative_to(workspace_root).as_posix() + result = { + "message": f"Imported file into knowledge base: {rel_path}", + "path": rel_path, + "source_path": str(absolute_source), + "doc_type": suffix.lstrip("."), + "moved": move, + } + if reindex_result: + result["indexing"] = reindex_result + return ToolResult.success(result) + except Exception as e: + return ToolResult.fail(f"Error importing knowledge file: {str(e)}") + + def _run_sync_imports(self): + """Run incremental imports sync from a sync tool context.""" + try: + return asyncio.run(self.memory_manager.sync_imports(force=True)) + except RuntimeError: + result_holder = {} + error_holder = [] + + def _runner(): + loop = asyncio.new_event_loop() + try: + asyncio.set_event_loop(loop) + result_holder["result"] = loop.run_until_complete( + self.memory_manager.sync_imports(force=True) + ) + except Exception as e: + error_holder.append(e) + finally: + loop.close() + + thread = threading.Thread(target=_runner, daemon=True) + thread.start() + thread.join() + if error_holder: + raise error_holder[0] + return result_holder.get("result") + + def _get_workspace_root(self) -> Path: + if self.memory_manager: + return self.memory_manager.config.get_workspace().resolve() + cwd = self.config.get("cwd") or os.getcwd() + return Path(expand_path(cwd)).resolve() + + @staticmethod + def _sanitize_subdir(value: str) -> str: + if not value: + return "" + normalized = value.replace("\\", "/").strip("/") + normalized = 
re.sub(r"/+", "/", normalized) + parts = [] + for part in normalized.split("/"): + part = re.sub(r"[^0-9A-Za-z_\-\u4e00-\u9fff.]", "-", part).strip(".-") + if not part or part in {".", ".."}: + continue + parts.append(part) + return "/".join(parts) + + @staticmethod + def _sanitize_filename(value: str) -> str: + sanitized = re.sub(r"[^0-9A-Za-z_\-\u4e00-\u9fff.]", "-", value).strip(".-") + return sanitized or "imported-file" + + @staticmethod + def _dedupe_target_path(path: Path) -> Path: + if not path.exists(): + return path + stem = path.stem + suffix = path.suffix + parent = path.parent + index = 2 + while True: + candidate = parent / f"{stem}-{index}{suffix}" + if not candidate.exists(): + return candidate + index += 1 diff --git a/agent/tools/memory/memory_get.py b/agent/tools/memory/memory_get.py index ec466849b..a9bd55d3b 100644 --- a/agent/tools/memory/memory_get.py +++ b/agent/tools/memory/memory_get.py @@ -78,50 +78,35 @@ def execute(self, args: dict): return ToolResult.fail("Error: path parameter is required") try: - workspace_dir = self.memory_manager.config.get_workspace() - - # Auto-prepend memory/ if not present and not absolute path - # Exceptions: MEMORY.md in root, knowledge/ files at workspace root - if not path.startswith('memory/') and not path.startswith('knowledge/') and not path.startswith('/') and path != 'MEMORY.md': - path = f'memory/{path}' - - file_path = (workspace_dir / path).resolve() - workspace_resolved = workspace_dir.resolve() - - if not str(file_path).startswith(str(workspace_resolved) + '/') and file_path != workspace_resolved: - return ToolResult.fail(f"Error: Access denied: path outside workspace") - - if not file_path.exists(): - return ToolResult.fail(f"Error: File not found: {path}") - - content = file_path.read_text(encoding='utf-8') - lines = content.split('\n') - - # Handle line range - if start_line < 1: - start_line = 1 - - start_idx = start_line - 1 - - if num_lines: - end_idx = start_idx + num_lines - selected_lines = lines[start_idx:end_idx] - else: - selected_lines = lines[start_idx:] - - result = '\n'.join(selected_lines) - - # Add metadata - total_lines = len(lines) - shown_lines = len(selected_lines) - + result = self.memory_manager.read_document_range( + path=path, + start_line=start_line, + num_lines=num_lines, + ) + metadata = result.get("metadata") or {} + section_title = metadata.get("section_title") or "" + section_path = metadata.get("section_path") or "" + page_number = metadata.get("page_number") output = [ - f"File: {path}", - f"Lines: {start_line}-{start_line + shown_lines - 1} (total: {total_lines})", + f"File: {result['path']}", + f"Lines: {result['start_line']}-{result['end_line']} (total: {result['total_lines']})", + f"Citation: {result['citation']}", "", - result ] - + + if result.get("title"): + output.append(f"Title: {result['title']}") + if page_number is not None: + output.append(f"Page: {page_number}") + if section_path: + output.append(f"Section Path: {section_path}") + elif section_title: + output.append(f"Section: {section_title}") + output.extend([ + "", + result["content"], + ]) + return ToolResult.success('\n'.join(output)) except Exception as e: diff --git a/agent/tools/memory/memory_search.py b/agent/tools/memory/memory_search.py index 1387d11c2..2ce91f898 100644 --- a/agent/tools/memory/memory_search.py +++ b/agent/tools/memory/memory_search.py @@ -95,13 +95,51 @@ def execute(self, args: dict): f"You can store new memories by writing to MEMORY.md or memory/YYYY-MM-DD.md files." 
) - # Format results - output = [f"Found {len(results)} relevant memories:\n"] - + output = [ + "[RAG_RESULTS]", + f"Query: {query}", + f"Hits: {len(results)}", + f"Distinct Sources: {len({result.path for result in results})}", + "", + ] + + distinct_paths = [] + for result in results: + if result.path not in distinct_paths: + distinct_paths.append(result.path) + + if len(distinct_paths) > 1: + output.append( + "Next step: read at least the top 2 distinct relevant files with memory_get before answering." + ) + output.append("") + for i, result in enumerate(results, 1): - output.append(f"\n{i}. {result.path} (lines {result.start_line}-{result.end_line})") - output.append(f" Score: {result.score:.3f}") - output.append(f" Snippet: {result.snippet}") + metadata = result.metadata or {} + source_type = metadata.get("source_type", result.source) + title = metadata.get("title") or "Untitled" + section_title = metadata.get("section_title") or "" + category = metadata.get("category") or "" + citation = metadata.get("citation") or self.memory_manager.build_citation( + result.path, + result.start_line, + result.end_line, + metadata, + ) + + output.append( + f"[{i}] score={result.score:.3f} source={result.path} " + f"lines={result.start_line}-{result.end_line} type={source_type}" + ) + output.append(f"citation: {citation}") + output.append(f"title: {title}") + if section_title: + output.append(f"section: {section_title}") + if category: + output.append(f"category: {category}") + output.append("content:") + output.append(result.snippet) + output.append("") return ToolResult.success("\n".join(output)) diff --git a/agent/tools/write/write.py b/agent/tools/write/write.py index 5bd16f403..abdd93d2b 100644 --- a/agent/tools/write/write.py +++ b/agent/tools/write/write.py @@ -66,8 +66,8 @@ def execute(self, args: Dict[str, Any]) -> ToolResult: # Get bytes written bytes_written = len(content.encode('utf-8')) - # Auto-sync to memory database if this is a memory file - if self.memory_manager and 'memory/' in path: + # Mark memory/knowledge indexes dirty after file writes. + if self.memory_manager and ('memory/' in path or 'knowledge/' in path): self.memory_manager.mark_dirty() result = { diff --git a/bridge/agent_bridge.py b/bridge/agent_bridge.py index 3701df774..d8abddcd7 100644 --- a/bridge/agent_bridge.py +++ b/bridge/agent_bridge.py @@ -167,15 +167,14 @@ def call(self, request: LLMRequest): if session_id: kwargs['session_id'] = session_id - # Thinking mode is a global toggle independent of the channel. - # IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the - # reasoning trace, but still benefit from the higher answer - # quality the thinking pass produces. + # Determine thinking: respect the global switch and keep + # behavior consistent across channels for multi-step RAG. from config import conf - kwargs['thinking'] = ( - {"type": "enabled"} if conf().get("enable_thinking", False) - else {"type": "disabled"} - ) + global_thinking = conf().get("enable_thinking", False) + if not global_thinking: + kwargs['thinking'] = {"type": "disabled"} + else: + kwargs['thinking'] = {"type": "enabled"} response = self.bot.call_with_tools(**kwargs) return self._format_response(response) @@ -222,15 +221,14 @@ def call_stream(self, request: LLMRequest): if session_id: kwargs['session_id'] = session_id - # Thinking mode is a global toggle independent of the channel. 
- # IM channels (WeChat/WeCom/DingTalk/Feishu) won't render the - # reasoning trace, but still benefit from the higher answer - # quality the thinking pass produces. + # Determine thinking: respect the global switch and keep + # behavior consistent across channels for multi-step RAG. from config import conf - kwargs['thinking'] = ( - {"type": "enabled"} if conf().get("enable_thinking", False) - else {"type": "disabled"} - ) + global_thinking = conf().get("enable_thinking", False) + if not global_thinking: + kwargs['thinking'] = {"type": "disabled"} + else: + kwargs['thinking'] = {"type": "enabled"} stream = self.bot.call_with_tools(**kwargs) @@ -746,4 +744,4 @@ def _refresh_conditional_tools(agent): agent.tools = [t for t in agent.tools if t.name != "web_search"] logger.info("[AgentBridge] web_search tool removed (API key no longer available)") except Exception as e: - logger.debug(f"[AgentBridge] Failed to refresh conditional tools: {e}") \ No newline at end of file + logger.debug(f"[AgentBridge] Failed to refresh conditional tools: {e}") diff --git a/bridge/agent_initializer.py b/bridge/agent_initializer.py index 32768d397..c3b3d66c1 100644 --- a/bridge/agent_initializer.py +++ b/bridge/agent_initializer.py @@ -304,6 +304,8 @@ def _setup_memory_system(self, workspace_root: str, session_id: Optional[str] = # Sync memory self._sync_memory(memory_manager, session_id) + if session_id is None: + memory_manager.start_import_watcher() # Create memory tools memory_tools = [ @@ -366,7 +368,7 @@ def _load_tools(self, workspace_root: str, memory_manager, memory_tools: List, s if tool: # Apply workspace config to file operation tools - if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser']: + if tool_name in ['read', 'write', 'edit', 'bash', 'grep', 'find', 'ls', 'web_fetch', 'send', 'browser', 'knowledge_import']: tool.config = file_config tool.cwd = file_config.get("cwd", getattr(tool, 'cwd', None)) if 'memory_manager' in file_config: diff --git a/channel/web/chat.html b/channel/web/chat.html index 01e1d0e4a..a037d213e 100644 --- a/channel/web/chat.html +++ b/channel/web/chat.html @@ -726,6 +726,13 @@

+
+<!-- placeholder: the 7 added markup lines here are unrecoverable from extraction;
+     they add the console controls wired to the new handlers in channel/web/static/js/console.js -->
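Note: the chat.html controls above are driven by the console.js handlers below, which in turn consume the new chunk-inspection endpoint. A minimal sketch of the same request from Python — the host/port and the omitted auth cookie (normally enforced by `_require_auth`) are deployment-specific assumptions:

```python
import json
import urllib.parse
import urllib.request

BASE = "http://127.0.0.1:9899"  # assumed local web-channel address

def fetch_chunks(rel_path: str) -> list:
    """Call the chunk-inspection endpoint and return the chunk list."""
    url = f"{BASE}/api/knowledge/chunks?path={urllib.parse.quote(rel_path)}"
    with urllib.request.urlopen(url) as resp:
        data = json.load(resp)
    if data.get("status") != "success":
        raise RuntimeError(data.get("message", "chunk inspection failed"))
    return data.get("chunks", [])

# Each chunk mirrors what renderKnowledgeChunks() displays: index, line range,
# optional page_number/section_title, a stable citation, and the chunk text.
for chunk in fetch_chunks("imports/handbook.pdf"):  # hypothetical knowledge path
    print(chunk.get("citation"), "-", (chunk.get("content") or chunk.get("preview") or "")[:80])
```

The same payload backs the viewer's chunks tab, so what the agent cites and what the console shows stay in sync.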
diff --git a/channel/web/static/js/console.js b/channel/web/static/js/console.js index 28bb5029c..356ad7e08 100644 --- a/channel/web/static/js/console.js +++ b/channel/web/static/js/console.js @@ -38,7 +38,7 @@ const I18N = { config_max_tokens: '最大上下文 Token', config_max_tokens_hint: '对话中 Agent 能输入的最大 Token 长度,超过后会智能压缩处理', config_max_turns: '最大记忆轮次', config_max_turns_hint: '一问一答为一轮,超过后会智能压缩处理', config_max_steps: '最大执行步数', config_max_steps_hint: '单次对话中 Agent 最多调用工具的次数', - config_enable_thinking: '深度思考', config_enable_thinking_hint: '是否启用深度思考模式', + config_enable_thinking: '深度思考', config_enable_thinking_hint: '启用后在 Web 端展示模型推理过程', config_channel_type: '通道类型', config_provider: '模型厂商', config_model_name: '模型', config_custom_model_hint: '输入自定义模型名称', @@ -124,7 +124,7 @@ const I18N = { config_max_tokens: 'Max Context Tokens', config_max_tokens_hint: 'Max tokens the Agent can input per conversation, auto-compressed when exceeded', config_max_turns: 'Max Memory Turns', config_max_turns_hint: 'One Q&A pair = one turn, auto-compressed when exceeded', config_max_steps: 'Max Steps', config_max_steps_hint: 'Max tool calls the Agent can make in a single conversation', - config_enable_thinking: 'Deep Thinking', config_enable_thinking_hint: 'Enable deep thinking mode', + config_enable_thinking: 'Deep Thinking', config_enable_thinking_hint: 'Show model reasoning on web console', config_channel_type: 'Channel Type', config_provider: 'Provider', config_model_name: 'Model', config_custom_model_hint: 'Enter custom model name', @@ -204,7 +204,6 @@ function applyI18n() { document.querySelectorAll('[data-tip-key]').forEach(el => { el.setAttribute('data-tooltip', t(el.dataset.tipKey)); }); - installCfgTipPortal(); const langLabel = document.getElementById('lang-label'); if (langLabel) langLabel.textContent = currentLang === 'zh' ? '中文' : 'EN'; } @@ -216,54 +215,6 @@ function toggleLanguage() { _applyInputTooltips(); } -// Floating tooltip portal for [data-tip-key] elements. Tooltip nodes are -// appended to so they aren't clipped by overflow:hidden ancestors -// (e.g. the config panel's scroll container). -let _cfgTipPortalEl = null; -let _cfgTipPortalInstalled = false; -function installCfgTipPortal() { - if (_cfgTipPortalInstalled) return; - _cfgTipPortalInstalled = true; - - const showTip = (target) => { - const text = target.getAttribute('data-tooltip'); - if (!text) return; - if (!_cfgTipPortalEl) { - _cfgTipPortalEl = document.createElement('div'); - _cfgTipPortalEl.className = 'cfg-tip-floating'; - document.body.appendChild(_cfgTipPortalEl); - } - _cfgTipPortalEl.textContent = text; - const rect = target.getBoundingClientRect(); - // Render once to measure, then position above the target, centered. - _cfgTipPortalEl.style.left = '0px'; - _cfgTipPortalEl.style.top = '0px'; - _cfgTipPortalEl.classList.add('show'); - const tipRect = _cfgTipPortalEl.getBoundingClientRect(); - let left = rect.left + rect.width / 2 - tipRect.width / 2; - // Clamp horizontally to the viewport with an 8px gutter. 
- left = Math.max(8, Math.min(left, window.innerWidth - tipRect.width - 8)); - const top = rect.top - tipRect.height - 6; - _cfgTipPortalEl.style.left = left + 'px'; - _cfgTipPortalEl.style.top = top + 'px'; - }; - const hideTip = () => { - if (_cfgTipPortalEl) _cfgTipPortalEl.classList.remove('show'); - }; - - document.addEventListener('mouseover', (e) => { - const target = e.target.closest('[data-tip-key]'); - if (target) showTip(target); - }); - document.addEventListener('mouseout', (e) => { - const target = e.target.closest('[data-tip-key]'); - if (target) hideTip(); - }); - // Hide on scroll/resize so the tooltip doesn't drift away from its anchor. - window.addEventListener('scroll', hideTip, true); - window.addEventListener('resize', hideTip); -} - // ===================================================================== // Theme // ===================================================================== @@ -412,32 +363,13 @@ function _buildVideoHtml(url) { ` ${escapeHtml(fileName)}`; } -function _openImageLightbox(src) { - let overlay = document.getElementById('cow-lightbox'); - if (!overlay) { - overlay = document.createElement('div'); - overlay.id = 'cow-lightbox'; - overlay.style.cssText = 'position:fixed;inset:0;z-index:9999;background:rgba(0,0,0,0.85);display:flex;align-items:center;justify-content:center;cursor:zoom-out;opacity:0;transition:opacity .2s'; - overlay.onclick = () => { overlay.style.opacity = '0'; setTimeout(() => overlay.style.display = 'none', 200); }; - const img = document.createElement('img'); - img.id = 'cow-lightbox-img'; - img.style.cssText = 'max-width:92vw;max-height:92vh;border-radius:8px;box-shadow:0 4px 24px rgba(0,0,0,0.5);object-fit:contain;'; - img.onclick = (e) => e.stopPropagation(); - overlay.appendChild(img); - document.body.appendChild(overlay); - } - overlay.querySelector('#cow-lightbox-img').src = src; - overlay.style.display = 'flex'; - requestAnimationFrame(() => overlay.style.opacity = '1'); -} - function _buildImageHtml(url) { const webUrl = _toWebUrl(url); const safeUrl = webUrl.replace(/"/g, '"'); return `
        `<img src="${safeUrl}" alt="image" ` +
+        `onclick="window.open('${safeUrl}','_blank')" ` +
+        `style="max-width:520px;width:100%;border-radius:10px;box-shadow:0 2px 8px rgba(0,0,0,0.15);display:block;cursor:pointer;">` +
        `</div>
`; } @@ -481,12 +413,12 @@ function injectImagePreviews(html) { } function _rewriteLocalImgSrc(html) { - return html.replace(/]*?)src="([^"]+)"([^>]*?)>/gi, (match, pre, src, post) => { + return html.replace(/]*?)src="([^"]+)"/gi, (match, pre, src) => { const webSrc = _toWebUrl(src); - const safeSrc = webSrc.replace(/"/g, '"'); - const hasClick = /onclick/i.test(pre + post); - const clickAttr = hasClick ? '' : ` onclick="_openImageLightbox(this.src)" style="cursor:zoom-in;"`; - return ``; + if (webSrc !== src) { + return ` _openImageLightbox(imgEl.src); + imgEl.style.cssText = 'max-width:600px;border-radius:8px;margin:8px 0;cursor:pointer;box-shadow:0 1px 4px rgba(0,0,0,0.1);'; + imgEl.onclick = () => window.open(item.content, '_blank'); mediaEl.appendChild(imgEl); scrollChatToBottom(); @@ -2827,6 +2759,8 @@ function toggleSkill(name, currentlyEnabled) { let memoryPage = 1; let memoryCategory = 'memory'; // 'memory' | 'dream' const memoryPageSize = 10; +let _memoryCurrentFile = null; +let _memoryCurrentCategory = 'memory'; function switchMemoryTab(tab) { document.querySelectorAll('.memory-tab').forEach(el => el.classList.remove('active')); @@ -2881,7 +2815,14 @@ function loadMemoryView(page) { ${escapeHtml(f.filename)} ${typeLabel} ${sizeStr} - ${escapeHtml(f.updated_at)}`; + ${escapeHtml(f.updated_at)} + + + `; tbody.appendChild(tr); }); @@ -2899,6 +2840,8 @@ function loadMemoryView(page) { function openMemoryFile(filename, category) { category = category || 'memory'; + _memoryCurrentFile = filename; + _memoryCurrentCategory = category; fetch(`/api/memory/content?filename=${encodeURIComponent(filename)}&category=${category}`).then(r => r.json()).then(data => { if (data.status !== 'success') return; document.getElementById('memory-panel-list').classList.add('hidden'); @@ -2915,6 +2858,32 @@ function closeMemoryViewer() { document.getElementById('memory-panel-list').classList.remove('hidden'); } +function deleteCurrentMemoryFile() { + if (!_memoryCurrentFile) return; + deleteMemoryFile(_memoryCurrentFile, _memoryCurrentCategory, true); +} + +function deleteMemoryFile(filename, category, fromViewer) { + showConfirmDialog({ + title: currentLang === 'zh' ? '删除记忆' : 'Delete Memory', + message: (currentLang === 'zh' + ? `确认删除 ${filename} 吗?此操作会同时移除对应索引。` + : `Delete ${filename}? This also removes its index entries.`), + okText: currentLang === 'zh' ? 
'删除' : 'Delete', + onConfirm: () => { + fetch('/api/memory/delete', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ filename, category }) + }).then(r => r.json()).then(data => { + if (data.status !== 'success') return; + if (fromViewer) closeMemoryViewer(); + loadMemoryView(memoryPage); + }).catch(() => {}); + } + }); +} + // ===================================================================== // Custom Confirm Dialog // ===================================================================== @@ -3812,7 +3781,10 @@ navigateTo = function(viewId) { let _knowledgeTreeData = []; let _knowledgeRootFiles = []; let _knowledgeCurrentFile = null; +let _knowledgeCurrentTitle = ''; let _knowledgeGraphLoaded = false; +let _knowledgeCurrentChunks = []; +let _knowledgeViewerTab = 'content'; function loadKnowledgeView() { // Reset to docs tab @@ -4052,6 +4024,8 @@ function _searchFileInGroups(groups, parentPath, filename) { function openKnowledgeFile(path, title) { _knowledgeCurrentFile = path; + _knowledgeCurrentTitle = title || path; + _knowledgeViewerTab = 'content'; // Update active state in tree via data-path document.querySelectorAll('.knowledge-tree-file').forEach(el => { el.classList.toggle('active', el.dataset.path === path); @@ -4060,17 +4034,26 @@ function openKnowledgeFile(path, title) { // Immediately hide placeholder document.getElementById('knowledge-content-placeholder').classList.add('hidden'); - fetch(`/api/knowledge/read?path=${encodeURIComponent(path)}`).then(r => r.json()).then(data => { - if (data.status !== 'success') return; + Promise.allSettled([ + fetch(`/api/knowledge/read?path=${encodeURIComponent(path)}`).then(r => r.json()), + fetch(`/api/knowledge/chunks?path=${encodeURIComponent(path)}`).then(r => r.json()) + ]).then(([docResult, chunkResult]) => { + const docData = docResult.status === 'fulfilled' ? docResult.value : null; + const chunkData = chunkResult.status === 'fulfilled' ? chunkResult.value : null; + if (!docData || docData.status !== 'success') return; const viewer = document.getElementById('knowledge-content-viewer'); document.getElementById('knowledge-viewer-title').textContent = title; document.getElementById('knowledge-viewer-path').textContent = path; const bodyEl = document.getElementById('knowledge-viewer-body'); - bodyEl.innerHTML = renderMarkdown(data.content || ''); + bodyEl.innerHTML = renderMarkdown(docData.content || ''); viewer.classList.remove('hidden'); applyHighlighting(viewer); bindKnowledgeLinks(bodyEl, path); + _knowledgeCurrentChunks = chunkData && chunkData.status === 'success' ? (chunkData.chunks || []) : []; + renderKnowledgeChunks(_knowledgeCurrentChunks); + switchKnowledgeViewerTab('content'); + // Mobile: hide sidebar, show content if (window.innerWidth < 768) { document.getElementById('knowledge-sidebar').classList.add('hidden'); @@ -4078,11 +4061,135 @@ function openKnowledgeFile(path, title) { }).catch(() => {}); } +function createKnowledgePage() { + const title = window.prompt(currentLang === 'zh' ? '输入知识标题' : 'Enter knowledge title'); + if (!title) return; + const relPath = window.prompt( + currentLang === 'zh' + ? '可选:输入相对路径(如 notes/my-note.md),留空自动生成' + : 'Optional: relative path (e.g. notes/my-note.md). Leave blank to auto-generate', + '' + ); + const content = window.prompt( + currentLang === 'zh' ? 
'可选:输入初始内容' : 'Optional: initial content', + '' + ); + fetch('/api/knowledge/create', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ title, path: relPath || '', content: content || '' }) + }).then(r => r.json()).then(data => { + if (data.status !== 'success') return; + loadKnowledgeView(); + setTimeout(() => openKnowledgeFile(data.path, title), 100); + }).catch(() => {}); +} + +function deleteCurrentKnowledgeFile() { + if (!_knowledgeCurrentFile) return; + showConfirmDialog({ + title: currentLang === 'zh' ? '删除知识' : 'Delete Knowledge', + message: (currentLang === 'zh' + ? `确认删除 ${_knowledgeCurrentFile} 吗?此操作会同时移除对应索引和切片。` + : `Delete ${_knowledgeCurrentFile}? This also removes index entries and chunks.`), + okText: currentLang === 'zh' ? '删除' : 'Delete', + onConfirm: () => { + fetch('/api/knowledge/delete', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ path: _knowledgeCurrentFile }) + }).then(r => r.json()).then(data => { + if (data.status !== 'success') return; + _knowledgeCurrentFile = null; + _knowledgeCurrentTitle = ''; + document.getElementById('knowledge-content-viewer').classList.add('hidden'); + document.getElementById('knowledge-content-placeholder').classList.remove('hidden'); + loadKnowledgeView(); + }).catch(() => {}); + } + }); +} + function knowledgeMobileBack() { document.getElementById('knowledge-sidebar').classList.remove('hidden'); document.getElementById('knowledge-content-viewer').classList.add('hidden'); } +function switchKnowledgeViewerTab(tab) { + _knowledgeViewerTab = tab; + document.querySelectorAll('.knowledge-view-tab').forEach(el => { + el.classList.remove('active', 'bg-white', 'dark:bg-slate-700', 'text-slate-800', 'dark:text-slate-100', 'shadow-sm'); + el.classList.add('text-slate-500', 'dark:text-slate-400'); + }); + const activeTab = document.getElementById(`knowledge-view-tab-${tab}`); + if (activeTab) { + activeTab.classList.add('active', 'bg-white', 'dark:bg-slate-700', 'text-slate-800', 'dark:text-slate-100', 'shadow-sm'); + activeTab.classList.remove('text-slate-500', 'dark:text-slate-400'); + } + + const bodyEl = document.getElementById('knowledge-viewer-body'); + const chunksEl = document.getElementById('knowledge-viewer-chunks'); + if (!bodyEl || !chunksEl) return; + + if (tab === 'chunks') { + bodyEl.classList.add('hidden'); + chunksEl.classList.remove('hidden'); + } else { + chunksEl.classList.add('hidden'); + bodyEl.classList.remove('hidden'); + } +} + +function renderKnowledgeChunks(chunks) { + const container = document.getElementById('knowledge-viewer-chunks'); + if (!container) return; + + if (!chunks || chunks.length === 0) { + container.innerHTML = ` +
+            <div class="text-center text-slate-400 dark:text-slate-500 text-sm py-10">
+                暂无可用切片
+            </div>
`; + return; + } + + container.innerHTML = chunks.map(chunk => { + const meta = []; + if (chunk.page_number) meta.push(`页码 ${escapeHtml(String(chunk.page_number))}`); + if (chunk.section_title) meta.push(`章节 ${escapeHtml(chunk.section_title)}`); + meta.push(`行 ${escapeHtml(String(chunk.start_line))}-${escapeHtml(String(chunk.end_line))}`); + return ` +
+            <div class="border border-slate-200 dark:border-slate-700 rounded-lg p-3 mb-3">
+                <div class="flex items-start gap-2 mb-1">
+                    <span class="text-xs font-semibold text-slate-500 dark:text-slate-400">
+                        ${escapeHtml(String(chunk.index))}
+                    </span>
+                    <div class="min-w-0">
+                        <div class="text-xs font-mono text-slate-600 dark:text-slate-300 truncate">${escapeHtml(chunk.citation || '')}</div>
+                        <div class="text-xs text-slate-400">${meta.join(' · ')}</div>
+                    </div>
+                </div>
+                <div class="text-sm whitespace-pre-wrap break-words text-slate-700 dark:text-slate-200">
+                    ${escapeHtml(chunk.content || chunk.preview || '')}
+                </div>
+            </div>
`; + }).join(''); +} + +function reindexKnowledgeImports() { + fetch('/api/knowledge/reindex_imports', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ force: true }) + }).then(r => r.json()).then(data => { + if (data.status !== 'success') return; + loadKnowledgeView(); + if (_knowledgeCurrentFile) { + setTimeout(() => openKnowledgeFile(_knowledgeCurrentFile, document.getElementById('knowledge-viewer-title').textContent || _knowledgeCurrentFile), 100); + } + }).catch(() => {}); +} + function switchKnowledgeTab(tab) { document.querySelectorAll('.knowledge-tab').forEach(el => el.classList.remove('active')); document.getElementById('knowledge-tab-' + tab).classList.add('active'); diff --git a/channel/web/web_channel.py b/channel/web/web_channel.py index ad1301aa9..f6498aa87 100644 --- a/channel/web/web_channel.py +++ b/channel/web/web_channel.py @@ -579,8 +579,13 @@ def startup(self): '/api/skills', 'SkillsHandler', '/api/memory', 'MemoryHandler', '/api/memory/content', 'MemoryContentHandler', + '/api/memory/delete', 'MemoryDeleteHandler', '/api/knowledge/list', 'KnowledgeListHandler', '/api/knowledge/read', 'KnowledgeReadHandler', + '/api/knowledge/create', 'KnowledgeCreateHandler', + '/api/knowledge/delete', 'KnowledgeDeleteHandler', + '/api/knowledge/chunks', 'KnowledgeChunksHandler', + '/api/knowledge/reindex_imports', 'KnowledgeReindexImportsHandler', '/api/knowledge/graph', 'KnowledgeGraphHandler', '/api/scheduler', 'SchedulerHandler', '/api/sessions', 'SessionsHandler', @@ -771,14 +776,14 @@ class ConfigHandler: _RECOMMENDED_MODELS = [ const.MINIMAX_M2_7_HIGHSPEED, const.MINIMAX_M2_7, const.MINIMAX_M2_5, const.MINIMAX_M2_1, const.MINIMAX_M2_1_LIGHTNING, - const.DEEPSEEK_V4_PRO, const.DEEPSEEK_V4_FLASH, const.DEEPSEEK_CHAT, const.DEEPSEEK_REASONER, + const.GLM_5_TURBO, const.GLM_5, const.GLM_4_7, + const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX, + const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2, + const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE, const.CLAUDE_4_6_SONNET, const.CLAUDE_4_7_OPUS, const.CLAUDE_4_6_OPUS, const.CLAUDE_4_5_SONNET, const.GEMINI_31_FLASH_LITE_PRE, const.GEMINI_31_PRO_PRE, const.GEMINI_3_FLASH_PRE, const.GPT_54, const.GPT_54_MINI, const.GPT_54_NANO, const.GPT_5, const.GPT_41, const.GPT_4o, - const.GLM_5_1, const.GLM_5_TURBO, const.GLM_5, const.GLM_4_7, - const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX, - const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE, - const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2, + const.DEEPSEEK_CHAT, const.DEEPSEEK_REASONER, ] PROVIDER_MODELS = OrderedDict([ @@ -789,12 +794,33 @@ class ConfigHandler: "api_base_default": None, "models": [const.MINIMAX_M2_7, const.MINIMAX_M2_7_HIGHSPEED, const.MINIMAX_M2_5, const.MINIMAX_M2_1, const.MINIMAX_M2_1_LIGHTNING], }), - ("deepseek", { - "label": "DeepSeek", - "api_key_field": "deepseek_api_key", - "api_base_key": "deepseek_api_base", - "api_base_default": "https://api.deepseek.com/v1", - "models": [const.DEEPSEEK_V4_PRO, const.DEEPSEEK_V4_FLASH, const.DEEPSEEK_CHAT, const.DEEPSEEK_REASONER], + ("zhipu", { + "label": "智谱AI", + "api_key_field": "zhipu_ai_api_key", + "api_base_key": "zhipu_ai_api_base", + "api_base_default": "https://open.bigmodel.cn/api/paas/v4", + "models": [const.GLM_5_TURBO, const.GLM_5, const.GLM_4_7], + }), + ("dashscope", { + "label": "通义千问", + "api_key_field": "dashscope_api_key", + "api_base_key": None, + "api_base_default": None, + "models": [const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX], + }), + 
("moonshot", { + "label": "Kimi", + "api_key_field": "moonshot_api_key", + "api_base_key": "moonshot_base_url", + "api_base_default": "https://api.moonshot.cn/v1", + "models": [const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2], + }), + ("doubao", { + "label": "豆包", + "api_key_field": "ark_api_key", + "api_base_key": "ark_base_url", + "api_base_default": "https://ark.cn-beijing.volces.com/api/v3", + "models": [const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE], }), ("claudeAPI", { "label": "Claude", @@ -817,33 +843,12 @@ class ConfigHandler: "api_base_default": "https://api.openai.com/v1", "models": [const.GPT_54, const.GPT_54_MINI, const.GPT_54_NANO, const.GPT_5, const.GPT_41, const.GPT_4o], }), - ("zhipu", { - "label": "智谱AI", - "api_key_field": "zhipu_ai_api_key", - "api_base_key": "zhipu_ai_api_base", - "api_base_default": "https://open.bigmodel.cn/api/paas/v4", - "models": [const.GLM_5_1, const.GLM_5_TURBO, const.GLM_5, const.GLM_4_7], - }), - ("dashscope", { - "label": "通义千问", - "api_key_field": "dashscope_api_key", - "api_base_key": None, - "api_base_default": None, - "models": [const.QWEN36_PLUS, const.QWEN35_PLUS, const.QWEN3_MAX], - }), - ("doubao", { - "label": "豆包", - "api_key_field": "ark_api_key", - "api_base_key": "ark_base_url", - "api_base_default": "https://ark.cn-beijing.volces.com/api/v3", - "models": [const.DOUBAO_SEED_2_PRO, const.DOUBAO_SEED_2_CODE], - }), - ("moonshot", { - "label": "Kimi", - "api_key_field": "moonshot_api_key", - "api_base_key": "moonshot_base_url", - "api_base_default": "https://api.moonshot.cn/v1", - "models": [const.KIMI_K2_6, const.KIMI_K2_5, const.KIMI_K2], + ("deepseek", { + "label": "DeepSeek", + "api_key_field": "deepseek_api_key", + "api_base_key": "deepseek_api_base", + "api_base_default": "https://api.deepseek.com/v1", + "models": [const.DEEPSEEK_CHAT, const.DEEPSEEK_REASONER], }), ("modelscope", { "label": "ModelScope", @@ -1621,6 +1626,30 @@ def GET(self): return json.dumps({"status": "error", "message": str(e)}) +class MemoryDeleteHandler: + def POST(self): + _require_auth() + web.header('Content-Type', 'application/json; charset=utf-8') + try: + from agent.memory.service import MemoryService + body = json.loads(web.data() or "{}") + filename = body.get("filename", "") + category = body.get("category", "memory") + if not filename: + return json.dumps({"status": "error", "message": "filename required"}) + workspace_root = _get_workspace_root() + service = MemoryService(workspace_root) + result = service.delete_file(filename, category=category) + return json.dumps({"status": "success", **result}, ensure_ascii=False) + except ValueError: + return json.dumps({"status": "error", "message": "invalid filename"}) + except FileNotFoundError: + return json.dumps({"status": "error", "message": "file not found"}) + except Exception as e: + logger.error(f"[WebChannel] Memory delete API error: {e}") + return json.dumps({"status": "error", "message": str(e)}) + + class SchedulerHandler: def GET(self): _require_auth() @@ -1909,6 +1938,74 @@ def GET(self): return json.dumps({"status": "error", "message": str(e)}) +class KnowledgeCreateHandler: + def POST(self): + _require_auth() + web.header('Content-Type', 'application/json; charset=utf-8') + try: + from agent.knowledge.service import KnowledgeService + body = json.loads(web.data() or "{}") + svc = KnowledgeService(_get_workspace_root()) + result = svc.create_file( + title=body.get("title", ""), + rel_path=body.get("path"), + content=body.get("content", ""), + ) + return json.dumps({"status": 
"success", **result}, ensure_ascii=False) + except Exception as e: + logger.error(f"[WebChannel] Knowledge create error: {e}") + return json.dumps({"status": "error", "message": str(e)}) + + +class KnowledgeDeleteHandler: + def POST(self): + _require_auth() + web.header('Content-Type', 'application/json; charset=utf-8') + try: + from agent.knowledge.service import KnowledgeService + body = json.loads(web.data() or "{}") + svc = KnowledgeService(_get_workspace_root()) + result = svc.delete_file(body.get("path", "")) + return json.dumps({"status": "success", **result}, ensure_ascii=False) + except (ValueError, FileNotFoundError) as e: + return json.dumps({"status": "error", "message": str(e)}) + except Exception as e: + logger.error(f"[WebChannel] Knowledge delete error: {e}") + return json.dumps({"status": "error", "message": str(e)}) + + +class KnowledgeChunksHandler: + def GET(self): + _require_auth() + web.header('Content-Type', 'application/json; charset=utf-8') + try: + from agent.knowledge.service import KnowledgeService + params = web.input(path='') + svc = KnowledgeService(_get_workspace_root()) + result = svc.get_chunks(params.path) + return json.dumps({"status": "success", **result}, ensure_ascii=False) + except (ValueError, FileNotFoundError) as e: + return json.dumps({"status": "error", "message": str(e)}) + except Exception as e: + logger.error(f"[WebChannel] Knowledge chunks error: {e}") + return json.dumps({"status": "error", "message": str(e)}) + + +class KnowledgeReindexImportsHandler: + def POST(self): + _require_auth() + web.header('Content-Type', 'application/json; charset=utf-8') + try: + from agent.knowledge.service import KnowledgeService + body = json.loads(web.data() or "{}") + svc = KnowledgeService(_get_workspace_root()) + result = svc.reindex_imports(force=bool(body.get("force", True))) + return json.dumps({"status": "success", **result}, ensure_ascii=False) + except Exception as e: + logger.error(f"[WebChannel] Knowledge imports reindex error: {e}") + return json.dumps({"status": "error", "message": str(e)}) + + class KnowledgeGraphHandler: def GET(self): _require_auth()