diff --git a/backend/app/mcp_server/server.py b/backend/app/mcp_server/server.py index 0326ea71a..630871f11 100644 --- a/backend/app/mcp_server/server.py +++ b/backend/app/mcp_server/server.py @@ -208,6 +208,7 @@ def _register_knowledge_tools() -> None: # The decorators will add tools to the global registry from app.mcp_server.tool_registry import register_tools_to_server from app.mcp_server.tools import ( # noqa: F401 side-effect: triggers @mcp_tool registration + dingtalk_upload, knowledge, ) diff --git a/backend/app/mcp_server/tools/__init__.py b/backend/app/mcp_server/tools/__init__.py index a4e5f4005..5de6d5f93 100644 --- a/backend/app/mcp_server/tools/__init__.py +++ b/backend/app/mcp_server/tools/__init__.py @@ -6,8 +6,10 @@ Contains tools for: - System MCP (silent_exit) -- Knowledge MCP (list_knowledge_bases, list_documents, create_knowledge_base, - create_document, update_document_content) +- Knowledge MCP (wegent_kb_search_knowledge_base, wegent_kb_list_knowledge_bases, + wegent_kb_list_documents, wegent_kb_create_knowledge_base, + wegent_kb_create_document, wegent_kb_read_document_content, + wegent_kb_update_document_content) Knowledge MCP tools are implemented independently using the KnowledgeOrchestrator service layer, with Celery-based async task scheduling for indexing and summary. 
@@ -26,6 +28,7 @@ get_registered_mcp_tools, mcp_tool, ) +from .dingtalk_upload import DINGTALK_UPLOAD_MCP_TOOLS from .knowledge import KNOWLEDGE_MCP_TOOLS from .silent_exit import silent_exit from .subscription import create_subscription, preview_subscription @@ -33,6 +36,7 @@ __all__ = [ "silent_exit", "KNOWLEDGE_MCP_TOOLS", + "DINGTALK_UPLOAD_MCP_TOOLS", "mcp_tool", "get_registered_mcp_tools", "build_mcp_tools_dict", diff --git a/backend/app/mcp_server/tools/decorator.py b/backend/app/mcp_server/tools/decorator.py index cff1dcbd0..da59265ad 100644 --- a/backend/app/mcp_server/tools/decorator.py +++ b/backend/app/mcp_server/tools/decorator.py @@ -16,7 +16,7 @@ from app.mcp_server.tools.decorator import mcp_tool @mcp_tool( - name="list_knowledge_bases", + name="wegent_kb_list_knowledge_bases", description="List all accessible knowledge bases", server="knowledge", exclude_params=["token_info"], # Hidden from MCP schema @@ -201,7 +201,7 @@ def mcp_tool( Example: @mcp_tool( - name="list_knowledge_bases", + name="wegent_kb_list_knowledge_bases", description="List all accessible knowledge bases", server="knowledge", exclude_params=["token_info"], diff --git a/backend/app/mcp_server/tools/dingtalk_upload.py b/backend/app/mcp_server/tools/dingtalk_upload.py new file mode 100644 index 000000000..ee5d75377 --- /dev/null +++ b/backend/app/mcp_server/tools/dingtalk_upload.py @@ -0,0 +1,413 @@ +# SPDX-FileCopyrightText: 2025 Weibo, Inc. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +MCP tools for DingTalk document upload to Wegent knowledge base. + +This module provides tools to upload DingTalk documents to Wegent knowledge base +without using sandbox exec commands. 
logger = logging.getLogger(__name__)

# Default timeout (in seconds) handed to outbound HTTP requests.
DEFAULT_TIMEOUT = 300  # 5 minutes

# Security limits for URL downloads
MAX_DOWNLOAD_BYTES = 100 * 1024 * 1024  # 100 MB
ALLOWED_SCHEMES = {"http", "https"}
BLOCKED_HOSTS = {
    "localhost",
    "127.0.0.1",
    "::1",
    "0.0.0.0",
}
# These patterns are applied with ``search`` (not ``match``): the IPv4 ones
# carry their own ``^...$`` anchors, while the suffix pattern must be able to
# match at the end of an arbitrary hostname.
BLOCKED_HOST_PATTERNS = [
    re.compile(r"^127\.\d+\.\d+\.\d+$"),  # 127.x.x.x
    re.compile(r"^10\.\d+\.\d+\.\d+$"),  # 10.x.x.x
    re.compile(r"^172\.(1[6-9]|2\d|3[01])\.\d+\.\d+$"),  # 172.16-31.x.x
    re.compile(r"^192\.168\.\d+\.\d+$"),  # 192.168.x.x
    re.compile(r"^169\.254\.\d+\.\d+$"),  # Link-local
    re.compile(r"\.internal$"),  # .internal domains
]


def _validate_download_url(url: str) -> None:
    """
    Validate a download URL to reduce the risk of SSRF.

    The checks run from cheapest to most expensive: scheme, literal hostname
    block list, hostname patterns, and finally a DNS lookup whose every
    returned address must be publicly routable.

    NOTE: the DNS lookup performed here is independent of the one the HTTP
    client performs later, so this check alone does not pin the connection to
    the address that was validated.

    Args:
        url: URL to validate

    Raises:
        ValueError: If the URL is malformed, uses a scheme other than
            http/https, names a blocked host, cannot be resolved, or resolves
            to a private/loopback/link-local/reserved/multicast address
    """
    parsed = urlparse(url)

    # Check scheme
    if parsed.scheme not in ALLOWED_SCHEMES:
        raise ValueError(f"URL scheme must be http or https, got: {parsed.scheme}")

    # Check hostname
    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL must have a non-empty hostname")

    # A fully-qualified name may end in "." ("localhost."); drop it so the
    # literal and pattern checks cannot be sidestepped with a trailing dot.
    normalized = hostname.lower().rstrip(".")
    if not normalized:
        raise ValueError("URL must have a non-empty hostname")

    # Check blocked hosts
    if normalized in BLOCKED_HOSTS:
        raise ValueError(f"URL hostname is blocked: {hostname}")

    # Check blocked patterns. ``search`` is required here: ``match`` only
    # tests at position 0, which made the ".internal" suffix rule dead code.
    if any(pattern.search(normalized) for pattern in BLOCKED_HOST_PATTERNS):
        raise ValueError(f"URL hostname matches blocked pattern: {hostname}")

    # Resolve and check every address the name maps to
    try:
        addrinfo = socket.getaddrinfo(hostname, None)
    except socket.gaierror as e:
        raise ValueError(f"Failed to resolve hostname: {hostname}") from e

    for *_, sockaddr in addrinfo:
        # IPv6 results may carry a "%scope" suffix; strip it before parsing.
        ip = ipaddress.ip_address(sockaddr[0].split("%", 1)[0])
        if (
            ip.is_private
            or ip.is_loopback
            or ip.is_link_local
            or ip.is_reserved
            or ip.is_multicast
            or ip.is_unspecified
        ):
            raise ValueError(f"URL resolves to blocked IP: {ip}")


def _get_user_from_token(db: "Session", token_info: "TaskTokenInfo") -> "Optional[User]":
    """
    Look up the user referenced by a task token.

    Args:
        db: Open database session used for the query
        token_info: Task token information; its ``user_id`` is the lookup key

    Returns:
        The first ``User`` whose id equals ``token_info.user_id``, or None
        when no such row exists
    """
    return db.query(User).filter(User.id == token_info.user_id).first()
def _download_file_from_url(url: str) -> bytes:
    """
    Download a file over HTTP(S), validating every URL before contacting it.

    Redirects are followed manually rather than by ``requests``: with
    automatic redirects the client has already connected to the redirect
    target by the time the final URL can be inspected, which defeats the
    SSRF validation. Here each hop is validated first and the number of hops
    is capped.

    Args:
        url: Download URL

    Returns:
        Downloaded file content as bytes

    Raises:
        ValueError: If any URL in the redirect chain fails validation, or the
            payload is larger than MAX_DOWNLOAD_BYTES
        requests.RequestException: If the download fails, the server answers
            with an error status, or the redirect limit is exceeded
    """
    # Local import: only the manual redirect handling below needs it.
    from urllib.parse import urljoin

    headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
    }
    max_redirects = 5
    current_url = url

    # One initial request plus at most ``max_redirects`` follow-ups.
    for _ in range(max_redirects + 1):
        # Validate before every request, including redirect targets.
        _validate_download_url(current_url)

        with requests.get(
            current_url,
            headers=headers,
            stream=True,
            timeout=DEFAULT_TIMEOUT,
            allow_redirects=False,
        ) as response:
            if response.is_redirect:
                # ``is_redirect`` is only true when a Location header exists;
                # it may be relative, so resolve it against the current URL.
                current_url = urljoin(current_url, response.headers["Location"])
                continue

            response.raise_for_status()

            # Fail fast when the server announces an oversized body. The
            # streaming check below still applies because the header may be
            # absent or wrong.
            declared_size = response.headers.get("Content-Length", "")
            if declared_size.isdigit() and int(declared_size) > MAX_DOWNLOAD_BYTES:
                raise ValueError(
                    f"Download exceeds maximum size of {MAX_DOWNLOAD_BYTES} bytes"
                )

            total_size = 0
            chunks = []
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                total_size += len(chunk)
                if total_size > MAX_DOWNLOAD_BYTES:
                    raise ValueError(
                        f"Download exceeds maximum size of {MAX_DOWNLOAD_BYTES} bytes"
                    )
                chunks.append(chunk)

            return b"".join(chunks)

    raise requests.TooManyRedirects(f"Exceeded {max_redirects} redirects")
def _get_filename_from_url(url: str, default_filename: str = "document") -> str:
    """
    Extract and sanitize a filename from a URL, or fall back to a default.

    The last path segment is URL-decoded, stripped of control characters and
    path separators, and every character other than word characters, hyphens
    and dots is replaced by an underscore. Leading dots/underscores and
    trailing underscores are removed last, so the result can never start
    with a dot. The candidate is only accepted when both the part before and
    the part after its final dot are non-empty.

    Args:
        url: The URL to parse
        default_filename: Value returned when no usable filename is found

    Returns:
        Sanitized filename with extension, or default_filename when the URL
        has no path or the sanitized candidate lacks a stem or an extension
    """
    path = urlparse(url).path
    if not path:
        return default_filename

    # URL-decode percent-encoded sequences (e.g. %20 -> space)
    candidate = unquote(os.path.basename(path))

    # Remove control characters (category "Cc" includes NUL)
    candidate = "".join(ch for ch in candidate if unicodedata.category(ch) != "Cc")

    # Decoding may have re-introduced separators ("%2F"); drop them so the
    # name cannot express directory traversal.
    candidate = candidate.replace("/", "").replace("\\", "")

    # Replace anything other than word characters, hyphens and dots with an
    # underscore, then collapse runs of underscores.
    candidate = re.sub(r"[^\w\-.]", "_", candidate)
    candidate = re.sub(r"_+", "_", candidate)

    # Trim the edges only after substitution: trimming dots first would let
    # " .bashrc" become "_.bashrc" and then ".bashrc" (a hidden file).
    candidate = candidate.lstrip("._").rstrip("_")

    stem, _, extension = candidate.rpartition(".")
    if stem and extension:
        return candidate

    return default_filename
@mcp_tool(
    name="dingtalk_upload_file_from_url",
    description=(
        "Download a file from URL and upload it to Wegent as an attachment. "
        "Returns attachment_id for use with wegent_kb_create_document."
    ),
    server="knowledge",
    param_descriptions={
        "download_url": (
            "The URL to download the file from "
            "(e.g., from dingtalk-docs.download_file)"
        ),
        "filename": (
            "Optional filename with extension. "
            "If not provided, will extract from URL or use default"
        ),
        "default_extension": (
            "Default file extension if filename cannot be determined "
            "(e.g., 'docx', 'pdf')"
        ),
    },
)
def upload_file_from_url(
    token_info: TaskTokenInfo,
    download_url: str,
    filename: Optional[str] = None,
    default_extension: str = "bin",
) -> Dict[str, Any]:
    """
    Download a file from a URL and upload it to Wegent as an attachment.

    This tool replaces the sandbox exec approach for downloading files
    from DingTalk and uploading to Wegent. The downloaded bytes are held in
    memory and passed straight to the context service; no temporary file is
    written.

    Failures are reported through the returned dict rather than raised.

    Args:
        token_info: Task token information containing user context
        download_url: The URL to download the file from
        filename: Optional filename with extension (e.g., "document.docx").
            When omitted, a sanitized name is derived from the URL.
        default_extension: Extension used for the fallback name
            "document.<ext>"; a leading dot is tolerated

    Returns:
        On success, a dict with attachment_id, filename, size, mime_type and
        truncated. On failure, a dict with a single "error" key.

    Example:
        upload_file_from_url(
            download_url="https://alidocs.dingtalk.com/...",
            filename="Specifications.docx"
        )
        Returns: {
            "attachment_id": 123,
            "filename": "Specifications.docx",
            "size": 10240,
            "mime_type": "...",
            "truncated": False
        }
    """
    db = SessionLocal()

    try:
        user = _get_user_from_token(db, token_info)
        if not user:
            return {"error": "User not found"}

        # Accept both "docx" and ".docx" without producing "document..docx".
        extension = default_extension.lstrip(".") or "bin"

        # Only URL parsing and the download itself are covered by the
        # URL-specific handlers, so a ValueError raised later by the upload
        # step is not misreported as a URL validation failure.
        try:
            # NOTE(review): a caller-supplied filename is passed through
            # unsanitized, unlike a URL-derived one — confirm the context
            # service sanitizes it.
            if not filename:
                filename = _get_filename_from_url(download_url, f"document.{extension}")

            # Log the host only: the path/query of a download link may carry
            # a signed token that should not end up in log files.
            logger.info(
                "Downloading file from host: %s", urlparse(download_url).hostname
            )
            binary_data = _download_file_from_url(download_url)

        except requests.RequestException as e:
            logger.error(f"Failed to download file from URL: {e!s}")
            db.rollback()
            return {"error": f"Failed to download file: {e!s}"}

        except ValueError as e:
            logger.error(f"URL validation failed: {e!s}")
            db.rollback()
            return {"error": f"URL validation failed: {e!s}"}

        file_size = len(binary_data)

        # Upload to Wegent using context service
        context, truncation_info = context_service.upload_attachment(
            db=db,
            user_id=user.id,
            filename=filename,
            binary_data=binary_data,
            subtask_id=0,  # Unlinked attachment
        )

        return {
            "attachment_id": context.id,
            "filename": filename,
            "size": file_size,
            "mime_type": context.mime_type,
            "truncated": (truncation_info.is_truncated if truncation_info else False),
        }

    except Exception as e:
        logger.error(f"Error uploading file from URL: {e!s}", exc_info=True)
        db.rollback()
        return {"error": str(e)}

    finally:
        db.close()
@mcp_tool(
    name="dingtalk_upload_content",
    description=(
        "Save content to a file and upload it to Wegent as an attachment. "
        "Returns attachment_id for use with wegent_kb_create_document."
    ),
    server="knowledge",
    param_descriptions={
        "content": (
            "The content to save "
            "(e.g., markdown text from dingtalk-docs.get_document_content)"
        ),
        "filename": "Filename with extension (e.g., 'document.md', 'spreadsheet.md')",
        "encoding": "Text encoding (default: utf-8)",
    },
)
def upload_content(
    token_info: TaskTokenInfo,
    content: str,
    filename: str,
    encoding: str = "utf-8",
) -> Dict[str, Any]:
    """
    Upload text content to Wegent as an attachment.

    This tool replaces the sandbox exec approach for saving content
    (e.g., markdown from DingTalk online documents) to a file and
    uploading to Wegent. The text is encoded to bytes in memory and passed
    straight to the context service; no temporary file is written.

    Failures are reported through the returned dict rather than raised.

    Args:
        token_info: Task token information containing user context
        content: The text content to upload; must be non-empty
        filename: Filename with extension (e.g., "document.md"); must be
            non-blank
        encoding: Name of the codec used to turn the text into bytes

    Returns:
        On success, a dict with attachment_id, filename, size, mime_type and
        truncated. On failure, a dict with a single "error" key.

    Example:
        upload_content(
            content="# Title\\n\\nContent from DingTalk...",
            filename="Specifications.md"
        )
        Returns: {
            "attachment_id": 456,
            "filename": "Specifications.md",
            "size": 2048,
            "mime_type": "...",
            "truncated": False
        }
    """
    db = SessionLocal()

    try:
        user = _get_user_from_token(db, token_info)
        if not user:
            return {"error": "User not found"}

        # Validate content
        if not content:
            return {"error": "Content cannot be empty"}

        # Validate filename (a whitespace-only name is as unusable as none)
        if not filename or not filename.strip():
            return {"error": "Filename is required"}

        # Encode content to bytes in memory (no filesystem roundtrip)
        try:
            binary_data = content.encode(encoding)
        except LookupError:
            # Raised by str.encode for an unknown codec name; this is a
            # caller mistake, so answer with a clear message instead of a
            # logged stack trace.
            logger.error("Unknown encoding requested: %s", encoding)
            return {"error": f"Unknown encoding: {encoding}"}
        except UnicodeEncodeError as e:
            logger.error(f"Encoding error when saving content: {e!s}")
            return {"error": f"Failed to encode content with {encoding}: {e!s}"}

        file_size = len(binary_data)

        # Upload to Wegent using context service
        context, truncation_info = context_service.upload_attachment(
            db=db,
            user_id=user.id,
            filename=filename,
            binary_data=binary_data,
            subtask_id=0,  # Unlinked attachment
        )

        return {
            "attachment_id": context.id,
            "filename": filename,
            "size": file_size,
            "mime_type": context.mime_type,
            "truncated": (truncation_info.is_truncated if truncation_info else False),
        }

    except Exception as e:
        logger.error(f"Error uploading content: {e!s}", exc_info=True)
        db.rollback()
        return {"error": str(e)}

    finally:
        db.close()


# Build tool registry from decorated functions.
# NOTE(review): the lookup is keyed only by server="knowledge"; if other
# modules register tools under the same server name, this dict presumably
# contains them as well — confirm against build_mcp_tools_dict.
DINGTALK_UPLOAD_MCP_TOOLS = build_mcp_tools_dict(server="knowledge")
displayName: "知识库工具" -version: "1.0.0" +version: "1.0.1" author: "Wegent Team" -tags: ["knowledge", "knowledge-base", "document", "rag"] +tags: ["knowledge", "knowledge-base", "document", "rag", "dingtalk", "spreadsheet", "ai-table"] bindShells: - Chat - Agno - ClaudeCode +config: + unconfiguredGuide: + modalLink: "wegent://modal/mcp-provider-config?provider=dingtalk&service=docs" + modalText: "打开钉钉 MCP 配置弹窗" mcpServers: wegent-knowledge: type: streamable-http @@ -17,6 +21,18 @@ mcpServers: headers: Authorization: "Bearer ${{task_token}}" timeout: 300 + dingtalk-docs: + type: streamable-http + url: "${{task_data.user_mcps.dingtalk.services.docs.credentials.url}}" + timeout: 300 + dingtalk-table: + type: streamable-http + url: "${{task_data.user_mcps.dingtalk.services.table.credentials.url}}" + timeout: 300 + dingtalk-ai-table: + type: streamable-http + url: "${{task_data.user_mcps.dingtalk.services.ai_table.credentials.url}}" + timeout: 300 --- # Wegent Knowledge Base Skill @@ -25,6 +41,8 @@ You now have access to Wegent Knowledge Base management tools. ## Available Tools +### Knowledge Base Management Tools + - **wegent_kb_list_knowledge_bases**: List all knowledge bases accessible to the current user - scope: "personal" (your own), "group" (team), or "all" (default) - group_name: Specify group name when scope="group" @@ -42,11 +60,12 @@ You now have access to Wegent Knowledge Base management tools. 
- **wegent_kb_create_document**: Create a new document in a knowledge base - knowledge_base_id: Target knowledge base ID - name: Document name - - source_type: "text" (paste content), "file" (base64 encoded), or "web" (URL to scrape) + - source_type: "text" (paste content), "file" (base64 encoded), "web" (URL to scrape), or "attachment" (existing attachment) - content: Document content when source_type="text" - file_base64: Base64 encoded file when source_type="file" - file_extension: File extension when source_type="file" - url: URL to fetch when source_type="web" + - attachment_id: Existing attachment ID when source_type="attachment" - trigger_indexing: Whether to trigger RAG indexing (default: true) - trigger_summary: Whether to trigger summary generation (default: true) @@ -67,6 +86,20 @@ You now have access to Wegent Knowledge Base management tools. - max_results: Maximum results to return (default: 10, max: 50) - document_ids: Optional list of document IDs to filter search scope +### DingTalk Upload Helper Tools + +- **dingtalk_upload_file_from_url**: Download a file from URL and upload to Wegent + - download_url: The URL to download the file from (e.g., from dingtalk-docs.download_file) + - filename: Optional filename with extension. If not provided, will extract from URL + - default_extension: Default extension if filename cannot be determined (default: "bin") + - returns: attachment_id, filename, size, mime_type, truncated + +- **dingtalk_upload_content**: Save content to a file and upload to Wegent + - content: The text content to save (e.g., markdown from dingtalk-docs.get_document_content) + - filename: Filename with extension (e.g., "document.md") + - encoding: Text encoding (default: "utf-8") + - returns: attachment_id, filename, size, mime_type, truncated + ## Usage Notes - All operations inherit the current user's permissions @@ -78,20 +111,20 @@ You now have access to Wegent Knowledge Base management tools. 
- Avoid loops: if a tool call fails, report the error once and stop retrying/re-loading the skill unless the user changes inputs. - Long documents should be read incrementally: start with the backend default limit, then continue with `offset = previous_offset + previous_returned_length` while `has_more=true` -## Example Workflow +## Example Workflows for Knowledge Base Management 1. First, list available knowledge bases: - ``` + ```python wegent_kb_list_knowledge_bases(scope="all") ``` 2. List documents in a specific knowledge base: - ``` + ```python wegent_kb_list_documents(knowledge_base_id=123) ``` 3. Create a new knowledge base: - ``` + ```python wegent_kb_create_knowledge_base( name="My KB", description="My personal notes", @@ -101,7 +134,7 @@ You now have access to Wegent Knowledge Base management tools. ``` 4. Create a new text document: - ``` + ```python wegent_kb_create_document( knowledge_base_id=123, name="Meeting Notes", @@ -111,7 +144,7 @@ You now have access to Wegent Knowledge Base management tools. ``` 5. Update document content: - ```text + ```python wegent_kb_update_document_content( document_id=456, content="Updated notes with new information...", @@ -120,7 +153,7 @@ You now have access to Wegent Knowledge Base management tools. ``` 6. Read long document content incrementally: - ```text + ```python wegent_kb_read_document_content( document_id=456, offset=0 @@ -128,7 +161,7 @@ You now have access to Wegent Knowledge Base management tools. ``` 7. Search knowledge base using RAG retrieval: - ```text + ```python wegent_kb_search_knowledge_base( knowledge_base_id=123, query="How to configure the system?", @@ -137,7 +170,7 @@ You now have access to Wegent Knowledge Base management tools. ``` 8. Search within specific documents: - ```text + ```python wegent_kb_search_knowledge_base( knowledge_base_id=123, query="deployment steps", @@ -145,3 +178,245 @@ You now have access to Wegent Knowledge Base management tools. 
document_ids=[456, 789] ) ``` + +## DingTalk Document Upload + +This skill also supports uploading DingTalk documents, spreadsheets, and AI tables to **Wegent** knowledge bases. + +**Important:** This feature reads documents FROM DingTalk and uploads them TO Wegent's knowledge base. The final document will be stored in Wegent, not in DingTalk. + +### When To Use DingTalk Upload + +- The user wants to add a DingTalk document to a knowledge base +- The user mentions URLs like: + - `https://alidocs.dingtalk.com/i/nodes/...` (DingTalk Docs) + - `https://alidocs.dingtalk.com/i/spreadsheet/...` (DingTalk Table) + - `https://alidocs.dingtalk.com/i/ai/...` (DingTalk AI Table) + +### DingTalk Upload Workflow + +When user wants to upload a DingTalk document to knowledge base: + +#### Step 1: Parse URL +Identify document type from the DingTalk URL: +- Docs: `https://alidocs.dingtalk.com/i/nodes/{node_id}` +- Table: `https://alidocs.dingtalk.com/i/spreadsheet/{doc_id}` +- AI Table: `https://alidocs.dingtalk.com/i/ai/{doc_id}` + +#### Step 2: Get Document Info +**IMPORTANT: Always call get_document_info first!** This returns document metadata including `contentType` and `file_extension` fields which determine which tool to use next. + +For docs (dingtalk-docs MCP): +```python +dingtalk-docs.get_document_info(document_id="xxx") +``` + +Returns key fields: +- `name`: Document title +- `contentType`: Document content type (ALIDOC or other) +- `file_extension`: File extension (adoc, axls, able, docx, xlsx, etc.) +- `nodeType`: Node type (file, folder, etc.) 
+ +#### Step 3: Route Based on contentType and extension + +Based on `get_document_info` response, choose the appropriate tool: + +| contentType | file_extension | Tool to Use | +|-------------|----------------|-------------| +| ALIDOC | adoc | `get_document_content(nodeId)` - Returns Markdown content | +| ALIDOC | axls | Use dingtalk-table MCP: `get_all_sheets(nodeId)` then `get_range(nodeId, sheetId, range)` | +| ALIDOC | able | Use dingtalk-ai-table MCP: `get_tables(nodeId)` then `query_records(nodeId, tableId)` | +| ≠ALIDOC | file | `download_file(nodeId)` - Returns file download URL | + +##### Option A: Online Document (adoc) - contentType=ALIDOC, extension=adoc +Call `get_document_content` to get markdown content: +```python +dingtalk-docs.get_document_content(nodeId="xxx") +``` +Returns: `{"markdown": "# Title\n\nContent...", ...}` + +Then proceed to Step 4B. + +##### Option B: Spreadsheet (axls) - contentType=ALIDOC, extension=axls +Use dingtalk-table MCP: +```python +# Get all sheets +dingtalk-table.get_all_sheets(nodeId="xxx") +# Returns: [{"sheetId": "1", "name": "Sheet1"}, ...] + +# Read data from specific sheet - use a large range to capture all rows +dingtalk-table.get_range(nodeId="xxx", sheetId="1", range="A1:Z10000") +``` + +**⚠️ IMPORTANT: Use a sufficiently large row range.** A small range such as `A1:Z100` only fetches 100 rows. For real spreadsheets, use `A1:Z10000` (as in the example above) or larger. If the tool returns a `hasMore` or similar field indicating more data exists, keep fetching with an incremented row offset until all rows are retrieved. + +##### Option C: AI Table (able) - contentType=ALIDOC, extension=able +Use dingtalk-ai-table MCP: +```python +# Get all tables (nodeId is baseId) +dingtalk-ai-table.get_tables(nodeId="xxx") +# Returns: [{"tableId": "1", "name": "Table1"}, ...] 
+ +# Query records with pagination - MUST loop until all records are fetched +# First call (no pageToken) +result = dingtalk-ai-table.query_records(nodeId="xxx", tableId="1") +all_records = result["records"] + +# Continue fetching if there are more pages +while result.get("pageToken"): + result = dingtalk-ai-table.query_records( + nodeId="xxx", tableId="1", pageToken=result["pageToken"] + ) + all_records.extend(result["records"]) +``` + +**⚠️ IMPORTANT: AI Table pagination is mandatory.** `query_records` returns a `pageToken` field when there are more records. You MUST keep calling `query_records` with the returned `pageToken` until `pageToken` is absent or empty. Uploading only the first page will result in incomplete data. + +##### Option D: File-based Document - contentType≠ALIDOC, nodeType=file +Call `download_file` to get download URL: +```python +dingtalk-docs.download_file(nodeId="xxx") +``` +Returns: `{"download_url": "https://...", "download_token": "..."}` + +Then proceed to Step 4A. + +#### Step 4A: Download and Upload File (for file-based documents) + +If you got a `download_url` from Option D, use the `dingtalk_upload_file_from_url` tool to download the file and upload it to Wegent: + +```python +dingtalk_upload_file_from_url( + download_url="https://alidocs.dingtalk.com/...", + filename="Specifications.docx" +) +``` + +Returns: `{"attachment_id": 123, "filename": "Specifications.docx", "size": 10240}` + +**Parameters:** +- `download_url`: The URL to download the file from (from `dingtalk-docs.download_file`) +- `filename`: Filename with extension (e.g., "document.docx"). 
If not provided, will extract from URL +- `default_extension`: Default extension if filename cannot be determined (default: "bin") + +#### Step 4B: Save Content and Upload (for online documents) + +If you got content from adoc or axls or able, use the `dingtalk_upload_content` tool to save the content to a file and upload it to Wegent: + +```python +dingtalk_upload_content( + content="# Title\n\nContent from get_document_content...", + filename="Specifications.md" +) +``` + +Returns: `{"attachment_id": 456, "filename": "Specifications.md", "size": 2048}` + +**Parameters:** +- `content`: The text content to save (e.g., markdown from `dingtalk-docs.get_document_content`) +- `filename`: Filename with extension (e.g., "document.md") +- `encoding`: Text encoding (default: "utf-8") + +**Important:** The content from `get_document_content` is always in markdown format, so save it with `.md` extension. + +#### Step 5: Create Document in Wegent Knowledge Base + +**⚠️ CRITICAL:** When calling `wegent_kb_create_document`, `source_type` MUST be set to `"attachment"`. + +Use the **wegent-knowledge** MCP server's `wegent_kb_create_document` tool to create the document in **Wegent's** knowledge base: + +```python +wegent_kb_create_document( + knowledge_base_id=123, # REQUIRED: The target Wegent knowledge base ID + name="Document Name", # REQUIRED: Use the document name from get_document_info + source_type="attachment", # REQUIRED: Must be "attachment" since we have attachment_id + attachment_id=456, # REQUIRED: From Step 4A or 4B + trigger_indexing=True # REQUIRED +) +``` + +**⚠️ CRITICAL - DO NOT MAKE THIS MISTAKE:** +- **CORRECT:** `wegent_kb_create_document` - Creates document in Wegent KB ✓ +- **WRONG:** `dingtalk-docs.create_document` or `create_document` - Creates document in DingTalk KB ✗ + +**NEVER** call `dingtalk-docs.create_document` or plain `create_document` - this would create the document in DingTalk's knowledge base, which is NOT what the user wants. 
The user wants to add the DingTalk document to Wegent's knowledge base. + +The `dingtalk-docs` MCP server is **ONLY** for reading documents from DingTalk. The `wegent-knowledge` MCP server's `wegent_kb_*` tools are for managing Wegent's knowledge base. + +Parameters: +- `knowledge_base_id`: **Required.** The target Wegent knowledge base ID (integer). Get this from user input or by listing knowledge bases. +- `name`: **Required.** Document name (use the document name from get_document_info) +- `source_type`: **Required.** Use "attachment" since we have an attachment_id +- `attachment_id`: **Required.** The attachment ID returned from Step 4A or 4B +- `trigger_indexing`: Set to true to enable RAG indexing + +**Common Error:** If you see "knowledge_base_id: Field required", it means you forgot to include the `knowledge_base_id` parameter in the tool call. All four parameters marked Required above (knowledge_base_id, name, source_type, attachment_id) must be included. + +### DingTalk Error Handling + +If MCP tools report a permissions or configuration problem: +1. Check if the user has configured the corresponding DingTalk MCP service +2. Guide the user to Settings → Integrations → DingTalk to configure +3. Do not proceed without proper configuration + +### DingTalk Upload Examples + +#### Example 1: File-based Document (Word, Excel, PDF, etc.) + +User: "将钉钉文档 https://alidocs.dingtalk.com/i/nodes/nYMoOje9 添加到知识库 '产品文档'" + +Steps: +1. Parse URL → node_id="nYMoOje9" +2. Call dingtalk-docs.get_document_info(document_id="nYMoOje9") + - Returns: `{"name": "Specifications", "contentType": "FILE", "file_extension": "docx", "nodeType": "file", ...}` +3. Since contentType≠ALIDOC and nodeType=file, call dingtalk-docs.download_file(nodeId="nYMoOje9") + - Returns: `{"download_url": "https://...", "download_token": "..."}` +4. 
Use **dingtalk_upload_file_from_url** to download and upload: + ```python + dingtalk_upload_file_from_url( + download_url="https://alidocs.dingtalk.com/...", + filename="Specifications.docx" + ) + ``` + - Returns: `{"attachment_id": 123, ...}` +5. Call **wegent_kb_create_document** to create document in Wegent knowledge base: + ```python + wegent_kb_create_document( + knowledge_base_id=1, # REQUIRED: Target Wegent knowledge base ID + name="Specifications", # REQUIRED: From get_document_info + source_type="attachment", # REQUIRED + attachment_id=123, # REQUIRED: From step 4 + trigger_indexing=True, # REQUIRED + ) + ``` + **Result:** Document is now available in Wegent's knowledge base (NOT in DingTalk). + +#### Example 2: Online Document (adoc) + +User: "将钉钉文档 https://alidocs.dingtalk.com/i/nodes/AbCdEfGh 添加到知识库" + +Steps: +1. Parse URL → node_id="AbCdEfGh" +2. Call dingtalk-docs.get_document_info(document_id="AbCdEfGh") + - Returns: `{"name": "Specifications", "contentType": "ALIDOC", "file_extension": "adoc", ...}` +3. Since contentType=ALIDOC and extension=adoc, call dingtalk-docs.get_document_content(nodeId="AbCdEfGh") + - Returns: `{"markdown": "# Title\n\nContent...", ...}` +4. Use **dingtalk_upload_content** to save and upload: + ```python + dingtalk_upload_content( + content="# Title\n\nContent from DingTalk...", + filename="Specifications.md" + ) + ``` + - Returns: `{"attachment_id": 456, ...}` +5. Call **wegent_kb_create_document** to create document in Wegent knowledge base: + ```python + wegent_kb_create_document( + knowledge_base_id=1, # REQUIRED: Target Wegent knowledge base ID + name="Specifications", # REQUIRED: From get_document_info + source_type="attachment", # REQUIRED + attachment_id=456, # REQUIRED: From step 4 + trigger_indexing=True, # REQUIRED + ) + ``` + **Result:** Document is now available in Wegent's knowledge base (NOT in DingTalk).