wecode-ai · DavidLeeUX · Apr 26, 2026 · Apr 27, 2026 · Apr 28, 2026 · coderabbitai
diff --git a/backend/.env.example b/backend/.env.example
@@ -210,6 +210,93 @@ WIKI_MAX_CONTENT_SIZE=10485760
 # Base URL for internal wiki content writer
 WIKI_CONTENT_WRITE_BASE_URL=http://backend:8000
 
+
+
+# ==========================================
+# Knowledge Document Conversion Configuration
+# 知识库文档转换配置（PDF/PPTX 等转换为 Markdown）
+# ==========================================
+
+# Master switch for document conversion feature
+# 总开关：true 开启转换功能，false 关闭（走原来直接索引的逻辑）
+KNOWLEDGE_CONVERSION_ENABLED=false
+
+# Comma-separated list of file extensions that need conversion to markdown
+# 需要转换为 Markdown 的文件扩展名列表（逗号分隔），如：pdf,pptx,docx
+# 仅在 KNOWLEDGE_CONVERSION_ENABLED=true 时生效
+KNOWLEDGE_CONVERSION_FILE_TYPES=pdf
+
+# Celery queue name for document conversion tasks
+# 文档转换任务的 Celery 队列名称
+KNOWLEDGE_CONVERSION_QUEUE=knowledge_conversion
+
+# Stale detection timeout for CONVERTING status (seconds)
+# 转换状态过期时间（秒），超过此时间视为卡住，可被重新入队
+KNOWLEDGE_INDEX_STALE_CONVERTING_SECONDS=1800
+
+# Conversion task distributed lock configuration (seconds; the lock timeout should exceed the task soft time limit; code default is 2000)
+# 转换任务分布式锁配置
+KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS=10000
+KNOWLEDGE_CONVERSION_LOCK_EXTEND_INTERVAL_SECONDS=60
+KNOWLEDGE_CONVERSION_LOCK_MAX_RETRIES=2
+KNOWLEDGE_CONVERSION_LOCK_RETRY_DELAY_SECONDS=30
+
+# ==========================================
+# MinerU API Configuration
+# PDF 转 Markdown 的 MinerU API 配置
+# ==========================================
+
+# Base URL for MinerU API service
+# MinerU API 服务地址，为空则禁用 PDF 转换
+MINERU_API_BASE_URL=
+
+# MinerU backend type: "pipeline" or other supported backends
+MINERU_BACKEND=pipeline
+
+# MinerU parse method: "ocr", "auto", etc.
+MINERU_PARSE_METHOD=ocr
+
+# Language list for OCR (comma-separated, e.g., "ch,en")
+MINERU_LANG_LIST=ch
+
+# Enable formula recognition
+MINERU_FORMULA_ENABLE=true
+
+# Enable table recognition
+MINERU_TABLE_ENABLE=true
+
+# Polling interval for task status checks (seconds)
+MINERU_POLL_INTERVAL_SECONDS=3
+
+# Maximum time to wait for MinerU task completion (seconds)
+MINERU_MAX_WAIT_SECONDS=600
+
+# ==========================================
+# Document Conversion S3 Storage Configuration
+# MinerU 提取图片的 S3 存储配置
+# ==========================================
+
+# Enable S3 upload for extracted images
+# 是否启用 S3 图片上传
+WORKER_CONVERSION_S3_ENABLED=false
+
+# S3 endpoint URL (e.g., MinIO or AWS S3)
+# S3 服务端点地址
+WORKER_CONVERSION_S3_ENDPOINT=
+
+# S3 access key
+WORKER_CONVERSION_S3_ACCESS_KEY=
+
+# S3 secret key
+WORKER_CONVERSION_S3_SECRET_KEY=
+
+# S3 bucket name for storing images
+# S3 存储桶名称
+WORKER_CONVERSION_S3_BUCKET_NAME=
+
+# S3 region name
+WORKER_CONVERSION_S3_REGION_NAME=us-east-1
+
 # Data Table Configuration
 # JSON string containing table provider credentials (DingTalk, etc.)
 # Format: {"dingtalk":{"appKey":"YOUR_APP_KEY","appSecret":"YOUR_APP_SECRET","operatorId":"YOUR_OPERATOR_ID","userMapping":{"baseId":"YOUR_BASE_ID","sheetId":"YOUR_SHEET_ID"}}}

diff --git a/backend/app/core/celery_app.py b/backend/app/core/celery_app.py
@@ -43,6 +43,7 @@
     include=[
         "app.tasks.subscription_tasks",
         "app.tasks.knowledge_tasks",
+        "app.tasks.conversion_tasks",
     ],
 )
 
@@ -67,6 +68,14 @@
     task_default_retry_delay=60,  # 1 minute default retry delay
     # Default queue configuration
     task_default_queue=settings.CELERY_TASK_DEFAULT_QUEUE,
+    # Task routing: conversion tasks go to dedicated queue
+    # Main worker does NOT consume this queue
+    # Conversion worker: celery -A app.core.celery_app worker --queues=knowledge_conversion
+    task_routes={
+        "app.tasks.conversion_tasks.*": {
+            "queue": settings.KNOWLEDGE_CONVERSION_QUEUE,
+        },
+    },
     # Beat schedule for periodic tasks
     beat_schedule={
         "check-due-subscriptions": {

diff --git a/backend/app/core/config.py b/backend/app/core/config.py
@@ -330,6 +330,58 @@ def parse_rag_runtime_mode(cls, v: Any) -> str | dict[str, str]:
     KNOWLEDGE_INDEX_STALE_QUEUED_SECONDS: int = 600
     KNOWLEDGE_INDEX_STALE_INDEXING_SECONDS: int = 2700
 
+    # Knowledge document conversion configuration
+    # Master switch for document conversion feature
+    # When False, conversion is disabled and files are indexed directly (original behavior)
+    # When True, files matching KNOWLEDGE_CONVERSION_FILE_TYPES will be converted to markdown
+    KNOWLEDGE_CONVERSION_ENABLED: bool = False
+
+    # Comma-separated list of file extensions that need conversion to markdown
+    # before indexing. Example: "pdf,pptx,docx"
+    # Only used when KNOWLEDGE_CONVERSION_ENABLED is True.
+    KNOWLEDGE_CONVERSION_FILE_TYPES: str = ""
+
+    # Celery queue name for document conversion tasks
+    KNOWLEDGE_CONVERSION_QUEUE: str = "knowledge_conversion"
+
+    # Stale detection timeout for CONVERTING status (seconds, default 30 min)
+    KNOWLEDGE_INDEX_STALE_CONVERTING_SECONDS: int = 1800
+
+    # Conversion task distributed lock configuration
+    # Lock timeout should be longer than task soft_time_limit to prevent premature expiration
+    KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS: int = 2000
+    KNOWLEDGE_CONVERSION_LOCK_EXTEND_INTERVAL_SECONDS: int = 60
+    KNOWLEDGE_CONVERSION_LOCK_MAX_RETRIES: int = 2
+    KNOWLEDGE_CONVERSION_LOCK_RETRY_DELAY_SECONDS: int = 30
+
+    # MinerU API configuration for PDF to Markdown conversion
+    # Base URL for MinerU API service (e.g., "http://mineru-api:8000")
+    MINERU_API_BASE_URL: str = ""
+    # MinerU backend type: "pipeline" or other supported backends
+    MINERU_BACKEND: str = "pipeline"
+    # MinerU parse method: "ocr", "auto", etc.
+    MINERU_PARSE_METHOD: str = "ocr"
+    # Language list for OCR (comma-separated, e.g., "ch,en")
+    MINERU_LANG_LIST: str = "ch"
+    # Enable formula recognition
+    MINERU_FORMULA_ENABLE: bool = True
+    # Enable table recognition
+    MINERU_TABLE_ENABLE: bool = True
+    # Polling interval for task status checks (seconds)
+    MINERU_POLL_INTERVAL_SECONDS: int = 3
+    # Maximum time to wait for MinerU task completion (seconds, default 10 min)
+    MINERU_MAX_WAIT_SECONDS: int = 600
+
+    # Document conversion S3 storage configuration for extracted images
+    # When enabled, images extracted by MinerU will be uploaded to S3
+    # and markdown image references will be updated to S3 URLs
+    WORKER_CONVERSION_S3_ENABLED: bool = False
+    WORKER_CONVERSION_S3_ENDPOINT: str = ""
+    WORKER_CONVERSION_S3_ACCESS_KEY: str = ""
+    WORKER_CONVERSION_S3_SECRET_KEY: str = ""
+    WORKER_CONVERSION_S3_BUCKET_NAME: str = ""
+    WORKER_CONVERSION_S3_REGION_NAME: str = "us-east-1"
+
     # Circuit breaker configuration
     CIRCUIT_BREAKER_FAIL_MAX: int = 5  # Open circuit after 5 consecutive failures
     CIRCUIT_BREAKER_RESET_TIMEOUT: int = 60  # Try to recover after 60 seconds
@@ -595,6 +647,26 @@ def parse_rag_runtime_mode(cls, v: Any) -> str | dict[str, str]:
     # Use: from shared.telemetry.config import get_otel_config
     # All OTEL_* environment variables are read from there
 
+    def needs_conversion(self, file_extension: str) -> bool:
+        """Check if a file extension requires conversion before indexing.
+
+        Conversion only occurs when KNOWLEDGE_CONVERSION_ENABLED is True,
+        KNOWLEDGE_CONVERSION_FILE_TYPES is not empty, and the extension is in
+        that comma-separated list. Comparison is case-insensitive and ignores
+        surrounding whitespace and leading dots on both sides, so ".PDF"
+        matches a configured " .pdf ". A missing/empty extension never matches.
+        """
+        if not self.KNOWLEDGE_CONVERSION_ENABLED:
+            return False
+        if not self.KNOWLEDGE_CONVERSION_FILE_TYPES:
+            return False
+        # Tolerate None/"" so an extension-less file simply skips conversion.
+        ext = (file_extension or "").strip().lstrip(".").lower()
+        configured = self.KNOWLEDGE_CONVERSION_FILE_TYPES.split(",")
+        # Normalize entries like the input so ".pdf" and "pdf" are the same.
+        types = {t.strip().lstrip(".").lower() for t in configured}
+        return bool(ext) and ext in types
+
     def get_rag_runtime_mode(self, operation: str) -> str:
         """Resolve the effective RAG runtime mode for an operation."""
         config = self.RAG_RUNTIME_MODE

diff --git a/backend/app/models/knowledge.py b/backend/app/models/knowledge.py
@@ -53,6 +53,7 @@ class DocumentIndexStatus(str, PyEnum):
 
     NOT_INDEXED = "not_indexed"
     QUEUED = "queued"
+    CONVERTING = "converting"
     INDEXING = "indexing"
     SUCCESS = "success"
     FAILED = "failed"

diff --git a/backend/app/schemas/knowledge.py b/backend/app/schemas/knowledge.py
@@ -48,6 +48,7 @@ class DocumentIndexStatus(str, Enum):
 
     NOT_INDEXED = "not_indexed"
     QUEUED = "queued"
+    CONVERTING = "converting"
     INDEXING = "indexing"
     SUCCESS = "success"
     FAILED = "failed"

diff --git a/backend/app/services/knowledge/index_state_machine.py b/backend/app/services/knowledge/index_state_machine.py
@@ -45,6 +45,7 @@ class IndexExecutionDecision:
 
 ACTIVE_INDEX_STATUSES = {
     DocumentIndexStatus.QUEUED,
+    DocumentIndexStatus.CONVERTING,
     DocumentIndexStatus.INDEXING,
 }
 
@@ -74,6 +75,12 @@ def _get_active_index_stale_reason(
     ):
         return "stale_indexing"
 
+    if (
+        document.index_status == DocumentIndexStatus.CONVERTING
+        and age_seconds >= settings.KNOWLEDGE_INDEX_STALE_CONVERTING_SECONDS
+    ):
+        return "stale_converting"
+
     return None
 
 
@@ -420,6 +427,129 @@ def mark_document_index_succeeded(
     return updated > 0
 
 
+@trace_sync(
+    span_name="knowledge.mark_document_conversion_started",
+    tracer_name="knowledge.state_machine",
+    extract_attributes=lambda db, document_id, generation: {
+        "knowledge.document_id": document_id,
+        "knowledge.index_generation": generation,
+    },
+)
+def mark_document_conversion_started(
+    db: Session,
+    document_id: int,
+    generation: int,
+) -> IndexExecutionDecision:
+    """Claim a queued document for conversion (QUEUED -> CONVERTING).
+
+    The document row is loaded with a row lock (``with_for_update``). The
+    transition is refused, and the transaction rolled back, when the row is
+    missing, when its index_generation differs from ``generation``, or when
+    its status is anything other than QUEUED. Every outcome is reported
+    through ``_record_transition``.
+
+    Args:
+        db: Session used for the locked read and the status update.
+        document_id: Value matched against KnowledgeDocument.id.
+        generation: Expected value of the row's index_generation.
+
+    Returns:
+        IndexExecutionDecision whose should_execute is True only when the
+        row was moved to CONVERTING and committed; reason names the outcome.
+    """
+
+    def _skipped(reason: str, **details) -> IndexExecutionDecision:
+        # Shared tail of every refusal: emit the event, build the decision.
+        _record_transition(
+            "knowledge.conversion.start.skipped",
+            document_id=document_id,
+            generation=generation,
+            reason=reason,
+            **details,
+        )
+        return IndexExecutionDecision(should_execute=False, reason=reason)
+
+    by_id = db.query(KnowledgeDocument).filter(KnowledgeDocument.id == document_id)
+    document = by_id.with_for_update().first()
+
+    if document is None:
+        db.rollback()
+        return _skipped("document_not_found")
+
+    if document.index_generation != generation:
+        db.rollback()
+        # The status is read after the rollback, matching what gets reported.
+        return _skipped("stale_generation", previous_status=document.index_status)
+
+    # A row without a status is treated as NOT_INDEXED.
+    current_status = document.index_status or DocumentIndexStatus.NOT_INDEXED
+    if current_status != DocumentIndexStatus.QUEUED:
+        db.rollback()
+        return _skipped(
+            f"unexpected_status_{current_status.value}",
+            previous_status=current_status,
+        )
+
+    # All guards passed: claim the document and persist the new status.
+    document.index_status = DocumentIndexStatus.CONVERTING
+    document.updated_at = _utcnow()
+    db.commit()
+
+    accepted_reason = "conversion_started"
+    _record_transition(
+        "knowledge.conversion.start.accepted",
+        document_id=document_id,
+        generation=generation,
+        reason=accepted_reason,
+        previous_status=current_status,
+    )
+    return IndexExecutionDecision(should_execute=True, reason=accepted_reason)
+
+
+@trace_sync(
+    span_name="knowledge.mark_document_conversion_succeeded",
+    tracer_name="knowledge.state_machine",
+    extract_attributes=lambda db, document_id, generation: {
+        "knowledge.document_id": document_id,
+        "knowledge.index_generation": generation,
+    },
+)
+def mark_document_conversion_succeeded(
+    db: Session,
+    document_id: int,
+    generation: int,
+) -> bool:
+    """Hand a converted document back to the queue (CONVERTING -> QUEUED).
+
+    A single conditional UPDATE flips the status only when the row still has
+    the given generation and is in CONVERTING; otherwise no row is changed.
+    The document returns to QUEUED so index_document_task can proceed with
+    the normal QUEUED -> INDEXING transition.
+
+    Returns True when a row was updated, False when none matched.
+    """
+    matching = db.query(KnowledgeDocument).filter(
+        KnowledgeDocument.id == document_id,
+        KnowledgeDocument.index_generation == generation,
+        KnowledgeDocument.index_status == DocumentIndexStatus.CONVERTING,
+    )
+    new_values = {
+        KnowledgeDocument.index_status: DocumentIndexStatus.QUEUED,
+        KnowledgeDocument.updated_at: _utcnow(),
+    }
+    updated = matching.update(new_values, synchronize_session=False)
+    db.commit()
+
+    transitioned = updated > 0
+    _record_transition(
+        "knowledge.conversion.finalize.success",
+        document_id=document_id,
+        generation=generation,
+        reason="converted" if transitioned else "stale_or_already_finalized",
+    )
+    return transitioned
+
+
 @trace_sync(
     span_name="knowledge.mark_document_index_failed",
     tracer_name="knowledge.state_machine",