-
Notifications
You must be signed in to change notification settings - Fork 92
feat(knowledge): add document conversion pipeline (PDF/Office to Markdown) #1031
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -330,6 +330,58 @@ def parse_rag_runtime_mode(cls, v: Any) -> str | dict[str, str]: | |
| KNOWLEDGE_INDEX_STALE_QUEUED_SECONDS: int = 600 | ||
| KNOWLEDGE_INDEX_STALE_INDEXING_SECONDS: int = 2700 | ||
|
|
||
| # Knowledge document conversion configuration | ||
| # Master switch for document conversion feature | ||
| # When False, conversion is disabled and files are indexed directly (original behavior) | ||
| # When True, files matching KNOWLEDGE_CONVERSION_FILE_TYPES will be converted to markdown | ||
| KNOWLEDGE_CONVERSION_ENABLED: bool = False | ||
|
|
||
| # Comma-separated list of file extensions that need conversion to markdown | ||
| # before indexing. Example: "pdf,pptx,docx" | ||
| # Only used when KNOWLEDGE_CONVERSION_ENABLED is True. | ||
| KNOWLEDGE_CONVERSION_FILE_TYPES: str = "" | ||
|
|
||
| # Celery queue name for document conversion tasks | ||
| KNOWLEDGE_CONVERSION_QUEUE: str = "knowledge_conversion" | ||
|
|
||
| # Stale detection timeout for CONVERTING status (seconds, default 30 min) | ||
| KNOWLEDGE_INDEX_STALE_CONVERTING_SECONDS: int = 1800 | ||
|
|
||
| # Conversion task distributed lock configuration | ||
| # Lock timeout should be longer than task soft_time_limit to prevent premature expiration | ||
| KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS: int = 2000 | ||
| KNOWLEDGE_CONVERSION_LOCK_EXTEND_INTERVAL_SECONDS: int = 60 | ||
| KNOWLEDGE_CONVERSION_LOCK_MAX_RETRIES: int = 2 | ||
| KNOWLEDGE_CONVERSION_LOCK_RETRY_DELAY_SECONDS: int = 30 | ||
|
Comment on lines
+347
to
+355
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Default conversion timeouts still undershoot the worker limits. These defaults can mark or unlock an in-flight conversion long before the worker's own task time limit is reached. 🤖 Prompt for AI Agents |
||
|
|
||
| # MinerU API configuration for PDF to Markdown conversion | ||
| # Base URL for MinerU API service (e.g., "http://10.2.40.157:8367") | ||
| MINERU_API_BASE_URL: str = "" | ||
| # MinerU backend type: "pipeline" or other supported backends | ||
| MINERU_BACKEND: str = "pipeline" | ||
| # MinerU parse method: "ocr", "auto", etc. | ||
| MINERU_PARSE_METHOD: str = "ocr" | ||
| # Language list for OCR (comma-separated, e.g., "ch,en") | ||
| MINERU_LANG_LIST: str = "ch" | ||
| # Enable formula recognition | ||
| MINERU_FORMULA_ENABLE: bool = True | ||
| # Enable table recognition | ||
| MINERU_TABLE_ENABLE: bool = True | ||
| # Polling interval for task status checks (seconds) | ||
| MINERU_POLL_INTERVAL_SECONDS: int = 3 | ||
| # Maximum time to wait for MinerU task completion (seconds, default 10 min) | ||
| MINERU_MAX_WAIT_SECONDS: int = 600 | ||
|
|
||
| # Document conversion S3 storage configuration for extracted images | ||
| # When enabled, images extracted by MinerU will be uploaded to S3 | ||
| # and markdown image references will be updated to S3 URLs | ||
| WORKER_CONVERSION_S3_ENABLED: bool = False | ||
| WORKER_CONVERSION_S3_ENDPOINT: str = "" | ||
| WORKER_CONVERSION_S3_ACCESS_KEY: str = "" | ||
| WORKER_CONVERSION_S3_SECRET_KEY: str = "" | ||
| WORKER_CONVERSION_S3_BUCKET_NAME: str = "" | ||
| WORKER_CONVERSION_S3_REGION_NAME: str = "us-east-1" | ||
|
|
||
| # Circuit breaker configuration | ||
| CIRCUIT_BREAKER_FAIL_MAX: int = 5 # Open circuit after 5 consecutive failures | ||
| CIRCUIT_BREAKER_RESET_TIMEOUT: int = 60 # Try to recover after 60 seconds | ||
|
|
@@ -595,6 +647,26 @@ def parse_rag_runtime_mode(cls, v: Any) -> str | dict[str, str]: | |
| # Use: from shared.telemetry.config import get_otel_config | ||
| # All OTEL_* environment variables are read from there | ||
|
|
||
def needs_conversion(self, file_extension: str) -> bool:
    """Check if a file extension requires conversion before indexing.

    Conversion only occurs when:
    1. KNOWLEDGE_CONVERSION_ENABLED is True (master switch)
    2. KNOWLEDGE_CONVERSION_FILE_TYPES is not empty
    3. The file extension is in the conversion list

    The argument and every configured entry are normalized the same way
    (surrounding whitespace removed, leading dots removed, lower-cased),
    so "pdf", ".PDF" and " .pdf " are treated as the same extension on
    either side of the comparison.

    Args:
        file_extension: Extension to test, with or without a leading dot.

    Returns:
        True if the extension is listed for conversion and the feature is
        enabled, otherwise False.
    """
    if not self.KNOWLEDGE_CONVERSION_ENABLED:
        return False
    if not self.KNOWLEDGE_CONVERSION_FILE_TYPES:
        return False

    def _normalize(raw: str) -> str:
        # One normalization for both sides keeps ".pdf" in the config
        # from silently never matching "pdf" from the caller.
        return raw.strip().lstrip(".").lower()

    ext = _normalize(file_extension)
    if not ext:
        # An empty extension must never match, even if the configured
        # list contains blank or dot-only entries.
        return False

    configured = {
        _normalize(entry)
        for entry in self.KNOWLEDGE_CONVERSION_FILE_TYPES.split(",")
    }
    return ext in configured
|
|
||
| def get_rag_runtime_mode(self, operation: str) -> str: | ||
| """Resolve the effective RAG runtime mode for an operation.""" | ||
| config = self.RAG_RUNTIME_MODE | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `converting` stale window is shorter than the allowed task runtime. `KNOWLEDGE_INDEX_STALE_CONVERTING_SECONDS=1800` marks a conversion as stuck after 30 minutes, but the conversion task is allowed to run up to 9000s soft / 10000s hard. That means a healthy long-running conversion can be taken over as “stale” long before the worker actually times out, which undermines the new generation/state-machine logic.
🧰 Tools
🪛 dotenv-linter (4.0.0)
[warning] 240-240: [UnorderedKey] The KNOWLEDGE_CONVERSION_LOCK_EXTEND_INTERVAL_SECONDS key should go before the KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS key
(UnorderedKey)
[warning] 241-241: [UnorderedKey] The KNOWLEDGE_CONVERSION_LOCK_MAX_RETRIES key should go before the KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS key
(UnorderedKey)
[warning] 242-242: [UnorderedKey] The KNOWLEDGE_CONVERSION_LOCK_RETRY_DELAY_SECONDS key should go before the KNOWLEDGE_CONVERSION_LOCK_TIMEOUT_SECONDS key
(UnorderedKey)
🤖 Prompt for AI Agents