
Commit 600d217

Refactor docparse.py for better readability and organization

1 parent ea0536c

File tree
1 file changed: +103 −121 lines changed

databend_aiserver/udfs/docparse.py

Lines changed: 103 additions & 121 deletions
@@ -15,12 +15,12 @@
 from __future__ import annotations
 
 import logging
-from collections import OrderedDict
 import mimetypes
 import os
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Protocol
+from time import perf_counter, perf_counter_ns
+from typing import Any, Dict, List, Optional, Protocol, Tuple
 
 from databend_udf import StageLocation, udf
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -33,8 +33,6 @@
 from docling.chunking import HybridChunker
 from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
 from transformers import AutoTokenizer
-from opendal import exceptions as opendal_exceptions
-from time import perf_counter, perf_counter_ns
 
 from databend_aiserver.runtime import DeviceRequest, choose_device, get_runtime
 from databend_aiserver.stages.operator import (
@@ -69,11 +67,9 @@ class _DoclingBackend:
     name = "docling"
 
     def __init__(self) -> None:
-        self.choice = self._choose_device()
-        self.accel = self._build_accelerator(self.choice)
-        self.ocr_provider = self._select_ocr_provider()
+        self.accel = self._build_accelerator()
 
-    def _choose_device(self):
+    def _build_accelerator(self):
         override = os.getenv("AISERVER_DOCLING_DEVICE")
         req = DeviceRequest(task="docling", allow_gpu=True, allow_mps=True, explicit=override)
         choice = choose_device(req)
@@ -84,9 +80,7 @@ def _choose_device(self):
             choice.reason,
             override,
         )
-        return choice
 
-    def _build_accelerator(self, choice):
         if AcceleratorOptions is None or AcceleratorDevice is None:
             return None
         if choice.device.startswith("cuda"):
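Note: the `AcceleratorOptions is None` guard above implies these docling types are imported defensively at module top, so the module still loads on docling builds that lack them. A minimal sketch of that pattern, assuming the import site and module path (neither is shown in this diff):

    # Hypothetical guarded import; the real import path may differ by docling version.
    try:
        from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
    except ImportError:  # older docling builds without accelerator options
        AcceleratorDevice = None
        AcceleratorOptions = None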
@@ -95,42 +89,26 @@ def _build_accelerator(self, choice):
             return AcceleratorOptions(device=AcceleratorDevice.MPS)
         return AcceleratorOptions(device=AcceleratorDevice.CPU)
 
-    def _select_ocr_provider(self) -> Optional[str]:
-        runtime = get_runtime()
-        providers = runtime.capabilities.onnx_providers
-        if runtime.capabilities.device_kind == "cuda" and "CUDAExecutionProvider" in providers:
-            choice = "CUDAExecutionProvider"
-        else:
-            choice = "CPUExecutionProvider"
-        logger.info("Docling OCR provider: %s (available=%s)", choice, providers)
-        return choice
-
     def _build_converter(self):
-        # Docling expects accelerator via pipeline options, not constructor kwargs.
         format_options: Dict[InputFormat, Any] = {}
         if self.accel is not None:
             pdf_opts = ThreadedPdfPipelineOptions()
             pdf_opts.accelerator_options = self.accel
-            format_options[InputFormat.PDF] = PdfFormatOption(
-                pipeline_options=pdf_opts
-            )
+            format_options[InputFormat.PDF] = PdfFormatOption(pipeline_options=pdf_opts)
 
         try:
-            return DocumentConverter(
-                format_options=format_options if format_options else None
-            )
+            return DocumentConverter(format_options=format_options if format_options else None)
         except TypeError:
-            # Extremely old docling builds may not accept format_options; fall back.
-            logger.warning(
-                "Installed docling version does not support format_options; using defaults"
-            )
+            logger.warning("Installed docling version does not support format_options; using defaults")
             return DocumentConverter()
 
     def convert(self, stage_location: StageLocation, path: str) -> tuple[ConversionResult, int]:
         t_start = perf_counter()
         raw = load_stage_file(stage_location, path)
         suffix = stage_file_suffix(path)
         converter = self._build_converter()
+
+        # Try processing from memory stream first
         if DocumentStream is not None:
             try:
                 stream = DocumentStream(
@@ -139,25 +117,19 @@ def convert(self, stage_location: StageLocation, path: str) -> tuple[ConversionResult, int]:
                     mime_type=mimetypes.guess_type(f"file{suffix}")[0] or "application/octet-stream",
                 )
                 result = converter.convert(stream)
-                logger.info(
-                    "Docling convert path=%s stream=memory bytes=%s duration=%.3fs",
-                    path,
-                    len(raw),
-                    perf_counter() - t_start,
-                )
+                logger.info("Docling convert path=%s stream=memory bytes=%s duration=%.3fs",
+                            path, len(raw), perf_counter() - t_start)
                 return result, len(raw)
             except Exception:
                 pass
+
+        # Fallback to temp file
         with tempfile.TemporaryDirectory() as tmpdir:
             tmp_path = Path(tmpdir) / f"doc{suffix}"
             tmp_path.write_bytes(raw)
             result = converter.convert(tmp_path)
-            logger.info(
-                "Docling convert path=%s stream=tempfile bytes=%s duration=%.3fs",
-                path,
-                len(raw),
-                perf_counter() - t_start,
-            )
+            logger.info("Docling convert path=%s stream=tempfile bytes=%s duration=%.3fs",
+                        path, len(raw), perf_counter() - t_start)
             return result, len(raw)
 
 
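Note: `convert` keeps the memory-first approach but degrades gracefully to a temp file. A minimal standalone sketch of the same pattern, with hypothetical names (`convert_bytes` is not part of this module, and a bare `BytesIO` stands in for `DocumentStream`):

    import tempfile
    from io import BytesIO
    from pathlib import Path

    def convert_bytes(converter, raw: bytes, suffix: str):
        # Try the in-memory route first; fall back to a temp file on any failure.
        try:
            return converter.convert(BytesIO(raw))
        except Exception:
            with tempfile.TemporaryDirectory() as tmpdir:
                tmp = Path(tmpdir) / f"doc{suffix}"
                tmp.write_bytes(raw)
                return converter.convert(tmp)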
@@ -181,6 +153,76 @@ def _get_hf_tokenizer(model_name: str) -> HuggingFaceTokenizer:
     return _TOKENIZER_CACHE[model_name]
 
 
+def _resolve_full_path(stage_location: StageLocation, path: str) -> str:
+    resolved_path = resolve_stage_subpath(stage_location, path)
+    storage = stage_location.storage or {}
+    storage_root = str(storage.get("root", "") or "")
+    bucket = storage.get("bucket") or storage.get("name")
+
+    if storage_root.startswith("s3://"):
+        base = storage_root.rstrip("/")
+        return f"{base}/{resolved_path}"
+    elif bucket:
+        base = f"s3://{bucket}"
+        if storage_root:
+            base = f"{base}/{storage_root.strip('/')}"
+        return f"{base}/{resolved_path}"
+
+    return resolved_path or path
+
+
+def _chunk_document(doc: Any) -> Tuple[List[Dict[str, Any]], bool]:
+    """Chunk the document and return pages/chunks and a fallback flag."""
+    markdown = doc.export_to_markdown()
+    tokenizer = _get_hf_tokenizer(DEFAULT_EMBED_MODEL)
+    chunker = HybridChunker(tokenizer=tokenizer)
+
+    try:
+        chunks = list(chunker.chunk(dl_doc=doc))
+        if not chunks:
+            return [{"index": 0, "content": markdown}], True
+
+        return [
+            {"index": idx, "content": chunker.contextualize(chunk)}
+            for idx, chunk in enumerate(chunks)
+        ], False
+    except Exception:
+        return [{"index": 0, "content": markdown}], True
+
+
+def _format_response(
+    path: str,
+    full_path: str,
+    pages: List[Dict[str, Any]],
+    file_size: int,
+    timings: Dict[str, float],
+    fallback: bool
+) -> Dict[str, Any]:
+    duration_ms = timings["total"]
+    payload: Dict[str, Any] = {
+        "metadata": {
+            "chunk_count": len(pages),
+            "chunk_size": DEFAULT_CHUNK_SIZE,
+            "duration_ms": duration_ms,
+            "file_size": file_size,
+            "filename": Path(path).name,
+            "path": full_path,
+            "timings_ms": timings,
+            "version": 1,
+        },
+        "chunks": pages,
+    }
+
+    if fallback:
+        payload["error_information"] = [
+            {
+                "type": "ChunkingFallback",
+                "message": "chunker failed or returned empty; returned full markdown instead",
+            }
+        ]
+    return payload
+
+
 @udf(
     name="ai_parse_document",
     stage_refs=["stage_location"],
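Note: `_resolve_full_path` has three branches: an absolute s3:// root, a bucket (plus optional relative root), and a plain local path. A self-contained restatement of that branching on a plain dict, under hypothetical names (`join_stage_path` is not part of this module, and the `resolve_stage_subpath` call is replaced by a precomputed `resolved_path`):

    def join_stage_path(storage: dict, resolved_path: str) -> str:
        # Mirrors _resolve_full_path's branching, minus the StageLocation plumbing.
        storage_root = str(storage.get("root", "") or "")
        bucket = storage.get("bucket") or storage.get("name")
        if storage_root.startswith("s3://"):
            return f"{storage_root.rstrip('/')}/{resolved_path}"
        if bucket:
            base = f"s3://{bucket}"
            if storage_root:
                base = f"{base}/{storage_root.strip('/')}"
            return f"{base}/{resolved_path}"
        return resolved_path

    assert join_stage_path({"root": "s3://b/p"}, "doc.pdf") == "s3://b/p/doc.pdf"
    assert join_stage_path({"bucket": "b", "root": "p/"}, "doc.pdf") == "s3://b/p/doc.pdf"
    assert join_stage_path({}, "doc.pdf") == "doc.pdf"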
@@ -189,13 +231,7 @@ def _get_hf_tokenizer(model_name: str) -> HuggingFaceTokenizer:
     io_threads=4,
 )
 def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any]:
-    """Parse a document and return Snowflake-compatible layout output.
-
-    Simplified semantics:
-    - Always processes the full document.
-    - Always returns Markdown layout in ``content``.
-    - Includes ``pages`` array with per-page content when possible.
-    """
+    """Parse a document and return Snowflake-compatible layout output."""
     try:
         t_total_ns = perf_counter_ns()
         runtime = get_runtime()
@@ -205,91 +241,36 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any]:
             runtime.capabilities.preferred_device,
             runtime.capabilities.device_kind,
         )
+
         backend = _get_doc_parser_backend()
-
         t_convert_start_ns = perf_counter_ns()
         result, file_size = backend.convert(stage_location, path)
         t_convert_end_ns = perf_counter_ns()
 
-        doc = result.document
-        markdown = doc.export_to_markdown()
-
-        # Docling chunking: tokenizer aligned with embedding model.
-        tokenizer = _get_hf_tokenizer(DEFAULT_EMBED_MODEL)
-        chunker = HybridChunker(tokenizer=tokenizer)
-
-        fallback = False
-        try:
-            chunks = list(chunker.chunk(dl_doc=doc))
-            pages: List[Dict[str, Any]] = [
-                {"index": idx, "content": chunker.contextualize(chunk)}
-                for idx, chunk in enumerate(chunks)
-            ]
-        except Exception:
-            pages = [{"index": 0, "content": markdown}]
-            fallback = True
-        if not pages:
-            pages = [{"index": 0, "content": markdown}]
-            fallback = True
-
-        chunk_count = len(pages)
-
+        pages, fallback = _chunk_document(result.document)
         t_chunk_end_ns = perf_counter_ns()
-        duration_ms = (t_chunk_end_ns - t_total_ns) / 1_000_000.0
-
-        # Output shape:
-        # { "chunks": [...], "metadata": {...}, "error_information": [...] }
-        resolved_path = resolve_stage_subpath(stage_location, path)
-        storage = stage_location.storage or {}
-        storage_root = str(storage.get("root", "") or "")
-        bucket = storage.get("bucket") or storage.get("name")
-
-        if storage_root.startswith("s3://"):
-            base = storage_root.rstrip("/")
-            full_path = f"{base}/{resolved_path}"
-        elif bucket:
-            base = f"s3://{bucket}"
-            if storage_root:
-                base = f"{base}/{storage_root.strip('/')}"
-            full_path = f"{base}/{resolved_path}"
-        else:
-            full_path = resolved_path or path
-
-        # Keep metadata first for predictable JSON ordering.
-        payload: Dict[str, Any] = {
-            "metadata": {
-                "chunk_count": chunk_count,
-                "chunk_size": DEFAULT_CHUNK_SIZE,
-                "duration_ms": duration_ms,
-                "file_size": file_size if file_size is not None else 0,
-                "filename": Path(path).name,
-                "path": full_path or path,
-                "timings_ms": {
-                    "convert": (t_convert_end_ns - t_convert_start_ns) / 1_000_000.0,
-                    "chunk": (t_chunk_end_ns - t_convert_end_ns) / 1_000_000.0,
-                    "total": duration_ms,
-                },
-                "version": 1,
-            },
-            "chunks": pages,
+
+        full_path = _resolve_full_path(stage_location, path)
+
+        timings = {
+            "convert": (t_convert_end_ns - t_convert_start_ns) / 1_000_000.0,
+            "chunk": (t_chunk_end_ns - t_convert_end_ns) / 1_000_000.0,
+            "total": (t_chunk_end_ns - t_total_ns) / 1_000_000.0,
         }
-        if fallback:
-            payload["error_information"] = [
-                {
-                    "type": "ChunkingFallback",
-                    "message": "chunker failed or returned empty; returned full markdown instead",
-                }
-            ]
+
+        payload = _format_response(path, full_path, pages, file_size, timings, fallback)
+
         logger.info(
             "ai_parse_document path=%s backend=%s chunks=%s fallback=%s duration_ms=%.1f",
             path,
             getattr(backend, "name", "unknown"),
-            chunk_count,
+            len(pages),
             fallback,
-            duration_ms,
+            timings["total"],
         )
         return payload
-    except Exception as exc:  # pragma: no cover - defensive for unexpected docling errors
+
+    except Exception as exc:  # pragma: no cover
         return {
             "metadata": {
                 "path": path,
@@ -298,3 +279,4 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any]:
             "chunks": [],
             "error_information": [{"message": str(exc), "type": exc.__class__.__name__}],
         }
+
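Note: after this refactor the success payload is assembled entirely by `_format_response`. An illustrative result, with all concrete values hypothetical:

    {
        "metadata": {
            "chunk_count": 3,
            "chunk_size": 512,      # DEFAULT_CHUNK_SIZE; value illustrative
            "duration_ms": 1240.7,  # equals timings_ms["total"]
            "file_size": 83211,
            "filename": "report.pdf",
            "path": "s3://bucket/stage/report.pdf",
            "timings_ms": {"convert": 1105.2, "chunk": 135.5, "total": 1240.7},
            "version": 1,
        },
        "chunks": [
            {"index": 0, "content": "..."},
        ],
    }

"error_information" appears only when chunking fell back to full markdown, and the except branch returns the same shape with empty "chunks".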
0 commit comments