11import base64
2+ import hashlib
23from contextlib import asynccontextmanager
4+ from enum import Enum
35from io import BytesIO
4- from pathlib import Path
5- from typing import Any , Dict , Union
6+ from typing import Any , Dict , List , Optional , Tuple , Union
67
78import httpx
89from docling .datamodel .base_models import (
910 ConversionStatus ,
1011 DocumentStream ,
11- PipelineOptions ,
12+ ErrorItem ,
13+ InputFormat ,
1214)
13- from docling .datamodel .document import ConversionResult , DocumentConversionInput
14- from docling .document_converter import DocumentConverter
15- from fastapi import FastAPI , HTTPException
16- from pydantic import BaseModel
15+ from docling .datamodel .document import ConversionResult
16+ from docling .datamodel .pipeline_options import (
17+ EasyOcrOptions ,
18+ OcrOptions ,
19+ PdfPipelineOptions ,
20+ RapidOcrOptions ,
21+ TesseractOcrOptions ,
22+ )
23+ from docling .document_converter import DocumentConverter , PdfFormatOption
24+ from docling .utils .profiling import ProfilingItem
25+ from docling_core .types .doc import DoclingDocument , ImageRefMode
26+ from docling_core .utils .file import resolve_remote_filename
27+ from fastapi import FastAPI , HTTPException , Response
28+ from pydantic import AnyHttpUrl , BaseModel
29+
30+
31+ # TODO: import enum from Docling, once it is exposed
class OcrEngine(str, Enum):
    # Identifiers of the OCR backends a request may select; the string values
    # are what clients send in `ocr_engine`.
    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
36+
37+
class ConvertOptions(BaseModel):
    # Per-request conversion settings. Every field has a default, so an empty
    # object selects the default pipeline.

    # Which representations of the converted document to include in the response.
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # OCR configuration.
    do_ocr: bool = True
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When None, the language setting of the chosen OCR options is left untouched.
    ocr_lang: Optional[List[str]] = None
    # Forwarded to the OCR options as `force_full_page_ocr`.
    force_ocr: bool = False

    do_table_structure: bool = True

    # Controls both page/picture image generation in the pipeline and whether
    # exports embed images or emit placeholders.
    include_images: bool = True
    images_scale: float = 2.0
49+
1750
18- from docling_serve .settings import Settings
class DocumentConvertBase(BaseModel):
    # Common base of the convert request bodies: carries the optional
    # conversion options, defaulting to an all-defaults ConvertOptions.
    options: ConvertOptions = ConvertOptions()
1953
2054
2155class HttpSource (BaseModel ):
@@ -28,37 +62,124 @@ class FileSource(BaseModel):
2862 filename : str
2963
3064
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
    # Request body for converting a document that the server fetches over HTTP.
    http_source: HttpSource
3367
3468
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
    # Request body for converting a document supplied inline by the client.
    file_source: FileSource
3771
3872
class DocumentResponse(BaseModel):
    # Container for the requested output representations; a field stays None
    # when the corresponding `output_*` option was not enabled.
    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None
77+
78+
class ConvertDocumentResponse(BaseModel):
    # JSON payload returned by the /convert endpoint.
    document: DocumentResponse
    status: ConversionStatus
    errors: List[ErrorItem] = []
    # Profiling entries keyed by name, as reported on the conversion result.
    timings: Dict[str, ProfilingItem] = {}
84+
85+
class ConvertDocumentErrorResponse(BaseModel):
    # Error payload shape; currently only carries the conversion status.
    status: ConversionStatus
    # errors: List[ErrorItem] = []
4189
4290
# A convert request is either an inline file upload or a remote HTTP source.
ConvertDocumentRequest = Union[ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest]
4694
4795
48- models = {}
class MarkdownTextResponse(Response):
    # Plain response whose Content-Type is Markdown rather than JSON.
    media_type = "text/markdown"
98+
99+
class HealthCheckResponse(BaseModel):
    # Body returned by the health endpoint; `status` defaults to "ok".
    status: str = "ok"
102+
103+
def _ensure_ocr_backend_installed(engine: OcrEngine) -> None:
    """Verify that the Python package backing *engine* can be imported.

    Raises:
        HTTPException: status 400 when the backing package is not installed.
        RuntimeError: when *engine* is not one of the known engines.
    """
    try:
        if engine == OcrEngine.EASYOCR:
            import easyocr  # noqa: F401
        elif engine == OcrEngine.TESSERACT:
            import tesserocr  # noqa: F401
        elif engine == OcrEngine.RAPIDOCR:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        else:
            raise RuntimeError(f"Unexpected OCR engine type {engine}")
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible as the
        # cause instead of looking like an error raised while handling another.
        raise HTTPException(
            status_code=400,
            detail="The requested OCR engine"
            f" (ocr_engine={engine.value})"
            " is not available on this system. Please choose another OCR engine "
            "or contact your system administrator.",
        ) from err


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a cache key.

    Args:
        options: The conversion options supplied with (or defaulted for) a request.

    Returns:
        A ``(pipeline_options, options_hash)`` tuple, where ``options_hash`` is
        the SHA-1 hex digest of the pipeline options' JSON dump. Callers use it
        as the key under which a matching converter is stored.

    Raises:
        HTTPException: status 400 when the selected OCR engine's package is
            not importable on this system.
        RuntimeError: when the OCR engine value is not a known engine.
    """
    engine = options.ocr_engine
    _ensure_ocr_backend_installed(engine)

    # The helper above has already rejected unknown engines, so the lookup
    # cannot miss.
    ocr_options_cls = {
        OcrEngine.EASYOCR: EasyOcrOptions,
        OcrEngine.TESSERACT: TesseractOcrOptions,
        OcrEngine.RAPIDOCR: RapidOcrOptions,
    }[engine]
    ocr_options: OcrOptions = ocr_options_cls(force_full_page_ocr=options.force_ocr)

    # Only override the language list when the caller supplied one.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    # Identical option sets serialize identically, so the digest is a stable key.
    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash
160+
161+
# Converter instances keyed by the hash of the pipeline options they were built with.
converters: Dict[str, DocumentConverter] = {}
49163
50164
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: build the default converter, then drop all converters.

    On startup a converter for the default ``ConvertOptions`` is created,
    registered in ``converters`` under its options hash, and its PDF pipeline
    is initialized. On shutdown every stored converter is discarded, even if
    an exception is propagated into the context at the ``yield``.
    """
    # Converter with default options
    pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
    default_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )
    converters[options_hash] = default_converter

    # NOTE(review): presumably this loads the pipeline eagerly so the first
    # request does not pay the initialization cost -- confirm against Docling.
    default_converter.initialize_pipeline(InputFormat.PDF)

    try:
        yield
    finally:
        # Release the converters regardless of how the application exits.
        converters.clear()
62183
63184
64185app = FastAPI (
@@ -67,10 +188,14 @@ async def lifespan(app: FastAPI):
67188)
68189
69190
70- @app .post ("/convert" )
71- def convert_pdf_document (
@app.get("/health")
def health() -> HealthCheckResponse:
    # Health probe: always answers with the default ("ok") payload.
    payload = HealthCheckResponse()
    return payload
194+
195+
196+ def _convert_document (
72197 body : ConvertDocumentRequest ,
73- ) -> ConvertDocumentResponse :
198+ ) -> ConversionResult :
74199
75200 filename : str
76201 buf : BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
81206 elif isinstance (body , ConvertDocumentHttpSourceRequest ):
82207 http_res = httpx .get (body .http_source .url , headers = body .http_source .headers )
83208 buf = BytesIO (http_res .content )
84- filename = Path (
85- body .http_source .url
86- ).name # TODO: use better way to detect filename, e.g. from Content-Disposition
209+ filename = resolve_remote_filename (
210+ http_url = AnyHttpUrl (body .http_source .url ),
211+ response_headers = dict (** http_res .headers ),
212+ )
213+
214+ doc_input = DocumentStream (name = filename , stream = buf )
215+
216+ pipeline_options , options_hash = get_pdf_pipeline_opts (body .options )
217+ if options_hash not in converters :
218+ converters [options_hash ] = DocumentConverter (
219+ format_options = {
220+ InputFormat .PDF : PdfFormatOption (pipeline_options = pipeline_options ),
221+ InputFormat .IMAGE : PdfFormatOption (pipeline_options = pipeline_options ),
222+ }
223+ )
224+
225+ result : ConversionResult = converters [options_hash ].convert (doc_input )
226+
227+ if result is None or result .status == ConversionStatus .SKIPPED :
228+ raise HTTPException (status_code = 400 , detail = result .errors )
229+
230+ if result is None or result .status not in {
231+ ConversionStatus .SUCCESS ,
232+ }:
233+ raise HTTPException (
234+ status_code = 500 , detail = {"errors" : result .errors , "status" : result .status }
235+ )
236+
237+ return result
87238
88- docs_input = DocumentConversionInput .from_streams (
89- [DocumentStream (filename = filename , stream = buf )]
239+
@app.post(
    "/convert",
)
def convert_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    # Convert the submitted document and return the representations selected
    # via the `output_*` options, together with status, errors and timings.
    # Failures surface as HTTPException from _convert_document.
    result = _convert_document(body=body)

    # Embed image data in the exports only when images were requested.
    image_mode = (
        ImageRefMode.EMBEDDED
        if body.options.include_images
        else ImageRefMode.PLACEHOLDER
    )

    doc_resp = DocumentResponse()
    if body.options.output_docling_document:
        doc_resp.docling_document = result.document
    if body.options.output_markdown:
        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
    if body.options.output_html:
        doc_resp.html = result.document.export_to_html(image_mode=image_mode)

    # Forward the conversion errors too: the response model declares an
    # `errors` field, which would otherwise always be the empty default.
    return ConvertDocumentResponse(
        document=doc_resp,
        status=result.status,
        errors=result.errors,
        timings=result.timings,
    )
91- result : ConversionResult = next (models ["converter" ].convert (docs_input ), None )
92265
93- if result is None or result .status != ConversionStatus .SUCCESS :
94- raise HTTPException (status_code = 500 , detail = {"errors" : result .errors })
95266
96- return ConvertDocumentResponse (content_md = result .render_as_markdown ())
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    # Convert the submitted document and return only its Markdown export as a
    # text/markdown response.
    result = _convert_document(body=body)

    # Embed image data only when the request asked for images.
    if body.options.include_images:
        image_mode = ImageRefMode.EMBEDDED
    else:
        image_mode = ImageRefMode.PLACEHOLDER

    markdown_text = result.document.export_to_markdown(image_mode=image_mode)
    return MarkdownTextResponse(markdown_text)
0 commit comments