Skip to content

Commit b00718b

Browse files
authored
feat: upgrade endpoint to docling v2 (#13)
* upgrade endpoint to docling v2 Signed-off-by: Michele Dolfi <[email protected]> * fix Containerfile Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
1 parent 3824aa6 commit b00718b

File tree

5 files changed

+2657
-2015
lines changed

5 files changed

+2657
-2015
lines changed

Containerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ RUN if [ "$CPU_ONLY" = "true" ]; then \
2020
ENV HF_HOME=/tmp/
2121
ENV TORCH_HOME=/tmp/
2222

23-
RUN poetry run python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
23+
RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
2424

2525
# On container environments, always set a thread budget to avoid undesired thread congestion.
2626
ENV OMP_NUM_THREADS=4
@@ -29,4 +29,4 @@ COPY ./docling_serve /docling-serve/docling_serve
2929

3030
EXPOSE 5000
3131

32-
CMD ["poetry", "run", "uvicorn", "--port", "5000", "docling_serve.app:app"]
32+
CMD ["poetry", "run", "uvicorn", "--port", "5000", "--host", "0.0.0.0", "docling_serve.app:app"]

docling_serve/app.py

Lines changed: 214 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,55 @@
11
import base64
2+
import hashlib
23
from contextlib import asynccontextmanager
4+
from enum import Enum
35
from io import BytesIO
4-
from pathlib import Path
5-
from typing import Any, Dict, Union
6+
from typing import Any, Dict, List, Optional, Tuple, Union
67

78
import httpx
89
from docling.datamodel.base_models import (
910
ConversionStatus,
1011
DocumentStream,
11-
PipelineOptions,
12+
ErrorItem,
13+
InputFormat,
1214
)
13-
from docling.datamodel.document import ConversionResult, DocumentConversionInput
14-
from docling.document_converter import DocumentConverter
15-
from fastapi import FastAPI, HTTPException
16-
from pydantic import BaseModel
15+
from docling.datamodel.document import ConversionResult
16+
from docling.datamodel.pipeline_options import (
17+
EasyOcrOptions,
18+
OcrOptions,
19+
PdfPipelineOptions,
20+
RapidOcrOptions,
21+
TesseractOcrOptions,
22+
)
23+
from docling.document_converter import DocumentConverter, PdfFormatOption
24+
from docling.utils.profiling import ProfilingItem
25+
from docling_core.types.doc import DoclingDocument, ImageRefMode
26+
from docling_core.utils.file import resolve_remote_filename
27+
from fastapi import FastAPI, HTTPException, Response
28+
from pydantic import AnyHttpUrl, BaseModel
29+
30+
31+
# TODO: import enum from Docling, once it is exposed
32+
class OcrEngine(str, Enum):
    """OCR backends that a conversion request may select.

    The class mixes in ``str`` so every member compares equal to its plain
    string value (e.g. ``OcrEngine.EASYOCR == "easyocr"``).
    """

    EASYOCR = "easyocr"
    TESSERACT = "tesseract"
    RAPIDOCR = "rapidocr"
36+
37+
38+
class ConvertOptions(BaseModel):
    """Per-request conversion settings accepted by the convert endpoints."""

    # Which representations of the converted document to include in the
    # JSON response of ``/convert``.
    output_docling_document: bool = True
    output_markdown: bool = False
    output_html: bool = False

    # OCR configuration, forwarded to the PDF pipeline options.
    do_ocr: bool = True
    ocr_engine: OcrEngine = OcrEngine.EASYOCR
    # When None, the language default of the selected engine's options is kept.
    ocr_lang: Optional[List[str]] = None
    # Forwarded as ``force_full_page_ocr`` to the engine options.
    force_ocr: bool = False

    do_table_structure: bool = True

    # Controls page/picture image generation in the pipeline and whether
    # exports embed images or emit placeholders.
    include_images: bool = True
    images_scale: float = 2.0
49+
1750

18-
from docling_serve.settings import Settings
51+
class DocumentConvertBase(BaseModel):
    """Common base of all convert request bodies; carries the options."""

    # Requests that omit ``options`` fall back to the ConvertOptions defaults.
    options: ConvertOptions = ConvertOptions()
1953

2054

2155
class HttpSource(BaseModel):
@@ -28,37 +62,124 @@ class FileSource(BaseModel):
2862
filename: str
2963

3064

31-
class ConvertDocumentHttpSourceRequest(BaseModel):
65+
class ConvertDocumentHttpSourceRequest(DocumentConvertBase):
3266
http_source: HttpSource
3367

3468

35-
class ConvertDocumentFileSourceRequest(BaseModel):
69+
class ConvertDocumentFileSourceRequest(DocumentConvertBase):
3670
file_source: FileSource
3771

3872

73+
class DocumentResponse(BaseModel):
    """Converted document in the output formats the request asked for.

    Every field defaults to None; ``convert_document`` fills in only the
    formats enabled through the request's ``output_*`` options.
    """

    markdown: Optional[str] = None
    docling_document: Optional[DoclingDocument] = None
    html: Optional[str] = None
77+
78+
3979
class ConvertDocumentResponse(BaseModel):
40-
content_md: str
80+
document: DocumentResponse
81+
status: ConversionStatus
82+
errors: List[ErrorItem] = []
83+
timings: Dict[str, ProfilingItem] = {}
84+
85+
86+
class ConvertDocumentErrorResponse(BaseModel):
    """Body describing a failed conversion; currently only the status."""

    status: ConversionStatus
    # errors: List[ErrorItem] = []
4189

4290

4391
ConvertDocumentRequest = Union[
4492
ConvertDocumentFileSourceRequest, ConvertDocumentHttpSourceRequest
4593
]
4694

4795

48-
models = {}
96+
class MarkdownTextResponse(Response):
    """Plain response whose content type is declared as ``text/markdown``."""

    media_type = "text/markdown"
98+
99+
100+
class HealthCheckResponse(BaseModel):
    """Body returned by the ``/health`` endpoint."""

    status: str = "ok"
102+
103+
104+
def _ocr_engine_unavailable(engine: OcrEngine) -> HTTPException:
    """Build the HTTP 400 error for an OCR engine whose package is missing."""
    return HTTPException(
        status_code=400,
        detail="The requested OCR engine"
        f" (ocr_engine={engine.value})"
        " is not available on this system. Please choose another OCR engine "
        "or contact your system administrator.",
    )


def get_pdf_pipeline_opts(options: ConvertOptions) -> Tuple[PdfPipelineOptions, str]:
    """Translate request options into PDF pipeline options plus a cache key.

    Args:
        options: The conversion options taken from the request body.

    Returns:
        A tuple ``(pipeline_options, options_hash)`` where ``options_hash`` is
        the SHA-1 hex digest of the JSON dump of ``pipeline_options``. Callers
        use it as the key into the module-level ``converters`` dict so that
        requests with identical options share one converter.

    Raises:
        HTTPException: status 400 when the Python package backing the
            requested OCR engine cannot be imported.
        RuntimeError: when ``options.ocr_engine`` is not a known engine.
    """
    engine = options.ocr_engine
    ocr_options: OcrOptions

    # Each branch first probes that the backend package is importable, so an
    # unavailable engine is reported to the client as a 400 before any
    # pipeline is built.
    if engine == OcrEngine.EASYOCR:
        try:
            import easyocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = EasyOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.TESSERACT:
        try:
            import tesserocr  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = TesseractOcrOptions(force_full_page_ocr=options.force_ocr)
    elif engine == OcrEngine.RAPIDOCR:
        try:
            from rapidocr_onnxruntime import RapidOCR  # noqa: F401
        except ImportError as err:
            raise _ocr_engine_unavailable(engine) from err
        ocr_options = RapidOcrOptions(force_full_page_ocr=options.force_ocr)
    else:
        raise RuntimeError(f"Unexpected OCR engine type {options.ocr_engine}")

    # Only override the engine's own language default when one was requested.
    if options.ocr_lang is not None:
        ocr_options.lang = options.ocr_lang

    pipeline_options = PdfPipelineOptions(
        do_ocr=options.do_ocr,
        ocr_options=ocr_options,
        do_table_structure=options.do_table_structure,
        generate_page_images=options.include_images,
        generate_picture_images=options.include_images,
        images_scale=options.images_scale,
    )

    # The digest is a cache key only; it is not used for anything
    # security-sensitive.
    options_hash = hashlib.sha1(pipeline_options.model_dump_json().encode()).hexdigest()

    return pipeline_options, options_hash
160+
161+
162+
converters: Dict[str, DocumentConverter] = {}
49163

50164

51165
@asynccontextmanager
52166
async def lifespan(app: FastAPI):
53-
# Converter
54-
settings = Settings()
55-
pipeline_options = PipelineOptions()
56-
pipeline_options.do_ocr = settings.do_ocr
57-
pipeline_options.do_table_structure = settings.do_table_structure
58-
models["converter"] = DocumentConverter(pipeline_options=pipeline_options)
167+
# settings = Settings()
168+
169+
# Converter with default options
170+
pipeline_options, options_hash = get_pdf_pipeline_opts(ConvertOptions())
171+
converters[options_hash] = DocumentConverter(
172+
format_options={
173+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
174+
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
175+
}
176+
)
177+
178+
converters[options_hash].initialize_pipeline(InputFormat.PDF)
179+
59180
yield
60181

61-
models.clear()
182+
converters.clear()
62183

63184

64185
app = FastAPI(
@@ -67,10 +188,14 @@ async def lifespan(app: FastAPI):
67188
)
68189

69190

70-
@app.post("/convert")
71-
def convert_pdf_document(
191+
@app.get("/health")
def health() -> HealthCheckResponse:
    """Return the default health-check body (``status`` of ``"ok"``)."""
    return HealthCheckResponse()
194+
195+
196+
def _convert_document(
72197
body: ConvertDocumentRequest,
73-
) -> ConvertDocumentResponse:
198+
) -> ConversionResult:
74199

75200
filename: str
76201
buf: BytesIO
@@ -81,16 +206,74 @@ def convert_pdf_document(
81206
elif isinstance(body, ConvertDocumentHttpSourceRequest):
82207
http_res = httpx.get(body.http_source.url, headers=body.http_source.headers)
83208
buf = BytesIO(http_res.content)
84-
filename = Path(
85-
body.http_source.url
86-
).name # TODO: use better way to detect filename, e.g. from Content-Disposition
209+
filename = resolve_remote_filename(
210+
http_url=AnyHttpUrl(body.http_source.url),
211+
response_headers=dict(**http_res.headers),
212+
)
213+
214+
doc_input = DocumentStream(name=filename, stream=buf)
215+
216+
pipeline_options, options_hash = get_pdf_pipeline_opts(body.options)
217+
if options_hash not in converters:
218+
converters[options_hash] = DocumentConverter(
219+
format_options={
220+
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
221+
InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options),
222+
}
223+
)
224+
225+
result: ConversionResult = converters[options_hash].convert(doc_input)
226+
227+
if result is None or result.status == ConversionStatus.SKIPPED:
228+
raise HTTPException(status_code=400, detail=result.errors)
229+
230+
if result is None or result.status not in {
231+
ConversionStatus.SUCCESS,
232+
}:
233+
raise HTTPException(
234+
status_code=500, detail={"errors": result.errors, "status": result.status}
235+
)
236+
237+
return result
87238

88-
docs_input = DocumentConversionInput.from_streams(
89-
[DocumentStream(filename=filename, stream=buf)]
239+
240+
@app.post(
    "/convert",
)
def convert_document(
    body: ConvertDocumentRequest,
) -> ConvertDocumentResponse:
    """Convert the submitted document and return the requested formats.

    Args:
        body: Either a file-source or an HTTP-source convert request,
            including its ``options``.

    Returns:
        A ConvertDocumentResponse holding the document in each format enabled
        by the ``output_*`` options, together with the conversion status,
        errors and timings reported on the conversion result.
    """
    result = _convert_document(body=body)

    # Embed image data only when images were requested; otherwise exports
    # carry placeholders.
    image_mode = (
        ImageRefMode.EMBEDDED
        if body.options.include_images
        else ImageRefMode.PLACEHOLDER
    )

    doc_resp = DocumentResponse()
    if body.options.output_docling_document:
        doc_resp.docling_document = result.document
    if body.options.output_markdown:
        doc_resp.markdown = result.document.export_to_markdown(image_mode=image_mode)
    if body.options.output_html:
        doc_resp.html = result.document.export_to_html(image_mode=image_mode)

    # Forward the errors recorded on the result so the declared ``errors``
    # field of the response is actually populated.
    return ConvertDocumentResponse(
        document=doc_resp,
        status=result.status,
        errors=result.errors,
        timings=result.timings,
    )
91-
result: ConversionResult = next(models["converter"].convert(docs_input), None)
92265

93-
if result is None or result.status != ConversionStatus.SUCCESS:
94-
raise HTTPException(status_code=500, detail={"errors": result.errors})
95266

96-
return ConvertDocumentResponse(content_md=result.render_as_markdown())
267+
@app.post("/convert/markdown", response_class=MarkdownTextResponse)
def convert_document_md(
    body: ConvertDocumentRequest,
) -> MarkdownTextResponse:
    """Convert the submitted document and return it as raw Markdown.

    Args:
        body: Either a file-source or an HTTP-source convert request,
            including its ``options``. The ``output_*`` options are not
            consulted here; Markdown is always produced.

    Returns:
        A MarkdownTextResponse whose content is the Markdown export of the
        converted document.
    """
    result = _convert_document(body=body)

    # Embed image data only when images were requested; otherwise the export
    # carries placeholders.
    image_mode = (
        ImageRefMode.EMBEDDED
        if body.options.include_images
        else ImageRefMode.PLACEHOLDER
    )
    return MarkdownTextResponse(
        result.document.export_to_markdown(image_mode=image_mode)
    )

docling_serve/settings.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,5 @@
22

33

44
class Settings(BaseSettings):
5-
do_ocr: bool = True
6-
do_table_structure: bool = True
75

86
model_config = SettingsConfigDict(env_prefix="DOCLING_")

0 commit comments

Comments
 (0)