Skip to content

Commit 607b350

Browse files
authored
Docling exposing backend arg and linting/publishing in CI (#1189)
1 parent 64b8ac4 commit 607b350

File tree

3 files changed

+33
-2
lines changed

3 files changed

+33
-2
lines changed

.github/workflows/build.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,18 @@ jobs:
3434
path: dist
3535
- name: Clean up paper-qa-pypdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
3636
run: rm -r ${{ steps.build-paper-qa-pypdf.outputs.dist }}
37+
- id: build-paper-qa-docling
38+
uses: hynek/build-and-inspect-python-package@v2
39+
with:
40+
path: packages/paper-qa-docling
41+
upload-name-suffix: -paper-qa-docling
42+
- name: Download built paper-qa-docling artifact to dist/
43+
uses: actions/download-artifact@v6
44+
with:
45+
name: ${{ steps.build-paper-qa-docling.outputs.artifact-name }}
46+
path: dist
47+
- name: Clean up paper-qa-docling build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
48+
run: rm -r ${{ steps.build-paper-qa-docling.outputs.dist }}
3749
- id: build-paper-qa
3850
uses: hynek/build-and-inspect-python-package@v2
3951
with:

.github/workflows/tests.yml

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,16 @@ jobs:
5858
- name: Clean up paper-qa-pymupdf build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
5959
if: matrix.python-version == '3.11'
6060
run: rm -r ${{ steps.build-paper-qa-pypdf.outputs.dist }}
61+
- name: Check paper-qa-docling build
62+
id: build-paper-qa-docling
63+
if: matrix.python-version == '3.11'
64+
uses: hynek/build-and-inspect-python-package@v2
65+
with:
66+
path: packages/paper-qa-docling
67+
upload-name-suffix: -paper-qa-docling
68+
- name: Clean up paper-qa-docling build # Work around https://github.com/hynek/build-and-inspect-python-package/issues/174
69+
if: matrix.python-version == '3.11'
70+
run: rm -r ${{ steps.build-paper-qa-docling.outputs.dist }}
6171
- name: Check paper-qa build
6272
id: build-paper-qa
6373
if: matrix.python-version == '3.11'

packages/paper-qa-docling/src/paperqa_docling/reader.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,9 +5,10 @@
55
from collections.abc import Mapping
66
from importlib.metadata import version
77
from pathlib import Path
8-
from typing import Any, cast
8+
from typing import TYPE_CHECKING, Any, cast
99

1010
import docling
11+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
1112
from docling.datamodel.base_models import ConversionStatus
1213
from docling.datamodel.pipeline_options import PdfPipelineOptions
1314
from docling.datamodel.settings import DEFAULT_PAGE_RANGE
@@ -24,6 +25,9 @@
2425
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
2526
from paperqa.utils import ImpossibleParsingError
2627

28+
if TYPE_CHECKING:
29+
from docling.backend.abstract_backend import AbstractDocumentBackend
30+
2731
DOCLING_VERSION = version(docling.__name__)
2832
DOCLING_IMAGES_SCALE_PER_DPI = (
2933
72 # SEE: https://github.com/docling-project/docling/issues/2405
@@ -38,6 +42,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
3842
pipeline_cls: type = StandardPdfPipeline,
3943
dpi: int | None = None,
4044
custom_pipeline_options: Mapping[str, Any] | None = None,
45+
backend: "type[AbstractDocumentBackend]" = DoclingParseV4DocumentBackend,
4146
**_,
4247
) -> ParsedText:
4348
"""Parse a PDF.
@@ -56,6 +61,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
5661
page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
5762
to parse only specific pages, where pages are one-indexed.
5863
Leaving as the default of None will parse all pages.
64+
backend: PDF backend class to use for parsing, defaults to docling-parse v4.
5965
**_: Thrown away kwargs.
6066
"""
6167
path = Path(path)
@@ -73,7 +79,9 @@ def parse_pdf_to_pages( # noqa: PLR0912
7379
converter = DocumentConverter(
7480
format_options={
7581
InputFormat.PDF: PdfFormatOption(
76-
pipeline_options=pipeline_options, pipeline_cls=pipeline_cls
82+
pipeline_options=pipeline_options,
83+
pipeline_cls=pipeline_cls,
84+
backend=backend,
7785
)
7886
}
7987
)
@@ -253,6 +261,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
253261
name=(
254262
f"pdf|pipeline={pipeline_cls.__name__}"
255263
f"|page_range={str(page_range).replace(' ', '')}" # Remove space in tuple
264+
f"|backend={backend.__name__}"
256265
f"{multimodal_string if parse_media else ''}"
257266
),
258267
)

0 commit comments

Comments
 (0)