55from collections .abc import Mapping
66from importlib .metadata import version
77from pathlib import Path
8- from typing import Any , cast
8+ from typing import TYPE_CHECKING , Any , cast
99
1010import docling
11+ from docling .backend .docling_parse_v4_backend import DoclingParseV4DocumentBackend
1112from docling .datamodel .base_models import ConversionStatus
1213from docling .datamodel .pipeline_options import PdfPipelineOptions
1314from docling .datamodel .settings import DEFAULT_PAGE_RANGE
2425from paperqa .types import ParsedMedia , ParsedMetadata , ParsedText
2526from paperqa .utils import ImpossibleParsingError
2627
28+ if TYPE_CHECKING :
29+ from docling .backend .abstract_backend import AbstractDocumentBackend
30+
2731DOCLING_VERSION = version (docling .__name__ )
2832DOCLING_IMAGES_SCALE_PER_DPI = (
2933 72 # SEE: https://github.com/docling-project/docling/issues/2405
@@ -38,6 +42,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
3842 pipeline_cls : type = StandardPdfPipeline ,
3943 dpi : int | None = None ,
4044 custom_pipeline_options : Mapping [str , Any ] | None = None ,
45+ backend : "type[AbstractDocumentBackend]" = DoclingParseV4DocumentBackend ,
4146 ** _ ,
4247) -> ParsedText :
4348 """Parse a PDF.
@@ -56,6 +61,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
5661 page_range: Optional start_page or two-tuple of inclusive (start_page, end_page)
5762 to parse only specific pages, where pages are one-indexed.
5863 Leaving as the default of None will parse all pages.
64+ backend: PDF backend class to use for parsing. Defaults to DoclingParseV4DocumentBackend (docling-parse v4).
5965 **_: Thrown away kwargs.
6066 """
6167 path = Path (path )
@@ -73,7 +79,9 @@ def parse_pdf_to_pages( # noqa: PLR0912
7379 converter = DocumentConverter (
7480 format_options = {
7581 InputFormat .PDF : PdfFormatOption (
76- pipeline_options = pipeline_options , pipeline_cls = pipeline_cls
82+ pipeline_options = pipeline_options ,
83+ pipeline_cls = pipeline_cls ,
84+ backend = backend ,
7785 )
7886 }
7987 )
@@ -253,6 +261,7 @@ def parse_pdf_to_pages( # noqa: PLR0912
253261 name = (
254262 f"pdf|pipeline={ pipeline_cls .__name__ } "
255263 f"|page_range={ str (page_range ).replace (' ' , '' )} " # Remove space in tuple
264+ f"|backend={ backend .__name__ } "
256265 f"{ multimodal_string if parse_media else '' } "
257266 ),
258267 )
0 commit comments