From 2dcc582d02b66e95dbaa624ecaa556978ca1543e Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 31 Jan 2025 13:58:07 +0100 Subject: [PATCH] feat: Add option to define page range Signed-off-by: Christoph Auer --- docling/datamodel/document.py | 2 ++ docling/datamodel/settings.py | 17 ++++++++++++++- docling/document_converter.py | 14 +++++++++++-- docling/pipeline/base_pipeline.py | 4 +++- tests/test_input_doc.py | 35 +++++++++++++++++++++++++++++++ tests/test_options.py | 14 +++++++++++++ 6 files changed, 82 insertions(+), 4 deletions(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index e37541b78..d887fed94 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -157,6 +157,8 @@ def __init__( self.page_count = self._backend.page_count() if not self.page_count <= self.limits.max_num_pages: self.valid = False + elif self.page_count < self.limits.page_range[0]: + self.valid = False except (FileNotFoundError, OSError) as e: self.valid = False diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 46bab75cd..928562038 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -1,13 +1,28 @@ import sys from pathlib import Path +from typing import Annotated, Tuple -from pydantic import BaseModel +from pydantic import BaseModel, PlainValidator from pydantic_settings import BaseSettings, SettingsConfigDict +def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]: + if v[0] < 1 or v[1] < v[0]: + raise ValueError( + "Invalid page range: start must be ≥ 1 and end must be ≥ start." + ) + return v + + +PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)] + +DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize) + + class DocumentLimits(BaseModel): max_num_pages: int = sys.maxsize max_file_size: int = sys.maxsize + page_range: PageRange = DEFAULT_PAGE_RANGE class BatchConcurrencySettings(BaseModel): diff --git a/docling/document_converter.py b/docling/document_converter.py index 13203ea7b..d885dd20d 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,9 +1,10 @@ import logging +import math import sys import time from functools import partial from pathlib import Path -from typing import Dict, Iterable, Iterator, List, Optional, Type, Union +from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union from pydantic import BaseModel, ConfigDict, model_validator, validate_call @@ -31,7 +32,12 @@ _DocumentConversionInput, ) from docling.datamodel.pipeline_options import PipelineOptions -from docling.datamodel.settings import DocumentLimits, settings +from docling.datamodel.settings import ( + DEFAULT_PAGE_RANGE, + DocumentLimits, + PageRange, + settings, +) from docling.exceptions import ConversionError from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.simple_pipeline import SimplePipeline @@ -184,6 +190,7 @@ def convert( raises_on_error: bool = True, max_num_pages: int = sys.maxsize, max_file_size: int = sys.maxsize, + page_range: PageRange = DEFAULT_PAGE_RANGE, ) -> ConversionResult: all_res = self.convert_all( source=[source], @@ -191,6 +198,7 @@ def convert( max_num_pages=max_num_pages, max_file_size=max_file_size, headers=headers, + page_range=page_range, ) return next(all_res) @@ -202,10 +210,12 @@ def convert_all( raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error max_num_pages: int = sys.maxsize, max_file_size: int = sys.maxsize, + page_range: PageRange = DEFAULT_PAGE_RANGE, ) -> Iterator[ConversionResult]: limits = DocumentLimits( max_num_pages=max_num_pages, max_file_size=max_file_size, + page_range=page_range, ) conv_input = _DocumentConversionInput( path_or_stream_iterator=source, limits=limits, headers=headers diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 75a08e769..89aedf8e0 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -141,7 +141,9 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): for i in range(0, conv_res.input.page_count): - conv_res.pages.append(Page(page_no=i)) + start_page, end_page = conv_res.input.limits.page_range + if (start_page - 1) <= i <= (end_page - 1): + conv_res.pages.append(Page(page_no=i)) try: # Iterate batches of pages (page_batch_size) in the doc diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index f6c516aaf..efecb81e9 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -4,6 +4,7 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import InputDocument, _DocumentConversionInput +from docling.datamodel.settings import DocumentLimits def test_in_doc_from_valid_path(): @@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf(): assert doc.valid == False +def test_in_doc_with_page_range(): + test_doc_path = Path("./tests/data/2206.01062.pdf") + limits = DocumentLimits() + limits.page_range = (1, 10) + + doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.PDF, + backend=PyPdfiumDocumentBackend, + limits=limits, + ) + assert doc.valid == True + + limits.page_range = (9, 9) + + doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.PDF, + backend=PyPdfiumDocumentBackend, + limits=limits, + ) + assert doc.valid == True + + limits.page_range = (11, 12) + + doc = InputDocument( + path_or_stream=test_doc_path, + format=InputFormat.PDF, + backend=PyPdfiumDocumentBackend, + limits=limits, + ) + assert doc.valid == False + + def test_guess_format(tmp_path): """Test docling.datamodel.document._DocumentConversionInput.__guess_format""" dci = _DocumentConversionInput(path_or_stream_iterator=[]) diff --git a/tests/test_options.py b/tests/test_options.py index 8d861e48b..1dd3bbc89 100644 --- a/tests/test_options.py +++ b/tests/test_options.py @@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path): assert doc_result.status == ConversionStatus.SUCCESS +def test_page_range(test_doc_path): + converter = DocumentConverter() + doc_result: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9)) + + assert doc_result.status == ConversionStatus.SUCCESS + assert doc_result.input.page_count == 9 + assert doc_result.document.num_pages() == 1 + + doc_result: ConversionResult = converter.convert( + test_doc_path, page_range=(10, 10), raises_on_error=False + ) + assert doc_result.status == ConversionStatus.FAILURE + + def test_ocr_coverage_threshold(test_doc_path): pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True