Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add option to define page range #852

Merged
merged 1 commit into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ def __init__(
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
elif self.page_count < self.limits.page_range[0]:
self.valid = False

except (FileNotFoundError, OSError) as e:
self.valid = False
Expand Down
17 changes: 16 additions & 1 deletion docling/datamodel/settings.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
import sys
from pathlib import Path
from typing import Annotated, Tuple

from pydantic import BaseModel
from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict


def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
if v[0] < 1 or v[1] < v[0]:
raise ValueError(
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
)
return v


# A (start, end) pair of page numbers. Validation is performed by
# _validate_page_range, which requires start >= 1 and end >= start.
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]

# Default range: starts at page 1 and uses sys.maxsize as the end bound,
# i.e. no practical upper limit on the last page.
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
cau-git marked this conversation as resolved.
Show resolved Hide resolved


class DocumentLimits(BaseModel):
    # Limits attached to an input document. All defaults are effectively
    # "unlimited" (sys.maxsize, or the full default page range).

    # Maximum number of pages a document may have.
    max_num_pages: int = sys.maxsize
    # Maximum accepted file size.
    # NOTE(review): unit is presumably bytes — confirm against the caller.
    max_file_size: int = sys.maxsize
    # (start, end) page range; validated so that start >= 1 and end >= start.
    page_range: PageRange = DEFAULT_PAGE_RANGE


class BatchConcurrencySettings(BaseModel):
Expand Down
14 changes: 12 additions & 2 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import logging
import math
import sys
import time
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union

from pydantic import BaseModel, ConfigDict, model_validator, validate_call

Expand Down Expand Up @@ -31,7 +32,12 @@
_DocumentConversionInput,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings
from docling.datamodel.settings import (
DEFAULT_PAGE_RANGE,
DocumentLimits,
PageRange,
settings,
)
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
Expand Down Expand Up @@ -184,13 +190,15 @@ def convert(
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
cau-git marked this conversation as resolved.
Show resolved Hide resolved
) -> ConversionResult:
all_res = self.convert_all(
source=[source],
raises_on_error=raises_on_error,
max_num_pages=max_num_pages,
max_file_size=max_file_size,
headers=headers,
page_range=page_range,
)
return next(all_res)

Expand All @@ -202,10 +210,12 @@ def convert_all(
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
page_range: PageRange = DEFAULT_PAGE_RANGE,
) -> Iterator[ConversionResult]:
limits = DocumentLimits(
max_num_pages=max_num_pages,
max_file_size=max_file_size,
page_range=page_range,
)
conv_input = _DocumentConversionInput(
path_or_stream_iterator=source, limits=limits, headers=headers
Expand Down
4 changes: 3 additions & 1 deletion docling/pipeline/base_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):

for i in range(0, conv_res.input.page_count):
conv_res.pages.append(Page(page_no=i))
start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))

try:
# Iterate batches of pages (page_batch_size) in the doc
Expand Down
35 changes: 35 additions & 0 deletions tests/test_input_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
from docling.datamodel.settings import DocumentLimits


def test_in_doc_from_valid_path():
Expand Down Expand Up @@ -39,6 +40,40 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False


def test_in_doc_with_page_range():
    """Check ``InputDocument.valid`` for several page ranges on one PDF.

    A single ``DocumentLimits`` instance is reused and its ``page_range`` is
    reassigned before each ``InputDocument`` is constructed.
    """
    pdf_path = Path("./tests/data/2206.01062.pdf")
    doc_limits = DocumentLimits()

    # (page_range, expected validity). The last range is expected to be
    # rejected — presumably because it starts beyond the last page of the
    # sample PDF; confirm against the test data.
    cases = (
        ((1, 10), True),
        ((9, 9), True),
        ((11, 12), False),
    )

    for page_range, expected_valid in cases:
        doc_limits.page_range = page_range
        doc = InputDocument(
            path_or_stream=pdf_path,
            format=InputFormat.PDF,
            backend=PyPdfiumDocumentBackend,
            limits=doc_limits,
        )
        assert doc.valid == expected_valid


def test_guess_format(tmp_path):
"""Test docling.datamodel.document._DocumentConversionInput.__guess_format"""
dci = _DocumentConversionInput(path_or_stream_iterator=[])
Expand Down
14 changes: 14 additions & 0 deletions tests/test_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,20 @@ def test_e2e_conversions(test_doc_path):
assert doc_result.status == ConversionStatus.SUCCESS


def test_page_range(test_doc_path):
    """Convert with a page range inside, then outside, the document.

    The first conversion selects only page 9 and must succeed with exactly
    one page in the result; the second asks for page 10 with
    ``raises_on_error=False`` and must report a failure status.
    """
    converter = DocumentConverter()

    # Range that selects a single page.
    in_range: ConversionResult = converter.convert(test_doc_path, page_range=(9, 9))
    assert in_range.status == ConversionStatus.SUCCESS
    assert in_range.input.page_count == 9
    assert in_range.document.num_pages() == 1

    # Range that starts past the page count asserted above; errors are
    # reported through the status instead of being raised.
    out_of_range: ConversionResult = converter.convert(
        test_doc_path, raises_on_error=False, page_range=(10, 10)
    )
    assert out_of_range.status == ConversionStatus.FAILURE


def test_ocr_coverage_threshold(test_doc_path):
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
Expand Down