From c6b3763ecb6ef862840a30978ee177b907f86505 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Tue, 12 Nov 2024 09:46:14 +0100 Subject: [PATCH 1/4] feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290) - When the OCR is forced, any existing PDF cells are rejected. - Introduce the force-ocr cmd parameter in docling CLI. - Update unit tests. - Add the full_page_ocr.py example in mkdocs. Signed-off-by: Nikos Livathinos --- docling/cli/main.py | 13 +++++-- docling/datamodel/pipeline_options.py | 1 + docling/models/base_ocr_model.py | 25 ++++++++++++-- docling/models/easyocr_model.py | 10 ++---- docling/models/tesseract_ocr_cli_model.py | 10 ++---- docling/models/tesseract_ocr_model.py | 10 ++---- docs/examples/full_page_ocr.py | 42 +++++++++++++++++++++++ mkdocs.yml | 1 + tests/test_e2e_ocr_conversion.py | 42 ++++------------------- tests/verify_utils.py | 8 +++++ 10 files changed, 100 insertions(+), 62 deletions(-) create mode 100644 docs/examples/full_page_ocr.py diff --git a/docling/cli/main.py b/docling/cli/main.py index 35ae01df2..60a3c296a 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -153,6 +153,13 @@ def convert( ..., help="If enabled, the bitmap content will be processed using OCR." 
), ] = True, + force_ocr: Annotated[ + bool, + typer.Option( + ..., + help="Replace any existing text with OCR generated text over the full content.", + ), + ] = False, ocr_engine: Annotated[ OcrEngine, typer.Option(..., help="The OCR engine to use.") ] = OcrEngine.EASYOCR, @@ -219,11 +226,11 @@ def convert( match ocr_engine: case OcrEngine.EASYOCR: - ocr_options: OcrOptions = EasyOcrOptions() + ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr) case OcrEngine.TESSERACT_CLI: - ocr_options = TesseractCliOcrOptions() + ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr) case OcrEngine.TESSERACT: - ocr_options = TesseractOcrOptions() + ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr) case _: raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index d57f16711..2b9d228c5 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel): class OcrOptions(BaseModel): kind: str + force_full_page_ocr: bool = False # If enabled a full page OCR is always applied bitmap_area_threshold: float = ( 0.05 # percentage of the area for a bitmap to processed with OCR ) diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 9d26a317f..38b5e52cd 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -10,7 +10,7 @@ from rtree import index from scipy.ndimage import find_objects, label -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import OcrOptions from docling.datamodel.settings import settings @@ -73,7 +73,9 @@ def find_ocr_rects(size, bitmap_rects): coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) # return 
full-page rectangle if sufficiently covered with bitmaps - if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold): + if self.options.force_full_page_ocr or coverage > max( + BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold + ): return [ BoundingBox( l=0, @@ -96,7 +98,7 @@ def find_ocr_rects(size, bitmap_rects): return ocr_rects # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. - def filter_ocr_cells(self, ocr_cells, programmatic_cells): + def _filter_ocr_cells(self, ocr_cells, programmatic_cells): # Create R-tree index for programmatic cells p = index.Property() p.dimension = 2 @@ -117,6 +119,23 @@ def is_overlapping_with_existing_cells(ocr_cell): ] return filtered_ocr_cells + def post_process_cells(self, ocr_cells, programmatic_cells): + r""" + Post-process the ocr and programmatic cells and return the final list of cells + """ + if self.options.force_full_page_ocr: + # If a full page OCR is forced, use only the OCR cells + cells = [ + Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox) + for c_ocr in ocr_cells + ] + return cells + + ## Remove OCR cells which overlap with programmatic cells. 
+ filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells) + programmatic_cells.extend(filtered_ocr_cells) + return programmatic_cells + def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): image = copy.deepcopy(page.image) draw = ImageDraw.Draw(image, "RGBA") diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 1b8e914f7..f8d0cf8df 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -5,7 +5,7 @@ import torch from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import EasyOcrOptions from docling.datamodel.settings import settings @@ -88,12 +88,8 @@ def __call__( ] all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 6f939351a..9a50eee02 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -7,7 +7,7 @@ import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings @@ -170,12 +170,8 @@ def __call__( ) all_ocr_cells.append(cell) - ## Remove OCR cells which overlap with programmatic 
cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 83f238372..b2bd358b0 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -3,7 +3,7 @@ from docling_core.types.doc import BoundingBox, CoordOrigin -from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings @@ -140,12 +140,8 @@ def __call__( # del high_res_image all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells( - all_ocr_cells, page.cells - ) - - page.cells.extend(filtered_ocr_cells) + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) # DEBUG code: if settings.debug.visualize_ocr: diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py new file mode 100644 index 000000000..35c2ba6b7 --- /dev/null +++ b/docs/examples/full_page_ocr.py @@ -0,0 +1,42 @@ +from pathlib import Path + +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.document_converter import DocumentConverter, PdfFormatOption + + +def main(): + input_doc = Path("./tests/data/2206.01062.pdf") + + pipeline_options = PdfPipelineOptions() + pipeline_options.do_ocr = True + 
pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + + # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions + # ocr_options = EasyOcrOptions(force_full_page_ocr=True) + # ocr_options = TesseractOcrOptions(force_full_page_ocr=True) + ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) + pipeline_options.ocr_options = ocr_options + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + + doc = converter.convert(input_doc).document + md = doc.export_to_markdown() + print(md) + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 2ce244bd7..25eb48f44 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -71,6 +71,7 @@ nav: - "Figure enrichment": examples/develop_picture_enrichment.py - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py + - "Force full page OCR": examples/full_page_ocr.py - RAG / QA: - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 2aeda467b..324a4a14a 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -15,34 +15,8 @@ from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE = False - - -# Debug -def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): - r""" """ - import json - import os - - parent = pdf_path.parent - eng = "" if engine is None else f".{engine}" - - dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") - with open(dict_fn, "w") as fd: - json.dump(doc_result.legacy_document.export_to_dict(), fd) - - pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json") - pages = [p.model_dump() for p in doc_result.pages] - with 
open(pages_fn, "w") as fd: - json.dump(pages, fd) - - doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") - with open(doctags_fn, "w") as fd: - fd.write(doc_result.legacy_document.export_to_doctags()) - - md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") - with open(md_fn, "w") as fd: - fd.write(doc_result.legacy_document.export_to_markdown()) +GENERATE_V1 = False +GENERATE_V2 = False def get_pdf_paths(): @@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions): def test_e2e_conversions(): - pdf_paths = get_pdf_paths() engines: List[OcrOptions] = [ EasyOcrOptions(), TesseractOcrOptions(), TesseractCliOcrOptions(), + EasyOcrOptions(force_full_page_ocr=True), + TesseractOcrOptions(force_full_page_ocr=True), + TesseractCliOcrOptions(force_full_page_ocr=True), ] for ocr_options in engines: @@ -91,20 +67,16 @@ def test_e2e_conversions(): doc_result: ConversionResult = converter.convert(pdf_path) - # Save conversions - # save_output(pdf_path, doc_result, None) - - # Debug verify_conversion_result_v1( input_path=pdf_path, doc_result=doc_result, - generate=GENERATE, + generate=GENERATE_V1, fuzzy=True, ) verify_conversion_result_v2( input_path=pdf_path, doc_result=doc_result, - generate=GENERATE, + generate=GENERATE_V2, fuzzy=True, ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 20f5eef00..c444266b7 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -256,15 +256,19 @@ def verify_conversion_result_v1( dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth + pages_path.parent.mkdir(parents=True, exist_ok=True) with open(pages_path, "w") as fw: fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) + json_path.parent.mkdir(parents=True, exist_ok=True) with open(json_path, "w") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder)) + md_path.parent.mkdir(parents=True, exist_ok=True) with open(md_path, "w") as fw: fw.write(doc_pred_md) + 
dt_path.parent.mkdir(parents=True, exist_ok=True) with open(dt_path, "w") as fw: fw.write(doc_pred_dt) else: # default branch in test @@ -328,15 +332,19 @@ def verify_conversion_result_v2( dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth + pages_path.parent.mkdir(parents=True, exist_ok=True) with open(pages_path, "w") as fw: fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder)) + json_path.parent.mkdir(parents=True, exist_ok=True) with open(json_path, "w") as fw: fw.write(json.dumps(doc_pred, default=pydantic_encoder)) + md_path.parent.mkdir(parents=True, exist_ok=True) with open(md_path, "w") as fw: fw.write(doc_pred_md) + dt_path.parent.mkdir(parents=True, exist_ok=True) with open(dt_path, "w") as fw: fw.write(doc_pred_dt) else: # default branch in test From 5d4a10b121317fa481208dacbee47032b08ff928 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:57:16 +0100 Subject: [PATCH 2/4] fix: Configure env prefix for docling settings (#315) Signed-off-by: Christoph Auer --- docling/datamodel/settings.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 7daf5047b..b1c47305f 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -2,7 +2,7 @@ from pathlib import Path from pydantic import BaseModel -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict class DocumentLimits(BaseModel): @@ -40,6 +40,8 @@ class DebugSettings(BaseModel): class AppSettings(BaseSettings): + model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_") + perf: BatchConcurrencySettings debug: DebugSettings From 777237ebc9f86af4a516e0770810fa8fbbf1f52d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 12 Nov 2024 10:19:55 +0000 Subject: [PATCH 3/4] chore: bump 
version to 2.5.0 [skip ci] --- CHANGELOG.md | 16 ++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10cd14022..76963f6cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +## [v2.5.0](https://github.com/DS4SD/docling/releases/tag/v2.5.0) - 2024-11-12 + +### Feature + +* **OCR:** Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning ([#290](https://github.com/DS4SD/docling/issues/290)) ([`c6b3763`](https://github.com/DS4SD/docling/commit/c6b3763ecb6ef862840a30978ee177b907f86505)) + +### Fix + +* Configure env prefix for docling settings ([#315](https://github.com/DS4SD/docling/issues/315)) ([`5d4a10b`](https://github.com/DS4SD/docling/commit/5d4a10b121317fa481208dacbee47032b08ff928)) +* Added handling of grouped elements in pptx backend ([#307](https://github.com/DS4SD/docling/issues/307)) ([`81c8243`](https://github.com/DS4SD/docling/commit/81c8243a8bf177feed8f87ea283b5bb6836350cb)) +* Allow mps usage for easyocr ([#286](https://github.com/DS4SD/docling/issues/286)) ([`97f214e`](https://github.com/DS4SD/docling/commit/97f214efddcf66f0734a95c17c08936f6111d113)) + +### Documentation + +* Add navigation indices ([#305](https://github.com/DS4SD/docling/issues/305)) ([`1239ade`](https://github.com/DS4SD/docling/commit/1239ade2750349d13d4e865d88449b232bbad944)) + ## [v2.4.2](https://github.com/DS4SD/docling/releases/tag/v2.4.2) - 2024-11-08 ### Fix diff --git a/pyproject.toml b/pyproject.toml index 0451ba805..95db56431 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.4.2" # DO NOT EDIT, updated automatically +version = "2.5.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." 
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT" From 93fc1be61abfe0669daf26c0984a51ec8675bf62 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 12 Nov 2024 12:21:48 +0100 Subject: [PATCH 4/4] docs: add Data Prep Kit integration (#316) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/integrations/data_prep_kit.md | 13 +++++++++++++ docs/integrations/llamaindex.md | 2 +- mkdocs.yml | 5 +++-- 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 docs/integrations/data_prep_kit.md diff --git a/docs/integrations/data_prep_kit.md b/docs/integrations/data_prep_kit.md new file mode 100644 index 000000000..5885e8eda --- /dev/null +++ b/docs/integrations/data_prep_kit.md @@ -0,0 +1,13 @@ +## Get started + +Docling is used by the [Data Prep Kit \[↗\]](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale. + +Below you will find the Data Prep Kit modules powered by Docling. + +## PDF ingestion to Parquet +- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet) +- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/) + +## Document chunking +- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk) +- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/) diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md index a43a6e6a4..41eb6e3d9 100644 --- a/docs/integrations/llamaindex.md +++ b/docs/integrations/llamaindex.md @@ -1,6 +1,6 @@ ## Get started -Docling is available as an official LlamaIndex extension! 
+Docling is available as an official [LlamaIndex \[↗\]](https://docs.llamaindex.ai/) extension. To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/). diff --git a/mkdocs.yml b/mkdocs.yml index 25eb48f44..6d8de0858 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -81,8 +81,9 @@ nav: # - CLI: examples/cli.md - Integrations: - Integrations: integrations/index.md - - "LlamaIndex 🦙 extension": integrations/llamaindex.md - # - "LangChain 🦜🔗 extension": integrations/langchain.md + - "Data Prep Kit": integrations/data_prep_kit.md + - "LlamaIndex 🦙": integrations/llamaindex.md + # - "LangChain 🦜🔗": integrations/langchain.md # - API reference: # - API reference: api_reference/index.md