From c6b3763ecb6ef862840a30978ee177b907f86505 Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com>
Date: Tue, 12 Nov 2024 09:46:14 +0100
Subject: [PATCH 1/4] feat(OCR): Introduce the OcrOptions.force_full_page_ocr
 parameter that forces a full page OCR scanning (#290)

- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
---
 docling/cli/main.py                       | 13 +++++--
 docling/datamodel/pipeline_options.py     |  1 +
 docling/models/base_ocr_model.py          | 25 ++++++++++++--
 docling/models/easyocr_model.py           | 10 ++----
 docling/models/tesseract_ocr_cli_model.py | 10 ++----
 docling/models/tesseract_ocr_model.py     | 10 ++----
 docs/examples/full_page_ocr.py            | 42 +++++++++++++++++++++++
 mkdocs.yml                                |  1 +
 tests/test_e2e_ocr_conversion.py          | 42 ++++-------------------
 tests/verify_utils.py                     |  8 +++++
 10 files changed, 100 insertions(+), 62 deletions(-)
 create mode 100644 docs/examples/full_page_ocr.py

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 35ae01df2..60a3c296a 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -153,6 +153,13 @@ def convert(
             ..., help="If enabled, the bitmap content will be processed using OCR."
         ),
     ] = True,
+    force_ocr: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="Replace any existing text with OCR generated text over the full content.",
+        ),
+    ] = False,
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(
 
     match ocr_engine:
         case OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions()
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
         case OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions()
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
         case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions()
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index d57f16711..2b9d228c5 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR
     )
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 9d26a317f..38b5e52cd 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -10,7 +10,7 @@
 from rtree import index
 from scipy.ndimage import find_objects, label
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ def find_ocr_rects(size, bitmap_rects):
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
 
         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
+        if self.options.force_full_page_ocr or coverage > max(
+            BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
+        ):
             return [
                 BoundingBox(
                     l=0,
@@ -96,7 +98,7 @@ def find_ocr_rects(size, bitmap_rects):
             return ocr_rects
 
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
         # Create R-tree index for programmatic cells
         p = index.Property()
         p.dimension = 2
@@ -117,6 +119,23 @@ def is_overlapping_with_existing_cells(ocr_cell):
         ]
         return filtered_ocr_cells
 
+    def post_process_cells(self, ocr_cells, programmatic_cells):
+        r"""
+        Post-process the ocr and programmatic cells and return the final list of cells
+        """
+        if self.options.force_full_page_ocr:
+            # If a full page OCR is forced, use only the OCR cells
+            cells = [
+                Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
+                for c_ocr in ocr_cells
+            ]
+            return cells
+
+        ## Remove OCR cells which overlap with programmatic cells.
+        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
+        programmatic_cells.extend(filtered_ocr_cells)
+        return programmatic_cells
+
     def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
         image = copy.deepcopy(page.image)
         draw = ImageDraw.Draw(image, "RGBA")
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 1b8e914f7..f8d0cf8df 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -5,7 +5,7 @@
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.datamodel.settings import settings
@@ -88,12 +88,8 @@ def __call__(
                         ]
                         all_ocr_cells.extend(cells)
 
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
 
                 # DEBUG code:
                 if settings.debug.visualize_ocr:
diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py
index 6f939351a..9a50eee02 100644
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ def __call__(
                             )
                             all_ocr_cells.append(cell)
 
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
 
                 # DEBUG code:
                 if settings.debug.visualize_ocr:
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index 83f238372..b2bd358b0 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -3,7 +3,7 @@
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ def __call__(
                         # del high_res_image
                         all_ocr_cells.extend(cells)
 
-                    ## Remove OCR cells which overlap with programmatic cells.
-                    filtered_ocr_cells = self.filter_ocr_cells(
-                        all_ocr_cells, page.cells
-                    )
-
-                    page.cells.extend(filtered_ocr_cells)
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
 
                 # DEBUG code:
                 if settings.debug.visualize_ocr:
diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
new file mode 100644
index 000000000..35c2ba6b7
--- /dev/null
+++ b/docs/examples/full_page_ocr.py
@@ -0,0 +1,42 @@
+from pathlib import Path
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    input_doc = Path("./tests/data/2206.01062.pdf")
+
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions
+    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
+    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
+    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
+    pipeline_options.ocr_options = ocr_options
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    doc = converter.convert(input_doc).document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 2ce244bd7..25eb48f44 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -71,6 +71,7 @@ nav:
       - "Figure enrichment": examples/develop_picture_enrichment.py
       - "Table export": examples/export_tables.py
       - "Multimodal export": examples/export_multimodal.py
+      - "Force full page OCR": examples/full_page_ocr.py
     - RAG / QA:
       - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
       - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 2aeda467b..324a4a14a 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -15,34 +15,8 @@
 
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
 
-GENERATE = False
-
-
-# Debug
-def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
-    r""" """
-    import json
-    import os
-
-    parent = pdf_path.parent
-    eng = "" if engine is None else f".{engine}"
-
-    dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
-    with open(dict_fn, "w") as fd:
-        json.dump(doc_result.legacy_document.export_to_dict(), fd)
-
-    pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
-    pages = [p.model_dump() for p in doc_result.pages]
-    with open(pages_fn, "w") as fd:
-        json.dump(pages, fd)
-
-    doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
-    with open(doctags_fn, "w") as fd:
-        fd.write(doc_result.legacy_document.export_to_doctags())
-
-    md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
-    with open(md_fn, "w") as fd:
-        fd.write(doc_result.legacy_document.export_to_markdown())
+GENERATE_V1 = False
+GENERATE_V2 = False
 
 
 def get_pdf_paths():
@@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
 
 
 def test_e2e_conversions():
-
     pdf_paths = get_pdf_paths()
 
     engines: List[OcrOptions] = [
         EasyOcrOptions(),
         TesseractOcrOptions(),
         TesseractCliOcrOptions(),
+        EasyOcrOptions(force_full_page_ocr=True),
+        TesseractOcrOptions(force_full_page_ocr=True),
+        TesseractCliOcrOptions(force_full_page_ocr=True),
     ]
 
     for ocr_options in engines:
@@ -91,20 +67,16 @@ def test_e2e_conversions():
 
             doc_result: ConversionResult = converter.convert(pdf_path)
 
-            # Save conversions
-            # save_output(pdf_path, doc_result, None)
-
-            # Debug
             verify_conversion_result_v1(
                 input_path=pdf_path,
                 doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V1,
                 fuzzy=True,
             )
 
             verify_conversion_result_v2(
                 input_path=pdf_path,
                 doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V2,
                 fuzzy=True,
             )
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 20f5eef00..c444266b7 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
     dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
 
     if generate:  # only used when re-generating truth
+        pages_path.parent.mkdir(parents=True, exist_ok=True)
         with open(pages_path, "w") as fw:
             fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
 
+        json_path.parent.mkdir(parents=True, exist_ok=True)
         with open(json_path, "w") as fw:
             fw.write(json.dumps(doc_pred, default=pydantic_encoder))
 
+        md_path.parent.mkdir(parents=True, exist_ok=True)
         with open(md_path, "w") as fw:
             fw.write(doc_pred_md)
 
+        dt_path.parent.mkdir(parents=True, exist_ok=True)
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
     dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
 
     if generate:  # only used when re-generating truth
+        pages_path.parent.mkdir(parents=True, exist_ok=True)
         with open(pages_path, "w") as fw:
             fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
 
+        json_path.parent.mkdir(parents=True, exist_ok=True)
         with open(json_path, "w") as fw:
             fw.write(json.dumps(doc_pred, default=pydantic_encoder))
 
+        md_path.parent.mkdir(parents=True, exist_ok=True)
         with open(md_path, "w") as fw:
             fw.write(doc_pred_md)
 
+        dt_path.parent.mkdir(parents=True, exist_ok=True)
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test

From 5d4a10b121317fa481208dacbee47032b08ff928 Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:57:16 +0100
Subject: [PATCH 2/4] fix: Configure env prefix for docling settings (#315)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling/datamodel/settings.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py
index 7daf5047b..b1c47305f 100644
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from pydantic import BaseModel
-from pydantic_settings import BaseSettings
+from pydantic_settings import BaseSettings, SettingsConfigDict
 
 
 class DocumentLimits(BaseModel):
@@ -40,6 +40,8 @@ class DebugSettings(BaseModel):
 
 
 class AppSettings(BaseSettings):
+    model_config = SettingsConfigDict(env_prefix="DOCLING_", env_nested_delimiter="_")
+
     perf: BatchConcurrencySettings
     debug: DebugSettings
 

From 777237ebc9f86af4a516e0770810fa8fbbf1f52d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:19:55 +0000
Subject: [PATCH 3/4] chore: bump version to 2.5.0 [skip ci]

---
 CHANGELOG.md   | 16 ++++++++++++++++
 pyproject.toml |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10cd14022..76963f6cd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,19 @@
+## [v2.5.0](https://github.com/DS4SD/docling/releases/tag/v2.5.0) - 2024-11-12
+
+### Feature
+
+* **OCR:** Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning ([#290](https://github.com/DS4SD/docling/issues/290)) ([`c6b3763`](https://github.com/DS4SD/docling/commit/c6b3763ecb6ef862840a30978ee177b907f86505))
+
+### Fix
+
+* Configure env prefix for docling settings ([#315](https://github.com/DS4SD/docling/issues/315)) ([`5d4a10b`](https://github.com/DS4SD/docling/commit/5d4a10b121317fa481208dacbee47032b08ff928))
+* Added handling of grouped elements in pptx backend ([#307](https://github.com/DS4SD/docling/issues/307)) ([`81c8243`](https://github.com/DS4SD/docling/commit/81c8243a8bf177feed8f87ea283b5bb6836350cb))
+* Allow mps usage for easyocr ([#286](https://github.com/DS4SD/docling/issues/286)) ([`97f214e`](https://github.com/DS4SD/docling/commit/97f214efddcf66f0734a95c17c08936f6111d113))
+
+### Documentation
+
+* Add navigation indices ([#305](https://github.com/DS4SD/docling/issues/305)) ([`1239ade`](https://github.com/DS4SD/docling/commit/1239ade2750349d13d4e865d88449b232bbad944))
+
 ## [v2.4.2](https://github.com/DS4SD/docling/releases/tag/v2.4.2) - 2024-11-08
 
 ### Fix
diff --git a/pyproject.toml b/pyproject.toml
index 0451ba805..95db56431 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.4.2"  # DO NOT EDIT, updated automatically
+version = "2.5.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"

From 93fc1be61abfe0669daf26c0984a51ec8675bf62 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 12 Nov 2024 12:21:48 +0100
Subject: [PATCH 4/4] docs: add Data Prep Kit integration (#316)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docs/integrations/data_prep_kit.md | 13 +++++++++++++
 docs/integrations/llamaindex.md    |  2 +-
 mkdocs.yml                         |  5 +++--
 3 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 docs/integrations/data_prep_kit.md

diff --git a/docs/integrations/data_prep_kit.md b/docs/integrations/data_prep_kit.md
new file mode 100644
index 000000000..5885e8eda
--- /dev/null
+++ b/docs/integrations/data_prep_kit.md
@@ -0,0 +1,13 @@
+## Get started
+
+Docling is used by the [Data Prep Kit \[↗\]](https://ibm.github.io/data-prep-kit/) open-source toolkit for preparing unstructured data for LLM application development ranging from laptop scale to datacenter scale.
+
+Below you will find the Data Prep Kit modules powered by Docling.
+
+## PDF ingestion to Parquet
+- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet)
+- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/pdf2parquet/python/)
+
+## Document chunking
+- 💻 [GitHub \[↗\]](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/doc_chunk)
+- 📖 [API docs \[↗\]](https://ibm.github.io/data-prep-kit/transforms/language/doc_chunk/python/)
diff --git a/docs/integrations/llamaindex.md b/docs/integrations/llamaindex.md
index a43a6e6a4..41eb6e3d9 100644
--- a/docs/integrations/llamaindex.md
+++ b/docs/integrations/llamaindex.md
@@ -1,6 +1,6 @@
 ## Get started
 
-Docling is available as an official LlamaIndex extension!
+Docling is available as an official [LlamaIndex \[↗\]](https://docs.llamaindex.ai/) extension.
 
 To get started, check out the [step-by-step guide in LlamaIndex \[↗\]](https://docs.llamaindex.ai/en/stable/examples/data_connectors/DoclingReaderDemo/)<!--{target="_blank"}-->.
 
diff --git a/mkdocs.yml b/mkdocs.yml
index 25eb48f44..6d8de0858 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -81,8 +81,9 @@ nav:
     #   - CLI: examples/cli.md
   - Integrations:
     - Integrations: integrations/index.md
-    - "LlamaIndex 🦙 extension": integrations/llamaindex.md
-    # - "LangChain 🦜🔗 extension": integrations/langchain.md
+    - "Data Prep Kit": integrations/data_prep_kit.md
+    - "LlamaIndex 🦙": integrations/llamaindex.md
+    # - "LangChain 🦜🔗": integrations/langchain.md
   # - API reference:
   #   - API reference: api_reference/index.md