microsoft · cyphercodes · Jul 5, 2026
diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py
@@ -89,14 +89,28 @@ def _extract_images_from_page(page: Any) -> list[dict]:
                     y0 = img_dict.get("top", 0)
                     x1 = img_dict.get("x1", 0)
                     y1 = img_dict.get("bottom", 0)
+
+                    page_bbox = getattr(page, "bbox", None)
+                    if page_bbox is not None:
+                        try:
+                            page_x0, page_y0, page_x1, page_y1 = page_bbox
+                        except (TypeError, ValueError):
+                            pass
+                        else:
+                            x0 = max(x0, page_x0)
+                            y0 = max(y0, page_y0)
+                            x1 = min(x1, page_x1)
+                            y1 = min(y1, page_y1)
+
                     y_pos = y0
 
-                    # Check if dimensions are valid
+                    # Check if dimensions are valid after clamping to the page bounds.
                     if x1 <= x0 or y1 <= y0:
                         continue
 
-                    # Use pdfplumber's within_bbox to crop, then render
-                    # This preserves coordinate system correctly
+                    # Use pdfplumber's within_bbox to crop, then render.
+                    # This preserves coordinate system correctly while tolerating
+                    # tiny bbox overflows reported by some scanned/image-only PDFs.
                     bbox = (x0, y0, x1, y1)
                     cropped_page = page.within_bbox(bbox)
 

diff --git a/packages/markitdown-ocr/tests/test_pdf_converter.py b/packages/markitdown-ocr/tests/test_pdf_converter.py
@@ -23,6 +23,7 @@
 from markitdown_ocr._ocr_service import OCRResult  # noqa: E402
 from markitdown_ocr._pdf_converter_with_ocr import (  # noqa: E402
     PdfConverterWithOCR,
+    _extract_images_from_page,
 )
 from markitdown import StreamInfo  # noqa: E402
 
@@ -42,6 +43,47 @@ def extract_text(
         return OCRResult(text=_MOCK_TEXT, backend_used="mock")
 
 
+def test_extract_images_clamps_slightly_out_of_bounds_bbox() -> None:
+    class FakeRenderedImage:  # rendered-image stub: save() writes dummy bytes
+        def save(self, stream: Any, format: str) -> None:
+            stream.write(b"fake png data")
+
+    class FakePageImage:  # stub for the object returned by to_image()
+        original = FakeRenderedImage()
+
+    class FakeCroppedPage:  # stub for the page returned by within_bbox()
+        def to_image(self, resolution: int) -> FakePageImage:
+            return FakePageImage()
+
+    class FakePage:  # page stub that records the bbox it is asked to crop
+        page_number = 1
+        bbox = (0, 0, 100, 100)  # page bounds as (x0, y0, x1, y1)
+        images = [{"x0": 10, "top": -1.999e-05, "x1": 90, "bottom": 80}]
+
+        def __init__(self) -> None:
+            self.cropped_bbox: tuple[float, float, float, float] | None = None
+
+        def within_bbox(
+            self, bbox: tuple[float, float, float, float]
+        ) -> FakeCroppedPage:
+            x0, y0, x1, y1 = bbox  # crop must lie within the page bbox
+            assert x0 >= self.bbox[0]
+            assert y0 >= self.bbox[1]
+            assert x1 <= self.bbox[2]
+            assert y1 <= self.bbox[3]
+            self.cropped_bbox = bbox  # recorded so the test can inspect it
+            return FakeCroppedPage()
+
+    page = FakePage()
+    # "top" is -1.999e-05, just outside the page bbox, so it must be clamped to 0.
+    images = _extract_images_from_page(page)
+    # Only the top edge changes; the other three edges already lie inside the page.
+    assert page.cropped_bbox == (10, 0, 90, 80)
+    assert len(images) == 1
+    assert images[0]["name"] == "page_1_img_0"
+    assert images[0]["y_pos"] == 0  # y_pos follows the clamped top edge
+
 @pytest.fixture(scope="module")
 def svc() -> MockOCRService:
     return MockOCRService()