Skip to content

Commit f597da6

Browse files
authored
Handle TIFFs in PDFs by converting to PNG (#419)
Fixes #418 and thus alephdata/aleph#2810
1 parent bdbf294 commit f597da6

File tree

3 files changed

+14
-0
lines changed

3 files changed

+14
-0
lines changed

ingestors/support/pdf.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ def _extract_images(
178 178
if isinstance(image, Image.Image):
179 179
image.save(filepath_prefix + ".png", "PNG")
180 180
else:
181+
pil_image = image.as_pil_image()
182+
if pil_image.format == "TIFF":
183+
pil_image.save(filepath_prefix + ".png", "PNG")
181 184
image.extract_to(fileprefix=filepath_prefix)
182 185

183 186
def pdf_extract_page(

tests/fixtures/greek.pdf

274 KB
Binary file not shown.

tests/test_pdf.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,3 +195,14 @@ def test_ingest_index_text(self):
195 195
assert (
196 196
"Erklärung verkündeten Rechte und Freiheiten zum Ziel hat." in pages_index
197 197
)
198+
199+
def test_ingest_pdf_ocr_greek(self):
200+
fixture_path, entity = self.fixture("greek.pdf")
201+
self.manager.ingest(fixture_path, entity)
202+
203+
emitted = self.get_emitted()
204+
assert len(emitted) == 3
205+
206+
page = emitted[1]
207+
assert page.schema.name == "Page"
208+
assert "IRIDECEA HOLDINGS LIMITED" in "\n".join(page.get("bodyText"))

0 commit comments

Comments
 (0)