Skip to content

Commit 084bd11

Browse files
committed
Use PyMuPDF for text and image extraction
1 parent 9505616 commit 084bd11

File tree

6 files changed

+56
-150
lines changed

6 files changed

+56
-150
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ make build
4040
make test
4141
source .env/bin/activate
4242
bump2version {patch,minor,major} # pick the appropriate one
43-
git push --atomic origin $(git rev-parse --abbrev-ref HEAD) $(git describe --tags --abbrev=0)
43+
git push --atomic origin main $(git describe --tags --abbrev=0)
4444
```
4545

4646
## Usage

ingestors/documents/pdf.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,7 @@
22

33
from ingestors.ingestor import Ingestor
44
from ingestors.support.pdf import PDFSupport
5-
from ingestors.exc import ProcessingException
6-
7-
import pikepdf
8-
9-
# silence some shouty debug output from pdfminer
10-
logging.getLogger("pdfminer").setLevel(logging.WARNING)
5+
from ingestors.exc import ProcessingException, UnauthorizedError
116

127
log = logging.getLogger(__name__)
138

@@ -28,7 +23,7 @@ def ingest(self, file_path, entity):
2823
"""Ingestor implementation."""
2924
try:
3025
self.parse_and_ingest(file_path, entity, self.manager)
31-
except pikepdf._core.PasswordError as pwe:
26+
except UnauthorizedError as pwe:
3227
raise ProcessingException(
3328
"Could not extract PDF file. The PDF is protected with a password. Try removing the password protection and re-uploading the documents."
3429
) from pwe

ingestors/exc.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
class ProcessingException(Exception):
22
"A data-related error occurring during file processing."
33
pass
4+
5+
6+
class UnauthorizedError(Exception):
7+
"""Raised when a document is protected by a password and cannot be parsed."""
8+
9+
pass

ingestors/support/pdf.py

Lines changed: 44 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,19 @@
11
from dataclasses import dataclass
2-
from io import StringIO
32
import logging
43
import os
54
from typing import Dict, List
65
import uuid
76
import unicodedata
87

9-
import pikepdf
10-
from PIL import Image
11-
from pdfminer.converter import TextConverter
12-
from pdfminer.layout import LAParams
13-
from pdfminer.pdfdocument import PDFDocument
14-
from pdfminer.pdfpage import PDFPage
15-
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
16-
from pdfminer.pdfparser import PDFParser
8+
import fitz
179

1810
from normality import collapse_spaces # noqa
1911

2012
from followthemoney import model
13+
from ingestors.exc import UnauthorizedError
2114
from ingestors.support.ocr import OCRSupport
2215
from ingestors.support.convert import DocumentConvertSupport
2316

24-
# silence some shouty debug output from pdfminer
25-
logging.getLogger("pdfminer").setLevel(logging.WARNING)
26-
2717
log = logging.getLogger(__name__)
2818

2919

@@ -85,143 +75,63 @@ def extract_pages(self, pdf_model: PdfModel, entity, manager):
8575
def parse(self, file_path: str) -> PdfModel:
8676
"""Takes a file_path to a pdf and returns a `PdfModel`"""
8777
pdf_model = PdfModel(metadata=None, xmp_metadata=None, pages=[])
88-
with open(file_path, "rb") as pdf_file:
89-
parser = PDFParser(pdf_file)
90-
pike_doc = pikepdf.Pdf.open(pdf_file)
91-
pdf_doc = PDFDocument(parser)
92-
for page_number, page in enumerate(PDFPage.create_pages(pdf_doc), 1):
78+
with fitz.open(file_path) as pdf_doc:
79+
if pdf_doc.needs_pass:
80+
raise UnauthorizedError
81+
# print(f"\n[IF] number of pages: {pdf_doc.page_count}")
82+
for page_num in range(pdf_doc.page_count):
9383
pdf_model.pages.append(
94-
self.pdf_extract_page(page, pike_doc, page_number)
84+
self.pdf_extract_page(pdf_doc, pdf_doc[page_num], page_num)
9585
)
9686
return pdf_model
9787

9888
def parse_and_ingest(self, file_path: str, entity, manager):
99-
try:
100-
pdf_model: PdfModel = self.parse(file_path)
101-
self.extract_metadata(pdf_model, entity)
102-
self.extract_xmp_metadata(pdf_model, entity)
103-
self.extract_pages(pdf_model, entity, manager)
104-
except pikepdf._core.PasswordError as pwe:
105-
log.info(f"Failed to ingest password protected pdf: {file_path}")
106-
raise pwe
89+
pdf_model: PdfModel = self.parse(file_path)
90+
self.extract_metadata(pdf_model, entity)
91+
self.extract_xmp_metadata(pdf_model, entity)
92+
self.extract_pages(pdf_model, entity, manager)
10793

10894
def pdf_alternative_extract(self, entity, pdf_path: str, manager):
10995
checksum = self.manager.store(pdf_path)
11096
entity.set("pdfHash", checksum)
11197
self.parse_and_ingest(pdf_path, entity, manager)
11298

113-
def _find_images(self, container: pikepdf.Pdf, depth: int = 0):
114-
if "/Resources" not in container:
115-
return []
116-
resources = container["/Resources"]
117-
118-
if "/XObject" not in resources:
119-
return []
120-
xobjects = resources["/XObject"].as_dict()
121-
122-
if depth > 0:
123-
allow_recursion = False
124-
else:
125-
allow_recursion = True
126-
127-
images = []
128-
129-
for xobject in xobjects:
130-
candidate = xobjects[xobject]
131-
if candidate["/Subtype"] == "/Image":
132-
if "/SMask" in candidate:
133-
images.append([candidate, candidate["/SMask"]])
134-
else:
135-
images.append(candidate)
136-
elif allow_recursion and candidate["/Subtype"] == "/Form":
137-
images.extend(self._find_images(candidate, depth=depth + 1))
138-
139-
return images
140-
141-
def _extract_images(
142-
self, pike_doc: pikepdf.Pdf, image_path: str, prefix: str = "img"
143-
):
144-
raw_images = []
145-
found_imgs = self._find_images(pike_doc)
146-
raw_images.extend(found_imgs)
147-
148-
pdfimages = []
149-
for r in raw_images:
150-
if isinstance(r, list):
151-
try:
152-
base_image = pikepdf.PdfImage(r[0]).as_pil_image()
153-
soft_mask = pikepdf.PdfImage(r[1]).as_pil_image()
154-
except NotImplementedError:
155-
# Skip unsupported image file formats
156-
continue
157-
158-
if base_image.size != soft_mask.size:
159-
log.debug(
160-
"Warning: Image and /SMask have a different size. This is unexpected.",
161-
)
162-
soft_mask = soft_mask.resize(base_image.size)
163-
164-
if base_image.mode in ("L", "LA"):
165-
transparency = Image.new("LA", base_image.size, (0, 0))
166-
else:
167-
if base_image.mode not in ("RGB", "RGBA"):
168-
base_image = base_image.convert("RGB")
169-
transparency = Image.new("RGBA", base_image.size, (0, 0, 0, 0))
170-
171-
composite = Image.composite(base_image, transparency, soft_mask)
172-
173-
pdfimages.append(composite)
174-
175-
else:
176-
pdfimages.append(pikepdf.PdfImage(r))
177-
178-
n_images = len(pdfimages)
179-
180-
n_digits = len(str(n_images))
181-
for i, image in enumerate(pdfimages):
182-
filepath_prefix = os.path.join(image_path, prefix + f"{i+1:0{n_digits}}")
183-
if isinstance(image, Image.Image):
184-
image.save(filepath_prefix + ".png", "PNG")
185-
else:
186-
pil_image = image.as_pil_image()
187-
if pil_image.format == "TIFF":
188-
pil_image.save(filepath_prefix + ".png", "PNG")
189-
image.extract_to(fileprefix=filepath_prefix)
190-
191-
def pdf_extract_page(
192-
self, page: PDFPage, pike_doc: pikepdf._core.Pdf, page_number: int
193-
) -> PdfPageModel:
99+
def pdf_extract_page(self, pdf_doc, page, page_number: int) -> PdfPageModel:
194100
"""Extract the contents of a single PDF page, using OCR if need be."""
195-
buf = StringIO()
196-
rsrcmgr = PDFResourceManager()
197-
device = TextConverter(
198-
rsrcmgr,
199-
buf,
200-
laparams=LAParams(
201-
line_overlap=0.5, # default: 0.5
202-
char_margin=2.0, # default: 2.0
203-
word_margin=0.2, # default: 0.1
204-
line_margin=0.5, # default: 0.5
205-
boxes_flow=0.5, # default: 0.5
206-
detect_vertical=True, # default: False
207-
all_texts=True, # default: False
208-
),
209-
)
210-
interpreter = PDFPageInterpreter(rsrcmgr, device)
211-
interpreter.process_page(page)
212-
texts = buf.getvalue()
101+
# Extract text
102+
full_text = page.get_text()
103+
# print(f"[IF] extracted text: \n{full_text}")
104+
105+
# Extract images
106+
images = page.get_images()
107+
108+
# Create a temporary location to store all extracted images
213109
temp_dir = self.make_empty_directory()
214-
image_path = temp_dir.joinpath(str(uuid.uuid4()))
215-
os.mkdir(image_path)
216-
pike_page = pike_doc.pages[page_number - 1]
217-
self._extract_images(pike_page, image_path)
110+
image_dir = temp_dir.joinpath(str(uuid.uuid4()))
111+
os.mkdir(image_dir)
112+
113+
# Extract images from PDF and store them on the disk
114+
extracted_images = []
115+
for image_index, image in enumerate(images, start=1):
116+
xref = image[0]
117+
img = pdf_doc.extract_image(xref)
118+
if img:
119+
image_path = os.path.join(
120+
image_dir, f"image{page_number+1}_{image_index}.{img['ext']}"
121+
)
122+
with open(image_path, "wb") as image_file:
123+
image_file.write(img["image"])
124+
extracted_images.append(image_path)
125+
126+
# Attempt to OCR the images and extract text
218127
languages = self.manager.context.get("languages")
219-
for image_file in image_path.glob("*.png"):
220-
with open(image_file, "rb") as fh:
128+
for image_path in extracted_images:
129+
with open(image_path, "rb") as fh:
221130
data = fh.read()
222131
text = self.extract_ocr_text(data, languages=languages)
223132
if text is not None:
224-
texts += text
133+
# print(f"[IF] extracted text from images: \n{text}")
134+
full_text += text
225135

226-
texts = unicodedata.normalize("NFKD", texts.strip())
227-
return PdfPageModel(number=page_number, text=texts.strip())
136+
full_text = unicodedata.normalize("NFKD", full_text.strip())
137+
return PdfPageModel(number=page_number + 1, text=full_text.strip())

requirements.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ click==8.1.3
2020

2121
# File format support
2222
dbf==0.99.3
23-
pdfminer.six==20221105
2423
pymediainfo==6.0.1
25-
pikepdf==7.1.1
2624
python-magic==0.4.27
2725
rarfile==4.0
2826
xlrd==2.0.1
@@ -38,3 +36,4 @@ icalendar==5.0.4
3836

3937
cryptography==39.0.1
4038
requests[security]==2.28.2
39+
pymupdf==1.21.1

tests/test_pdf.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def test_ingest_noisy_fixture(self):
3434
self.manager.ingest(fixture_path, entity)
3535
self.assertEqual(len(self.get_emitted()), 501)
3636
self.assertEqual(
37-
self.manager.entities[0].first("bodyText"), "Hello, World!\n\nHello, World!"
37+
self.manager.entities[0].first("bodyText"), "Hello, World!\nHello, World!"
3838
)
3939
self.assertEqual(entity.schema.name, "Pages")
4040

@@ -125,9 +125,7 @@ def test_pdf_conversion_metadata(self):
125125
assert entity.get("modifiedAt") == ["2015-10-05T08:57:00"]
126126

127127
def test_pdf_letter_spacing(self):
128-
"""Checks some tricky word spacing in the fancy food menu. This required
129-
overriding the pdfminersix LAParams default `word_margin` from 0.1 until
130-
it worked."""
128+
"""Checks some tricky word spacing in the fancy food menu."""
131129
fixture_path, entity = self.fixture("the-dorset-food-menu.pdf")
132130
self.manager.ingest(fixture_path, entity)
133131

@@ -143,9 +141,7 @@ def test_pdf_letter_spacing(self):
143141
for expected_string in [
144142
"served with marinated olives",
145143
"made with vegetarian ingredients",
146-
"dorset",
147144
"triple-cooked chips",
148-
"e\ndorset", # can't get it to detect the "the" :(
149145
]:
150146
assert expected_string in body_one.lower()
151147

0 commit comments

Comments
 (0)