|
1 | 1 | from dataclasses import dataclass
|
2 |
| -from io import StringIO |
3 | 2 | import logging
|
4 | 3 | import os
|
5 | 4 | from typing import Dict, List
|
6 | 5 | import uuid
|
7 | 6 | import unicodedata
|
8 | 7 |
|
9 |
| -import pikepdf |
10 |
| -from PIL import Image |
11 |
| -from pdfminer.converter import TextConverter |
12 |
| -from pdfminer.layout import LAParams |
13 |
| -from pdfminer.pdfdocument import PDFDocument |
14 |
| -from pdfminer.pdfpage import PDFPage |
15 |
| -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter |
16 |
| -from pdfminer.pdfparser import PDFParser |
| 8 | +import fitz |
17 | 9 |
|
18 | 10 | from normality import collapse_spaces # noqa
|
19 | 11 |
|
20 | 12 | from followthemoney import model
|
| 13 | +from ingestors.exc import UnauthorizedError |
21 | 14 | from ingestors.support.ocr import OCRSupport
|
22 | 15 | from ingestors.support.convert import DocumentConvertSupport
|
23 | 16 |
|
24 |
| -# silence some shouty debug output from pdfminer |
25 |
| -logging.getLogger("pdfminer").setLevel(logging.WARNING) |
26 |
| - |
27 | 17 | log = logging.getLogger(__name__)
|
28 | 18 |
|
29 | 19 |
|
@@ -85,143 +75,63 @@ def extract_pages(self, pdf_model: PdfModel, entity, manager):
|
def parse(self, file_path: str) -> PdfModel:
    """Takes a file_path to a pdf and returns a `PdfModel`.

    The document is opened with ``fitz`` and every page is passed, in
    order, to ``pdf_extract_page``; the resulting page models are collected
    on the returned model. ``metadata`` and ``xmp_metadata`` are left as
    ``None`` here.

    Raises:
        UnauthorizedError: if the opened document reports ``needs_pass``,
            i.e. it cannot be read without a password.
    """
    pdf_model = PdfModel(metadata=None, xmp_metadata=None, pages=[])
    with fitz.open(file_path) as pdf_doc:
        if pdf_doc.needs_pass:
            # Keep a trace of why ingestion stopped before bailing out.
            log.info("Failed to ingest password protected pdf: %s", file_path)
            raise UnauthorizedError
        # pdf_extract_page receives the zero-based index and converts it to
        # a one-based page number itself.
        for page_num, page in enumerate(pdf_doc):
            pdf_model.pages.append(self.pdf_extract_page(pdf_doc, page, page_num))
    return pdf_model
|
97 | 87 |
|
def parse_and_ingest(self, file_path: str, entity, manager):
    """Parse the PDF at `file_path` and apply the result to `entity`.

    The parsed model is handed to `extract_metadata`, then
    `extract_xmp_metadata`, then `extract_pages`, in that order. Any
    exception raised by `parse` or the extract steps propagates unchanged.
    """
    parsed = self.parse(file_path)
    # The two metadata steps share the same (model, entity) signature.
    for apply_metadata in (self.extract_metadata, self.extract_xmp_metadata):
        apply_metadata(parsed, entity)
    self.extract_pages(parsed, entity, manager)
107 | 93 |
|
def pdf_alternative_extract(self, entity, pdf_path: str, manager):
    """Store the PDF, record its checksum on the entity, then ingest it.

    The value returned by `self.manager.store(pdf_path)` is written to the
    entity's "pdfHash" property before `parse_and_ingest` runs. Note that
    the store call goes through `self.manager`, while the `manager`
    argument is only forwarded to `parse_and_ingest`.
    """
    entity.set("pdfHash", self.manager.store(pdf_path))
    self.parse_and_ingest(pdf_path, entity, manager)
|
112 | 98 |
|
113 |
| - def _find_images(self, container: pikepdf.Pdf, depth: int = 0): |
114 |
| - if "/Resources" not in container: |
115 |
| - return [] |
116 |
| - resources = container["/Resources"] |
117 |
| - |
118 |
| - if "/XObject" not in resources: |
119 |
| - return [] |
120 |
| - xobjects = resources["/XObject"].as_dict() |
121 |
| - |
122 |
| - if depth > 0: |
123 |
| - allow_recursion = False |
124 |
| - else: |
125 |
| - allow_recursion = True |
126 |
| - |
127 |
| - images = [] |
128 |
| - |
129 |
| - for xobject in xobjects: |
130 |
| - candidate = xobjects[xobject] |
131 |
| - if candidate["/Subtype"] == "/Image": |
132 |
| - if "/SMask" in candidate: |
133 |
| - images.append([candidate, candidate["/SMask"]]) |
134 |
| - else: |
135 |
| - images.append(candidate) |
136 |
| - elif allow_recursion and candidate["/Subtype"] == "/Form": |
137 |
| - images.extend(self._find_images(candidate, depth=depth + 1)) |
138 |
| - |
139 |
| - return images |
140 |
| - |
141 |
| - def _extract_images( |
142 |
| - self, pike_doc: pikepdf.Pdf, image_path: str, prefix: str = "img" |
143 |
| - ): |
144 |
| - raw_images = [] |
145 |
| - found_imgs = self._find_images(pike_doc) |
146 |
| - raw_images.extend(found_imgs) |
147 |
| - |
148 |
| - pdfimages = [] |
149 |
| - for r in raw_images: |
150 |
| - if isinstance(r, list): |
151 |
| - try: |
152 |
| - base_image = pikepdf.PdfImage(r[0]).as_pil_image() |
153 |
| - soft_mask = pikepdf.PdfImage(r[1]).as_pil_image() |
154 |
| - except NotImplementedError: |
155 |
| - # Skip unsupported image file formats |
156 |
| - continue |
157 |
| - |
158 |
| - if base_image.size != soft_mask.size: |
159 |
| - log.debug( |
160 |
| - "Warning: Image and /SMask have a different size. This is unexpected.", |
161 |
| - ) |
162 |
| - soft_mask = soft_mask.resize(base_image.size) |
163 |
| - |
164 |
| - if base_image.mode in ("L", "LA"): |
165 |
| - transparency = Image.new("LA", base_image.size, (0, 0)) |
166 |
| - else: |
167 |
| - if base_image.mode not in ("RGB", "RGBA"): |
168 |
| - base_image = base_image.convert("RGB") |
169 |
| - transparency = Image.new("RGBA", base_image.size, (0, 0, 0, 0)) |
170 |
| - |
171 |
| - composite = Image.composite(base_image, transparency, soft_mask) |
172 |
| - |
173 |
| - pdfimages.append(composite) |
174 |
| - |
175 |
| - else: |
176 |
| - pdfimages.append(pikepdf.PdfImage(r)) |
177 |
| - |
178 |
| - n_images = len(pdfimages) |
179 |
| - |
180 |
| - n_digits = len(str(n_images)) |
181 |
| - for i, image in enumerate(pdfimages): |
182 |
| - filepath_prefix = os.path.join(image_path, prefix + f"{i+1:0{n_digits}}") |
183 |
| - if isinstance(image, Image.Image): |
184 |
| - image.save(filepath_prefix + ".png", "PNG") |
185 |
| - else: |
186 |
| - pil_image = image.as_pil_image() |
187 |
| - if pil_image.format == "TIFF": |
188 |
| - pil_image.save(filepath_prefix + ".png", "PNG") |
189 |
| - image.extract_to(fileprefix=filepath_prefix) |
190 |
| - |
191 |
| - def pdf_extract_page( |
192 |
| - self, page: PDFPage, pike_doc: pikepdf._core.Pdf, page_number: int |
193 |
| - ) -> PdfPageModel: |
def pdf_extract_page(self, pdf_doc, page, page_number: int) -> PdfPageModel:
    """Extract the contents of a single PDF page, using OCR if need be.

    The page's own text is read first. Every image listed for the page is
    then resolved through the document and passed to OCR; any text OCR
    returns is appended after the page text, in image order.

    Args:
        pdf_doc: The open document the page belongs to; used to resolve
            image cross-references via ``extract_image``.
        page: The page object providing ``get_text`` and ``get_images``.
        page_number: Index of the page as supplied by the caller. The
            returned model stores ``page_number + 1``, so callers are
            expected to pass a zero-based index.

    Returns:
        A ``PdfPageModel`` with the one-based page number and the
        NFKD-normalised, stripped text.
    """
    languages = self.manager.context.get("languages")

    # Collect the pieces and join once instead of growing a string.
    text_parts = [page.get_text()]
    for image in page.get_images():
        xref = image[0]  # first item of each entry is the image's xref
        extracted = pdf_doc.extract_image(xref)
        if not extracted:
            # Nothing usable came back for this xref; skip it.
            continue
        # The raw image bytes go straight to OCR: writing them to a
        # temporary file only to read them back added nothing.
        ocr_text = self.extract_ocr_text(extracted["image"], languages=languages)
        if ocr_text is not None:
            text_parts.append(ocr_text)

    full_text = unicodedata.normalize("NFKD", "".join(text_parts).strip())
    # NFKD can expand a character into one that starts or ends with a
    # space, so strip again after normalising.
    return PdfPageModel(number=page_number + 1, text=full_text.strip())
0 commit comments