File tree 3 files changed +14
-0
lines changed
3 files changed +14
-0
lines changed Original file line number Diff line number Diff line change @@ -178,6 +178,9 @@ def _extract_images(
178
178
if isinstance (image , Image .Image ):
179
179
image .save (filepath_prefix + ".png" , "PNG" )
180
180
else :
181
+ pil_image = image .as_pil_image ()
182
+ if pil_image .format == "TIFF" :
183
+ pil_image .save (filepath_prefix + ".png" , "PNG" )
181
184
image .extract_to (fileprefix = filepath_prefix )
182
185
183
186
def pdf_extract_page (
Original file line number Diff line number Diff line change @@ -195,3 +195,14 @@ def test_ingest_index_text(self):
195
195
assert (
196
196
"Erklärung verkündeten Rechte und Freiheiten zum Ziel hat." in pages_index
197
197
)
198
+
199
def test_ingest_pdf_ocr_greek(self):
    """Ingest the ``greek.pdf`` fixture and verify the second emitted entity.

    The test expects exactly three emitted entities; the one at index 1
    must use the ``Page`` schema and its ``bodyText`` values, once joined,
    must contain the company name ``IRIDECEA HOLDINGS LIMITED``.

    NOTE(review): the test name suggests this exercises the OCR path for
    image-only pages -- confirm against the fixture.
    """
    path, entity = self.fixture("greek.pdf")
    self.manager.ingest(path, entity)

    results = self.get_emitted()
    assert len(results) == 3

    # Only the entity at index 1 is inspected.
    second = results[1]
    assert second.schema.name == "Page"

    body = "\n ".join(second.get("bodyText"))
    assert "IRIDECEA HOLDINGS LIMITED" in body
You can’t perform that action at this time.
0 commit comments