@@ -201,6 +201,10 @@ def extract_paragraphs(xml_string):
201201 except ValueError :
202202 continue
203203 etree .cleanup_namespaces (tree )
204+
205+ if _is_pmc_ocr_xml (tree ):
206+ return _extract_from_pmc_ocr (tree )
207+
204208 # Strip out latex
205209 _remove_elements_by_tag (tree , 'tex-math' )
206210 # Strip out all content in unwanted elements except the captions
@@ -256,6 +260,38 @@ def filter_pmids(pmid_list, source_type):
256260 pmids_fulltext_dict .get (source_type )))
257261
258262
263+
def _is_pmc_ocr_xml(tree):
    """Return True if this JATS XML contains PMC OCR text.

    OCR text is recognized by a ``preformat`` element, anywhere below the
    root, whose ``preformat-type`` attribute equals ``pmc-ocr-text``.

    Parameters
    ----------
    tree : element or element tree
        Parsed XML document. It only needs to provide ``iterfind`` and
        yield elements that provide ``get``.

    Returns
    -------
    bool
        True if at least one matching ``preformat`` element is found,
        False otherwise.
    """
    # iterfind is lazy, so any() stops at the first OCR block instead of
    # collecting every preformat element in the document first.
    return any(
        pre.get("preformat-type") == "pmc-ocr-text"
        for pre in tree.iterfind(".//preformat")
    )
271+
272+
def _extract_from_pmc_ocr(tree):
    """Extract PMC OCR text as a list of paragraphs.

    Only ``preformat`` elements whose ``preformat-type`` attribute equals
    ``pmc-ocr-text`` are read. Their text (including the text of any nested
    elements) is split into paragraphs at blank lines, and the lines of
    each paragraph are stripped and joined with single spaces.

    Parameters
    ----------
    tree : element or element tree
        Parsed XML document. It only needs to provide ``findall`` and yield
        elements that provide ``get`` and ``itertext``.

    Returns
    -------
    list of str
        Non-empty paragraphs in document order. Empty if no OCR text is
        found.
    """
    paragraphs = []

    for pre in tree.findall(".//preformat"):
        if pre.get("preformat-type") != "pmc-ocr-text":
            continue

        text = "".join(pre.itertext())
        # Drop byte-order marks so they do not end up inside a paragraph
        text = text.replace("\ufeff", "")
        # Normalize CRLF and bare CR line endings to LF before splitting
        text = text.replace("\r\n", "\n").replace("\r", "\n")

        # One or more blank (or whitespace-only) lines separate paragraphs
        for para in re.split(r"\n\s*\n+", text):
            stripped = (line.strip() for line in para.splitlines())
            lines = [line for line in stripped if line]
            if lines:
                paragraphs.append(" ".join(lines))

    return paragraphs
293+
294+
259295def _select_from_top_level (tree , tag ):
260296 """Select direct children of the article element of a tree by tag.
261297
0 commit comments