Skip to content

Commit d25693c

Browse files
committed
Add check for ocr xml file and then extract paragraphs from xml
1 parent 8a3739f commit d25693c

1 file changed

Lines changed: 36 additions & 0 deletions

File tree

indra/literature/pmc_client.py

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -201,6 +201,10 @@ def extract_paragraphs(xml_string):
201201
except ValueError:
202202
continue
203203
etree.cleanup_namespaces(tree)
204+
205+
if _is_pmc_ocr_xml(tree):
206+
return _extract_from_pmc_ocr(tree)
207+
204208
# Strip out latex
205209
_remove_elements_by_tag(tree, 'tex-math')
206210
# Strip out all content in unwanted elements except the captions
@@ -256,6 +260,38 @@ def filter_pmids(pmid_list, source_type):
256260
pmids_fulltext_dict.get(source_type)))
257261

258262

263+
264+
def _is_pmc_ocr_xml(tree):
265+
"""Return True if this JATS XML contains PMC OCR text."""
266+
preformats = tree.findall(".//preformat")
267+
for pre in preformats:
268+
if pre.get("preformat-type") == "pmc-ocr-text":
269+
return True
270+
return False
271+
272+
273+
def _extract_from_pmc_ocr(tree):
274+
"""Extract PMC OCR text as a list of paragraphs."""
275+
paragraphs = []
276+
277+
for pre in tree.findall(".//preformat"):
278+
if pre.get("preformat-type") != "pmc-ocr-text":
279+
continue
280+
281+
text = "".join(pre.itertext())
282+
text = text.replace("\ufeff", "")
283+
text = text.replace("\r\n", "\n").replace("\r", "\n")
284+
285+
raw_paragraphs = re.split(r"\n\s*\n+", text)
286+
287+
for para in raw_paragraphs:
288+
lines = [line.strip() for line in para.splitlines() if line.strip()]
289+
if lines:
290+
paragraphs.append(" ".join(lines))
291+
292+
return paragraphs
293+
294+
259295
def _select_from_top_level(tree, tag):
260296
"""Select direct children of the article element of a tree by tag.
261297

0 commit comments

Comments
 (0)