Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions indra/literature/pmc_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,10 @@ def extract_paragraphs(xml_string):
except ValueError:
continue
etree.cleanup_namespaces(tree)

if _is_pmc_ocr_xml(tree):
return _extract_from_pmc_ocr(tree)

# Strip out latex
_remove_elements_by_tag(tree, 'tex-math')
# Strip out all content in unwanted elements except the captions
Expand Down Expand Up @@ -256,6 +260,39 @@ def filter_pmids(pmid_list, source_type):
pmids_fulltext_dict.get(source_type)))



def _is_pmc_ocr_xml(tree):
"""Return True if this JATS XML contains PMC OCR text."""
preformats = tree.findall(".//preformat")
for pre in preformats:
if pre.get("preformat-type") == "pmc-ocr-text":
return True
return False


def _extract_from_pmc_ocr(tree):
"""Extract PMC OCR text as a list of paragraphs."""
paragraphs = []

for pre in tree.findall(".//preformat"):
if pre.get("preformat-type") != "pmc-ocr-text":
continue

text = "".join(pre.itertext())
# Remove xml BOM and unify newline formats
text = text.replace("\ufeff", "")
text = text.replace("\r\n", "\n").replace("\r", "\n")

raw_paragraphs = re.split(r"\n\s*\n+", text)

for para in raw_paragraphs:
lines = [line.strip() for line in para.splitlines() if line.strip()]
if lines:
paragraphs.append(" ".join(lines))

return paragraphs


def _select_from_top_level(tree, tag):
"""Select direct children of the article element of a tree by tag.

Expand Down
Loading