Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 31 additions & 24 deletions src/createcompendia/leftover_umls.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
import json
import logging
from datetime import datetime
from pathlib import Path

import jsonlines

from src.node import NodeFactory
from src.util import get_biolink_model_toolkit, get_logger
from src.categories import ACTIVITY, AGENT, DEVICE, DRUG, FOOD, PHYSICAL_ENTITY, PROCEDURE, PUBLICATION, SMALL_MOLECULE
from src.datahandlers import umls
from src.metadata.provenance import write_metadata
from src.node import NodeFactory

Check failure on line 13 in src/createcompendia/leftover_umls.py

View workflow job for this annotation

GitHub Actions / Check Python formatting with ruff

ruff (F811)

src/createcompendia/leftover_umls.py:13:22: F811 Redefinition of unused `NodeFactory` from line 8: `NodeFactory` redefined here src/createcompendia/leftover_umls.py:8:22: previous definition of `NodeFactory` here help: Remove definition: `NodeFactory`
from src.prefixes import UMLS
from src.util import get_biolink_model_toolkit

Check failure on line 15 in src/createcompendia/leftover_umls.py

View workflow job for this annotation

GitHub Actions / Check Python formatting with ruff

ruff (F811)

src/createcompendia/leftover_umls.py:15:22: F811 Redefinition of unused `get_biolink_model_toolkit` from line 9: `get_biolink_model_toolkit` redefined here src/createcompendia/leftover_umls.py:9:22: previous definition of `get_biolink_model_toolkit` here help: Remove definition: `get_biolink_model_toolkit`

Check failure on line 15 in src/createcompendia/leftover_umls.py

View workflow job for this annotation

GitHub Actions / Check Python formatting with ruff

ruff (I001)

src/createcompendia/leftover_umls.py:1:1: I001 Import block is un-sorted or un-formatted help: Organize imports

logger = get_logger(__name__)

def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, biolink_version):
"""
Expand All @@ -32,10 +35,10 @@
:return: Nothing.
"""

logging.info(
logger.info(
f"write_leftover_umls({compendia}, {umls_labels_filename}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})"
)

# For now, we have many more UMLS entities in MRCONSO than in the compendia, so
# we'll make an in-memory list of those first. Once that flips, this should be
# switched to the other way around (or perhaps written into an in-memory database
Expand All @@ -55,7 +58,7 @@
biolink_toolkit = get_biolink_model_toolkit(biolink_version)

for compendium in compendia:
logging.info(f"Starting compendium: {compendium}")
logger.info(f"Starting compendium: {compendium}")
umls_ids = set()

with open(compendium) as f:
Expand All @@ -65,10 +68,10 @@
if id["i"].startswith(UMLS + ":"):
umls_ids.add(id["i"])

logging.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
logger.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
umls_ids_in_other_compendia.update(umls_ids)

logging.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
logger.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
reportf.write(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.\n")
# print(umls_ids_in_other_compendia)

Expand All @@ -95,7 +98,7 @@
types_by_tui[tui] = set()
types_by_tui[tui].add(sty)

logging.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
logger.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
reportf.write(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n")

with open("babel_outputs/reports/umls-types.tsv", "w") as outf:
Expand All @@ -104,8 +107,8 @@
outf.write(f"{tui}\t{sty}\n")

# Create a compendium that consists solely of all MRCONSO entries that haven't been referenced.
count_no_umls_type = 0
count_multiple_umls_type = 0
curies_no_umls_type = set()
curies_multiple_umls_type = set()
with open(mrconso) as inf:
for line in inf:
if not umls.check_mrconso_line(line):
Expand All @@ -115,10 +118,10 @@
cui = x[0]
umls_id = f"{UMLS}:{cui}"
if umls_id in umls_ids_in_other_compendia:
logging.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
continue
if umls_id in umls_ids_in_this_compendium:
logging.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
continue

# The STR value should be the label.
Expand All @@ -128,7 +131,7 @@
def umls_type_to_biolink_type(umls_tui):
biolink_type = biolink_toolkit.get_element_by_mapping(f"STY:{umls_tui}", most_specific=True, formatted=True, mixin=True)
if biolink_type is None:
logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
logger.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
return biolink_type

umls_type_results = types_by_id.get(umls_id, {"biolink:NamedThing": {"Named thing"}})
Expand Down Expand Up @@ -158,14 +161,18 @@
biolink_types = [FOOD]

if len(biolink_types) == 0:
logging.debug(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
count_no_umls_type += 1
# We skip this CURIE, but we don't want to print multiple warnings for the same CURIE.
if umls_id not in curies_no_umls_type:
curies_no_umls_type.add(umls_id)
logger.warning(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
continue
if len(biolink_types) > 1:
logging.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
count_multiple_umls_type += 1
# We skip this CURIE, but we don't want to print multiple warnings for the same CURIE.
if umls_id not in curies_multiple_umls_type:
curies_multiple_umls_type.add(umls_id)
logger.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
continue
biolink_type = list(biolink_types)[0]
umls_type_by_id[umls_id] = biolink_type
Expand All @@ -186,13 +193,13 @@
}
compendiumf.write(json.dumps(cluster) + "\n")
umls_ids_in_this_compendium.add(umls_id)
logging.debug(f"Writing {cluster} to {compendiumf}")
logger.debug(f"Writing {cluster} to {compendiumf}")

logging.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
logger.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
reportf.write(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.\n")

logging.info(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.\n")
logger.info(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.\n")

# Collected synonyms for all IDs in this compendium.
synonyms_by_id = dict()
Expand All @@ -208,7 +215,7 @@
# We don't record the synonym relation (https://github.com/TranslatorSRI/Babel/pull/113#issuecomment-1516450124),
# so we don't need to write that out now.

logging.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
logger.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
Expand Down Expand Up @@ -249,7 +256,7 @@
count_synonym_objs += 1
count_synonyms += len(synonyms_list)

logging.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
logger.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
reportf.write(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.\n")

write_metadata(
Expand All @@ -275,4 +282,4 @@

logging.info(f"Wrote out metadata file {metadata_yaml}.")

logging.info("Complete")
logging.info("Complete")
Loading