Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/RunningBabel.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,5 @@ cluster. You need to create three resources:

4. Press `Ctrl+A D` to "detach" the screen. You can reconnect to a detached screen by running
`screen -r`. You can also see a list of all running screens by running `screen -ls`.

5. Once the generation completes, all output files should be in the `babel_outputs` directory.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ apybiomart = { git = "https://github.com/gaurav/apybiomart.git", rev = "change-c
dev = [
"ipykernel>=7.1.0",
"ruff>=0.14.14",
"rumdl>=0.1.22",
"rumdl>=0.1.26",
"snakefmt>=0.11.2",
]

Expand Down
55 changes: 30 additions & 25 deletions src/createcompendia/leftover_umls.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from src.metadata.provenance import write_metadata
from src.node import NodeFactory
from src.prefixes import UMLS
from src.util import get_biolink_model_toolkit
from src.util import get_biolink_model_toolkit, get_logger

logger = get_logger(__name__)

def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, biolink_version):
"""
Expand All @@ -32,10 +33,10 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
:return: Nothing.
"""

logging.info(
logger.info(
f"write_leftover_umls({compendia}, {umls_labels_filename}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})"
)

# For now, we have many more UMLS entities in MRCONSO than in the compendia, so
# we'll make an in-memory list of those first. Once that flips, this should be
# switched to the other way around (or perhaps written into an in-memory database
Expand All @@ -55,7 +56,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
biolink_toolkit = get_biolink_model_toolkit(biolink_version)

for compendium in compendia:
logging.info(f"Starting compendium: {compendium}")
logger.info(f"Starting compendium: {compendium}")
umls_ids = set()

with open(compendium) as f:
Expand All @@ -65,10 +66,10 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
if id["i"].startswith(UMLS + ":"):
umls_ids.add(id["i"])

logging.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
logger.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
umls_ids_in_other_compendia.update(umls_ids)

logging.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
logger.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
reportf.write(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.\n")
# print(umls_ids_in_other_compendia)

Expand All @@ -95,7 +96,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
types_by_tui[tui] = set()
types_by_tui[tui].add(sty)

logging.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
logger.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
reportf.write(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n")

with open("babel_outputs/reports/umls-types.tsv", "w") as outf:
Expand All @@ -104,8 +105,8 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
outf.write(f"{tui}\t{sty}\n")

# Create a compendium that consists solely of all MRCONSO entries that haven't been referenced.
count_no_umls_type = 0
count_multiple_umls_type = 0
curies_no_umls_type = set()
curies_multiple_umls_type = set()
with open(mrconso) as inf:
for line in inf:
if not umls.check_mrconso_line(line):
Expand All @@ -115,10 +116,10 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
cui = x[0]
umls_id = f"{UMLS}:{cui}"
if umls_id in umls_ids_in_other_compendia:
logging.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
continue
if umls_id in umls_ids_in_this_compendium:
logging.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
continue

# The STR value should be the label.
Expand All @@ -128,7 +129,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
def umls_type_to_biolink_type(umls_tui):
biolink_type = biolink_toolkit.get_element_by_mapping(f"STY:{umls_tui}", most_specific=True, formatted=True, mixin=True)
if biolink_type is None:
logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
logger.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
return biolink_type

umls_type_results = types_by_id.get(umls_id, {"biolink:NamedThing": {"Named thing"}})
Expand Down Expand Up @@ -158,14 +159,18 @@ def umls_type_to_biolink_type(umls_tui):
biolink_types = [FOOD]

if len(biolink_types) == 0:
logging.debug(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
count_no_umls_type += 1
# We skip this CURIE, but we don't want to print multiple warnings for the same CURIE.
if umls_id not in curies_no_umls_type:
curies_no_umls_type.add(umls_id)
logger.warning(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
continue
if len(biolink_types) > 1:
logging.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
count_multiple_umls_type += 1
# We skip this CURIE, but we don't want to print multiple warnings for the same CURIE.
if umls_id not in curies_multiple_umls_type:
curies_multiple_umls_type.add(umls_id)
logger.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
continue
biolink_type = list(biolink_types)[0]
umls_type_by_id[umls_id] = biolink_type
Expand All @@ -186,13 +191,13 @@ def umls_type_to_biolink_type(umls_tui):
}
compendiumf.write(json.dumps(cluster) + "\n")
umls_ids_in_this_compendium.add(umls_id)
logging.debug(f"Writing {cluster} to {compendiumf}")
logger.debug(f"Writing {cluster} to {compendiumf}")

logging.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
logger.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
reportf.write(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.\n")

logging.info(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.\n")
logger.info(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.\n")

# Collected synonyms for all IDs in this compendium.
synonyms_by_id = dict()
Expand All @@ -208,7 +213,7 @@ def umls_type_to_biolink_type(umls_tui):
# We don't record the synonym relation (https://github.com/TranslatorSRI/Babel/pull/113#issuecomment-1516450124),
# so we don't need to write that out now.

logging.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
logger.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
Expand Down Expand Up @@ -249,7 +254,7 @@ def umls_type_to_biolink_type(umls_tui):
count_synonym_objs += 1
count_synonyms += len(synonyms_list)

logging.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
logger.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
reportf.write(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.\n")

write_metadata(
Expand All @@ -275,4 +280,4 @@ def umls_type_to_biolink_type(umls_tui):

logging.info(f"Wrote out metadata file {metadata_yaml}.")

logging.info("Complete")
logging.info("Complete")
20 changes: 10 additions & 10 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.