Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 30 additions & 27 deletions src/createcompendia/leftover_umls.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
import logging
from datetime import datetime
from pathlib import Path

Expand All @@ -10,8 +9,9 @@
from src.metadata.provenance import write_metadata
from src.node import NodeFactory
from src.prefixes import UMLS
from src.util import get_biolink_model_toolkit
from src.util import get_biolink_model_toolkit, get_logger

logger = get_logger(__name__)

def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso, mrsty, synonyms, umls_compendium, umls_synonyms, report, biolink_version):
"""
Expand All @@ -32,10 +32,9 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
:return: Nothing.
"""

logging.info(
logger.info(
f"write_leftover_umls({compendia}, {umls_labels_filename}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})"
)

# For now, we have many more UMLS entities in MRCONSO than in the compendia, so
# we'll make an in-memory list of those first. Once that flips, this should be
# switched to the other way around (or perhaps written into an in-memory database
Expand All @@ -55,7 +54,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
biolink_toolkit = get_biolink_model_toolkit(biolink_version)

for compendium in compendia:
logging.info(f"Starting compendium: {compendium}")
logger.info(f"Starting compendium: {compendium}")
umls_ids = set()

with open(compendium) as f:
Expand All @@ -65,10 +64,10 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
if id["i"].startswith(UMLS + ":"):
umls_ids.add(id["i"])

logging.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
logger.info(f"Completed compendium {compendium} with {len(umls_ids)} UMLS IDs")
umls_ids_in_other_compendia.update(umls_ids)

logging.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
logger.info(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.")
reportf.write(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.\n")
# print(umls_ids_in_other_compendia)

Expand All @@ -95,7 +94,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
types_by_tui[tui] = set()
types_by_tui[tui].add(sty)

logging.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
logger.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
reportf.write(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n")

with open("babel_outputs/reports/umls-types.tsv", "w") as outf:
Expand All @@ -104,8 +103,8 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
outf.write(f"{tui}\t{sty}\n")

# Create a compendium that consists solely of all MRCONSO entries that haven't been referenced.
count_no_umls_type = 0
count_multiple_umls_type = 0
curies_no_umls_type = set()
curies_multiple_umls_type = set()
with open(mrconso) as inf:
for line in inf:
if not umls.check_mrconso_line(line):
Expand All @@ -115,10 +114,10 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
cui = x[0]
umls_id = f"{UMLS}:{cui}"
if umls_id in umls_ids_in_other_compendia:
logging.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} is in another compendium, skipping.")
continue
if umls_id in umls_ids_in_this_compendium:
logging.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
logger.debug(f"UMLS ID {umls_id} has already been included in this compendium, skipping.")
continue

# The STR value should be the label.
Expand All @@ -128,7 +127,7 @@ def write_leftover_umls(metadata_yaml, compendia, umls_labels_filename, mrconso,
def umls_type_to_biolink_type(umls_tui):
biolink_type = biolink_toolkit.get_element_by_mapping(f"STY:{umls_tui}", most_specific=True, formatted=True, mixin=True)
if biolink_type is None:
logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
logger.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
return biolink_type

umls_type_results = types_by_id.get(umls_id, {"biolink:NamedThing": {"Named thing"}})
Expand Down Expand Up @@ -158,14 +157,18 @@ def umls_type_to_biolink_type(umls_tui):
biolink_types = [FOOD]

if len(biolink_types) == 0:
logging.debug(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
count_no_umls_type += 1
# We skip this CURIE, but we don't want to print multiple warnings for the same CURIE.
if umls_id not in curies_no_umls_type:
curies_no_umls_type.add(umls_id)
logger.warning(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
continue
if len(biolink_types) > 1:
logging.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
count_multiple_umls_type += 1
# We skip this CURIE, but we don't want to print multiple log messages for the same CURIE.
if umls_id not in curies_multiple_umls_type:
curies_multiple_umls_type.add(umls_id)
logger.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
continue
biolink_type = list(biolink_types)[0]
umls_type_by_id[umls_id] = biolink_type
Expand All @@ -186,13 +189,13 @@ def umls_type_to_biolink_type(umls_tui):
}
compendiumf.write(json.dumps(cluster) + "\n")
umls_ids_in_this_compendium.add(umls_id)
logging.debug(f"Writing {cluster} to {compendiumf}")
logger.debug(f"Writing {cluster} to {compendiumf}")

logging.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
logger.info(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.")
reportf.write(f"Wrote out {len(umls_ids_in_this_compendium)} UMLS IDs into the leftover UMLS compendium.\n")

logging.info(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {count_no_umls_type} UMLS IDs without UMLS types and {count_multiple_umls_type} UMLS IDs with multiple UMLS types.\n")
logger.info(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.")
reportf.write(f"Found {len(curies_no_umls_type)} UMLS IDs without UMLS types and {len(curies_multiple_umls_type)} UMLS IDs with multiple UMLS types.\n")

# Collected synonyms for all IDs in this compendium.
synonyms_by_id = dict()
Expand All @@ -208,7 +211,7 @@ def umls_type_to_biolink_type(umls_tui):
# We don't record the synonym relation (https://github.com/TranslatorSRI/Babel/pull/113#issuecomment-1516450124),
# so we don't need to write that out now.

logging.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
logger.info(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.")
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
Expand Down Expand Up @@ -249,7 +252,7 @@ def umls_type_to_biolink_type(umls_tui):
count_synonym_objs += 1
count_synonyms += len(synonyms_list)

logging.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
logger.info(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.")
reportf.write(f"Wrote out {count_synonym_objs} synonym objects into the leftover UMLS synonyms file.\n")

write_metadata(
Expand All @@ -273,6 +276,6 @@ def umls_type_to_biolink_type(umls_tui):
}
)

logging.info(f"Wrote out metadata file {metadata_yaml}.")
logger.info(f"Wrote out metadata file {metadata_yaml}.")

logging.info("Complete")
logger.info("Complete")
2 changes: 1 addition & 1 deletion src/snakefiles/anatomy.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ rule anatomy_umls_ids:


rule get_anatomy_obo_relationships:
retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry.
output:
config["intermediate_directory"] + "/anatomy/concords/UBERON",
config["intermediate_directory"] + "/anatomy/concords/CL",
Expand All @@ -74,6 +73,7 @@ rule get_anatomy_obo_relationships:
go_metadata=config["intermediate_directory"] + "/anatomy/concords/metadata-GO.yaml",
benchmark:
config["output_directory"] + "/benchmarks/get_anatomy_obo_relationships.tsv"
retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry.
run:
anatomy.build_anatomy_obo_relationships(
config["intermediate_directory"] + "/anatomy/concords",
Expand Down
16 changes: 8 additions & 8 deletions src/snakefiles/chemical.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,11 @@ rule chemical_drugcentral_ids:


rule chemical_chebi_ids:
retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry.
output:
outfile=config["intermediate_directory"] + "/chemicals/ids/CHEBI",
benchmark:
config["output_directory"] + "/benchmarks/chemical_chebi_ids.tsv"
retries: 10 # Ubergraph sometimes fails mid-download, and then we need to retry.
run:
chemicals.write_chebi_ids(output.outfile)

Expand Down Expand Up @@ -282,8 +282,6 @@ rule get_chebi_concord:


rule chemical_unichem_concordia:
resources:
mem="128G",
input:
concords=expand(
"{dd}/chemicals/concords/UNICHEM/UNICHEM_{ucc}",
Expand All @@ -294,13 +292,13 @@ rule chemical_unichem_concordia:
unichemgroup=config["intermediate_directory"] + "/chemicals/partials/UNICHEM",
benchmark:
config["output_directory"] + "/benchmarks/chemical_unichem_concordia.tsv"
resources:
mem="128G",
run:
chemicals.combine_unichem(input.concords, output.unichemgroup)


rule untyped_chemical_compendia:
resources:
mem="512G",
input:
labels=expand("{dd}/{ap}/labels", dd=config["download_directory"], ap=config["chemical_labels"]),
synonyms=expand("{dd}/{ap}/synonyms", dd=config["download_directory"], ap=config["chemical_synonyms"]),
Expand All @@ -320,6 +318,8 @@ rule untyped_chemical_compendia:
untyped_meta=config["intermediate_directory"] + "/chemicals/partials/metadata-untyped_compendium.yaml",
benchmark:
config["output_directory"] + "/benchmarks/untyped_chemical_compendia.tsv"
resources:
mem="512G",
run:
chemicals.build_untyped_compendia(
input.concords,
Expand All @@ -333,9 +333,6 @@ rule untyped_chemical_compendia:


rule chemical_compendia:
resources:
mem="512G",
runtime="6h",
input:
typesfile=config["intermediate_directory"] + "/chemicals/partials/types",
untyped_file=config["intermediate_directory"] + "/chemicals/partials/untyped_compendium",
Expand All @@ -348,6 +345,9 @@ rule chemical_compendia:
expand("{od}/metadata/{ap}.yaml", od=config["output_directory"], ap=config["chemical_outputs"]),
benchmark:
config["output_directory"] + "/benchmarks/chemical_compendia.tsv"
resources:
mem="512G",
runtime="6h",
run:
chemicals.build_compendia(
input.typesfile,
Expand Down
Loading
Loading