Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions src/createcompendia/protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from src.babel_utils import Text, glom, read_identifier_file, write_compendium
from src.categories import PROTEIN
from src.metadata.provenance import write_concord_metadata
from src.prefixes import DRUGBANK, ENSEMBL, MESH, NCBITAXON, NCIT, PR, UNIPROTKB
from src.prefixes import DRUGBANK, ENSEMBL, NCBITAXON, NCIT, PR, UNIPROTKB
from src.ubergraph import UberGraph
from src.util import get_logger, get_memory_usage_summary

Expand All @@ -26,8 +26,19 @@ def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename


def write_umls_ids(mrsty, outfile):
umlsmap = {}
umlsmap["A1.4.1.2.1.7"] = PROTEIN
# Compare with src/createcompendia/chemicals.py (see source code at
# https://github.com/NCATSTranslator/Babel/blob/c91654411923b86300cc2f6b5a21b96ea857817f/src/createcompendia/chemicals.py#L54-L76)
#
# We have to make sure we don't include UMLS identifiers both here and in chemicals.py, otherwise they'll
# end up in both compendia.
umlsmap = {
"A1.4.1.2.1.7": PROTEIN, # Amino Acid, Peptide, or Protein -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T116
# The following should not be needed: receptors are generally proteins, and enzymes are definitionally proteins, so
# they should all be included in T116. But since we exclude them in chemicals.py, I think it makes sense to include
# them here.
"A1.4.1.1.3.6": PROTEIN, # Receptor -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T192
"A1.4.1.1.3.3": PROTEIN, # Enzyme -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T126
}
umls.write_umls_ids(mrsty, umlsmap, outfile)


Expand Down Expand Up @@ -158,14 +169,14 @@ def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml):
umls.build_sets(mrconso, idfile, outfile, {"NCI": NCIT}, provenance_metadata_yaml=metadata_yaml)


def build_umls_relationships(mrconso, idfile, outfile, metadata_yaml):
def build_umls_relationships(mrconso, idfile, outfile, other_prefixes, exclude_ids_from, metadata_yaml):
# The corresponding code in chemicals also includes (1) {'RXNORM': RXCUI}, and (2) we also pull in RxNorm to
# provide the inverse concords (i.e. RxNorm -> MESH and DRUGBANK). Doing so will probably fix some RXCUI IDs,
# but assigning RXCUI to proteins seems like a bridge too far for me.
#
# TODO: we should probably add some kind of filtering so we don't include concords that point to chemicals rather
# than proteins, which could result in duplicates (if the same ID is picked up in both chemicals and proteins).
umls.build_sets(mrconso, idfile, outfile, {"MSH": MESH, "DRUGBANK": DRUGBANK}, provenance_metadata_yaml=metadata_yaml)
umls.build_sets(mrconso, idfile, outfile, other_prefixes=other_prefixes, exclude_ids_from=exclude_ids_from, provenance_metadata_yaml=metadata_yaml)


def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename):
Expand Down
100 changes: 85 additions & 15 deletions src/datahandlers/umls.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,30 +204,92 @@ def write_rxnorm_ids(category_map, bad_categories, infile, outfile, prefix=RXCUI
outf.write(f"{prefix}:{current_id}\t{DRUG}\n")


# I've made this more complicated than it ought to be for 2 reasons:
# One is to keep from having to pass through the umls file more than once, but that's a bad reason
# The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another
# way. I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it.
def build_sets(
mrconso, umls_input, umls_output, other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, cui_prefix=UMLS, provenance_metadata_yaml=None
mrconso, umls_input, umls_output,
other_prefixes,
bad_mappings=None,
exclude_ids_from=None,
acceptable_identifiers={},
cui_prefix=UMLS,
provenance_metadata_yaml=None
):
"""Given a list of umls identifiers we want to generate all the concordances
between UMLS and that other entity"""
"""
Given a list of UMLS identifiers, this function generates all concordances between UMLS
and another specified entity.

The process involves parsing UMLS data from MRCONSO, applying filters based on source
types and term types (TTY), and outputting relationships between UMLS identifiers and
corresponding external IDs. Additionally, the function captures provenance information if
specified.

:param mrconso: Path to the MRCONSO file, which contains UMLS relationship data.
:param umls_input: Path to the input file listing the UMLS identifiers of interest.
:param umls_output: Path to the output file where concordance results will be written.
:param other_prefixes: Dictionary mapping source types (e.g., MESH, DRUGBANK) to their
respective prefixes for generating external identifiers.
:param bad_mappings: Dictionary mapping UMLS CUIs to sets of external identifiers that
are considered invalid. Defaults to an empty defaultdict.
:param exclude_ids_from: A list of files to exclude IDs from. Identifiers from this list of
files will be excluded whichever side of the relationship they appear on. Defaults to an empty list.
:param acceptable_identifiers: Dictionary where keys are prefixes and values are
sets of acceptable external identifiers for those prefixes. Defaults to an empty
dictionary.
:param cui_prefix: Prefix for UMLS concept unique identifiers (CUIs). Defaults to "UMLS".
:param provenance_metadata_yaml: Path to a YAML file where provenance metadata will be
stored. If None, provenance metadata is not written. Defaults to None.

:return: None
"""

if bad_mappings is None:
bad_mappings = defaultdict(set)
if exclude_ids_from is None:
exclude_ids_from = []

# I've made this more complicated than it ought to be for 2 reasons:
# One is to keep from having to pass through the umls file more than once, but that's a bad reason
# The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another
# way. I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it.
umls_ids = set()
with open(umls_input) as inf:
for line in inf:
u = line.strip().split("\t")[0].split(":")[1]
umls_ids.add(u)
lookfor = set(other_prefixes.keys())

# Load identifiers we need to exclude.
exclude_ids = set()
for exclude_file in exclude_ids_from:
exclude_ids_for_file = set()
with open(exclude_file) as inf:
line_no = 0
for line in inf:
line_no += 1
row = re.split(r"\s+", line.strip())
if len(row) == 0:
# Identifier only.
exclude_ids_for_file.add(row[0])
elif len(row) == 2:
# Identifier and Biolink Type, which we can ignore.
exclude_ids_for_file.add(row[0])
else:
raise RuntimeError(f"Invalid line in exclude_ids_from file {exclude_file} line {line_no}: '{line}'")

logger.info(f"Loaded {len(exclude_ids_for_file)} identifiers from {exclude_file} to exclude from build_sets()")
exclude_ids.update(exclude_ids_for_file)

logger.info(f"Loaded {len(exclude_ids)} identifiers to exclude from build_sets() from exclude files: {exclude_ids_from}")

# On UMLS / MESH: we have been getting all UMLS / MESH relationships. This has led to some clear mistakes
# and logical impossibilities such as cyclical subclasses. On further review, we can sharpen these relationships
# by choosing the best match UMLS for each MESH. We will make use of the TTY column (column 12) in MRCONSO.
# This column can have a lot of values, but every MESH has one of (and only one of): MH, NM, HT, QAB. These
# will be the ones that we pull, as they correspond to the "main" name or heading of the mesh entry.
# Because drugbank IDs are for active ingredients, we only want the UMLS IDs that map to a TTY of IN (ingredient)
# Otherwise, you get the same DBID mapping to multiple UMLS IDs in a loose way.
umls_ids = set()
with open(umls_input) as inf:
for line in inf:
u = line.strip().split("\t")[0].split(":")[1]
umls_ids.add(u)
lookfor = set(other_prefixes.keys())
acceptable_mesh_tty = set(["MH", "NM", "HT", "QAB"])
acceptable_drugbank_tty = set(["IN", "PIN", "MIN"])

pairs = set()
# test_cui = 'C0026827'
with open(mrconso) as inf, open(umls_output, "w") as concordfile:
Expand Down Expand Up @@ -265,7 +327,15 @@ def build_sets(
if (pref in acceptable_identifiers) and (tup[1] not in acceptable_identifiers[pref]):
continue
if tup not in pairs:
concordfile.write(f"{tup[0]}\teq\t{tup[1]}\n")
from_id = tup[0]
to_id = tup[1]
if from_id in exclude_ids:
logger.debug(f"Skipping {from_id} -> {to_id} because {from_id} is in exclude_ids")
continue
if to_id in exclude_ids:
logger.debug(f"Skipping {from_id} -> {to_id} because {to_id} is in exclude_ids")
continue
concordfile.write(f"{from_id}\teq\t{to_id}\n")
pairs.add(tup)

# Write provenance for this build_sets() call.
Expand All @@ -274,7 +344,7 @@ def build_sets(
provenance_metadata_yaml,
name="umls.build_sets()",
sources=[{"type": "UMLS", "name": "MRCONSO"}],
description=f"umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}",
description=f"umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}, while excluding identifiers from {exclude_ids_from}.",
concord_filename=umls_output,
)

Expand Down
13 changes: 12 additions & 1 deletion src/snakefiles/protein.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import src.assess_compendia as assessments

# import src.filter_compendia as filter
import src.snakefiles.util as util
from src.prefixes import MESH, DRUGBANK

### Gene / Protein

Expand Down Expand Up @@ -96,11 +97,21 @@ rule get_protein_umls_relationships:
input:
mrconso=config["download_directory"] + "/UMLS/MRCONSO.RRF",
infile=config["intermediate_directory"] + "/protein/ids/UMLS",
exclude_ids_from=expand("{intermediate}/chemicals/ids/{id_files}", intermediate=config["intermediate_directory"], id_files=[
'UMLS',
'MESH',
'DRUGBANK',
]),
output:
outfile=config["intermediate_directory"] + "/protein/concords/UMLS",
metadata_yaml=config["intermediate_directory"] + "/protein/concords/metadata-UMLS.yaml",
run:
protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml)
# Since we're incorporating UMLS <-> MSH and DRUGBANK mappings, we should also make sure we exclude
# chemical/ids/UMLS, MESH and DRUGBANK so that we don't add the same identifier to chemicals and to proteins.
protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, other_prefixes={
"MSH": MESH,
"DRUGBANK": DRUGBANK,
}, exclude_ids_from=input.exclude_ids_from, metadata_yaml=output.metadata_yaml)


rule protein_compendia:
Expand Down
Loading