NCATSTranslator · gaurav · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
@@ -6,7 +6,7 @@
 from src.babel_utils import Text, glom, read_identifier_file, write_compendium
 from src.categories import PROTEIN
 from src.metadata.provenance import write_concord_metadata
-from src.prefixes import DRUGBANK, ENSEMBL, MESH, NCBITAXON, NCIT, PR, UNIPROTKB
+from src.prefixes import DRUGBANK, ENSEMBL, NCBITAXON, NCIT, PR, UNIPROTKB
 from src.ubergraph import UberGraph
 from src.util import get_logger, get_memory_usage_summary
 
@@ -26,8 +26,19 @@ def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename
 
 
 def write_umls_ids(mrsty, outfile):
-    umlsmap = {}
-    umlsmap["A1.4.1.2.1.7"] = PROTEIN
+    # Compare with src/createcompendia/chemicals.py (see source code at
+    # https://github.com/NCATSTranslator/Babel/blob/c91654411923b86300cc2f6b5a21b96ea857817f/src/createcompendia/chemicals.py#L54-L76)
+    #
+    # We have to make sure we don't include UMLS identifiers both here and in chemicals.py, otherwise they'll
+    # end up in both compendia.
+    umlsmap = {
+        "A1.4.1.2.1.7": PROTEIN,    # Amino Acid, Peptide, or Protein -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T116
+        # The following should not be needed: receptors are generally proteins, and enzymes are definitionally proteins, so
+        # they should all be included in T116. But since we exclude them in chemicals.py, I think it makes sense to include
+        # them here.
+        "A1.4.1.1.3.6": PROTEIN,    # Receptor -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T192
+        "A1.4.1.1.3.3": PROTEIN,    # Enzyme -- https://uts.nlm.nih.gov/uts/umls/semantic-network/T126
+    }
     umls.write_umls_ids(mrsty, umlsmap, outfile)
 
 
@@ -158,14 +169,14 @@ def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml):
     umls.build_sets(mrconso, idfile, outfile, {"NCI": NCIT}, provenance_metadata_yaml=metadata_yaml)
 
 
-def build_umls_relationships(mrconso, idfile, outfile, metadata_yaml):
+def build_umls_relationships(mrconso, idfile, outfile, other_prefixes, exclude_ids_from, metadata_yaml):
     # The corresponding code in chemicals also includes (1) {'RXNORM': RXCUI}, and (2) we also pull in RxNorm to
     # provide the inverse concords (i.e. RxNorm -> MESH and DRUGBANK). Doing so will probably fix some RXCUI IDs,
     # but assigning RXCUI to proteins seems like a bridge too far for me.
     #
     # TODO: we should probably add some kind of filtering so we don't include concords that point to chemicals rather
     # than proteins, which could result in duplicates (if the same ID is picked up in both chemicals and proteins).
-    umls.build_sets(mrconso, idfile, outfile, {"MSH": MESH, "DRUGBANK": DRUGBANK}, provenance_metadata_yaml=metadata_yaml)
+    umls.build_sets(mrconso, idfile, outfile, other_prefixes=other_prefixes, exclude_ids_from=exclude_ids_from, provenance_metadata_yaml=metadata_yaml)
 
 
 def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename):

diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py
@@ -204,30 +204,92 @@ def write_rxnorm_ids(category_map, bad_categories, infile, outfile, prefix=RXCUI
                 outf.write(f"{prefix}:{current_id}\t{DRUG}\n")
 
 
-# I've made this more complicated than it ought to be for 2 reasons:
-# One is to keep from having to pass through the umls file more than once, but that's a bad reason
-# The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another
-#  way.  I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it.
 def build_sets(
-    mrconso, umls_input, umls_output, other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, cui_prefix=UMLS, provenance_metadata_yaml=None
+        mrconso, umls_input, umls_output,
+        other_prefixes,
+        bad_mappings=None,
+        exclude_ids_from=None,
+        acceptable_identifiers={},
+        cui_prefix=UMLS,
+        provenance_metadata_yaml=None
 ):
-    """Given a list of umls identifiers we want to generate all the concordances
-    between UMLS and that other entity"""
+    """
+    Given a list of UMLS identifiers, this function generates all concordances between UMLS
+    and another specified entity.
+
+    The process involves parsing UMLS data from MRCONSO, applying filters based on source
+    types and term types (TTY), and outputting relationships between UMLS identifiers and
+    corresponding external IDs. Additionally, the function captures provenance information if
+    specified.
+
+    :param mrconso: Path to the MRCONSO file, which contains UMLS relationship data.
+    :param umls_input: Path to the input file listing the UMLS identifiers of interest.
+    :param umls_output: Path to the output file where concordance results will be written.
+    :param other_prefixes: Dictionary mapping source types (e.g., MESH, DRUGBANK) to their
+        respective prefixes for generating external identifiers.
+    :param bad_mappings: Dictionary mapping UMLS CUIs to sets of external identifiers that
+        are considered invalid. Defaults to an empty defaultdict.
+    :param exclude_ids_from: A list of files to exclude IDs from. Identifiers from this list of
+        files will be excluded whichever side of the relationship they appear on. Defaults to an empty list.
+    :param acceptable_identifiers: Dictionary where keys are prefixes and values are
+        sets of acceptable external identifiers for those prefixes. Defaults to an empty
+        dictionary.
+    :param cui_prefix: Prefix for UMLS concept unique identifiers (CUIs). Defaults to "UMLS".
+    :param provenance_metadata_yaml: Path to a YAML file where provenance metadata will be
+        stored. If None, provenance metadata is not written. Defaults to None.
+
+    :return: None
+    """
+
+    if bad_mappings is None:
+        bad_mappings = defaultdict(set)
+    if exclude_ids_from is None:
+        exclude_ids_from = []
+
+    # I've made this more complicated than it ought to be for 2 reasons:
+    # One is to keep from having to pass through the umls file more than once, but that's a bad reason
+    # The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another
+    #  way.  I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it.
+    umls_ids = set()
+    with open(umls_input) as inf:
+        for line in inf:
+            u = line.strip().split("\t")[0].split(":")[1]
+            umls_ids.add(u)
+    lookfor = set(other_prefixes.keys())
+
+    # Load identifiers we need to exclude.
+    exclude_ids = set()
+    for exclude_file in exclude_ids_from:
+        exclude_ids_for_file = set()
+        with open(exclude_file) as inf:
+            line_no = 0
+            for line in inf:
+                line_no += 1
+                row = re.split(r"\s+", line.strip())
+                if len(row) == 0:
+                    # Identifier only.
+                    exclude_ids_for_file.add(row[0])
+                elif len(row) == 2:
+                    # Identifier and Biolink Type, which we can ignore.
+                    exclude_ids_for_file.add(row[0])
+                else:
+                    raise RuntimeError(f"Invalid line in exclude_ids_from file {exclude_file} line {line_no}: '{line}'")
+
+        logger.info(f"Loaded {len(exclude_ids_for_file)} identifiers from {exclude_file} to exclude from build_sets()")
+        exclude_ids.update(exclude_ids_for_file)
+
+    logger.info(f"Loaded {len(exclude_ids)} identifiers to exclude from build_sets() from exclude files: {exclude_ids_from}")
+
     # On UMLS / MESH: we have been getting all UMLS / MESH relationships.   This has led to some clear mistakes
     # and logical impossibilities such as cyclical subclasses.   On further review, we can sharpen these relationships
     # by choosing the best match UMLS for each MESH.  We will make use of the TTY column (column 12) in MRCONSO.
     # This column can have a lot of values, but every MESH has one of (and only one of): MH, NM, HT, QAB.  These
     # will be the ones that we pull, as they correspond to the "main" name or heading of the mesh entry.
     # Because drugbank IDs are for active ingredients, we only want the UMLS IDs that map to a TTY of IN (ingredient)
     # Otherwise, you get the same DBID mapping to multiple UMLS IDs in a loose way.
-    umls_ids = set()
-    with open(umls_input) as inf:
-        for line in inf:
-            u = line.strip().split("\t")[0].split(":")[1]
-            umls_ids.add(u)
-    lookfor = set(other_prefixes.keys())
     acceptable_mesh_tty = set(["MH", "NM", "HT", "QAB"])
     acceptable_drugbank_tty = set(["IN", "PIN", "MIN"])
+
     pairs = set()
     # test_cui = 'C0026827'
     with open(mrconso) as inf, open(umls_output, "w") as concordfile:
@@ -265,7 +327,15 @@ def build_sets(
             if (pref in acceptable_identifiers) and (tup[1] not in acceptable_identifiers[pref]):
                 continue
             if tup not in pairs:
-                concordfile.write(f"{tup[0]}\teq\t{tup[1]}\n")
+                from_id = tup[0]
+                to_id = tup[1]
+                if from_id in exclude_ids:
+                    logger.debug(f"Skipping {from_id} -> {to_id} because {from_id} is in exclude_ids")
+                    continue
+                if to_id in exclude_ids:
+                    logger.debug(f"Skipping {from_id} -> {to_id} because {to_id} is in exclude_ids")
+                    continue
+                concordfile.write(f"{from_id}\teq\t{to_id}\n")
                 pairs.add(tup)
 
     # Write provenance for this build_sets() call.
@@ -274,7 +344,7 @@ def build_sets(
             provenance_metadata_yaml,
             name="umls.build_sets()",
             sources=[{"type": "UMLS", "name": "MRCONSO"}],
-            description=f"umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}",
+            description=f"umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}, while excluding identifiers from {exclude_ids_from}.",
             concord_filename=umls_output,
         )
 

diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile
@@ -3,6 +3,7 @@ import src.assess_compendia as assessments
 
 # import src.filter_compendia as filter
 import src.snakefiles.util as util
+from src.prefixes import MESH, DRUGBANK
 
 ### Gene / Protein
 
@@ -96,11 +97,21 @@ rule get_protein_umls_relationships:
     input:
         mrconso=config["download_directory"] + "/UMLS/MRCONSO.RRF",
         infile=config["intermediate_directory"] + "/protein/ids/UMLS",
+        exclude_ids_from=expand("{intermediate}/chemicals/ids/{id_files}", intermediate=config["intermediate_directory"], id_files=[
+            'UMLS',
+            'MESH',
+            'DRUGBANK',
+        ]),
     output:
         outfile=config["intermediate_directory"] + "/protein/concords/UMLS",
         metadata_yaml=config["intermediate_directory"] + "/protein/concords/metadata-UMLS.yaml",
     run:
-        protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml)
+        # Since we're incorporating UMLS <-> MSH and DRUGBANK mappings, we should also make sure we exclude
+        # chemical/ids/UMLS, MESH and DRUGBANK so that we don't add the same identifier to chemicals and to proteins.
+        protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, other_prefixes={
+            "MSH": MESH,
+            "DRUGBANK": DRUGBANK,
+        }, exclude_ids_from=input.exclude_ids_from, metadata_yaml=output.metadata_yaml)
 
 
 rule protein_compendia: