NCATSTranslator · gaurav · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -136,3 +136,28 @@ Gene+Protein and Drug+Chemical each have dedicated conflation modules (`geneprot
 - `babel_downloads/` — cached source data
 - `babel_outputs/intermediate/` — intermediate build artifacts
 - `babel_outputs/` — final compendia, synonyms, reports, exports
+
+## Running Babel
+
+You may run `uv run snakemake -c all --rerun-incomplete [rulename]` to run a particular rule.
+When running a download step, it will be easier to run the job in Snakemake, but when running
+a rule that produces intermediate files, it might be easier to download the intermediate files from
+<https://stars.renci.org/var/babel/2025dec11/> (which is the `babel_outputs` folder from a run on a
+high performance cluster) so you don't need to download all the source files and
+rerun the entire pipeline. You can look at the resource requirements of a rule to decide which
+option would be best.
+
+## Debugging
+
+When looking things up in the source databases, prefer to invoke the existing download code in
+this repository. If you suspect that code is incorrect, still run it, and then compare its
+output with a direct API lookup to see how they differ.
+
+If it is easy to add a test that will either exercise this bug or check some other relevant
+functionality, please suggest that when planning the bug fix.
+
+It is very important that two different compendia don't contain the same identifier and that we
+don't miss out on any valid identifiers without very good reason. If you're changing how
+identifiers are filtered in one compendium, think about whether that will affect which identifiers
+should be included in the other compendia to prevent any identifiers from being missed or being
+added twice.
diff --git a/config.yaml b/config.yaml
@@ -169,6 +169,7 @@ protein_synonyms:
 
 protein_ids:
   - ENSEMBL
+  - MESH
   - UniProtKB
   - PR
   - UMLS

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
@@ -144,26 +144,53 @@ def write_pubchem_ids(labelfile, smilesfile, outfile):
 
 
 def write_mesh_ids(outfile):
-    # Get the D tree,
-    # D01	Inorganic Chemicals
-    # D02	Organic Chemicals
-    # D03	Heterocyclic Compounds
-    # D04	Polycyclic Compounds
-    # D05	Macromolecular Substances  NO
-    # D06	Hormones, Hormone Substitutes, and Hormone Antagonists
-    # D08	Enzymes and Coenzymes  NO, include with ... Activities?
-    # D09	Carbohydrates
-    # D10	Lipids
-    # D12	Amino Acids, Peptides, and Proteins
-    # D12.125 AA yes
-    # D12.644 Peptides yes
-    # D12.776 proteins  NO
-    # D13	Nucleic Acids, Nucleotides, and Nucleosides
-    # D20	Complex Mixtures
-    # D23	Biological Factors
-    # D25	Biomedical and Dental Materials
-    # D26	Pharmaceutical Preparations
-    # D27	Chemical Actions and Uses NO
+    # MeSH D tree — chemical-related subtrees.
+    # Included as CHEMICAL_ENTITY:
+    #   D01  Inorganic Chemicals
+    #   D02  Organic Chemicals
+    #   D03  Heterocyclic Compounds
+    #   D04  Polycyclic Compounds
+    #   D06  Hormones, Hormone Substitutes, and Hormone Antagonists
+    #   D07  (not currently assigned in MeSH)
+    #   D09  Carbohydrates
+    #   D10  Lipids
+    #   D11  (not currently assigned in MeSH)
+    #   D12  Amino Acids, Peptides, and Proteins (partially — see below)
+    #   D14–D19  (not currently assigned in MeSH)
+    #   D21–D22  (not currently assigned in MeSH)
+    #   D23  Biological Factors
+    #   D24  (not currently assigned in MeSH)
+    #   D25  Biomedical and Dental Materials
+    #   D26  Pharmaceutical Preparations
+    #
+    # Included as POLYPEPTIDE:
+    #   D12.125  Amino Acids
+    #   D12.644  Peptides
+    #   D13      Nucleic Acids, Nucleotides, and Nucleosides
+    #
+    # Included as COMPLEX_MOLECULAR_MIXTURE:
+    #   D20  Complex Mixtures
+    #
+    # EXCLUDED from this compendium (protein subtrees go to the protein compendium — see protein.write_mesh_ids):
+    #   D05      Macromolecular Substances — protein subtrees (D05.500 Multiprotein Complexes,
+    #            D05.875 Protein Aggregates) go to proteins; non-protein subtrees (D05.750
+    #            Polymers, D05.937 Smart Materials, D05.374 Micelles) are in neither compendium.
+    #   D08      Enzymes and Coenzymes — protein subtrees (D08.811 Enzymes, D08.622 Enzyme
+    #            Precursors, D08.244 Cytochromes) go to proteins; D08.211 Coenzymes (small
+    #            molecules) is in neither compendium.
+    #   D12.776  Proteins — goes to protein compendium.
+    #
+    # D27 (Chemical Actions and Uses) is implicitly excluded by the range D01-D26.
+    #
+    # TODO: The MeSH tree assignments for chemicals and proteins are currently defined
+    # independently in chemicals.write_mesh_ids() and protein.write_mesh_ids(). These
+    # should be unified into a shared mapping (e.g. in config.yaml or a dedicated module)
+    # so both compendia are derived from the same source of truth. This would also make it
+    # easier to handle edge cases like:
+    #   - D05 non-protein subtrees (Polymers, Smart Materials, Micelles) and D08.211
+    #     (Coenzymes) that currently fall into neither compendium.
+    #   - SCR_Chemical terms mapped to non-protein descriptors that are nonetheless proteins
+    #     (e.g. scorpion venom toxins classified under D23 Biological Factors).
     meshmap = {f"D{str(i).zfill(2)}": CHEMICAL_ENTITY for i in range(1, 27)}
     meshmap["D05"] = "EXCLUDE"
     meshmap["D08"] = "EXCLUDE"
@@ -172,8 +199,17 @@ def write_mesh_ids(outfile):
     meshmap["D12.644"] = POLYPEPTIDE
     meshmap["D13"] = POLYPEPTIDE
     meshmap["D20"] = COMPLEX_MOLECULAR_MIXTURE
-    # Also add anything from SCR_Chemical, if it doesn't have a tree map
-    mesh.write_ids(meshmap, outfile, order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY})
+    # Also add anything from SCR_Chemical, if it doesn't have a tree map.
+    # SCR terms don't have tree numbers, so we need to separately exclude SCRs
+    # mapped to descriptors under excluded trees (proteins, macromolecules, enzymes).
+    excluded_trees = [treenum for treenum, category in meshmap.items() if category == "EXCLUDE"]
+    mesh.write_ids(
+        meshmap,
+        outfile,
+        order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY],
+        extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY},
+        scr_exclude_trees=excluded_trees,
+    )
 
 
 # def write_obo_ids(irisandtypes,outfile,exclude=[]):

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
@@ -1,6 +1,7 @@
 import os
 import re
 
+import src.datahandlers.mesh as mesh
 import src.datahandlers.obo as obo
 import src.datahandlers.umls as umls
 from src.babel_utils import Text, glom, read_identifier_file, write_compendium
@@ -31,6 +32,59 @@ def write_umls_ids(mrsty, outfile):
     umls.write_umls_ids(mrsty, umlsmap, outfile)
 
 
+def write_mesh_ids(outfile):
+    """Write the MeSH identifiers that belong in the protein compendium.
+
+    Maps six MeSH descriptor subtrees to PROTEIN and hands them to mesh.write_ids, which also
+    adds SCR_Chemical terms typed as PROTEIN, restricted through scr_include_trees to terms
+    mapped to descriptors under D12.776, D05 or D08.
+
+    outfile: passed unchanged to mesh.write_ids as the file to write."""
+    # MeSH protein trees — these are terms excluded from the chemical compendium
+    # (see chemicals.write_mesh_ids) that belong in the protein compendium instead.
+    #
+    # D12.776  Proteins (entire subtree)
+    #
+    # D05      Macromolecular Substances — only protein-related subtrees:
+    #   D05.500  Multiprotein Complexes
+    #   D05.875  Protein Aggregates
+    #   (Excluded from both compendia: D05.750 Polymers, D05.937 Smart Materials,
+    #    D05.374 Micelles — these are non-protein macromolecules.)
+    #
+    # D08      Enzymes and Coenzymes — only protein-related subtrees:
+    #   D08.811  Enzymes
+    #   D08.622  Enzyme Precursors
+    #   D08.244  Cytochromes
+    #   (Excluded from both compendia: D08.211 Coenzymes — these are small molecules.)
+    #
+    # TODO: A more comprehensive solution would be to define the chemical and protein
+    # MeSH tree assignments in a single shared location (e.g. config.yaml or a dedicated
+    # mapping module) so that both compendia are derived from the same source of truth.
+    # This would prevent the current situation where the excluded trees in chemicals.py
+    # and the included trees here must be kept in sync manually. Possible approaches:
+    #   1. A shared dict mapping tree numbers to (compendium, category) pairs.
+    #   2. A two-pass approach: first classify all MeSH terms, then partition into
+    #      compendia based on the classification.
+    #   3. Use the MeSH SCR "heading mapped to" relationships more aggressively to
+    #      infer types for SCR terms that lack tree numbers (e.g. SCR proteins that
+    #      MeSH maps to venom descriptors rather than protein descriptors).
+    meshmap = {
+        "D12.776": PROTEIN,  # Proteins
+        "D05.500": PROTEIN,  # Multiprotein Complexes
+        "D05.875": PROTEIN,  # Protein Aggregates
+        "D08.811": PROTEIN,  # Enzymes
+        "D08.622": PROTEIN,  # Enzyme Precursors
+        "D08.244": PROTEIN,  # Cytochromes
+    }
+    # Also include SCR_Chemical terms mapped to protein descriptor trees.
+    # We use scr_include_trees to only keep SCR terms mapped to the protein-related
+    # trees (D12.776, D05, D08). This is the inverse of scr_exclude_trees used in
+    # chemicals.write_mesh_ids(). We use the broader D05 and D08 here (not just the
+    # protein subtrees) because any SCR mapped to D05 or D08 is more likely a protein
+    # than a non-protein macromolecule.
+    scr_protein_trees = ["D12.776", "D05", "D08"]
+    mesh.write_ids(
+        meshmap,
+        outfile,
+        order=[PROTEIN],
+        extra_vocab={"SCR_Chemical": PROTEIN},
+        scr_include_trees=scr_protein_trees,
+    )
+
+
 def write_pr_ids(outfile):
     protein_id = f"{PR}:000000001"
     obo.write_obo_ids([(protein_id, PROTEIN)], outfile, [PROTEIN])

diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py
@@ -45,6 +45,32 @@ def get_terms_in_tree(self, top_treenum):
             meshes.append(f"{MESH}:{meshid}")
         return meshes
 
+    def get_scr_terms_mapped_to_trees(self, top_treenums):
+        """Get Supplementary Concept Record terms that are mapped to descriptors under any of the given tree numbers.
+
+        SCR terms don't have tree numbers themselves, but they have meshv:mappedTo and/or
+        meshv:preferredMappedTo relationships to descriptor terms that do. This method finds
+        SCR terms whose mapped descriptors fall under the specified trees.
+
+        Note that the query matches any subject that has one of those two mapping predicates;
+        it does not separately check the type of the term.
+
+        top_treenums: iterable of MeSH tree numbers (e.g. "D12.776"). One query is run per
+        tree number, with the tree number interpolated into the query as mesh:<treenum>.
+
+        Returns a set of strings of the form f"{MESH}:<id>". Because it is a set, a term
+        matched under several of the given trees appears only once."""
+        terms = set()
+        for top_treenum in top_treenums:
+            # parentTreeNumber* is a zero-or-more property path, so the top tree number itself
+            # and every tree number below it match.
+            s = f"""   PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
+                    PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
+
+                    SELECT DISTINCT ?term
+                    WHERE {{ VALUES ?mappingPred {{ meshv:mappedTo meshv:preferredMappedTo }}
+                             ?term ?mappingPred ?descriptor .
+                             ?descriptor meshv:treeNumber ?treenum .
+                             ?treenum meshv:parentTreeNumber* mesh:{top_treenum}
+                    }}
+                    ORDER BY ?term
+            """
+            qres = self.m.query(s)
+            for row in list(qres):
+                iterm = str(row["term"])
+                # Drop the final character of the stringified term, then keep the text after
+                # the last "/" as the MeSH id.
+                # NOTE(review): presumably the dropped character is a closing ">" from the
+                # term's string form — confirm against the query engine in use.
+                meshid = iterm[:-1].split("/")[-1]
+                terms.add(f"{MESH}:{meshid}")
+        return terms
+
     def get_terms_with_type(self, termtype):
         s = f"""  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                 PREFIX rdfns: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
@@ -137,20 +163,43 @@ def pull_mesh_registry():
     return m.get_registry()
 
 
-def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}):
+def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}, scr_exclude_trees=None, scr_include_trees=None):
     """Write the mesh identifiers from a particular set of hierarchies to an output directory.
     This might be a mixed list of types (for instance anatomy and cell).  Also, the same term
-    may appear in multiple trees, perhaps with different types."""
+    may appear in multiple trees, perhaps with different types.
+
+    scr_exclude_trees: optional list of tree numbers. SCR terms (from extra_vocab) that are
+    mapped to descriptors under these trees will be marked as EXCLUDE.
+    scr_include_trees: optional list of tree numbers. If set, only SCR terms (from extra_vocab)
+    that are mapped to descriptors under these trees will be kept; all other SCR terms will be
+    removed. Cannot be used together with scr_exclude_trees."""
+    if scr_exclude_trees and scr_include_trees:
+        raise ValueError("scr_exclude_trees and scr_include_trees cannot both be set")
     m = Mesh()
     terms2type = defaultdict(set)
     for treenum, category in meshmap.items():
         mesh_terms = m.get_terms_in_tree(treenum)
         for mt in mesh_terms:
             terms2type[mt].add(category)
-    for k, v in extra_vocab.items():
-        mesh_terms = m.get_terms_with_type(k)
-        for mt in mesh_terms:
-            terms2type[mt].add(v)
+    if scr_include_trees:
+        # Only add extra_vocab terms that are mapped to descriptors under the included trees.
+        # This is the inverse of scr_exclude_trees: instead of adding all SCR terms and then
+        # marking some as EXCLUDE, we only add SCR terms that match the included trees.
+        included_scr_terms = m.get_scr_terms_mapped_to_trees(scr_include_trees)
+        for k, v in extra_vocab.items():
+            mesh_terms = m.get_terms_with_type(k)
+            for mt in mesh_terms:
+                if mt in included_scr_terms:
+                    terms2type[mt].add(v)
+    else:
+        for k, v in extra_vocab.items():
+            mesh_terms = m.get_terms_with_type(k)
+            for mt in mesh_terms:
+                terms2type[mt].add(v)
+        if scr_exclude_trees:
+            excluded_scr_terms = m.get_scr_terms_mapped_to_trees(scr_exclude_trees)
+            for mt in excluded_scr_terms:
+                terms2type[mt].add("EXCLUDE")
     with open(outfile, "w") as idfile:
         for term, typeset in terms2type.items():
             list_typeset = list(typeset)

diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py
@@ -307,9 +307,7 @@ def download_umls(umls_version, umls_subset, download_dir):
     """
     umls_api_key = os.environ.get("UMLS_API_KEY")
     if not umls_api_key:
-        print("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.")
-        print("See instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html")
-        exit(1)
+        raise RuntimeError("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.\nSee instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html")
 
     # Check umls_subset.
     if umls_subset not in ["full", "level-0"]:
@@ -325,8 +323,7 @@ def download_umls(umls_version, umls_subset, download_dir):
         stream=True,
     )
     if not req.ok:
-        print(f"Unable to download UMLS from {umls_url}: {req}")
-        exit(1)
+        raise RuntimeError(f"Unable to download UMLS from {umls_url}: {req}")
 
     # Write file to {download_dir}/umls-{umls_version}-metathesaurus-full.zip
     logging.info(f"Downloading {filename} to {download_dir}")

diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile
@@ -7,6 +7,15 @@ import src.snakefiles.util as util
 ### Gene / Protein
 
 
+# Write the MeSH identifiers for the protein compendium by calling protein.write_mesh_ids.
+# `infile` is never passed to that function; declaring it as an input makes this rule depend
+# on the MESH/mesh.nt file in the download directory.
+# NOTE(review): presumably the MeSH data handler reads that file itself — confirm.
+rule protein_mesh_ids:
+    input:
+        infile=config["download_directory"] + "/MESH/mesh.nt",
+    output:
+        outfile=config["intermediate_directory"] + "/protein/ids/MESH",
+    run:
+        protein.write_mesh_ids(output.outfile)
+
+
 rule protein_pr_ids:
     output:
         outfile=config["intermediate_directory"] + "/protein/ids/PR",
-Original file line number
+Diff line change
@@ Expand Up / @@ -169,6 +169,7 @@ protein_synonyms: @@
     protein_ids:
       - ENSEMBL
+      - MESH
       - UniProtKB
       - PR
       - UMLS
@@ Expand Down @@