NCATSTranslator · gaurav · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026 · Mar 8, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -29,13 +29,13 @@ uv run snakemake --cores 1 chemical       # Another target
 ### Testing
 
 ```bash
-PYTHONPATH=. uv run pytest                           # All tests
-PYTHONPATH=. uv run pytest --cov=src                 # With coverage report
-PYTHONPATH=. uv run pytest tests/test_node_factory.py  # Single test file
-PYTHONPATH=. uv run pytest -m unit -q               # Unit tests only (CI default)
-PYTHONPATH=. uv run pytest --network                # Include network tests
-PYTHONPATH=. uv run pytest --all                    # Run every test
-PYTHONPATH=. uv run pytest -n auto                  # Parallel (all CPUs)
+uv run pytest                           # All tests
+uv run pytest --cov=src                 # With coverage report
+uv run pytest tests/test_node_factory.py  # Single test file
+uv run pytest -m unit -q               # Unit tests only (CI default)
+uv run pytest --network                # Include network tests
+uv run pytest --all                    # Run every test
+uv run pytest -n auto                  # Parallel (all CPUs)
 ```
 
 Tests use four marks: `unit` (fast, offline), `network` (requires internet, opt-in with
@@ -61,6 +61,10 @@ uv run rumdl fmt .                       # Markdown auto-fix
 - Line length is 160 for both Python (ruff) and Snakemake (snakefmt).
 - Main config: `config.yaml` (directory paths, version strings, prefix lists per semantic type).
 - `UMLS_API_KEY` environment variable required for UMLS/RxNorm downloads.
+- `compendium_directories` in `config.yaml` maps Python compendium names to the Snakemake
+  intermediate directory names when they differ (e.g., `diseasephenotype → disease`,
+  `processactivitypathway → process`). Update this when adding a new semantic type whose
+  directory name doesn't match its Python module name.
 
 ## Architecture
 
@@ -139,6 +143,37 @@ GeneProtein and DrugChemical conflation each have dedicated conflation modules (
 - `babel_outputs/intermediate/` — intermediate build artifacts
 - `babel_outputs/` — final compendia, synonyms, reports, exports
 
+## Running Babel
+
+You may run `uv run snakemake -c all --rerun-incomplete [rulename]` to run a particular rule.
+When running a download step, it will be easier to run the job in Snakemake, but when running
+a rule that produces intermediate files, it might be easier to download the intermediate files from
+<https://stars.renci.org/var/babel/2025dec11/> (which is the `babel_outputs` folder from a run on a
+high performance cluster) so you don't need to download all the source files and
+rerun the entire pipeline. You can look at the resource requirements of a rule to decide which
+option would be best.
+
+## Conventions
+
+- **Error handling** — raise exceptions (`RuntimeError`, `ValueError`, etc.) rather than
+  `print(...) + exit(1)`. Exceptions are testable and propagate cleanly through Snakemake;
+  bare `exit()` calls raise `SystemExit`, which carries no error detail and is awkward to unit test.
+
+## Debugging
+
+When looking things up in the source databases, prefer to invoke the existing download code in
+this repository. If you suspect that code is incorrect, still run it, and then compare its
+output with a direct API lookup to see how they differ.
+
+If it is easy to add a test that will either exercise this bug or check some other relevant
+functionality, please suggest that when planning the bug fix.
+
+It is very important that two different compendia don't contain the same identifier and that we
+don't miss out on any valid identifiers without very good reason. If you're changing how
+identifiers are filtered in one compendium, think about whether that will affect which identifiers
+should be included in the other compendia to prevent any identifiers from being missed or being
+added twice.
+
 ## Documentation
 
 When making a significant change, check if it affects any of the documentation

diff --git a/config.yaml b/config.yaml
@@ -16,6 +16,14 @@ intermediate_directory: babel_outputs/intermediate
 output_directory: babel_outputs
 tmp_directory: babel_downloads/tmp
 
+# Maps Python compendium names (as used in src/createcompendia/ and tests) to the
+# directory names that Snakemake uses under intermediate_directory.
+# Compendia not listed here use their own name as the directory name.
+# Update this whenever a new semantic type uses a shortened or different directory name.
+compendium_directories:
+  diseasephenotype: disease
+  processactivitypathway: process
+
 #
 # SHARED
 #
@@ -169,6 +177,7 @@ protein_synonyms:
 
 protein_ids:
   - ENSEMBL
+  - MESH
   - UniProtKB
   - PR
   - UMLS

diff --git a/pyproject.toml b/pyproject.toml
@@ -77,6 +77,7 @@ include = '\.snakefile$|^Snakefile'
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
+pythonpath = ["."]
 addopts = ""
 timeout = 30          # global fallback (seconds); overridden per mark in conftest.py
 # Canonical marker definitions — conftest.py intentionally does NOT re-register these.

diff --git a/src/babel_utils.py b/src/babel_utils.py
@@ -149,7 +149,7 @@ def get(self, url):
             cdelta = now - self.last_time
             if cdelta < self.delta:
                 waittime = self.delta - cdelta
-                time.sleep(waittime.microseconds / 1e6)
+                time.sleep(waittime.total_seconds())
                 throttled = True
         self.last_time = datetime.now()
         response = requests.get(url)

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
@@ -144,26 +144,53 @@ def write_pubchem_ids(labelfile, smilesfile, outfile):
 
 
 def write_mesh_ids(outfile):
-    # Get the D tree,
-    # D01	Inorganic Chemicals
-    # D02	Organic Chemicals
-    # D03	Heterocyclic Compounds
-    # D04	Polycyclic Compounds
-    # D05	Macromolecular Substances  NO
-    # D06	Hormones, Hormone Substitutes, and Hormone Antagonists
-    # D08	Enzymes and Coenzymes  NO, include with ... Activities?
-    # D09	Carbohydrates
-    # D10	Lipids
-    # D12	Amino Acids, Peptides, and Proteins
-    # D12.125 AA yes
-    # D12.644 Peptides yes
-    # D12.776 proteins  NO
-    # D13	Nucleic Acids, Nucleotides, and Nucleosides
-    # D20	Complex Mixtures
-    # D23	Biological Factors
-    # D25	Biomedical and Dental Materials
-    # D26	Pharmaceutical Preparations
-    # D27	Chemical Actions and Uses NO
+    # MeSH D tree — chemical-related subtrees.
+    # Included as CHEMICAL_ENTITY:
+    #   D01  Inorganic Chemicals
+    #   D02  Organic Chemicals
+    #   D03  Heterocyclic Compounds
+    #   D04  Polycyclic Compounds
+    #   D06  Hormones, Hormone Substitutes, and Hormone Antagonists
+    #   D07  (not currently assigned in MeSH)
+    #   D09  Carbohydrates
+    #   D10  Lipids
+    #   D11  (not currently assigned in MeSH)
+    #   D12  Amino Acids, Peptides, and Proteins (partially — see below)
+    #   D14–D19  (not currently assigned in MeSH)
+    #   D21–D22  (not currently assigned in MeSH)
+    #   D23  Biological Factors
+    #   D24  (not currently assigned in MeSH)
+    #   D25  Biomedical and Dental Materials
+    #   D26  Pharmaceutical Preparations
+    #
+    # Included as POLYPEPTIDE:
+    #   D12.125  Amino Acids
+    #   D12.644  Peptides
+    #   D13      Nucleic Acids, Nucleotides, and Nucleosides
+    #
+    # Included as COMPLEX_MOLECULAR_MIXTURE:
+    #   D20  Complex Mixtures
+    #
+    # EXCLUDED from the chemical compendium (protein-related subtrees go to the protein compendium — see protein.write_mesh_ids):
+    #   D05      Macromolecular Substances — protein subtrees (D05.500 Multiprotein Complexes,
+    #            D05.875 Protein Aggregates) go to proteins; non-protein subtrees (D05.750
+    #            Polymers, D05.937 Smart Materials, D05.374 Micelles) are in neither compendium.
+    #   D08      Enzymes and Coenzymes — protein subtrees (D08.811 Enzymes, D08.622 Enzyme
+    #            Precursors, D08.244 Cytochromes) go to proteins; D08.211 Coenzymes (small
+    #            molecules) is in neither compendium.
+    #   D12.776  Proteins — goes to protein compendium.
+    #
+    # D27 (Chemical Actions and Uses) is implicitly excluded by the range D01-D26.
+    #
+    # TODO: The MeSH tree assignments for chemicals and proteins are currently defined
+    # independently in chemicals.write_mesh_ids() and protein.write_mesh_ids(). These
+    # should be unified into a shared mapping (e.g. in config.yaml or a dedicated module)
+    # so both compendia are derived from the same source of truth. This would also make it
+    # easier to handle edge cases like:
+    #   - D05 non-protein subtrees (Polymers, Smart Materials, Micelles) and D08.211
+    #     (Coenzymes) that currently fall into neither compendium.
+    #   - SCR_Chemical terms mapped to non-protein descriptors that are nonetheless proteins
+    #     (e.g. scorpion venom toxins classified under D23 Biological Factors).
     meshmap = {f"D{str(i).zfill(2)}": CHEMICAL_ENTITY for i in range(1, 27)}
     meshmap["D05"] = "EXCLUDE"
     meshmap["D08"] = "EXCLUDE"
@@ -172,8 +199,17 @@ def write_mesh_ids(outfile):
     meshmap["D12.644"] = POLYPEPTIDE
     meshmap["D13"] = POLYPEPTIDE
     meshmap["D20"] = COMPLEX_MOLECULAR_MIXTURE
-    # Also add anything from SCR_Chemical, if it doesn't have a tree map
-    mesh.write_ids(meshmap, outfile, order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY})
+    # Also add anything from SCR_Chemical, if it doesn't have a tree map.
+    # SCR terms don't have tree numbers, so we need to separately exclude SCRs
+    # mapped to descriptors under excluded trees (proteins, macromolecules, enzymes).
+    excluded_trees = [treenum for treenum, category in meshmap.items() if category == "EXCLUDE"]
+    mesh.write_ids(
+        meshmap,
+        outfile,
+        order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY],
+        extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY},
+        scr_exclude_trees=excluded_trees,
+    )
 
 
 # def write_obo_ids(irisandtypes,outfile,exclude=[]):

diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py
@@ -1,6 +1,7 @@
 import os
 import re
 
+import src.datahandlers.mesh as mesh
 import src.datahandlers.obo as obo
 import src.datahandlers.umls as umls
 from src.babel_utils import Text, glom, read_identifier_file, write_compendium
@@ -42,6 +43,62 @@ def write_umls_ids(mrsty, outfile):
     umls.write_umls_ids(mrsty, umlsmap, outfile)
 
 
+def write_mesh_ids(outfile):
+    # Write the MeSH identifiers for the protein compendium to outfile. The trees used here are those excluded
+    # from the chemical compendium (see chemicals.write_mesh_ids) that belong in the protein compendium instead.
+    #
+    # D12.776  Proteins (entire subtree)
+    #
+    # D05      Macromolecular Substances — only protein-related subtrees:
+    #   D05.500  Multiprotein Complexes
+    #   D05.875  Protein Aggregates
+    #   (Excluded from both compendia: D05.750 Polymers, D05.937 Smart Materials,
+    #    D05.374 Micelles — these are non-protein macromolecules.)
+    #
+    # D08      Enzymes and Coenzymes — only protein-related subtrees:
+    #   D08.811  Enzymes
+    #   D08.622  Enzyme Precursors
+    #   D08.244  Cytochromes
+    #   (Excluded from both compendia: D08.211 Coenzymes — these are small molecules.)
+    #
+    # TODO: A more comprehensive solution would be to define the chemical and protein
+    # MeSH tree assignments in a single shared location (e.g. config.yaml or a dedicated
+    # mapping module) so that both compendia are derived from the same source of truth.
+    # This would prevent the current situation where the excluded trees in chemicals.py
+    # and the included trees here must be kept in sync manually. Possible approaches:
+    #   1. A shared dict mapping tree numbers to (compendium, category) pairs.
+    #   2. A two-pass approach: first classify all MeSH terms, then partition into
+    #      compendia based on the classification.
+    #   3. Use the MeSH SCR "heading mapped to" relationships more aggressively to
+    #      infer types for SCR terms that lack tree numbers (e.g. SCR proteins that
+    #      MeSH maps to venom descriptors rather than protein descriptors).
+    meshmap = {
+        "D12.776": PROTEIN,  # Proteins
+        "D05.500": PROTEIN,  # Multiprotein Complexes
+        "D05.875": PROTEIN,  # Protein Aggregates
+        "D08.811": PROTEIN,  # Enzymes
+        "D08.622": PROTEIN,  # Enzyme Precursors
+        "D08.244": PROTEIN,  # Cytochromes
+    }
+    # Also include SCR_Chemical terms mapped to protein descriptor trees.
+    # We use scr_include_trees to only keep SCR terms mapped to the protein-related
+    # trees (D12.776, D05, D08). This is the inverse of scr_exclude_trees used in
+    # chemicals.write_mesh_ids(). We use the broader D05 and D08 here (not just the
+    # specific protein subtrees) because any SCR mapped to D05 or D08 is more likely a
+    # protein than a non-protein macromolecule.  The trade-off: SCR terms mapped to
+    # non-protein D05/D08 subtrees (e.g. Polymers under D05.750, Coenzymes under
+    # D08.211) will be classified as PROTEIN here rather than falling into neither
+    # compendium, as their corresponding descriptor terms do.
+    scr_protein_trees = ["D12.776", "D05", "D08"]
+    mesh.write_ids(
+        meshmap,
+        outfile,
+        order=[PROTEIN],  # every entry in meshmap and extra_vocab is PROTEIN, so there is only one category to list
+        extra_vocab={"SCR_Chemical": PROTEIN},  # SCR_Chemical terms that pass the filter below are typed as PROTEIN
+        scr_include_trees=scr_protein_trees,  # keep only SCR terms mapped to descriptors under these trees
+    )
+
+
 def write_pr_ids(outfile):
     protein_id = f"{PR}:000000001"
     obo.write_obo_ids([(protein_id, PROTEIN)], outfile, [PROTEIN])

diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py
@@ -45,6 +45,36 @@ def get_terms_in_tree(self, top_treenum):
             meshes.append(f"{MESH}:{meshid}")
         return meshes
 
+    def get_scr_terms_mapped_to_trees(self, top_treenums):
+        """Get Supplementary Concept Record terms that are mapped to descriptors under any of the given tree numbers.
+
+        SCR terms don't have tree numbers themselves, but they have meshv:mappedTo and/or
+        meshv:preferredMappedTo relationships to descriptor terms that do. This method finds
+        SCR terms whose mapped descriptors fall under the specified trees.
+
+        Returns an empty set if top_treenums is empty."""
+        if not top_treenums:  # nothing to match against, so skip building and running the query
+            return set()
+        values_clause = " ".join(f"mesh:{t}" for t in top_treenums)  # e.g. "mesh:D05 mesh:D08" for ["D05", "D08"]
+        s = f"""   PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
+                PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
+
+                SELECT DISTINCT ?term
+                WHERE {{ VALUES ?mappingPred {{ meshv:mappedTo meshv:preferredMappedTo }}
+                         VALUES ?topTree {{ {values_clause} }}
+                         ?term ?mappingPred ?descriptor .
+                         ?descriptor meshv:treeNumber ?treenum .
+                         ?treenum meshv:parentTreeNumber* ?topTree
+                }}
+                ORDER BY ?term
+        """
+        terms = set()  # results are collected into a set, so the query's ORDER BY does not affect the return value
+        for row in list(self.m.query(s)):
+            iterm = str(row["term"])
+            meshid = iterm[:-1].split("/")[-1]  # NOTE(review): [:-1] presumably drops a trailing ">" from str(term) — confirm
+            terms.add(f"{MESH}:{meshid}")
+        return terms  # identifiers of the form f"{MESH}:<id>"
+
     def get_terms_with_type(self, termtype):
         s = f"""  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                 PREFIX rdfns: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
@@ -137,20 +167,43 @@ def pull_mesh_registry():
     return m.get_registry()
 
 
-def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}):
+def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}, scr_exclude_trees=None, scr_include_trees=None):
     """Write the mesh identifiers from a particular set of hierarchies to an output directory.
     This might be a mixed list of types (for instance anatomy and cell).  Also, the same term
-    may appear in multiple trees, perhaps with different types."""
+    may appear in multiple trees, perhaps with different types.
+
+    scr_exclude_trees: optional list of tree numbers. SCR terms (from extra_vocab) that are
+    mapped to descriptors under these trees will be marked as EXCLUDE.
+    scr_include_trees: optional list of tree numbers. If set, only SCR terms (from extra_vocab)
+    that are mapped to descriptors under these trees will be kept; all other SCR terms will be
+    removed. Cannot be used together with scr_exclude_trees."""
+    if scr_exclude_trees and scr_include_trees:
+        raise ValueError("scr_exclude_trees and scr_include_trees cannot both be set")
     m = Mesh()
     terms2type = defaultdict(set)
     for treenum, category in meshmap.items():
         mesh_terms = m.get_terms_in_tree(treenum)
         for mt in mesh_terms:
             terms2type[mt].add(category)
-    for k, v in extra_vocab.items():
-        mesh_terms = m.get_terms_with_type(k)
-        for mt in mesh_terms:
-            terms2type[mt].add(v)
+    if scr_include_trees:
+        # Only add extra_vocab terms that are mapped to descriptors under the included trees.
+        # This is the inverse of scr_exclude_trees: instead of adding all SCR terms and then
+        # marking some as EXCLUDE, we only add SCR terms that match the included trees.
+        included_scr_terms = m.get_scr_terms_mapped_to_trees(scr_include_trees)
+        for k, v in extra_vocab.items():
+            mesh_terms = m.get_terms_with_type(k)
+            for mt in mesh_terms:
+                if mt in included_scr_terms:
+                    terms2type[mt].add(v)
+    else:
+        for k, v in extra_vocab.items():
+            mesh_terms = m.get_terms_with_type(k)
+            for mt in mesh_terms:
+                terms2type[mt].add(v)
+        if scr_exclude_trees:
+            excluded_scr_terms = m.get_scr_terms_mapped_to_trees(scr_exclude_trees)
+            for mt in excluded_scr_terms:
+                terms2type[mt].add("EXCLUDE")
     with open(outfile, "w") as idfile:
         for term, typeset in terms2type.items():
             list_typeset = list(typeset)