From f53a6d03b1be7cd7efb8c86909083c35053496f3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 8 Mar 2026 00:18:24 -0500 Subject: [PATCH 01/30] Added some info to CLAUDE.md to help with debugging. --- CLAUDE.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index daa9eda5..7c9198e1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -102,3 +102,12 @@ Gene+Protein and Drug+Chemical each have dedicated conflation modules (`geneprot - `babel_downloads/` — cached source data - `babel_outputs/intermediate/` — intermediate build artifacts - `babel_outputs/` — final compendia, synonyms, reports, exports + +# Running Babel + +You may run `uv run snakemake -c all --rerun-incomplete [rulename]` to run a particular rule. +When running a download step, it will be easier to run the job in Snakemake, but when running +a rule that produces intermediate files, it might be easier to download the intermediate files from +https://stars.renci.org/var/babel/2025dec11/ so you don't need to download all the source files and +rerun the entire pipeline. You can look at the resource requirements of a rule to decide which +option would be best. From aeb8f884b7d590da14114bc437203f42bedb1306 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 8 Mar 2026 00:34:24 -0500 Subject: [PATCH 02/30] Tweaked CLAUDE.md. --- CLAUDE.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 7c9198e1..990205a1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -108,6 +108,16 @@ Gene+Protein and Drug+Chemical each have dedicated conflation modules (`geneprot You may run `uv run snakemake -c all --rerun-incomplete [rulename]` to run a particular rule. When running a download step, it will be easier to run the job in Snakemake, but when running a rule that produces intermediate files, it might be easier to download the intermediate files from -https://stars.renci.org/var/babel/2025dec11/ so you don't need to download all the source files and +https://stars.renci.org/var/babel/2025dec11/ (which is the `babel_output` folder from a run on a +high performance cluster) so you don't need to download all the source files and rerun the entire pipeline. You can look at the resource requirements of a rule to decide which -option would be best. +option would be best. + +# Debugging + +When looking things up in the source databases, prefer to invoke the existing download code in +this repository unless you suspect that it is incorrect, in which case use the existing code +and then compare it with an API lookup to see how they differ. + +If it is easy to add a test that will either exercise this bug or check some other relevant +functionality, please suggest that when planning the bug fix. From 1ef14caa82359081191189a0b6c92c4b0d8d7ea1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 8 Mar 2026 00:42:51 -0500 Subject: [PATCH 03/30] Filter protein SCR_Chemical terms from MeSH chemical IDs (#675) SCR (Supplementary Concept Record) terms lack tree numbers, so they bypassed the existing D12.776/D05/D08 exclusion logic. This adds a SPARQL query that checks meshv:mappedTo and meshv:preferredMappedTo relationships to find SCR terms mapped to descriptors under excluded trees and marks them as EXCLUDE. Co-Authored-By: Claude Opus 4.6 --- src/createcompendia/chemicals.py | 13 +++++++++-- src/datahandlers/mesh.py | 37 ++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 8fac17ad..93418c40 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -172,8 +172,17 @@ def write_mesh_ids(outfile): meshmap["D12.644"] = POLYPEPTIDE meshmap["D13"] = POLYPEPTIDE meshmap["D20"] = COMPLEX_MOLECULAR_MIXTURE - # Also add anything from SCR_Chemical, if it doesn't have a tree map - mesh.write_ids(meshmap, outfile, order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY}) + # Also add anything from SCR_Chemical, if it doesn't have a tree map. + # SCR terms don't have tree numbers, so we need to separately exclude SCRs + # mapped to descriptors under excluded trees (proteins, macromolecules, enzymes). + excluded_trees = [treenum for treenum, category in meshmap.items() if category == "EXCLUDE"] + mesh.write_ids( + meshmap, + outfile, + order=["EXCLUDE", POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_ENTITY], + extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY}, + scr_exclude_trees=excluded_trees, + ) # def write_obo_ids(irisandtypes,outfile,exclude=[]): diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index bf136eaa..df010589 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -45,6 +45,32 @@ def get_terms_in_tree(self, top_treenum): meshes.append(f"{MESH}:{meshid}") return meshes + def get_scr_terms_mapped_to_trees(self, top_treenums): + """Get Supplementary Concept Record terms that are mapped to descriptors under any of the given tree numbers. + + SCR terms don't have tree numbers themselves, but they have meshv:mappedTo and/or + meshv:preferredMappedTo relationships to descriptor terms that do. This method finds + SCR terms whose mapped descriptors fall under the specified trees.""" + terms = set() + for top_treenum in top_treenums: + s = f""" PREFIX meshv: + PREFIX mesh: + + SELECT DISTINCT ?term + WHERE {{ VALUES ?mappingPred {{ meshv:mappedTo meshv:preferredMappedTo }} + ?term ?mappingPred ?descriptor . + ?descriptor meshv:treeNumber ?treenum . + ?treenum meshv:parentTreeNumber* mesh:{top_treenum} + }} + ORDER BY ?term + """ + qres = self.m.query(s) + for row in list(qres): + iterm = str(row["term"]) + meshid = iterm[:-1].split("/")[-1] + terms.add(f"{MESH}:{meshid}") + return terms + def get_terms_with_type(self, termtype): s = f""" PREFIX rdfs: PREFIX rdfns: @@ -137,10 +163,13 @@ def pull_mesh_registry(): return m.get_registry() -def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}): +def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}, scr_exclude_trees=None): """Write the mesh identifiers from a particular set of hierarchies to an output directory. This might be a mixed list of types (for instance anatomy and cell). Also, the same term - may appear in multiple trees, perhaps with different types.""" + may appear in multiple trees, perhaps with different types. + + scr_exclude_trees: optional list of tree numbers. SCR terms (from extra_vocab) that are + mapped to descriptors under these trees will be marked as EXCLUDE.""" m = Mesh() terms2type = defaultdict(set) for treenum, category in meshmap.items(): @@ -151,6 +180,10 @@ def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cel mesh_terms = m.get_terms_with_type(k) for mt in mesh_terms: terms2type[mt].add(v) + if scr_exclude_trees: + excluded_scr_terms = m.get_scr_terms_mapped_to_trees(scr_exclude_trees) + for mt in excluded_scr_terms: + terms2type[mt].add("EXCLUDE") with open(outfile, "w") as idfile: for term, typeset in terms2type.items(): list_typeset = list(typeset) From 537ec6e4367351bd8e2304eea3f16afdba1b1804 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 8 Mar 2026 00:46:47 -0500 Subject: [PATCH 04/30] Added debugging help to CLAUDE.md. --- CLAUDE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 990205a1..937e45a7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -121,3 +121,9 @@ and then compare it with an API lookup to see how they differ. If it is easy to add a test that will either exercise this bug or check some other relevant functionality, please suggest that when planning the bug fix. + +It is very important that two different compendia don't contain the same identifier and that we +don't miss out on any valid identifiers without very good reason. If you're changing how +identifiers are filtered in one compendium, think about whether that will affect which identifiers +should be included in the other compendia to prevent any identifiers from being missed or being +added twice. From 22fa8c9b4c47c571ebe062ea8102e2440a33b3fb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sun, 8 Mar 2026 01:10:11 -0500 Subject: [PATCH 05/30] Add MeSH protein IDs to protein compendium (#675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Protein-related MeSH terms excluded from chemicals (D12.776 Proteins, D05.500 Multiprotein Complexes, D05.875 Protein Aggregates, D08.811 Enzymes, D08.622 Enzyme Precursors, D08.244 Cytochromes) were previously dropped entirely. This adds them to the protein compendium instead, along with SCR_Chemical terms mapped to protein descriptor trees. Also adds scr_include_trees parameter to mesh.write_ids() (the inverse of scr_exclude_trees) and documents the MeSH tree assignments in both chemicals.py and protein.py with TODOs for unifying them. Some D05/D08 subtrees (Polymers, Smart Materials, Micelles, Coenzymes) remain in neither compendium — documented as a known gap. Co-Authored-By: Claude Opus 4.6 --- config.yaml | 1 + src/createcompendia/chemicals.py | 67 ++++++++++++++++++++++---------- src/createcompendia/protein.py | 54 +++++++++++++++++++++++++ src/datahandlers/mesh.py | 36 ++++++++++++----- src/snakefiles/protein.snakefile | 9 +++++ 5 files changed, 137 insertions(+), 30 deletions(-) diff --git a/config.yaml b/config.yaml index e38d810d..5f20fe63 100644 --- a/config.yaml +++ b/config.yaml @@ -169,6 +169,7 @@ protein_synonyms: protein_ids: - ENSEMBL + - MESH - UniProtKB - PR - UMLS diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 93418c40..392cbbce 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -144,26 +144,53 @@ def write_pubchem_ids(labelfile, smilesfile, outfile): def write_mesh_ids(outfile): - # Get the D tree, - # D01 Inorganic Chemicals - # D02 Organic Chemicals - # D03 Heterocyclic Compounds - # D04 Polycyclic Compounds - # D05 Macromolecular Substances NO - # D06 Hormones, Hormone Substitutes, and Hormone Antagonists - # D08 Enzymes and Coenzymes NO, include with ... Activities? - # D09 Carbohydrates - # D10 Lipids - # D12 Amino Acids, Peptides, and Proteins - # D12.125 AA yes - # D12.644 Peptides yes - # D12.776 proteins NO - # D13 Nucleic Acids, Nucleotides, and Nucleosides - # D20 Complex Mixtures - # D23 Biological Factors - # D25 Biomedical and Dental Materials - # D26 Pharmaceutical Preparations - # D27 Chemical Actions and Uses NO + # MeSH D tree — chemical-related subtrees. + # Included as CHEMICAL_ENTITY: + # D01 Inorganic Chemicals + # D02 Organic Chemicals + # D03 Heterocyclic Compounds + # D04 Polycyclic Compounds + # D06 Hormones, Hormone Substitutes, and Hormone Antagonists + # D07 (not currently assigned in MeSH) + # D09 Carbohydrates + # D10 Lipids + # D11 (not currently assigned in MeSH) + # D12 Amino Acids, Peptides, and Proteins (partially — see below) + # D14–D19 (not currently assigned in MeSH) + # D21–D22 (not currently assigned in MeSH) + # D23 Biological Factors + # D24 (not currently assigned in MeSH) + # D25 Biomedical and Dental Materials + # D26 Pharmaceutical Preparations + # + # Included as POLYPEPTIDE: + # D12.125 Amino Acids + # D12.644 Peptides + # D13 Nucleic Acids, Nucleotides, and Nucleosides + # + # Included as COMPLEX_MOLECULAR_MIXTURE: + # D20 Complex Mixtures + # + # EXCLUDED (sent to protein compendium instead — see protein.write_mesh_ids): + # D05 Macromolecular Substances — protein subtrees (D05.500 Multiprotein Complexes, + # D05.875 Protein Aggregates) go to proteins; non-protein subtrees (D05.750 + # Polymers, D05.937 Smart Materials, D05.374 Micelles) are in neither compendium. + # D08 Enzymes and Coenzymes — protein subtrees (D08.811 Enzymes, D08.622 Enzyme + # Precursors, D08.244 Cytochromes) go to proteins; D08.211 Coenzymes (small + # molecules) is in neither compendium. + # D12.776 Proteins — goes to protein compendium. + # + # D27 (Chemical Actions and Uses) is implicitly excluded by the range D01-D26. + # + # TODO: The MeSH tree assignments for chemicals and proteins are currently defined + # independently in chemicals.write_mesh_ids() and protein.write_mesh_ids(). These + # should be unified into a shared mapping (e.g. in config.yaml or a dedicated module) + # so both compendia are derived from the same source of truth. This would also make it + # easier to handle edge cases like: + # - D05 non-protein subtrees (Polymers, Smart Materials, Micelles) and D08.211 + # (Coenzymes) that currently fall into neither compendium. + # - SCR_Chemical terms mapped to non-protein descriptors that are nonetheless proteins + # (e.g. scorpion venom toxins classified under D23 Biological Factors). meshmap = {f"D{str(i).zfill(2)}": CHEMICAL_ENTITY for i in range(1, 27)} meshmap["D05"] = "EXCLUDE" meshmap["D08"] = "EXCLUDE" diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 79f66112..8c100979 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,6 +1,7 @@ import os import re +import src.datahandlers.mesh as mesh import src.datahandlers.obo as obo import src.datahandlers.umls as umls from src.babel_utils import Text, glom, read_identifier_file, write_compendium @@ -31,6 +32,59 @@ def write_umls_ids(mrsty, outfile): umls.write_umls_ids(mrsty, umlsmap, outfile) +def write_mesh_ids(outfile): + # MeSH protein trees — these are terms excluded from the chemical compendium + # (see chemicals.write_mesh_ids) that belong in the protein compendium instead. + # + # D12.776 Proteins (entire subtree) + # + # D05 Macromolecular Substances — only protein-related subtrees: + # D05.500 Multiprotein Complexes + # D05.875 Protein Aggregates + # (Excluded from both compendia: D05.750 Polymers, D05.937 Smart Materials, + # D05.374 Micelles — these are non-protein macromolecules.) + # + # D08 Enzymes and Coenzymes — only protein-related subtrees: + # D08.811 Enzymes + # D08.622 Enzyme Precursors + # D08.244 Cytochromes + # (Excluded from both compendia: D08.211 Coenzymes — these are small molecules.) + # + # TODO: A more comprehensive solution would be to define the chemical and protein + # MeSH tree assignments in a single shared location (e.g. config.yaml or a dedicated + # mapping module) so that both compendia are derived from the same source of truth. + # This would prevent the current situation where the excluded trees in chemicals.py + # and the included trees here must be kept in sync manually. Possible approaches: + # 1. A shared dict mapping tree numbers to (compendium, category) pairs. + # 2. A two-pass approach: first classify all MeSH terms, then partition into + # compendia based on the classification. + # 3. Use the MeSH SCR "heading mapped to" relationships more aggressively to + # infer types for SCR terms that lack tree numbers (e.g. SCR proteins that + # MeSH maps to venom descriptors rather than protein descriptors). + meshmap = { + "D12.776": PROTEIN, # Proteins + "D05.500": PROTEIN, # Multiprotein Complexes + "D05.875": PROTEIN, # Protein Aggregates + "D08.811": PROTEIN, # Enzymes + "D08.622": PROTEIN, # Enzyme Precursors + "D08.244": PROTEIN, # Cytochromes + } + # Also include SCR_Chemical terms mapped to protein descriptor trees. + # We use scr_include_trees to only keep SCR terms mapped to the protein-related + # trees (D12.776, D05, D08). This is the inverse of scr_exclude_trees used in + # chemicals.write_mesh_ids(). We use the broader D05 and D08 here (not just the + # protein subtrees) because any SCR mapped to D05 or D08 is more likely a protein + # than a non-protein macromolecule. + scr_protein_trees = ["D12.776", "D05", "D08"] + mesh.write_ids( + meshmap, + outfile, + order=[PROTEIN], + extra_vocab={"SCR_Chemical": PROTEIN}, + scr_include_trees=scr_protein_trees, + ) + + def write_pr_ids(outfile): protein_id = f"{PR}:000000001" obo.write_obo_ids([(protein_id, PROTEIN)], outfile, [PROTEIN]) diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index df010589..6ede31d5 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -163,27 +163,43 @@ def pull_mesh_registry(): return m.get_registry() -def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}, scr_exclude_trees=None): +def write_ids(meshmap, outfile, order=["biolink:CellularComponent", "biolink:Cell", "biolink:AnatomicalEntity"], extra_vocab={}, scr_exclude_trees=None, scr_include_trees=None): """Write the mesh identifiers from a particular set of hierarchies to an output directory. This might be a mixed list of types (for instance anatomy and cell). Also, the same term may appear in multiple trees, perhaps with different types. scr_exclude_trees: optional list of tree numbers. SCR terms (from extra_vocab) that are - mapped to descriptors under these trees will be marked as EXCLUDE.""" + mapped to descriptors under these trees will be marked as EXCLUDE. + scr_include_trees: optional list of tree numbers. If set, only SCR terms (from extra_vocab) + that are mapped to descriptors under these trees will be kept; all other SCR terms will be + removed. Cannot be used together with scr_exclude_trees.""" + if scr_exclude_trees and scr_include_trees: + raise ValueError("scr_exclude_trees and scr_include_trees cannot both be set") m = Mesh() terms2type = defaultdict(set) for treenum, category in meshmap.items(): mesh_terms = m.get_terms_in_tree(treenum) for mt in mesh_terms: terms2type[mt].add(category) - for k, v in extra_vocab.items(): - mesh_terms = m.get_terms_with_type(k) - for mt in mesh_terms: - terms2type[mt].add(v) - if scr_exclude_trees: - excluded_scr_terms = m.get_scr_terms_mapped_to_trees(scr_exclude_trees) - for mt in excluded_scr_terms: - terms2type[mt].add("EXCLUDE") + if scr_include_trees: + # Only add extra_vocab terms that are mapped to descriptors under the included trees. + # This is the inverse of scr_exclude_trees: instead of adding all SCR terms and then + # marking some as EXCLUDE, we only add SCR terms that match the included trees. + included_scr_terms = m.get_scr_terms_mapped_to_trees(scr_include_trees) + for k, v in extra_vocab.items(): + mesh_terms = m.get_terms_with_type(k) + for mt in mesh_terms: + if mt in included_scr_terms: + terms2type[mt].add(v) + else: + for k, v in extra_vocab.items(): + mesh_terms = m.get_terms_with_type(k) + for mt in mesh_terms: + terms2type[mt].add(v) + if scr_exclude_trees: + excluded_scr_terms = m.get_scr_terms_mapped_to_trees(scr_exclude_trees) + for mt in excluded_scr_terms: + terms2type[mt].add("EXCLUDE") with open(outfile, "w") as idfile: for term, typeset in terms2type.items(): list_typeset = list(typeset) diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index f1581c2b..72f01913 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -7,6 +7,15 @@ import src.snakefiles.util as util ### Gene / Protein +rule protein_mesh_ids: + input: + infile=config["download_directory"] + "/MESH/mesh.nt", + output: + outfile=config["intermediate_directory"] + "/protein/ids/MESH", + run: + protein.write_mesh_ids(output.outfile) + + rule protein_pr_ids: output: outfile=config["intermediate_directory"] + "/protein/ids/PR", From 10d3fdeef9a62933cda2768e5a7ba3afeb7f31f7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 01:28:42 -0400 Subject: [PATCH 06/30] Add unit tests for MeSH protein/chemical SCR filtering (issue #675) Tests cover write_ids() parameter validation, SCR exclude/include tree filtering logic (mock-based), and get_scr_terms_mapped_to_trees() via an inline pyoxigraph store. Also documents pipeline-only test scenarios in tests/README.md. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 15 ++ tests/datahandlers/test_mesh.py | 272 ++++++++++++++++++++++++++++++++ 2 files changed, 287 insertions(+) create mode 100644 tests/datahandlers/test_mesh.py diff --git a/tests/README.md b/tests/README.md index 15a2e902..78b2c1e8 100644 --- a/tests/README.md +++ b/tests/README.md @@ -166,3 +166,18 @@ Example tests: behaviour). - Move `test_geneproteiny.py` assertions to also check individual clique contents, not just that the output file is non-empty. + +## Out of Scope / Pipeline-only + +### Tests requiring the full MeSH pipeline (out of scope for unit tests) + +The following require loading the full MeSH RDF (~1–2 GB) and are only practical +as `pipeline` tests once `babel_downloads/MESH/mesh.nt` is pre-populated: + +- **`chemicals.write_mesh_ids()` end-to-end** — Runs SPARQL over full MeSH; verify output + contains expected D01–D26 terms and excludes D05/D08/D12.776. +- **`protein.write_mesh_ids()` end-to-end** — Verify D12.776, D05.500, D08.811 subtrees + appear in output. +- **No-overlap invariant** — Assert chemicals and protein MeSH output files share no IDs. + This is the key correctness property of issue #675. +- **`protein_mesh_ids` Snakemake rule** — End-to-end pipeline rule test. diff --git a/tests/datahandlers/test_mesh.py b/tests/datahandlers/test_mesh.py new file mode 100644 index 00000000..f7f2d8c5 --- /dev/null +++ b/tests/datahandlers/test_mesh.py @@ -0,0 +1,272 @@ +"""Unit tests for src/datahandlers/mesh.py + +Tests cover: + - write_ids() parameter validation + - write_ids() SCR filtering logic (mock-based) + - Mesh.get_scr_terms_mapped_to_trees() (inline pyoxigraph store) +""" +import io +from unittest.mock import MagicMock, patch + +import pyoxigraph +import pytest + +from src.categories import CHEMICAL_ENTITY, PROTEIN +from src.datahandlers.mesh import Mesh, write_ids + +# --------------------------------------------------------------------------- +# Helpers for building an in-memory pyoxigraph store +# --------------------------------------------------------------------------- + +_MESH_NS = "http://id.nlm.nih.gov/mesh/" +_MESHV_NS = "http://id.nlm.nih.gov/mesh/vocab#" + + +def _mesh(local: str) -> pyoxigraph.NamedNode: + return pyoxigraph.NamedNode(f"{_MESH_NS}{local}") + + +def _meshv(local: str) -> pyoxigraph.NamedNode: + return pyoxigraph.NamedNode(f"{_MESHV_NS}{local}") + + +def _quad(s, p, o) -> pyoxigraph.Quad: + return pyoxigraph.Quad(s, p, o, pyoxigraph.DefaultGraph()) + + +def _make_test_store() -> pyoxigraph.Store: + """Return an in-memory Store with a small set of known MeSH-like triples. + + Fixture summary + --------------- + C000001 --mappedTo--> D12345 --treeNumber--> D12.776.123 + D12.776.123 --parentTreeNumber--> D12.776 + + C000002 --preferredMappedTo--> D05678 --treeNumber--> D05.500.123 + D05.500.123 --parentTreeNumber--> D05.500 + D05.500 --parentTreeNumber--> D05 + + C000003 --mappedTo--> D23456 --treeNumber--> D23.123 + D23.123 --parentTreeNumber--> D23 + """ + store = pyoxigraph.Store() + mapped_to = _meshv("mappedTo") + preferred_mapped_to = _meshv("preferredMappedTo") + tree_number = _meshv("treeNumber") + parent_tree = _meshv("parentTreeNumber") + + triples = [ + # C000001 under D12.776 (one hop) + _quad(_mesh("C000001"), mapped_to, _mesh("D12345")), + _quad(_mesh("D12345"), tree_number, _mesh("D12.776.123")), + _quad(_mesh("D12.776.123"), parent_tree, _mesh("D12.776")), + # C000002 under D05 (two hops via D05.500) + _quad(_mesh("C000002"), preferred_mapped_to, _mesh("D05678")), + _quad(_mesh("D05678"), tree_number, _mesh("D05.500.123")), + _quad(_mesh("D05.500.123"), parent_tree, _mesh("D05.500")), + _quad(_mesh("D05.500"), parent_tree, _mesh("D05")), + # C000003 under D23 only (not protein) + _quad(_mesh("C000003"), mapped_to, _mesh("D23456")), + _quad(_mesh("D23456"), tree_number, _mesh("D23.123")), + _quad(_mesh("D23.123"), parent_tree, _mesh("D23")), + ] + for quad in triples: + store.add(quad) + return store + + +def _make_mesh_with_store(store: pyoxigraph.Store) -> Mesh: + """Construct a Mesh instance without loading a file, injecting a store.""" + obj = Mesh.__new__(Mesh) + obj.m = store + return obj + + +# --------------------------------------------------------------------------- +# Group 1: write_ids() parameter validation +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_write_ids_raises_when_both_scr_params_set(tmp_path): + outfile = str(tmp_path / "out.txt") + with pytest.raises(ValueError, match="cannot both be set"): + write_ids( + {}, + outfile, + scr_exclude_trees=["D05"], + scr_include_trees=["D12.776"], + ) + + +# --------------------------------------------------------------------------- +# Group 2: write_ids() filtering logic (Mesh mocked) +# --------------------------------------------------------------------------- + + +def _make_mock_mesh( + tree_map: dict[str, list[str]], + scr_chemical_terms: list[str], + scr_mapped_to_trees: dict[tuple, set[str]] | None = None, +) -> MagicMock: + """Return a configured MagicMock for the Mesh instance. + + tree_map: treenum → list of CURIE strings returned by get_terms_in_tree + scr_chemical_terms: list returned by get_terms_with_type("SCR_Chemical") + scr_mapped_to_trees: mapping of frozenset(top_treenums) → set of CURIEs + """ + mock = MagicMock() + mock.get_terms_in_tree.side_effect = lambda t: tree_map.get(t, []) + mock.get_terms_with_type.return_value = scr_chemical_terms + + if scr_mapped_to_trees is not None: + def _mapped(trees): + return scr_mapped_to_trees.get(tuple(sorted(trees)), set()) + mock.get_scr_terms_mapped_to_trees.side_effect = _mapped + + return mock + + +@pytest.mark.unit +@patch("src.datahandlers.mesh.Mesh") +def test_write_ids_scr_exclude_trees_removes_protein_scrs(mock_cls, tmp_path): + mock_mesh = _make_mock_mesh( + tree_map={"D02": ["MESH:D000002"]}, + scr_chemical_terms=["MESH:C000001", "MESH:C000002"], + scr_mapped_to_trees={("D05",): {"MESH:C000001"}}, + ) + mock_cls.return_value = mock_mesh + outfile = str(tmp_path / "out.txt") + + write_ids( + {"D02": CHEMICAL_ENTITY}, + outfile, + order=["EXCLUDE", CHEMICAL_ENTITY], + extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY}, + scr_exclude_trees=["D05"], + ) + + content = open(outfile).read() + assert "MESH:D000002" in content + assert "MESH:C000002" in content + assert "MESH:C000001" not in content + + +@pytest.mark.unit +@patch("src.datahandlers.mesh.Mesh") +def test_write_ids_scr_include_trees_keeps_only_protein_scrs(mock_cls, tmp_path): + include_trees = ["D05", "D08", "D12.776"] + mock_mesh = _make_mock_mesh( + tree_map={"D12.776": ["MESH:D000001"]}, + scr_chemical_terms=["MESH:C000001", "MESH:C000002"], + scr_mapped_to_trees={tuple(sorted(include_trees)): {"MESH:C000001"}}, + ) + mock_cls.return_value = mock_mesh + outfile = str(tmp_path / "out.txt") + + write_ids( + {"D12.776": PROTEIN}, + outfile, + order=[PROTEIN], + extra_vocab={"SCR_Chemical": PROTEIN}, + scr_include_trees=include_trees, + ) + + content = open(outfile).read() + assert "MESH:D000001" in content + assert "MESH:C000001" in content + assert "MESH:C000002" not in content + + +@pytest.mark.unit +@patch("src.datahandlers.mesh.Mesh") +def test_write_ids_default_includes_all_scr_terms(mock_cls, tmp_path): + mock_mesh = _make_mock_mesh( + tree_map={"D02": ["MESH:D000002"]}, + scr_chemical_terms=["MESH:C000001", "MESH:C000002"], + ) + mock_cls.return_value = mock_mesh + outfile = str(tmp_path / "out.txt") + + write_ids( + {"D02": CHEMICAL_ENTITY}, + outfile, + order=[CHEMICAL_ENTITY], + extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY}, + ) + + content = open(outfile).read() + assert "MESH:D000002" in content + assert "MESH:C000001" in content + assert "MESH:C000002" in content + mock_mesh.get_scr_terms_mapped_to_trees.assert_not_called() + + +@pytest.mark.unit +@patch("src.datahandlers.mesh.Mesh") +def test_write_ids_exclude_flag_suppresses_term_from_output(mock_cls, tmp_path): + mock_mesh = _make_mock_mesh( + tree_map={"D02": ["MESH:D000002"], "D05": ["MESH:D000005"]}, + scr_chemical_terms=["MESH:C000001", "MESH:C000002"], + scr_mapped_to_trees={("D05",): {"MESH:C000001"}}, + ) + mock_cls.return_value = mock_mesh + outfile = str(tmp_path / "out.txt") + + write_ids( + {"D02": CHEMICAL_ENTITY, "D05": "EXCLUDE"}, + outfile, + order=["EXCLUDE", CHEMICAL_ENTITY], + extra_vocab={"SCR_Chemical": CHEMICAL_ENTITY}, + scr_exclude_trees=["D05"], + ) + + content = open(outfile).read() + assert "MESH:D000002" in content + assert "MESH:C000002" in content + assert "MESH:D000005" not in content # in D05 tree → EXCLUDE + assert "MESH:C000001" not in content # mapped to D05 → EXCLUDE + + +# --------------------------------------------------------------------------- +# Group 3: get_scr_terms_mapped_to_trees() with inline pyoxigraph store +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def mesh_with_test_store(): + store = _make_test_store() + return _make_mesh_with_store(store) + + +@pytest.mark.unit +def test_get_scr_terms_mapped_to_trees_direct_mapping(mesh_with_test_store): + result = mesh_with_test_store.get_scr_terms_mapped_to_trees(["D12.776"]) + assert result == {"MESH:C000001"} + + +@pytest.mark.unit +def test_get_scr_terms_mapped_to_trees_transitive_parent(mesh_with_test_store): + result = mesh_with_test_store.get_scr_terms_mapped_to_trees(["D05"]) + assert result == {"MESH:C000002"} + + +@pytest.mark.unit +def test_get_scr_terms_mapped_to_trees_preferred_mapped_to(mesh_with_test_store): + # C000002 uses preferredMappedTo — covered by the D05 transitive test above, + # but explicitly confirmed here for clarity. + result = mesh_with_test_store.get_scr_terms_mapped_to_trees(["D05"]) + assert "MESH:C000002" in result + + +@pytest.mark.unit +def test_get_scr_terms_mapped_to_trees_multiple_top_treenums(mesh_with_test_store): + result = mesh_with_test_store.get_scr_terms_mapped_to_trees(["D12.776", "D05"]) + assert result == {"MESH:C000001", "MESH:C000002"} + assert "MESH:C000003" not in result + + +@pytest.mark.unit +def test_get_scr_terms_mapped_to_trees_no_match(mesh_with_test_store): + result = mesh_with_test_store.get_scr_terms_mapped_to_trees(["D01"]) + assert result == set() From b02addc7c13926fcbbdbeb994c4069b6d130278d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 01:41:32 -0400 Subject: [PATCH 07/30] Add pipeline tests for MeSH chemical/protein ID separation (issue #675) Creates tests/pipeline/ with a session-scoped fixture that loads babel_downloads/MESH/mesh.nt and runs both chemicals.write_mesh_ids() and protein.write_mesh_ids(), then asserts four invariants: non-empty outputs, zero ID overlap (the core correctness property), and no D05/D08/D12.776 descriptor terms in the chemicals output. Updates tests/README.md to document the new test file and retire the Out of Scope stubs. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 29 ++++++++----- tests/pipeline/__init__.py | 0 tests/pipeline/conftest.py | 38 +++++++++++++++++ tests/pipeline/test_mesh_pipeline.py | 64 ++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 10 deletions(-) create mode 100644 tests/pipeline/__init__.py create mode 100644 tests/pipeline/conftest.py create mode 100644 tests/pipeline/test_mesh_pipeline.py diff --git a/tests/README.md b/tests/README.md index 78b2c1e8..51413659 100644 --- a/tests/README.md +++ b/tests/README.md @@ -68,6 +68,19 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests (splitting attribute lists across multiple queries) produce the same results as single-query downloads, and checks TSV output correctness. Uses `tmp_path`. +- **`datahandlers/test_mesh.py`** (`unit`) — Unit tests for `src/datahandlers/mesh.py`. + Covers `write_ids()` parameter validation, SCR filtering logic (mock-based), and + `Mesh.get_scr_terms_mapped_to_trees()` using an inline pyoxigraph store. + +### Pipeline + +- **`pipeline/test_mesh_pipeline.py`** (`pipeline`) — End-to-end tests for MeSH + chemical/protein ID separation (issue #675). Requires `babel_downloads/MESH/mesh.nt`. + Four tests: (1) `chemicals.write_mesh_ids()` output is non-empty; (2) + `protein.write_mesh_ids()` output is non-empty; (3) the two outputs share no IDs + (the core correctness invariant); (4) the chemicals output excludes all D05/D08/D12.776 + descriptor terms, including "in-neither" subtrees like Polymers and Coenzymes. + ### Compendia - **`test_chemicals.py`** / **`test_uber.py`** (`network`, `xfail`) — Both test the @@ -169,15 +182,11 @@ Example tests: ## Out of Scope / Pipeline-only -### Tests requiring the full MeSH pipeline (out of scope for unit tests) +The MeSH pipeline tests that previously appeared here as stubs have been implemented in +`tests/pipeline/test_mesh_pipeline.py` (see the "Pipeline" subsection of "Test Files" above). -The following require loading the full MeSH RDF (~1–2 GB) and are only practical -as `pipeline` tests once `babel_downloads/MESH/mesh.nt` is pre-populated: +Run them with: -- **`chemicals.write_mesh_ids()` end-to-end** — Runs SPARQL over full MeSH; verify output - contains expected D01–D26 terms and excludes D05/D08/D12.776. -- **`protein.write_mesh_ids()` end-to-end** — Verify D12.776, D05.500, D08.811 subtrees - appear in output. -- **No-overlap invariant** — Assert chemicals and protein MeSH output files share no IDs. - This is the key correctness property of issue #675. -- **`protein_mesh_ids` Snakemake rule** — End-to-end pipeline rule test. +```bash +PYTHONPATH=. uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v +``` diff --git a/tests/pipeline/__init__.py b/tests/pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py new file mode 100644 index 00000000..0b645248 --- /dev/null +++ b/tests/pipeline/conftest.py @@ -0,0 +1,38 @@ +"""Session-scoped fixture for MeSH pipeline tests. + +Requires babel_downloads/MESH/mesh.nt to be pre-populated. +All tests in this package are marked `pipeline` and are skipped by default. +Run with: PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v +""" +import os + +import pytest + +from src.createcompendia import chemicals, protein +from src.datahandlers.mesh import Mesh + +MESH_NT = "babel_downloads/MESH/mesh.nt" + + +@pytest.fixture(scope="session") +def mesh_pipeline_outputs(tmp_path_factory): + if not os.path.exists(MESH_NT): + pytest.skip(f"{MESH_NT} not found; populate babel_downloads/ before running pipeline tests") + + outdir = tmp_path_factory.mktemp("mesh_ids") + chem_outfile = str(outdir / "chemical_MESH") + prot_outfile = str(outdir / "protein_MESH") + + chemicals.write_mesh_ids(chem_outfile) + protein.write_mesh_ids(prot_outfile) + + m = Mesh() + excluded_tree_terms = set() + for tree in ["D05", "D08", "D12.776"]: + excluded_tree_terms.update(m.get_terms_in_tree(tree)) + + return { + "chemicals": chem_outfile, + "protein": prot_outfile, + "excluded_tree_terms": excluded_tree_terms, + } diff --git a/tests/pipeline/test_mesh_pipeline.py b/tests/pipeline/test_mesh_pipeline.py new file mode 100644 index 00000000..1aa79833 --- /dev/null +++ b/tests/pipeline/test_mesh_pipeline.py @@ -0,0 +1,64 @@ +"""Pipeline tests for MeSH chemical/protein ID separation (issue #675). + +These tests verify that: +1. chemicals.write_mesh_ids() produces non-empty output +2. protein.write_mesh_ids() produces non-empty output +3. The two outputs share no IDs (the core correctness property of issue #675) +4. The chemicals output excludes all D05/D08/D12.776 descriptor terms + +All tests require babel_downloads/MESH/mesh.nt to be pre-populated and are +skipped by default. Run with: pytest tests/pipeline/ --pipeline --no-cov -v +""" +import pytest + + +def _read_ids(path: str) -> set[str]: + """Read a TSV output file and return the set of CURIEs (first column).""" + ids = set() + with open(path) as f: + for line in f: + line = line.strip() + if line: + ids.add(line.split("\t")[0]) + return ids + + +@pytest.mark.pipeline +def test_chemicals_mesh_ids_non_empty(mesh_pipeline_outputs): + chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) + assert len(chem_ids) > 0, "chemicals.write_mesh_ids() produced no output" + + +@pytest.mark.pipeline +def test_protein_mesh_ids_non_empty(mesh_pipeline_outputs): + prot_ids = _read_ids(mesh_pipeline_outputs["protein"]) + assert len(prot_ids) > 0, "protein.write_mesh_ids() produced no output" + + +@pytest.mark.pipeline +def test_no_overlap_between_chemicals_and_protein_mesh_ids(mesh_pipeline_outputs): + """Core correctness test for issue #675: no ID should appear in both outputs.""" + chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) + prot_ids = _read_ids(mesh_pipeline_outputs["protein"]) + overlap = chem_ids & prot_ids + assert len(overlap) == 0, ( + f"Found {len(overlap)} IDs in both chemicals and protein MeSH outputs: " + f"{sorted(overlap)[:10]}" + ) + + +@pytest.mark.pipeline +def test_chemicals_excludes_all_protein_descriptor_trees(mesh_pipeline_outputs): + """Chemicals output must not contain any D05/D08/D12.776 descriptor terms. + + This catches terms in "in-neither" subtrees (e.g. D05.750 Polymers, D08.211 + Coenzymes) that should be excluded from chemicals even though they are not + captured by protein.write_mesh_ids(). + """ + chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) + excluded_tree_terms = mesh_pipeline_outputs["excluded_tree_terms"] + overlap = chem_ids & excluded_tree_terms + assert len(overlap) == 0, ( + f"Found {len(overlap)} D05/D08/D12.776 descriptor terms in chemicals output: " + f"{sorted(overlap)[:10]}" + ) From 761bbedd8ad114c3adc344d47359188636c72769 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 01:52:12 -0400 Subject: [PATCH 08/30] Make pipeline tests self-downloading (issue #675) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the manual pre-population requirement with a `_download_or_skip` helper in `tests/pipeline/conftest.py`. The new layered fixture pattern splits each datasource into a download fixture (`mesh_nt`) and a processing fixture (`mesh_pipeline_outputs`) — pytest propagates skips automatically if the download fails. Update tests/README.md to document the new pattern and provide a template for adding future datasources. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 33 ++++++++++++++++++++++------- tests/pipeline/conftest.py | 43 +++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 15 deletions(-) diff --git a/tests/README.md b/tests/README.md index 51413659..ea665eb1 100644 --- a/tests/README.md +++ b/tests/README.md @@ -20,13 +20,13 @@ Tests are tagged with marks to control which subset runs in a given context: | `unit` | Pure functions, in-memory logic, small fixtures in `tests/data/` | No | Seconds | 30 s | | `network` | Requires live internet (FTP, SPARQL, BioMart, external APIs) | Yes | Seconds–minutes | 600 s | | `slow` | Correct but takes >30s — large fixture processing, SQLite spill, etc. | Sometimes | >30s | 600 s | -| `pipeline` | Invokes Snakemake rules; requires `babel_downloads/` to be pre-populated | Yes | Minutes–hours | 3600 s | +| `pipeline` | Invokes Snakemake rules; downloads prerequisite data automatically | Yes | Minutes–hours | 3600 s | ### Default behavior - `pytest` alone: runs `unit` and `slow` tests; skips `network` and `pipeline` - `pytest --network`: also runs `network` tests -- `pytest --pipeline`: also runs `pipeline` tests (ensure `babel_downloads/` exists first) +- `pytest --pipeline`: also runs `pipeline` tests (prerequisite data is downloaded automatically) - `pytest --all`: runs everything (equivalent to `--network --pipeline`) ### Convenience commands @@ -75,7 +75,8 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests ### Pipeline - **`pipeline/test_mesh_pipeline.py`** (`pipeline`) — End-to-end tests for MeSH - chemical/protein ID separation (issue #675). Requires `babel_downloads/MESH/mesh.nt`. + chemical/protein ID separation (issue #675). Downloads `babel_downloads/MESH/mesh.nt` + automatically if absent; skips gracefully if the download fails. Four tests: (1) `chemicals.write_mesh_ids()` output is non-empty; (2) `protein.write_mesh_ids()` output is non-empty; (3) the two outputs share no IDs (the core correctness invariant); (4) the chemicals output excludes all D05/D08/D12.776 @@ -162,11 +163,27 @@ asserts the output file is non-empty: ### New `pipeline` tests -Add a `tests/pipeline/` sub-package with a `snakemake_rule` session fixture (calls the Snakemake -Python API, inherits dependency resolution, returns the output directory). Requires -`babel_downloads/` to be pre-populated — document this in `tests/pipeline/README.md`. - -Example tests: +Pipeline tests follow a two-fixture layered pattern defined in `tests/pipeline/conftest.py`. +Each datasource gets one download fixture (which calls `_download_or_skip`) and each +compendium gets one processing fixture that depends on the download fixture. Pytest +propagates skips automatically — no plugins needed. + +To add a new datasource (e.g. ChEBI): + +1. Add a download fixture in `tests/pipeline/conftest.py`: + ```python + @pytest.fixture(scope="session") + def chebi_sdf(): + return _download_or_skip( + "ChEBI SDF", + pull_chebi, + make_local_name("ChEBI_complete.sdf", subpath="CHEBI"), + ) + ``` +2. Add a processing fixture that depends on `chebi_sdf`. +3. Create `tests/pipeline/test_chebi_pipeline.py` whose tests depend on the processing fixture. + +Example tests to add next: - **`test_pipeline_chemical.py`** — Runs `chemical_umls_ids` rule, checks output file exists and is non-empty diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 0b645248..a20852cf 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -1,6 +1,8 @@ -"""Session-scoped fixture for MeSH pipeline tests. +"""Session-scoped fixtures for pipeline tests. + +Pipeline tests download their own prerequisite files. If a download fails +(network down, auth error, etc.) all dependent tests are skipped automatically. -Requires babel_downloads/MESH/mesh.nt to be pre-populated. All tests in this package are marked `pipeline` and are skipped by default. Run with: PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v """ @@ -8,17 +10,44 @@ import pytest +from src.babel_utils import make_local_name from src.createcompendia import chemicals, protein -from src.datahandlers.mesh import Mesh +from src.datahandlers.mesh import Mesh, pull_mesh + + +def _download_or_skip(label: str, pull_fn, expected_path: str) -> str: + """Download expected_path via pull_fn() if absent; pytest.skip() on failure.""" + if not os.path.exists(expected_path): + try: + pull_fn() + except Exception as e: + pytest.skip(f"Could not download {label}: {e}") + return expected_path + -MESH_NT = "babel_downloads/MESH/mesh.nt" +# --------------------------------------------------------------------------- +# Per-datasource download fixtures (one per datahandler) +# --------------------------------------------------------------------------- @pytest.fixture(scope="session") -def mesh_pipeline_outputs(tmp_path_factory): - if not os.path.exists(MESH_NT): - pytest.skip(f"{MESH_NT} not found; populate babel_downloads/ before running pipeline tests") +def mesh_nt(): + """Download babel_downloads/MESH/mesh.nt, or skip if unavailable.""" + return _download_or_skip( + "MESH mesh.nt", + pull_mesh, + make_local_name("mesh.nt", subpath="MESH"), + ) + +# --------------------------------------------------------------------------- +# Per-compendium processing fixtures (depend on download fixtures above) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): + """Run chemicals/protein MeSH ID extraction; skip if mesh_nt unavailable.""" outdir = tmp_path_factory.mktemp("mesh_ids") chem_outfile = str(outdir / "chemical_MESH") prot_outfile = str(outdir / "protein_MESH") From 4e49ade3b79cb52615eb561d17bb99a53e14b986 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 02:00:14 -0400 Subject: [PATCH 09/30] Fixed Markdown file style. --- CLAUDE.md | 10 +++++----- tests/README.md | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3416ee3a..813a9d8f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -137,17 +137,17 @@ Gene+Protein and Drug+Chemical each have dedicated conflation modules (`geneprot - `babel_outputs/intermediate/` — intermediate build artifacts - `babel_outputs/` — final compendia, synonyms, reports, exports -# Running Babel +## Running Babel You may run `uv run snakemake -c all --rerun-incomplete [rulename]` to run a particular rule. When running a download step, it will be easier to run the job in Snakemake, but when running a rule that produces intermediate files, it might be easier to download the intermediate files from -https://stars.renci.org/var/babel/2025dec11/ (which is the `babel_output` folder from a run on a -high performance cluster) so you don't need to download all the source files and + (which is the `babel_output` folder from a run on a +high performance cluster) so you don't need to download all the source files and rerun the entire pipeline. You can look at the resource requirements of a rule to decide which -option would be best. +option would be best. -# Debugging +## Debugging When looking things up in the source databases, prefer to invoke the existing download code in this repository unless you suspect that it is incorrect, in which case use the existing code diff --git a/tests/README.md b/tests/README.md index ea665eb1..3cd2a7d5 100644 --- a/tests/README.md +++ b/tests/README.md @@ -171,6 +171,7 @@ propagates skips automatically — no plugins needed. To add a new datasource (e.g. ChEBI): 1. Add a download fixture in `tests/pipeline/conftest.py`: + ```python @pytest.fixture(scope="session") def chebi_sdf(): @@ -180,6 +181,7 @@ To add a new datasource (e.g. ChEBI): make_local_name("ChEBI_complete.sdf", subpath="CHEBI"), ) ``` + 2. Add a processing fixture that depends on `chebi_sdf`. 3. Create `tests/pipeline/test_chebi_pipeline.py` whose tests depend on the processing fixture. From 24c40d7aa303649469fa4c8b7cc1546344f387ad Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 02:00:28 -0400 Subject: [PATCH 10/30] Fixed ruff errors. --- tests/datahandlers/test_mesh.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/datahandlers/test_mesh.py b/tests/datahandlers/test_mesh.py index f7f2d8c5..fa0aa795 100644 --- a/tests/datahandlers/test_mesh.py +++ b/tests/datahandlers/test_mesh.py @@ -5,7 +5,6 @@ - write_ids() SCR filtering logic (mock-based) - Mesh.get_scr_terms_mapped_to_trees() (inline pyoxigraph store) """ -import io from unittest.mock import MagicMock, patch import pyoxigraph From 843d768df1ed1f2bed8f7c450a5f789955497e52 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 02:17:35 -0400 Subject: [PATCH 11/30] Add pipeline tests for UMLS identifier partitioning (issue #675 extension) Extend the self-downloading pipeline test framework to cover the seven compendia that call write_umls_ids() (chemicals, protein, anatomy, disease/phenotype, process/activity, taxon, gene). The new umls_rrf_files fixture checks for UMLS_API_KEY before attempting any download (since download_umls() calls exit(1) rather than raising on a missing key) and skips gracefully if the key is absent or the download fails. Nine tests: seven parametrized non-empty checks, one mutual-exclusivity assertion that no UMLS:CUI appears in more than one compendium, and one targeted test guarding against protein-tree IDs leaking into chemicals. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 9 +++ tests/pipeline/conftest.py | 72 ++++++++++++++++++++++++ tests/pipeline/test_umls_pipeline.py | 82 ++++++++++++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 tests/pipeline/test_umls_pipeline.py diff --git a/tests/README.md b/tests/README.md index 3cd2a7d5..0229017d 100644 --- a/tests/README.md +++ b/tests/README.md @@ -82,6 +82,15 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests (the core correctness invariant); (4) the chemicals output excludes all D05/D08/D12.776 descriptor terms, including "in-neither" subtrees like Polymers and Coenzymes. +- **`pipeline/test_umls_pipeline.py`** (`pipeline`) — End-to-end tests for UMLS identifier + partitioning across all seven compendia that call `write_umls_ids()` (chemicals, protein, + anatomy, disease/phenotype, process/activity, taxon, gene). Requires `UMLS_API_KEY` to be + set; downloads `babel_downloads/UMLS/MRCONSO.RRF` and `MRSTY.RRF` automatically if absent, + skipping gracefully if the download fails or the key is unset. Nine tests: seven + parametrized non-empty checks (one per compendium), one mutual-exclusivity test asserting + no UMLS:CUI appears in more than one compendium, and one targeted test confirming chemicals + excludes all protein-assigned UMLS IDs (semantic type tree A1.4.1.2.1.7). + ### Compendia - **`test_chemicals.py`** / **`test_uber.py`** (`network`, `xfail`) — Both test the diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index a20852cf..f983685b 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -65,3 +65,75 @@ def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): "protein": prot_outfile, "excluded_tree_terms": excluded_tree_terms, } + + +# --------------------------------------------------------------------------- +# UMLS download fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def umls_rrf_files(): + """Ensure MRCONSO.RRF and MRSTY.RRF are present; skip if UMLS_API_KEY unset or download fails.""" + if not os.environ.get("UMLS_API_KEY"): + pytest.skip("UMLS_API_KEY not set; cannot download UMLS files") + + mrconso = make_local_name("MRCONSO.RRF", subpath="UMLS") + mrsty = make_local_name("MRSTY.RRF", subpath="UMLS") + + if not os.path.exists(mrconso) or not os.path.exists(mrsty): + try: + from src.datahandlers import umls as umls_handler + from src.util import get_config + cfg = get_config() + umls_handler.download_umls( + cfg["umls_version"], + cfg["umls"]["subset"], + cfg["download_directory"] + "/UMLS", + ) + except Exception as e: + pytest.skip(f"Could not download UMLS: {e}") + + return {"mrconso": mrconso, "mrsty": mrsty} + + +# --------------------------------------------------------------------------- +# UMLS processing fixture (depends on umls_rrf_files) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def umls_pipeline_outputs(umls_rrf_files, tmp_path_factory): + """Run write_umls_ids for all seven compendia; returns dict of output paths.""" + from src.createcompendia import ( + anatomy, diseasephenotype, gene, + processactivitypathway, taxon, + ) + from src.util import get_config + + mrconso = umls_rrf_files["mrconso"] + mrsty = umls_rrf_files["mrsty"] + cfg = get_config() + badumlsfile = os.path.join(cfg["input_directory"], "badumls") + outdir = tmp_path_factory.mktemp("umls_ids") + + def out(name): + return str(outdir / f"{name}_UMLS") + + chemicals.write_umls_ids(mrsty, out("chemicals")) + protein.write_umls_ids(mrsty, out("protein")) + anatomy.write_umls_ids(mrsty, out("anatomy")) + diseasephenotype.write_umls_ids(mrsty, out("diseasephenotype"), badumlsfile) + processactivitypathway.write_umls_ids(mrsty, out("processactivity")) + taxon.write_umls_ids(mrsty, out("taxon")) + gene.write_umls_ids(mrconso, mrsty, out("gene")) + + return { + "chemicals": out("chemicals"), + "protein": out("protein"), + "anatomy": out("anatomy"), + "diseasephenotype": out("diseasephenotype"), + "processactivity": out("processactivity"), + "taxon": out("taxon"), + "gene": out("gene"), + } diff --git a/tests/pipeline/test_umls_pipeline.py b/tests/pipeline/test_umls_pipeline.py new file mode 100644 index 00000000..f8616e06 --- /dev/null +++ b/tests/pipeline/test_umls_pipeline.py @@ -0,0 +1,82 @@ +"""Pipeline tests for UMLS identifier partitioning across compendia (issue #675 extension). + +These tests verify that: +1. Each compendium's write_umls_ids() produces non-empty output. +2. No UMLS:CUI appears in more than one compendium output — the core invariant. +3. Chemicals excludes UMLS IDs assigned to the protein compendium + (semantic type tree A1.4.1.2.1.7, Amino Acid/Peptide/Protein). + +All tests require UMLS_API_KEY to be set and network access for the initial +download; they are skipped by default. Run with: + PYTHONPATH=. uv run pytest tests/pipeline/test_umls_pipeline.py --pipeline --no-cov -v +""" +import pytest + + +COMPENDIUM_NAMES = [ + "chemicals", + "protein", + "anatomy", + "diseasephenotype", + "processactivity", + "taxon", + "gene", +] + + +def _read_ids(path: str) -> set[str]: + """Read a TSV output file and return the set of CURIEs (first column).""" + ids = set() + with open(path) as f: + for line in f: + line = line.strip() + if line: + ids.add(line.split("\t")[0]) + return ids + + +@pytest.mark.pipeline +@pytest.mark.parametrize("name", COMPENDIUM_NAMES) +def test_umls_ids_non_empty(umls_pipeline_outputs, name): + """Each compendium's write_umls_ids() must produce at least one identifier.""" + ids = _read_ids(umls_pipeline_outputs[name]) + assert len(ids) > 0, f"{name}.write_umls_ids() produced no output" + + +@pytest.mark.pipeline +def test_no_umls_id_in_multiple_compendia(umls_pipeline_outputs): + """Core correctness test: no UMLS:CUI may appear in more than one compendium. + + If the same UMLS ID lands in two compendia, Node Normalization will see a + duplicate and normalization will be ambiguous or incorrect. + """ + seen = {} # id -> first compendium name + duplicates = {} # id -> list of all compendia it appeared in + + for name in COMPENDIUM_NAMES: + for id_ in _read_ids(umls_pipeline_outputs[name]): + if id_ in seen: + duplicates.setdefault(id_, [seen[id_]]).append(name) + else: + seen[id_] = name + + assert len(duplicates) == 0, ( + f"Found {len(duplicates)} UMLS IDs in multiple compendia: " + f"{dict(list(duplicates.items())[:5])}" + ) + + +@pytest.mark.pipeline +def test_chemicals_excludes_protein_semantic_tree(umls_pipeline_outputs): + """Chemicals must not contain any UMLS IDs that the protein compendium claimed. + + Guards against amino-acid/peptide/protein entries (semantic type tree + A1.4.1.2.1.7) leaking into the chemical compendium. + """ + chem_ids = _read_ids(umls_pipeline_outputs["chemicals"]) + prot_ids = _read_ids(umls_pipeline_outputs["protein"]) + overlap = chem_ids & prot_ids + assert len(overlap) == 0, ( + f"Found {len(overlap)} IDs in both chemicals and protein UMLS outputs: " + f"{sorted(overlap)[:10]}" + ) From fd78b731870d4f42e7b298ca0c6f3086052a0de6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 02:36:27 -0400 Subject: [PATCH 12/30] Fixed formatting using ruff. --- tests/pipeline/conftest.py | 7 +++++-- tests/pipeline/test_umls_pipeline.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index f983685b..9ddffce2 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -106,8 +106,11 @@ def umls_rrf_files(): def umls_pipeline_outputs(umls_rrf_files, tmp_path_factory): """Run write_umls_ids for all seven compendia; returns dict of output paths.""" from src.createcompendia import ( - anatomy, diseasephenotype, gene, - processactivitypathway, taxon, + anatomy, + diseasephenotype, + gene, + processactivitypathway, + taxon, ) from src.util import get_config diff --git a/tests/pipeline/test_umls_pipeline.py b/tests/pipeline/test_umls_pipeline.py index f8616e06..c2f98c05 100644 --- a/tests/pipeline/test_umls_pipeline.py +++ b/tests/pipeline/test_umls_pipeline.py @@ -12,7 +12,6 @@ """ import pytest - COMPENDIUM_NAMES = [ "chemicals", "protein", From 1e613d9e105c4bac866e32378f9fac56cec701e0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 9 Mar 2026 02:47:38 -0400 Subject: [PATCH 13/30] Fix UMLS fixture skipping when files are already cached The API key check was unconditional, so tests would skip even when MRCONSO.RRF and MRSTY.RRF were already present in babel_downloads/. Move the UMLS_API_KEY check inside the "files are missing" branch so that cached runs proceed without needing the key in the environment. Co-Authored-By: Claude Sonnet 4.6 --- tests/pipeline/conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 9ddffce2..b0e19c8d 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -74,14 +74,14 @@ def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): @pytest.fixture(scope="session") def umls_rrf_files(): - """Ensure MRCONSO.RRF and MRSTY.RRF are present; skip if UMLS_API_KEY unset or download fails.""" - if not os.environ.get("UMLS_API_KEY"): - pytest.skip("UMLS_API_KEY not set; cannot download UMLS files") - + """Ensure MRCONSO.RRF and MRSTY.RRF are present; skip if files are absent and download fails.""" mrconso = make_local_name("MRCONSO.RRF", subpath="UMLS") mrsty = make_local_name("MRSTY.RRF", subpath="UMLS") if not os.path.exists(mrconso) or not os.path.exists(mrsty): + # UMLS_API_KEY is only required when the files are not yet cached. + if not os.environ.get("UMLS_API_KEY"): + pytest.skip("UMLS_API_KEY not set and UMLS files not cached; cannot download UMLS files") try: from src.datahandlers import umls as umls_handler from src.util import get_config From fd48f2b5fe6178ed6fe708235800d4e88ca1ae8e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 17:57:15 -0400 Subject: [PATCH 14/30] Eliminate some exit(1) with RuntimeErrors. --- src/datahandlers/umls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 1d3c5f0b..308a9f28 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -307,9 +307,7 @@ def download_umls(umls_version, umls_subset, download_dir): """ umls_api_key = os.environ.get("UMLS_API_KEY") if not umls_api_key: - print("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.") - print("See instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html") - exit(1) + raise RuntimeError("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.\nSee instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html") # Check umls_subset. if umls_subset not in ["full", "level-0"]: @@ -325,8 +323,7 @@ def download_umls(umls_version, umls_subset, download_dir): stream=True, ) if not req.ok: - print(f"Unable to download UMLS from {umls_url}: {req}") - exit(1) + raise RuntimeError(f"Unable to download UMLS from {umls_url}: {req}") # Write file to {download_dir}/umls-{umls_version}-metathesaurus-full.zip logging.info(f"Downloading {filename} to {download_dir}") From 5b9da7575e8525fe328d9b466a3e3f6ccb77422b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 18:26:09 -0400 Subject: [PATCH 15/30] Generalize pipeline tests to all multi-compendium vocabularies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-vocabulary boilerplate approach with a VOCABULARY_REGISTRY in conftest.py and a single test_vocabulary_partitioning.py that runs non-empty and mutual-exclusivity tests for every registered vocabulary. Changes: - conftest.py: add _read_ids helper; extend mesh_pipeline_outputs to all five MeSH compendia (anatomy, diseasephenotype, taxon added); add omim_mim2gene + omim_pipeline_outputs fixtures; add ubergraph_connection fixture shared by NCIT and GO; add ncit_pipeline_outputs and go_pipeline_outputs fixtures; add VOCABULARY_REGISTRY and vocab_outputs parametrized fixture using request.getfixturevalue() - test_vocabulary_partitioning.py: new file — 10 tests (5 vocabs × 2) automatically updated when VOCABULARY_REGISTRY grows - test_mesh_pipeline.py: strip to single MeSH-specific tree-exclusion test - test_umls_pipeline.py: strip to single UMLS-specific protein-tree test - tests/README.md: update Pipeline section and Future Plans To add a new vocabulary in future: add download + processing fixtures to conftest.py and one entry in VOCABULARY_REGISTRY. No new test file needed. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 78 +++---- tests/pipeline/conftest.py | 209 +++++++++++++++--- tests/pipeline/test_mesh_pipeline.py | 54 +---- tests/pipeline/test_umls_pipeline.py | 73 ++---- .../pipeline/test_vocabulary_partitioning.py | 55 +++++ 5 files changed, 298 insertions(+), 171 deletions(-) create mode 100644 tests/pipeline/test_vocabulary_partitioning.py diff --git a/tests/README.md b/tests/README.md index 0229017d..da0ab60a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -74,22 +74,24 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests ### Pipeline -- **`pipeline/test_mesh_pipeline.py`** (`pipeline`) — End-to-end tests for MeSH - chemical/protein ID separation (issue #675). Downloads `babel_downloads/MESH/mesh.nt` - automatically if absent; skips gracefully if the download fails. - Four tests: (1) `chemicals.write_mesh_ids()` output is non-empty; (2) - `protein.write_mesh_ids()` output is non-empty; (3) the two outputs share no IDs - (the core correctness invariant); (4) the chemicals output excludes all D05/D08/D12.776 - descriptor terms, including "in-neither" subtrees like Polymers and Coenzymes. - -- **`pipeline/test_umls_pipeline.py`** (`pipeline`) — End-to-end tests for UMLS identifier - partitioning across all seven compendia that call `write_umls_ids()` (chemicals, protein, - anatomy, disease/phenotype, process/activity, taxon, gene). Requires `UMLS_API_KEY` to be - set; downloads `babel_downloads/UMLS/MRCONSO.RRF` and `MRSTY.RRF` automatically if absent, - skipping gracefully if the download fails or the key is unset. Nine tests: seven - parametrized non-empty checks (one per compendium), one mutual-exclusivity test asserting - no UMLS:CUI appears in more than one compendium, and one targeted test confirming chemicals - excludes all protein-assigned UMLS IDs (semantic type tree A1.4.1.2.1.7). +- **`pipeline/test_vocabulary_partitioning.py`** (`pipeline`) — Generic mutual-exclusivity + tests parametrized over all registered vocabularies. For each vocabulary, verifies that + (1) every compendium's `write_X_ids()` produces non-empty output and (2) no identifier + appears in more than one compendium. Currently covers five vocabularies: MESH (5 + compendia), UMLS (7 compendia), OMIM (2 compendia), NCIT (2 compendia via UberGraph), + GO (2 compendia via UberGraph). Adding a new vocabulary requires only adding its fixtures + to `conftest.py` and one entry in `VOCABULARY_REGISTRY` — this file never changes. + +- **`pipeline/test_mesh_pipeline.py`** (`pipeline`) — MeSH-specific targeted test (issue + #675). Downloads `babel_downloads/MESH/mesh.nt` automatically if absent. One test: + chemicals output must exclude all D05/D08/D12.776 descriptor terms, including + "in-neither" subtrees like Polymers and Coenzymes, even though these are not captured + by `protein.write_mesh_ids()`. + +- **`pipeline/test_umls_pipeline.py`** (`pipeline`) — UMLS-specific targeted test. Requires + `UMLS_API_KEY` for the initial download (or cached files). One test: chemicals must not + contain any UMLS IDs that the protein compendium claimed (semantic type tree + A1.4.1.2.1.7, Amino Acid/Peptide/Protein). ### Compendia @@ -172,33 +174,31 @@ asserts the output file is non-empty: ### New `pipeline` tests -Pipeline tests follow a two-fixture layered pattern defined in `tests/pipeline/conftest.py`. -Each datasource gets one download fixture (which calls `_download_or_skip`) and each -compendium gets one processing fixture that depends on the download fixture. Pytest -propagates skips automatically — no plugins needed. +The vocabulary-partitioning framework in `tests/pipeline/conftest.py` makes adding +new multi-compendium vocabularies straightforward. Each vocabulary needs: -To add a new datasource (e.g. ChEBI): +1. A **download/connectivity fixture** in `conftest.py` — either a file download using + `_download_or_skip`, a credential-checked download (like UMLS), or a network health + check (like `ubergraph_connection` for NCIT/GO). -1. Add a download fixture in `tests/pipeline/conftest.py`: +2. A **processing fixture** in `conftest.py` that calls all `write_X_ids()` functions for + that vocabulary and returns a `{compendium_name: output_path}` dict. - ```python - @pytest.fixture(scope="session") - def chebi_sdf(): - return _download_or_skip( - "ChEBI SDF", - pull_chebi, - make_local_name("ChEBI_complete.sdf", subpath="CHEBI"), - ) - ``` +3. **One line** in `VOCABULARY_REGISTRY`: `"MYVOCAB": "my_vocab_pipeline_outputs"`. -2. Add a processing fixture that depends on `chebi_sdf`. -3. Create `tests/pipeline/test_chebi_pipeline.py` whose tests depend on the processing fixture. +No new test file is needed for the standard non-empty and mutual-exclusivity checks — +`test_vocabulary_partitioning.py` picks them up automatically. Add a +`test_X_pipeline.py` only for vocabulary-specific targeted assertions. -Example tests to add next: +Vocabularies not yet covered (candidates): -- **`test_pipeline_chemical.py`** — Runs `chemical_umls_ids` rule, checks output file exists and - is non-empty -- **`test_pipeline_gene.py`** — Runs the gene sub-pipeline, checks `NCBIGene.txt` compendium +- **ENSEMBL** — appears in protein (`write_ensembl_protein_ids`) and gene + (`write_ensembl_gene_ids`). Deferred because the download uses BioMart + (`pull_ensembl(ensembl_dir, complete_file, ...)`) which is more complex to invoke + outside Snakemake. +- **NCBI Gene / HGNC / other single-source** — vocabularies that appear in only one + compendium don't need mutual-exclusivity tests, but could still get non-empty checks + in a per-compendium ETL test (see "New `network + slow` ETL tests" above). ### Deduplication / cleanup @@ -210,8 +210,8 @@ Example tests to add next: ## Out of Scope / Pipeline-only -The MeSH pipeline tests that previously appeared here as stubs have been implemented in -`tests/pipeline/test_mesh_pipeline.py` (see the "Pipeline" subsection of "Test Files" above). +The pipeline tests live in `tests/pipeline/`. See the "Pipeline" subsection of "Test Files" +and "New pipeline tests" in Future Plans for the current coverage and how to extend it. Run them with: diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index b0e19c8d..4a969e39 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -1,19 +1,47 @@ """Session-scoped fixtures for pipeline tests. -Pipeline tests download their own prerequisite files. If a download fails -(network down, auth error, etc.) all dependent tests are skipped automatically. +Pipeline tests download their own prerequisite files (or check network connectivity). +If a prerequisite is unavailable, all dependent tests are skipped automatically. All tests in this package are marked `pipeline` and are skipped by default. Run with: PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v + +## Adding a new vocabulary + +1. Add a download/connectivity fixture (e.g. `my_vocab_source`) to this file. +2. Add a processing fixture (e.g. `my_vocab_pipeline_outputs`) that depends on it and + returns a dict of {compendium_name: output_path}. +3. Add one entry to VOCABULARY_REGISTRY: `"MYVOCAB": "my_vocab_pipeline_outputs"`. +That's it — test_vocabulary_partitioning.py picks it up automatically. """ import os import pytest from src.babel_utils import make_local_name -from src.createcompendia import chemicals, protein +from src.createcompendia import anatomy, chemicals, diseasephenotype, protein, taxon from src.datahandlers.mesh import Mesh, pull_mesh +# --------------------------------------------------------------------------- +# Shared helper (not a fixture — importable by test files) +# --------------------------------------------------------------------------- + + +def _read_ids(path: str) -> set[str]: + """Return the set of CURIEs (first column) from a TSV output file.""" + ids = set() + with open(path) as f: + for line in f: + line = line.strip() + if line: + ids.add(line.split("\t")[0]) + return ids + + +# --------------------------------------------------------------------------- +# Generic download helper +# --------------------------------------------------------------------------- + def _download_or_skip(label: str, pull_fn, expected_path: str) -> str: """Download expected_path via pull_fn() if absent; pytest.skip() on failure.""" @@ -26,7 +54,7 @@ def _download_or_skip(label: str, pull_fn, expected_path: str) -> str: # --------------------------------------------------------------------------- -# Per-datasource download fixtures (one per datahandler) +# MESH download + processing fixtures # --------------------------------------------------------------------------- @@ -40,20 +68,19 @@ def mesh_nt(): ) -# --------------------------------------------------------------------------- -# Per-compendium processing fixtures (depend on download fixtures above) -# --------------------------------------------------------------------------- - - @pytest.fixture(scope="session") def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): - """Run chemicals/protein MeSH ID extraction; skip if mesh_nt unavailable.""" + """Run write_mesh_ids for all five compendia; skip if mesh_nt unavailable.""" outdir = tmp_path_factory.mktemp("mesh_ids") - chem_outfile = str(outdir / "chemical_MESH") - prot_outfile = str(outdir / "protein_MESH") - chemicals.write_mesh_ids(chem_outfile) - protein.write_mesh_ids(prot_outfile) + def out(name): + return str(outdir / f"{name}_MESH") + + chemicals.write_mesh_ids(out("chemicals")) + protein.write_mesh_ids(out("protein")) + anatomy.write_mesh_ids(out("anatomy")) + diseasephenotype.write_mesh_ids(out("diseasephenotype")) + taxon.write_mesh_ids(out("taxon")) m = Mesh() excluded_tree_terms = set() @@ -61,14 +88,17 @@ def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): excluded_tree_terms.update(m.get_terms_in_tree(tree)) return { - "chemicals": chem_outfile, - "protein": prot_outfile, + "chemicals": out("chemicals"), + "protein": out("protein"), + "anatomy": out("anatomy"), + "diseasephenotype": out("diseasephenotype"), + "taxon": out("taxon"), "excluded_tree_terms": excluded_tree_terms, } # --------------------------------------------------------------------------- -# UMLS download fixture +# UMLS download + processing fixtures # --------------------------------------------------------------------------- @@ -97,21 +127,10 @@ def umls_rrf_files(): return {"mrconso": mrconso, "mrsty": mrsty} -# --------------------------------------------------------------------------- -# UMLS processing fixture (depends on umls_rrf_files) -# --------------------------------------------------------------------------- - - @pytest.fixture(scope="session") def umls_pipeline_outputs(umls_rrf_files, tmp_path_factory): """Run write_umls_ids for all seven compendia; returns dict of output paths.""" - from src.createcompendia import ( - anatomy, - diseasephenotype, - gene, - processactivitypathway, - taxon, - ) + from src.createcompendia import gene, processactivitypathway from src.util import get_config mrconso = umls_rrf_files["mrconso"] @@ -140,3 +159,135 @@ def out(name): "taxon": out("taxon"), "gene": out("gene"), } + + +# --------------------------------------------------------------------------- +# OMIM download + processing fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def omim_mim2gene(): + """Download babel_downloads/OMIM/mim2gene.txt, or skip if unavailable.""" + from src.datahandlers.omim import pull_omim + return _download_or_skip( + "OMIM mim2gene.txt", + pull_omim, + make_local_name("mim2gene.txt", subpath="OMIM"), + ) + + +@pytest.fixture(scope="session") +def omim_pipeline_outputs(omim_mim2gene, tmp_path_factory): + """Run write_omim_ids for disease/phenotype and gene; skip if mim2gene.txt unavailable.""" + from src.createcompendia import gene + infile = omim_mim2gene + outdir = tmp_path_factory.mktemp("omim_ids") + + def out(name): + return str(outdir / f"{name}_OMIM") + + diseasephenotype.write_omim_ids(infile, out("diseasephenotype")) + gene.write_omim_ids(infile, out("gene")) + + return { + "diseasephenotype": out("diseasephenotype"), + "gene": out("gene"), + } + + +# --------------------------------------------------------------------------- +# UberGraph connectivity fixture (shared prerequisite for NCIT and GO) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def ubergraph_connection(): + """Verify the UberGraph SPARQL endpoint is reachable; skip if not. + + NCIT and GO use write_obo_ids() which queries UberGraph live — there is no + local file to download. This fixture acts as the prerequisite check that + both ncit_pipeline_outputs and go_pipeline_outputs depend on. + """ + try: + from src.datahandlers.obo import UberGraph + ug = UberGraph() + # Minimal health check: GO:0005575 (cellular component) is a small, stable term. + result = ug.get_subclasses_of("GO:0005575") + if not result: + pytest.skip("UberGraph returned empty result; endpoint may be unavailable") + except Exception as e: + pytest.skip(f"UberGraph not accessible: {e}") + + +# --------------------------------------------------------------------------- +# NCIT processing fixture (uses UberGraph, no file download) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def ncit_pipeline_outputs(ubergraph_connection, tmp_path_factory): + """Run write_ncit_ids for anatomy and disease/phenotype via UberGraph.""" + outdir = tmp_path_factory.mktemp("ncit_ids") + + def out(name): + return str(outdir / f"{name}_NCIT") + + anatomy.write_ncit_ids(out("anatomy")) + diseasephenotype.write_ncit_ids(out("diseasephenotype")) + + return { + "anatomy": out("anatomy"), + "diseasephenotype": out("diseasephenotype"), + } + + +# --------------------------------------------------------------------------- +# GO processing fixture (uses UberGraph, no file download) +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def go_pipeline_outputs(ubergraph_connection, tmp_path_factory): + """Run write_go_ids for anatomy and process/activity/pathway via UberGraph.""" + from src.createcompendia import processactivitypathway + outdir = tmp_path_factory.mktemp("go_ids") + + def out(name): + return str(outdir / f"{name}_GO") + + anatomy.write_go_ids(out("anatomy")) + processactivitypathway.write_go_ids(out("processactivity")) + + return { + "anatomy": out("anatomy"), + "processactivity": out("processactivity"), + } + + +# --------------------------------------------------------------------------- +# Vocabulary registry + parametrized fixture for test_vocabulary_partitioning.py +# --------------------------------------------------------------------------- + +# Maps vocabulary name → processing fixture name. +# Each processing fixture returns a {compendium_name: output_path} dict. +# To add a new vocabulary: add its fixtures above, then add one line here. +VOCABULARY_REGISTRY = { + "MESH": "mesh_pipeline_outputs", + "UMLS": "umls_pipeline_outputs", + "OMIM": "omim_pipeline_outputs", + "NCIT": "ncit_pipeline_outputs", + "GO": "go_pipeline_outputs", +} + + +@pytest.fixture(scope="session", params=list(VOCABULARY_REGISTRY.keys())) +def vocab_outputs(request): + """Parametrized fixture yielding (vocab_name, outputs_dict) for each registered vocabulary. + + Uses request.getfixturevalue() so only the fixture for the current vocabulary is + instantiated — a skip in UMLS does not prevent OMIM or MESH from running. + """ + fixture_name = VOCABULARY_REGISTRY[request.param] + outputs = request.getfixturevalue(fixture_name) + return request.param, outputs diff --git a/tests/pipeline/test_mesh_pipeline.py b/tests/pipeline/test_mesh_pipeline.py index 1aa79833..3b882321 100644 --- a/tests/pipeline/test_mesh_pipeline.py +++ b/tests/pipeline/test_mesh_pipeline.py @@ -1,50 +1,16 @@ -"""Pipeline tests for MeSH chemical/protein ID separation (issue #675). +"""MeSH-specific pipeline tests (issue #675). -These tests verify that: -1. chemicals.write_mesh_ids() produces non-empty output -2. protein.write_mesh_ids() produces non-empty output -3. The two outputs share no IDs (the core correctness property of issue #675) -4. The chemicals output excludes all D05/D08/D12.776 descriptor terms +The generic non-empty and mutual-exclusivity tests for all five MeSH compendia +(chemicals, protein, anatomy, disease/phenotype, taxon) are in +test_vocabulary_partitioning.py. This file contains only the MeSH-specific +targeted test that has no generic equivalent. -All tests require babel_downloads/MESH/mesh.nt to be pre-populated and are -skipped by default. Run with: pytest tests/pipeline/ --pipeline --no-cov -v +All tests are skipped by default. Run with: + PYTHONPATH=. uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v """ import pytest - -def _read_ids(path: str) -> set[str]: - """Read a TSV output file and return the set of CURIEs (first column).""" - ids = set() - with open(path) as f: - for line in f: - line = line.strip() - if line: - ids.add(line.split("\t")[0]) - return ids - - -@pytest.mark.pipeline -def test_chemicals_mesh_ids_non_empty(mesh_pipeline_outputs): - chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) - assert len(chem_ids) > 0, "chemicals.write_mesh_ids() produced no output" - - -@pytest.mark.pipeline -def test_protein_mesh_ids_non_empty(mesh_pipeline_outputs): - prot_ids = _read_ids(mesh_pipeline_outputs["protein"]) - assert len(prot_ids) > 0, "protein.write_mesh_ids() produced no output" - - -@pytest.mark.pipeline -def test_no_overlap_between_chemicals_and_protein_mesh_ids(mesh_pipeline_outputs): - """Core correctness test for issue #675: no ID should appear in both outputs.""" - chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) - prot_ids = _read_ids(mesh_pipeline_outputs["protein"]) - overlap = chem_ids & prot_ids - assert len(overlap) == 0, ( - f"Found {len(overlap)} IDs in both chemicals and protein MeSH outputs: " - f"{sorted(overlap)[:10]}" - ) +from tests.pipeline.conftest import _read_ids @pytest.mark.pipeline @@ -53,7 +19,9 @@ def test_chemicals_excludes_all_protein_descriptor_trees(mesh_pipeline_outputs): This catches terms in "in-neither" subtrees (e.g. D05.750 Polymers, D08.211 Coenzymes) that should be excluded from chemicals even though they are not - captured by protein.write_mesh_ids(). + captured by protein.write_mesh_ids(). The mutual-exclusivity test in + test_vocabulary_partitioning.py only checks pairwise overlap between + compendia; this test checks exclusion against the full tree. """ chem_ids = _read_ids(mesh_pipeline_outputs["chemicals"]) excluded_tree_terms = mesh_pipeline_outputs["excluded_tree_terms"] diff --git a/tests/pipeline/test_umls_pipeline.py b/tests/pipeline/test_umls_pipeline.py index c2f98c05..0bd81038 100644 --- a/tests/pipeline/test_umls_pipeline.py +++ b/tests/pipeline/test_umls_pipeline.py @@ -1,68 +1,18 @@ -"""Pipeline tests for UMLS identifier partitioning across compendia (issue #675 extension). +"""UMLS-specific pipeline tests (issue #675 extension). -These tests verify that: -1. Each compendium's write_umls_ids() produces non-empty output. -2. No UMLS:CUI appears in more than one compendium output — the core invariant. -3. Chemicals excludes UMLS IDs assigned to the protein compendium - (semantic type tree A1.4.1.2.1.7, Amino Acid/Peptide/Protein). +The generic non-empty and mutual-exclusivity tests for all seven UMLS compendia +(chemicals, protein, anatomy, disease/phenotype, process/activity, taxon, gene) +are in test_vocabulary_partitioning.py. This file contains only the +UMLS-specific targeted test that has no generic equivalent. -All tests require UMLS_API_KEY to be set and network access for the initial -download; they are skipped by default. Run with: +All tests require UMLS_API_KEY to be set for the initial download (or the +files to already be cached in babel_downloads/UMLS/). They are skipped by +default. Run with: PYTHONPATH=. uv run pytest tests/pipeline/test_umls_pipeline.py --pipeline --no-cov -v """ import pytest -COMPENDIUM_NAMES = [ - "chemicals", - "protein", - "anatomy", - "diseasephenotype", - "processactivity", - "taxon", - "gene", -] - - -def _read_ids(path: str) -> set[str]: - """Read a TSV output file and return the set of CURIEs (first column).""" - ids = set() - with open(path) as f: - for line in f: - line = line.strip() - if line: - ids.add(line.split("\t")[0]) - return ids - - -@pytest.mark.pipeline -@pytest.mark.parametrize("name", COMPENDIUM_NAMES) -def test_umls_ids_non_empty(umls_pipeline_outputs, name): - """Each compendium's write_umls_ids() must produce at least one identifier.""" - ids = _read_ids(umls_pipeline_outputs[name]) - assert len(ids) > 0, f"{name}.write_umls_ids() produced no output" - - -@pytest.mark.pipeline -def test_no_umls_id_in_multiple_compendia(umls_pipeline_outputs): - """Core correctness test: no UMLS:CUI may appear in more than one compendium. - - If the same UMLS ID lands in two compendia, Node Normalization will see a - duplicate and normalization will be ambiguous or incorrect. - """ - seen = {} # id -> first compendium name - duplicates = {} # id -> list of all compendia it appeared in - - for name in COMPENDIUM_NAMES: - for id_ in _read_ids(umls_pipeline_outputs[name]): - if id_ in seen: - duplicates.setdefault(id_, [seen[id_]]).append(name) - else: - seen[id_] = name - - assert len(duplicates) == 0, ( - f"Found {len(duplicates)} UMLS IDs in multiple compendia: " - f"{dict(list(duplicates.items())[:5])}" - ) +from tests.pipeline.conftest import _read_ids @pytest.mark.pipeline @@ -70,7 +20,10 @@ def test_chemicals_excludes_protein_semantic_tree(umls_pipeline_outputs): """Chemicals must not contain any UMLS IDs that the protein compendium claimed. Guards against amino-acid/peptide/protein entries (semantic type tree - A1.4.1.2.1.7) leaking into the chemical compendium. + A1.4.1.2.1.7) leaking into the chemical compendium. The mutual-exclusivity + test in test_vocabulary_partitioning.py also catches this, but this test + names the specific semantic-tree invariant explicitly so a failure message + is immediately actionable. """ chem_ids = _read_ids(umls_pipeline_outputs["chemicals"]) prot_ids = _read_ids(umls_pipeline_outputs["protein"]) diff --git a/tests/pipeline/test_vocabulary_partitioning.py b/tests/pipeline/test_vocabulary_partitioning.py new file mode 100644 index 00000000..d129cec3 --- /dev/null +++ b/tests/pipeline/test_vocabulary_partitioning.py @@ -0,0 +1,55 @@ +"""Generic vocabulary-partitioning tests for all entries in VOCABULARY_REGISTRY. + +For every registered vocabulary, two tests run automatically: + + test_ids_non_empty + Each compendium's write_X_ids() must produce at least one identifier. + + test_no_id_in_multiple_compendia + No identifier may appear in more than one compendium's output — the core + correctness invariant. If the same ID lands in two compendia, Node + Normalization will see a duplicate and normalization will be ambiguous. + +Vocabulary-specific targeted tests (e.g. MeSH tree-exclusion, UMLS protein- +semantic-tree guard) live in test_mesh_pipeline.py / test_umls_pipeline.py. + +To add a new vocabulary, add its fixtures to conftest.py and one entry in +VOCABULARY_REGISTRY. This file never needs to change. + +All tests are skipped by default. Run with: + PYTHONPATH=. uv run pytest tests/pipeline/test_vocabulary_partitioning.py --pipeline --no-cov -v +Run a single vocabulary: + PYTHONPATH=. uv run pytest tests/pipeline/test_vocabulary_partitioning.py --pipeline --no-cov -v -k MESH +""" +import pytest + +from tests.pipeline.conftest import _read_ids + + +@pytest.mark.pipeline +def test_ids_non_empty(vocab_outputs): + """Every compendium in this vocabulary must produce at least one identifier.""" + vocab, outputs = vocab_outputs + # outputs is a dict; "excluded_tree_terms" (MESH-specific) is not an output path. + paths = {name: path for name, path in outputs.items() if isinstance(path, str)} + empty = [name for name, path in paths.items() if not _read_ids(path)] + assert not empty, f"{vocab}: these compendia produced no output: {empty}" + + +@pytest.mark.pipeline +def test_no_id_in_multiple_compendia(vocab_outputs): + """No identifier may appear in more than one compendium for this vocabulary.""" + vocab, outputs = vocab_outputs + paths = {name: path for name, path in outputs.items() if isinstance(path, str)} + seen = {} # id -> first compendium name + duplicates = {} # id -> list of all compendia it appeared in + for name, path in paths.items(): + for id_ in _read_ids(path): + if id_ in seen: + duplicates.setdefault(id_, [seen[id_]]).append(name) + else: + seen[id_] = name + assert not duplicates, ( + f"{vocab}: found {len(duplicates)} IDs in multiple compendia: " + f"{dict(list(duplicates.items())[:5])}" + ) From b7574a83959328c1dd23c848c430a71202ff68e1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 19:47:31 -0400 Subject: [PATCH 16/30] Use config.yaml intermediate directory in pipeline tests; add --regenerate Processing fixtures previously wrote ID files to pytest's tmp_path_factory, a throwaway directory recreated on every run. Large write_X_ids() calls (UMLS reads 3+ GB of RRF files) would re-run needlessly on repeated runs. Changes: - tests/pipeline/conftest.py: add _intermediate_id_path() which resolves {intermediate_directory}/{semantic_type}/ids/{vocab}, matching Snakemake's exact paths so a prior full pipeline run can be reused directly. Add _maybe_run() which calls write_X_ids() only when the output file is absent or --regenerate is passed. Add regenerate session fixture. Replace tmp_path_factory with regenerate in all five processing fixtures. - tests/conftest.py: add --regenerate CLI option; update --pipeline help text. - tests/README.md: add Caching subsection explaining the stable paths and how to use --regenerate or manual deletion for clean re-runs. Co-Authored-By: Claude Sonnet 4.6 --- tests/README.md | 28 ++++++ tests/conftest.py | 12 ++- tests/pipeline/conftest.py | 189 +++++++++++++++++++++++-------------- 3 files changed, 158 insertions(+), 71 deletions(-) diff --git a/tests/README.md b/tests/README.md index da0ab60a..3b1e86c6 100644 --- a/tests/README.md +++ b/tests/README.md @@ -27,6 +27,8 @@ Tests are tagged with marks to control which subset runs in a given context: - `pytest` alone: runs `unit` and `slow` tests; skips `network` and `pipeline` - `pytest --network`: also runs `network` tests - `pytest --pipeline`: also runs `pipeline` tests (prerequisite data is downloaded automatically) +- `pytest --pipeline --regenerate`: re-runs `write_X_ids()` even if intermediate files already + exist (useful after changing compendium filtering logic; see **Caching** below) - `pytest --all`: runs everything (equivalent to `--network --pipeline`) ### Convenience commands @@ -74,6 +76,32 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests ### Pipeline +#### Caching of intermediate files + +Processing fixtures write intermediate ID files to the exact paths Snakemake uses: + +``` +babel_outputs/intermediate/{semantic_type}/ids/{vocab} +``` + +For example, `anatomy.write_umls_ids()` writes to +`babel_outputs/intermediate/anatomy/ids/UMLS`. By default, if that file already +exists it is reused — `write_umls_ids()` is not called again. This means: + +- **Second and later runs are fast** — only the test assertions execute. +- **A prior full Snakemake pipeline run can be reused directly** — the test fixtures + will pick up any files Snakemake already produced. +- **To force re-processing**, pass `--regenerate`: + ```bash + PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --regenerate --no-cov -v + ``` +- **To selectively regenerate one vocabulary**, delete its files manually then run + without `--regenerate`: + ```bash + rm babel_outputs/intermediate/*/ids/UMLS + PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v -k UMLS + ``` + - **`pipeline/test_vocabulary_partitioning.py`** (`pipeline`) — Generic mutual-exclusivity tests parametrized over all registered vocabularies. For each vocabulary, verifies that (1) every compendium's `write_X_ids()` produces non-empty output and (2) no identifier diff --git a/tests/conftest.py b/tests/conftest.py index 254898f0..3f31ceef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ def pytest_addoption(parser): "--pipeline", action="store_true", default=False, - help="Run tests that invoke Snakemake rules (requires babel_downloads/)", + help="Run pipeline tests; downloads prerequisite data automatically if absent", ) parser.addoption( "--all", @@ -41,6 +41,16 @@ def pytest_addoption(parser): default=False, help="Run all tests (equivalent to --network --pipeline)", ) + parser.addoption( + "--regenerate", + action="store_true", + default=False, + help=( + "Force pipeline processing fixtures to re-run write_X_ids() even when " + "their output files already exist in the intermediate directory. " + "Without this flag, existing files are treated as up-to-date and reused." + ), + ) def pytest_configure(config): diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 4a969e39..e53f1997 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -3,6 +3,12 @@ Pipeline tests download their own prerequisite files (or check network connectivity). If a prerequisite is unavailable, all dependent tests are skipped automatically. +Processing fixtures write intermediate ID files to the same stable paths that +Snakemake uses: {intermediate_directory}/{semantic_type}/ids/{vocab} +(e.g. babel_outputs/intermediate/anatomy/ids/UMLS). By default, if a file +already exists it is reused without re-running write_X_ids(). Pass --regenerate +to force re-processing even when files are present. + All tests in this package are marked `pipeline` and are skipped by default. Run with: PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v @@ -23,7 +29,7 @@ from src.datahandlers.mesh import Mesh, pull_mesh # --------------------------------------------------------------------------- -# Shared helper (not a fixture — importable by test files) +# Shared helpers (not fixtures — importable by test files) # --------------------------------------------------------------------------- @@ -38,6 +44,38 @@ def _read_ids(path: str) -> set[str]: return ids +# Fixture compendium keys whose Snakemake semantic-type directory differs from the key name. +_COMPENDIUM_TO_SNAKEMAKE_DIR = { + "diseasephenotype": "disease", + "processactivity": "process", +} + + +def _intermediate_id_path(compendium: str, vocab: str) -> str: + """Stable output path matching the Snakemake convention: + {intermediate_directory}/{semantic_type}/ids/{vocab} + + Uses the same paths as the Snakemake pipeline so that a prior full pipeline + run can be reused directly without re-running write_X_ids(). + """ + from src.util import get_config + cfg = get_config() + snakemake_dir = _COMPENDIUM_TO_SNAKEMAKE_DIR.get(compendium, compendium) + return os.path.join(cfg["intermediate_directory"], snakemake_dir, "ids", vocab) + + +def _maybe_run(outfile: str, fn, regenerate: bool) -> str: + """Run fn() to (re)generate outfile unless it exists and regenerate is False. + + fn is a zero-argument callable (typically a lambda) that writes to outfile. + Creates parent directories as needed. Always returns outfile. + """ + if not os.path.exists(outfile) or regenerate: + os.makedirs(os.path.dirname(outfile), exist_ok=True) + fn() + return outfile + + # --------------------------------------------------------------------------- # Generic download helper # --------------------------------------------------------------------------- @@ -53,6 +91,21 @@ def _download_or_skip(label: str, pull_fn, expected_path: str) -> str: return expected_path +# --------------------------------------------------------------------------- +# --regenerate fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def regenerate(request): + """True when --regenerate was passed on the command line. + + Processing fixtures check this to decide whether to re-run write_X_ids() + even when their output file already exists at the stable intermediate path. + """ + return request.config.getoption("--regenerate") + + # --------------------------------------------------------------------------- # MESH download + processing fixtures # --------------------------------------------------------------------------- @@ -69,18 +122,20 @@ def mesh_nt(): @pytest.fixture(scope="session") -def mesh_pipeline_outputs(mesh_nt, tmp_path_factory): - """Run write_mesh_ids for all five compendia; skip if mesh_nt unavailable.""" - outdir = tmp_path_factory.mktemp("mesh_ids") +def mesh_pipeline_outputs(mesh_nt, regenerate): + """Run write_mesh_ids for all five compendia; skip if mesh_nt unavailable. - def out(name): - return str(outdir / f"{name}_MESH") + Output files are written to babel_outputs/intermediate/{type}/ids/MESH and + reused on subsequent runs unless --regenerate is passed. + """ + def p(compendium): + return _intermediate_id_path(compendium, "MESH") - chemicals.write_mesh_ids(out("chemicals")) - protein.write_mesh_ids(out("protein")) - anatomy.write_mesh_ids(out("anatomy")) - diseasephenotype.write_mesh_ids(out("diseasephenotype")) - taxon.write_mesh_ids(out("taxon")) + _maybe_run(p("chemicals"), lambda: chemicals.write_mesh_ids(p("chemicals")), regenerate) + _maybe_run(p("protein"), lambda: protein.write_mesh_ids(p("protein")), regenerate) + _maybe_run(p("anatomy"), lambda: anatomy.write_mesh_ids(p("anatomy")), regenerate) + _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_mesh_ids(p("diseasephenotype")), regenerate) + _maybe_run(p("taxon"), lambda: taxon.write_mesh_ids(p("taxon")), regenerate) m = Mesh() excluded_tree_terms = set() @@ -88,11 +143,11 @@ def out(name): excluded_tree_terms.update(m.get_terms_in_tree(tree)) return { - "chemicals": out("chemicals"), - "protein": out("protein"), - "anatomy": out("anatomy"), - "diseasephenotype": out("diseasephenotype"), - "taxon": out("taxon"), + "chemicals": p("chemicals"), + "protein": p("protein"), + "anatomy": p("anatomy"), + "diseasephenotype": p("diseasephenotype"), + "taxon": p("taxon"), "excluded_tree_terms": excluded_tree_terms, } @@ -128,8 +183,12 @@ def umls_rrf_files(): @pytest.fixture(scope="session") -def umls_pipeline_outputs(umls_rrf_files, tmp_path_factory): - """Run write_umls_ids for all seven compendia; returns dict of output paths.""" +def umls_pipeline_outputs(umls_rrf_files, regenerate): + """Run write_umls_ids for all seven compendia; returns dict of output paths. + + Output files are written to babel_outputs/intermediate/{type}/ids/UMLS and + reused on subsequent runs unless --regenerate is passed. + """ from src.createcompendia import gene, processactivitypathway from src.util import get_config @@ -137,28 +196,19 @@ def umls_pipeline_outputs(umls_rrf_files, tmp_path_factory): mrsty = umls_rrf_files["mrsty"] cfg = get_config() badumlsfile = os.path.join(cfg["input_directory"], "badumls") - outdir = tmp_path_factory.mktemp("umls_ids") - def out(name): - return str(outdir / f"{name}_UMLS") + def p(compendium): + return _intermediate_id_path(compendium, "UMLS") - chemicals.write_umls_ids(mrsty, out("chemicals")) - protein.write_umls_ids(mrsty, out("protein")) - anatomy.write_umls_ids(mrsty, out("anatomy")) - diseasephenotype.write_umls_ids(mrsty, out("diseasephenotype"), badumlsfile) - processactivitypathway.write_umls_ids(mrsty, out("processactivity")) - taxon.write_umls_ids(mrsty, out("taxon")) - gene.write_umls_ids(mrconso, mrsty, out("gene")) + _maybe_run(p("chemicals"), lambda: chemicals.write_umls_ids(mrsty, p("chemicals")), regenerate) + _maybe_run(p("protein"), lambda: protein.write_umls_ids(mrsty, p("protein")), regenerate) + _maybe_run(p("anatomy"), lambda: anatomy.write_umls_ids(mrsty, p("anatomy")), regenerate) + _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_umls_ids(mrsty, p("diseasephenotype"), badumlsfile), regenerate) + _maybe_run(p("processactivity"), lambda: processactivitypathway.write_umls_ids(mrsty, p("processactivity")), regenerate) + _maybe_run(p("taxon"), lambda: taxon.write_umls_ids(mrsty, p("taxon")), regenerate) + _maybe_run(p("gene"), lambda: gene.write_umls_ids(mrconso, mrsty, p("gene")), regenerate) - return { - "chemicals": out("chemicals"), - "protein": out("protein"), - "anatomy": out("anatomy"), - "diseasephenotype": out("diseasephenotype"), - "processactivity": out("processactivity"), - "taxon": out("taxon"), - "gene": out("gene"), - } + return {name: p(name) for name in ["chemicals", "protein", "anatomy", "diseasephenotype", "processactivity", "taxon", "gene"]} # --------------------------------------------------------------------------- @@ -178,22 +228,22 @@ def omim_mim2gene(): @pytest.fixture(scope="session") -def omim_pipeline_outputs(omim_mim2gene, tmp_path_factory): - """Run write_omim_ids for disease/phenotype and gene; skip if mim2gene.txt unavailable.""" +def omim_pipeline_outputs(omim_mim2gene, regenerate): + """Run write_omim_ids for disease/phenotype and gene; skip if mim2gene.txt unavailable. + + Output files are written to babel_outputs/intermediate/{type}/ids/OMIM and + reused on subsequent runs unless --regenerate is passed. + """ from src.createcompendia import gene infile = omim_mim2gene - outdir = tmp_path_factory.mktemp("omim_ids") - def out(name): - return str(outdir / f"{name}_OMIM") + def p(compendium): + return _intermediate_id_path(compendium, "OMIM") - diseasephenotype.write_omim_ids(infile, out("diseasephenotype")) - gene.write_omim_ids(infile, out("gene")) + _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_omim_ids(infile, p("diseasephenotype")), regenerate) + _maybe_run(p("gene"), lambda: gene.write_omim_ids(infile, p("gene")), regenerate) - return { - "diseasephenotype": out("diseasephenotype"), - "gene": out("gene"), - } + return {"diseasephenotype": p("diseasephenotype"), "gene": p("gene")} # --------------------------------------------------------------------------- @@ -226,20 +276,19 @@ def ubergraph_connection(): @pytest.fixture(scope="session") -def ncit_pipeline_outputs(ubergraph_connection, tmp_path_factory): - """Run write_ncit_ids for anatomy and disease/phenotype via UberGraph.""" - outdir = tmp_path_factory.mktemp("ncit_ids") +def ncit_pipeline_outputs(ubergraph_connection, regenerate): + """Run write_ncit_ids for anatomy and disease/phenotype via UberGraph. - def out(name): - return str(outdir / f"{name}_NCIT") + Output files are written to babel_outputs/intermediate/{type}/ids/NCIT and + reused on subsequent runs unless --regenerate is passed. + """ + def p(compendium): + return _intermediate_id_path(compendium, "NCIT") - anatomy.write_ncit_ids(out("anatomy")) - diseasephenotype.write_ncit_ids(out("diseasephenotype")) + _maybe_run(p("anatomy"), lambda: anatomy.write_ncit_ids(p("anatomy")), regenerate) + _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_ncit_ids(p("diseasephenotype")), regenerate) - return { - "anatomy": out("anatomy"), - "diseasephenotype": out("diseasephenotype"), - } + return {"anatomy": p("anatomy"), "diseasephenotype": p("diseasephenotype")} # --------------------------------------------------------------------------- @@ -248,21 +297,21 @@ def out(name): @pytest.fixture(scope="session") -def go_pipeline_outputs(ubergraph_connection, tmp_path_factory): - """Run write_go_ids for anatomy and process/activity/pathway via UberGraph.""" +def go_pipeline_outputs(ubergraph_connection, regenerate): + """Run write_go_ids for anatomy and process/activity/pathway via UberGraph. + + Output files are written to babel_outputs/intermediate/{type}/ids/GO and + reused on subsequent runs unless --regenerate is passed. + """ from src.createcompendia import processactivitypathway - outdir = tmp_path_factory.mktemp("go_ids") - def out(name): - return str(outdir / f"{name}_GO") + def p(compendium): + return _intermediate_id_path(compendium, "GO") - anatomy.write_go_ids(out("anatomy")) - processactivitypathway.write_go_ids(out("processactivity")) + _maybe_run(p("anatomy"), lambda: anatomy.write_go_ids(p("anatomy")), regenerate) + _maybe_run(p("processactivity"), lambda: processactivitypathway.write_go_ids(p("processactivity")), regenerate) - return { - "anatomy": out("anatomy"), - "processactivity": out("processactivity"), - } + return {"anatomy": p("anatomy"), "processactivity": p("processactivity")} # --------------------------------------------------------------------------- From 1ed3f94280be32a5f846d1d88aff5a769ec90079 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 20:51:30 -0400 Subject: [PATCH 17/30] Added rumdl fixes. --- tests/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/README.md b/tests/README.md index 3b1e86c6..c5e932a4 100644 --- a/tests/README.md +++ b/tests/README.md @@ -80,23 +80,26 @@ PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests Processing fixtures write intermediate ID files to the exact paths Snakemake uses: -``` +```text babel_outputs/intermediate/{semantic_type}/ids/{vocab} ``` For example, `anatomy.write_umls_ids()` writes to -`babel_outputs/intermediate/anatomy/ids/UMLS`. By default, if that file already -exists it is reused — `write_umls_ids()` is not called again. This means: +`babel_outputs/intermediate/anatomy/ids/UMLS`. By default, if that file already +exists it is reused — `write_umls_ids()` is not called again. This means: - **Second and later runs are fast** — only the test assertions execute. - **A prior full Snakemake pipeline run can be reused directly** — the test fixtures will pick up any files Snakemake already produced. - **To force re-processing**, pass `--regenerate`: + ```bash PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --regenerate --no-cov -v ``` + - **To selectively regenerate one vocabulary**, delete its files manually then run without `--regenerate`: + ```bash rm babel_outputs/intermediate/*/ids/UMLS PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v -k UMLS From 43f42f618d06c4cfdcd73404476127c6d5c39550 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 20:53:19 -0400 Subject: [PATCH 18/30] Improved documentation on --no-cov/--n auto conflict. --- tests/README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tests/README.md b/tests/README.md index c5e932a4..18aadbc1 100644 --- a/tests/README.md +++ b/tests/README.md @@ -31,17 +31,25 @@ Tests are tagged with marks to control which subset runs in a given context: exist (useful after changing compendium filtering logic; see **Caching** below) - `pytest --all`: runs everything (equivalent to `--network --pipeline`) +> **Note on pipeline tests and coverage:** Always pass `--no-cov` when running pipeline tests. +> Because `pytest-cov` and `pytest-xdist` are both installed, `pytest-cov` sets up +> `execnet` communication channels for distributed coverage collection even when `-n` is not +> used. When long-running pipeline fixtures (pyoxigraph RocksDB stores) are torn down, +> their background threads can interfere with the `execnet` teardown, producing a +> `PluggyTeardownRaisedWarning: OSError: cannot send (already closed?)` in +> `pytest_sessionfinish`. Passing `--no-cov` disables coverage collection and avoids this. + ### Convenience commands ```bash -PYTHONPATH=. uv run pytest -m unit # unit tests only (CI default) -PYTHONPATH=. uv run pytest -m "unit or network" --network # unit + live-service checks -PYTHONPATH=. uv run pytest -m "unit or slow" # unit + slow offline tests -PYTHONPATH=. uv run pytest --all # run every test -PYTHONPATH=. uv run pytest -m pipeline --pipeline -x # one Snakemake-triggering test at a time -PYTHONPATH=. uv run pytest -m "not pipeline" # everything except full pipeline runs -PYTHONPATH=. uv run pytest -n auto --no-cov # parallel (all CPUs), skip coverage -PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests only +PYTHONPATH=. uv run pytest -m unit # unit tests only (CI default) +PYTHONPATH=. uv run pytest -m "unit or network" --network # unit + live-service checks +PYTHONPATH=. uv run pytest -m "unit or slow" # unit + slow offline tests +PYTHONPATH=. uv run pytest --all --no-cov # run every test (no coverage) +PYTHONPATH=. uv run pytest -m pipeline --pipeline -x --no-cov # one pipeline test at a time +PYTHONPATH=. uv run pytest -m "not pipeline" # everything except full pipeline runs +PYTHONPATH=. uv run pytest -n auto --no-cov # parallel (all CPUs), skip coverage +PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests only ``` ## Test Files @@ -89,8 +97,10 @@ For example, `anatomy.write_umls_ids()` writes to exists it is reused — `write_umls_ids()` is not called again. This means: - **Second and later runs are fast** — only the test assertions execute. + - **A prior full Snakemake pipeline run can be reused directly** — the test fixtures will pick up any files Snakemake already produced. + - **To force re-processing**, pass `--regenerate`: ```bash From 952906c56551620435ce6331acb44b6e2d4e189f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 12 Mar 2026 20:59:25 -0400 Subject: [PATCH 19/30] Auto-disable coverage when -n (xdist) is specified. pytest-cov + pytest-xdist causes an execnet OSError in pytest_sessionfinish even when workers are not running pipeline tests. The new pytest_configure hook detects -n > 0 and sets no_cov = True automatically, printing a note so the user knows why coverage is absent. Co-Authored-By: Claude Sonnet 4.6 --- tests/conftest.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 3f31ceef..295a7f3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,6 +59,22 @@ def pytest_configure(config): config.addinivalue_line("markers", "slow: correct but takes >30s even offline") config.addinivalue_line("markers", "pipeline: invokes Snakemake rules; requires babel_downloads/") + # Auto-disable coverage when xdist workers are requested. + # Combining pytest-cov with -n causes an OSError("cannot send (already closed?)") + # from execnet during pytest_sessionfinish. Setting no_cov here is equivalent to + # passing --no-cov and takes effect before pytest-cov starts collecting data. + try: + n_workers = config.getoption("-n", default=None) + except (ValueError, AttributeError): + n_workers = None + if n_workers is not None and str(n_workers) not in ("0", "no"): + if hasattr(config.option, "no_cov") and not config.option.no_cov: + config.option.no_cov = True + print( # noqa: T201 + "\nNOTE: coverage disabled automatically because -n was given " + "(pytest-cov + pytest-xdist causes teardown errors; use --no-cov explicitly to suppress this message)." + ) + def pytest_collection_modifyitems(config, items): run_all = config.getoption("--all") From 747d51c93f81ee228f0dd849e92dd0eafe7b396c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 21 Mar 2026 03:42:49 -0400 Subject: [PATCH 20/30] Allow known cross-compendium duplicates via KNOWN_DUPLICATES allowlist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New unexpected duplicates still fail hard; known ones trigger xfail so the test suite passes while the underlying issues remain unresolved. Each entry is documented so it doubles as a bug inventory — remove an entry once the fix lands. Co-Authored-By: Claude Sonnet 4.6 --- .../pipeline/test_vocabulary_partitioning.py | 51 +++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/tests/pipeline/test_vocabulary_partitioning.py b/tests/pipeline/test_vocabulary_partitioning.py index d129cec3..5f1f41cd 100644 --- a/tests/pipeline/test_vocabulary_partitioning.py +++ b/tests/pipeline/test_vocabulary_partitioning.py @@ -25,6 +25,40 @@ from tests.pipeline.conftest import _read_ids +# Known cross-compendium duplicates that have not yet been resolved. +# Each entry is a set of identifier strings that are known to appear in more +# than one compendium for that vocabulary. +# +# When an underlying bug is fixed, remove the corresponding IDs from this dict +# so that the test starts enforcing the invariant for those identifiers. +# +# TODO: Create a tracking issue for each vocabulary listed here and link it. +KNOWN_DUPLICATES: dict[str, set[str]] = { + "UMLS": { + "UMLS:C5443441", + "UMLS:C5443442", + }, + "MESH": { + # protein / anatomy overlaps + "MESH:D022041", "MESH:D035321", "MESH:D006570", "MESH:D009707", + "MESH:D064448", "MESH:D035341", "MESH:D045524", "MESH:D007106", + "MESH:D000067816", "MESH:D000089804", "MESH:D002843", "MESH:D017358", + "MESH:D008894", "MESH:D000961", "MESH:D009360", + # chemicals / anatomy overlaps + "MESH:D000091083", "MESH:D014688", + # anatomy / diseasephenotype overlaps + "MESH:D000072717", "MESH:D018404", "MESH:D065309", "MESH:D003809", + "MESH:D008467", "MESH:D012303", "MESH:D017439", "MESH:D000153", + "MESH:D014097", "MESH:D010677", "MESH:D000072662", "MESH:D003750", + "MESH:D008551", "MESH:D048629", "MESH:D002921", "MESH:D007627", + # anatomy / taxon overlaps + "MESH:D036226", "MESH:D013172", "MESH:D052940", "MESH:D033761", + "MESH:D053058", "MESH:D002523", "MESH:D038821", "MESH:D033661", + "MESH:D013171", "MESH:D034101", "MESH:D013104", "MESH:D052939", + "MESH:D013170", + }, +} + @pytest.mark.pipeline def test_ids_non_empty(vocab_outputs): @@ -49,7 +83,18 @@ def test_no_id_in_multiple_compendia(vocab_outputs): duplicates.setdefault(id_, [seen[id_]]).append(name) else: seen[id_] = name - assert not duplicates, ( - f"{vocab}: found {len(duplicates)} IDs in multiple compendia: " - f"{dict(list(duplicates.items())[:5])}" + + known = KNOWN_DUPLICATES.get(vocab, set()) + unexpected = {k: v for k, v in duplicates.items() if k not in known} + assert not unexpected, ( + f"{vocab}: found {len(unexpected)} unexpected IDs in multiple compendia: " + f"{dict(list(unexpected.items())[:5])}" ) + + still_known = known & duplicates.keys() + if still_known: + pytest.xfail( + f"{vocab}: {len(still_known)} known duplicate(s) not yet fixed " + f"(see KNOWN_DUPLICATES in this file): " + f"{sorted(still_known)[:5]}" + ) From 0358c319858eab9d6744c4e4d127bccb2d1886c6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 21 Mar 2026 23:12:40 -0400 Subject: [PATCH 21/30] =?UTF-8?q?Move=20compendium=E2=86=92directory=20map?= =?UTF-8?q?ping=20from=20conftest.py=20into=20config.yaml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardcoded _COMPENDIUM_TO_SNAKEMAKE_DIR dict in the test conftest is replaced by a compendium_directories key in config.yaml, making it the single authoritative source co-located with all other path config. Also renames the "processactivity" fixture key to "processactivitypathway" to match the actual module name, making it easier to trace from fixture to source code. Co-Authored-By: Claude Sonnet 4.6 --- config.yaml | 8 ++++++++ tests/pipeline/conftest.py | 21 +++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config.yaml b/config.yaml index 5f20fe63..87722a08 100644 --- a/config.yaml +++ b/config.yaml @@ -16,6 +16,14 @@ intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs tmp_directory: babel_downloads/tmp +# Maps Python compendium names (as used in src/createcompendia/ and tests) to the +# directory names that Snakemake uses under intermediate_directory. +# Compendia not listed here use their own name as the directory name. +# Update this whenever a new semantic type uses a shortened or different directory name. +compendium_directories: + diseasephenotype: disease + processactivitypathway: process + # # SHARED # diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index e53f1997..d9966e54 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -44,23 +44,20 @@ def _read_ids(path: str) -> set[str]: return ids -# Fixture compendium keys whose Snakemake semantic-type directory differs from the key name. -_COMPENDIUM_TO_SNAKEMAKE_DIR = { - "diseasephenotype": "disease", - "processactivity": "process", -} - - def _intermediate_id_path(compendium: str, vocab: str) -> str: """Stable output path matching the Snakemake convention: {intermediate_directory}/{semantic_type}/ids/{vocab} Uses the same paths as the Snakemake pipeline so that a prior full pipeline run can be reused directly without re-running write_X_ids(). + + The compendium-to-directory mapping is read from config.yaml + (compendium_directories key) so there is a single authoritative source. """ from src.util import get_config cfg = get_config() - snakemake_dir = _COMPENDIUM_TO_SNAKEMAKE_DIR.get(compendium, compendium) + directory_map = cfg.get("compendium_directories", {}) + snakemake_dir = directory_map.get(compendium, compendium) return os.path.join(cfg["intermediate_directory"], snakemake_dir, "ids", vocab) @@ -204,11 +201,11 @@ def p(compendium): _maybe_run(p("protein"), lambda: protein.write_umls_ids(mrsty, p("protein")), regenerate) _maybe_run(p("anatomy"), lambda: anatomy.write_umls_ids(mrsty, p("anatomy")), regenerate) _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_umls_ids(mrsty, p("diseasephenotype"), badumlsfile), regenerate) - _maybe_run(p("processactivity"), lambda: processactivitypathway.write_umls_ids(mrsty, p("processactivity")), regenerate) + _maybe_run(p("processactivitypathway"), lambda: processactivitypathway.write_umls_ids(mrsty, p("processactivitypathway")), regenerate) _maybe_run(p("taxon"), lambda: taxon.write_umls_ids(mrsty, p("taxon")), regenerate) _maybe_run(p("gene"), lambda: gene.write_umls_ids(mrconso, mrsty, p("gene")), regenerate) - return {name: p(name) for name in ["chemicals", "protein", "anatomy", "diseasephenotype", "processactivity", "taxon", "gene"]} + return {name: p(name) for name in ["chemicals", "protein", "anatomy", "diseasephenotype", "processactivitypathway", "taxon", "gene"]} # --------------------------------------------------------------------------- @@ -309,9 +306,9 @@ def p(compendium): return _intermediate_id_path(compendium, "GO") _maybe_run(p("anatomy"), lambda: anatomy.write_go_ids(p("anatomy")), regenerate) - _maybe_run(p("processactivity"), lambda: processactivitypathway.write_go_ids(p("processactivity")), regenerate) + _maybe_run(p("processactivitypathway"), lambda: processactivitypathway.write_go_ids(p("processactivitypathway")), regenerate) - return {"anatomy": p("anatomy"), "processactivity": p("processactivity")} + return {"anatomy": p("anatomy"), "processactivitypathway": p("processactivitypathway")} # --------------------------------------------------------------------------- From 9d635a3b71ec2b77d5afea2463a30973e3d366fa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 21 Mar 2026 23:13:45 -0400 Subject: [PATCH 22/30] Document compendium_directories config key in CLAUDE.md Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index a70718e2..d50a7aa6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,6 +61,10 @@ uv run rumdl fmt . # Markdown auto-fix - Line length is 160 for both Python (ruff) and Snakemake (snakefmt). - Main config: `config.yaml` (directory paths, version strings, prefix lists per semantic type). - `UMLS_API_KEY` environment variable required for UMLS/RxNorm downloads. +- `compendium_directories` in `config.yaml` maps Python compendium names to the Snakemake + intermediate directory names when they differ (e.g., `diseasephenotype → disease`, + `processactivitypathway → process`). Update this when adding a new semantic type whose + directory name doesn't match its Python module name. ## Architecture From 0858839c204cf33106095d742584511612b630a7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 27 Mar 2026 17:29:11 -0400 Subject: [PATCH 23/30] Simplify: fix N+1 SPARQL, raise exceptions, extract _output_paths helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mesh.py: combine N+1 SPARQL queries in get_scr_terms_mapped_to_trees() into a single query using VALUES ?topTree { ... } - umls.py: convert download_rxnorm() print+exit(1) to raise RuntimeError, matching the pattern already used in download_umls(); also fix string interpolation bug (${var} → {var}) in the error message - pipeline/conftest.py: add _output_paths() helper that filters a vocab outputs dict down to string-valued paths - test_vocabulary_partitioning.py: use _output_paths() in both test functions instead of duplicating the dict-comprehension - CLAUDE.md: document raise-over-exit(1) convention Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 6 ++++ src/datahandlers/mesh.py | 34 +++++++++---------- src/datahandlers/umls.py | 7 ++-- tests/pipeline/conftest.py | 10 ++++++ .../pipeline/test_vocabulary_partitioning.py | 7 ++-- 5 files changed, 38 insertions(+), 26 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d50a7aa6..85e6e0c7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -153,6 +153,12 @@ high performance cluster) so you don't need to download all the source files and rerun the entire pipeline. You can look at the resource requirements of a rule to decide which option would be best. +## Conventions + +- **Error handling** — raise exceptions (`RuntimeError`, `ValueError`, etc.) rather than + `print(...) + exit(1)`. Exceptions are testable and propagate cleanly through Snakemake; + bare `exit()` calls bypass Python's exception machinery and make unit testing impossible. + ## Debugging When looking things up in the source databases, prefer to invoke the existing download code in diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index 6ede31d5..3bc2ca20 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -51,24 +51,24 @@ def get_scr_terms_mapped_to_trees(self, top_treenums): SCR terms don't have tree numbers themselves, but they have meshv:mappedTo and/or meshv:preferredMappedTo relationships to descriptor terms that do. This method finds SCR terms whose mapped descriptors fall under the specified trees.""" + values_clause = " ".join(f"mesh:{t}" for t in top_treenums) + s = f""" PREFIX meshv: + PREFIX mesh: + + SELECT DISTINCT ?term + WHERE {{ VALUES ?mappingPred {{ meshv:mappedTo meshv:preferredMappedTo }} + VALUES ?topTree {{ {values_clause} }} + ?term ?mappingPred ?descriptor . + ?descriptor meshv:treeNumber ?treenum . + ?treenum meshv:parentTreeNumber* ?topTree + }} + ORDER BY ?term + """ terms = set() - for top_treenum in top_treenums: - s = f""" PREFIX meshv: - PREFIX mesh: - - SELECT DISTINCT ?term - WHERE {{ VALUES ?mappingPred {{ meshv:mappedTo meshv:preferredMappedTo }} - ?term ?mappingPred ?descriptor . - ?descriptor meshv:treeNumber ?treenum . - ?treenum meshv:parentTreeNumber* mesh:{top_treenum} - }} - ORDER BY ?term - """ - qres = self.m.query(s) - for row in list(qres): - iterm = str(row["term"]) - meshid = iterm[:-1].split("/")[-1] - terms.add(f"{MESH}:{meshid}") + for row in list(self.m.query(s)): + iterm = str(row["term"]) + meshid = iterm[:-1].split("/")[-1] + terms.add(f"{MESH}:{meshid}") return terms def get_terms_with_type(self, termtype): diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 308a9f28..78a29b75 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -358,9 +358,7 @@ def download_rxnorm(rxnorm_version, download_dir): """ umls_api_key = os.environ.get("UMLS_API_KEY") if not umls_api_key: - print("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.") - print("See instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html") - exit(1) + raise RuntimeError("The environmental variable UMLS_API_KEY needs to be set to a valid UMLS API key.\nSee instructions at https://documentation.uts.nlm.nih.gov/rest/authentication.html") # Download RxNorm_full_{rxnorm_version}.zip # As described at https://documentation.uts.nlm.nih.gov/automating-downloads.html @@ -369,8 +367,7 @@ def download_rxnorm(rxnorm_version, download_dir): rxnorm_url, {"url": f"https://download.nlm.nih.gov/umls/kss/rxnorm/RxNorm_full_{rxnorm_version}.zip", "apiKey": umls_api_key}, stream=True ) if not req.ok: - print(f"Unable to download RxNorm from ${rxnorm_url}: ${req}") - exit(1) + raise RuntimeError(f"Unable to download RxNorm from {rxnorm_url}: {req}") # Write file to {download_dir}/RxNorm_full_{rxnorm_version}.zip logging.info(f"Downloading RxNorm_full_{rxnorm_version}.zip to {download_dir}") diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index d9966e54..91cd995e 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -44,6 +44,16 @@ def _read_ids(path: str) -> set[str]: return ids +def _output_paths(outputs: dict) -> dict[str, str]: + """Filter a vocab_outputs dict down to just the string-valued output paths. + + Some vocabulary fixtures (e.g. MESH) include non-path entries like + 'excluded_tree_terms' in their returned dict. This helper strips those out + so test functions can iterate only over real compendium output files. + """ + return {name: path for name, path in outputs.items() if isinstance(path, str)} + + def _intermediate_id_path(compendium: str, vocab: str) -> str: """Stable output path matching the Snakemake convention: {intermediate_directory}/{semantic_type}/ids/{vocab} diff --git a/tests/pipeline/test_vocabulary_partitioning.py b/tests/pipeline/test_vocabulary_partitioning.py index 5f1f41cd..f9f05dd1 100644 --- a/tests/pipeline/test_vocabulary_partitioning.py +++ b/tests/pipeline/test_vocabulary_partitioning.py @@ -23,7 +23,7 @@ """ import pytest -from tests.pipeline.conftest import _read_ids +from tests.pipeline.conftest import _output_paths, _read_ids # Known cross-compendium duplicates that have not yet been resolved. # Each entry is a set of identifier strings that are known to appear in more @@ -64,8 +64,7 @@ def test_ids_non_empty(vocab_outputs): """Every compendium in this vocabulary must produce at least one identifier.""" vocab, outputs = vocab_outputs - # outputs is a dict; "excluded_tree_terms" (MESH-specific) is not an output path. - paths = {name: path for name, path in outputs.items() if isinstance(path, str)} + paths = _output_paths(outputs) empty = [name for name, path in paths.items() if not _read_ids(path)] assert not empty, f"{vocab}: these compendia produced no output: {empty}" @@ -74,7 +73,7 @@ def test_ids_non_empty(vocab_outputs): def test_no_id_in_multiple_compendia(vocab_outputs): """No identifier may appear in more than one compendium for this vocabulary.""" vocab, outputs = vocab_outputs - paths = {name: path for name, path in outputs.items() if isinstance(path, str)} + paths = _output_paths(outputs) seen = {} # id -> first compendium name duplicates = {} # id -> list of all compendia it appeared in for name, path in paths.items(): From 5b84e26658d48183b4d2771306e273e32a8c50d6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 27 Mar 2026 17:41:10 -0400 Subject: [PATCH 24/30] Address PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mesh.py: guard get_scr_terms_mapped_to_trees() against empty input list to avoid a malformed VALUES clause - tests/conftest.py: remove duplicate marker registrations from pytest_configure; pyproject.toml is the canonical source (and already has a comment saying conftest should not re-register them) - tests/conftest.py: improve --regenerate help text with typical use case - test_umls_pipeline.py: clarify that test_chemicals_excludes_protein_semantic_tree is a stricter sentinel (no KNOWN_DUPLICATES carve-out) for the specific chem/protein pair, not a redundant test - protein.py: document the SCR D05/D08 approximation trade-off — SCRs mapped to non-protein subtrees (Polymers, Coenzymes) will be classified as PROTEIN rather than falling into neither compendium - pipeline/conftest.py: document why the second Mesh() construction is necessary and that the cost is paid once per session Co-Authored-By: Claude Sonnet 4.6 --- src/createcompendia/protein.py | 7 +++++-- src/datahandlers/mesh.py | 6 +++++- tests/conftest.py | 10 ++++------ tests/pipeline/conftest.py | 6 ++++++ tests/pipeline/test_umls_pipeline.py | 10 +++++----- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 619442a4..0934c0c0 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -84,8 +84,11 @@ def write_mesh_ids(outfile): # We use scr_include_trees to only keep SCR terms mapped to the protein-related # trees (D12.776, D05, D08). This is the inverse of scr_exclude_trees used in # chemicals.write_mesh_ids(). We use the broader D05 and D08 here (not just the - # protein subtrees) because any SCR mapped to D05 or D08 is more likely a protein - # than a non-protein macromolecule. + # specific protein subtrees) because any SCR mapped to D05 or D08 is more likely a + # protein than a non-protein macromolecule. The trade-off: SCR terms mapped to + # non-protein D05/D08 subtrees (e.g. Polymers under D05.750, Coenzymes under + # D08.211) will be classified as PROTEIN here rather than falling into neither + # compendium, as their corresponding descriptor terms do. scr_protein_trees = ["D12.776", "D05", "D08"] mesh.write_ids( meshmap, diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index 3bc2ca20..9a55587e 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -50,7 +50,11 @@ def get_scr_terms_mapped_to_trees(self, top_treenums): SCR terms don't have tree numbers themselves, but they have meshv:mappedTo and/or meshv:preferredMappedTo relationships to descriptor terms that do. This method finds - SCR terms whose mapped descriptors fall under the specified trees.""" + SCR terms whose mapped descriptors fall under the specified trees. + + Returns an empty set if top_treenums is empty.""" + if not top_treenums: + return set() values_clause = " ".join(f"mesh:{t}" for t in top_treenums) s = f""" PREFIX meshv: PREFIX mesh: diff --git a/tests/conftest.py b/tests/conftest.py index 0c8604f4..87594871 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,18 +55,16 @@ def pytest_addoption(parser): help=( "Force pipeline processing fixtures to re-run write_X_ids() even when " "their output files already exist in the intermediate directory. " - "Without this flag, existing files are treated as up-to-date and reused." + "Without this flag, existing files are treated as up-to-date and reused. " + "Pass this after changing compendium filtering logic to ensure tests " + "reflect the new output rather than cached files." ), ) def pytest_configure(config): - config.addinivalue_line("markers", "unit: fast offline tests with no external dependencies") - config.addinivalue_line("markers", "network: requires live internet access") - config.addinivalue_line("markers", "slow: correct but takes >30s even offline") - config.addinivalue_line("markers", "pipeline: invokes Snakemake rules; requires babel_downloads/") - # Auto-disable coverage when xdist workers are requested. + # Markers are registered in pyproject.toml [tool.pytest.ini_options] — not here. # Combining pytest-cov with -n causes an OSError("cannot send (already closed?)") # from execnet during pytest_sessionfinish. Setting no_cov here is equivalent to # passing --no-cov and takes effect before pytest-cov starts collecting data. diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index 91cd995e..f21bd0e3 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -144,6 +144,12 @@ def p(compendium): _maybe_run(p("diseasephenotype"), lambda: diseasephenotype.write_mesh_ids(p("diseasephenotype")), regenerate) _maybe_run(p("taxon"), lambda: taxon.write_mesh_ids(p("taxon")), regenerate) + # Build excluded_tree_terms for test_mesh_pipeline.py: all descriptor terms under + # D05/D08/D12.776 (including non-protein subtrees like Polymers and Coenzymes that + # belong in neither compendium). This constructs a second Mesh() instance because + # write_ids() encapsulates its own instance and there is no way to reuse it here + # without an API change. This is test-only and session-scoped so the cost is paid + # once per test run. m = Mesh() excluded_tree_terms = set() for tree in ["D05", "D08", "D12.776"]: diff --git a/tests/pipeline/test_umls_pipeline.py b/tests/pipeline/test_umls_pipeline.py index 0bd81038..c5be6330 100644 --- a/tests/pipeline/test_umls_pipeline.py +++ b/tests/pipeline/test_umls_pipeline.py @@ -19,11 +19,11 @@ def test_chemicals_excludes_protein_semantic_tree(umls_pipeline_outputs): """Chemicals must not contain any UMLS IDs that the protein compendium claimed. - Guards against amino-acid/peptide/protein entries (semantic type tree - A1.4.1.2.1.7) leaking into the chemical compendium. The mutual-exclusivity - test in test_vocabulary_partitioning.py also catches this, but this test - names the specific semantic-tree invariant explicitly so a failure message - is immediately actionable. + This is the chemicals/protein edge of the mutual-exclusivity invariant, stated + explicitly so that a failure message immediately names the semantic-tree involved + (A1.4.1.2.1.7, Amino Acid/Peptide/Protein). Unlike test_no_id_in_multiple_compendia, + this test has no KNOWN_DUPLICATES carve-out — a chem/protein UMLS overlap is always + a hard failure here, making it a stricter sentinel for this specific pair. """ chem_ids = _read_ids(umls_pipeline_outputs["chemicals"]) prot_ids = _read_ids(umls_pipeline_outputs["protein"]) From ecbfa0c1e1dd067857f6923e63e8d78dfe345625 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 27 Mar 2026 18:22:41 -0400 Subject: [PATCH 25/30] Set pythonpath in pytest config; drop PYTHONPATH=. from docs Add `pythonpath = ["."]` to [tool.pytest.ini_options] in pyproject.toml so that `uv run pytest` works without the PYTHONPATH=. prefix. Remove the prefix from all pytest invocations in CLAUDE.md and tests/README.md. Co-Authored-By: Claude Sonnet 4.6 --- CLAUDE.md | 14 +++++++------- pyproject.toml | 1 + tests/README.md | 28 ++++++++++++++-------------- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 85e6e0c7..3cef4c31 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,13 +29,13 @@ uv run snakemake --cores 1 chemical # Another target ### Testing ```bash -PYTHONPATH=. uv run pytest # All tests -PYTHONPATH=. uv run pytest --cov=src # With coverage report -PYTHONPATH=. uv run pytest tests/test_node_factory.py # Single test file -PYTHONPATH=. uv run pytest -m unit -q # Unit tests only (CI default) -PYTHONPATH=. uv run pytest --network # Include network tests -PYTHONPATH=. uv run pytest --all # Run every test -PYTHONPATH=. uv run pytest -n auto # Parallel (all CPUs) +uv run pytest # All tests +uv run pytest --cov=src # With coverage report +uv run pytest tests/test_node_factory.py # Single test file +uv run pytest -m unit -q # Unit tests only (CI default) +uv run pytest --network # Include network tests +uv run pytest --all # Run every test +uv run pytest -n auto # Parallel (all CPUs) ``` Tests use four marks: `unit` (fast, offline), `network` (requires internet, opt-in with diff --git a/pyproject.toml b/pyproject.toml index 3fb9b363..a0bac9dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ include = '\.snakefile$|^Snakefile' [tool.pytest.ini_options] testpaths = ["tests"] +pythonpath = ["."] addopts = "" timeout = 30 # global fallback (seconds); overridden per mark in conftest.py # Canonical marker definitions — conftest.py intentionally does NOT re-register these. diff --git a/tests/README.md b/tests/README.md index 4136614f..5982f355 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,9 +3,9 @@ ## Running Tests ```bash -PYTHONPATH=. uv run pytest # All tests -PYTHONPATH=. uv run pytest --cov=src # With coverage report -PYTHONPATH=. uv run pytest tests/test_glom.py # Single test file +uv run pytest # All tests +uv run pytest --cov=src # With coverage report +uv run pytest tests/test_glom.py # Single test file ``` Coverage is opt-in: pass `--cov=src` (or `--cov=src --cov-report=html`) to generate @@ -36,14 +36,14 @@ You can adjust the timeout for marks in [conftest.py](conftest.py). ### Convenience commands ```bash -PYTHONPATH=. uv run pytest -m unit # unit tests only (CI default) -PYTHONPATH=. uv run pytest -m "unit or network" --network # unit + live-service checks -PYTHONPATH=. uv run pytest -m "unit or slow" # unit + slow offline tests -PYTHONPATH=. uv run pytest --all --no-cov # run every test (no coverage) -PYTHONPATH=. uv run pytest -m pipeline --pipeline -x --no-cov # one pipeline test at a time -PYTHONPATH=. uv run pytest -m "not pipeline" # everything except full pipeline runs -PYTHONPATH=. uv run pytest -n auto --no-cov # parallel (all CPUs), skip coverage -PYTHONPATH=. uv run pytest -n 4 -m unit # 4 workers, unit tests only +uv run pytest -m unit # unit tests only (CI default) +uv run pytest -m "unit or network" --network # unit + live-service checks +uv run pytest -m "unit or slow" # unit + slow offline tests +uv run pytest --all --no-cov # run every test (no coverage) +uv run pytest -m pipeline --pipeline -x --no-cov # one pipeline test at a time +uv run pytest -m "not pipeline" # everything except full pipeline runs +uv run pytest -n auto --no-cov # parallel (all CPUs), skip coverage +uv run pytest -n 4 -m unit # 4 workers, unit tests only ``` ## Test Files @@ -100,7 +100,7 @@ exists it is reused — `write_umls_ids()` is not called again. This means: - **To force re-processing**, pass `--regenerate`: ```bash - PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --regenerate --no-cov -v + uv run pytest tests/pipeline/ --pipeline --regenerate --no-cov -v ``` - **To selectively regenerate one vocabulary**, delete its files manually then run @@ -108,7 +108,7 @@ exists it is reused — `write_umls_ids()` is not called again. This means: ```bash rm babel_outputs/intermediate/*/ids/UMLS - PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v -k UMLS + uv run pytest tests/pipeline/ --pipeline --no-cov -v -k UMLS ``` - **`pipeline/test_vocabulary_partitioning.py`** (`pipeline`) — Generic mutual-exclusivity @@ -254,5 +254,5 @@ and "New pipeline tests" in Future Plans for the current coverage and how to ext Run them with: ```bash -PYTHONPATH=. uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v +uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v ``` From d41544ff84b88240bb745f906122918f4db70bb4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 27 Mar 2026 18:37:09 -0400 Subject: [PATCH 26/30] Fix ThrottledRequester: correct sleep duration, replace mocky.io with local server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit babel_utils.py: fix timedelta.microseconds → total_seconds() in the throttle sleep. microseconds returns only the sub-second component, so any configured delta ≥ 1 s would sleep far too short. test_ThrottledRequester.py: replace mocky.io (unreliable third-party service) with an ephemeral local HTTPServer that runs in a daemon thread. _local_server(delay_ms=N) gives precise control over response latency, making both tests deterministic and offline. Both tests are re-marked unit (were network) and test_no_throttling's xfail is removed. Co-Authored-By: Claude Sonnet 4.6 --- src/babel_utils.py | 2 +- tests/test_ThrottledRequester.py | 99 +++++++++++++++++++------------- 2 files changed, 61 insertions(+), 40 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index dcc29a52..678fea2b 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -149,7 +149,7 @@ def get(self, url): cdelta = now - self.last_time if cdelta < self.delta: waittime = self.delta - cdelta - time.sleep(waittime.microseconds / 1e6) + time.sleep(waittime.total_seconds()) throttled = True self.last_time = datetime.now() response = requests.get(url) diff --git a/tests/test_ThrottledRequester.py b/tests/test_ThrottledRequester.py index fb835f98..60d4964c 100644 --- a/tests/test_ThrottledRequester.py +++ b/tests/test_ThrottledRequester.py @@ -1,51 +1,72 @@ +import threading +import time as time_module +from contextlib import contextmanager from datetime import datetime as dt from datetime import timedelta +from http.server import BaseHTTPRequestHandler, HTTPServer import pytest from src.babel_utils import ThrottledRequester -@pytest.mark.network +class _DelayHandler(BaseHTTPRequestHandler): + """Minimal HTTP handler that sleeps delay_ms before returning empty JSON.""" + delay_ms = 0 + + def do_GET(self): + time_module.sleep(self.delay_ms / 1000) + self.send_response(200) + self.end_headers() + self.wfile.write(b"{}") + + def log_message(self, *args): # suppress request logging to stdout + pass + + +@contextmanager +def _local_server(delay_ms=0): + """Start an ephemeral HTTP server on a free port, yield its URL, then shut down.""" + _DelayHandler.delay_ms = delay_ms + server = HTTPServer(("127.0.0.1", 0), _DelayHandler) + port = server.server_address[1] + t = threading.Thread(target=server.serve_forever) + t.daemon = True + t.start() + try: + yield f"http://127.0.0.1:{port}/" + finally: + server.shutdown() + + +@pytest.mark.unit def test_throttling(): - """Call a quick-returning service, but include a throttle of 1/2 second (500 ms). - This should end up taking just a bit over 500 ms. The service being called - just returns an empty json. It's not immediate, but it returns quick enough that - a 1/2 second throttle will be invoked. - The test is a little goofy, and the half_sec_plus number is made up""" - tr = ThrottledRequester(500) - now = dt.now() - response, throttle1 = tr.get("http://www.mocky.io/v2/5df243b23100007f009a31b0") - response, throttle2 = tr.get("http://www.mocky.io/v2/5df243b23100007f009a31b0") - later = dt.now() + """Second request within the throttle window must wait; total runtime must exceed 500 ms.""" + with _local_server(delay_ms=0) as url: + tr = ThrottledRequester(500) + now = dt.now() + _response, throttle1 = tr.get(url) + _response, throttle2 = tr.get(url) + later = dt.now() + runtime = later - now - half_sec = timedelta(milliseconds=500) - half_sec_plus = timedelta(milliseconds=810) - assert not throttle1 # don't throttle the first time through - assert throttle2 # do throttle the second time through - assert runtime > half_sec - assert runtime < half_sec_plus - - -@pytest.mark.network -@pytest.mark.xfail( - reason="mocky.io does not reliably honour the ?mocky-delay query parameter, " - "so the first request can return in under 500 ms and the second call gets " - "throttled. To fix: replace mocky.io with a local HTTP server (e.g. " - "pytest-httpserver) that accepts a configurable delay.", - strict=False, -) + assert not throttle1 # first call: no throttle + assert throttle2 # second call: throttled + assert runtime > timedelta(milliseconds=500) # wait was enforced + assert runtime < timedelta(milliseconds=1500) # sanity: didn't wait too long + + +@pytest.mark.unit def test_no_throttling(): - """Call a slow-returning service, but include a throttle of 1/2 second (500 ms). - Because the service being called takes longer than the throttle value, we should - not wait the second time through. The mocky service lets us specify a delay time.""" - tr = ThrottledRequester(500) - now = dt.now() - response, throttle1 = tr.get("http://www.mocky.io/v2/5df243b23100007f009a31b0?mocky-delay=600ms") - response, throttle2 = tr.get("http://www.mocky.io/v2/5df243b23100007f009a31b0") - later = dt.now() + """When the request itself takes longer than the throttle window, no extra wait is added.""" + with _local_server(delay_ms=600) as url: + tr = ThrottledRequester(500) + now = dt.now() + _response, throttle1 = tr.get(url) # takes ~600 ms — longer than the 500 ms delta + _response, throttle2 = tr.get(url) # delta already elapsed; no throttle needed + later = dt.now() + runtime = later - now - lower_bound = timedelta(milliseconds=600) - assert not throttle1 # don't throttle the first time through - assert not throttle2 # Nor the second time, because the call itself took longer than the throttle time - assert runtime > lower_bound # make sure we actually delayed + assert not throttle1 # first call: no throttle + assert not throttle2 # second call: request was slow enough + assert runtime > timedelta(milliseconds=600) # at least one real delay occurred From daf4595ae073f80f84f852318d036ff2e15eef34 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 17:25:19 -0400 Subject: [PATCH 27/30] Update tests/pipeline/conftest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/pipeline/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index f21bd0e3..d4e8b5bd 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -10,7 +10,7 @@ to force re-processing even when files are present. All tests in this package are marked `pipeline` and are skipped by default. -Run with: PYTHONPATH=. uv run pytest tests/pipeline/ --pipeline --no-cov -v +Run with: uv run pytest tests/pipeline/ --pipeline --no-cov -v ## Adding a new vocabulary From 933d058732d3c3fe03ab7bbeeca5bb59f48267b4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 17:25:34 -0400 Subject: [PATCH 28/30] Update tests/pipeline/test_mesh_pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/pipeline/test_mesh_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/test_mesh_pipeline.py b/tests/pipeline/test_mesh_pipeline.py index 3b882321..02c35bd0 100644 --- a/tests/pipeline/test_mesh_pipeline.py +++ b/tests/pipeline/test_mesh_pipeline.py @@ -6,7 +6,7 @@ targeted test that has no generic equivalent. All tests are skipped by default. Run with: - PYTHONPATH=. uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v + uv run pytest tests/pipeline/test_mesh_pipeline.py --pipeline --no-cov -v """ import pytest From d7afbea7bdebb5193198b3f6c53a3f70f47ec683 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 17:25:44 -0400 Subject: [PATCH 29/30] Update tests/pipeline/test_umls_pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/pipeline/test_umls_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pipeline/test_umls_pipeline.py b/tests/pipeline/test_umls_pipeline.py index c5be6330..1a21545d 100644 --- a/tests/pipeline/test_umls_pipeline.py +++ b/tests/pipeline/test_umls_pipeline.py @@ -8,7 +8,7 @@ All tests require UMLS_API_KEY to be set for the initial download (or the files to already be cached in babel_downloads/UMLS/). They are skipped by default. Run with: - PYTHONPATH=. uv run pytest tests/pipeline/test_umls_pipeline.py --pipeline --no-cov -v + uv run pytest tests/pipeline/test_umls_pipeline.py --pipeline --no-cov -v """ import pytest From 4adb52c2a764c2e3dce39d121b1d05a37f80fef9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Mar 2026 17:26:23 -0400 Subject: [PATCH 30/30] Update tests/test_ThrottledRequester.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test_ThrottledRequester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_ThrottledRequester.py b/tests/test_ThrottledRequester.py index 60d4964c..92fe2330 100644 --- a/tests/test_ThrottledRequester.py +++ b/tests/test_ThrottledRequester.py @@ -31,14 +31,14 @@ def _local_server(delay_ms=0): server = HTTPServer(("127.0.0.1", 0), _DelayHandler) port = server.server_address[1] t = threading.Thread(target=server.serve_forever) - t.daemon = True t.start() try: yield f"http://127.0.0.1:{port}/" finally: + # Ensure the server loop stops, the thread exits, and the socket is closed. server.shutdown() - - + t.join(timeout=5) + server.server_close() @pytest.mark.unit def test_throttling(): """Second request within the throttle window must wait; total runtime must exceed 500 ms."""