diff --git a/config.yaml b/config.yaml index 87722a08..1eb136f4 100644 --- a/config.yaml +++ b/config.yaml @@ -426,5 +426,10 @@ ensembl_datasets_to_skip: - otshawytscha_gene_ensembl - aocellaris_gene_ensembl -demote_labels_longer_than: 25 +# Labels longer than this limit are demoted (not used as preferred label if a shorter alternative exists). +# Keyed by Biolink type; types not listed here are never demoted. Uses ancestor traversal, so +# biolink:ChemicalEntity applies to all chemical subtypes (SmallMolecule, Drug, etc.). +# See https://github.com/NCATSTranslator/Babel/issues/597 +demote_labels_longer_than: + biolink:ChemicalEntity: 25 diff --git a/src/babel_utils.py b/src/babel_utils.py index 678fea2b..c174668a 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -404,6 +404,63 @@ def sort_identifiers_with_boosted_prefixes(identifiers, prefixes): ) +def _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than): + """Choose the preferred display label for a normalised node. + + Steps: + 1. Sort labels in boosted-prefix order if the node's most-specific type has an entry in + preferred_name_boost_prefixes; otherwise use Biolink prefix order. + 2. Filter blank labels. + 3. Demote labels longer than the per-type limit (if the type has an entry in + demote_labels_longer_than). Types with no entry are never demoted. + See https://github.com/NCATSTranslator/Babel/issues/597 + 4. Return the first surviving label, or "" if none remain. + + :param node: A node dict with "identifiers" (list of dicts with "identifier" and optionally "label") and "type". + :param types: Ancestor types for this node, most-specific first. + :param preferred_name_boost_prefixes: Dict mapping Biolink type → list of boosted prefixes (from config). + :param demote_labels_longer_than: Dict mapping Biolink type → int length limit (from config). + :return: The preferred label string, or "" if no label is available. + """ + # Step 1.1 — sort by boosted prefix order for the most-specific matching type. + possible_labels = [] + for typ in types: + if typ in preferred_name_boost_prefixes: + possible_labels = list( + map( + lambda identifier: identifier.get("label", ""), + sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]), + ) + ) + # Append any remaining labels not already included. + for id in node["identifiers"]: + label = id.get("label", "") + if label not in possible_labels: + possible_labels.append(label) + break + + # Step 1.2 — fallback: use identifiers in their existing (Biolink prefix) order. + if not possible_labels: + possible_labels = list(map(lambda identifier: identifier.get("label", ""), node["identifiers"])) + + # Step 2 — filter blank labels. + filtered = [label for label in possible_labels if label] + + # Step 3 — per-type length demotion: find the limit for the most-specific matching type. + length_limit = None + for typ in types: + if typ in demote_labels_longer_than: + length_limit = demote_labels_longer_than[typ] + break + if length_limit is not None: + shorter = [label for label in filtered if len(label) <= length_limit] + if shorter: + filtered = shorter + + # Step 4 — return the first surviving label. + return filtered[0] if filtered else "" + + def get_numerical_curie_suffix(curie): """ If a CURIE has a numerical suffix, return it as an integer. Otherwise return None. @@ -459,6 +516,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # coming up with a preferred label for a particular Biolink class. preferred_name_boost_prefixes = config["preferred_name_boost_prefixes"] + # Load the per-type label length demotion config. Types not listed here are never demoted. + demote_labels_longer_than = config.get("demote_labels_longer_than", {}) + # Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download # directory. if not icrdf_filename: @@ -546,61 +606,8 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # Determine types. types = node_factory.get_ancestors(node["type"]) - # Generate a preferred label for this clique. - # - # To pick a preferred label for this clique, we need to do three things: - # 1. We sort all labels in the preferred-name order. By default, this should be - # the preferred CURIE order, but if this clique is in one of the Biolink classes in - # preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list. - # 2. We filter out any suspicious labels. - # (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the - # algorithm proposed by Jeff at - # https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850) - # 3. We filter out any labels longer than config['demote_labels_longer_than'], but only if there is - # at least one label shorter than this limit. - # 4. We choose the first label that isn't blank (that allows us to use our rule of smallest-prefix-first to find the broadest name for this concept). If no labels remain, we generate a warning. - - # Step 1.1. Sort labels in boosted prefix order if possible. - possible_labels = [] - for typ in types: - if typ in preferred_name_boost_prefixes: - # This is the most specific matching type, so we use this and then break. - possible_labels = list( - map( - lambda identifier: identifier.get("label", ""), - sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]), - ) - ) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for id in node["identifiers"]: - label = id.get("label", "") - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. - if not possible_labels: - possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"]) - - # Step 2. Filter out any suspicious labels. - filtered_possible_labels = [label for label in possible_labels if label] # Ignore blank or empty names. - - # Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at - # least one label shorter than this limit. - labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]] - if labels_shorter_than_limit: - filtered_possible_labels = labels_shorter_than_limit - - # Step 4. Pick the first label if it isn't blank. - if filtered_possible_labels: - preferred_name = filtered_possible_labels[0] - else: - preferred_name = "" + # Generate a preferred label for this clique using _select_preferred_label(). + preferred_name = _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than) # At this point, we insert any HAS_ADDITIONAL_ID IDs we have. # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long diff --git a/tests/README.md b/tests/README.md index 16fb3a2a..668e61b7 100644 --- a/tests/README.md +++ b/tests/README.md @@ -221,6 +221,18 @@ uv run pytest tests/pipeline/checks/ -k "xref" --pipeline --no-cov -v verifying that rate-limiting delays are correctly applied between requests. Requires `--network` to run. +### babel_utils/ + +- **`babel_utils/test_write_compendia.py`** (`unit`) — Unit tests for `_select_preferred_label()`, + the label-selection helper extracted from `write_compendium()`. Covers per-type length demotion + (demotion applies to chemicals and their subtypes via ancestor traversal; diseases, phenotypes, + and other non-chemical types are never demoted), interaction with `preferred_name_boost_prefixes`, + and the fall-through when all labels exceed the limit. Regression tests use real CURIEs from + [#597](https://github.com/NCATSTranslator/Babel/issues/597), + [#711](https://github.com/NCATSTranslator/Babel/issues/711), + [#714](https://github.com/NCATSTranslator/Babel/issues/714), and + [#723](https://github.com/NCATSTranslator/Babel/issues/723). + ## Test Data The `tests/data` directory contains fixture files used by several tests: diff --git a/tests/babel_utils/__init__.py b/tests/babel_utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/babel_utils/test_write_compendia.py b/tests/babel_utils/test_write_compendia.py new file mode 100644 index 00000000..1db5f104 --- /dev/null +++ b/tests/babel_utils/test_write_compendia.py @@ -0,0 +1,220 @@ +"""Unit tests for label-selection logic in write_compendium(). + +The helper under test is _select_preferred_label(), which is the extracted core of +the label-selection algorithm previously embedded in write_compendium(). + +Issue context: + https://github.com/NCATSTranslator/Babel/issues/597 — original report: good names demoted + https://github.com/NCATSTranslator/Babel/issues/714 — MONDO:0011479 "postural orthostatic + tachycardia syndrome" wrongly demoted to "Irritable heart" + https://github.com/NCATSTranslator/Babel/issues/711 — HP:0001508 "Failure to thrive" + wrongly demoted to "Undergrowth" + https://github.com/NCATSTranslator/Babel/issues/723 — MONDO:0005578 "arthritic joint + disease" wrongly demoted to "arthritis" +""" + +import pytest + +from src.babel_utils import _select_preferred_label + + +def _node(identifiers): + """Build a minimal node dict from a list of (curie, label_or_None) tuples.""" + ids = [] + for curie, label in identifiers: + entry = {"identifier": curie} + if label is not None: + entry["label"] = label + ids.append(entry) + return {"identifiers": ids} + + +# --------------------------------------------------------------------------- +# Ancestor lists (mirrors what node_factory.get_ancestors() returns) +# --------------------------------------------------------------------------- + +DISEASE_ANCESTORS = [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", +] + +PHENOTYPIC_FEATURE_ANCESTORS = [ + "biolink:PhenotypicFeature", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", +] + +CHEMICAL_ENTITY_ANCESTORS = [ + "biolink:ChemicalEntity", + "biolink:PhysicalEssence", + "biolink:NamedThing", +] + +SMALL_MOLECULE_ANCESTORS = [ + "biolink:SmallMolecule", + "biolink:ChemicalEntity", + "biolink:PhysicalEssence", + "biolink:NamedThing", +] + +DRUG_ANCESTORS = [ + "biolink:Drug", + "biolink:ChemicalEntity", + "biolink:PhysicalEssence", + "biolink:NamedThing", +] + +DEMOTE_CHEMICALS_25 = {"biolink:ChemicalEntity": 25} + + +# --------------------------------------------------------------------------- +# Regression tests from GitHub issues +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_pots_label_not_demoted(): + """MONDO:0011479 — "postural orthostatic tachycardia syndrome" (40 chars) must not be + demoted to "Irritable heart" (14 chars) for biolink:Disease. + https://github.com/NCATSTranslator/Babel/issues/714 + """ + node = _node([ + ("MONDO:0011479", "postural orthostatic tachycardia syndrome"), + ("UMLS:C2930833", "Irritable heart"), + ]) + result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "postural orthostatic tachycardia syndrome" + + +@pytest.mark.unit +def test_failure_to_thrive_not_demoted(): + """HP:0001508 — "Failure to thrive" (18 chars) must not be demoted to "Undergrowth" + (10 chars) for biolink:PhenotypicFeature. + https://github.com/NCATSTranslator/Babel/issues/711 + """ + node = _node([ + ("HP:0001508", "Failure to thrive"), + ("UMLS:C4531021", "Undergrowth"), + ]) + result = _select_preferred_label(node, PHENOTYPIC_FEATURE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "Failure to thrive" + + +@pytest.mark.unit +def test_arthritic_joint_disease_not_demoted(): + """MONDO:0005578 — "arthritic joint disease" (22 chars) must not be demoted to + "arthritis" (9 chars) for biolink:Disease. + https://github.com/NCATSTranslator/Babel/issues/723 + """ + node = _node([ + ("MONDO:0005578", "arthritic joint disease"), + ("DOID:848", "arthritis"), + ]) + result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "arthritic joint disease" + + +# --------------------------------------------------------------------------- +# Chemical demotion — should still apply +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_chemical_long_iupac_demoted(): + """For biolink:ChemicalEntity, a long IUPAC name should be demoted in favour of a short + common name when a shorter label is available. + """ + node = _node([ + ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"), # 35 chars — too long + ("CHEBI:17334", "serine"), # would be a duplicate curie in practice, but label logic is independent + ]) + # Use two distinct CURIEs to get two distinct labels + node = _node([ + ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"), + ("PUBCHEM.COMPOUND:5951", "serine"), + ]) + result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "serine" + + +@pytest.mark.unit +def test_chemical_demotion_via_small_molecule_ancestor(): + """Demotion configured on biolink:ChemicalEntity should apply to biolink:SmallMolecule + (a subtype) via ancestor traversal. + """ + node = _node([ + ("CHEBI:17234", "(2R,3S,4S,5R)-2,3,4,5,6-pentahydroxyhexanal"), # very long IUPAC + ("PUBCHEM.COMPOUND:107526", "glucose"), + ]) + result = _select_preferred_label(node, SMALL_MOLECULE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "glucose" + + +@pytest.mark.unit +def test_chemical_demotion_via_drug_ancestor(): + """Demotion should also apply to biolink:Drug via ancestor traversal.""" + node = _node([ + ("DRUGBANK:DB00945", "acetylsalicylic acid"), # 20 chars — within limit + ("PUBCHEM.COMPOUND:2244", "aspirin"), + ]) + # acetylsalicylic acid (20) is within the limit, so it should be returned first + result = _select_preferred_label(node, DRUG_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "acetylsalicylic acid" + + +@pytest.mark.unit +def test_chemical_all_labels_long_keeps_first(): + """If all labels exceed the demotion limit, no demotion occurs and the first label is kept.""" + node = _node([ + ("CHEBI:100001", "some-very-long-iupac-name-that-exceeds-the-limit"), + ("PUBCHEM.COMPOUND:99999", "another-very-long-chemical-name-here"), + ]) + result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "some-very-long-iupac-name-that-exceeds-the-limit" + + +# --------------------------------------------------------------------------- +# Empty / no-config cases +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_no_demotion_when_config_is_empty(): + """When demote_labels_longer_than is an empty dict, no demotion occurs for any type.""" + node = _node([ + ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"), + ("PUBCHEM.COMPOUND:5951", "serine"), + ]) + result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, {}) + assert result == "(2S)-2-amino-3-hydroxypropanoic acid" + + +@pytest.mark.unit +def test_no_labels_returns_empty_string(): + """A node with no labels should return an empty string.""" + node = _node([("MONDO:0000001", None)]) + result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) + assert result == "" + + +# --------------------------------------------------------------------------- +# Interaction between boost prefixes and demotion +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_boost_prefix_then_demotion(): + """preferred_name_boost_prefixes reorders labels; demotion then filters by length. + DRUGBANK is boosted for ChemicalEntity, so a long DRUGBANK label is moved to the front — + but demotion should then skip it in favour of the shorter alternative. + """ + boost = {"biolink:ChemicalEntity": ["DRUGBANK", "CHEBI"]} + node = _node([ + ("CHEBI:27899", "cisplatin"), # 9 chars — short, not boosted first + ("DRUGBANK:DB00515", "cis-diaminedichloroplatinum(II)"), # 31 chars — boosted first but too long + ]) + result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, boost, DEMOTE_CHEMICALS_25) + assert result == "cisplatin"