NCATSTranslator · gaurav · Apr 18, 2026
diff --git a/config.yaml b/config.yaml
@@ -426,5 +426,10 @@ ensembl_datasets_to_skip:
   - otshawytscha_gene_ensembl
   - aocellaris_gene_ensembl
 
-demote_labels_longer_than: 25
+# Labels longer than this limit are demoted (not used as preferred label if a shorter alternative exists).
+# Keyed by Biolink type; types not listed here are never demoted. Uses ancestor traversal, so
+# biolink:ChemicalEntity applies to all chemical subtypes (SmallMolecule, Drug, etc.).
+# See https://github.com/NCATSTranslator/Babel/issues/597
+demote_labels_longer_than:
+  biolink:ChemicalEntity: 25
 
diff --git a/src/babel_utils.py b/src/babel_utils.py
@@ -404,6 +404,63 @@ def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
     )
 
 
+def _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than):
+    """Choose the preferred display label for a normalised node.
+
+    Steps:
+    1. Sort labels in boosted-prefix order if the node's most-specific type has an entry in
+       preferred_name_boost_prefixes; otherwise use Biolink prefix order.
+    2. Filter blank labels.
+    3. Demote labels longer than the per-type limit (if the type has an entry in
+       demote_labels_longer_than). Types with no entry are never demoted.
+       See https://github.com/NCATSTranslator/Babel/issues/597
+    4. Return the first surviving label, or "" if none remain.
+
+    :param node: A node dict with "identifiers" (list of dicts with "identifier" and optionally "label") and "type".
+    :param types: Ancestor types for this node, most-specific first.
+    :param preferred_name_boost_prefixes: Dict mapping Biolink type → list of boosted prefixes (from config).
+    :param demote_labels_longer_than: Dict mapping Biolink type → int length limit (from config).
+    :return: The preferred label string, or "" if no label is available.
+    """
+    # Step 1.1 — sort by boosted prefix order for the most-specific matching type.
+    possible_labels = []
+    for typ in types:
+        if typ in preferred_name_boost_prefixes:
+            possible_labels = list(
+                map(
+                    lambda identifier: identifier.get("label", ""),
+                    sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]),
+                )
+            )
+            # Append any remaining labels not already included.
+            for id in node["identifiers"]:
+                label = id.get("label", "")
+                if label not in possible_labels:
+                    possible_labels.append(label)
+            break
+
+    # Step 1.2 — fallback: use identifiers in their existing (Biolink prefix) order.
+    if not possible_labels:
+        possible_labels = list(map(lambda identifier: identifier.get("label", ""), node["identifiers"]))
+
+    # Step 2 — filter blank labels.
+    filtered = [label for label in possible_labels if label]
+
+    # Step 3 — per-type length demotion: find the limit for the most-specific matching type.
+    length_limit = None
+    for typ in types:
+        if typ in demote_labels_longer_than:
+            length_limit = demote_labels_longer_than[typ]
+            break
+    if length_limit is not None:
+        shorter = [label for label in filtered if len(label) <= length_limit]
+        if shorter:
+            filtered = shorter
+
+    # Step 4 — return the first surviving label.
+    return filtered[0] if filtered else ""
+
+
 def get_numerical_curie_suffix(curie):
     """
     If a CURIE has a numerical suffix, return it as an integer. Otherwise return None.
@@ -459,6 +516,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
     # coming up with a preferred label for a particular Biolink class.
     preferred_name_boost_prefixes = config["preferred_name_boost_prefixes"]
 
+    # Load the per-type label length demotion config. Types not listed here are never demoted.
+    demote_labels_longer_than = config.get("demote_labels_longer_than", {})
+
     # Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download
     # directory.
     if not icrdf_filename:
@@ -546,61 +606,8 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
                 # Determine types.
                 types = node_factory.get_ancestors(node["type"])
 
-                # Generate a preferred label for this clique.
-                #
-                # To pick a preferred label for this clique, we need to do three things:
-                # 1. We sort all labels in the preferred-name order. By default, this should be
-                #    the preferred CURIE order, but if this clique is in one of the Biolink classes in
-                #    preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
-                # 2. We filter out any suspicious labels.
-                #    (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
-                #    algorithm proposed by Jeff at
-                #    https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
-                # 3. We filter out any labels longer than config['demote_labels_longer_than'], but only if there is
-                #    at least one label shorter than this limit.
-                # 4. We choose the first label that isn't blank (that allows us to use our rule of smallest-prefix-first to find the broadest name for this concept). If no labels remain, we generate a warning.
-
-                # Step 1.1. Sort labels in boosted prefix order if possible.
-                possible_labels = []
-                for typ in types:
-                    if typ in preferred_name_boost_prefixes:
-                        # This is the most specific matching type, so we use this and then break.
-                        possible_labels = list(
-                            map(
-                                lambda identifier: identifier.get("label", ""),
-                                sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]),
-                            )
-                        )
-
-                        # Add in all the other labels -- we'd still like to consider them, but at a lower priority.
-                        for id in node["identifiers"]:
-                            label = id.get("label", "")
-                            if label not in possible_labels:
-                                possible_labels.append(label)
-
-                        # Since this is the most specific matching type, we shouldn't do other (presumably higher-level)
-                        # categories: so let's break here.
-                        break
-
-                # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
-                # Biolink prefix order.
-                if not possible_labels:
-                    possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"])
-
-                # Step 2. Filter out any suspicious labels.
-                filtered_possible_labels = [label for label in possible_labels if label]  # Ignore blank or empty names.
-
-                # Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
-                # least one label shorter than this limit.
-                labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]]
-                if labels_shorter_than_limit:
-                    filtered_possible_labels = labels_shorter_than_limit
-
-                # Step 4. Pick the first label if it isn't blank.
-                if filtered_possible_labels:
-                    preferred_name = filtered_possible_labels[0]
-                else:
-                    preferred_name = ""
+                # Generate a preferred label for this clique using _select_preferred_label().
+                preferred_name = _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than)
 
                 # At this point, we insert any HAS_ADDITIONAL_ID IDs we have.
                 # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long

diff --git a/tests/README.md b/tests/README.md
@@ -221,6 +221,18 @@ uv run pytest tests/pipeline/checks/ -k "xref" --pipeline --no-cov -v
   verifying that rate-limiting delays are correctly applied between requests.
   Requires `--network` to run.
 
+### babel_utils/
+
+- **`babel_utils/test_write_compendia.py`** (`unit`) — Unit tests for `_select_preferred_label()`,
+  the label-selection helper extracted from `write_compendium()`. Covers per-type length demotion
+  (demotion applies to chemicals and their subtypes via ancestor traversal; diseases, phenotypes,
+  and other non-chemical types are never demoted), interaction with `preferred_name_boost_prefixes`,
+  and the fall-through when all labels exceed the limit. Regression tests use real CURIEs from
+  [#597](https://github.com/NCATSTranslator/Babel/issues/597),
+  [#711](https://github.com/NCATSTranslator/Babel/issues/711),
+  [#714](https://github.com/NCATSTranslator/Babel/issues/714), and
+  [#723](https://github.com/NCATSTranslator/Babel/issues/723).
+
 ## Test Data
 
 The `tests/data` directory contains fixture files used by several tests:

diff --git a/tests/babel_utils/__init__.py b/tests/babel_utils/__init__.py
diff --git a/tests/babel_utils/test_write_compendia.py b/tests/babel_utils/test_write_compendia.py
@@ -0,0 +1,220 @@
+"""Unit tests for label-selection logic in write_compendium().
+
+The helper under test is _select_preferred_label(), which is the extracted core of
+the label-selection algorithm previously embedded in write_compendium().
+
+Issue context:
+  https://github.com/NCATSTranslator/Babel/issues/597  — original report: good names demoted
+  https://github.com/NCATSTranslator/Babel/issues/714  — MONDO:0011479 "postural orthostatic
+      tachycardia syndrome" wrongly demoted to "Irritable heart"
+  https://github.com/NCATSTranslator/Babel/issues/711  — HP:0001508 "Failure to thrive"
+      wrongly demoted to "Undergrowth"
+  https://github.com/NCATSTranslator/Babel/issues/723  — MONDO:0005578 "arthritic joint
+      disease" wrongly demoted to "arthritis"
+"""
+
+import pytest
+
+from src.babel_utils import _select_preferred_label
+
+
+def _node(identifiers):
+    """Build a minimal node dict from a list of (curie, label_or_None) tuples."""
+    ids = []
+    for curie, label in identifiers:
+        entry = {"identifier": curie}
+        if label is not None:
+            entry["label"] = label
+        ids.append(entry)
+    return {"identifiers": ids}
+
+
+# ---------------------------------------------------------------------------
+# Ancestor lists (mirrors what node_factory.get_ancestors() returns)
+# ---------------------------------------------------------------------------
+
+DISEASE_ANCESTORS = [
+    "biolink:Disease",
+    "biolink:DiseaseOrPhenotypicFeature",
+    "biolink:BiologicalEntity",
+    "biolink:NamedThing",
+]
+
+PHENOTYPIC_FEATURE_ANCESTORS = [
+    "biolink:PhenotypicFeature",
+    "biolink:DiseaseOrPhenotypicFeature",
+    "biolink:BiologicalEntity",
+    "biolink:NamedThing",
+]
+
+CHEMICAL_ENTITY_ANCESTORS = [
+    "biolink:ChemicalEntity",
+    "biolink:PhysicalEssence",
+    "biolink:NamedThing",
+]
+
+SMALL_MOLECULE_ANCESTORS = [
+    "biolink:SmallMolecule",
+    "biolink:ChemicalEntity",
+    "biolink:PhysicalEssence",
+    "biolink:NamedThing",
+]
+
+DRUG_ANCESTORS = [
+    "biolink:Drug",
+    "biolink:ChemicalEntity",
+    "biolink:PhysicalEssence",
+    "biolink:NamedThing",
+]
+
+DEMOTE_CHEMICALS_25 = {"biolink:ChemicalEntity": 25}
+
+
+# ---------------------------------------------------------------------------
+# Regression tests from GitHub issues
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.unit
+def test_pots_label_not_demoted():
+    """MONDO:0011479 — "postural orthostatic tachycardia syndrome" (40 chars) must not be
+    demoted to "Irritable heart" (14 chars) for biolink:Disease.
+    https://github.com/NCATSTranslator/Babel/issues/714
+    """
+    node = _node([
+        ("MONDO:0011479", "postural orthostatic tachycardia syndrome"),
+        ("UMLS:C2930833", "Irritable heart"),
+    ])
+    result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "postural orthostatic tachycardia syndrome"
+
+
+@pytest.mark.unit
+def test_failure_to_thrive_not_demoted():
+    """HP:0001508 — "Failure to thrive" (18 chars) must not be demoted to "Undergrowth"
+    (10 chars) for biolink:PhenotypicFeature.
+    https://github.com/NCATSTranslator/Babel/issues/711
+    """
+    node = _node([
+        ("HP:0001508", "Failure to thrive"),
+        ("UMLS:C4531021", "Undergrowth"),
+    ])
+    result = _select_preferred_label(node, PHENOTYPIC_FEATURE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "Failure to thrive"
+
+
+@pytest.mark.unit
+def test_arthritic_joint_disease_not_demoted():
+    """MONDO:0005578 — "arthritic joint disease" (22 chars) must not be demoted to
+    "arthritis" (9 chars) for biolink:Disease.
+    https://github.com/NCATSTranslator/Babel/issues/723
+    """
+    node = _node([
+        ("MONDO:0005578", "arthritic joint disease"),
+        ("DOID:848", "arthritis"),
+    ])
+    result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "arthritic joint disease"
+
+
+# ---------------------------------------------------------------------------
+# Chemical demotion — should still apply
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.unit
+def test_chemical_long_iupac_demoted():
+    """For biolink:ChemicalEntity, a long IUPAC name should be demoted in favour of a short
+    common name when a shorter label is available.
+    """
+    node = _node([
+        ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"),  # 35 chars — too long
+        ("CHEBI:17334", "serine"),  # would be a duplicate curie in practice, but label logic is independent
+    ])
+    # Use two distinct CURIEs to get two distinct labels
+    node = _node([
+        ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"),
+        ("PUBCHEM.COMPOUND:5951", "serine"),
+    ])
+    result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "serine"
+
+
+@pytest.mark.unit
+def test_chemical_demotion_via_small_molecule_ancestor():
+    """Demotion configured on biolink:ChemicalEntity should apply to biolink:SmallMolecule
+    (a subtype) via ancestor traversal.
+    """
+    node = _node([
+        ("CHEBI:17234", "(2R,3S,4S,5R)-2,3,4,5,6-pentahydroxyhexanal"),  # very long IUPAC
+        ("PUBCHEM.COMPOUND:107526", "glucose"),
+    ])
+    result = _select_preferred_label(node, SMALL_MOLECULE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "glucose"
+
+
+@pytest.mark.unit
+def test_chemical_demotion_via_drug_ancestor():
+    """Demotion should also apply to biolink:Drug via ancestor traversal."""
+    node = _node([
+        ("DRUGBANK:DB00945", "acetylsalicylic acid"),  # 20 chars — within limit
+        ("PUBCHEM.COMPOUND:2244", "aspirin"),
+    ])
+    # acetylsalicylic acid (20) is within the limit, so it should be returned first
+    result = _select_preferred_label(node, DRUG_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "acetylsalicylic acid"
+
+
+@pytest.mark.unit
+def test_chemical_all_labels_long_keeps_first():
+    """If all labels exceed the demotion limit, no demotion occurs and the first label is kept."""
+    node = _node([
+        ("CHEBI:100001", "some-very-long-iupac-name-that-exceeds-the-limit"),
+        ("PUBCHEM.COMPOUND:99999", "another-very-long-chemical-name-here"),
+    ])
+    result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == "some-very-long-iupac-name-that-exceeds-the-limit"
+
+
+# ---------------------------------------------------------------------------
+# Empty / no-config cases
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.unit
+def test_no_demotion_when_config_is_empty():
+    """When demote_labels_longer_than is an empty dict, no demotion occurs for any type."""
+    node = _node([
+        ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"),
+        ("PUBCHEM.COMPOUND:5951", "serine"),
+    ])
+    result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, {})
+    assert result == "(2S)-2-amino-3-hydroxypropanoic acid"
+
+
+@pytest.mark.unit
+def test_no_labels_returns_empty_string():
+    """A node with no labels should return an empty string."""
+    node = _node([("MONDO:0000001", None)])
+    result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
+    assert result == ""
+
+
+# ---------------------------------------------------------------------------
+# Interaction between boost prefixes and demotion
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.unit
+def test_boost_prefix_then_demotion():
+    """preferred_name_boost_prefixes reorders labels; demotion then filters by length.
+    DRUGBANK is boosted for ChemicalEntity, so a long DRUGBANK label is moved to the front —
+    but demotion should then skip it in favour of the shorter alternative.
+    """
+    boost = {"biolink:ChemicalEntity": ["DRUGBANK", "CHEBI"]}
+    node = _node([
+        ("CHEBI:27899", "cisplatin"),               # 9 chars — short, not boosted first
+        ("DRUGBANK:DB00515", "cis-diaminedichloroplatinum(II)"),  # 31 chars — boosted first but too long
+    ])
+    result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, boost, DEMOTE_CHEMICALS_25)
+    assert result == "cisplatin"