Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -426,5 +426,10 @@ ensembl_datasets_to_skip:
- otshawytscha_gene_ensembl
- aocellaris_gene_ensembl

demote_labels_longer_than: 25
# Labels longer than this limit are demoted (not used as preferred label if a shorter alternative exists).
# Keyed by Biolink type; types not listed here are never demoted. Uses ancestor traversal, so
# biolink:ChemicalEntity applies to all chemical subtypes (SmallMolecule, Drug, etc.).
# See https://github.com/NCATSTranslator/Babel/issues/597
demote_labels_longer_than:
biolink:ChemicalEntity: 25

117 changes: 62 additions & 55 deletions src/babel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,63 @@ def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
)


def _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than):
"""Choose the preferred display label for a normalised node.

Steps:
1. Sort labels in boosted-prefix order if the node's most-specific type has an entry in
preferred_name_boost_prefixes; otherwise use Biolink prefix order.
2. Filter blank labels.
3. Demote labels longer than the per-type limit (if the type has an entry in
demote_labels_longer_than). Types with no entry are never demoted.
See https://github.com/NCATSTranslator/Babel/issues/597
4. Return the first surviving label, or "" if none remain.

:param node: A node dict with "identifiers" (list of dicts with "identifier" and optionally "label") and "type".
:param types: Ancestor types for this node, most-specific first.
:param preferred_name_boost_prefixes: Dict mapping Biolink type → list of boosted prefixes (from config).
:param demote_labels_longer_than: Dict mapping Biolink type → int length limit (from config).
:return: The preferred label string, or "" if no label is available.
"""
# Step 1.1 — sort by boosted prefix order for the most-specific matching type.
possible_labels = []
for typ in types:
if typ in preferred_name_boost_prefixes:
possible_labels = list(
map(
lambda identifier: identifier.get("label", ""),
sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]),
)
)
# Append any remaining labels not already included.
for id in node["identifiers"]:
label = id.get("label", "")
if label not in possible_labels:
possible_labels.append(label)
break

# Step 1.2 — fallback: use identifiers in their existing (Biolink prefix) order.
if not possible_labels:
possible_labels = list(map(lambda identifier: identifier.get("label", ""), node["identifiers"]))

# Step 2 — filter blank labels.
filtered = [label for label in possible_labels if label]

# Step 3 — per-type length demotion: find the limit for the most-specific matching type.
length_limit = None
for typ in types:
if typ in demote_labels_longer_than:
length_limit = demote_labels_longer_than[typ]
break
if length_limit is not None:
shorter = [label for label in filtered if len(label) <= length_limit]
if shorter:
filtered = shorter

# Step 4 — return the first surviving label.
return filtered[0] if filtered else ""


def get_numerical_curie_suffix(curie):
"""
If a CURIE has a numerical suffix, return it as an integer. Otherwise return None.
Expand Down Expand Up @@ -459,6 +516,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
# coming up with a preferred label for a particular Biolink class.
preferred_name_boost_prefixes = config["preferred_name_boost_prefixes"]

# Load the per-type label length demotion config. Types not listed here are never demoted.
demote_labels_longer_than = config.get("demote_labels_longer_than", {})

# Create an InformationContentFactory based on the specified icRDF.tsv file. Default to the one in the download
# directory.
if not icrdf_filename:
Expand Down Expand Up @@ -546,61 +606,8 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non
# Determine types.
types = node_factory.get_ancestors(node["type"])

# Generate a preferred label for this clique.
#
# To pick a preferred label for this clique, we need to do four things:
# 1. We sort all labels in the preferred-name order. By default, this should be
# the preferred CURIE order, but if this clique is in one of the Biolink classes in
# preferred_name_boost_prefixes, we boost those prefixes in that order to the top of the list.
# 2. We filter out any suspicious labels.
# (If this simple filter doesn't work, and if prefixes are inconsistent, we can build upon the
# algorithm proposed by Jeff at
# https://github.com/NCATSTranslator/Feedback/issues/259#issuecomment-1605140850)
# 3. We filter out any labels longer than config['demote_labels_longer_than'], but only if there is
# at least one label shorter than this limit.
# 4. We choose the first label that isn't blank (that allows us to use our rule of smallest-prefix-first to find the broadest name for this concept). If no labels remain, we generate a warning.

# Step 1.1. Sort labels in boosted prefix order if possible.
possible_labels = []
for typ in types:
if typ in preferred_name_boost_prefixes:
# This is the most specific matching type, so we use this and then break.
possible_labels = list(
map(
lambda identifier: identifier.get("label", ""),
sort_identifiers_with_boosted_prefixes(node["identifiers"], preferred_name_boost_prefixes[typ]),
)
)

# Add in all the other labels -- we'd still like to consider them, but at a lower priority.
for id in node["identifiers"]:
label = id.get("label", "")
if label not in possible_labels:
possible_labels.append(label)

# Since this is the most specific matching type, we shouldn't do other (presumably higher-level)
# categories: so let's break here.
break

# Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
# Biolink prefix order.
if not possible_labels:
possible_labels = map(lambda identifier: identifier.get("label", ""), node["identifiers"])

# Step 2. Filter out any suspicious labels.
filtered_possible_labels = [label for label in possible_labels if label] # Ignore blank or empty names.

# Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at
# least one label shorter than this limit.
labels_shorter_than_limit = [label for label in filtered_possible_labels if label and len(label) <= config["demote_labels_longer_than"]]
if labels_shorter_than_limit:
filtered_possible_labels = labels_shorter_than_limit

# Step 4. Pick the first label if it isn't blank.
if filtered_possible_labels:
preferred_name = filtered_possible_labels[0]
else:
preferred_name = ""
# Generate a preferred label for this clique using _select_preferred_label().
preferred_name = _select_preferred_label(node, types, preferred_name_boost_prefixes, demote_labels_longer_than)

# At this point, we insert any HAS_ADDITIONAL_ID IDs we have.
# The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long
Expand Down
12 changes: 12 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,18 @@ uv run pytest tests/pipeline/checks/ -k "xref" --pipeline --no-cov -v
verifying that rate-limiting delays are correctly applied between requests.
Requires `--network` to run.

### babel_utils/

- **`babel_utils/test_write_compendia.py`** (`unit`) — Unit tests for `_select_preferred_label()`,
the label-selection helper extracted from `write_compendium()`. Covers per-type length demotion
(demotion applies to chemicals and their subtypes via ancestor traversal; diseases, phenotypes,
and other non-chemical types are never demoted), interaction with `preferred_name_boost_prefixes`,
and the fall-through when all labels exceed the limit. Regression tests use real CURIEs from
[#597](https://github.com/NCATSTranslator/Babel/issues/597),
[#711](https://github.com/NCATSTranslator/Babel/issues/711),
[#714](https://github.com/NCATSTranslator/Babel/issues/714), and
[#723](https://github.com/NCATSTranslator/Babel/issues/723).

## Test Data

The `tests/data` directory contains fixture files used by several tests:
Expand Down
Empty file added tests/babel_utils/__init__.py
Empty file.
220 changes: 220 additions & 0 deletions tests/babel_utils/test_write_compendia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
"""Unit tests for label-selection logic in write_compendium().

The helper under test is _select_preferred_label(), which is the extracted core of
the label-selection algorithm previously embedded in write_compendium().

Issue context:
https://github.com/NCATSTranslator/Babel/issues/597 — original report: good names demoted
https://github.com/NCATSTranslator/Babel/issues/714 — MONDO:0011479 "postural orthostatic
tachycardia syndrome" wrongly demoted to "Irritable heart"
https://github.com/NCATSTranslator/Babel/issues/711 — HP:0001508 "Failure to thrive"
wrongly demoted to "Undergrowth"
https://github.com/NCATSTranslator/Babel/issues/723 — MONDO:0005578 "arthritic joint
disease" wrongly demoted to "arthritis"
"""

import pytest

from src.babel_utils import _select_preferred_label


def _node(identifiers):
"""Build a minimal node dict from a list of (curie, label_or_None) tuples."""
ids = []
for curie, label in identifiers:
entry = {"identifier": curie}
if label is not None:
entry["label"] = label
ids.append(entry)
return {"identifiers": ids}


# ---------------------------------------------------------------------------
# Ancestor lists (mirrors what node_factory.get_ancestors() returns)
# ---------------------------------------------------------------------------

# Ancestors shared by the disease and phenotypic-feature fixtures below.
_DISEASE_OR_PHENOTYPE_TAIL = [
    "biolink:DiseaseOrPhenotypicFeature",
    "biolink:BiologicalEntity",
    "biolink:NamedThing",
]

DISEASE_ANCESTORS = ["biolink:Disease", *_DISEASE_OR_PHENOTYPE_TAIL]

PHENOTYPIC_FEATURE_ANCESTORS = ["biolink:PhenotypicFeature", *_DISEASE_OR_PHENOTYPE_TAIL]

CHEMICAL_ENTITY_ANCESTORS = [
    "biolink:ChemicalEntity",
    "biolink:PhysicalEssence",
    "biolink:NamedThing",
]

# Chemical subtypes: the subtype itself followed by the ChemicalEntity ancestor chain.
SMALL_MOLECULE_ANCESTORS = ["biolink:SmallMolecule", *CHEMICAL_ENTITY_ANCESTORS]

DRUG_ANCESTORS = ["biolink:Drug", *CHEMICAL_ENTITY_ANCESTORS]

# Length-demotion config used throughout these tests: only chemicals, limit of 25 characters.
DEMOTE_CHEMICALS_25 = {"biolink:ChemicalEntity": 25}


# ---------------------------------------------------------------------------
# Regression tests from GitHub issues
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_pots_label_not_demoted():
    """MONDO:0011479 — "postural orthostatic tachycardia syndrome" (41 chars) must not be
    demoted to "Irritable heart" (15 chars) for biolink:Disease.

    The only length limit configured here is keyed on biolink:ChemicalEntity, which is not
    among the Disease ancestors, so the first label must be returned even though it exceeds
    25 characters.
    https://github.com/NCATSTranslator/Babel/issues/714
    """
    node = _node([
        ("MONDO:0011479", "postural orthostatic tachycardia syndrome"),
        ("UMLS:C2930833", "Irritable heart"),
    ])
    result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "postural orthostatic tachycardia syndrome"


@pytest.mark.unit
def test_failure_to_thrive_not_demoted():
    """HP:0001508 — "Failure to thrive" (17 chars) must not be demoted to "Undergrowth"
    (11 chars) for biolink:PhenotypicFeature.
    https://github.com/NCATSTranslator/Babel/issues/711
    """
    node = _node([
        ("HP:0001508", "Failure to thrive"),
        ("UMLS:C4531021", "Undergrowth"),
    ])
    result = _select_preferred_label(node, PHENOTYPIC_FEATURE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "Failure to thrive"


@pytest.mark.unit
def test_arthritic_joint_disease_not_demoted():
    """MONDO:0005578 — "arthritic joint disease" (23 chars) must not be demoted to
    "arthritis" (9 chars) for biolink:Disease.
    https://github.com/NCATSTranslator/Babel/issues/723
    """
    node = _node([
        ("MONDO:0005578", "arthritic joint disease"),
        ("DOID:848", "arthritis"),
    ])
    result = _select_preferred_label(node, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "arthritic joint disease"


# ---------------------------------------------------------------------------
# Chemical demotion — should still apply
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_chemical_long_iupac_demoted():
    """For biolink:ChemicalEntity, a long IUPAC name should be demoted in favour of a short
    common name when a shorter label is available.
    """
    # Two distinct CURIEs carry the two labels; the long one comes first so that only
    # length demotion can make the short one win.
    node = _node([
        ("CHEBI:17334", "(2S)-2-amino-3-hydroxypropanoic acid"),  # 36 chars — over the 25-char limit
        ("PUBCHEM.COMPOUND:5951", "serine"),
    ])
    result = _select_preferred_label(node, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "serine"


@pytest.mark.unit
def test_chemical_demotion_via_small_molecule_ancestor():
    """A limit keyed on biolink:ChemicalEntity must also govern biolink:SmallMolecule nodes,
    because ChemicalEntity appears in the SmallMolecule ancestor list.
    """
    iupac_name = "(2R,3S,4S,5R)-2,3,4,5,6-pentahydroxyhexanal"  # very long IUPAC
    clique = _node([("CHEBI:17234", iupac_name), ("PUBCHEM.COMPOUND:107526", "glucose")])
    chosen = _select_preferred_label(clique, SMALL_MOLECULE_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert chosen == "glucose"


@pytest.mark.unit
def test_chemical_demotion_via_drug_ancestor():
    """Demotion should also apply to biolink:Drug via ancestor traversal.

    Two cases are checked: a first label within the limit is kept as-is, and a first label
    over the limit is demoted in favour of the shorter alternative.
    """
    # acetylsalicylic acid (20 chars) is within the limit, so it should be returned first.
    node = _node([
        ("DRUGBANK:DB00945", "acetylsalicylic acid"),
        ("PUBCHEM.COMPOUND:2244", "aspirin"),
    ])
    result = _select_preferred_label(node, DRUG_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "acetylsalicylic acid"

    # An illustrative 31-char systematic name exceeds the limit, so demotion must actually
    # fire for a Drug node and select the shorter label.
    long_first = _node([
        ("DRUGBANK:DB00945", "2-acetoxybenzenecarboxylic acid"),
        ("PUBCHEM.COMPOUND:2244", "aspirin"),
    ])
    result = _select_preferred_label(long_first, DRUG_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert result == "aspirin"


@pytest.mark.unit
def test_chemical_all_labels_long_keeps_first():
    """When no label fits within the demotion limit, nothing is demoted and the first label wins."""
    first_long = "some-very-long-iupac-name-that-exceeds-the-limit"
    second_long = "another-very-long-chemical-name-here"
    clique = _node([("CHEBI:100001", first_long), ("PUBCHEM.COMPOUND:99999", second_long)])
    chosen = _select_preferred_label(clique, CHEMICAL_ENTITY_ANCESTORS, {}, DEMOTE_CHEMICALS_25)
    assert chosen == first_long


# ---------------------------------------------------------------------------
# Empty / no-config cases
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_no_demotion_when_config_is_empty():
    """With an empty demote_labels_longer_than dict, even a chemical keeps its long first label."""
    long_name = "(2S)-2-amino-3-hydroxypropanoic acid"
    clique = _node([("CHEBI:17334", long_name), ("PUBCHEM.COMPOUND:5951", "serine")])
    chosen = _select_preferred_label(clique, CHEMICAL_ENTITY_ANCESTORS, {}, {})
    assert chosen == long_name


@pytest.mark.unit
def test_no_labels_returns_empty_string():
    """If no identifier carries a label, the selector yields the empty string."""
    unlabelled = _node([("MONDO:0000001", None)])
    assert _select_preferred_label(unlabelled, DISEASE_ANCESTORS, {}, DEMOTE_CHEMICALS_25) == ""


# ---------------------------------------------------------------------------
# Interaction between boost prefixes and demotion
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_boost_prefix_then_demotion():
    """Boosting and demotion combine: ordering first, then the length filter.

    DRUGBANK is listed ahead of CHEBI in the boost config for ChemicalEntity, so the long
    DRUGBANK label is expected to be ordered first — yet length demotion should still pass
    over it and pick the shorter CHEBI label.
    """
    boosted_prefixes = {"biolink:ChemicalEntity": ["DRUGBANK", "CHEBI"]}
    short_chebi = ("CHEBI:27899", "cisplatin")  # 9 chars — within the limit
    long_drugbank = ("DRUGBANK:DB00515", "cis-diaminedichloroplatinum(II)")  # 31 chars — over the limit
    clique = _node([short_chebi, long_drugbank])
    chosen = _select_preferred_label(clique, CHEMICAL_ENTITY_ANCESTORS, boosted_prefixes, DEMOTE_CHEMICALS_25)
    assert chosen == "cisplatin"
Loading