Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 38 additions & 2 deletions src/dug/core/parsers/_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
import json
from typing import Union, Callable, Any, Iterable, Dict, List, Annotated, Literal
import re
from typing import Union, Callable, Any, Iterable, Dict, List, Annotated, Literal, override

from dug.core.loaders import InputFile

Expand Down Expand Up @@ -159,6 +160,41 @@ class DugVariable(DugElement):
data_type:str='text'
is_cde:bool=False

@override
@computed_field
@property
def ml_ready_desc(self) -> str:
    """
    Return a description of this variable for use in machine learning.

    The description always starts with the variable name verbatim. When the name is written in
    CamelCase or snake_case, or mixes letters and digits, a space-separated expansion of the name
    is added in parentheses (e.g. ``JSONVariable`` -> ``JSON Variable``,
    ``variable_collection123`` -> ``variable collection 123``).

    :return: ``"<name> (<expanded name>): <description>"`` if the expansion differs from the
        name, otherwise ``"<name>: <description>"``.
    """
    variable_name = self.name

    # TODO: can we incorporate the permissible values somehow?

    # Insert a space at every word boundary hidden inside the identifier. All alternatives are
    # zero-width (lookbehind + lookahead), so no characters of the name are consumed.
    cleaned_variable_name = re.sub(
        r'''
        (?<=[a-z])(?=[A-Z])          |   # lowercase -> uppercase       (variableName)
        (?<=[A-Z])(?=[A-Z][a-z])     |   # end of acronym -> next word  (JSONVariable)
        (?<=[A-Za-z])(?=[0-9])       |   # letter -> digit              (BMI25, collection123)
        (?<=[0-9])(?=[A-Za-z])           # digit -> letter              (123def)
        ''',
        ' ',
        variable_name,
        flags=re.VERBOSE,
    )

    # snake_case: runs of underscores become a single space. Strip afterwards so that leading or
    # trailing underscores (e.g. "_id") do not leave stray spaces inside the parentheses.
    cleaned_variable_name = re.sub(r'_+', ' ', cleaned_variable_name).strip()

    # Only add the expansion when it carries new information: it must be non-empty (a name made
    # only of underscores expands to nothing) and differ from the name itself, ignoring any
    # whitespace padding that the strip above removed.
    if cleaned_variable_name and cleaned_variable_name != variable_name.strip():
        return f"{variable_name} ({cleaned_variable_name}): {self.description}"
    return f"{variable_name}: {self.description}"

def get_searchable_dict(self):
# Translate DugConcept into Elastic-Compatible Concept
es_elem = super().get_searchable_dict()
Expand Down Expand Up @@ -205,4 +241,4 @@ def get_searchable_dict(self):
# Adapter for a list of DiscriminatedIndexable elements
# (presumably pydantic's TypeAdapter; the import is not visible in this hunk).
DugElementParsedList = TypeAdapter(List[DiscriminatedIndexable])

# Resolve forward references in the model annotations (presumably needed because the models
# refer to each other before all of them are defined; confirm against the class definitions).
# NOTE(review): TypeAdapter/computed_field suggest pydantic v2, where update_forward_refs() is
# deprecated in favour of model_rebuild(); confirm the pydantic version before changing.
DugElement.update_forward_refs()
DugConcept.update_forward_refs()
DugConcept.update_forward_refs()
34 changes: 34 additions & 0 deletions tests/unit/dug_data_model/test_dug_variable.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Tests for DugVariable.
import pytest

from src.dug.core.parsers import DugVariable

# Parametrization table for test_dug_variable_expansion.
# Each entry is (variable name, expected expansion): the expansion is the text that the test
# expects in parentheses after the verbatim name in DugVariable.ml_ready_desc.
DUG_VARIABLE_EXPANSION_TEST_CASES = [
("variable_name", "variable name"),
("VariableName", "Variable Name"),
("AnotherVariableName", "Another Variable Name"),
("JSONVariable", "JSON Variable"),
("yet_another_variableName", "yet another variable Name"),
("variable_collection123", "variable collection 123"),
("vde_123def", "vde 123 def"),
]

@pytest.mark.parametrize("variable_name,expanded_name", DUG_VARIABLE_EXPANSION_TEST_CASES)
def test_dug_variable_expansion(variable_name, expanded_name):
    """
    A CamelCase or snake_case variable name must show up in DugVariable.ml_ready_desc twice:
    once verbatim and once, in parentheses, in its space-separated form (to simplify NER).
    """
    dug_variable = DugVariable(id="var", name=variable_name, description="some desc")
    actual_desc = dug_variable.ml_ready_desc

    assert actual_desc == f"{variable_name} ({expanded_name}): some desc"


def test_dug_variable_no_expansion():
    """
    When a variable name is neither CamelCase nor snake_case, ml_ready_desc must contain only
    the verbatim name followed by the description, with no parenthesised expansion.
    """
    # Maps each plain variable name to the exact ml_ready_desc it should produce.
    expected_by_name = {
        "123": "123: some desc",
        "variablename": "variablename: some desc",
    }

    for plain_name, expected_desc in expected_by_name.items():
        dug_variable = DugVariable(id="var", name=plain_name, description="some desc")
        assert dug_variable.ml_ready_desc == expected_desc