Skip to content

Commit

Permalink
Remove deprecated variants module
Browse files Browse the repository at this point in the history
Spelling variants are better handled with a normalization step instead of an exponential increase of expansion candidates, which led to very slow processing and several bugs. This refs bst-mug#87 and closes bst-mug#98.

Also, `get_acro_def_pair_score`, which was originally intended for web-based inputs (i.e. text with acronym-definition pairs), is now removed.
  • Loading branch information
michelole committed Aug 21, 2020
1 parent a88817b commit 9b4146f
Show file tree
Hide file tree
Showing 5 changed files with 2 additions and 291 deletions.
66 changes: 0 additions & 66 deletions acres/rater/rater.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from acres.rater import expansion
from acres.rater import full as full_rater
from acres.util import acronym as acro_util
from acres.util import variants as varianter

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -107,68 +106,3 @@ def get_acronym_score(acro: str, full: str) -> float:
return 0

return _calc_score(acro, full)


def get_acronym_score_variants(acro: str, full: str) -> float:
    """
    Score an acronym against every spelling variant of a full form and keep the best result.

    Thin wrapper around `get_acronym_score`: the candidate full form is expanded with
    `varianter.generate_all_variants_by_rules` and each variant is scored on its own.
    Variants matter when checking for valid German expansions, mostly because of the
    inconsistent use of k, c, and z in clinical texts. Further substitution rules or frequent
    translations can be added in `varianter.generate_all_variants_by_rules`.

    .. deprecated:: 0.1
        Variants have not been used recently (e.g. not used in Michel's PhD Thesis).

    :param acro: Acronym to be scored.
    :param full: Candidate full form from which the variants are generated.
    :return: Highest score among all variants; 0.0 when no variant scores above zero.
    """
    # Seed with 0.0 so the result never drops below zero, even when no variant is generated.
    candidate_scores = [0.0]
    candidate_scores.extend(
        get_acronym_score(acro, variant)
        for variant in varianter.generate_all_variants_by_rules(full)
    )
    return max(candidate_scores)


def get_acro_def_pair_score(acro: str, full: str) -> Tuple[str, float]:
    """
    Score an acronym against a full form that may itself be an acronym-definition pair.

    Wrapper around `get_acronym_score_variants`. The scoring should work both for acronyms
    extracted from a corpus (where strict matching criteria apply) and for acronyms harvested
    from the Web, where the criteria may be relaxed once there is strong evidence from an
    acronym - definition pattern, e.g. "ARDS (akutes Atemnotsyndrom)". Such patterns might also
    occur in well-written clinical documents. In that case `full` is a single string holding
    both the acronym and its expansion.

    :param acro: Acronym to be scored.
    :param full: Candidate full form, possibly containing an acronym-definition pattern.
    :return: Tuple with the definition that was scored and its score. The score is multiplied
        by 10 whenever an acronym-definition pattern was found in `full`.
    """
    # An acronym-definition pattern is normally only yielded from Web scraping and is
    # unlikely in clinical texts.
    # NOTE(review): the meaning of the second argument (7) is not visible here; confirm
    # against `acro_util.extract_acronym_definition`.
    pair = acro_util.extract_acronym_definition(full, 7)
    if pair is None:
        # No pattern: score the full form as given, without any boost.
        return full, get_acronym_score_variants(acro, full)

    # The embedded acronym is dropped from the full form only when it is the one being
    # scored; otherwise the whole string is kept (it might be something else).
    definition = pair[1] if pair[0] == acro else full

    # XXX Maybe we shouldn't consider variants in case it's an acronym-definition pair
    score = get_acronym_score_variants(acro, definition)
    score *= 10
    return definition, score
4 changes: 2 additions & 2 deletions acres/util/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Package with general utilities modules.
"""
from acres.util import acronym, functions, text, variants
from acres.util import acronym, functions, text

__all__ = ['acronym', 'functions', 'text', 'variants']
__all__ = ['acronym', 'functions', 'text']
149 changes: 0 additions & 149 deletions acres/util/variants.py

This file was deleted.

23 changes: 0 additions & 23 deletions tests/rater/test_rater.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,3 @@ def test_get_acronym_score():

# TODO Wrong
#assert rater.get_acronym_score("SR", "Sinusrythmus") > rater.get_acronym_score("SR", "Sinusarrhythmie")


def test_get_acronym_score_variants():
    """Check the best-variant scoring of `rater.get_acronym_score_variants`."""
    score_of = rater.get_acronym_score_variants

    # An acronym built from a spelling variant of the full form is accepted
    assert score_of("AK", "Arbeitskammer") == 1.0
    assert score_of("AC", "Arbeitskammer") == 1.0

    # ...but not the other way around!
    # TODO Is it expected?
    assert score_of("AK", "Arbeitscammer") == 0.0

    # The score of the best variant should be preserved
    assert score_of("AK", "Arbeits Kranker") == 2.0  # sic

    # Acronyms made only of plural letters should not cause an IndexError
    assert score_of("SS", "Überprüfen Sie die") == 0


def test_get_acronym_definition_pair_score():
    """Check the score returned for a full form that is an acronym-definition pair."""
    _, score = rater.get_acro_def_pair_score("EKG", "EKG (Elektrokardiogramm)")
    assert score == 10

    # FIXME Does not work
    #assert 10 == rater.get_acro_def_pair_score("ARDS", "ARDS (akutes Atemnotsyndrom)")[1]
51 changes: 0 additions & 51 deletions tests/util/test_variants.py

This file was deleted.

0 comments on commit 9b4146f

Please sign in to comment.