From 9b4146f33fc19d69dfb4e511a1cb098166f22900 Mon Sep 17 00:00:00 2001 From: Michel Oleynik Date: Wed, 19 Aug 2020 16:22:30 +0200 Subject: [PATCH] Remove deprecated `variants` module Spelling variants are better handled with a normalization step instead of an exponential increase of expansion candidates, which led to very slow processing and several bugs. This refs #87 and closes #98. Also, `get_acro_def_pair_score` was originally intended for web-based (i.e. text with acronym-definition pairs) inputs, now removed. --- acres/rater/rater.py | 66 ---------------- acres/util/__init__.py | 4 +- acres/util/variants.py | 149 ------------------------------------ tests/rater/test_rater.py | 23 ------ tests/util/test_variants.py | 51 ------------ 5 files changed, 2 insertions(+), 291 deletions(-) delete mode 100644 acres/util/variants.py delete mode 100644 tests/util/test_variants.py diff --git a/acres/rater/rater.py b/acres/rater/rater.py index a6eb121..652e1c4 100755 --- a/acres/rater/rater.py +++ b/acres/rater/rater.py @@ -8,7 +8,6 @@ from acres.rater import expansion from acres.rater import full as full_rater from acres.util import acronym as acro_util -from acres.util import variants as varianter logger = logging.getLogger(__name__) @@ -107,68 +106,3 @@ def get_acronym_score(acro: str, full: str) -> float: return 0 return _calc_score(acro, full) - - -def get_acronym_score_variants(acro: str, full: str) -> float: - """ - Wrapper for `get_acronym_score` that takes variants into consideration. - - For checking for valid German expansions it is important to consider variants, - therefore invoke spelling variant generator from `varianter.generate_all_variants_by_rules`. - At this place more rules can be added. - - Typical substitutions, mostly concerning the inconsistent use of k, c, and z in clinical texts - can be enhanced by frequent translations in `varianter.generate_all_variants_by_rules`. - - Return the score of the best variant. - - .. deprecated:: 0.1 - Variants have not been used recently (e.g. not used in Michel's PhD Thesis). - - :param acro: - :param full: - :return: - """ - max_score = 0.0 - variants = varianter.generate_all_variants_by_rules(full) - for variant in variants: - max_score = max(max_score, get_acronym_score(acro, variant)) - return max_score - - -def get_acro_def_pair_score(acro: str, full: str) -> Tuple[str, float]: - """ - Wrapper function for `get_acronym_score` that takes possible acronym-definition pairs into - account. - - The scoring function should work both for acronyms extracted from a corpus (for which strict - matching criteria should be applied) and for acronyms harvested from the Web for which the - criteria may be relaxed once strong evidence from acronym - definition patterns exist, e.g. - "ARDS (akutes Atemnotsyndrom)". - There might be acronym - definition patterns in well-written clinical documents. - - In the latter case, full would take this form, i.e. a string that contains both the acronym and - the expansion. - - :param acro: - :param full: - :return: - """ - is_acronym_definition_pair = False - definition = full - - # full form contains an acronym definition pattern (normally only yielded - # from Web scraping, unlikely in clinical texts) - # acronym is included; is then removed from full form - acro_def_pattern = acro_util.extract_acronym_definition(full, 7) - if acro_def_pattern is not None: - is_acronym_definition_pair = True - if acro_def_pattern[0] == acro: - definition = acro_def_pattern[1] - # high score, but also might be something else - - # XXX Maybe we shouldn't consider variants in case it's an acronym-definition pair - score = get_acronym_score_variants(acro, definition) - if is_acronym_definition_pair: - score *= 10 - return definition, score diff --git a/acres/util/__init__.py b/acres/util/__init__.py index 00ec25d..2e5dc44 100644 --- a/acres/util/__init__.py +++ b/acres/util/__init__.py @@ -1,6 +1,6 @@ """ Package with general utilities modules. """ -from acres.util import acronym, functions, text, variants +from acres.util import acronym, functions, text -__all__ = ['acronym', 'functions', 'text', 'variants'] +__all__ = ['acronym', 'functions', 'text'] diff --git a/acres/util/variants.py b/acres/util/variants.py deleted file mode 100644 index d62c2ae..0000000 --- a/acres/util/variants.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -Module to generate string variants. - -.. deprecated:: 0.1 - Variants have not been used recently (e.g. not used in Michel's PhD Thesis). -""" -from typing import List, Union, Tuple - - -def _resolve_ambiguous_lists(lists: List[List[Union[str, Tuple[str, str]]]]) -> \ - List[List[Union[str, Tuple[str, str]]]]: - """ - - :param lists: - :return: - """ - for a_list in lists: - list0 = [] # type: List[Union[str, Tuple[str, str]]] - list1 = [] # type: List[Union[str, Tuple[str, str]]] - is_open = True - is_tuple = False - for element in a_list: - if isinstance(element, tuple) and is_open: - list0.append(element[0]) - list1.append(element[1]) - is_open = False - is_tuple = True - else: - list0.append(element) - list1.append(element) - if is_tuple: - lists.append(list0) - lists.append(list1) - else: - return lists - - return [[]] - - -def _create_string_variants_as_list(a_string: str, search: str, replace: str) -> \ - List[Union[str, Tuple[str, str]]]: - """ - Analyses a string a_string for all substrings. - - :param a_string: - :param search: - :param replace: - :return: A list constituted by non-substitutable strings and/or search/replace pairs - """ - if search == "": - return [a_string] - ret = [] # type: List[Union[str, Tuple[str, str]]] - i = 0 - built_string = "" - while True: - char = a_string[i] - j = i + len(search) - if a_string[i:j] == search: - if built_string != "": - ret.append(built_string) - built_string = "" - ret.append((search, replace)) - i = i + len(search) - else: - built_string = built_string + char - i = i + 1 - if i >= len(a_string): - if built_string != "": - ret.append(built_string) - return ret - - -def _list_to_string(a_list: List[Union[str, Tuple[str, str]]]) -> str: - """ - transforms input of list - if a list element is not a string: -> empty string - - :param a_list: - :return: - """ - out = "" - for element in a_list: - if isinstance(element, str): - out = out + element - else: - return "" - return out - - -def _list_all_string_variants(a_string: str, search: str, replace: str) -> List[str]: - """ - - :param a_string: - :param search: - :param replace: - :return: - """ - out = [] - # XXX Why do we need to encapsulate the return of _create_string_variants_as_list in a list? - a_list = _resolve_ambiguous_lists([_create_string_variants_as_list(a_string, search, replace)]) - for element in a_list: - a_string = _list_to_string(element) - if a_string != "": - out.append(a_string) - return out - - -def generate_all_variants_by_rules(raw_string: str) -> List[str]: - """ - - :param raw_string: - :return: - """ - rules = [ - ("druck", " pressure"), - ("krankheit", " Disorder"), - ("fa", "pha"), ("Fa", "Pha"), - ("fe", "phe"), ("Fe", "Phe"), - ("fi", "phi"), ("Fi", "Phi"), - ("fo", "pho"), ("Fo", "Pho"), - ("fu", "phu"), ("Fu", "Phu"), - ("fy", "phy"), ("Fy", "Phy"), - ("fä", "phä"), ("Fä", "Phä"), - ("fö", "phö"), ("Fö", "Phö"), - ("fü", "phü"), ("Fü", "Phü"), - ("ka", "ca"), ("Ka", "Ca"), - ("ko", "co"), ("Ko", "Co"), - ("ku", "cu"), ("Ku", "Cu"), - ("zy", "cy"), ("Zy", "Cy"), - ("zi", "ci"), ("Zi", "Ci"), - ("ze", "ce"), ("Ze", "Ce"), - ("kl", "cl"), ("Kl", "Cl"), - ("kr", "cr"), ("Kr", "Cr"), - ("kn", "cn"), ("Kn", "Cn"), - ("kz", "cc"), - # TODO remove. Use `transliterate_to_seven_bit` on input first - ("ö", "e"), ("Ö", "E"), # because of esophagus - ("ü", "ue"), ("Ü", "Ue"), - ("ä", "ae"), ("Ä", "Ae")] - - out = [raw_string] - - for rule in rules: - for a_string in out: - new_list = _list_all_string_variants(a_string, rule[0], rule[1]) - for element in new_list: - if element not in out: - out.append(element) - return out diff --git a/tests/rater/test_rater.py b/tests/rater/test_rater.py index bedc138..c51884f 100644 --- a/tests/rater/test_rater.py +++ b/tests/rater/test_rater.py @@ -43,26 +43,3 @@ def test_get_acronym_score(): # TODO Wrong #assert rater.get_acronym_score("SR", "Sinusrythmus") > rater.get_acronym_score("SR", "Sinusarrhythmie") - - -def test_get_acronym_score_variants(): - # Acronyms created out of spelling variants are accepted - assert 1.0 == rater.get_acronym_score_variants("AK", "Arbeitskammer") - assert 1.0 == rater.get_acronym_score_variants("AC", "Arbeitskammer") - - # But not the opposite! - # TODO Is is expected? - assert 0.0 == rater.get_acronym_score_variants("AK", "Arbeitscammer") - - # Score of the best variant should be preserved - assert 2.0 == rater.get_acronym_score_variants("AK", "Arbeits Kranker") # sic - - # Acronyms with only plural letters should not cause IndexError - assert 0 == rater.get_acronym_score_variants("SS", "Überprüfen Sie die") - - -def test_get_acronym_definition_pair_score(): - assert 10 == rater.get_acro_def_pair_score("EKG", "EKG (Elektrokardiogramm)")[1] - - # FIXME Does not work - #assert 10 == rater.get_acronym_definition_pair_score("ARDS", "ARDS (akutes Atemnotsyndrom)")[1] \ No newline at end of file diff --git a/tests/util/test_variants.py b/tests/util/test_variants.py deleted file mode 100644 index d448960..0000000 --- a/tests/util/test_variants.py +++ /dev/null @@ -1,51 +0,0 @@ -from acres.util import variants - - -def test__resolve_ambiguous_lists(): - expected = [['cyclophospham', ('id', 'ide')], ['cyclophospham', 'id'], ['cyclophospham', 'ide']] - actual = variants._resolve_ambiguous_lists([['cyclophospham', ('id', 'ide')]]) - assert expected == actual - - -def test__create_string_variants_as_list(): - expected = ['cyclophospham', ('id', 'ide')] - actual = variants._create_string_variants_as_list("cyclophosphamid", "id", "ide") - assert expected == actual - - # Empty search returns the imput string as a list - expected = ['cyclophosphamid'] - actual = variants._create_string_variants_as_list("cyclophosphamid", "", "ide") - assert expected == actual - - -def test__list_to_string(): - expected = "abc" - actual = variants._list_to_string(["a", "b", "c"]) - assert expected == actual - - # Non-string element returns empty - expected = "" - actual = variants._list_to_string(["a", "b", "c", ("g", "e")]) - assert expected == actual - - -def test__list_all_string_variants(): - expected = ['cyclophosphamid', 'cyclophosphamide'] - actual = variants._list_all_string_variants("cyclophosphamid", "id", "ide") - assert expected == actual - - -def test_generate_all_variants_by_rules(): - expected = ['Arterielle Verschlusskrankheit', 'Arterielle Verschluss Disorder', - 'Arterielle Verschlusscrankheit'] - actual = variants.generate_all_variants_by_rules("Arterielle Verschlusskrankheit") - assert expected == actual - - expected = ["elektrokardiogramm", "elektrocardiogramm"] - assert expected == variants.generate_all_variants_by_rules("elektrokardiogramm") - - # TODO "esophagus" is never mentioned in the corpus, while "oesophagus" is. - #assert ["ösophagus", "oesophagus"] == variants.generate_all_variants_by_rules("ösophagus") - assert ["herz"] == variants.generate_all_variants_by_rules("herz") - assert ["café"] == variants.generate_all_variants_by_rules("café") # TODO add cafe - assert ["à"] == variants.generate_all_variants_by_rules("à") # TODO add a