From f9f316fc0724963fe7b6d893671eff1b93d790cf Mon Sep 17 00:00:00 2001 From: George Walker Date: Wed, 1 Sep 2021 14:29:48 +0100 Subject: [PATCH 1/5] raise StringLengthException if vectoriser is applied to strings that are not all greater in length than ngram_size --- string_grouper/string_grouper.py | 22 ++++++++++++++++++++++ string_grouper/test/test_string_grouper.py | 7 ++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index d1612511..44b28474 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -194,6 +194,11 @@ class StringGrouperNotFitException(Exception): pass +class StringLengthException(Exception): + """Raised when vectoriser is fit on strings that are not of length greater or equal to ngram size""" + pass + + class StringGrouper(object): def __init__(self, master: pd.Series, duplicates: Optional[pd.Series] = None, @@ -258,6 +263,13 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" + + # Validate match strings length + if not StringGrouper._strings_are_of_sufficient_length(self._master, self._config.ngram_size) or \ + (self._duplicates is not None + and not StringGrouper._strings_are_of_sufficient_length(self._duplicates, self._config.ngram_size)): + raise StringLengthException('None of input string lengths are greater than or equal to n_gram length') + master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity @@ -697,6 +709,16 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return False return True + @staticmethod + def _strings_are_of_sufficient_length(series_to_test: pd.Series, ngram_size: int) -> bool: + if not isinstance(series_to_test, pd.Series): + return False + elif series_to_test.to_frame().applymap( + lambda x: not len(x) >= ngram_size + ).squeeze(axis=1).all(): + return False + return True + @staticmethod def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool: if duplicates is None and (duplicates_id is not None) \ diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index f5f0aac8..8ebf6fcf 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -6,7 +6,7 @@ DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ match_most_similar, group_similar_strings, match_strings, \ - compute_pairwise_similarities + compute_pairwise_similarities, StringLengthException from unittest.mock import patch @@ -822,6 +822,11 @@ def test_prior_matches_added(self): # All strings should now match to the same "master" string self.assertEqual(1, len(df.deduped.unique())) + def test_group_similar_strings_stopwords(self): + """StringGrouper shouldn't raise a ValueError if all strings are shorter than 3 characters""" + with self.assertRaises(StringLengthException): + StringGrouper(pd.Series(['zz', 'yy', 'xx'])).fit() + if __name__ == '__main__': unittest.main() From cb56fbe6dd1b84881125f7a09c24c0e2d280eedd Mon Sep 17 00:00:00 2001 From: George Walker Date: Wed, 1 Sep 2021 16:20:49 +0100 Subject: [PATCH 2/5] remove regex pattern characters before checking length --- string_grouper/string_grouper.py | 10 ++++++---- string_grouper/test/test_string_grouper.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 44b28474..2ad6538e 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -265,9 +265,11 @@ def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" # Validate match strings length - if not StringGrouper._strings_are_of_sufficient_length(self._master, self._config.ngram_size) or \ + if not StringGrouper._strings_are_of_sufficient_length(self._master, self._config.ngram_size, + self._config.regex) or \ (self._duplicates is not None - and not StringGrouper._strings_are_of_sufficient_length(self._duplicates, self._config.ngram_size)): + and not StringGrouper._strings_are_of_sufficient_length(self._duplicates, self._config.ngram_size, + self._config.regex)): raise StringLengthException('None of input string lengths are greater than or equal to n_gram length') master_matrix, duplicate_matrix = self._get_tf_idf_matrices() @@ -710,11 +712,11 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return True @staticmethod - def _strings_are_of_sufficient_length(series_to_test: pd.Series, ngram_size: int) -> bool: + def _strings_are_of_sufficient_length(series_to_test: pd.Series, ngram_size: int, regex: str) -> bool: if not isinstance(series_to_test, pd.Series): return False elif series_to_test.to_frame().applymap( - lambda x: not len(x) >= ngram_size + lambda x: not len(re.sub(regex, r'', x)) >= ngram_size ).squeeze(axis=1).all(): return False return True diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 8ebf6fcf..733bf3d8 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -825,7 +825,7 @@ def test_prior_matches_added(self): def test_group_similar_strings_stopwords(self): """StringGrouper shouldn't raise a ValueError if all strings are shorter than 3 characters""" with self.assertRaises(StringLengthException): - StringGrouper(pd.Series(['zz', 'yy', 'xx'])).fit() + StringGrouper(pd.Series(['zz', 'yy', 'xx,'])).fit() if __name__ == '__main__': From ff0b04138cc3420320176c65f14f3c208dcfabae Mon Sep 17 00:00:00 2001 From: George Walker Date: Wed, 1 Sep 2021 16:44:45 +0100 Subject: [PATCH 3/5] inherit from Error class --- string_grouper/string_grouper.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 2ad6538e..9ed54ed8 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -189,12 +189,16 @@ def wrapper(*args, **kwargs): return wrapper -class StringGrouperNotFitException(Exception): +class Error(Exception): + pass + + +class StringGrouperNotFitException(Error): """Raised when one of the public functions is called which requires the StringGrouper to be fit first""" pass -class StringLengthException(Exception): +class StringLengthException(Error): """Raised when vectoriser is fit on strings that are not of length greater or equal to ngram size""" pass From ce518bf10467d8f1533e9cb56925f9f3d2f2e1a7 Mon Sep 17 00:00:00 2001 From: George Walker Date: Fri, 3 Sep 2021 16:09:02 +0100 Subject: [PATCH 4/5] raise StringLengthException instead of ValueError when vectorizer is fit --- string_grouper/string_grouper.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 9ed54ed8..04dc50ee 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -267,15 +267,6 @@ def n_grams(self, string: str) -> List[str]: def fit(self) -> 'StringGrouper': """Builds the _matches list which contains string matches indices and similarity""" - - # Validate match strings length - if not StringGrouper._strings_are_of_sufficient_length(self._master, self._config.ngram_size, - self._config.regex) or \ - (self._duplicates is not None - and not StringGrouper._strings_are_of_sufficient_length(self._duplicates, self._config.ngram_size, - self._config.regex)): - raise StringLengthException('None of input string lengths are greater than or equal to n_gram length') - master_matrix, duplicate_matrix = self._get_tf_idf_matrices() # Calculate the matches using the cosine similarity @@ -468,7 +459,10 @@ def _fit_vectorizer(self) -> TfidfVectorizer: strings = pd.concat([self._master, self._duplicates]) else: strings = self._master - self._vectorizer.fit(strings) + try: + self._vectorizer.fit(strings) + except ValueError: + raise StringLengthException('None of input string lengths are greater than or equal to n_gram length') return self._vectorizer def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix) -> csr_matrix: From eb78a013ad5261985d39c3b55cb0610682c9fadb Mon Sep 17 00:00:00 2001 From: George Walker Date: Fri, 3 Sep 2021 16:09:35 +0100 Subject: [PATCH 5/5] remove string length check method --- string_grouper/string_grouper.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 04dc50ee..6dfc6cfa 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -709,16 +709,6 @@ def _is_series_of_strings(series_to_test: pd.Series) -> bool: return False return True - @staticmethod - def _strings_are_of_sufficient_length(series_to_test: pd.Series, ngram_size: int, regex: str) -> bool: - if not isinstance(series_to_test, pd.Series): - return False - elif series_to_test.to_frame().applymap( - lambda x: not len(re.sub(regex, r'', x)) >= ngram_size - ).squeeze(axis=1).all(): - return False - return True - @staticmethod def _is_input_data_combination_valid(duplicates, master_id, duplicates_id) -> bool: if duplicates is None and (duplicates_id is not None) \