From 45b66efb9848954cd9efeed86abdb468ee340b30 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 23 May 2022 12:28:31 +0200 Subject: [PATCH 1/2] added ability to cache matrix in queries across which master is constant --- string_grouper/string_grouper.py | 53 +++++++++++++++++++--- string_grouper/test/test_string_grouper.py | 29 +++++++++++- 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py index 0d8a5ac6..816bdfee 100644 --- a/string_grouper/string_grouper.py +++ b/string_grouper/string_grouper.py @@ -26,6 +26,7 @@ # similar string index-columns with corresponding duplicates-index values DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity # matches appear in the output +DEFAULT_ENABLE_CACHE: bool = False # does not cache the master tf-idf matrix between queries which preserve master GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest # similarity aggregate as group-representative: GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative: @@ -185,6 +186,9 @@ class StringGrouperConfig(NamedTuple): before performing the string-comparisons block-wise. Defaults to 'guess', in which case the numbers of blocks are estimated based on previous empirical results. If n_blocks = 'auto', then splitting is done automatically in the event of an OverflowError. + :param enable_cache: bool. Whether or not to cache the tf-idf matrix for ``master`` between queries which + preserve ``master``. Defaults to False. Use with caution: setting this option to True may degrade + performance when ``master`` is too large to fit into RAM. """ ngram_size: int = DEFAULT_NGRAM_SIZE @@ -200,6 +204,7 @@ class StringGrouperConfig(NamedTuple): group_rep: str = DEFAULT_GROUP_REP force_symmetries: bool = DEFAULT_FORCE_SYMMETRIES n_blocks: Tuple[int, int] = DEFAULT_N_BLOCKS + enable_cache: bool = DEFAULT_ENABLE_CACHE def validate_is_fit(f): @@ -242,6 +247,7 @@ def __init__(self, master: pd.Series, """ # private members: self.is_build = False + self._cache = dict() self._master: pd.DataFrame = pd.DataFrame() self._duplicates: Optional[pd.Series] = None @@ -323,8 +329,24 @@ def reset_data(self, :param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates Series. 
:param kwargs: All other keyword arguments are passed to StringGrouperConfig """ + self._cache.clear() self._set_data(master, duplicates, master_id, duplicates_id) + def _reset_duplicates_only(self, duplicates: pd.Series = None, duplicates_id: Optional[pd.Series] = None): + # Validate input strings data + self.duplicates = duplicates + + # Validate optional IDs input + if not StringGrouper._is_input_data_combination_valid(duplicates, self._master_id, duplicates_id): + raise Exception('List of data Series options is invalid') + StringGrouper._validate_id_data(self._master, duplicates, self._master_id, duplicates_id) + self._duplicates_id = duplicates_id + + # Set some private members + self._left_Series = self._duplicates + + self.is_build = False + def clear_data(self): self._master = None self._duplicates = None @@ -333,6 +355,7 @@ def clear_data(self): self._matches_list = None self._left_Series = None self._right_Series = None + self._cache.clear() self.is_build = False def update_options(self, **kwargs): @@ -729,14 +752,19 @@ def match_strings(self, This can be seen as an self-join. If both master and duplicates is given, it will return highly similar strings between master and duplicates. This can be seen as an inner-join. - :param master: pandas.Series. Series of strings against which matches are calculated. + :param master: pandas.Series. Series of strings against which matches are calculated. If set to ``None``, then + the currently stored ``master`` Series will be reused. :param duplicates: pandas.Series. Series of strings that will be matched with master if given (Optional). :param master_id: pandas.Series. Series of values that are IDs for master column rows (Optional). :param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows (Optional). :param kwargs: All other keyword arguments are passed to StringGrouperConfig. :return: pandas.Dataframe. """ - self.reset_data(master, duplicates, master_id, duplicates_id) + if master is None: + self._reset_duplicates_only(duplicates, duplicates_id) + else: + self.reset_data(master, duplicates, master_id, duplicates_id) + self.update_options(**kwargs) self = self.fit() return self.get_matches() @@ -761,14 +789,18 @@ def match_most_similar(self, If IDs (both 'master_id' and 'duplicates_id') are also given, returns a DataFrame of the same strings output in the above case with their corresponding IDs. - :param master: pandas.Series. Series of strings that the duplicates will be matched with. + :param master: pandas.Series. Series of strings that the duplicates will be matched with. If it is + set to ``None``, then the currently stored ``master`` Series will be reused. :param duplicates: pandas.Series. Series of strings that will me matched with the master. :param master_id: pandas.Series. Series of values that are IDs for master column rows. (Optional) :param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows. (Optional) :param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional) :return: pandas.Series or pandas.DataFrame. 
""" - self.reset_data(master, duplicates, master_id, duplicates_id) + if master is None: + self._reset_duplicates_only(duplicates, duplicates_id) + else: + self.reset_data(master, duplicates, master_id, duplicates_id) old_max_n_matches = self._max_n_matches new_max_n_matches = None @@ -875,8 +907,17 @@ def _get_right_tf_idf_matrix(self, partition=(None, None)): # unlike _get_tf_idf_matrices(), _get_right_tf_idf_matrix # does not set the corpus but rather # builds a matrix using the existing corpus - return self._vectorizer.transform( - self._right_Series.iloc[slice(*partition)]) + key = tuple(partition) + if self._config.enable_cache and key in self._cache: + matrix = self._cache[key] + else: + matrix = self._vectorizer.transform( + self._right_Series.iloc[slice(*partition)]) + + if self._config.enable_cache: + self._cache[key] = matrix + + return matrix def _fit_vectorizer(self) -> TfidfVectorizer: # if both dupes and master string series are set - we concat them to fit the vectorizer on all diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py index 7b74eddb..42e82b1b 100644 --- a/string_grouper/test/test_string_grouper.py +++ b/string_grouper/test/test_string_grouper.py @@ -4,6 +4,7 @@ from scipy.sparse.csr import csr_matrix from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \ DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \ + DEFAULT_ENABLE_CACHE, \ StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \ match_most_similar, group_similar_strings, match_strings, \ compute_pairwise_similarities @@ -100,6 +101,7 @@ def test_config_defaults(self): self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE) self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES) self.assertEqual(config.ignore_case, DEFAULT_IGNORE_CASE) + self.assertEqual(config.enable_cache, DEFAULT_ENABLE_CACHE) def test_config_immutable(self): """Configurations should be immutable""" @@ -117,6 +119,29 @@ def test_config_non_default_values(self): class StringGrouperTest(unittest.TestCase): + def test_cache(self): + """tests caching when the option is enabled""" + + sort_cols = ['right_index', 'left_index'] + + def fix_row_order(df): + return df.sort_values(sort_cols).reset_index(drop=True) + + simple_example = SimpleExample() + df1 = simple_example.customers_df2['Customer Name'] + + sg = StringGrouper(df1, min_similarity=0.1, enable_cache=True) + assert sg._cache == dict() + matches = fix_row_order(sg.match_strings(None, duplicates=df1)) # no cache + assert len(sg._cache) > 0 + for _, value in sg._cache.items(): + assert isinstance(value, csr_matrix) + matches_ = fix_row_order(sg.match_strings(None, duplicates=df1)) + assert len(sg._cache) > 0 + pd.testing.assert_frame_equal(matches_, matches) + with self.assertRaises(Exception): + _ = sg.match_strings(None, duplicates=df1, duplicates_id=simple_example.customers_df2['Customer ID']) + def test_auto_blocking_single_Series(self): """tests whether automatic blocking yields consistent results""" # This function will force an OverflowError to occur when @@ -870,8 +895,10 @@ def test_get_groups_two_df(self): result = sg.get_groups() expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master') pd.testing.assert_series_equal(expected_result, result) - result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3) + result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3, enable_cache=True) 
pd.testing.assert_series_equal(expected_result, result) + result2 = sg.match_most_similar(None, test_series_2, max_n_matches=3) + pd.testing.assert_series_equal(expected_result, result2) def test_get_groups_2_string_series_2_id_series(self): """Should return a pd.DataFrame object with the length of the dupes. The series will contain the master string From 3cebc7f7d07100db7c791486efbfbfa7d3aa9333 Mon Sep 17 00:00:00 2001 From: Particular Miner <78448465+ParticularMiner@users.noreply.github.com> Date: Mon, 23 May 2022 13:27:38 +0200 Subject: [PATCH 2/2] update CHANGELOG.md --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e536952..fbcf8162 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [Unreleased] - 2022-05-23 + +### Added + +* Ability to execute multiple separate queries on the same `master` dataset while the `duplicates` dataset changes; +* Ability to take advantage of caching to boost the performance of such queries (if `master` is small enough). ## [0.6.1] - 2021-10-19
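
Reviewer note — usage sketch (not part of the patch). The snippet below illustrates how the new `enable_cache` option and the `master=None` shortcut introduced here are intended to be used together, following the pattern of the added `test_cache` test; the example Series and the 0.1 similarity threshold are made up for illustration.

    import pandas as pd
    from string_grouper.string_grouper import StringGrouper

    master = pd.Series(['foo bar', 'foobar', 'baz qux'])   # reference strings that stay constant
    batch_1 = pd.Series(['foo barr', 'bazqux'])            # first batch of query strings
    batch_2 = pd.Series(['fooba', 'baz  qux'])             # second batch of query strings

    # Build the grouper once; enable_cache=True keeps the tf-idf matrix
    # computed for `master` in memory between queries that leave it unchanged.
    sg = StringGrouper(master, min_similarity=0.1, enable_cache=True)

    # Passing master=None reuses the stored `master`; the first call fills the
    # cache, the second reuses the cached matrix instead of re-vectorizing.
    matches_1 = sg.match_strings(None, duplicates=batch_1)
    matches_2 = sg.match_strings(None, duplicates=batch_2)

Calling reset_data() or clear_data() (or passing a non-None master) clears the cache, so memory is only held for as long as the same master is being reused.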