Skip to content

added ability to cache matrix in queries across which master is constant #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [Unreleased] - 2022-05-23

### Added

* Ability to execute multiple separate queries on the same `master` dataset while the `duplicates` dataset changes;
* Ability to take advantage of caching to boost the performance of such queries (if `master` is small enough).

## [0.6.1] - 2021-10-19

Expand Down
53 changes: 47 additions & 6 deletions string_grouper/string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# similar string index-columns with corresponding duplicates-index values
DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
# matches appear in the output
DEFAULT_ENABLE_CACHE: bool = False # does not cache the master tf-idf matrix between queries which preserve master
GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest
# similarity aggregate as group-representative:
GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative:
Expand Down Expand Up @@ -185,6 +186,9 @@ class StringGrouperConfig(NamedTuple):
before performing the string-comparisons block-wise. Defaults to 'guess', in which case the numbers of
blocks are estimated based on previous empirical results. If n_blocks = 'auto', then splitting is done
automatically in the event of an OverflowError.
:param enable_cache: bool. Whether or not to cache the tf-idf matrix for ``master`` between queries which
preserve ``master``. Defaults to False. Use with caution: setting this option to True may degrade
performance when ``master`` is too large to fit into RAM.
"""

ngram_size: int = DEFAULT_NGRAM_SIZE
Expand All @@ -200,6 +204,7 @@ class StringGrouperConfig(NamedTuple):
group_rep: str = DEFAULT_GROUP_REP
force_symmetries: bool = DEFAULT_FORCE_SYMMETRIES
n_blocks: Tuple[int, int] = DEFAULT_N_BLOCKS
enable_cache: bool = DEFAULT_ENABLE_CACHE


def validate_is_fit(f):
Expand Down Expand Up @@ -242,6 +247,7 @@ def __init__(self, master: pd.Series,
"""
# private members:
self.is_build = False
self._cache = dict()

self._master: pd.DataFrame = pd.DataFrame()
self._duplicates: Optional[pd.Series] = None
Expand Down Expand Up @@ -323,8 +329,24 @@ def reset_data(self,
:param duplicates_id: pandas.Series. If set, contains ID values for each row in duplicates Series.
:param kwargs: All other keyword arguments are passed to StringGrouperConfig
"""
self._cache.clear()
self._set_data(master, duplicates, master_id, duplicates_id)

def _reset_duplicates_only(self, duplicates: pd.Series = None, duplicates_id: Optional[pd.Series] = None):
    """Replace only the ``duplicates`` data while keeping the current ``master`` (and,
    unlike ``reset_data``, without clearing ``self._cache``).

    Used when a query passes ``master=None`` so the stored master Series can be reused.

    :param duplicates: pandas.Series. New Series of strings to be matched against the stored master.
    :param duplicates_id: pandas.Series. Optional IDs for each row of ``duplicates``.
    """
    # Validate input strings data
    # NOTE(review): assignment goes through the ``duplicates`` attribute/property
    # (not ``_duplicates``) — presumably a setter performs validation; confirm.
    self.duplicates = duplicates

    # Validate optional IDs input against the *existing* master/master_id
    if not StringGrouper._is_input_data_combination_valid(duplicates, self._master_id, duplicates_id):
        raise Exception('List of data Series options is invalid')
    StringGrouper._validate_id_data(self._master, duplicates, self._master_id, duplicates_id)
    self._duplicates_id = duplicates_id

    # Set some private members: duplicates form the left side of the match
    self._left_Series = self._duplicates

    # force a re-fit before the next matching operation
    self.is_build = False

def clear_data(self):
self._master = None
self._duplicates = None
Expand All @@ -333,6 +355,7 @@ def clear_data(self):
self._matches_list = None
self._left_Series = None
self._right_Series = None
self._cache.clear()
self.is_build = False

def update_options(self, **kwargs):
Expand Down Expand Up @@ -729,14 +752,19 @@ def match_strings(self,
This can be seen as a self-join. If both master and duplicates are given, it will return highly similar strings
between master and duplicates. This can be seen as an inner-join.

:param master: pandas.Series. Series of strings against which matches are calculated.
:param master: pandas.Series. Series of strings against which matches are calculated. If set to ``None``, then
the currently stored ``master`` Series will be reused.
:param duplicates: pandas.Series. Series of strings that will be matched with master if given (Optional).
:param master_id: pandas.Series. Series of values that are IDs for master column rows (Optional).
:param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows (Optional).
:param kwargs: All other keyword arguments are passed to StringGrouperConfig.
:return: pandas.Dataframe.
"""
self.reset_data(master, duplicates, master_id, duplicates_id)
if master is None:
self._reset_duplicates_only(duplicates, duplicates_id)
else:
self.reset_data(master, duplicates, master_id, duplicates_id)

self.update_options(**kwargs)
self = self.fit()
return self.get_matches()
Expand All @@ -761,14 +789,18 @@ def match_most_similar(self,
If IDs (both 'master_id' and 'duplicates_id') are also given, returns a DataFrame of the same strings
output in the above case with their corresponding IDs.

:param master: pandas.Series. Series of strings that the duplicates will be matched with.
:param master: pandas.Series. Series of strings that the duplicates will be matched with. If it is
set to ``None``, then the currently stored ``master`` Series will be reused.
:param duplicates: pandas.Series. Series of strings that will be matched with the master.
:param master_id: pandas.Series. Series of values that are IDs for master column rows. (Optional)
:param duplicates_id: pandas.Series. Series of values that are IDs for duplicates column rows. (Optional)
:param kwargs: All other keyword arguments are passed to StringGrouperConfig. (Optional)
:return: pandas.Series or pandas.DataFrame.
"""
self.reset_data(master, duplicates, master_id, duplicates_id)
if master is None:
self._reset_duplicates_only(duplicates, duplicates_id)
else:
self.reset_data(master, duplicates, master_id, duplicates_id)

old_max_n_matches = self._max_n_matches
new_max_n_matches = None
Expand Down Expand Up @@ -875,8 +907,17 @@ def _get_right_tf_idf_matrix(self, partition=(None, None)):
# unlike _get_tf_idf_matrices(), _get_right_tf_idf_matrix
# does not set the corpus but rather
# builds a matrix using the existing corpus
return self._vectorizer.transform(
self._right_Series.iloc[slice(*partition)])
key = tuple(partition)
if self._config.enable_cache and key in self._cache:
matrix = self._cache[key]
else:
matrix = self._vectorizer.transform(
self._right_Series.iloc[slice(*partition)])

if self._config.enable_cache:
self._cache[key] = matrix

return matrix

def _fit_vectorizer(self) -> TfidfVectorizer:
# if both dupes and master string series are set - we concat them to fit the vectorizer on all
Expand Down
29 changes: 28 additions & 1 deletion string_grouper/test/test_string_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from scipy.sparse.csr import csr_matrix
from string_grouper.string_grouper import DEFAULT_MIN_SIMILARITY, \
DEFAULT_REGEX, DEFAULT_NGRAM_SIZE, DEFAULT_N_PROCESSES, DEFAULT_IGNORE_CASE, \
DEFAULT_ENABLE_CACHE, \
StringGrouperConfig, StringGrouper, StringGrouperNotFitException, \
match_most_similar, group_similar_strings, match_strings, \
compute_pairwise_similarities
Expand Down Expand Up @@ -100,6 +101,7 @@ def test_config_defaults(self):
self.assertEqual(config.ngram_size, DEFAULT_NGRAM_SIZE)
self.assertEqual(config.number_of_processes, DEFAULT_N_PROCESSES)
self.assertEqual(config.ignore_case, DEFAULT_IGNORE_CASE)
self.assertEqual(config.enable_cache, DEFAULT_ENABLE_CACHE)

def test_config_immutable(self):
"""Configurations should be immutable"""
Expand All @@ -117,6 +119,29 @@ def test_config_non_default_values(self):

class StringGrouperTest(unittest.TestCase):

def test_cache(self):
    """tests caching when the option is enabled"""

    order_by = ['right_index', 'left_index']

    def canonical(frame):
        # normalize row order so result frames can be compared directly
        return frame.sort_values(order_by).reset_index(drop=True)

    simple_example = SimpleExample()
    names = simple_example.customers_df2['Customer Name']

    sg = StringGrouper(names, min_similarity=0.1, enable_cache=True)
    assert sg._cache == dict()
    first = canonical(sg.match_strings(None, duplicates=names))  # no cache
    assert len(sg._cache) > 0
    for cached in sg._cache.values():
        assert isinstance(cached, csr_matrix)
    second = canonical(sg.match_strings(None, duplicates=names))
    assert len(sg._cache) > 0
    pd.testing.assert_frame_equal(second, first)
    with self.assertRaises(Exception):
        _ = sg.match_strings(None, duplicates=names,
                             duplicates_id=simple_example.customers_df2['Customer ID'])

def test_auto_blocking_single_Series(self):
"""tests whether automatic blocking yields consistent results"""
# This function will force an OverflowError to occur when
Expand Down Expand Up @@ -870,8 +895,10 @@ def test_get_groups_two_df(self):
result = sg.get_groups()
expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'], name='most_similar_master')
pd.testing.assert_series_equal(expected_result, result)
result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3)
result = sg.match_most_similar(test_series_1, test_series_2, max_n_matches=3, enable_cache=True)
pd.testing.assert_series_equal(expected_result, result)
result2 = sg.match_most_similar(None, test_series_2, max_n_matches=3)
pd.testing.assert_series_equal(expected_result, result2)

def test_get_groups_2_string_series_2_id_series(self):
"""Should return a pd.DataFrame object with the length of the dupes. The series will contain the master string
Expand Down