Skip to content

Commit 691904e

Browse files
made PEP8-conforming modifications
1 parent 6f6ff50 commit 691904e

File tree

9 files changed

+121
-118
lines changed

9 files changed

+121
-118
lines changed

string_grouper/string_grouper.py

+56-50
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
from typing import Tuple, NamedTuple, List, Optional, Union
99
from string_grouper_topn import awesome_cossim_topn
1010
from functools import wraps
11+
from scipy.sparse.lil import lil_matrix
1112

1213
DEFAULT_NGRAM_SIZE: int = 3
1314
DEFAULT_REGEX: str = r'[,-./]|\s'
@@ -17,24 +18,24 @@
1718
DEFAULT_IGNORE_CASE: bool = True # ignores case by default
1819
DEFAULT_DROP_INDEX: bool = False # includes index-columns in output
1920
DEFAULT_REPLACE_NA: bool = False # when finding the most similar strings, does not replace NaN values in most
20-
# similar string index-columns with corresponding duplicates-index values
21-
DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
22-
# matches appear in the output
21+
# similar string index-columns with corresponding duplicates-index values
22+
DEFAULT_INCLUDE_ZEROES: bool = True # when the minimum cosine similarity <=0, determines whether zero-similarity
23+
# matches appear in the output
2324
GROUP_REP_CENTROID: str = 'centroid' # Option value to select the string in each group with the largest
24-
# similarity aggregate as group-representative:
25+
# similarity aggregate as group-representative:
2526
GROUP_REP_FIRST: str = 'first' # Option value to select the first string in each group as group-representative:
26-
DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
27+
DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID # chooses group centroid as group-representative by default
2728

2829
# The following string constants are used by (but aren't [yet] options passed to) StringGrouper
2930
DEFAULT_COLUMN_NAME: str = 'side' # used to name non-index columns of the output of StringGrouper.get_matches
30-
DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
31+
DEFAULT_ID_NAME: str = 'id' # used to name id-columns in the output of StringGrouper.get_matches
3132
LEFT_PREFIX: str = 'left_' # used to prefix columns on the left of the output of StringGrouper.get_matches
3233
RIGHT_PREFIX: str = 'right_' # used to prefix columns on the right of the output of StringGrouper.get_matches
3334
MOST_SIMILAR_PREFIX: str = 'most_similar_' # used to prefix columns of the output of
34-
# StringGrouper._get_nearest_matches
35-
DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
35+
# StringGrouper._get_nearest_matches
36+
DEFAULT_MASTER_NAME: str = 'master' # used to name non-index column of the output of StringGrouper.get_nearest_matches
3637
DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}' # used to name id-column of the output of
37-
# StringGrouper.get_nearest_matches
38+
# StringGrouper.get_nearest_matches
3839
GROUP_REP_PREFIX: str = 'group_rep_' # used to prefix and name columns of the output of StringGrouper._deduplicate
3940

4041
# High level functions
@@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple):
147148
Defaults to number of cores on a machine - 1.
148149
:param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
149150
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False.
150-
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
151+
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
151152
appear in the output. Defaults to True.
152-
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
153+
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
153154
corresponding duplicates-index values. Defaults to False.
154155
:param group_rep: str. The scheme to select the group-representative. Default is 'centroid'.
155156
The other choice is 'first'.
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series,
231232
self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
232233
# After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
233234
self._matches_list: pd.DataFrame = pd.DataFrame()
234-
# _true_max_n_matches will contain the true maximum number of matches over all strings in master if
235-
# self._config.min_similarity <= 0
235+
# _true_max_n_matches will contain the true maximum number of matches over all strings in master if
236+
# self._config.min_similarity <= 0
236237
self._true_max_n_matches = None
237238

238239
def n_grams(self, string: str) -> List[str]:
@@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]:
251252
def fit(self) -> 'StringGrouper':
252253
"""Builds the _matches list which contains string matches indices and similarity"""
253254
master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
254-
255+
255256
# Calculate the matches using the cosine similarity
256257
matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix)
257-
258+
258259
if self._duplicates is None:
259260
# convert to lil format for best efficiency when setting matrix-elements
260-
matches = matches.tolil()
261-
# matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
261+
matches = matches.tolil()
262+
# matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
262263
# floating-point computations in awesome_cossim_topn sometimes lead to unexpected results)
263264
matches = StringGrouper._fix_diagonal(matches)
264265
if self._max_n_matches < self._true_max_n_matches:
265266
# the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A)
266267
matches = StringGrouper._symmetrize_matrix(matches)
267268
matches = matches.tocsr()
268-
269+
269270
# build list from matrix
270271
self._matches_list = self._get_matches_list(matches)
271272
self.is_build = True
@@ -283,14 +284,14 @@ def dot(self) -> pd.Series:
283284
@validate_is_fit
284285
def get_matches(self,
285286
ignore_index: Optional[bool] = None,
286-
include_zeroes: Optional[bool]=None) -> pd.DataFrame:
287+
include_zeroes: Optional[bool] = None) -> pd.DataFrame:
287288
"""
288289
Returns a DataFrame with all the matches and their cosine similarity.
289290
If optional IDs are used, returned as extra columns with IDs matched to respective data rows
290291
291-
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
292+
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
292293
self._config.ignore_index.
293-
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
294+
:param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
294295
appear in the output. Defaults to self._config.include_zeroes.
295296
"""
296297
def get_both_sides(master: pd.Series,
@@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
313314
else:
314315
return data.rename(f"{prefix}{data.name}")
315316

316-
if ignore_index is None: ignore_index = self._config.ignore_index
317-
if include_zeroes is None: include_zeroes = self._config.include_zeroes
317+
if ignore_index is None:
318+
ignore_index = self._config.ignore_index
319+
if include_zeroes is None:
320+
include_zeroes = self._config.include_zeroes
318321
if self._config.min_similarity > 0 or not include_zeroes:
319322
matches_list = self._matches_list
320323
elif include_zeroes:
321324
# Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
322-
# the fix includes zero-similarity matches that are missing by default
323-
# in _matches_list due to our use of sparse matrices
325+
# the fix includes zero-similarity matches that are missing by default
326+
# in _matches_list due to our use of sparse matrices
324327
non_matches_list = self._get_non_matches_list()
325328
matches_list = self._matches_list if non_matches_list.empty else \
326329
pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True)
327-
330+
328331
left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index)
329332
similarity = matches_list.similarity.reset_index(drop=True)
330333
if self._master_id is None:
@@ -366,16 +369,18 @@ def get_groups(self,
366369
If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
367370
above are returned as well altogether in a DataFrame.
368371
369-
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
372+
:param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
370373
self._config.ignore_index.
371-
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
374+
:param replace_na: whether or not to replace NaN values in most similar string index-columns with
372375
corresponding duplicates-index values. Defaults to self._config.replace_na.
373376
"""
374-
if ignore_index is None: ignore_index = self._config.ignore_index
377+
if ignore_index is None:
378+
ignore_index = self._config.ignore_index
375379
if self._duplicates is None:
376380
return self._deduplicate(ignore_index=ignore_index)
377381
else:
378-
if replace_na is None: replace_na = self._config.replace_na
382+
if replace_na is None:
383+
replace_na = self._config.replace_na
379384
return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)
380385

381386
@validate_is_fit
@@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
445450
"""Builds the cossine similarity matrix of two csr matrices"""
446451
tf_idf_matrix_1 = master_matrix
447452
tf_idf_matrix_2 = duplicate_matrix.transpose()
448-
453+
449454
optional_kwargs = {
450455
'return_best_ntop': True,
451456
'use_threads': self._config.number_of_processes > 1,
@@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame:
465470
all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side'])
466471
matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']])
467472
missing_pairs = all_pairs.difference(matched_pairs)
468-
if missing_pairs.empty: return pd.DataFrame()
473+
if missing_pairs.empty:
474+
return pd.DataFrame()
469475
if (self._max_n_matches < self._true_max_n_matches):
470476
raise Exception(f'\nERROR: Cannot return zero-similarity matches since \n'
471477
f'\t\t max_n_matches={self._max_n_matches} is too small!\n'
@@ -483,8 +489,8 @@ def _get_nearest_matches(self,
483489
master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}'
484490
master = self._master.rename(master_label).reset_index(drop=ignore_index)
485491
dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index)
486-
487-
# Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
492+
493+
# Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
488494
if isinstance(dupes, pd.DataFrame):
489495
master.rename(
490496
columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label},
@@ -514,14 +520,14 @@ def _get_nearest_matches(self,
514520
if self._master_id is not None:
515521
# Also update the master_id-series with the duplicates_id in cases where there is no match
516522
dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id
517-
523+
518524
# For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
519525
# appear within them. So here we change them back to their original datatypes if possible:
520526
if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \
521-
self._duplicates_id.dtype == self._master_id.dtype:
527+
self._duplicates_id.dtype == self._master_id.dtype:
522528
dupes_max_sim.loc[:, master_id_label] = \
523-
dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
524-
529+
dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
530+
525531
# Prepare the output:
526532
required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label]
527533
index_column_list = \
@@ -531,13 +537,13 @@ def _get_nearest_matches(self,
531537
# Update the master index-columns with the duplicates index-column values in cases where there is no match
532538
dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates']
533539
dupes_max_sim.loc[rows_to_update, index_column_list] = \
534-
dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
535-
540+
dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
541+
536542
# Restore their original datatypes if possible:
537543
for m, d in zip(index_column_list, dupes_index_columns):
538544
if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype:
539545
dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype)
540-
546+
541547
# Make sure to keep same order as duplicates
542548
dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
543549
output = dupes_max_sim[index_column_list + required_column_list]
@@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
608614
master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True)
609615
dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
610616
return master_indices, dupe_indices
611-
617+
612618
def _validate_group_rep_specs(self):
613619
group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
614620
if self._config.group_rep not in group_rep_options:
@@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self):
626632
)
627633

628634
@staticmethod
629-
def _fix_diagonal(A) -> csr_matrix:
630-
r = np.arange(A.shape[0])
631-
A[r, r] = 1
632-
return A
635+
def _fix_diagonal(m: lil_matrix) -> csr_matrix:
636+
r = np.arange(m.shape[0])
637+
m[r, r] = 1
638+
return m
633639

634640
@staticmethod
635-
def _symmetrize_matrix(A) -> csr_matrix:
636-
r, c = A.nonzero()
637-
A[c, r] = A[r, c]
638-
return A
641+
def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix:
642+
r, c = m_symmetric.nonzero()
643+
m_symmetric[c, r] = m_symmetric[r, c]
644+
return m_symmetric
639645

640646
@staticmethod
641647
def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:

0 commit comments

Comments
 (0)