import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csr import csr_matrix
+ from scipy.sparse.lil import lil_matrix
from scipy.sparse.csgraph import connected_components
from typing import Tuple, NamedTuple, List, Optional, Union
from string_grouper_topn import awesome_cossim_topn

DEFAULT_IGNORE_CASE: bool = True  # ignores case by default
DEFAULT_DROP_INDEX: bool = False  # includes index-columns in output
DEFAULT_REPLACE_NA: bool = False  # when finding the most similar strings, does not replace NaN values in most
-                                 # similar string index-columns with corresponding duplicates-index values
- DEFAULT_INCLUDE_ZEROES: bool = True  # when the minimum cosine similarity <=0, determines whether zero-similarity
-                                      # matches appear in the output
+                                 # similar string index-columns with corresponding duplicates-index values
+ DEFAULT_INCLUDE_ZEROES: bool = True  # when the minimum cosine similarity <=0, determines whether zero-similarity
+                                      # matches appear in the output
GROUP_REP_CENTROID: str = 'centroid'  # Option value to select the string in each group with the largest
-                                     # similarity aggregate as group-representative:
+                                     # similarity aggregate as group-representative:
GROUP_REP_FIRST: str = 'first'  # Option value to select the first string in each group as group-representative:
- DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID  # chooses group centroid as group-representative by default
+ DEFAULT_GROUP_REP: str = GROUP_REP_CENTROID  # chooses group centroid as group-representative by default

# The following string constants are used by (but aren't [yet] options passed to) StringGrouper
DEFAULT_COLUMN_NAME: str = 'side'  # used to name non-index columns of the output of StringGrouper.get_matches
- DEFAULT_ID_NAME: str = 'id'  # used to name id-columns in the output of StringGrouper.get_matches
+ DEFAULT_ID_NAME: str = 'id'  # used to name id-columns in the output of StringGrouper.get_matches
LEFT_PREFIX: str = 'left_'  # used to prefix columns on the left of the output of StringGrouper.get_matches
RIGHT_PREFIX: str = 'right_'  # used to prefix columns on the right of the output of StringGrouper.get_matches
MOST_SIMILAR_PREFIX: str = 'most_similar_'  # used to prefix columns of the output of
-                                           # StringGrouper._get_nearest_matches
- DEFAULT_MASTER_NAME: str = 'master'  # used to name non-index column of the output of StringGrouper.get_nearest_matches
+                                           # StringGrouper._get_nearest_matches
+ DEFAULT_MASTER_NAME: str = 'master'  # used to name non-index column of the output of StringGrouper.get_nearest_matches
DEFAULT_MASTER_ID_NAME: str = f'{DEFAULT_MASTER_NAME}_{DEFAULT_ID_NAME}'  # used to name id-column of the output of
-                                                                         # StringGrouper.get_nearest_matches
+                                                                         # StringGrouper.get_nearest_matches
GROUP_REP_PREFIX: str = 'group_rep_'  # used to prefix and name columns of the output of StringGrouper._deduplicate

# High level functions
@@ -147,9 +148,9 @@ class StringGrouperConfig(NamedTuple):
        Defaults to number of cores on a machine - 1.
    :param ignore_case: bool. Whether or not case should be ignored. Defaults to True (ignore case).
    :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to False.
-   :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
+   :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
        appear in the output. Defaults to True.
-   :param replace_na: whether or not to replace NaN values in most similar string index-columns with
+   :param replace_na: whether or not to replace NaN values in most similar string index-columns with
        corresponding duplicates-index values. Defaults to False.
    :param group_rep: str. The scheme to select the group-representative. Default is 'centroid'.
        The other choice is 'first'.
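# A minimal sketch of how the options documented above are typically supplied.
# Assumption: the docstring parameter names match the StringGrouperConfig fields
# and are also forwarded as keyword arguments by StringGrouper itself.
config = StringGrouperConfig(
    ignore_case=True,       # fold case before n-gram comparison (the default)
    ignore_index=False,     # keep the Series index-columns in the output
    include_zeroes=True,    # only relevant when min_similarity <= 0
    replace_na=False,       # leave NaN index values of unmatched strings as-is
    group_rep='centroid'    # GROUP_REP_CENTROID; the alternative is 'first'
)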
@@ -231,8 +232,8 @@ def __init__(self, master: pd.Series,
        self._vectorizer = TfidfVectorizer(min_df=1, analyzer=self.n_grams)
        # After the StringGrouper is built, _matches_list will contain the indices and similarities of the matches
        self._matches_list: pd.DataFrame = pd.DataFrame()
-       # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
-       # self._config.min_similarity <= 0
+       # _true_max_n_matches will contain the true maximum number of matches over all strings in master if
+       # self._config.min_similarity <= 0
        self._true_max_n_matches = None

    def n_grams(self, string: str) -> List[str]:
@@ -251,21 +252,21 @@ def n_grams(self, string: str) -> List[str]:
    def fit(self) -> 'StringGrouper':
        """Builds the _matches list which contains string matches indices and similarity"""
        master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
-
+
        # Calculate the matches using the cosine similarity
        matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix)
-
+
        if self._duplicates is None:
            # convert to lil format for best efficiency when setting matrix-elements
-           matches = matches.tolil()
-           # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
+           matches = matches.tolil()
+           # matrix diagonal elements must be exactly 1 (numerical precision errors introduced by
            # floating-point computations in awesome_cossim_topn sometimes lead to unexpected results)
            matches = StringGrouper._fix_diagonal(matches)
            if self._max_n_matches < self._true_max_n_matches:
                # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A)
                matches = StringGrouper._symmetrize_matrix(matches)
            matches = matches.tocsr()
-
+
        # build list from matrix
        self._matches_list = self._get_matches_list(matches)
        self.is_build = True
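# Self-contained sketch (plain scipy/numpy) of the sparse-matrix post-processing fit()
# applies in the deduplication branch above, mirroring _fix_diagonal and
# _symmetrize_matrix rather than calling them:
import numpy as np
from scipy.sparse import csr_matrix

matches = csr_matrix(np.array([[0.9999, 0.80, 0.00],
                               [0.00,   1.00, 0.30],
                               [0.00,   0.00, 0.9998]]))
matches = matches.tolil()                  # LIL is cheap to modify element-wise
r = np.arange(matches.shape[0])
matches[r, r] = 1                          # undo floating-point error on the diagonal
rows, cols = matches.nonzero()
matches[cols, rows] = matches[rows, cols]  # enforce symmetry: A matches B => B matches A
matches = matches.tocsr()                  # back to CSR for fast row slicing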
@@ -283,14 +284,14 @@ def dot(self) -> pd.Series:
    @validate_is_fit
    def get_matches(self,
                    ignore_index: Optional[bool] = None,
-                   include_zeroes: Optional[bool]= None) -> pd.DataFrame:
+                   include_zeroes: Optional[bool] = None) -> pd.DataFrame:
        """
        Returns a DataFrame with all the matches and their cosine similarity.
        If optional IDs are used, returned as extra columns with IDs matched to respective data rows

-       :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
+       :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
            self._config.ignore_index.
-       :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
+       :param include_zeroes: when the minimum cosine similarity <=0, determines whether zero-similarity matches
            appear in the output. Defaults to self._config.include_zeroes.
        """
        def get_both_sides(master: pd.Series,
@@ -313,18 +314,20 @@ def prefix_column_names(data: Union[pd.Series, pd.DataFrame], prefix: str):
            else:
                return data.rename(f"{prefix}{data.name}")

-       if ignore_index is None: ignore_index = self._config.ignore_index
-       if include_zeroes is None: include_zeroes = self._config.include_zeroes
+       if ignore_index is None:
+           ignore_index = self._config.ignore_index
+       if include_zeroes is None:
+           include_zeroes = self._config.include_zeroes
        if self._config.min_similarity > 0 or not include_zeroes:
            matches_list = self._matches_list
        elif include_zeroes:
            # Here's a fix to a bug pointed out by one GitHub user (@nbcvijanovic):
-           # the fix includes zero-similarity matches that are missing by default
-           # in _matches_list due to our use of sparse matrices
+           # the fix includes zero-similarity matches that are missing by default
+           # in _matches_list due to our use of sparse matrices
            non_matches_list = self._get_non_matches_list()
            matches_list = self._matches_list if non_matches_list.empty else \
                pd.concat([self._matches_list, non_matches_list], axis=0, ignore_index=True)
-
+
        left_side, right_side = get_both_sides(self._master, self._duplicates, drop_index=ignore_index)
        similarity = matches_list.similarity.reset_index(drop=True)
        if self._master_id is None:
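# Hedged usage sketch for get_matches(). The 'left_'/'right_' prefixes and the
# 'similarity' column follow the constants defined at the top of the module; passing
# min_similarity as a keyword argument (forwarded to the config) is an assumption.
import pandas as pd

companies = pd.Series(['foo ltd', 'foo limited', 'bar inc'])
sg = StringGrouper(companies, min_similarity=0.5).fit()
print(sg.get_matches(ignore_index=True))   # columns such as left_side, similarity, right_side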
@@ -366,16 +369,18 @@ def get_groups(self,
        If there are IDs (master_id and/or duplicates_id) then the IDs corresponding to the string outputs
        above are returned as well altogether in a DataFrame.

-       :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
+       :param ignore_index: whether or not to exclude string Series index-columns in output. Defaults to
            self._config.ignore_index.
-       :param replace_na: whether or not to replace NaN values in most similar string index-columns with
+       :param replace_na: whether or not to replace NaN values in most similar string index-columns with
            corresponding duplicates-index values. Defaults to self._config.replace_na.
        """
-       if ignore_index is None: ignore_index = self._config.ignore_index
+       if ignore_index is None:
+           ignore_index = self._config.ignore_index
        if self._duplicates is None:
            return self._deduplicate(ignore_index=ignore_index)
        else:
-           if replace_na is None: replace_na = self._config.replace_na
+           if replace_na is None:
+               replace_na = self._config.replace_na
            return self._get_nearest_matches(ignore_index=ignore_index, replace_na=replace_na)

    @validate_is_fit
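# Companion sketch for get_groups(): with no duplicates Series it deduplicates the
# master Series and returns one group-representative per input string; with a
# duplicates Series it returns the nearest master string for each duplicate
# (replace_na only matters in that second mode).
names = pd.Series(['ACME Inc', 'ACME Incorporated', 'Widget LLC'])
print(StringGrouper(names).fit().get_groups(ignore_index=True))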
@@ -445,7 +450,7 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
        """Builds the cosine similarity matrix of two csr matrices"""
        tf_idf_matrix_1 = master_matrix
        tf_idf_matrix_2 = duplicate_matrix.transpose()
-
+
        optional_kwargs = {
            'return_best_ntop': True,
            'use_threads': self._config.number_of_processes > 1,
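# What the call into awesome_cossim_topn computes, sketched with plain scipy/numpy:
# TfidfVectorizer L2-normalises its rows, so cosine similarity is just a sparse matrix
# product; the real routine additionally keeps only the top max_n_matches entries per
# row that clear min_similarity, done naively below. The trigram analyzer here is only
# a stand-in for StringGrouper.n_grams.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['mcdonalds', 'mc donalds', 'burger king']
tfidf = TfidfVectorizer(min_df=1, analyzer=lambda s: [s[i:i + 3] for i in range(len(s) - 2)])
tf_idf_matrix = tfidf.fit_transform(corpus)

cosine = (tf_idf_matrix * tf_idf_matrix.transpose()).toarray()
max_n_matches, min_similarity = 2, 0.1
for i, row in enumerate(cosine):
    top = [j for j in np.argsort(row)[::-1][:max_n_matches] if row[j] >= min_similarity]
    print(i, [(j, round(float(row[j]), 2)) for j in top])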
@@ -465,7 +470,8 @@ def _get_non_matches_list(self) -> pd.DataFrame:
        all_pairs = pd.MultiIndex.from_product([range(m_sz), range(d_sz)], names=['master_side', 'dupe_side'])
        matched_pairs = pd.MultiIndex.from_frame(self._matches_list[['master_side', 'dupe_side']])
        missing_pairs = all_pairs.difference(matched_pairs)
-       if missing_pairs.empty: return pd.DataFrame()
+       if missing_pairs.empty:
+           return pd.DataFrame()
        if (self._max_n_matches < self._true_max_n_matches):
            raise Exception(f'\nERROR: Cannot return zero-similarity matches since\n'
                            f'\t\tmax_n_matches={self._max_n_matches} is too small!\n'
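# The MultiIndex trick used above, in isolation: enumerate every (master, dupe) pair,
# subtract the pairs the sparse matrix actually stored, and what remains are the
# zero-similarity pairs that get_matches can append when include_zeroes is requested.
import pandas as pd

all_pairs = pd.MultiIndex.from_product([range(2), range(2)], names=['master_side', 'dupe_side'])
matched_pairs = pd.MultiIndex.from_frame(pd.DataFrame({'master_side': [0, 1], 'dupe_side': [0, 1]}))
missing_pairs = all_pairs.difference(matched_pairs)
print(missing_pairs.to_frame(index=False))   # the pairs (0, 1) and (1, 0)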
@@ -483,8 +489,8 @@ def _get_nearest_matches(self,
        master_label = f'{prefix}{self._master.name if self._master.name else DEFAULT_MASTER_NAME}'
        master = self._master.rename(master_label).reset_index(drop=ignore_index)
        dupes = self._duplicates.rename('duplicates').reset_index(drop=ignore_index)
-
-       # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
+
+       # Rename new master-columns to avoid possible conflict with new dupes-columns when later merging
        if isinstance(dupes, pd.DataFrame):
            master.rename(
                columns={col: f'{prefix}{col}' for col in master.columns if str(col) != master_label},
@@ -514,14 +520,14 @@ def _get_nearest_matches(self,
        if self._master_id is not None:
            # Also update the master_id-series with the duplicates_id in cases where there is no match
            dupes_max_sim.loc[rows_to_update, master_id_label] = dupes_max_sim[rows_to_update].duplicates_id
-
+
            # For some weird reason, pandas' merge function changes int-datatype columns to float when NaN values
            # appear within them. So here we change them back to their original datatypes if possible:
            if dupes_max_sim[master_id_label].dtype != self._master_id.dtype and \
-                   self._duplicates_id.dtype == self._master_id.dtype:
+                   self._duplicates_id.dtype == self._master_id.dtype:
                dupes_max_sim.loc[:, master_id_label] = \
-                   dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
-
+                   dupes_max_sim.loc[:, master_id_label].astype(self._master_id.dtype)
+
        # Prepare the output:
        required_column_list = [master_label] if self._master_id is None else [master_id_label, master_label]
        index_column_list = \
@@ -531,13 +537,13 @@ def _get_nearest_matches(self,
            # Update the master index-columns with the duplicates index-column values in cases where there is no match
            dupes_index_columns = [col for col in dupes.columns if str(col) != 'duplicates']
            dupes_max_sim.loc[rows_to_update, index_column_list] = \
-               dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
-
+               dupes_max_sim.loc[rows_to_update, dupes_index_columns].values
+
            # Restore their original datatypes if possible:
            for m, d in zip(index_column_list, dupes_index_columns):
                if dupes_max_sim[m].dtype != master[m].dtype and dupes[d].dtype == master[m].dtype:
                    dupes_max_sim.loc[:, m] = dupes_max_sim.loc[:, m].astype(master[m].dtype)
-
+
        # Make sure to keep same order as duplicates
        dupes_max_sim = dupes_max_sim.sort_values('dupe_side').set_index('dupe_side')
        output = dupes_max_sim[index_column_list + required_column_list]
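# The pandas quirk the dtype-restoring code above works around: a merge that introduces
# NaN silently upcasts integer columns to float, so the columns are cast back once the
# missing values have been filled in. Toy data below; the column names are illustrative only.
import pandas as pd

master_ids = pd.DataFrame({'key': ['a', 'b'], 'master_id': [1, 2]})
dupes = pd.DataFrame({'key': ['b', 'c'], 'duplicates_id': [20, 30]})
merged = dupes.merge(master_ids, on='key', how='left')
print(merged['master_id'].dtype)                          # float64: NaN appeared for 'c'
merged['master_id'] = merged['master_id'].fillna(merged['duplicates_id'])
merged['master_id'] = merged['master_id'].astype(master_ids['master_id'].dtype)
print(merged['master_id'].dtype)                          # int64 again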
@@ -608,7 +614,7 @@ def _get_indices_of(self, master_side: str, dupe_side: str) -> Tuple[pd.Series,
        master_indices = master_strings[master_strings == master_side].index.to_series().reset_index(drop=True)
        dupe_indices = dupe_strings[dupe_strings == dupe_side].index.to_series().reset_index(drop=True)
        return master_indices, dupe_indices
-
+
    def _validate_group_rep_specs(self):
        group_rep_options = (GROUP_REP_FIRST, GROUP_REP_CENTROID)
        if self._config.group_rep not in group_rep_options:
@@ -626,16 +632,16 @@ def _validate_replace_na_and_drop(self):
            )

    @staticmethod
-   def _fix_diagonal(A) -> csr_matrix:
-       r = np.arange(A.shape[0])
-       A[r, r] = 1
-       return A
+   def _fix_diagonal(m: lil_matrix) -> csr_matrix:
+       r = np.arange(m.shape[0])
+       m[r, r] = 1
+       return m

    @staticmethod
-   def _symmetrize_matrix(A) -> csr_matrix:
-       r, c = A.nonzero()
-       A[c, r] = A[r, c]
-       return A
+   def _symmetrize_matrix(m_symmetric: lil_matrix) -> csr_matrix:
+       r, c = m_symmetric.nonzero()
+       m_symmetric[c, r] = m_symmetric[r, c]
+       return m_symmetric

    @staticmethod
    def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
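# The excerpt ends at _get_matches_list's signature. A hypothetical sketch of such a
# matrix-to-DataFrame conversion (an assumption, not the library's actual body), using
# the 'master_side'/'dupe_side'/'similarity' column names seen elsewhere in this module:
import pandas as pd
from scipy.sparse import csr_matrix

def matches_to_dataframe(matches: csr_matrix) -> pd.DataFrame:
    coo = matches.tocoo()                      # COO exposes row/col/data triplets directly
    return pd.DataFrame({'master_side': coo.row,
                         'dupe_side': coo.col,
                         'similarity': coo.data})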