@@ -197,7 +197,10 @@ def test_match_strings(self, mock_StringGouper):
197
197
mock_StringGrouper_instance .get_matches .assert_called_once ()
198
198
self .assertEqual (df , 'whatever' )
199
199
200
- @patch ('string_grouper.string_grouper.StringGrouper._symmetrize_matrix' , side_effect = mock_symmetrize_matrix )
200
+ @patch (
201
+ 'string_grouper.string_grouper.StringGrouper._symmetrize_matrix' ,
202
+ side_effect = mock_symmetrize_matrix
203
+ )
201
204
def test_match_list_symmetry_without_symmetrize_function (self , mock_symmetrize_matrix ):
202
205
"""mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is
203
206
**partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
@@ -236,17 +239,33 @@ def test_match_list_symmetry_with_symmetrize_function(self):
236
239
# upper, upper_prime and their intersection should be identical.
237
240
self .assertTrue (intersection .empty or len (upper ) == len (upper_prime ) == len (intersection ))
238
241
239
- def test_match_list_diagonal (self ):
242
@patch('string_grouper.string_grouper.StringGrouper._fix_diagonal', side_effect=mock_symmetrize_matrix)
def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal):
    """With ``StringGrouper._fix_diagonal`` patched out, self-joins go missing.

    Verifies that the patched method is still invoked exactly once and that,
    without the real diagonal fix, the number of self-joins returned by
    ``match_strings`` differs from the number of input strings.
    """
    # NOTE(review): mock_symmetrize_matrix is defined elsewhere in this module and
    # is reused here as the stand-in for _fix_diagonal -- confirm that is intended.
    # The missing-self-join bug mostly shows up on very large datasets; on a small
    # dataset, max_n_matches=1 is enough to reproduce it.
    customer_names = SimpleExample().customers_df['Customer Name']
    matches = match_strings(customer_names, max_n_matches=1)
    mock_fix_diagonal.assert_called_once()
    # A self-join is a row whose left and right indices coincide.
    is_self_join = matches['left_index'] == matches['right_index']
    self.assertNotEqual(len(matches[is_self_join]), len(customer_names))
249
257
258
def test_match_list_diagonal(self):
    """Check that every input string is matched with itself.

    The number of self-joins (rows whose left and right indices are equal)
    returned by ``match_strings`` must equal the number of input strings.
    """
    # Missing self-joins mostly show up on very large datasets; on a small
    # dataset, max_n_matches=1 is enough to reproduce the problem.
    names = SimpleExample().customers_df['Customer Name']
    result = match_strings(names, max_n_matches=1)
    on_diagonal = result['left_index'] == result['right_index']
    self.assertEqual(len(result[on_diagonal]), len(names))
268
+
250
269
def test_zero_min_similarity (self ):
251
270
"""Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are
252
271
returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic"""
@@ -381,7 +400,7 @@ def test_get_matches_single(self):
381
400
left_side = ['foo' , 'foo' , 'bar' , 'baz' , 'foo' , 'foo' ]
382
401
right_side = ['foo' , 'foo' , 'bar' , 'baz' , 'foo' , 'foo' ]
383
402
left_index = [0 , 0 , 1 , 2 , 3 , 3 ]
384
- right_index = [3 , 0 , 1 , 2 , 3 , 0 ]
403
+ right_index = [0 , 3 , 1 , 2 , 0 , 3 ]
385
404
similarity = [1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ]
386
405
expected_df = pd .DataFrame ({'left_index' : left_index , 'left_side' : left_side ,
387
406
'similarity' : similarity ,
@@ -397,8 +416,8 @@ def test_get_matches_1_series_1_id_series(self):
397
416
left_side_id = ['A0' , 'A0' , 'A1' , 'A2' , 'A3' , 'A3' ]
398
417
left_index = [0 , 0 , 1 , 2 , 3 , 3 ]
399
418
right_side = ['foo' , 'foo' , 'bar' , 'baz' , 'foo' , 'foo' ]
400
- right_side_id = ['A3 ' , 'A0 ' , 'A1' , 'A2' , 'A3 ' , 'A0 ' ]
401
- right_index = [3 , 0 , 1 , 2 , 3 , 0 ]
419
+ right_side_id = ['A0 ' , 'A3 ' , 'A1' , 'A2' , 'A0 ' , 'A3 ' ]
420
+ right_index = [0 , 3 , 1 , 2 , 0 , 3 ]
402
421
similarity = [1.0 , 1.0 , 1.0 , 1.0 , 1.0 , 1.0 ]
403
422
expected_df = pd .DataFrame ({'left_index' : left_index , 'left_side' : left_side , 'left_id' : left_side_id ,
404
423
'similarity' : similarity ,
0 commit comments