Fix bug related to single-valued input Series: always set the matches-matrix diagonal to exactly 1 when no duplicates Series is given

ParticularMiner · ParticularMiner · commit 2c6b102d3883 · 2021-05-05T00:52:33.000+02:00
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -251,11 +251,21 @@ def n_grams(self, string: str) -> List[str]:
     def fit(self) -> 'StringGrouper':
         """Builds the _matches list which contains string matches indices and similarity"""
         master_matrix, duplicate_matrix = self._get_tf_idf_matrices()
+        
         # Calculate the matches using the cosine similarity
         matches, self._true_max_n_matches = self._build_matches(master_matrix, duplicate_matrix)
-        if self._duplicates is None and self._max_n_matches < self._true_max_n_matches:
-            # the list of matches needs to be symmetric!!! (i.e., if A != B and A matches B; then B matches A)
-            matches = StringGrouper._symmetrize_matrix_and_fix_diagonal(matches)
+        
+        if self._duplicates is None:
+            # convert to lil format for best efficiency when setting matrix-elements
+            matches = matches.tolil() 
+            # every string must match itself with similarity exactly 1; floating-point round-off in
+            # awesome_cossim_topn can otherwise leave a self-match with a different value or missing
+            matches = StringGrouper._fix_diagonal(matches)
+            if self._max_n_matches < self._true_max_n_matches:
+                # the list of matches must be symmetric! (i.e., if A != B and A matches B; then B matches A)
+                matches = StringGrouper._symmetrize_matrix(matches)
+            matches = matches.tocsr()
+        
         # build list from matrix
         self._matches_list = self._get_matches_list(matches)
         self.is_build = True
@@ -616,13 +626,16 @@ def _validate_replace_na_and_drop(self):
             )
 
     @staticmethod
-    def _symmetrize_matrix_and_fix_diagonal(AA: csr_matrix) -> csr_matrix:
-        A = AA.tolil()
-        r, c = A.nonzero()
-        A[c, r] = A[r, c]
+    def _fix_diagonal(A) -> csr_matrix:
         r = np.arange(A.shape[0])
         A[r, r] = 1
-        return A.tocsr()
+        return A
+
+    @staticmethod
+    def _symmetrize_matrix(A) -> csr_matrix:
+        r, c = A.nonzero()
+        A[c, r] = A[r, c]
+        return A
 
     @staticmethod
     def _get_matches_list(matches: csr_matrix) -> pd.DataFrame:
diff --git a/string_grouper/test/test_string_grouper.py b/string_grouper/test/test_string_grouper.py
@@ -197,7 +197,10 @@ def test_match_strings(self, mock_StringGouper):
         mock_StringGrouper_instance.get_matches.assert_called_once()
         self.assertEqual(df, 'whatever')
 
-    @patch('string_grouper.string_grouper.StringGrouper._symmetrize_matrix', side_effect=mock_symmetrize_matrix)
+    @patch(
+        'string_grouper.string_grouper.StringGrouper._symmetrize_matrix',
+        side_effect=mock_symmetrize_matrix
+    )
     def test_match_list_symmetry_without_symmetrize_function(self, mock_symmetrize_matrix):
         """mocks StringGrouper._symmetrize_matches_list so that this test fails whenever _matches_list is 
         **partially** symmetric which often occurs when the kwarg max_n_matches is too small"""
@@ -236,17 +239,33 @@ def test_match_list_symmetry_with_symmetrize_function(self):
         # upper, upper_prime and their intersection should be identical.
         self.assertTrue(intersection.empty or len(upper) == len(upper_prime) == len(intersection))
 
-    def test_match_list_diagonal(self):
+    @patch(
+        'string_grouper.string_grouper.StringGrouper._fix_diagonal',
+        side_effect=mock_symmetrize_matrix
+    )
+    def test_match_list_diagonal_without_the_fix(self, mock_fix_diagonal):
         """test fails whenever _matches_list's number of self-joins is not equal to the number of strings"""
         # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
         # for small datasets setting max_n_matches=1 reproduces the bug
         simple_example = SimpleExample()
         df = simple_example.customers_df['Customer Name']
         matches = match_strings(df, max_n_matches=1)
+        mock_fix_diagonal.assert_called_once()
         num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
         num_strings = len(df)
         self.assertNotEqual(num_self_joins, num_strings)
 
+    def test_match_list_diagonal(self):
+        """This test ensures that all self-joins are present"""
+        # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets;
+        # for small datasets setting max_n_matches=1 reproduces the bug
+        simple_example = SimpleExample()
+        df = simple_example.customers_df['Customer Name']
+        matches = match_strings(df, max_n_matches=1)
+        num_self_joins = len(matches[matches['left_index'] == matches['right_index']])
+        num_strings = len(df)
+        self.assertEqual(num_self_joins, num_strings)
+
     def test_zero_min_similarity(self):
         """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are 
         returned when min_similarity <= 0.  A bug related to this was first pointed out by @nbcvijanovic"""
@@ -381,7 +400,7 @@ def test_get_matches_single(self):
         left_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
         right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
         left_index = [0, 0, 1, 2, 3, 3]
-        right_index = [3, 0, 1, 2, 3, 0]
+        right_index = [0, 3, 1, 2, 0, 3]
         similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side,
                                     'similarity': similarity,
@@ -397,8 +416,8 @@ def test_get_matches_1_series_1_id_series(self):
         left_side_id = ['A0', 'A0', 'A1', 'A2', 'A3', 'A3']
         left_index = [0, 0, 1, 2, 3, 3]
         right_side = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
-        right_side_id = ['A3', 'A0', 'A1', 'A2', 'A3', 'A0']
-        right_index = [3, 0, 1, 2, 3, 0]
+        right_side_id = ['A0', 'A3', 'A1', 'A2', 'A0', 'A3']
+        right_index = [0, 3, 1, 2, 0, 3]
         similarity = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
         expected_df = pd.DataFrame({'left_index': left_index, 'left_side': left_side, 'left_id': left_side_id,
                                     'similarity': similarity,