made ntop always flexible (i.e., not only when ntop >= B.shape[1])

ParticularMiner · ParticularMiner · commit 30712de5d8c1 · 2021-04-25T16:51:23.000+02:00
diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py
diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx
diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp
@@ -419,6 +419,7 @@ void sparse_dot_topn_extd_parallel(
 void inner_sparse_dot_free(
 		job_range_type job_range,
 		int n_col_inner,
+		int ntop_inner,
 		double lower_bound_inner,
 		int Ap_copy[],
 		int Aj_copy[],
@@ -485,18 +486,29 @@ void inner_sparse_dot_free(
 		}
 
 		int len = (int) (real_candidates->size() - sz);
+		*n_minmax = (len > *n_minmax)? len : *n_minmax;
 
 		candidate* candidate_arr_begin = real_candidates->data() + sz;
-		std::sort(
-				candidate_arr_begin,
-				candidate_arr_begin + len,
-				candidate_cmp
-		);
+		if (len > ntop_inner){
+			std::partial_sort(
+					candidate_arr_begin,
+					candidate_arr_begin + ntop_inner,
+					candidate_arr_begin + len,
+					candidate_cmp
+			);
+			len = ntop_inner;
+		}
+		else {
+			std::sort(
+					candidate_arr_begin,
+					candidate_arr_begin + len,
+					candidate_cmp
+			);
+		}
 
 		real_candidates->resize(sz + (size_t) len);
 		*(row_sizes_ptr++) = len;
 		(*total) += len;
-		*n_minmax = (len > *n_minmax)? len : *n_minmax;
 	}
 	real_candidates->shrink_to_fit();
 }
@@ -510,6 +522,7 @@ void sparse_dot_free_parallel(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* vCj,
@@ -536,7 +549,7 @@ void sparse_dot_free_parallel(
 				inner_sparse_dot_free,
 				job_ranges[job_nr],
 				n_col,
-				lower_bound,
+				ntop, lower_bound,
 				Ap, Aj, Ax, Bp, Bj, Bx,
 				&real_candidates[job_nr],
 				&row_sizes[job_nr],
diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h
@@ -67,6 +67,7 @@ extern void sparse_dot_free_parallel(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp
@@ -250,8 +250,9 @@ void sparse_dot_topn_extd_source(
 	C++ implementation of sparse_dot_free_source
 
 	This function will return a matrix C in CSR format, where
-	C = [all results > lower_bound sorted for each row of A * B].
-	It also returns the maximum number of elements per row of C.
+	C = [the top ntop results > lower_bound, sorted, for each row of A * B].
+	It also returns n_minmax, the maximum number of results > lower_bound found
+	in any row before truncation to ntop (i.e., the row maximum if ntop = n_col).
 
 	Input:
 		n_row: number of rows of A matrix
@@ -260,7 +261,7 @@ void sparse_dot_topn_extd_source(
 		Ap, Aj, Ax: CSR expression of A matrix
 		Bp, Bj, Bx: CSR expression of B matrix
 
-		memory_bound: the maximum number of elements per row of C
+		ntop: the maximum number of top results to keep per row of C
 		lower_bound: a threshold that the element of A*B must greater than
 
 	Output by reference:
@@ -280,6 +281,7 @@ void sparse_dot_free_source(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
@@ -342,18 +344,22 @@ void sparse_dot_free_source(
 
 		int len = (int)candidates.size();
 		*n_minmax = (len > *n_minmax)? len : *n_minmax;
-		std::sort(candidates.begin(), candidates.end(), candidate_cmp);
+
+		if (len > ntop){
+			std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp);
+			len = ntop;
+		} else {
+			std::sort(candidates.begin(), candidates.end(), candidate_cmp);
+		}
 
 		for(int a=0; a < len; a++){
 			Cj->push_back(candidates[a].index);
 			Cx->push_back(candidates[a].value);
 		}
 		candidates.clear();
 
-		Cp[i+1] = (int) (Cj->size());
+		Cp[i+1] = Cj->size();
 	}
-	Cj->shrink_to_fit();
-	Cx->shrink_to_fit();
 }
 
 /*
diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h
@@ -70,6 +70,7 @@ extern void sparse_dot_free_source(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx
@@ -75,6 +75,7 @@ cdef extern from "sparse_dot_topn_parallel.h":
 										int Bp[],
 										int Bj[],
 										double Bx[],
+										int ntop,
 										double lower_bound,
 										int Cp[],
 										vector[int]* Cj,
@@ -167,6 +168,7 @@ cpdef sparse_dot_free_threaded(
 								np.ndarray[int, ndim=1] b_indptr,
 								np.ndarray[int, ndim=1] b_indices,
 								np.ndarray[double, ndim=1] b_data,
+								int ntop,
 								double lower_bound,
 								np.ndarray[int, ndim=1] c_indptr,
 								int n_jobs
@@ -185,7 +187,7 @@ cpdef sparse_dot_free_threaded(
 	cdef vector[int] vCj;
 	cdef vector[double] vCx;
 
-	sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
+	sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
 	
 	c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0)
 	c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0)
diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py
@@ -62,8 +62,15 @@ def helper_awesome_cossim_topn_dense(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
+    awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        0.0,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in awesome_result_top3]  # make comparable, normally not needed
 
@@ -76,8 +83,15 @@ def helper_awesome_cossim_topn_dense(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    pruned_awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
+    pruned_awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        PRUNE_THRESHOLD,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in pruned_awesome_result_top3]
 
@@ -131,8 +145,15 @@ def helper_awesome_cossim_topn_sparse(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
+    awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        0.0,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in awesome_result_top3]  # make comparable, normally not needed
 
@@ -145,8 +166,15 @@ def helper_awesome_cossim_topn_sparse(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    pruned_awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
+    pruned_awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        PRUNE_THRESHOLD,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in pruned_awesome_result_top3]
 
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -218,9 +218,13 @@ def __init__(self, master: pd.Series,
         self._duplicates: pd.Series = duplicates if duplicates is not None else None
         self._master_id: pd.Series = master_id if master_id is not None else None
         self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None
+
         self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
-        self._max_n_matches = len(self._master) if self._config.max_n_matches is None \
-            else self._config.max_n_matches
+        if self._config.max_n_matches is None:
+            self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
+        else:
+            self._max_n_matches = self._config.max_n_matches
+
         self._validate_group_rep_specs()
         self._validate_replace_na_and_drop()
         self.is_build = False  # indicates if the grouper was fit or not
@@ -435,7 +439,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
         optional_kwargs = dict()
         if self._config.number_of_processes > 1:
             optional_kwargs = {
-                'ntop_is_flexible': self._config.max_n_matches is None,
                 'return_best_topn': True,
                 'use_threads': True,
                 'n_jobs': self._config.number_of_processes