Made ntop always flexible: the C/C++ memory-management fallback now applies for any ntop (i.e., not only when ntop >= B.shape[1])

ParticularMiner · ParticularMiner · commit 93d0fca6396e · 2021-04-24T23:18:14.000+02:00
diff --git a/sparse_dot_topn/awesome_cossim_topn.py b/sparse_dot_topn/awesome_cossim_topn.py
@@ -18,7 +18,6 @@ def awesome_cossim_topn(
 		lower_bound=0,
 		use_threads=False,
 		n_jobs=1,
-		ntop_is_flexible=False,
 		mem_manager_is_C=False,
 		return_best_topn=False
 	):
@@ -35,13 +34,9 @@ def awesome_cossim_topn(
 		lower_bound: a threshold that the element of A*B must be greater than
 		use_threads: use multi-thread or not
 		n_jobs: number of thread, must be >= 1
-		ntop_is_flexible: (default: False) if True, memory management will be handed 
-						  over to C/C++ whenever python's attempt at allocating
-						  memory fails.
 		mem_manager_is_C: (default: False) this is mainly for testing purposes. if 
 						  True, will force memory management to be handed over to
-						  C/C++. Should be used only when ntop >= number of columns 
-						  of B or ntop_is_flexible=True.
+						  C/C++.
 		return_best_topn: (default: False) if True, will return best_topn together 
 						  with C as a tuple: (C, best_topn)
 
@@ -82,58 +77,46 @@ def awesome_cossim_topn(
 			return output
 
 	# filled matrices from here on
-	indptr = np.empty(M + 1, dtype=idx_dtype)
+	indptr = np.empty(M+1, dtype=idx_dtype)
 	try:
 		indices = np.empty(nnz_max, dtype=idx_dtype)
 		data = np.empty(nnz_max, dtype=A.dtype)
-		
 		if mem_manager_is_C: raise MemoryError	# This is mainly for testing purposes
-		
 	except MemoryError:
 		# if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True)
-		if ntop_is_flexible or ntop >= N:
 		# It is likely you are here because nnz_max is too large. But don't give up just yet! 
 		# sparse_dot_topn will hand over the memory allocation/management to C++.  C++ will
 		# grow the memory allocations for these arrays as needed without any need for nnz_max.
 		# Note that reallocations could occur causing data to be copied to other locations 
 		# in memory thus impacting performance
-			indices = np.empty(0, dtype=idx_dtype)
-			data = np.empty(0, dtype=A.dtype)
-			if not use_threads:
-	
-				indices, data, best_topn = ct.sparse_dot_free(
-					M, N, np.asarray(A.indptr, dtype=idx_dtype),
-					np.asarray(A.indices, dtype=idx_dtype),
-					A.data,
-					np.asarray(B.indptr, dtype=idx_dtype),
-					np.asarray(B.indices, dtype=idx_dtype),
-					B.data,
-					lower_bound,
-					indptr
-				)
-			else:
-	
-				indices, data, best_topn = ct_thread.sparse_dot_free_threaded(
-					M, N, np.asarray(A.indptr, dtype=idx_dtype),
-					np.asarray(A.indices, dtype=idx_dtype),
-					A.data,
-					np.asarray(B.indptr, dtype=idx_dtype),
-					np.asarray(B.indices, dtype=idx_dtype),
-					B.data,
-					lower_bound,
-					indptr, n_jobs
-				)
+		indices = np.empty(0, dtype=idx_dtype)
+		data = np.empty(0, dtype=A.dtype)
+		if not use_threads:
+
+			indices, data, best_topn = ct.sparse_dot_free(
+				M, N, np.asarray(A.indptr, dtype=idx_dtype),
+				np.asarray(A.indices, dtype=idx_dtype),
+				A.data,
+				np.asarray(B.indptr, dtype=idx_dtype),
+				np.asarray(B.indices, dtype=idx_dtype),
+				B.data,
+				ntop, lower_bound,
+				indptr
+			)
+			
 		else:
 
-			if mem_manager_is_C:
-				raise Exception(
-					'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True'
-				)
-			else:
-				raise Exception(
-					'Not enough memory!  Data array is too large. Try reducing the value of ntop.'
-					'or set ntop_is_flexible=True'
-				)
+			indices, data, best_topn = ct_thread.sparse_dot_free_threaded(
+				M, N, np.asarray(A.indptr, dtype=idx_dtype),
+				np.asarray(A.indices, dtype=idx_dtype),
+				A.data,
+				np.asarray(B.indptr, dtype=idx_dtype),
+				np.asarray(B.indices, dtype=idx_dtype),
+				B.data,
+				ntop, lower_bound,
+				indptr, n_jobs
+			)
+
 	else:
 		# no exception was raised; then use old function (as it is expected to be the fastest)
 		
@@ -152,6 +135,7 @@ def awesome_cossim_topn(
 				lower_bound,
 				indptr, indices, data, best_topn_arr
 			)
+	
 		else:
 			if n_jobs < 1:
 				err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!'
@@ -168,6 +152,7 @@ def awesome_cossim_topn(
 				lower_bound,
 				indptr, indices, data, best_topn_arr, n_jobs
 			)
+		
 		best_topn = best_topn_arr[0]
 	
 	# prepare and return the output:
diff --git a/sparse_dot_topn/sparse_dot_topn.pyx b/sparse_dot_topn/sparse_dot_topn.pyx
@@ -72,6 +72,7 @@ cdef extern from "sparse_dot_topn_source.h":
 										int Bp[],
 										int Bj[],
 										double Bx[],
+										int ntop,
 										double lower_bound,
 										int Cp[],
 										vector[int]* Cj,
@@ -202,7 +203,8 @@ cpdef sparse_dot_free(
 						np.ndarray[int, ndim=1] b_indptr,
 						np.ndarray[int, ndim=1] b_indices,
 						np.ndarray[double, ndim=1] b_data,
-						double lower_bound,
+                        int ntop,
+                        double lower_bound,
 						np.ndarray[int, ndim=1] c_indptr
 					):
 	"""
@@ -242,7 +244,7 @@ cpdef sparse_dot_free(
 	cdef vector[int] vCj;
 	cdef vector[double] vCx;
 
-	sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax)
+	sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax)
 	
 	c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0)
 	c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0)
diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.cpp b/sparse_dot_topn/sparse_dot_topn_parallel.cpp
@@ -419,6 +419,7 @@ void sparse_dot_topn_extd_parallel(
 void inner_sparse_dot_free(
 		job_range_type job_range,
 		int n_col_inner,
+		int ntop_inner,
 		double lower_bound_inner,
 		int Ap_copy[],
 		int Aj_copy[],
@@ -485,18 +486,29 @@ void inner_sparse_dot_free(
 		}
 
 		int len = (int) (real_candidates->size() - sz);
+		*n_minmax = (len > *n_minmax)? len : *n_minmax;
 
 		candidate* candidate_arr_begin = real_candidates->data() + sz;
-		std::sort(
-				candidate_arr_begin,
-				candidate_arr_begin + len,
-				candidate_cmp
-		);
+		if (len > ntop_inner){
+			std::partial_sort(
+					candidate_arr_begin,
+					candidate_arr_begin + ntop_inner,
+					candidate_arr_begin + len,
+					candidate_cmp
+			);
+			len = ntop_inner;
+		}
+		else {
+			std::sort(
+					candidate_arr_begin,
+					candidate_arr_begin + len,
+					candidate_cmp
+			);
+		}
 
 		real_candidates->resize(sz + (size_t) len);
 		*(row_sizes_ptr++) = len;
 		(*total) += len;
-		*n_minmax = (len > *n_minmax)? len : *n_minmax;
 	}
 	real_candidates->shrink_to_fit();
 }
@@ -510,6 +522,7 @@ void sparse_dot_free_parallel(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* vCj,
@@ -536,7 +549,7 @@ void sparse_dot_free_parallel(
 				inner_sparse_dot_free,
 				job_ranges[job_nr],
 				n_col,
-				lower_bound,
+				ntop, lower_bound,
 				Ap, Aj, Ax, Bp, Bj, Bx,
 				&real_candidates[job_nr],
 				&row_sizes[job_nr],
diff --git a/sparse_dot_topn/sparse_dot_topn_parallel.h b/sparse_dot_topn/sparse_dot_topn_parallel.h
@@ -67,6 +67,7 @@ extern void sparse_dot_free_parallel(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
diff --git a/sparse_dot_topn/sparse_dot_topn_source.cpp b/sparse_dot_topn/sparse_dot_topn_source.cpp
@@ -280,6 +280,7 @@ void sparse_dot_free_source(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
@@ -342,18 +343,22 @@ void sparse_dot_free_source(
 
 		int len = (int)candidates.size();
 		*n_minmax = (len > *n_minmax)? len : *n_minmax;
-		std::sort(candidates.begin(), candidates.end(), candidate_cmp);
+
+		if (len > ntop){
+			std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp);
+			len = ntop;
+		} else {
+			std::sort(candidates.begin(), candidates.end(), candidate_cmp);
+		}
 
 		for(int a=0; a < len; a++){
 			Cj->push_back(candidates[a].index);
 			Cx->push_back(candidates[a].value);
 		}
 		candidates.clear();
 
-		Cp[i+1] = (int) (Cj->size());
+		Cp[i+1] = Cj->size();
 	}
-	Cj->shrink_to_fit();
-	Cx->shrink_to_fit();
 }
 
 /*
diff --git a/sparse_dot_topn/sparse_dot_topn_source.h b/sparse_dot_topn/sparse_dot_topn_source.h
@@ -70,6 +70,7 @@ extern void sparse_dot_free_source(
 		int Bp[],
 		int Bj[],
 		double Bx[], //data of B
+		int ntop,
 		double lower_bound,
 		int Cp[],
 		std::vector<int>* Cj,
diff --git a/sparse_dot_topn/sparse_dot_topn_threaded.pyx b/sparse_dot_topn/sparse_dot_topn_threaded.pyx
@@ -75,6 +75,7 @@ cdef extern from "sparse_dot_topn_parallel.h":
 										int Bp[],
 										int Bj[],
 										double Bx[],
+										int ntop,
 										double lower_bound,
 										int Cp[],
 										vector[int]* Cj,
@@ -167,6 +168,7 @@ cpdef sparse_dot_free_threaded(
 								np.ndarray[int, ndim=1] b_indptr,
 								np.ndarray[int, ndim=1] b_indices,
 								np.ndarray[double, ndim=1] b_data,
+								int ntop,
 								double lower_bound,
 								np.ndarray[int, ndim=1] c_indptr,
 								int n_jobs
@@ -185,7 +187,7 @@ cpdef sparse_dot_free_threaded(
 	cdef vector[int] vCj;
 	cdef vector[double] vCx;
 
-	sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
+	sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
 	
 	c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0)
 	c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0)
diff --git a/sparse_dot_topn/test/test_awesome_cossim_topn.py b/sparse_dot_topn/test/test_awesome_cossim_topn.py
@@ -62,8 +62,15 @@ def helper_awesome_cossim_topn_dense(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
+    awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        0.0,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in awesome_result_top3]  # make comparable, normally not needed
 
@@ -76,8 +83,15 @@ def helper_awesome_cossim_topn_dense(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    pruned_awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
+    pruned_awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        PRUNE_THRESHOLD,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in pruned_awesome_result_top3]
 
@@ -131,8 +145,15 @@ def helper_awesome_cossim_topn_sparse(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
+    awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        0.0,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in awesome_result_top3]  # make comparable, normally not needed
 
@@ -145,8 +166,15 @@ def helper_awesome_cossim_topn_sparse(
         use_threads=use_threads,
         n_jobs=n_jobs
     )
-    pruned_awesome_result_top3 = \
-        awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
+    pruned_awesome_result_top3 = awesome_cossim_topn(
+        a_csr,
+        b_csr_t,
+        NUM_CANDIDATES,
+        PRUNE_THRESHOLD,
+        mem_manager_is_C=mem_manager_is_C,
+        use_threads=use_threads,
+        n_jobs=n_jobs
+    )
     pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
         row.data) > 0 else None for row in pruned_awesome_result_top3]
 
diff --git a/string_grouper/string_grouper.py b/string_grouper/string_grouper.py
@@ -218,9 +218,13 @@ def __init__(self, master: pd.Series,
         self._duplicates: pd.Series = duplicates if duplicates is not None else None
         self._master_id: pd.Series = master_id if master_id is not None else None
         self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None
+
         self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
-        self._max_n_matches = len(self._master) if self._config.max_n_matches is None \
-            else self._config.max_n_matches
+        if self._config.max_n_matches is None:
+            self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
+        else:
+            self._max_n_matches = self._config.max_n_matches
+
         self._validate_group_rep_specs()
         self._validate_replace_na_and_drop()
         self.is_build = False  # indicates if the grouper was fit or not
@@ -435,7 +439,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
         optional_kwargs = dict()
         if self._config.number_of_processes > 1:
             optional_kwargs = {
-                'ntop_is_flexible': self._config.max_n_matches is None,
                 'return_best_topn': True,
                 'use_threads': True,
                 'n_jobs': self._config.number_of_processes