Skip to content

Commit 93d0fca

Browse files
made ntop always flexible (i.e., not only when ntop >= B.shape[1])
1 parent 5a12efb commit 93d0fca

9 files changed

+110
-70
lines changed

sparse_dot_topn/awesome_cossim_topn.py

+30-45
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ def awesome_cossim_topn(
1818
lower_bound=0,
1919
use_threads=False,
2020
n_jobs=1,
21-
ntop_is_flexible=False,
2221
mem_manager_is_C=False,
2322
return_best_topn=False
2423
):
@@ -35,13 +34,9 @@ def awesome_cossim_topn(
3534
lower_bound: a threshold that the element of A*B must be greater than
3635
use_threads: use multi-thread or not
3736
n_jobs: number of thread, must be >= 1
38-
ntop_is_flexible: (default: False) if True, memory management will be handed
39-
over to C/C++ whenever python's attempt at allocating
40-
memory fails.
4137
mem_manager_is_C: (default: False) this is mainly for testing purposes. if
4238
True, will force memory management to be handed over to
43-
C/C++. Should be used only when ntop >= number of columns
44-
of B or ntop_is_flexible=True.
39+
C/C++.
4540
return_best_topn: (default: False) if True, will return best_topn together
4641
with C as a tuple: (C, best_topn)
4742
@@ -82,58 +77,46 @@ def awesome_cossim_topn(
8277
return output
8378

8479
# filled matrices from here on
85-
indptr = np.empty(M + 1, dtype=idx_dtype)
80+
indptr = np.empty(M+1, dtype=idx_dtype)
8681
try:
8782
indices = np.empty(nnz_max, dtype=idx_dtype)
8883
data = np.empty(nnz_max, dtype=A.dtype)
89-
9084
if mem_manager_is_C: raise MemoryError # This is mainly for testing purposes
91-
9285
except MemoryError:
9386
# if mem_manager_is_C: print('Exception raised! Continuing ...', flush=True)
94-
if ntop_is_flexible or ntop >= N:
9587
# It is likely you are here because nnz_max is too large. But don't give up just yet!
9688
# sparse_dot_topn will hand over the memory allocation/management to C++. C++ will
9789
# grow the memory allocations for these arrays as needed without any need for nnz_max.
9890
# Note that reallocations could occur causing data to be copied to other locations
9991
# in memory thus impacting performance
100-
indices = np.empty(0, dtype=idx_dtype)
101-
data = np.empty(0, dtype=A.dtype)
102-
if not use_threads:
103-
104-
indices, data, best_topn = ct.sparse_dot_free(
105-
M, N, np.asarray(A.indptr, dtype=idx_dtype),
106-
np.asarray(A.indices, dtype=idx_dtype),
107-
A.data,
108-
np.asarray(B.indptr, dtype=idx_dtype),
109-
np.asarray(B.indices, dtype=idx_dtype),
110-
B.data,
111-
lower_bound,
112-
indptr
113-
)
114-
else:
115-
116-
indices, data, best_topn = ct_thread.sparse_dot_free_threaded(
117-
M, N, np.asarray(A.indptr, dtype=idx_dtype),
118-
np.asarray(A.indices, dtype=idx_dtype),
119-
A.data,
120-
np.asarray(B.indptr, dtype=idx_dtype),
121-
np.asarray(B.indices, dtype=idx_dtype),
122-
B.data,
123-
lower_bound,
124-
indptr, n_jobs
125-
)
92+
indices = np.empty(0, dtype=idx_dtype)
93+
data = np.empty(0, dtype=A.dtype)
94+
if not use_threads:
95+
96+
indices, data, best_topn = ct.sparse_dot_free(
97+
M, N, np.asarray(A.indptr, dtype=idx_dtype),
98+
np.asarray(A.indices, dtype=idx_dtype),
99+
A.data,
100+
np.asarray(B.indptr, dtype=idx_dtype),
101+
np.asarray(B.indices, dtype=idx_dtype),
102+
B.data,
103+
ntop, lower_bound,
104+
indptr
105+
)
106+
126107
else:
127108

128-
if mem_manager_is_C:
129-
raise Exception(
130-
'When mem_manager_is_C=True, set ntop >= B.shape[1], or set ntop_is_flexible=True'
131-
)
132-
else:
133-
raise Exception(
134-
'Not enough memory! Data array is too large. Try reducing the value of ntop.'
135-
'or set ntop_is_flexible=True'
136-
)
109+
indices, data, best_topn = ct_thread.sparse_dot_free_threaded(
110+
M, N, np.asarray(A.indptr, dtype=idx_dtype),
111+
np.asarray(A.indices, dtype=idx_dtype),
112+
A.data,
113+
np.asarray(B.indptr, dtype=idx_dtype),
114+
np.asarray(B.indices, dtype=idx_dtype),
115+
B.data,
116+
ntop, lower_bound,
117+
indptr, n_jobs
118+
)
119+
137120
else:
138121
# no exception was raised; then use old function (as it is expected to be the fastest)
139122

@@ -152,6 +135,7 @@ def awesome_cossim_topn(
152135
lower_bound,
153136
indptr, indices, data, best_topn_arr
154137
)
138+
155139
else:
156140
if n_jobs < 1:
157141
err_str = 'Whenever you select the multi-thread mode, n_job must be greater than or equal to 1!'
@@ -168,6 +152,7 @@ def awesome_cossim_topn(
168152
lower_bound,
169153
indptr, indices, data, best_topn_arr, n_jobs
170154
)
155+
171156
best_topn = best_topn_arr[0]
172157

173158
# prepare and return the output:

sparse_dot_topn/sparse_dot_topn.pyx

+4-2
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ cdef extern from "sparse_dot_topn_source.h":
7272
int Bp[],
7373
int Bj[],
7474
double Bx[],
75+
int ntop,
7576
double lower_bound,
7677
int Cp[],
7778
vector[int]* Cj,
@@ -202,7 +203,8 @@ cpdef sparse_dot_free(
202203
np.ndarray[int, ndim=1] b_indptr,
203204
np.ndarray[int, ndim=1] b_indices,
204205
np.ndarray[double, ndim=1] b_data,
205-
double lower_bound,
206+
int ntop,
207+
double lower_bound,
206208
np.ndarray[int, ndim=1] c_indptr
207209
):
208210
"""
@@ -242,7 +244,7 @@ cpdef sparse_dot_free(
242244
cdef vector[int] vCj;
243245
cdef vector[double] vCx;
244246

245-
sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax)
247+
sparse_dot_free_source(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax)
246248

247249
c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0)
248250
c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0)

sparse_dot_topn/sparse_dot_topn_parallel.cpp

+20-7
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,7 @@ void sparse_dot_topn_extd_parallel(
419419
void inner_sparse_dot_free(
420420
job_range_type job_range,
421421
int n_col_inner,
422+
int ntop_inner,
422423
double lower_bound_inner,
423424
int Ap_copy[],
424425
int Aj_copy[],
@@ -485,18 +486,29 @@ void inner_sparse_dot_free(
485486
}
486487

487488
int len = (int) (real_candidates->size() - sz);
489+
*n_minmax = (len > *n_minmax)? len : *n_minmax;
488490

489491
candidate* candidate_arr_begin = real_candidates->data() + sz;
490-
std::sort(
491-
candidate_arr_begin,
492-
candidate_arr_begin + len,
493-
candidate_cmp
494-
);
492+
if (len > ntop_inner){
493+
std::partial_sort(
494+
candidate_arr_begin,
495+
candidate_arr_begin + ntop_inner,
496+
candidate_arr_begin + len,
497+
candidate_cmp
498+
);
499+
len = ntop_inner;
500+
}
501+
else {
502+
std::sort(
503+
candidate_arr_begin,
504+
candidate_arr_begin + len,
505+
candidate_cmp
506+
);
507+
}
495508

496509
real_candidates->resize(sz + (size_t) len);
497510
*(row_sizes_ptr++) = len;
498511
(*total) += len;
499-
*n_minmax = (len > *n_minmax)? len : *n_minmax;
500512
}
501513
real_candidates->shrink_to_fit();
502514
}
@@ -510,6 +522,7 @@ void sparse_dot_free_parallel(
510522
int Bp[],
511523
int Bj[],
512524
double Bx[], //data of B
525+
int ntop,
513526
double lower_bound,
514527
int Cp[],
515528
std::vector<int>* vCj,
@@ -536,7 +549,7 @@ void sparse_dot_free_parallel(
536549
inner_sparse_dot_free,
537550
job_ranges[job_nr],
538551
n_col,
539-
lower_bound,
552+
ntop, lower_bound,
540553
Ap, Aj, Ax, Bp, Bj, Bx,
541554
&real_candidates[job_nr],
542555
&row_sizes[job_nr],

sparse_dot_topn/sparse_dot_topn_parallel.h

+1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ extern void sparse_dot_free_parallel(
6767
int Bp[],
6868
int Bj[],
6969
double Bx[], //data of B
70+
int ntop,
7071
double lower_bound,
7172
int Cp[],
7273
std::vector<int>* Cj,

sparse_dot_topn/sparse_dot_topn_source.cpp

+9-4
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ void sparse_dot_free_source(
280280
int Bp[],
281281
int Bj[],
282282
double Bx[], //data of B
283+
int ntop,
283284
double lower_bound,
284285
int Cp[],
285286
std::vector<int>* Cj,
@@ -342,18 +343,22 @@ void sparse_dot_free_source(
342343

343344
int len = (int)candidates.size();
344345
*n_minmax = (len > *n_minmax)? len : *n_minmax;
345-
std::sort(candidates.begin(), candidates.end(), candidate_cmp);
346+
347+
if (len > ntop){
348+
std::partial_sort(candidates.begin(), candidates.begin()+ntop, candidates.end(), candidate_cmp);
349+
len = ntop;
350+
} else {
351+
std::sort(candidates.begin(), candidates.end(), candidate_cmp);
352+
}
346353

347354
for(int a=0; a < len; a++){
348355
Cj->push_back(candidates[a].index);
349356
Cx->push_back(candidates[a].value);
350357
}
351358
candidates.clear();
352359

353-
Cp[i+1] = (int) (Cj->size());
360+
Cp[i+1] = Cj->size();
354361
}
355-
Cj->shrink_to_fit();
356-
Cx->shrink_to_fit();
357362
}
358363

359364
/*

sparse_dot_topn/sparse_dot_topn_source.h

+1
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ extern void sparse_dot_free_source(
7070
int Bp[],
7171
int Bj[],
7272
double Bx[], //data of B
73+
int ntop,
7374
double lower_bound,
7475
int Cp[],
7576
std::vector<int>* Cj,

sparse_dot_topn/sparse_dot_topn_threaded.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ cdef extern from "sparse_dot_topn_parallel.h":
7575
int Bp[],
7676
int Bj[],
7777
double Bx[],
78+
int ntop,
7879
double lower_bound,
7980
int Cp[],
8081
vector[int]* Cj,
@@ -167,6 +168,7 @@ cpdef sparse_dot_free_threaded(
167168
np.ndarray[int, ndim=1] b_indptr,
168169
np.ndarray[int, ndim=1] b_indices,
169170
np.ndarray[double, ndim=1] b_data,
171+
int ntop,
170172
double lower_bound,
171173
np.ndarray[int, ndim=1] c_indptr,
172174
int n_jobs
@@ -185,7 +187,7 @@ cpdef sparse_dot_free_threaded(
185187
cdef vector[int] vCj;
186188
cdef vector[double] vCx;
187189

188-
sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
190+
sparse_dot_free_parallel(n_row, n_col, Ap, Aj, Ax, Bp, Bj, Bx, ntop, lower_bound, Cp, &vCj, &vCx, n_minmax, n_jobs)
189191

190192
c_indices = np.asarray(ArrayWrapper_int(vCj)).squeeze(axis=0)
191193
c_data = np.asarray(ArrayWrapper_double(vCx)).squeeze(axis=0)

sparse_dot_topn/test/test_awesome_cossim_topn.py

+36-8
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,15 @@ def helper_awesome_cossim_topn_dense(
6262
use_threads=use_threads,
6363
n_jobs=n_jobs
6464
)
65-
awesome_result_top3 = \
66-
awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
65+
awesome_result_top3 = awesome_cossim_topn(
66+
a_csr,
67+
b_csr_t,
68+
NUM_CANDIDATES,
69+
0.0,
70+
mem_manager_is_C=mem_manager_is_C,
71+
use_threads=use_threads,
72+
n_jobs=n_jobs
73+
)
6774
awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
6875
row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed
6976

@@ -76,8 +83,15 @@ def helper_awesome_cossim_topn_dense(
7683
use_threads=use_threads,
7784
n_jobs=n_jobs
7885
)
79-
pruned_awesome_result_top3 = \
80-
awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
86+
pruned_awesome_result_top3 = awesome_cossim_topn(
87+
a_csr,
88+
b_csr_t,
89+
NUM_CANDIDATES,
90+
PRUNE_THRESHOLD,
91+
mem_manager_is_C=mem_manager_is_C,
92+
use_threads=use_threads,
93+
n_jobs=n_jobs
94+
)
8195
pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
8296
row.data) > 0 else None for row in pruned_awesome_result_top3]
8397

@@ -131,8 +145,15 @@ def helper_awesome_cossim_topn_sparse(
131145
use_threads=use_threads,
132146
n_jobs=n_jobs
133147
)
134-
awesome_result_top3 = \
135-
awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, 0.0, use_threads=use_threads, n_jobs=n_jobs)
148+
awesome_result_top3 = awesome_cossim_topn(
149+
a_csr,
150+
b_csr_t,
151+
NUM_CANDIDATES,
152+
0.0,
153+
mem_manager_is_C=mem_manager_is_C,
154+
use_threads=use_threads,
155+
n_jobs=n_jobs
156+
)
136157
awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
137158
row.data) > 0 else None for row in awesome_result_top3] # make comparable, normally not needed
138159

@@ -145,8 +166,15 @@ def helper_awesome_cossim_topn_sparse(
145166
use_threads=use_threads,
146167
n_jobs=n_jobs
147168
)
148-
pruned_awesome_result_top3 = \
149-
awesome_cossim_topn(a_csr, b_csr_t, NUM_CANDIDATES, PRUNE_THRESHOLD, use_threads=use_threads, n_jobs=n_jobs)
169+
pruned_awesome_result_top3 = awesome_cossim_topn(
170+
a_csr,
171+
b_csr_t,
172+
NUM_CANDIDATES,
173+
PRUNE_THRESHOLD,
174+
mem_manager_is_C=mem_manager_is_C,
175+
use_threads=use_threads,
176+
n_jobs=n_jobs
177+
)
150178
pruned_awesome_result_top3 = [list(zip(row.indices, row.data)) if len(
151179
row.data) > 0 else None for row in pruned_awesome_result_top3]
152180

string_grouper/string_grouper.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -218,9 +218,13 @@ def __init__(self, master: pd.Series,
218218
self._duplicates: pd.Series = duplicates if duplicates is not None else None
219219
self._master_id: pd.Series = master_id if master_id is not None else None
220220
self._duplicates_id: pd.Series = duplicates_id if duplicates_id is not None else None
221+
221222
self._config: StringGrouperConfig = StringGrouperConfig(**kwargs)
222-
self._max_n_matches = len(self._master) if self._config.max_n_matches is None \
223-
else self._config.max_n_matches
223+
if self._config.max_n_matches is None:
224+
self._max_n_matches = len(self._master) if self._duplicates is None else len(self._duplicates)
225+
else:
226+
self._max_n_matches = self._config.max_n_matches
227+
224228
self._validate_group_rep_specs()
225229
self._validate_replace_na_and_drop()
226230
self.is_build = False # indicates if the grouper was fit or not
@@ -435,7 +439,6 @@ def _build_matches(self, master_matrix: csr_matrix, duplicate_matrix: csr_matrix
435439
optional_kwargs = dict()
436440
if self._config.number_of_processes > 1:
437441
optional_kwargs = {
438-
'ntop_is_flexible': self._config.max_n_matches is None,
439442
'return_best_topn': True,
440443
'use_threads': True,
441444
'n_jobs': self._config.number_of_processes

0 commit comments

Comments
 (0)