Skip to content

Commit fe7ef10

Browse files
adamreicholdPhilippe-Cholet
authored andcommitted
Add k_smallest_relaxed and variants
This implements the algorithm described in [1], which uses twice as much memory as the existing `k_smallest` algorithm but runs in time linear in the number of input elements. [1] https://quickwit.io/blog/top-k-complexity
1 parent 1c850ce commit fe7ef10

File tree

3 files changed

+270
-2
lines changed

3 files changed

+270
-2
lines changed

src/k_smallest.rs

+39
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,45 @@ where
8888
storage
8989
}
9090

91+
/// Collects the `k` smallest items of `iter`, as ordered by `comparator`, into a `Vec`
/// sorted in ascending order.
///
/// The items are gathered in a buffer of up to `2 * k` elements. Whenever the buffer is
/// full it is partitioned around its k-th smallest element and cut back to `k` elements,
/// so later items only need a single comparison against that element to be rejected.
///
/// * `k == 0` consumes the iterator and returns an empty `Vec`.
/// * If the input yields fewer than `k` items, all of them are returned, sorted.
pub(crate) fn k_smallest_relaxed_general<I, F>(iter: I, k: usize, mut comparator: F) -> Vec<I::Item>
where
    I: Iterator,
    F: FnMut(&I::Item, &I::Item) -> Ordering,
{
    if k == 0 {
        // Nothing can be returned, but the input is still driven to its end.
        iter.last();
        return Vec::new();
    }

    // Saturate rather than compute `2 * k` directly: for `k > usize::MAX / 2` the plain
    // product panics when overflow checks are enabled and wraps around otherwise, which
    // can shrink the buffer limit down to zero and produce a wrong (empty) result.
    let limit = k.saturating_mul(2);

    // Fused so that the iterator can safely be polled again after `take` hit its end.
    let mut iter = iter.fuse();
    let mut buf = iter.by_ref().take(limit).collect::<Vec<_>>();

    if buf.len() < k {
        // The whole input fits in the answer: sorting it is all that is left to do.
        buf.sort_unstable_by(&mut comparator);
        return buf;
    }

    // Move the k smallest items to the front; `buf[k - 1]` becomes the current threshold.
    buf.select_nth_unstable_by(k - 1, &mut comparator);
    buf.truncate(k);

    iter.for_each(|val| {
        // Items that are not strictly below the threshold cannot improve the result.
        if comparator(&val, &buf[k - 1]) != Ordering::Less {
            return;
        }

        buf.push(val);

        if buf.len() == limit {
            // Buffer is full again: re-select the k smallest and refresh the threshold.
            buf.select_nth_unstable_by(k - 1, &mut comparator);
            buf.truncate(k);
        }
    });

    // The buffer may hold up to `limit - 1` candidates in arbitrary order at this point.
    buf.sort_unstable_by(&mut comparator);
    buf.truncate(k);
    buf
}
129+
91130
#[inline]
92131
pub(crate) fn key_to_cmp<T, K, F>(mut key: F) -> impl FnMut(&T, &T) -> Ordering
93132
where

src/lib.rs

+187
Original file line numberDiff line numberDiff line change
@@ -3153,6 +3153,105 @@ pub trait Itertools: Iterator {
31533153
self.k_smallest_by(k, k_smallest::key_to_cmp(key))
31543154
}
31553155

3156+
/// Returns the `k` smallest elements as a new iterator, sorted in ascending order,
/// spending more memory than [`k_smallest`](Itertools::k_smallest) to run faster.
///
/// **Note:** The entire input is consumed, and the returned iterator owns its
/// elements. If the input contains fewer than `k` elements, the result is
/// equivalent to `self.sorted()`.
///
/// This is guaranteed to use `2 * k * sizeof(Self::Item) + O(1)` memory
/// and `O(n + k log k)` time, with `n` the number of elements in the input.
/// It therefore needs more memory than the minimum obtained by
/// [`k_smallest`](Itertools::k_smallest), but is linear in the number of elements.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// **Note:** This is functionally equivalent to `self.sorted().take(k)`
/// but much more efficient.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed(5);
///
/// itertools::assert_equal(five_smallest, 0..5);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    Self::Item: Ord,
{
    // The natural ordering of the items is simply forwarded as the comparator.
    self.k_smallest_relaxed_by(k, <Self::Item as Ord>::cmp)
}
3193+
3194+
/// Returns the `k` smallest elements, as ranked by the comparison function `cmp`,
/// as a new sorted iterator, spending extra memory to run faster.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This relates to `self.sorted_by(cmp).take(k)` exactly as
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) relates to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by<F>(self, k: usize, cmp: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
    // The selection itself yields a sorted `Vec`; hand it out as an owning iterator.
    let smallest = k_smallest::k_smallest_relaxed_general(self, k, cmp);
    smallest.into_iter()
}
3223+
3224+
/// Returns the elements for which `key` produces the `k` smallest outputs,
/// spending extra memory to run faster.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// This relates to `self.sorted_by_key(key).take(k)` exactly as
/// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) relates to `self.sorted().take(k)`,
/// in both semantics and complexity.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_smallest = numbers
///     .into_iter()
///     .k_smallest_relaxed_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_smallest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item) -> K,
    K: Ord,
{
    // Turn the key extractor into a comparator, then reuse the comparator-based variant.
    let by_key = k_smallest::key_to_cmp(key);
    self.k_smallest_relaxed_by(k, by_key)
}
3254+
31563255
/// Sort the k largest elements into a new iterator, in descending order.
31573256
///
31583257
/// The sorted iterator, if directly collected to a `Vec`, is converted
@@ -3243,6 +3342,94 @@ pub trait Itertools: Iterator {
32433342
self.k_largest_by(k, k_smallest::key_to_cmp(key))
32443343
}
32453344

3345+
/// Returns the `k` largest elements as a new iterator, sorted in descending order,
/// spending extra memory to run faster.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// It is semantically equivalent to [`k_smallest_relaxed`](Itertools::k_smallest_relaxed)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed(5);
///
/// itertools::assert_equal(five_largest, vec![14, 13, 12, 11, 10]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed(self, k: usize) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    Self::Item: Ord,
{
    // The natural ordering is passed on; the `_by` variant takes care of reversing it.
    self.k_largest_relaxed_by(k, Ord::cmp)
}
3373+
3374+
/// Returns the `k` largest elements, as ranked by the comparison function `cmp`,
/// as a new iterator, spending extra memory to run faster.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by`](Itertools::k_smallest_relaxed_by)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b)));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by<F>(self, k: usize, mut cmp: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item, &Self::Item) -> Ordering,
{
    // Swapping the arguments reverses the ordering, so "smallest" selects the largest.
    let reversed = move |lhs: &Self::Item, rhs: &Self::Item| cmp(rhs, lhs);
    self.k_smallest_relaxed_by(k, reversed)
}
3402+
3403+
/// Returns the elements for which `key` produces the `k` largest outputs,
/// spending extra memory to run faster.
///
/// The sorted iterator, if directly collected to a `Vec`, is converted
/// without any extra copying or allocation cost.
///
/// Functionally equivalent to [`k_smallest_relaxed_by_key`](Itertools::k_smallest_relaxed_by_key)
/// with a reversed `Ord`.
///
/// ```
/// use itertools::Itertools;
///
/// // A random permutation of 0..15
/// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5];
///
/// let five_largest = numbers
///     .into_iter()
///     .k_largest_relaxed_by_key(5, |n| (n % 7, *n));
///
/// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]);
/// ```
#[cfg(feature = "use_alloc")]
fn k_largest_relaxed_by_key<F, K>(self, k: usize, key: F) -> VecIntoIter<Self::Item>
where
    Self: Sized,
    F: FnMut(&Self::Item) -> K,
    K: Ord,
{
    // Turn the key extractor into a comparator, then reuse the comparator-based variant.
    let by_key = k_smallest::key_to_cmp(key);
    self.k_largest_relaxed_by(k, by_key)
}
3432+
32463433
/// Consumes the iterator and return an iterator of the last `n` elements.
32473434
///
32483435
/// The iterator, if directly collected to a `VecDeque`, is converted

tests/test_std.rs

+44-2
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,42 @@ qc::quickcheck! {
528528
it::assert_equal(largest_by, sorted_largest.clone());
529529
it::assert_equal(largest_by_key, sorted_largest);
530530
}
531+
532+
fn k_smallest_relaxed_range(n: i64, m: u16, k: u16) -> () {
533+
// u16 is used to constrain k and m to 0..2¹⁶,
534+
// otherwise the test could use too much memory.
535+
let (k, m) = (k as usize, m as u64);
536+
537+
let mut v: Vec<_> = (n..n.saturating_add(m as _)).collect();
538+
// Generate a random permutation of n..n+m
539+
v.shuffle(&mut thread_rng());
540+
541+
// Construct the right answers for the top and bottom elements
542+
let mut sorted = v.clone();
543+
sorted.sort();
544+
// how many elements are we checking
545+
let num_elements = min(k, m as _);
546+
547+
// Compute the top and bottom k in various combinations
548+
let sorted_smallest = sorted[..num_elements].iter().cloned();
549+
let smallest = v.iter().cloned().k_smallest_relaxed(k);
550+
let smallest_by = v.iter().cloned().k_smallest_relaxed_by(k, Ord::cmp);
551+
let smallest_by_key = v.iter().cloned().k_smallest_relaxed_by_key(k, |&x| x);
552+
553+
let sorted_largest = sorted[sorted.len() - num_elements..].iter().rev().cloned();
554+
let largest = v.iter().cloned().k_largest_relaxed(k);
555+
let largest_by = v.iter().cloned().k_largest_relaxed_by(k, Ord::cmp);
556+
let largest_by_key = v.iter().cloned().k_largest_relaxed_by_key(k, |&x| x);
557+
558+
// Check the variations produce the same answers and that they're right
559+
it::assert_equal(smallest, sorted_smallest.clone());
560+
it::assert_equal(smallest_by, sorted_smallest.clone());
561+
it::assert_equal(smallest_by_key, sorted_smallest);
562+
563+
it::assert_equal(largest, sorted_largest.clone());
564+
it::assert_equal(largest_by, sorted_largest.clone());
565+
it::assert_equal(largest_by_key, sorted_largest);
566+
}
531567
}
532568

533569
#[derive(Clone, Debug)]
@@ -572,8 +608,11 @@ where
572608
I::Item: Ord + Debug,
573609
{
574610
let j = i.clone();
611+
let i1 = i.clone();
612+
let j1 = i.clone();
575613
let k = k as usize;
576-
it::assert_equal(i.k_smallest(k), j.sorted().take(k))
614+
it::assert_equal(i.k_smallest(k), j.sorted().take(k));
615+
it::assert_equal(i1.k_smallest_relaxed(k), j1.sorted().take(k));
577616
}
578617

579618
// Similar to `k_smallest_sort` but for our custom heap implementation.
@@ -583,8 +622,11 @@ where
583622
I::Item: Ord + Debug,
584623
{
585624
let j = i.clone();
625+
let i1 = i.clone();
626+
let j1 = i.clone();
586627
let k = k as usize;
587-
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k))
628+
it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k));
629+
it::assert_equal(i1.k_smallest_relaxed_by(k, Ord::cmp), j1.sorted().take(k));
588630
}
589631

590632
macro_rules! generic_test {

0 commit comments

Comments
 (0)