Skip to content

Commit

Permalink
Ordered iterators.
Browse files Browse the repository at this point in the history
Add ordered iterators over HAMT and `HashSet`.  Given two sets with the
same elements and the same hasher, the new iterators enumerate both
sets in the same order.  This is in contrast to the existing iterators, which
do not guarantee a consistent ordering for elements with the same hash
value.  Consistent ordering is achieved by sorting collision nodes on
the fly during iteration.

We use the new iterators to fix the implementation of `Ord` and `PartialOrd`
for `HashSet` (bodil#175).
  • Loading branch information
ryzhyk committed Feb 20, 2021
1 parent 3f4e01a commit 42584f0
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 11 deletions.
2 changes: 1 addition & 1 deletion benches/native.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ extern crate im;
extern crate rand;
extern crate test;

use rand::{rngs::SmallRng, SeedableRng, Rng};
use rand::{rngs::SmallRng, Rng, SeedableRng};
use std::collections::{BTreeMap, HashMap, VecDeque};
use std::iter::FromIterator;
use test::Bencher;
Expand Down
80 changes: 70 additions & 10 deletions src/hash/set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@ use std::iter::FusedIterator;
use std::iter::{FromIterator, IntoIterator, Sum};
use std::ops::{Add, Deref, Mul};

use crate::nodes::hamt::{hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node};
use crate::nodes::hamt::{
hash_key, Drain as NodeDrain, HashValue, Iter as NodeIter, Node, OrderedIter as NodeOrderedIter,
};
use crate::ordset::OrdSet;
use crate::Vector;
use crate::util::{Pool, PoolRef, Ref};
use crate::Vector;

/// Construct a set from a sequence of values.
///
Expand Down Expand Up @@ -313,15 +315,25 @@ impl<A, S> HashSet<A, S> {

/// Get an iterator over the values in a hash set.
///
/// The iteration order is unspecified; no ordering guarantees are provided.
#[must_use]
pub fn iter(&self) -> Iter<'_, A> {
    // Wrap a node-level iterator that starts at the root of the set.
    let it = NodeIter::new(&self.root, self.size);
    Iter { it }
}

/// Get an ordered iterator over the values in a hash set.
///
/// Please note that the order is consistent between sets using
/// the same hasher, but no other ordering guarantee is offered.
/// Items will not come out in insertion order or sort order.
/// They will, however, come out in the same order every time for
/// the same set.
#[must_use]
pub fn iter(&self) -> Iter<'_, A> {
Iter {
it: NodeIter::new(&self.root, self.size),
pub fn ordered_iter(&self) -> OrderedIter<'_, A> {
OrderedIter {
it: NodeOrderedIter::new(&self.root, self.size),
}
}
}
Expand Down Expand Up @@ -667,14 +679,14 @@ where

impl<A, S> PartialOrd for HashSet<A, S>
where
A: Hash + Eq + Clone + PartialOrd,
A: Hash + Eq + Clone + PartialOrd + Ord,
S: BuildHasher + Default,
{
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
if Ref::ptr_eq(&self.hasher, &other.hasher) {
return self.iter().partial_cmp(other.iter());
return self.ordered_iter().partial_cmp(other.ordered_iter());
}
self.iter().partial_cmp(other.iter())
self.ordered_iter().partial_cmp(other.ordered_iter())
}
}

Expand All @@ -685,9 +697,9 @@ where
{
fn cmp(&self, other: &Self) -> Ordering {
if Ref::ptr_eq(&self.hasher, &other.hasher) {
return self.iter().cmp(other.iter());
return self.ordered_iter().cmp(other.ordered_iter());
}
self.iter().cmp(other.iter())
self.ordered_iter().cmp(other.ordered_iter())
}
}

Expand Down Expand Up @@ -857,6 +869,35 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> {}

impl<'a, A> FusedIterator for Iter<'a, A> {}

/// An ordered iterator over the elements of a set.
///
/// This struct is created by the `ordered_iter` method on `HashSet`.
///
/// Given a deterministic hasher, this iterator yields values in a deterministic
/// order: two sets with the same elements are enumerated in the same order
/// regardless of the order in which elements were inserted in the sets. Items
/// are returned in the order of their hash values. Items with identical hash
/// values are sorted based on the `Ord` trait.
pub struct OrderedIter<'a, A> {
// Node-level ordered iterator that this type wraps; it yields the stored
// `Value<A>` entries, which are unwrapped to `&A` on the way out.
it: NodeOrderedIter<'a, Value<A>>,
}

impl<'a, A> Iterator for OrderedIter<'a, A>
where
    A: 'a + Ord,
{
    type Item = &'a A;

    /// Yield a reference to the next element, or `None` once the wrapped
    /// node iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // The node iterator also reports the entry's hash; only the value is needed here.
        let (entry, _hash) = self.it.next()?;
        Some(&entry.0)
    }

    /// Forward the size hint of the wrapped node iterator unchanged.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let inner = &self.it;
        inner.size_hint()
    }
}

// The size hint is forwarded from the underlying node iterator, which reports
// its remaining item count as both the lower and the upper bound.
impl<'a, A: Ord> ExactSizeIterator for OrderedIter<'a, A> {}

// The underlying node iterator returns `None` whenever its remaining item
// count is zero, so this iterator keeps returning `None` once exhausted.
impl<'a, A: Ord> FusedIterator for OrderedIter<'a, A> {}

/// A consuming iterator over the elements of a set.
pub struct ConsumingIter<A>
where
Expand Down Expand Up @@ -1063,6 +1104,7 @@ mod test {
use crate::test::LolHasher;
use ::proptest::num::i16;
use ::proptest::proptest;
use std::collections::hash_map::DefaultHasher;
use std::hash::BuildHasherDefault;

#[test]
Expand Down Expand Up @@ -1119,6 +1161,24 @@ mod test {
}
}

#[test]
fn consistent_ord() {
    let mut ascending = HashSet::with_hasher(<BuildHasherDefault<DefaultHasher>>::default());
    let mut descending = HashSet::with_hasher(<BuildHasherDefault<DefaultHasher>>::default());

    // Fill both sets with the values 0..=50_000, one in increasing and the
    // other in decreasing insertion order. The element count is intended to
    // be large enough to produce hash collisions.
    for (up, down) in (0..50_001).zip((0..50_001).rev()) {
        ascending.insert(up);
        descending.insert(down);
    }

    // Insertion order must affect neither equality...
    assert_eq!(ascending, descending);
    // ... nor the total order.
    assert_eq!(ascending.cmp(&descending), Ordering::Equal);
}

proptest! {
#[test]
fn proptest_a_set(ref s in hash_set(".*", 10..100)) {
Expand Down
86 changes: 86 additions & 0 deletions src/nodes/hamt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,92 @@ impl<'a, A> ExactSizeIterator for Iter<'a, A> where A: 'a {}

impl<'a, A> FusedIterator for Iter<'a, A> where A: 'a {}

// Ordered Ref iterator.
// Given a deterministic hasher, this iterator yields values in a deterministic
// order: two sets with the same elements are enumerated in the same order
// regardless of the order in which elements were inserted in the sets. Items
// are returned in the order of their hash values. Items with identical hash
// values are sorted based on the `Ord` trait.

pub(crate) struct OrderedIter<'a, A> {
// Number of items still to be yielded; decremented on every yielded item
// and reported as the exact size hint.
count: usize,
// Iterators over ancestor nodes, suspended while one of their child nodes
// is being visited and resumed when the child is exhausted.
stack: Vec<ChunkIter<'a, Entry<A>, HashWidth>>,
// Iterator over the entries of the node currently being visited.
current: ChunkIter<'a, Entry<A>, HashWidth>,
// Collision node currently being drained, if any: its hash together with
// its items, sorted via `Ord` when the node was first encountered.
collision: Option<(HashBits, std::vec::IntoIter<&'a A>)>,
}

impl<'a, A> OrderedIter<'a, A>
where
    A: 'a,
{
    /// Create an ordered iterator that starts at `root` and will yield
    /// `size` items in total.
    pub(crate) fn new(root: &'a Node<A>, size: usize) -> Self {
        // Pre-size the stack of suspended parent iterators; presumably this
        // is the maximum trie depth, so the stack never reallocates.
        let stack_capacity = (HASH_WIDTH / HASH_SHIFT) + 1;
        let stack = Vec::with_capacity(stack_capacity);
        let current = root.data.iter();
        OrderedIter {
            count: size,
            stack,
            current,
            collision: None,
        }
    }
}

impl<'a, A> Iterator for OrderedIter<'a, A>
where
A: 'a + Ord,
{
type Item = (&'a A, HashBits);

fn next(&mut self) -> Option<Self::Item> {
if self.count == 0 {
return None;
}
if self.collision.is_some() {
if let Some((hash, ref mut coll)) = self.collision {
match coll.next() {
None => {}
Some(value) => {
self.count -= 1;
return Some((value, hash));
}
}
}
self.collision = None;
return self.next();
}
match self.current.next() {
Some(Entry::Value(value, hash)) => {
self.count -= 1;
Some((value, *hash))
}
Some(Entry::Node(child)) => {
let current = mem::replace(&mut self.current, child.data.iter());
self.stack.push(current);
self.next()
}
Some(Entry::Collision(coll)) => {
let mut refs: Vec<&'a A> = coll.data.iter().collect();
refs.sort();
self.collision = Some((coll.hash, refs.into_iter()));
self.next()
}
None => match self.stack.pop() {
None => None,
Some(iter) => {
self.current = iter;
self.next()
}
},
}
}

fn size_hint(&self) -> (usize, Option<usize>) {
(self.count, Some(self.count))
}
}

// `size_hint` reports the remaining item count as both the lower and the
// upper bound.
impl<'a, A> ExactSizeIterator for OrderedIter<'a, A> where A: 'a + Ord {}

// `next` returns `None` whenever the remaining item count is zero, and the
// count is only ever decremented, so the iterator stays exhausted.
impl<'a, A> FusedIterator for OrderedIter<'a, A> where A: 'a + Ord {}

// Mut ref iterator

pub(crate) struct IterMut<'a, A> {
Expand Down

0 comments on commit 42584f0

Please sign in to comment.