Skip to content

Commit e01d2ac

Browse files
committed
bitm can return arrays as iterators
wavelet_matrix can use any bitvec type, e.g. aligned Box
1 parent ecbcf49 commit e01d2ac

File tree

7 files changed

+85
-35
lines changed

7 files changed

+85
-35
lines changed

binout/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "binout"
3-
version = "0.2.1"
3+
version = "0.3.0"
44
edition = "2021"
55
authors = ["Piotr Beling <[email protected]>"]
66
license = "MIT OR Apache-2.0"

binout/src/lib.rs

+47-4
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,36 @@
11
#![doc = include_str!("../README.md")]
22

3+
/// Iterator whose every `next` call uses the deserializer `S` to deserialize a value of type `T` from the `input`.
4+
pub struct ReadIter<'r, T: ?Sized, S, R: ?Sized> {
5+
pub input: &'r mut R,
6+
serializer_type: std::marker::PhantomData<S>,
7+
value_type: std::marker::PhantomData<T>
8+
}
9+
10+
impl<'r, T: Copy, S: Serializer::<T>, R: std::io::Read + ?Sized> Iterator for ReadIter<'r, T, S, R> {
11+
type Item = Result<T, std::io::Error>;
12+
13+
#[inline] fn next(&mut self) -> Option<Self::Item> {
14+
Some(S::read(self.input))
15+
}
16+
17+
#[inline] fn size_hint(&self) -> (usize, Option<usize>) {
18+
(usize::MAX, None)
19+
}
20+
}
21+
22+
impl<'r, T: Copy, S: Serializer::<T>, R: std::io::Read + ?Sized> std::iter::FusedIterator for ReadIter<'r, T, S, R> {}
23+
24+
pub type ReadNIter<'r, T, S, R> = std::iter::Take<ReadIter<'r, T, S, R>>;
25+
326
/// Trait implemented by each serializer for the following types:
427
/// `u8`, `u16`, `u32`, `u64`, `usize` (which, for portability, is always serialized the same as `u64`).
528
pub trait Serializer<T: Copy>: Copy {
629

730
/// Either size of each value in bytes (if each value occupies constant size) or [`None`].
831
const CONST_SIZE: Option<usize> = None;
932

10-
/// Returns number of bytes which [`write`](Serializer::write) needs to serialize `val`.
33+
/// Returns the number of bytes which [`write`](Serializer::write) needs to serialize `val`.
1134
fn size(val: T) -> usize;
1235

1336
/// Serialize `val` to the given `output`.
@@ -30,7 +53,7 @@ pub trait Serializer<T: Copy>: Copy {
3053
Self::write_all_values(output, values.into_iter().cloned())
3154
}
3255

33-
/// Returns number of bytes occupied by serialized content of the array.
56+
/// Returns the number of bytes occupied by serialized content of the array.
3457
fn array_content_size(array: &[T]) -> usize {
3558
if let Some(each_element_size) = Self::CONST_SIZE {
3659
each_element_size * array.len()
@@ -39,7 +62,7 @@ pub trait Serializer<T: Copy>: Copy {
3962
}
4063
}
4164

42-
/// Returns number of bytes which [`write_array`](Serializer::write_array) needs to serialize `array`.
65+
/// Returns the number of bytes which [`write_array`](Serializer::write_array) needs to serialize `array`.
4366
#[inline] fn array_size(array: &[T]) -> usize {
4467
VByte::size(array.len()) + Self::array_content_size(array)
4568
}
@@ -52,16 +75,36 @@ pub trait Serializer<T: Copy>: Copy {
5275

5376
/// Deserialize `n` values from the given `input`.
5477
fn read_n<R: std::io::Read + ?Sized>(input: &mut R, n: usize) -> std::io::Result<Box<[T]>> {
78+
//Self::read_n_iter(input, n).collect()
5579
let mut result = Vec::with_capacity(n);
5680
for _ in 0..n { result.push(Self::read(input)?); }
5781
Ok(result.into_boxed_slice())
5882
}
5983

60-
/// Deserialize array from the given `input`. Size of the array is serialized in [`VByte`] format.
84+
/// Deserialize array from the given `input`. Size of the array is deserialized from [`VByte`] format.
6185
fn read_array<R: std::io::Read + ?Sized>(input: &mut R) -> std::io::Result<Box<[T]>> {
6286
let n = VByte::read(input)?;
6387
Self::read_n(input, n)
6488
}
89+
90+
/// Returns an iterator whose every `next` call deserializes a value from the `input`.
91+
#[inline] fn read_iter<R: ?Sized>(input: &mut R) -> ReadIter<'_, T, Self, R> {
92+
ReadIter { input, serializer_type: Default::default(), value_type: Default::default() }
93+
}
94+
95+
/// Returns an iterator whose first `n` `next` calls deserialize values from the `input`,
96+
/// while further calls yield [`None`].
97+
#[inline] fn read_n_iter<R: std::io::Read + ?Sized>(input: &mut R, n: usize) -> ReadNIter<'_, T, Self, R> {
98+
Self::read_iter(input).take(n)
99+
}
100+
101+
/// First tries to read from the `input` the number of values *n*, which is stored in [`VByte`] format.
102+
/// If successful, returns the iterator whose *n* `next` calls deserialize values from the `input`,
103+
/// while further calls yield [`None`].
104+
#[inline] fn read_array_iter<R: std::io::Read + ?Sized>(input: &mut R) -> std::io::Result<ReadNIter<'_, T, Self, R>> {
105+
let n = VByte::read(input)?;
106+
Ok(Self::read_n_iter(input, n))
107+
}
65108
}
66109

67110
impl<S> Serializer<usize> for S where S: Serializer<u64> {

cseq/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ keywords = [ "compression", "sequence", "succinct", "elias-fano", "wavelet-matri
1515
[dependencies]
1616
bitm = { version="0.4", path="../bitm" }
1717
dyn_size_of = { version="0.4", path="../dyn_size_of" }
18-
binout = { version="0.2", path="../binout" }
18+
binout = { version="0.3", path="../binout" }

cseq/src/wavelet_matrix.rs

+33-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
//! Wavelet Matrix representation of symbol sequence.
22
3-
use std::{iter::FusedIterator, io};
3+
use std::{io, iter::FusedIterator, ops::{Deref, DerefMut}};
44

55
use binout::{AsIs, Serializer};
66
use bitm::{BitAccess, BitVec, RankSelect101111, CombinedSampling, Rank, Select, Select0, SelectForRank101111, Select0ForRank101111, bits_to_store, ceiling_div};
@@ -16,24 +16,24 @@ use dyn_size_of::GetSize;
1616
/// - `upper_index` is index of `upper_bit`,
1717
/// - `lower_zero_index` is index of `lower_bits` to insert next item with 0 msb,
1818
/// - `lower_one_index` is index of `lower_bits` to insert next item with 1 msb,
19-
struct LevelBuilder {
20-
upper_bit: Box<[u64]>,
19+
struct LevelBuilder<BV> {
20+
upper_bit: BV,
2121
upper_index: usize,
22-
lower_bits: Box<[u64]>,
22+
lower_bits: BV,
2323
lower_zero_index: usize,
2424
lower_one_index: usize,
2525
upper_bit_mask: u64,
2626
bits_per_item: u8
2727
}
2828

29-
impl LevelBuilder {
29+
impl<BV: DerefMut<Target = [u64]> + BitVec> LevelBuilder<BV> {
3030
/// Construct level builder for given level `total_len` in bits, `number_of_zeros` among the most significant bits
3131
/// and index of most significant bit (`index_of_bit_to_extract`).
3232
fn new(number_of_zeros: usize, total_len: usize, index_of_bit_to_extract: u8) -> Self {
3333
Self {
34-
upper_bit: Box::with_zeroed_bits(total_len + 1), // we add one bit to ensure that rank(len) will work
34+
upper_bit: BV::with_zeroed_bits(total_len + 1), // we add one bit to ensure that rank(len) will work
3535
upper_index: 0,
36-
lower_bits: Box::with_zeroed_bits(total_len * index_of_bit_to_extract as usize + 1), // we add one bit to ensure that rank(len) will work
36+
lower_bits: BV::with_zeroed_bits(total_len * index_of_bit_to_extract as usize + 1), // we add one bit to ensure that rank(len) will work
3737
lower_zero_index: 0,
3838
lower_one_index: number_of_zeros * index_of_bit_to_extract as usize,
3939
upper_bit_mask: 1<<index_of_bit_to_extract,
@@ -52,36 +52,36 @@ impl LevelBuilder {
5252
}
5353

5454
/// Level of the wavelet matrix.
55-
struct Level<S = CombinedSampling> {
55+
struct Level<S = CombinedSampling, BV = Box<[u64]>> {
5656
/// Level content as bit vector with support for rank and select queries.
57-
content: RankSelect101111::<S, S>,
57+
content: RankSelect101111::<S, S, BV>,
5858

5959
/// Number of zero bits in content.
6060
number_of_zeros: usize
6161
}
6262

63-
impl<S> GetSize for Level<S> where RankSelect101111<S, S>: GetSize {
63+
impl<S, BV> GetSize for Level<S, BV> where RankSelect101111<S, S, BV>: GetSize {
6464
fn size_bytes_dyn(&self) -> usize { self.content.size_bytes_dyn() }
6565
const USES_DYN_MEM: bool = true;
6666
}
6767

68-
impl<S> Level<S> where RankSelect101111<S, S>: From<Box<[u64]>> {
68+
impl<S, BV> Level<S, BV> where RankSelect101111<S, S, BV>: From<BV> {
6969
/// Constructs a level with the given `content` that contains the given number of zero bits.
70-
fn new(content: Box::<[u64]>, number_of_zeros: usize) -> Self {
70+
#[inline] fn new(content: BV, number_of_zeros: usize) -> Self {
7171
//let (bits, number_of_ones) = ArrayWithRank::build(level);
7272
//Self { bits, zeros: level_len - number_of_ones }
7373
Self { content: content.into(), number_of_zeros }
7474
}
7575
}
7676

77-
impl<S> Level<S> where S: SelectForRank101111 {
78-
fn try_select(&self, rank: usize, len: usize) -> Option<usize> {
77+
impl<S, BV> Level<S, BV> where S: SelectForRank101111, BV: Deref<Target = [u64]> {
78+
#[inline] fn try_select(&self, rank: usize, len: usize) -> Option<usize> {
7979
self.content.try_select(rank).filter(|i| *i < len)
8080
}
8181
}
8282

83-
impl<S> Level<S> where S: Select0ForRank101111 {
84-
fn try_select0(&self, rank: usize, len: usize) -> Option<usize> {
83+
impl<S, BV> Level<S, BV> where S: Select0ForRank101111, BV: Deref<Target = [u64]> {
84+
#[inline] fn try_select0(&self, rank: usize, len: usize) -> Option<usize> {
8585
self.content.try_select0(rank).filter(|i| *i < len)
8686
}
8787
}
@@ -93,7 +93,7 @@ impl<S> Level<S> where S: Select0ForRank101111 {
9393
/// - *select* - see [`Self::select`],
9494
/// - *rank* - see [`Self::rank`].
9595
///
96-
/// By default [`bitm::CombinedSampling`] is used as a select strategy for internal bit vectors
96+
/// By default [`bitm::CombinedSampling`] is used as a select strategy `S` for internal bit vectors
9797
/// (see [`bitm::RankSelect101111`]), but this can be changed to [`bitm::BinaryRankSearch`]
9898
/// to save a bit of space (about 0.78%) at the cost of slower *select* queries.
9999
///
@@ -107,12 +107,12 @@ impl<S> Level<S> where S: Select0ForRank101111 {
107107
/// Additionally, our implementation draws some ideas (like elimination of recursion)
108108
/// from the Go implementation by Daisuke Okanohara,
109109
/// available at <https://github.com/hillbig/waveletTree/>.
110-
pub struct Sequence<S = CombinedSampling> {
111-
levels: Box<[Level<S>]>,
110+
pub struct Sequence<S = CombinedSampling, BV = Box<[u64]>> {
111+
levels: Box<[Level<S, BV>]>,
112112
len: usize
113113
}
114114

115-
impl<S> Sequence<S> {
115+
impl<S, BV> Sequence<S, BV> {
116116
/// Returns number of stored items.
117117
#[inline] pub fn len(&self) -> usize { self.len }
118118

@@ -151,7 +151,7 @@ impl Sequence<CombinedSampling> {
151151
}
152152
}
153153

154-
impl<S> Sequence<S> where S: SelectForRank101111+Select0ForRank101111 {
154+
impl<S, BV> Sequence<S, BV> where S: SelectForRank101111+Select0ForRank101111, BV: BitVec+DerefMut<Target = [u64]> {
155155

156156
/// Constructs [`Sequence`] with `content_len` `bits_per_item`-bit
157157
/// items exposed by iterator returned by `content` function,
@@ -162,7 +162,7 @@ impl<S> Sequence<S> where S: SelectForRank101111+Select0ForRank101111 {
162162
assert!(bits_per_item > 0 && bits_per_item <= 63);
163163
let mut levels = Vec::with_capacity(bits_per_item as usize);
164164
if bits_per_item == 1 {
165-
let mut level = Box::with_zeroed_bits(content_len+1);
165+
let mut level = BV::with_zeroed_bits(content_len+1);
166166
for (i, e) in content().into_iter().enumerate() {
167167
level.init_bit(i, e != 0);
168168
}
@@ -223,6 +223,9 @@ impl<S> Sequence<S> where S: SelectForRank101111+Select0ForRank101111 {
223223
|| { (0..content_len).map(|index| content.get_fragment(index, bits_per_item)) },
224224
content_len, bits_per_item)
225225
}
226+
}
227+
228+
impl<S, BV> Sequence<S, BV> where S: SelectForRank101111+Select0ForRank101111, BV: BitVec+Deref<Target = [u64]> {
226229

227230
/// Returns an item with given `index`. The result is undefined if `index` is out of bounds.
228231
pub unsafe fn get_unchecked(&self, mut index: usize) -> u64 {
@@ -319,6 +322,9 @@ impl<S> Sequence<S> where S: SelectForRank101111+Select0ForRank101111 {
319322
pub fn iter(&self) -> impl Iterator<Item = u64> + DoubleEndedIterator + FusedIterator + '_ {
320323
(0..self.len()).map(|i| unsafe { self.get_unchecked(i) })
321324
}
325+
}
326+
327+
impl<S, BV> Sequence<S, BV> where S: SelectForRank101111+Select0ForRank101111, BV: BitVec+Deref<Target = [u64]>+FromIterator<u64> {
322328

323329
/// Reads `self` from the `input`.
324330
///
@@ -329,19 +335,20 @@ impl<S> Sequence<S> where S: SelectForRank101111+Select0ForRank101111 {
329335
let mut levels = Vec::with_capacity(bits_per_item as usize);
330336
for _ in 0..bits_per_item {
331337
let number_of_zeros = AsIs::read(input)?;
332-
let content = AsIs::read_n(input, ceiling_div(len+1, 64))?;
333-
levels.push(Level::<S>::new(content, number_of_zeros))
338+
//let content = AsIs::read_n(input, ceiling_div(len+1, 64))?;
339+
let content = <AsIs as Serializer<u64>>::read_n_iter(input, ceiling_div(len+1, 64)).collect::<io::Result::<BV>>()?;
340+
levels.push(Level::<S, BV>::new(content, number_of_zeros))
334341
}
335342
Ok(Self { levels: levels.into_boxed_slice(), len })
336343
}
337344
}
338345

339-
impl<S> GetSize for Sequence<S> where RankSelect101111<S, S>: GetSize {
346+
impl<S, BV> GetSize for Sequence<S, BV> where RankSelect101111<S, S, BV>: GetSize {
340347
fn size_bytes_dyn(&self) -> usize { self.levels.size_bytes_dyn() }
341348
const USES_DYN_MEM: bool = true;
342349
}
343350

344-
impl<S> Sequence<S> {
351+
impl<S, BV> Sequence<S, BV> where BV: Deref<Target = [u64]> {
345352
/// Returns number of bytes which `write` will write.
346353
pub fn write_bytes(&self) -> usize {
347354
AsIs::size(self.len) +

csf/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ keywords = [ "perfect", "hashing", "mphf", "map", "dictionary" ]
1616
bitm = { version="0.4", path="../bitm" }
1717
ph = { version="0.8", path="../ph", features = ["wyhash"] }
1818
minimum_redundancy = { version="0.3", path="../minimum_redundancy" }
19-
binout = { version="0.2", path="../binout" }
19+
binout = { version=">=0.2", path="../binout" }
2020
dyn_size_of = { version="0.4", path="../dyn_size_of" }
2121
fsum = { version="0.1", path="../fsum" }
2222

minimum_redundancy/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ keywords = [ "compression", "decompression", "huffman", "huffman_coding", "canon
1414

1515
[dependencies]
1616
dyn_size_of = { version="0.4", path="../dyn_size_of" }
17-
binout = { version="0.2", path="../binout" }
17+
binout = { version=">=0.2", path="../binout" }
1818
fsum = { version="0.1", path="../fsum" }
1919
co_sort = "0.2.*"
2020

ph/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ sip13 = [] # if enabled, makes available BuildSip13 that uses Sip13 from compil
1919
[dependencies]
2020
bitm = { version="0.4", path="../bitm" }
2121
dyn_size_of = { version="0.4", path="../dyn_size_of" }
22-
binout = { version="0.2", path="../binout" }
22+
binout = { version=">=0.2", path="../binout" }
2323
rayon = "1.5"
2424
wyhash = { version="0.5", optional=true }
2525
#wyhash_git = { git = "https://github.com/eldruin/wyhash-rs", package = "wyhash", optional=true } # does not work

0 commit comments

Comments
 (0)