From 579bf959dc99c248079c0de336518aa5590e0c0c Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 2 Jun 2026 15:39:53 +0100 Subject: [PATCH] Optimize Compressor::rebuild_from Signed-off-by: Adam Gutglick --- benches/micro.rs | 63 +++++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 16 ++++-------- 2 files changed, 67 insertions(+), 12 deletions(-) diff --git a/benches/micro.rs b/benches/micro.rs index e689614..a0be74e 100644 --- a/benches/micro.rs +++ b/benches/micro.rs @@ -2,12 +2,72 @@ use criterion::{Criterion, Throughput, criterion_group, criterion_main}; -use fsst::{CompressorBuilder, Symbol}; +use fsst::{Compressor, CompressorBuilder, Symbol}; fn one_megabyte(seed: &[u8]) -> Vec<u8> { seed.iter().copied().cycle().take(1024 * 1024).collect() } +fn symbol(bytes: &[u8]) -> Symbol { + let mut padded = [0; 8]; + padded[..bytes.len()].copy_from_slice(bytes); + Symbol::from_slice(&padded) +} + +fn bench_rebuild_from(c: &mut Criterion) { + let empty_symbols = Vec::new(); + let empty_lengths = Vec::new(); + + let one_byte_symbols = vec![Symbol::from_u8(b'a'), Symbol::from_u8(b'b')]; + let one_byte_lengths = vec![1, 1]; + + let two_byte_symbols = vec![symbol(b"ab"), symbol(b"cd"), symbol(b"ef")]; + let two_byte_lengths = vec![2, 2, 2]; + + let mixed_symbols = vec![ + symbol(b"xy"), + symbol(b"ab"), + symbol(b"ab0"), + symbol(b"cd01"), + symbol(b"ef012"), + symbol(b"gh0123"), + symbol(b"ij01234"), + symbol(b"kl012345"), + Symbol::from_u8(b'x'), + Symbol::from_u8(b'y'), + ]; + let mixed_lengths = vec![2, 2, 3, 4, 5, 6, 7, 8, 1, 1]; + + let training_corpus = one_megabyte(b"the quick brown fox jumps over the lazy dog; "); + let training_values = vec![training_corpus.as_slice()]; + let trained = Compressor::train(&training_values); + let trained_symbols = trained.symbol_table().to_vec(); + let trained_lengths = trained.symbol_lengths().to_vec(); + + let mut group = c.benchmark_group("rebuild-from"); + + macro_rules! 
bench_case { + ($name:literal, $symbols:expr, $lengths:expr) => { + group.bench_function($name, |b| { + b.iter_with_large_drop(|| { + Compressor::rebuild_from( + std::hint::black_box($symbols.as_slice()), + std::hint::black_box($lengths.as_slice()), + ) + }) + }); + }; + } + + bench_case!("empty-table", empty_symbols, empty_lengths); + bench_case!("one-byte-table", one_byte_symbols, one_byte_lengths); + bench_case!("two-byte-table", two_byte_symbols, two_byte_lengths); + bench_case!("mixed-length-table", mixed_symbols, mixed_lengths); + bench_case!("trained-table", trained_symbols, trained_lengths); + + group.finish(); +} + fn bench_decompress_short(c: &mut Criterion) { let mut compressor = CompressorBuilder::new(); assert!(compressor.insert(Symbol::from_slice(b"abcdefgh"), 8)); @@ -228,6 +288,7 @@ fn bench_compress(c: &mut Criterion) { criterion_group!( bench_micro, bench_compress, + bench_rebuild_from, bench_decompress_short, bench_decompress_escape_heavy ); diff --git a/src/lib.rs b/src/lib.rs index 8b6d219..8a3fe94 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -829,7 +829,7 @@ impl Compressor { let lengths = symbol_lens.to_vec(); let mut lossy_pht = LossyPHT::new(); - let mut codes_one_byte = vec![Code::UNUSED; 256]; + let mut codes_one_byte = [Code::UNUSED; 256]; // Insert all of the one byte symbols first. for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { @@ -838,8 +838,10 @@ impl Compressor { } } - // Initialize the codes_two_byte table to be all escapes - let mut codes_two_byte = vec![Code::UNUSED; 65_536]; + let mut codes_two_byte = Vec::with_capacity(65_536); + for _ in 0..256 { + codes_two_byte.extend_from_slice(&codes_one_byte); + } // Insert the two byte symbols, possibly overwriting slots for one-byte symbols and escapes. 
for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { @@ -857,14 +859,6 @@ impl Compressor { } } - // Build the finished codes_two_byte table, subbing in unused positions with the - // codes_one_byte value similar to what we do in CompressBuilder::finalize. - for (symbol, code) in codes_two_byte.iter_mut().enumerate() { - if *code == Code::UNUSED { - *code = codes_one_byte[symbol & 0xFF]; - } - } - // Find the position of the first 2-byte code that has a suffix later in the table let mut has_suffix_code = 0u8; for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() {