diff --git a/pagefind/src/index/mod.rs b/pagefind/src/index/mod.rs
index a62f67cb..32d83989 100644
--- a/pagefind/src/index/mod.rs
+++ b/pagefind/src/index/mod.rs
@@ -91,7 +91,7 @@ pub async fn build_indexes(
     sorts.dedup();
 
     // Determine the best sorting parser that fits all available values for each given key
-    let mut sort_types: HashMap<String, SortType> = HashMap::new();
+    let mut sort_types: BTreeMap<String, SortType> = BTreeMap::new();
     for sort in sorts.iter() {
         let mut sort_values = pages.iter().flat_map(|page| page.sort.get(sort));
         sort_types.insert(
@@ -402,18 +402,21 @@ pub async fn build_indexes(
             word_count: *word_count as u32,
         }));
 
-    // TODO: Change filter indexes to BTree to give them a stable hash.
     // Encode filter indexes in parallel
-    // Convert hashbrown HashMap to Vec for rayon compatibility
-    let filter_map_vec: Vec<_> = filter_map.into_iter().collect();
+    // Accumulate into HashMap for O(1) inserts, then sort once before encoding
+    // to produce a stable hash without paying BTreeMap's per-insert overhead.
+    let mut filter_map_vec: Vec<_> = filter_map.into_iter().collect();
+    filter_map_vec.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
     let encoded_filters: Vec<(String, Vec<u8>, String)> = filter_map_vec
         .into_par_iter()
         .map(|(filter, values)| {
+            let mut sorted_values: Vec<_> = values.into_iter().collect();
+            sorted_values.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));
             let mut filter_index: Vec<u8> = Vec::new();
             let _ = minicbor::encode::<FilterIndex, &mut Vec<u8>>(
                 FilterIndex {
                     filter: filter.clone(),
-                    values: values
+                    values: sorted_values
                         .into_iter()
                         .map(|(value, pages)| PackedValue { value, pages })
                         .collect(),
@@ -586,7 +589,8 @@ fn positions_to_packed_page(mut positions: Vec<u32>, page_number: usiz
 }
 
 fn chunk_index(word_map: HashMap<String, PackedWord>, chunk_size: usize) -> Vec<Vec<PackedWord>> {
-    // TODO: Use ye olde BTree
+    // Accumulate into HashMap for O(1) inserts, then sort once before encoding
+    // to produce a stable hash without paying BTreeMap's per-insert overhead.
     let mut words = word_map
         .into_iter()
         .map(|(_, w)| w)
diff --git a/pagefind/src/output/entry.rs b/pagefind/src/output/entry.rs
index 2765416a..d5b3f3e1 100644
--- a/pagefind/src/output/entry.rs
+++ b/pagefind/src/output/entry.rs
@@ -1,11 +1,11 @@
-use hashbrown::HashMap;
+use std::collections::BTreeMap;
 
 use serde::Serialize;
 
 #[derive(Serialize, Debug)]
 pub struct PagefindEntryMeta {
     pub version: &'static str,
-    pub languages: HashMap<String, PagefindEntryLanguage>,
+    pub languages: BTreeMap<String, PagefindEntryLanguage>,
     pub include_characters: Vec<String>,
 }
 
diff --git a/pagefind/src/output/mod.rs b/pagefind/src/output/mod.rs
index 4f836354..a21ccf3a 100644
--- a/pagefind/src/output/mod.rs
+++ b/pagefind/src/output/mod.rs
@@ -7,7 +7,7 @@ use crate::{SearchOptions, PAGEFIND_VERSION};
 use flate2::write::GzEncoder; // TODO: Replace flate2 with async-compression since we
 use flate2::Compression; //       require that crate for the input compression anyway.
 use futures::future::join_all;
-use hashbrown::HashMap;
+use std::collections::BTreeMap;
 use include_dir::{include_dir, Dir};
 use minifier::js::minify;
 use tokio::fs::{create_dir_all, File};
@@ -135,7 +135,7 @@ async fn write_common(
 
     let entry_meta = entry::PagefindEntryMeta {
         version: PAGEFIND_VERSION,
-        languages: HashMap::from_iter(language_indexes.into_iter().map(|i| {
+        languages: BTreeMap::from_iter(language_indexes.into_iter().map(|i| {
             (
                 i.language,
                 entry::PagefindEntryLanguage {