diff --git a/dev-hash/benches/hybrid.rs b/dev-hash/benches/hybrid.rs
new file mode 100644
index 00000000..e6670ae3
--- /dev/null
+++ b/dev-hash/benches/hybrid.rs
@@ -0,0 +1,69 @@
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use data_resource::ResourceId;
+use rand::prelude::*;
+use std::path::Path;
+
+use dev_hash::Hybrid;
+
+// Add files to benchmark here
+const FILE_PATHS: [&str; 2] =
+    ["../test-assets/lena.jpg", "../test-assets/test.pdf"];
+// Modify time limit here
+const BENCHMARK_TIME_LIMIT: std::time::Duration =
+    std::time::Duration::from_secs(20);
+
+/// Returns `size` bytes drawn from the thread-local random number generator.
+fn generate_random_data(size: usize) -> Vec<u8> {
+    let mut rng = rand::thread_rng();
+    (0..size).map(|_| rng.gen()).collect()
+}
+
+/// Benchmarks the performance of `Hybrid` resource ID creation
+/// from file paths and random data.
+///
+/// - Measures the time taken to create a resource ID from file paths.
+/// - Measures the time taken to create a resource ID from random data.
+fn bench_resource_id_creation(c: &mut Criterion) {
+    // Group results under a name specific to this hasher so the reports are
+    // not labelled as (or mixed with) BLAKE3 measurements.
+    let mut group = c.benchmark_group("hybrid_resource_id_creation");
+    group.measurement_time(BENCHMARK_TIME_LIMIT);
+
+    // Benchmarks for computing from file paths
+    for file_path in FILE_PATHS.iter() {
+        assert!(
+            Path::new(file_path).is_file(),
+            "The file: {} does not exist or is not a file",
+            file_path
+        );
+
+        let id = format!("compute_from_path:{}", file_path);
+        group.bench_function(id, move |b| {
+            b.iter(|| {
+                <Hybrid as ResourceId>::from_path(black_box(file_path))
+                    .expect("from_path returned an error")
+            });
+        });
+    }
+
+    // Benchmarks for computing from random data
+    let inputs = [("small", 1024), ("medium", 65536), ("large", 1048576)];
+
+    for (name, size) in inputs.iter() {
+        // Built outside the timed closure so only hashing is measured.
+        let input_data = generate_random_data(*size);
+
+        let id = format!("compute_from_bytes:{}", name);
+        group.bench_function(id, move |b| {
+            b.iter(|| {
+                <Hybrid as ResourceId>::from_bytes(black_box(&input_data))
+                    .expect("from_bytes returned an error")
+            });
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_resource_id_creation);
+criterion_main!(benches);
diff --git a/dev-hash/src/hybrid.rs b/dev-hash/src/hybrid.rs
new file mode 100644
index 00000000..7524de02
--- /dev/null
+++ b/dev-hash/src/hybrid.rs
@@ -0,0 +1,176 @@
+use std::{
+    fs,
+    io::{BufRead, BufReader},
+    path::Path,
+};
+
+use blake3::Hasher as Blake3Hasher;
+use core::{fmt::Display, str::FromStr};
+use hex::encode;
+use serde::{Deserialize, Serialize};
+
+use data_error::Result;
+use data_resource::ResourceId;
+
+use std::hash::{Hash, Hasher};
+
+/// Offset basis of the 64-bit FNV hash.
+const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
+/// Prime of the 64-bit FNV hash.
+const FNV_PRIME: u64 = 0x100000001b3;
+
+/// Computes the 64-bit FNV-1a hash of `bytes`: each byte is XORed into the
+/// state, which is then multiplied (wrapping) by the FNV prime.
+fn fnv_hash_bytes(bytes: &[u8]) -> u64 {
+    bytes.iter().fold(FNV_OFFSET_BASIS, |hash, &byte| {
+        (hash ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
+    })
+}
+
+/// Hashes `path` itself (not the contents of the file it names).
+///
+/// The path is fed through the standard library's `DefaultHasher` and the
+/// resulting `u64`, as little-endian bytes, is then FNV-hashed.
+///
+/// NOTE(review): the algorithm behind `DefaultHasher` is documented as
+/// unspecified and may change between Rust releases; confirm that values
+/// derived from it are never expected to stay stable once persisted.
+fn fnv_hash_path<P: AsRef<Path>>(path: P) -> u64 {
+    let mut hasher = std::collections::hash_map::DefaultHasher::new();
+    path.as_ref().hash(&mut hasher);
+    let hash = hasher.finish();
+    fnv_hash_bytes(hash.to_le_bytes().as_slice())
+}
+
+/// Resource identifier that uses BLAKE3 below `THRESHOLD` bytes and a
+/// cheaper size-plus-FNV scheme at or above it.
+///
+/// The wrapped string is either a hex-encoded BLAKE3 digest or has the
+/// form `<size>_<hash>`.
+#[derive(
+    Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash, Serialize, Deserialize,
+)]
+pub struct Hybrid(pub String);
+
+impl FromStr for Hybrid {
+    type Err = hex::FromHexError;
+
+    /// Wraps `s` unchanged; no validation is done, so this never fails.
+    fn from_str(s: &str) -> core::result::Result<Self, Self::Err> {
+        Ok(Hybrid(s.to_string()))
+    }
+}
+
+impl Display for Hybrid {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+/// Size in bytes (1 GiB) at or above which BLAKE3 is no longer used.
+const THRESHOLD: u64 = 1024 * 1024 * 1024;
+
+/// Capacity of the buffer used to stream a file into the BLAKE3 hasher.
+const READ_BUFFER_SIZE: usize = 64 * 1024;
+
+impl ResourceId for Hybrid {
+    /// Computes the identifier of the file at `file_path`.
+    ///
+    /// Files smaller than `THRESHOLD` are streamed through BLAKE3 and
+    /// identified by the hex-encoded digest. Larger files are identified
+    /// by `<size>_<hash>`, where the hash comes from `fnv_hash_path`, i.e.
+    /// from the path only; their contents are not read.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the file's metadata cannot be read, or if the file cannot
+    /// be opened or read while hashing.
+    fn from_path<P: AsRef<Path>>(file_path: P) -> Result<Self> {
+        let size = fs::metadata(file_path.as_ref())?.len();
+
+        if size < THRESHOLD {
+            // Use Blake3 for small files
+            log::debug!(
+                "Computing BLAKE3 hash for file: {:?}",
+                file_path.as_ref()
+            );
+
+            let file = fs::File::open(file_path)?;
+            let mut reader = BufReader::with_capacity(READ_BUFFER_SIZE, file);
+            let mut hasher = Blake3Hasher::new();
+            // Hash in fixed-size chunks. Splitting on b'\n' would buffer a
+            // whole newline-free (e.g. binary) file in memory first.
+            loop {
+                let chunk = reader.fill_buf()?;
+                if chunk.is_empty() {
+                    break;
+                }
+                hasher.update(chunk);
+                let consumed = chunk.len();
+                reader.consume(consumed);
+            }
+            let hash = hasher.finalize();
+            Ok(Hybrid(encode(hash.as_bytes())))
+        } else {
+            // Use fnv hashing for large files
+            log::debug!(
+                "Computing simple hash for file: {:?}",
+                file_path.as_ref()
+            );
+
+            let hash = fnv_hash_path(file_path);
+            Ok(Hybrid(format!("{}_{}", size, hash)))
+        }
+    }
+
+    /// Computes the identifier of an in-memory byte slice.
+    ///
+    /// Slices shorter than `THRESHOLD` are identified by their hex-encoded
+    /// BLAKE3 digest, longer ones by `<len>_<FNV hash of the bytes>`.
+    ///
+    /// NOTE(review): at or above `THRESHOLD` this hashes the contents
+    /// while `from_path` hashes the path, so the two give different
+    /// identifiers for the same large file; confirm this is intended.
+    fn from_bytes(bytes: &[u8]) -> Result<Self> {
+        let size = bytes.len() as u64;
+        if size < THRESHOLD {
+            // Use Blake3 for small inputs
+            log::debug!("Computing BLAKE3 hash for bytes");
+
+            let mut hasher = Blake3Hasher::new();
+            hasher.update(bytes);
+            let hash = hasher.finalize();
+            Ok(Hybrid(encode(hash.as_bytes())))
+        } else {
+            // Use fnv hashing for large inputs
+            log::debug!("Computing simple hash for bytes");
+
+            let hash = fnv_hash_bytes(bytes);
+            Ok(Hybrid(format!("{}_{}", size, hash)))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanity_check() {
+        let file_path = Path::new("../test-assets/lena.jpg");
+        let id = Hybrid::from_path(file_path)
+            .expect("Failed to compute resource identifier");
+        assert_eq!(
+            id,
+            Hybrid("172b4bf148e858b13dde0fc6613413bcb7552e5c4e5c45195ac6c80f20eb5ff5".to_string())
+        );
+
+        let raw_bytes = fs::read(file_path).expect("Failed to read file");
+        let id = <Hybrid as ResourceId>::from_bytes(&raw_bytes)
+            .expect("Failed to compute resource identifier");
+        assert_eq!(
+            id,
+            Hybrid("172b4bf148e858b13dde0fc6613413bcb7552e5c4e5c45195ac6c80f20eb5ff5".to_string())
+        );
+    }
+}
diff --git a/dev-hash/src/lib.rs b/dev-hash/src/lib.rs
index 208c125c..77224601 100644
--- a/dev-hash/src/lib.rs
+++ b/dev-hash/src/lib.rs
@@ -1,5 +1,8 @@
 mod blake3;
 mod crc32;
 
+mod hybrid;
+
 pub use blake3::Blake3;
 pub use crc32::Crc32;
+pub use hybrid::Hybrid;