From 03faec492dd38d05b1b9e95ff2a5e165484cde13 Mon Sep 17 00:00:00 2001 From: oritwoen <18102267+oritwoen@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:11:36 +0100 Subject: [PATCH 1/3] fix(source): stream wordlist file instead of loading into memory (#71) WordlistSource used to read the entire file into Vec before processing. For large wordlists (multi-GB), this consumed memory proportional to file size regardless of batch processing needs. Switched to streaming with read_line() in chunks of 10k lines, keeping Rayon parallelism within each chunk. Progress bar now tracks bytes read from the file instead of line count. Closes #71 --- src/source/wordlist.rs | 221 ++++++++++++++++++++++++++++++++--------- 1 file changed, 172 insertions(+), 49 deletions(-) diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs index 598cf08..fadf921 100644 --- a/src/source/wordlist.rs +++ b/src/source/wordlist.rs @@ -1,11 +1,13 @@ //! Wordlist source - generate keys from file of passphrases. +//! +//! Streams the file in chunks to avoid loading entire wordlists into memory. use anyhow::Result; use indicatif::ProgressBar; use rayon::prelude::*; -use std::fs::File; +use std::fs; use std::io::{BufRead, BufReader}; -use std::path::Path; +use std::path::{Path, PathBuf}; use super::{ProcessStats, Source}; use crate::derive::KeyDeriver; @@ -13,28 +15,25 @@ use crate::matcher::Matcher; use crate::output::Output; use crate::transform::{Input, Transform}; +const CHUNK_SIZE: usize = 10_000; +const BATCH_SIZE: usize = 1000; + /// Generate keys from a wordlist file pub struct WordlistSource { - lines: Vec, + path: PathBuf, } impl WordlistSource { pub fn from_file(path: &Path) -> Result { - let file = File::open(path)?; - let reader = BufReader::new(file); - let lines: Vec = reader - .lines() - .filter_map(|line| match line { - Ok(s) => { - let trimmed = s.trim().to_string(); - if trimmed.is_empty() { None } else { Some(Ok(trimmed)) } - } - Err(e) if e.kind() == std::io::ErrorKind::InvalidData => None, - Err(e) => Some(Err(e)), - }) - .collect::>>()?; - - Ok(Self { lines }) + if !path.exists() { + anyhow::bail!("Wordlist file not found: {}", path.display()); + } + if !path.is_file() { + anyhow::bail!("Not a file: {}", path.display()); + } + Ok(Self { + path: path.to_path_buf(), + }) } } @@ -46,51 +45,175 @@ impl Source for WordlistSource { matcher: Option<&Matcher>, output: &dyn Output, ) -> Result { - let pb = ProgressBar::new(self.lines.len() as u64); + let file_size = fs::metadata(&self.path)?.len(); + let pb = ProgressBar::new(file_size); pb.set_style(crate::default_progress_style()); - let stats = std::sync::atomic::AtomicU64::new(0); let matches = std::sync::atomic::AtomicU64::new(0); + let mut inputs_processed = 0u64; + let mut bytes_consumed = 0u64; + + let file = std::fs::File::open(&self.path)?; + let mut reader = BufReader::new(file); + let mut chunk = Vec::with_capacity(CHUNK_SIZE); + let mut line_buf = String::new(); + + loop { + line_buf.clear(); + let bytes_read = match reader.read_line(&mut line_buf) { + Ok(0) => break, + Ok(n) => n as u64, + Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { + // Skip invalid UTF-8 lines; estimate 1 byte consumed to keep progress moving + bytes_consumed += 1; + continue; + } + Err(e) => return Err(e.into()), + }; - self.lines.par_chunks(1000).for_each(|chunk| { - let inputs: Vec = chunk - .iter() - .map(|s| Input::from_string(s.clone())) - .collect(); - let mut buffer = Vec::with_capacity(inputs.len() * 2); - - for transform in transforms { - buffer.clear(); - transform.apply_batch(&inputs, &mut buffer); - - for (source, key) in &buffer { - let derived = deriver.derive(key); - - if let Some(m) = matcher { - if let Some(match_info) = m.check(&derived) { - output - .hit(source, transform.name(), &derived, &match_info) - .ok(); - matches.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - } - } else { - output.key(source, transform.name(), &derived).ok(); - } + bytes_consumed += bytes_read; - stats.fetch_add(1, std::sync::atomic::Ordering::Relaxed); - } + let trimmed = line_buf.trim().to_string(); + if trimmed.is_empty() { + continue; } - pb.inc(chunk.len() as u64); - }); + chunk.push(trimmed); + inputs_processed += 1; + + if chunk.len() >= CHUNK_SIZE { + process_chunk( + &chunk, transforms, deriver, matcher, output, &stats, &matches, + ); + pb.set_position(bytes_consumed); + chunk.clear(); + } + } + + if !chunk.is_empty() { + process_chunk( + &chunk, transforms, deriver, matcher, output, &stats, &matches, + ); + } pb.finish_and_clear(); Ok(ProcessStats { - inputs_processed: self.lines.len() as u64, + inputs_processed, keys_generated: stats.load(std::sync::atomic::Ordering::Relaxed), matches_found: matches.load(std::sync::atomic::Ordering::Relaxed), }) } } + +fn process_chunk( + lines: &[String], + transforms: &[Box], + deriver: &KeyDeriver, + matcher: Option<&Matcher>, + output: &dyn Output, + stats: &std::sync::atomic::AtomicU64, + matches: &std::sync::atomic::AtomicU64, +) { + lines.par_chunks(BATCH_SIZE).for_each(|batch| { + let inputs: Vec = batch + .iter() + .map(|s| Input::from_string(s.clone())) + .collect(); + let mut buffer = Vec::with_capacity(inputs.len() * 2); + + for transform in transforms { + buffer.clear(); + transform.apply_batch(&inputs, &mut buffer); + + for (source, key) in &buffer { + let derived = deriver.derive(key); + + if let Some(m) = matcher { + if let Some(match_info) = m.check(&derived) { + output + .hit(source, transform.name(), &derived, &match_info) + .ok(); + matches.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + } + } else { + output.key(source, transform.name(), &derived).ok(); + } + + stats.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + } + } + }); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::derive::KeyDeriver; + use crate::output::ConsoleOutput; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn from_file_not_found() { + let result = WordlistSource::from_file(Path::new("/nonexistent/path/file.txt")); + assert!(result.is_err()); + } + + #[test] + fn from_file_not_a_file() { + let dir = tempfile::tempdir().unwrap(); + let result = WordlistSource::from_file(dir.path()); + assert!(result.is_err()); + } + + #[test] + fn process_empty_file() { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(b"").unwrap(); + + let source = WordlistSource::from_file(file.path()).unwrap(); + let deriver = KeyDeriver::new(); + let output = ConsoleOutput::new(); + let transforms: Vec> = Vec::new(); + + let stats = source + .process(&transforms, &deriver, None, &output) + .unwrap(); + assert_eq!(stats.inputs_processed, 0); + } + + #[test] + fn process_skips_blank_lines() { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(b"hello\n\n \nworld\n").unwrap(); + + let source = WordlistSource::from_file(file.path()).unwrap(); + let deriver = KeyDeriver::new(); + let output = ConsoleOutput::new(); + let transforms: Vec> = Vec::new(); + + let stats = source + .process(&transforms, &deriver, None, &output) + .unwrap(); + assert_eq!(stats.inputs_processed, 2); + } + + #[test] + fn process_skips_invalid_utf8() { + let mut file = NamedTempFile::new().unwrap(); + file.write_all(b"valid\n\xff\xfe\ninvalid bytes\ntest\n") + .unwrap(); + + let source = WordlistSource::from_file(file.path()).unwrap(); + let deriver = KeyDeriver::new(); + let output = ConsoleOutput::new(); + let transforms: Vec> = Vec::new(); + + let stats = source + .process(&transforms, &deriver, None, &output) + .unwrap(); + assert_eq!(stats.inputs_processed, 3); + } +} From 311fe4fe75c73484a520ed4f2d4a86e7dd12a726 Mon Sep 17 00:00:00 2001 From: oritwoen <18102267+oritwoen@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:20:00 +0100 Subject: [PATCH 2/3] fix(source): address review feedback on wordlist streaming - Update progress bar after final partial chunk - Use stream_position() for accurate byte tracking on invalid UTF-8 - Bump CHUNK_SIZE to 100k for better Rayon utilization on many-core machines --- src/source/wordlist.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs index fadf921..e4c53e6 100644 --- a/src/source/wordlist.rs +++ b/src/source/wordlist.rs @@ -6,7 +6,7 @@ use anyhow::Result; use indicatif::ProgressBar; use rayon::prelude::*; use std::fs; -use std::io::{BufRead, BufReader}; +use std::io::{BufRead, BufReader, Seek}; use std::path::{Path, PathBuf}; use super::{ProcessStats, Source}; @@ -15,7 +15,7 @@ use crate::matcher::Matcher; use crate::output::Output; use crate::transform::{Input, Transform}; -const CHUNK_SIZE: usize = 10_000; +const CHUNK_SIZE: usize = 100_000; const BATCH_SIZE: usize = 1000; /// Generate keys from a wordlist file @@ -65,8 +65,8 @@ impl Source for WordlistSource { Ok(0) => break, Ok(n) => n as u64, Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { - // Skip invalid UTF-8 lines; estimate 1 byte consumed to keep progress moving - bytes_consumed += 1; + // read_line() already consumed the bytes, sync position from reader + bytes_consumed = reader.stream_position().unwrap_or(bytes_consumed); continue; } Err(e) => return Err(e.into()), @@ -95,6 +95,7 @@ impl Source for WordlistSource { process_chunk( &chunk, transforms, deriver, matcher, output, &stats, &matches, ); + pb.set_position(bytes_consumed); } pb.finish_and_clear(); From 5c5f23244536c877a8002bd3f21edf3287ae5cbd Mon Sep 17 00:00:00 2001 From: oritwoen <18102267+oritwoen@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:27:44 +0100 Subject: [PATCH 3/3] fix(source): update progress bar on every line read, not just chunk boundaries --- src/source/wordlist.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs index e4c53e6..939e471 100644 --- a/src/source/wordlist.rs +++ b/src/source/wordlist.rs @@ -67,12 +67,14 @@ impl Source for WordlistSource { Err(e) if e.kind() == std::io::ErrorKind::InvalidData => { // read_line() already consumed the bytes, sync position from reader bytes_consumed = reader.stream_position().unwrap_or(bytes_consumed); + pb.set_position(bytes_consumed); continue; } Err(e) => return Err(e.into()), }; bytes_consumed += bytes_read; + pb.set_position(bytes_consumed); let trimmed = line_buf.trim().to_string(); if trimmed.is_empty() { @@ -86,7 +88,6 @@ impl Source for WordlistSource { process_chunk( &chunk, transforms, deriver, matcher, output, &stats, &matches, ); - pb.set_position(bytes_consumed); chunk.clear(); } } @@ -95,7 +96,6 @@ impl Source for WordlistSource { process_chunk( &chunk, transforms, deriver, matcher, output, &stats, &matches, ); - pb.set_position(bytes_consumed); } pb.finish_and_clear();