From 03faec492dd38d05b1b9e95ff2a5e165484cde13 Mon Sep 17 00:00:00 2001
From: oritwoen <18102267+oritwoen@users.noreply.github.com>
Date: Fri, 13 Mar 2026 14:11:36 +0100
Subject: [PATCH 1/3] fix(source): stream wordlist file instead of loading into
 memory (#71)

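WordlistSource used to read the entire file into a Vec<String> before
processing. For large wordlists (multi-GB), memory use was therefore
proportional to file size, even though the lines are only ever
processed in batches.

Switch to streaming: lines are read with read_line() and collected
into chunks of 10k lines, and each chunk is processed with Rayon in
batches of 1000 before the next chunk is read. As before, blank lines
and lines that are not valid UTF-8 are skipped. from_file() now only
checks that the path exists and is a file; the file itself is opened
in process(). The progress bar now tracks bytes read from the file
instead of line count.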

Closes #71
---
 src/source/wordlist.rs | 221 ++++++++++++++++++++++++++++++++---------
 1 file changed, 172 insertions(+), 49 deletions(-)
diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs
index 598cf08..fadf921 100644
--- a/src/source/wordlist.rs
+++ b/src/source/wordlist.rs
@@ -1,11 +1,13 @@
 //! Wordlist source - generate keys from file of passphrases.
+//!
+//! Streams the file in chunks to avoid loading entire wordlists into memory.
 
 use anyhow::Result;
 use indicatif::ProgressBar;
 use rayon::prelude::*;
-use std::fs::File;
+use std::fs;
 use std::io::{BufRead, BufReader};
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 use super::{ProcessStats, Source};
 use crate::derive::KeyDeriver;
@@ -13,28 +15,25 @@ use crate::matcher::Matcher;
 use crate::output::Output;
 use crate::transform::{Input, Transform};
 
+const CHUNK_SIZE: usize = 10_000;
+const BATCH_SIZE: usize = 1000;
+
 /// Generate keys from a wordlist file
 pub struct WordlistSource {
-    lines: Vec<String>,
+    path: PathBuf,
 }
 
 impl WordlistSource {
     pub fn from_file(path: &Path) -> Result<Self> {
-        let file = File::open(path)?;
-        let reader = BufReader::new(file);
-        let lines: Vec<String> = reader
-            .lines()
-            .filter_map(|line| match line {
-                Ok(s) => {
-                    let trimmed = s.trim().to_string();
-                    if trimmed.is_empty() { None } else { Some(Ok(trimmed)) }
-                }
-                Err(e) if e.kind() == std::io::ErrorKind::InvalidData => None,
-                Err(e) => Some(Err(e)),
-            })
-            .collect::<std::io::Result<Vec<_>>>()?;
-
-        Ok(Self { lines })
+        if !path.exists() {
+            anyhow::bail!("Wordlist file not found: {}", path.display());
+        }
+        if !path.is_file() {
+            anyhow::bail!("Not a file: {}", path.display());
+        }
+        Ok(Self {
+            path: path.to_path_buf(),
+        })
     }
 }
 
@@ -46,51 +45,175 @@ impl Source for WordlistSource {
         matcher: Option<&Matcher>,
         output: &dyn Output,
     ) -> Result<ProcessStats> {
-        let pb = ProgressBar::new(self.lines.len() as u64);
+        let file_size = fs::metadata(&self.path)?.len();
+        let pb = ProgressBar::new(file_size);
         pb.set_style(crate::default_progress_style());
 
-
         let stats = std::sync::atomic::AtomicU64::new(0);
         let matches = std::sync::atomic::AtomicU64::new(0);
+        let mut inputs_processed = 0u64;
+        let mut bytes_consumed = 0u64;
+
+        let file = std::fs::File::open(&self.path)?;
+        let mut reader = BufReader::new(file);
+        let mut chunk = Vec::with_capacity(CHUNK_SIZE);
+        let mut line_buf = String::new();
+
+        loop {
+            line_buf.clear();
+            let bytes_read = match reader.read_line(&mut line_buf) {
+                Ok(0) => break,
+                Ok(n) => n as u64,
+                Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
+                    // Skip invalid UTF-8 lines; estimate 1 byte consumed to keep progress moving
+                    bytes_consumed += 1;
+                    continue;
+                }
+                Err(e) => return Err(e.into()),
+            };
 
-        self.lines.par_chunks(1000).for_each(|chunk| {
-            let inputs: Vec<Input> = chunk
-                .iter()
-                .map(|s| Input::from_string(s.clone()))
-                .collect();
-            let mut buffer = Vec::with_capacity(inputs.len() * 2);
-
-            for transform in transforms {
-                buffer.clear();
-                transform.apply_batch(&inputs, &mut buffer);
-
-                for (source, key) in &buffer {
-                    let derived = deriver.derive(key);
-
-                    if let Some(m) = matcher {
-                        if let Some(match_info) = m.check(&derived) {
-                            output
-                                .hit(source, transform.name(), &derived, &match_info)
-                                .ok();
-                            matches.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-                        }
-                    } else {
-                        output.key(source, transform.name(), &derived).ok();
-                    }
+            bytes_consumed += bytes_read;
 
-                    stats.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
-                }
+            let trimmed = line_buf.trim().to_string();
+            if trimmed.is_empty() {
+                continue;
             }
 
-            pb.inc(chunk.len() as u64);
-        });
+            chunk.push(trimmed);
+            inputs_processed += 1;
+
+            if chunk.len() >= CHUNK_SIZE {
+                process_chunk(
+                    &chunk, transforms, deriver, matcher, output, &stats, &matches,
+                );
+                pb.set_position(bytes_consumed);
+                chunk.clear();
+            }
+        }
+
+        if !chunk.is_empty() {
+            process_chunk(
+                &chunk, transforms, deriver, matcher, output, &stats, &matches,
+            );
+        }
 
         pb.finish_and_clear();
 
         Ok(ProcessStats {
-            inputs_processed: self.lines.len() as u64,
+            inputs_processed,
             keys_generated: stats.load(std::sync::atomic::Ordering::Relaxed),
             matches_found: matches.load(std::sync::atomic::Ordering::Relaxed),
         })
     }
 }
+
+fn process_chunk(
+    lines: &[String],
+    transforms: &[Box<dyn Transform>],
+    deriver: &KeyDeriver,
+    matcher: Option<&Matcher>,
+    output: &dyn Output,
+    stats: &std::sync::atomic::AtomicU64,
+    matches: &std::sync::atomic::AtomicU64,
+) {
+    lines.par_chunks(BATCH_SIZE).for_each(|batch| {
+        let inputs: Vec<Input> = batch
+            .iter()
+            .map(|s| Input::from_string(s.clone()))
+            .collect();
+        let mut buffer = Vec::with_capacity(inputs.len() * 2);
+
+        for transform in transforms {
+            buffer.clear();
+            transform.apply_batch(&inputs, &mut buffer);
+
+            for (source, key) in &buffer {
+                let derived = deriver.derive(key);
+
+                if let Some(m) = matcher {
+                    if let Some(match_info) = m.check(&derived) {
+                        output
+                            .hit(source, transform.name(), &derived, &match_info)
+                            .ok();
+                        matches.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                    }
+                } else {
+                    output.key(source, transform.name(), &derived).ok();
+                }
+
+                stats.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            }
+        }
+    });
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::derive::KeyDeriver;
+    use crate::output::ConsoleOutput;
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+
+    #[test]
+    fn from_file_not_found() {
+        let result = WordlistSource::from_file(Path::new("/nonexistent/path/file.txt"));
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn from_file_not_a_file() {
+        let dir = tempfile::tempdir().unwrap();
+        let result = WordlistSource::from_file(dir.path());
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn process_empty_file() {
+        let mut file = NamedTempFile::new().unwrap();
+        file.write_all(b"").unwrap();
+
+        let source = WordlistSource::from_file(file.path()).unwrap();
+        let deriver = KeyDeriver::new();
+        let output = ConsoleOutput::new();
+        let transforms: Vec<Box<dyn Transform>> = Vec::new();
+
+        let stats = source
+            .process(&transforms, &deriver, None, &output)
+            .unwrap();
+        assert_eq!(stats.inputs_processed, 0);
+    }
+
+    #[test]
+    fn process_skips_blank_lines() {
+        let mut file = NamedTempFile::new().unwrap();
+        file.write_all(b"hello\n\n  \nworld\n").unwrap();
+
+        let source = WordlistSource::from_file(file.path()).unwrap();
+        let deriver = KeyDeriver::new();
+        let output = ConsoleOutput::new();
+        let transforms: Vec<Box<dyn Transform>> = Vec::new();
+
+        let stats = source
+            .process(&transforms, &deriver, None, &output)
+            .unwrap();
+        assert_eq!(stats.inputs_processed, 2);
+    }
+
+    #[test]
+    fn process_skips_invalid_utf8() {
+        let mut file = NamedTempFile::new().unwrap();
+        file.write_all(b"valid\n\xff\xfe\ninvalid bytes\ntest\n")
+            .unwrap();
+
+        let source = WordlistSource::from_file(file.path()).unwrap();
+        let deriver = KeyDeriver::new();
+        let output = ConsoleOutput::new();
+        let transforms: Vec<Box<dyn Transform>> = Vec::new();
+
+        let stats = source
+            .process(&transforms, &deriver, None, &output)
+            .unwrap();
+        assert_eq!(stats.inputs_processed, 3);
+    }
+}

From 311fe4fe75c73484a520ed4f2d4a86e7dd12a726 Mon Sep 17 00:00:00 2001
From: oritwoen <18102267+oritwoen@users.noreply.github.com>
Date: Fri, 13 Mar 2026 14:20:00 +0100
Subject: [PATCH 2/3] fix(source): address review feedback on wordlist
 streaming

- Update the progress bar after the final partial chunk as well
- Use stream_position() for accurate byte tracking when a line is skipped
  as invalid UTF-8 (previously estimated as 1 byte)
- Bump CHUNK_SIZE from 10k to 100k lines for better Rayon utilization on
  many-core machines
---
 src/source/wordlist.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs
index fadf921..e4c53e6 100644
--- a/src/source/wordlist.rs
+++ b/src/source/wordlist.rs
@@ -6,7 +6,7 @@ use anyhow::Result;
 use indicatif::ProgressBar;
 use rayon::prelude::*;
 use std::fs;
-use std::io::{BufRead, BufReader};
+use std::io::{BufRead, BufReader, Seek};
 use std::path::{Path, PathBuf};
 
 use super::{ProcessStats, Source};
@@ -15,7 +15,7 @@ use crate::matcher::Matcher;
 use crate::output::Output;
 use crate::transform::{Input, Transform};
 
-const CHUNK_SIZE: usize = 10_000;
+const CHUNK_SIZE: usize = 100_000;
 const BATCH_SIZE: usize = 1000;
 
 /// Generate keys from a wordlist file
@@ -65,8 +65,8 @@ impl Source for WordlistSource {
                 Ok(0) => break,
                 Ok(n) => n as u64,
                 Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
-                    // Skip invalid UTF-8 lines; estimate 1 byte consumed to keep progress moving
-                    bytes_consumed += 1;
+                    // read_line() already consumed the bytes, sync position from reader
+                    bytes_consumed = reader.stream_position().unwrap_or(bytes_consumed);
                     continue;
                 }
                 Err(e) => return Err(e.into()),
@@ -95,6 +95,7 @@ impl Source for WordlistSource {
             process_chunk(
                 &chunk, transforms, deriver, matcher, output, &stats, &matches,
             );
+            pb.set_position(bytes_consumed);
         }
 
         pb.finish_and_clear();

From 5c5f23244536c877a8002bd3f21edf3287ae5cbd Mon Sep 17 00:00:00 2001
From: oritwoen <18102267+oritwoen@users.noreply.github.com>
Date: Fri, 13 Mar 2026 14:27:44 +0100
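Subject: [PATCH 3/3] fix(source): update progress bar on every line read, not
 just chunk boundaries

The bar position was only set after a chunk had been processed, so with
CHUNK_SIZE at 100k lines it did not move between chunk boundaries. Set
it right after each read_line() call instead, including when a line is
skipped as invalid UTF-8, and drop the now-redundant updates that
followed process_chunk().
---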
 src/source/wordlist.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/source/wordlist.rs b/src/source/wordlist.rs
index e4c53e6..939e471 100644
--- a/src/source/wordlist.rs
+++ b/src/source/wordlist.rs
@@ -67,12 +67,14 @@ impl Source for WordlistSource {
                 Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
                     // read_line() already consumed the bytes, sync position from reader
                     bytes_consumed = reader.stream_position().unwrap_or(bytes_consumed);
+                    pb.set_position(bytes_consumed);
                     continue;
                 }
                 Err(e) => return Err(e.into()),
             };
 
             bytes_consumed += bytes_read;
+            pb.set_position(bytes_consumed);
 
             let trimmed = line_buf.trim().to_string();
             if trimmed.is_empty() {
@@ -86,7 +88,6 @@ impl Source for WordlistSource {
                 process_chunk(
                     &chunk, transforms, deriver, matcher, output, &stats, &matches,
                 );
-                pb.set_position(bytes_consumed);
                 chunk.clear();
             }
         }
@@ -95,7 +96,6 @@ impl Source for WordlistSource {
             process_chunk(
                 &chunk, transforms, deriver, matcher, output, &stats, &matches,
             );
-            pb.set_position(bytes_consumed);
         }
 
         pb.finish_and_clear();