Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 174 additions & 50 deletions src/source/wordlist.rs
Original file line number Diff line number Diff line change
@@ -1,40 +1,39 @@
//! Wordlist source - generate keys from file of passphrases.
//!
//! Streams the file in chunks to avoid loading entire wordlists into memory.

use anyhow::Result;
use indicatif::ProgressBar;
use rayon::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::fs;
use std::io::{BufRead, BufReader, Seek};
use std::path::{Path, PathBuf};

use super::{ProcessStats, Source};
use crate::derive::KeyDeriver;
use crate::matcher::Matcher;
use crate::output::Output;
use crate::transform::{Input, Transform};

/// Number of non-empty lines buffered in memory before the buffer is handed to
/// `process_chunk` and cleared.
const CHUNK_SIZE: usize = 100_000;
/// Number of lines in each slice passed to `par_chunks` inside `process_chunk`.
const BATCH_SIZE: usize = 1000;

/// Generate keys from a wordlist file
pub struct WordlistSource {
// NOTE(review): this span comes from a rendered diff without +/- markers, so
// the field that appears to be removed and its replacement sit side by side.
// Presumably the pre-change field: every trimmed, non-empty line held in memory.
lines: Vec<String>,
// Presumably the post-change field: only the wordlist location is stored and
// the file is opened when processing starts — confirm against the real file.
path: PathBuf,
}

impl WordlistSource {
/// Build a source for the wordlist located at `path`.
///
/// NOTE(review): the body below comes from a rendered diff without +/- markers.
/// It appears to interleave the removed eager-loading implementation with the
/// added path-validating one — confirm against the real file before relying
/// on either half.
pub fn from_file(path: &Path) -> Result<Self> {
// Presumably removed: read the whole file up front.
let file = File::open(path)?;
let reader = BufReader::new(file);
let lines: Vec<String> = reader
.lines()
.filter_map(|line| match line {
Ok(s) => {
// Keep only lines that still have content after trimming whitespace.
let trimmed = s.trim().to_string();
if trimmed.is_empty() { None } else { Some(Ok(trimmed)) }
}
// Lines reported as `InvalidData` are dropped silently.
Err(e) if e.kind() == std::io::ErrorKind::InvalidData => None,
// Any other I/O error is kept so that `collect` below surfaces it.
Err(e) => Some(Err(e)),
})
.collect::<std::io::Result<Vec<_>>>()?;

Ok(Self { lines })
// Presumably added: fail early when the path is missing or is not a regular
// file, then remember the path without reading anything.
if !path.exists() {
anyhow::bail!("Wordlist file not found: {}", path.display());
}
if !path.is_file() {
anyhow::bail!("Not a file: {}", path.display());
}
Ok(Self {
path: path.to_path_buf(),
})
}
}

Expand All @@ -46,51 +45,176 @@ impl Source for WordlistSource {
matcher: Option<&Matcher>,
output: &dyn Output,
) -> Result<ProcessStats> {
let pb = ProgressBar::new(self.lines.len() as u64);
let file_size = fs::metadata(&self.path)?.len();
let pb = ProgressBar::new(file_size);
pb.set_style(crate::default_progress_style());


let stats = std::sync::atomic::AtomicU64::new(0);
let matches = std::sync::atomic::AtomicU64::new(0);
let mut inputs_processed = 0u64;
let mut bytes_consumed = 0u64;

let file = std::fs::File::open(&self.path)?;
let mut reader = BufReader::new(file);
let mut chunk = Vec::with_capacity(CHUNK_SIZE);
let mut line_buf = String::new();

loop {
line_buf.clear();
let bytes_read = match reader.read_line(&mut line_buf) {
Ok(0) => break,
Ok(n) => n as u64,
Err(e) if e.kind() == std::io::ErrorKind::InvalidData => {
// read_line() already consumed the bytes, sync position from reader
bytes_consumed = reader.stream_position().unwrap_or(bytes_consumed);
pb.set_position(bytes_consumed);
continue;
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
Err(e) => return Err(e.into()),
};

self.lines.par_chunks(1000).for_each(|chunk| {
let inputs: Vec<Input> = chunk
.iter()
.map(|s| Input::from_string(s.clone()))
.collect();
let mut buffer = Vec::with_capacity(inputs.len() * 2);

for transform in transforms {
buffer.clear();
transform.apply_batch(&inputs, &mut buffer);

for (source, key) in &buffer {
let derived = deriver.derive(key);

if let Some(m) = matcher {
if let Some(match_info) = m.check(&derived) {
output
.hit(source, transform.name(), &derived, &match_info)
.ok();
matches.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
} else {
output.key(source, transform.name(), &derived).ok();
}
bytes_consumed += bytes_read;
pb.set_position(bytes_consumed);

stats.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
}
let trimmed = line_buf.trim().to_string();
if trimmed.is_empty() {
continue;
}

pb.inc(chunk.len() as u64);
});
chunk.push(trimmed);
inputs_processed += 1;

if chunk.len() >= CHUNK_SIZE {
process_chunk(
&chunk, transforms, deriver, matcher, output, &stats, &matches,
);
chunk.clear();
}
}

if !chunk.is_empty() {
process_chunk(
&chunk, transforms, deriver, matcher, output, &stats, &matches,
);
}

pb.finish_and_clear();
Comment thread
oritwoen marked this conversation as resolved.

Ok(ProcessStats {
inputs_processed: self.lines.len() as u64,
inputs_processed,
keys_generated: stats.load(std::sync::atomic::Ordering::Relaxed),
matches_found: matches.load(std::sync::atomic::Ordering::Relaxed),
})
}
}

/// Derive keys for one in-memory chunk of wordlist lines.
///
/// The chunk is split into slices of `BATCH_SIZE` lines that are handled in
/// parallel via `par_chunks`. Within a slice, every transform is applied to
/// the whole batch and each resulting key is passed through `deriver`.
///
/// * With a `matcher`, only keys for which `check` returns a match are
///   reported through `output.hit`, and `matches` is incremented per hit.
/// * Without a `matcher`, every derived key is reported through `output.key`.
///
/// `stats` is incremented once per derived key in both modes. Errors returned
/// by `output` are discarded.
fn process_chunk(
    lines: &[String],
    transforms: &[Box<dyn Transform>],
    deriver: &KeyDeriver,
    matcher: Option<&Matcher>,
    output: &dyn Output,
    stats: &std::sync::atomic::AtomicU64,
    matches: &std::sync::atomic::AtomicU64,
) {
    use std::sync::atomic::Ordering;

    lines.par_chunks(BATCH_SIZE).for_each(|batch| {
        // `Input` takes ownership of its string, so each line is cloned once.
        let mut inputs: Vec<Input> = Vec::with_capacity(batch.len());
        for line in batch {
            inputs.push(Input::from_string(line.clone()));
        }

        // Scratch space reused across transforms to avoid reallocating.
        let mut produced = Vec::with_capacity(inputs.len() * 2);

        for transform in transforms {
            produced.clear();
            transform.apply_batch(&inputs, &mut produced);

            for (source, key) in &produced {
                let derived = deriver.derive(key);

                match matcher {
                    Some(m) => {
                        if let Some(match_info) = m.check(&derived) {
                            output
                                .hit(source, transform.name(), &derived, &match_info)
                                .ok();
                            matches.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                    None => {
                        output.key(source, transform.name(), &derived).ok();
                    }
                }

                stats.fetch_add(1, Ordering::Relaxed);
            }
        }
    });
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::derive::KeyDeriver;
    use crate::output::ConsoleOutput;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `contents` to a fresh temporary file, runs it through
    /// `WordlistSource::process` with no transforms and no matcher, and
    /// returns the reported `inputs_processed` count.
    fn inputs_processed_for(contents: &[u8]) -> u64 {
        let mut wordlist = NamedTempFile::new().unwrap();
        wordlist.write_all(contents).unwrap();

        let source = WordlistSource::from_file(wordlist.path()).unwrap();
        let deriver = KeyDeriver::new();
        let output = ConsoleOutput::new();
        let transforms: Vec<Box<dyn Transform>> = Vec::new();

        let stats = source
            .process(&transforms, &deriver, None, &output)
            .unwrap();
        stats.inputs_processed
    }

    /// A path that does not exist is rejected at construction time.
    #[test]
    fn from_file_not_found() {
        let missing = Path::new("/nonexistent/path/file.txt");
        assert!(WordlistSource::from_file(missing).is_err());
    }

    /// A directory is rejected at construction time.
    #[test]
    fn from_file_not_a_file() {
        let dir = tempfile::tempdir().unwrap();
        assert!(WordlistSource::from_file(dir.path()).is_err());
    }

    /// An empty wordlist yields zero inputs.
    #[test]
    fn process_empty_file() {
        assert_eq!(inputs_processed_for(b""), 0);
    }

    /// Empty and whitespace-only lines are not counted as inputs.
    #[test]
    fn process_skips_blank_lines() {
        assert_eq!(inputs_processed_for(b"hello\n\n   \nworld\n"), 2);
    }

    /// A line that is not valid UTF-8 is skipped; the lines around it still count.
    #[test]
    fn process_skips_invalid_utf8() {
        assert_eq!(
            inputs_processed_for(b"valid\n\xff\xfe\ninvalid bytes\ntest\n"),
            3
        );
    }
}
Loading