From 100695c02d1da1485887d85bcebc522432ed269f Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 00:31:48 -0500 Subject: [PATCH 01/29] feat: add `diecut extract` command to create templates from existing projects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automates the biggest friction point in diecut: turning an existing project into a reusable template. Point it at a project, tell it which values are variables, and it produces a ready-to-use template with diecut.toml, .die suffixed files, and computed case variants. Key capabilities: - Auto-detects case variants (kebab, snake, PascalCase, SCREAMING_SNAKE, etc.) - Longest-match-first replacement prevents overlapping value corruption - Templates path components (my-app/src/ → {{ project_name }}/src/) - Detects conditional files (.github/, Dockerfile, etc.) for optional inclusion - Interactive by default with --batch for CI/scripting - --dry-run to preview without writing - Generates commented diecut.toml with prompted + computed variables --- src/cli.rs | 26 ++ src/commands/extract.rs | 95 ++++++ src/commands/mod.rs | 1 + src/error.rs | 20 ++ src/extract/conditional.rs | 170 +++++++++ src/extract/config_gen.rs | 206 +++++++++++ src/extract/exclude.rs | 216 ++++++++++++ src/extract/mod.rs | 682 +++++++++++++++++++++++++++++++++++++ src/extract/replace.rs | 143 ++++++++ src/extract/scan.rs | 147 ++++++++ src/extract/variants.rs | 329 ++++++++++++++++++ src/lib.rs | 1 + src/main.rs | 8 + tests/integration.rs | 293 ++++++++++++++++ 14 files changed, 2337 insertions(+) create mode 100644 src/commands/extract.rs create mode 100644 src/extract/conditional.rs create mode 100644 src/extract/config_gen.rs create mode 100644 src/extract/exclude.rs create mode 100644 src/extract/mod.rs create mode 100644 src/extract/replace.rs create mode 100644 src/extract/scan.rs create mode 100644 src/extract/variants.rs diff --git a/src/cli.rs b/src/cli.rs index ab84986..a051ff6 
100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -49,4 +49,30 @@ pub enum Commands { /// List cached templates List, + + /// Extract a template from an existing project + Extract { + /// Source project directory + source: String, + + /// Variable values to templatize (can be repeated: --var key=value) + #[arg(long = "var", value_name = "KEY=VALUE")] + vars: Vec, + + /// Output directory for the extracted template + #[arg(short, long)] + output: Option, + + /// Convert the source directory in-place + #[arg(long)] + in_place: bool, + + /// Skip all interactive prompts + #[arg(long)] + batch: bool, + + /// Show what would be extracted without writing files + #[arg(long)] + dry_run: bool, + }, } diff --git a/src/commands/extract.rs b/src/commands/extract.rs new file mode 100644 index 0000000..86fee13 --- /dev/null +++ b/src/commands/extract.rs @@ -0,0 +1,95 @@ +use std::path::PathBuf; + +use console::style; + +use diecut::error::DicecutError; +use diecut::extract::{execute_extraction, plan_extraction, ExtractOptions}; +use miette::Result; + +pub fn run( + source: String, + vars: Vec, + output: Option, + in_place: bool, + batch: bool, + dry_run: bool, +) -> Result<()> { + let variables = parse_vars(&vars)?; + + let options = ExtractOptions { + source_dir: PathBuf::from(&source), + variables, + output_dir: output.map(PathBuf::from), + in_place, + batch, + dry_run, + }; + + let plan = plan_extraction(&options)?; + + if dry_run { + print_dry_run(&plan); + return Ok(()); + } + + execute_extraction(&plan, in_place)?; + + Ok(()) +} + +fn parse_vars(vars: &[String]) -> diecut::error::Result> { + let mut parsed = Vec::new(); + + for var in vars { + let (key, value) = var + .split_once('=') + .ok_or_else(|| DicecutError::ExtractNoVariables)?; + parsed.push((key.trim().to_string(), value.trim().to_string())); + } + + Ok(parsed) +} + +fn print_dry_run(plan: &diecut::extract::ExtractionPlan) { + eprintln!( + "\n{} Dry run — no files will be written\n", + style("⚡").yellow().bold() + ); 
+ + eprintln!( + "Output directory: {}", + style(plan.output_dir.display()).cyan() + ); + + let templated: Vec<_> = plan.files.iter().filter(|f| f.has_replacements).collect(); + let copied: Vec<_> = plan.files.iter().filter(|f| !f.has_replacements).collect(); + + eprintln!("\nTemplated files ({}):", templated.len()); + for file in &templated { + eprintln!( + " {} ({} replacements)", + file.template_path.display(), + file.replacement_count + ); + } + + eprintln!("\nCopied verbatim ({}):", copied.len()); + for file in &copied { + eprintln!(" {}", file.template_path.display()); + } + + eprintln!("\nVariables:"); + for var in &plan.variables { + eprintln!(" {} = {:?}", var.name, var.value); + for variant in &var.variants { + if variant.name != "verbatim" { + eprintln!(" {} → {}", variant.name, variant.literal); + } + } + } + + eprintln!("\nGenerated diecut.toml:"); + eprintln!("{}", style("─".repeat(60)).dim()); + eprint!("{}", plan.config_toml); + eprintln!("{}", style("─".repeat(60)).dim()); +} diff --git a/src/commands/mod.rs b/src/commands/mod.rs index 33661b9..8c884a4 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -1,2 +1,3 @@ +pub mod extract; pub mod list; pub mod new; diff --git a/src/error.rs b/src/error.rs index 834b7d2..d6a4ee4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -117,6 +117,26 @@ pub enum DicecutError { #[source] source: toml::de::Error, }, + + #[error("Source directory not found: {path}")] + #[diagnostic(help("Provide the path to an existing project directory"))] + ExtractSourceNotFound { path: PathBuf }, + + #[error("No variables provided for extraction")] + #[diagnostic(help( + "Use --var key=value to specify which values should become template variables" + ))] + ExtractNoVariables, + + #[error("Output directory already exists: {path}")] + #[diagnostic(help( + "Choose a different output path with -o, or remove the existing directory" + ))] + ExtractOutputExists { path: PathBuf }, + + #[error("Directory already contains a 
diecut.toml: {path}")] + #[diagnostic(help("This directory is already a diecut template"))] + ExtractAlreadyTemplate { path: PathBuf }, } pub type Result = std::result::Result; diff --git a/src/extract/conditional.rs b/src/extract/conditional.rs new file mode 100644 index 0000000..67e7346 --- /dev/null +++ b/src/extract/conditional.rs @@ -0,0 +1,170 @@ +use std::path::Path; + +/// A known optional file pattern that can be made conditional in the template. +#[derive(Debug, Clone)] +pub struct ConditionalPattern { + /// Glob pattern to match files. + pub pattern: &'static str, + /// Variable name to control inclusion. + pub variable: &'static str, + /// Human-readable description. + pub description: &'static str, +} + +/// Curated list of known optional file patterns. +const KNOWN_PATTERNS: &[ConditionalPattern] = &[ + ConditionalPattern { + pattern: ".github/**", + variable: "use_github_actions", + description: "GitHub Actions CI", + }, + ConditionalPattern { + pattern: ".gitlab-ci.yml", + variable: "use_gitlab_ci", + description: "GitLab CI", + }, + ConditionalPattern { + pattern: "Dockerfile", + variable: "use_docker", + description: "Docker support", + }, + ConditionalPattern { + pattern: "docker-compose.yml", + variable: "use_docker", + description: "Docker support", + }, + ConditionalPattern { + pattern: "docker-compose.yaml", + variable: "use_docker", + description: "Docker support", + }, + ConditionalPattern { + pattern: ".pre-commit-config.yaml", + variable: "use_pre_commit", + description: "Pre-commit hooks", + }, + ConditionalPattern { + pattern: "Makefile", + variable: "use_make", + description: "Make build system", + }, + ConditionalPattern { + pattern: "Justfile", + variable: "use_just", + description: "Just command runner", + }, + ConditionalPattern { + pattern: ".editorconfig", + variable: "use_editorconfig", + description: "EditorConfig", + }, + ConditionalPattern { + pattern: "renovate.json", + variable: "use_renovate", + description: "Renovate 
dependency updates", + }, + ConditionalPattern { + pattern: ".renovaterc", + variable: "use_renovate", + description: "Renovate dependency updates", + }, + ConditionalPattern { + pattern: ".github/dependabot.yml", + variable: "use_dependabot", + description: "Dependabot", + }, + ConditionalPattern { + pattern: ".husky/**", + variable: "use_husky", + description: "Git hooks (JS)", + }, +]; + +/// A detected conditional file in the project. +#[derive(Debug, Clone)] +pub struct DetectedConditional { + /// The pattern that matched. + pub pattern: String, + /// The variable name to control this pattern. + pub variable: String, + /// Human-readable description. + pub description: String, +} + +/// Detect which known optional file patterns exist in the project. +/// +/// Groups by variable name — e.g., multiple Docker files share `use_docker`. +pub fn detect_conditional_files(project_dir: &Path) -> Vec { + let mut detected = Vec::new(); + let mut seen_variables = std::collections::HashSet::new(); + + for known in KNOWN_PATTERNS { + let exists = if known.pattern.contains("**") { + // Directory pattern — check if the directory exists + let dir_part = known.pattern.split("/**").next().unwrap_or(known.pattern); + project_dir.join(dir_part).exists() + } else { + project_dir.join(known.pattern).exists() + }; + + if exists && seen_variables.insert(known.variable) { + detected.push(DetectedConditional { + pattern: known.pattern.to_string(), + variable: known.variable.to_string(), + description: known.description.to_string(), + }); + } + } + + detected +} + +/// Get all patterns for a given variable name from the known patterns list. 
+pub fn patterns_for_variable(variable: &str) -> Vec<&'static str> { + KNOWN_PATTERNS + .iter() + .filter(|p| p.variable == variable) + .map(|p| p.pattern) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_conditional_files_github() { + let dir = tempfile::tempdir().unwrap(); + std::fs::create_dir_all(dir.path().join(".github/workflows")).unwrap(); + + let detected = detect_conditional_files(dir.path()); + assert_eq!(detected.len(), 1); + assert_eq!(detected[0].variable, "use_github_actions"); + } + + #[test] + fn test_detect_conditional_files_docker() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("Dockerfile"), "FROM alpine").unwrap(); + std::fs::write(dir.path().join("docker-compose.yml"), "version: '3'").unwrap(); + + let detected = detect_conditional_files(dir.path()); + // Should deduplicate by variable name + assert_eq!(detected.len(), 1); + assert_eq!(detected[0].variable, "use_docker"); + } + + #[test] + fn test_detect_conditional_files_empty() { + let dir = tempfile::tempdir().unwrap(); + let detected = detect_conditional_files(dir.path()); + assert!(detected.is_empty()); + } + + #[test] + fn test_patterns_for_variable() { + let docker_patterns = patterns_for_variable("use_docker"); + assert!(docker_patterns.contains(&"Dockerfile")); + assert!(docker_patterns.contains(&"docker-compose.yml")); + } +} diff --git a/src/extract/config_gen.rs b/src/extract/config_gen.rs new file mode 100644 index 0000000..5fd8222 --- /dev/null +++ b/src/extract/config_gen.rs @@ -0,0 +1,206 @@ +/// A prompted variable entry for the generated config. +pub struct PromptedVariable { + pub name: String, + pub default_value: String, + pub prompt: String, +} + +/// A computed variable entry for the generated config. +pub struct ComputedVariable { + pub name: String, + pub expression: String, +} + +/// A conditional file entry for the generated config. 
+#[derive(Debug, Clone)] +pub struct ConditionalEntry { + pub patterns: Vec, + pub variable: String, + pub description: String, +} + +/// Options for generating the diecut.toml config file. +pub struct ConfigGenOptions { + pub template_name: String, + pub prompted_variables: Vec, + pub computed_variables: Vec, + pub exclude_patterns: Vec, + pub copy_without_render: Vec, + pub conditional_entries: Vec, +} + +/// Generate a diecut.toml config string with comments for readability. +/// +/// Uses manual TOML string building because the `toml` crate can't serialize comments, +/// and users need to read and edit this file. +pub fn generate_config_toml(options: &ConfigGenOptions) -> String { + let mut out = String::new(); + + // [template] section + out.push_str("[template]\n"); + out.push_str(&format!( + "name = {}\n", + escape_toml_string(&options.template_name) + )); + out.push_str("version = \"1.0.0\"\n"); + out.push_str("# description = \"A project template\"\n"); + out.push('\n'); + + // [variables] section — prompted variables first + if !options.prompted_variables.is_empty() || !options.computed_variables.is_empty() { + out.push_str("# ── Variables ──────────────────────────────────────────\n"); + out.push_str("# Prompted variables are asked during `diecut new`.\n"); + out.push_str("# Computed variables are auto-derived and never prompted.\n"); + out.push('\n'); + } + + for var in &options.prompted_variables { + out.push_str(&format!("[variables.{}]\n", var.name)); + out.push_str("type = \"string\"\n"); + out.push_str(&format!("prompt = {}\n", escape_toml_string(&var.prompt))); + out.push_str(&format!( + "default = {}\n", + escape_toml_string(&var.default_value) + )); + out.push('\n'); + } + + // Conditional file boolean variables + for entry in &options.conditional_entries { + out.push_str(&format!("# {} ({})\n", entry.variable, entry.description)); + out.push_str(&format!("[variables.{}]\n", entry.variable)); + out.push_str("type = \"bool\"\n"); + 
out.push_str(&format!( + "prompt = {}\n", + escape_toml_string(&format!("Include {}?", entry.description.to_lowercase())) + )); + out.push_str("default = true\n"); + out.push('\n'); + } + + // Computed variables + for var in &options.computed_variables { + out.push_str(&format!("[variables.{}]\n", var.name)); + out.push_str("type = \"string\"\n"); + out.push_str(&format!( + "computed = {}\n", + escape_toml_string(&var.expression) + )); + out.push('\n'); + } + + // [files] section + out.push_str("# ── Files ─────────────────────────────────────────────\n"); + out.push_str("[files]\n"); + + if !options.exclude_patterns.is_empty() { + out.push_str("exclude = [\n"); + for pattern in &options.exclude_patterns { + out.push_str(&format!(" {},\n", escape_toml_string(pattern))); + } + out.push_str("]\n"); + } + + if !options.copy_without_render.is_empty() { + out.push_str("copy_without_render = [\n"); + for pattern in &options.copy_without_render { + out.push_str(&format!(" {},\n", escape_toml_string(pattern))); + } + out.push_str("]\n"); + } + + out.push('\n'); + + // [[files.conditional]] entries + for entry in &options.conditional_entries { + for pattern in &entry.patterns { + out.push_str(&format!("# {}\n", entry.description)); + out.push_str("[[files.conditional]]\n"); + out.push_str(&format!("pattern = {}\n", escape_toml_string(pattern))); + out.push_str(&format!("when = {}\n", escape_toml_string(&entry.variable))); + out.push('\n'); + } + } + + // [hooks] section + out.push_str("# ── Hooks ─────────────────────────────────────────────\n"); + out.push_str("# [hooks]\n"); + out.push_str("# post_create = \"echo 'Project created!'\"\n"); + + out +} + +/// Escape a string for TOML output. 
+fn escape_toml_string(s: &str) -> String { + toml::Value::String(s.to_string()).to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_generate_config_basic() { + let options = ConfigGenOptions { + template_name: "my-template".to_string(), + prompted_variables: vec![PromptedVariable { + name: "project_name".to_string(), + default_value: "my-app".to_string(), + prompt: "Project name".to_string(), + }], + computed_variables: vec![ComputedVariable { + name: "project_name_snake".to_string(), + expression: "project_name | replace(from=\"-\", to=\"_\")".to_string(), + }], + exclude_patterns: vec![".git/".to_string()], + copy_without_render: vec!["*.png".to_string()], + conditional_entries: vec![], + }; + + let toml = generate_config_toml(&options); + + assert!(toml.contains("[template]")); + assert!(toml.contains("name = \"my-template\"")); + assert!(toml.contains("[variables.project_name]")); + assert!(toml.contains("type = \"string\"")); + assert!(toml.contains("[variables.project_name_snake]")); + assert!(toml.contains("computed =")); + assert!(toml.contains("[files]")); + assert!(toml.contains("\".git/\"")); + assert!(toml.contains("\"*.png\"")); + } + + #[test] + fn test_generate_config_with_conditionals() { + let options = ConfigGenOptions { + template_name: "test".to_string(), + prompted_variables: vec![], + computed_variables: vec![], + exclude_patterns: vec![], + copy_without_render: vec![], + conditional_entries: vec![ConditionalEntry { + patterns: vec![".github/**".to_string()], + variable: "use_github_actions".to_string(), + description: "GitHub Actions CI".to_string(), + }], + }; + + let toml = generate_config_toml(&options); + + assert!(toml.contains("[variables.use_github_actions]")); + assert!(toml.contains("type = \"bool\"")); + assert!(toml.contains("default = true")); + assert!(toml.contains("[[files.conditional]]")); + assert!(toml.contains("pattern = \".github/**\"")); + assert!(toml.contains("when = 
\"use_github_actions\"")); + } + + #[test] + fn test_escape_toml_string() { + assert_eq!(escape_toml_string("hello"), "\"hello\""); + // toml crate uses multi-line strings for values containing quotes + let escaped = escape_toml_string("it's \"fine\""); + assert!(escaped.contains("it's")); + assert!(escaped.contains("fine")); + } +} diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs new file mode 100644 index 0000000..8c4c082 --- /dev/null +++ b/src/extract/exclude.rs @@ -0,0 +1,216 @@ +use std::path::Path; + +/// Default directories and files to exclude from template extraction. +const DEFAULT_EXCLUDES: &[&str] = &[ + ".git", + ".git/", + ".hg", + ".svn", + "node_modules", + "node_modules/", + ".DS_Store", + "Thumbs.db", + "__pycache__", + "__pycache__/", + "*.pyc", + ".tox", + ".nox", + ".mypy_cache", + ".ruff_cache", + ".pytest_cache", + "target", + "target/", + ".venv", + ".env", + "dist", + "build", + ".next", + ".nuxt", + ".output", + ".turbo", + ".diecut-answers.toml", +]; + +/// Patterns for files that should be copied without rendering (binary-like or problematic). +const DEFAULT_COPY_WITHOUT_RENDER: &[&str] = &[ + "*.png", + "*.jpg", + "*.jpeg", + "*.gif", + "*.ico", + "*.svg", + "*.webp", + "*.woff", + "*.woff2", + "*.ttf", + "*.eot", + "*.otf", + "*.zip", + "*.tar", + "*.gz", + "*.bz2", + "*.xz", + "*.pdf", + "*.lock", + "package-lock.json", + "yarn.lock", + "pnpm-lock.yaml", + "Cargo.lock", + "Gemfile.lock", + "poetry.lock", + "composer.lock", +]; + +/// Detect which default exclude patterns actually exist in the project. 
+pub fn detect_excludes(project_dir: &Path) -> Vec { + let mut found = Vec::new(); + + for pattern in DEFAULT_EXCLUDES { + let clean = pattern.trim_end_matches('/'); + // Skip glob patterns — they're always included + if clean.contains('*') { + found.push(pattern.to_string()); + continue; + } + if project_dir.join(clean).exists() { + found.push(pattern.to_string()); + } + } + + found +} + +/// Detect which copy-without-render patterns are relevant based on files present. +pub fn detect_copy_without_render( + _project_dir: &Path, + files: &[std::path::PathBuf], +) -> Vec { + let mut found = Vec::new(); + + for pattern in DEFAULT_COPY_WITHOUT_RENDER { + if pattern.starts_with('*') { + // Extension pattern — check if any file matches + let ext = pattern.trim_start_matches("*."); + if files.iter().any(|f| { + f.extension() + .map(|e| e.to_string_lossy().eq_ignore_ascii_case(ext)) + .unwrap_or(false) + }) { + found.push(pattern.to_string()); + } + } else { + // Exact filename — check if present + if files.iter().any(|f| { + f.file_name() + .map(|n| n.to_string_lossy() == *pattern) + .unwrap_or(false) + }) { + found.push(pattern.to_string()); + } + } + } + + found +} + +/// Check if a path should be excluded based on the exclude patterns. 
+pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { + let path_str = relative_path.to_string_lossy(); + + for pattern in excludes { + let clean = pattern.trim_end_matches('/'); + + if clean.contains('*') { + // Glob-style matching: *.pyc matches any .pyc file + if let Some(ext) = clean.strip_prefix("*.") { + if let Some(file_ext) = relative_path.extension() { + if file_ext.to_string_lossy().eq_ignore_ascii_case(ext) { + return true; + } + } + } + continue; + } + + // Exact directory/file match at any level + for component in relative_path.components() { + if let std::path::Component::Normal(os_str) = component { + if os_str.to_string_lossy() == clean { + return true; + } + } + } + + // Full path match + if path_str == clean || path_str.starts_with(&format!("{clean}/")) { + return true; + } + } + + false +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn test_should_exclude_git() { + let excludes = vec![".git/".to_string()]; + assert!(should_exclude(Path::new(".git/config"), &excludes)); + assert!(should_exclude(Path::new(".git/HEAD"), &excludes)); + } + + #[test] + fn test_should_exclude_node_modules() { + let excludes = vec!["node_modules".to_string()]; + assert!(should_exclude( + Path::new("node_modules/express/index.js"), + &excludes + )); + } + + #[test] + fn test_should_exclude_glob() { + let excludes = vec!["*.pyc".to_string()]; + assert!(should_exclude( + Path::new("module/__pycache__/foo.pyc"), + &excludes + )); + assert!(!should_exclude(Path::new("module/foo.py"), &excludes)); + } + + #[test] + fn test_should_not_exclude_normal_file() { + let excludes = vec![".git/".to_string(), "node_modules".to_string()]; + assert!(!should_exclude(Path::new("src/main.rs"), &excludes)); + assert!(!should_exclude(Path::new("README.md"), &excludes)); + } + + #[test] + fn test_detect_excludes() { + let dir = tempfile::tempdir().unwrap(); + std::fs::create_dir(dir.path().join(".git")).unwrap(); + 
std::fs::write(dir.path().join(".DS_Store"), "").unwrap(); + + let found = detect_excludes(dir.path()); + assert!(found.iter().any(|e| e.contains(".git"))); + assert!(found.iter().any(|e| e == ".DS_Store")); + // Glob patterns should always be included + assert!(found.iter().any(|e| e == "*.pyc")); + } + + #[test] + fn test_detect_copy_without_render() { + let files = vec![ + PathBuf::from("logo.png"), + PathBuf::from("font.woff2"), + PathBuf::from("README.md"), + ]; + let found = detect_copy_without_render(Path::new("."), &files); + assert!(found.contains(&"*.png".to_string())); + assert!(found.contains(&"*.woff2".to_string())); + assert!(!found.contains(&"*.jpg".to_string())); + } +} diff --git a/src/extract/mod.rs b/src/extract/mod.rs new file mode 100644 index 0000000..2364891 --- /dev/null +++ b/src/extract/mod.rs @@ -0,0 +1,682 @@ +pub mod conditional; +pub mod config_gen; +pub mod exclude; +pub mod replace; +pub mod scan; +pub mod variants; + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +use console::style; +use inquire::{Confirm, Text}; + +use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; +use crate::error::{DicecutError, Result}; + +use self::conditional::{detect_conditional_files, patterns_for_variable, DetectedConditional}; +use self::config_gen::{ + generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, +}; +use self::exclude::{detect_copy_without_render, detect_excludes}; +use self::replace::{ + apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, +}; +use self::scan::{scan_project, ScannedFile}; +use self::variants::{computed_expression, detect_separator, generate_variants, CaseVariant}; + +/// A variable with its value and confirmed case variants. +#[derive(Debug, Clone)] +pub struct ExtractVariable { + pub name: String, + pub value: String, + pub variants: Vec, + /// Per-variant occurrence counts: (variant_name, file_count, total_hits). 
+ pub occurrence_counts: Vec<(String, usize, usize)>, +} + +/// A file that will be part of the extracted template. +#[derive(Debug, Clone)] +pub struct PlannedExtractFile { + /// Relative path in the output template (may contain template expressions). + pub template_path: PathBuf, + /// Content (with replacements applied), or None for binary files. + pub content: Option, + /// Original bytes for binary files. + pub binary_content: Option>, + /// Whether this file had template replacements applied. + pub has_replacements: bool, + /// Number of replacements made. + pub replacement_count: usize, + /// Whether this is a binary file. + pub is_binary: bool, +} + +/// The full extraction plan, ready to be executed or reviewed. +#[derive(Debug)] +pub struct ExtractionPlan { + pub output_dir: PathBuf, + pub files: Vec, + pub config_toml: String, + pub variables: Vec, + pub conditional_entries: Vec, + pub exclude_patterns: Vec, + pub copy_without_render: Vec, +} + +/// Options for the extraction process. +pub struct ExtractOptions { + pub source_dir: PathBuf, + pub variables: Vec<(String, String)>, + pub output_dir: Option, + pub in_place: bool, + pub batch: bool, + pub dry_run: bool, +} + +/// Plan an extraction: scan the project, detect variants, build replacement rules. 
+pub fn plan_extraction(options: &ExtractOptions) -> Result { + let source_dir = &options.source_dir; + + if !source_dir.exists() { + return Err(DicecutError::ExtractSourceNotFound { + path: source_dir.clone(), + }); + } + + if options.variables.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } + + // Check if this is already a template + if source_dir.join("diecut.toml").exists() { + return Err(DicecutError::ExtractAlreadyTemplate { + path: source_dir.clone(), + }); + } + + let output_dir = if options.in_place { + source_dir.clone() + } else if let Some(ref out) = options.output_dir { + out.clone() + } else { + // Default: source dir name + "-template" + let dir_name = source_dir + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "template".to_string()); + source_dir + .parent() + .unwrap_or(Path::new(".")) + .join(format!("{dir_name}-template")) + }; + + if !options.in_place && output_dir.exists() { + return Err(DicecutError::ExtractOutputExists { + path: output_dir.clone(), + }); + } + + // Phase 1: Detect excludes + let mut excludes = detect_excludes(source_dir); + + // Phase 2: Scan project + eprintln!( + "\n{}", + style(format!("Scanning {}...", source_dir.display())).bold() + ); + let scan_result = scan_project(source_dir, &excludes)?; + eprintln!( + " {} files found, {} excluded", + scan_result.files.len(), + scan_result.excluded_count + ); + + // Phase 3: Generate variants and count occurrences + let mut extract_variables = Vec::new(); + + for (var_name, var_value) in &options.variables { + let all_variants = generate_variants(var_name, var_value); + + let mut occurrence_counts = Vec::new(); + for variant in &all_variants { + let (file_count, total_hits) = + count_variant_occurrences(&variant.literal, &scan_result.files); + occurrence_counts.push((variant.name.to_string(), file_count, total_hits)); + } + + extract_variables.push(ExtractVariable { + name: var_name.clone(), + value: var_value.clone(), + variants: 
all_variants, + occurrence_counts, + }); + } + + // Phase 4: Interactive variant confirmation + let confirmed_variables = if options.batch { + // Batch mode: auto-accept all found variants + extract_variables + .into_iter() + .map(|mut var| { + var.variants.retain(|v| { + var.occurrence_counts + .iter() + .any(|(name, _, hits)| name == v.name && *hits > 0) + || v.name == "verbatim" + }); + // Always keep at least the verbatim/canonical variant + if var.variants.is_empty() { + let all = generate_variants(&var.name, &var.value); + if let Some(first) = all.into_iter().next() { + var.variants.push(first); + } + } + var + }) + .collect() + } else { + confirm_variants_interactive(extract_variables)? + }; + + // Phase 5: Interactive exclude confirmation + if !options.batch { + excludes = confirm_excludes_interactive(excludes)?; + } + + // Phase 6: Detect conditional files + let detected_conditionals = if options.batch { + vec![] // Batch mode: no conditional files + } else { + let detected = detect_conditional_files(source_dir); + if detected.is_empty() { + vec![] + } else { + confirm_conditionals_interactive(detected)? 
+ } + }; + + // Phase 7: Build replacement rules + let mut rules = Vec::new(); + for var in &confirmed_variables { + for variant in &var.variants { + rules.push(ReplacementRule { + literal: variant.literal.clone(), + replacement: variant.tera_expr.clone(), + variable: var.name.clone(), + variant: variant.name.to_string(), + }); + } + } + build_replacement_rules(&mut rules); + + // Phase 8: Detect copy_without_render patterns + let file_paths: Vec = scan_result + .files + .iter() + .map(|f| f.relative_path.clone()) + .collect(); + let copy_without_render = detect_copy_without_render(source_dir, &file_paths); + + // Phase 9: Apply replacements to files + let mut planned_files = Vec::new(); + + for file in &scan_result.files { + let template_path = apply_path_replacements(&file.relative_path, &rules); + + if file.is_binary { + let binary_content = + std::fs::read(&file.absolute_path).map_err(|e| DicecutError::Io { + context: format!("reading binary file {}", file.absolute_path.display()), + source: e, + })?; + planned_files.push(PlannedExtractFile { + template_path, + content: None, + binary_content: Some(binary_content), + has_replacements: false, + replacement_count: 0, + is_binary: true, + }); + } else if let Some(ref content) = file.content { + let (replaced, count) = apply_replacements(content, &rules); + let has_replacements = count > 0; + + // Add .die suffix if file has template replacements + let final_path = if has_replacements { + let mut p = template_path.as_os_str().to_string_lossy().to_string(); + p.push_str(DEFAULT_TEMPLATES_SUFFIX); + PathBuf::from(p) + } else { + template_path + }; + + planned_files.push(PlannedExtractFile { + template_path: final_path, + content: Some(replaced), + binary_content: None, + has_replacements, + replacement_count: count, + is_binary: false, + }); + } + } + + // Phase 10: Interactive file confirmation + if !options.batch { + confirm_files_interactive(&planned_files)?; + } + + // Phase 11: Build conditional entries + let 
conditional_entries: Vec = detected_conditionals + .iter() + .map(|d| { + let patterns = patterns_for_variable(&d.variable) + .into_iter() + .map(|p| p.to_string()) + .collect(); + ConditionalEntry { + patterns, + variable: d.variable.clone(), + description: d.description.clone(), + } + }) + .collect(); + + // Phase 12: Generate config + let canonical_seps: HashMap = confirmed_variables + .iter() + .map(|v| (v.name.clone(), detect_separator(&v.value))) + .collect(); + + let prompted_vars: Vec = confirmed_variables + .iter() + .map(|v| PromptedVariable { + name: v.name.clone(), + default_value: v.value.clone(), + prompt: v.name.replace(['_', '-'], " "), + }) + .collect(); + + let mut computed_vars = Vec::new(); + for var in &confirmed_variables { + let canonical_sep = canonical_seps.get(&var.name).copied().unwrap_or("-"); + for variant in &var.variants { + // Skip the canonical variant (it uses the variable directly) + if variant.name == "verbatim" { + continue; + } + // Skip the variant that matches the canonical separator + let is_canonical = matches!( + (variant.name, canonical_sep), + ("kebab", "-") | ("snake", "_") | ("dot", ".") + ); + if is_canonical { + continue; + } + + let computed_name = format!("{}_{}", var.name, variant.name); + let expression = computed_expression(&var.name, variant.name, canonical_sep); + // Don't add if expression is just the variable name + if expression != var.name { + computed_vars.push(ComputedVariable { + name: computed_name, + expression, + }); + } + } + } + + let config_toml = generate_config_toml(&ConfigGenOptions { + template_name: source_dir + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "template".to_string()), + prompted_variables: prompted_vars, + computed_variables: computed_vars, + exclude_patterns: excludes.clone(), + copy_without_render: copy_without_render.clone(), + conditional_entries: conditional_entries.clone(), + }); + + Ok(ExtractionPlan { + output_dir, + files: planned_files, 
+ config_toml, + variables: confirmed_variables, + conditional_entries, + exclude_patterns: excludes, + copy_without_render, + }) +} + +/// Execute an extraction plan: write files and config to the output directory. +pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> { + let output_dir = &plan.output_dir; + let template_dir = output_dir.join("template"); + + // Create output structure + std::fs::create_dir_all(&template_dir).map_err(|e| DicecutError::Io { + context: format!("creating template directory {}", template_dir.display()), + source: e, + })?; + + // Write template files + let mut rendered_count = 0; + let mut copied_count = 0; + + for file in &plan.files { + let dest = template_dir.join(&file.template_path); + + // Ensure parent directory exists + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent).map_err(|e| DicecutError::Io { + context: format!("creating directory {}", parent.display()), + source: e, + })?; + } + + if let Some(ref content) = file.content { + std::fs::write(&dest, content).map_err(|e| DicecutError::Io { + context: format!("writing file {}", dest.display()), + source: e, + })?; + if file.has_replacements { + rendered_count += 1; + } else { + copied_count += 1; + } + } else if let Some(ref bytes) = file.binary_content { + std::fs::write(&dest, bytes).map_err(|e| DicecutError::Io { + context: format!("writing binary file {}", dest.display()), + source: e, + })?; + copied_count += 1; + } + } + + // Write diecut.toml + let config_path = output_dir.join("diecut.toml"); + std::fs::write(&config_path, &plan.config_toml).map_err(|e| DicecutError::Io { + context: format!("writing {}", config_path.display()), + source: e, + })?; + + // Summary + let prompted_count = plan.variables.len(); + let computed_count = plan + .variables + .iter() + .flat_map(|v| &v.variants) + .filter(|variant| { + variant.name != "verbatim" + && !matches!( + ( + variant.name, + detect_separator( + plan.variables + .iter() + 
.find(|v2| v2.variants.contains(variant)) + .map(|v2| v2.value.as_str()) + .unwrap_or("") + ) + ), + ("kebab", "-") | ("snake", "_") | ("dot", ".") + ) + }) + .count(); + + eprintln!( + "\n{} Template extracted to {}", + style("✓").green().bold(), + style(output_dir.display()).cyan() + ); + eprintln!( + " {} variables ({} prompted, {} computed)", + prompted_count + computed_count, + prompted_count, + computed_count + ); + eprintln!( + " {} files templated, {} files copied", + rendered_count, copied_count + ); + if !plan.conditional_entries.is_empty() { + eprintln!( + " {} conditional patterns added", + plan.conditional_entries.len() + ); + } + eprintln!(" Review diecut.toml to fine-tune"); + + Ok(()) +} + +// ── Interactive helpers ────────────────────────────────────────────────── + +fn count_variant_occurrences(literal: &str, files: &[ScannedFile]) -> (usize, usize) { + let mut file_count = 0; + let mut total_hits = 0; + + for file in files { + if let Some(ref content) = file.content { + let hits = content.matches(literal).count(); + if hits > 0 { + file_count += 1; + total_hits += hits; + } + } + } + + // Also check path components + for file in files { + let path_str = file.relative_path.to_string_lossy(); + let hits = path_str.matches(literal).count(); + if hits > 0 { + // Don't double-count file_count if already counted from content + total_hits += hits; + } + } + + (file_count, total_hits) +} + +fn confirm_variants_interactive(variables: Vec) -> Result> { + let mut confirmed = Vec::new(); + + for mut var in variables { + eprintln!( + "\n{} {} = {:?} {}", + style("──").dim(), + style(&var.name).bold(), + var.value, + style("──────────────────────────────────────").dim() + ); + + if var.variants.len() == 1 && var.variants[0].name == "verbatim" { + // Simple value — just show occurrence count + let (file_count, total_hits) = var + .occurrence_counts + .first() + .map(|(_, fc, th)| (*fc, *th)) + .unwrap_or((0, 0)); + if total_hits > 0 { + eprintln!( + " Found in 
{} files ({} occurrences)", + file_count, total_hits + ); + } else { + eprintln!( + " {} Value not found in any file (will still be added to config)", + style("⚠").yellow() + ); + } + confirmed.push(var); + continue; + } + + // Show detected variants with counts + eprintln!(" Detected case variants:"); + let mut found_any = false; + for (i, variant) in var.variants.iter().enumerate() { + let (_, file_count, total_hits) = &var.occurrence_counts[i]; + let mark = if *total_hits > 0 { + found_any = true; + style("✓").green().to_string() + } else { + style("✗").dim().to_string() + }; + let hits_str = if *total_hits > 0 { + format!( + "{} {} across {} {}", + total_hits, + if *total_hits == 1 { "hit" } else { "hits" }, + file_count, + if *file_count == 1 { "file" } else { "files" } + ) + } else { + "not found".to_string() + }; + eprintln!( + " {} {:<16} {:<20} {}", + mark, + variant.literal, + variant.name, + style(&hits_str).dim() + ); + } + + if !found_any { + eprintln!( + " {} No occurrences found for any variant (will still be added to config)", + style("⚠").yellow() + ); + // Keep just the first variant + var.variants.truncate(1); + confirmed.push(var); + continue; + } + + let keep = Confirm::new("Keep detected variants?") + .with_default(true) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if keep { + // Remove variants with zero occurrences + let counts = var.occurrence_counts.clone(); + var.variants.retain(|v| { + counts + .iter() + .any(|(name, _, hits)| name == v.name && *hits > 0) + }); + if var.variants.is_empty() { + let all = generate_variants(&var.name, &var.value); + if let Some(first) = all.into_iter().next() { + var.variants.push(first); + } + } + } else { + // Keep only the canonical variant + var.variants.truncate(1); + } + + confirmed.push(var); + } + + Ok(confirmed) +} + +fn confirm_excludes_interactive(mut excludes: Vec) -> Result> { + eprintln!( + "\n{} Excludes {}", + style("──").dim(), + 
style("─────────────────────────────────────────────").dim() + ); + eprintln!(" Auto-detected:"); + for e in &excludes { + eprintln!(" {}", e); + } + + let extra = Text::new("Add any others? (comma-separated, enter to accept)") + .with_default("") + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if !extra.is_empty() { + for pattern in extra.split(',') { + let trimmed = pattern.trim().to_string(); + if !trimmed.is_empty() { + excludes.push(trimmed); + } + } + } + + Ok(excludes) +} + +fn confirm_conditionals_interactive( + detected: Vec, +) -> Result> { + eprintln!( + "\n{} Conditional files {}", + style("──").dim(), + style("────────────────────────────────────").dim() + ); + eprintln!(" These look optional. Make them conditional?"); + + let mut confirmed = Vec::new(); + for cond in detected { + let prompt = format!(" {} → {}", cond.pattern, cond.variable); + let include = Confirm::new(&prompt) + .with_default(false) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if include { + confirmed.push(cond); + } + } + + Ok(confirmed) +} + +fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { + let templated: Vec<_> = files.iter().filter(|f| f.has_replacements).collect(); + let copied: Vec<_> = files.iter().filter(|f| !f.has_replacements).collect(); + let binary_count = files.iter().filter(|f| f.is_binary).count(); + + eprintln!( + "\n{} Files to template {}", + style("──").dim(), + style("────────────────────────────────────").dim() + ); + eprintln!( + " Will get {} suffix (template replacements made):", + DEFAULT_TEMPLATES_SUFFIX + ); + for file in &templated { + eprintln!( + " {:<40} {} replacements", + file.template_path.display(), + file.replacement_count + ); + } + + eprintln!( + "\n Copied verbatim: {} files (including {} binary)", + copied.len(), + binary_count + ); + + let proceed = Confirm::new("Proceed?") + .with_default(true) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if !proceed { + return 
/// A single replacement rule: find `literal` and replace with `replacement`.
#[derive(Debug, Clone)]
pub struct ReplacementRule {
    pub literal: String,
    pub replacement: String,
    /// Which variable this rule belongs to (for reporting).
    pub variable: String,
    /// Which variant this rule belongs to (for reporting).
    pub variant: String,
}

/// Order replacement rules so that the longest literals come first.
///
/// Sorting is in place and stable: rules with literals of equal length keep
/// their relative order. Applying rules in this order prevents a shorter
/// literal from matching inside a longer one before the longer one is handled.
pub fn build_replacement_rules(rules: &mut [ReplacementRule]) {
    rules.sort_by(|a, b| b.literal.len().cmp(&a.literal.len()));
}

/// Apply replacement rules to a string, in the order given.
///
/// Rules are applied one after another (callers sort them longest-first with
/// [`build_replacement_rules`]). Each rule only matches text that came from
/// `content`: replacement text emitted by an earlier rule is frozen and never
/// rescanned, so a later, shorter literal cannot corrupt an already-inserted
/// template expression. Rules with an empty literal are ignored.
///
/// Returns the modified string and the number of replacements made.
pub fn apply_replacements(content: &str, rules: &[ReplacementRule]) -> (String, usize) {
    /// One piece of the output under construction.
    enum Segment<'a> {
        /// Text from the original content; later rules may still match it.
        Source(&'a str),
        /// Replacement text already emitted; excluded from further matching.
        Replaced(&'a str),
    }

    if rules.is_empty() {
        return (content.to_string(), 0);
    }

    let mut segments = vec![Segment::Source(content)];
    let mut total_count = 0;

    for rule in rules.iter().filter(|rule| !rule.literal.is_empty()) {
        let mut next = Vec::with_capacity(segments.len());

        for segment in segments {
            match segment {
                Segment::Replaced(_) => next.push(segment),
                Segment::Source(text) => {
                    // Every boundary between two split pieces is one match
                    // (non-overlapping, leftmost-first, like `str::replace`).
                    for (index, piece) in text.split(rule.literal.as_str()).enumerate() {
                        if index > 0 {
                            next.push(Segment::Replaced(rule.replacement.as_str()));
                            total_count += 1;
                        }
                        if !piece.is_empty() {
                            next.push(Segment::Source(piece));
                        }
                    }
                }
            }
        }

        segments = next;
    }

    let mut result = String::with_capacity(content.len());
    for segment in &segments {
        match segment {
            Segment::Source(text) | Segment::Replaced(text) => result.push_str(text),
        }
    }

    (result, total_count)
}
+pub fn apply_path_replacements(path: &Path, rules: &[ReplacementRule]) -> PathBuf { + let mut components = Vec::new(); + + for component in path.components() { + match component { + std::path::Component::Normal(os_str) => { + let s = os_str.to_string_lossy(); + let (replaced, _) = apply_replacements(&s, rules); + components.push(replaced); + } + other => { + components.push(other.as_os_str().to_string_lossy().into_owned()); + } + } + } + + components.iter().collect() +} + +/// Count occurrences of a literal in a string. +pub fn count_occurrences(content: &str, literal: &str) -> usize { + if literal.is_empty() { + return 0; + } + content.matches(literal).count() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_rule(literal: &str, replacement: &str) -> ReplacementRule { + ReplacementRule { + literal: literal.to_string(), + replacement: replacement.to_string(), + variable: "test".to_string(), + variant: "test".to_string(), + } + } + + #[test] + fn test_apply_replacements_basic() { + let rules = vec![make_rule("my-app", "{{ project_name }}")]; + let (result, count) = apply_replacements("Welcome to my-app!", &rules); + assert_eq!(result, "Welcome to {{ project_name }}!"); + assert_eq!(count, 1); + } + + #[test] + fn test_apply_replacements_multiple() { + let rules = vec![make_rule("my-app", "{{ project_name }}")]; + let (result, count) = apply_replacements("my-app is great, use my-app", &rules); + assert_eq!( + result, + "{{ project_name }} is great, use {{ project_name }}" + ); + assert_eq!(count, 2); + } + + #[test] + fn test_longest_match_first() { + let mut rules = vec![ + make_rule("my", "{{ org }}"), + make_rule("my-app", "{{ project_name }}"), + ]; + build_replacement_rules(&mut rules); + + // "my-app" should match before "my" + assert_eq!(rules[0].literal, "my-app"); + assert_eq!(rules[1].literal, "my"); + } + + #[test] + fn test_apply_replacements_empty_rules() { + let (result, count) = apply_replacements("hello world", &[]); + assert_eq!(result, 
"hello world"); + assert_eq!(count, 0); + } + + #[test] + fn test_apply_path_replacements() { + let rules = vec![make_rule("my-app", "{{ project_name }}")]; + let path = Path::new("my-app/src/main.rs"); + let result = apply_path_replacements(path, &rules); + assert_eq!(result, PathBuf::from("{{ project_name }}/src/main.rs")); + } + + #[test] + fn test_count_occurrences() { + assert_eq!(count_occurrences("my-app and my-app", "my-app"), 2); + assert_eq!(count_occurrences("hello world", "missing"), 0); + assert_eq!(count_occurrences("anything", ""), 0); + } +} diff --git a/src/extract/scan.rs b/src/extract/scan.rs new file mode 100644 index 0000000..278fd75 --- /dev/null +++ b/src/extract/scan.rs @@ -0,0 +1,147 @@ +use std::path::{Path, PathBuf}; + +use walkdir::WalkDir; + +use super::exclude::should_exclude; +use crate::render::file::is_binary_file; + +/// A scanned file from the project directory. +#[derive(Debug, Clone)] +pub struct ScannedFile { + /// Path relative to the project root. + pub relative_path: PathBuf, + /// Absolute path on disk. + pub absolute_path: PathBuf, + /// Whether the file is binary. + pub is_binary: bool, + /// File content (only loaded for text files). + pub content: Option, +} + +/// Result of scanning a project directory. +#[derive(Debug)] +pub struct ScanResult { + pub files: Vec, + pub excluded_count: usize, +} + +/// Scan a project directory, applying exclude patterns. +/// +/// Returns all non-excluded files with their content loaded (for text files). 
+pub fn scan_project(project_dir: &Path, excludes: &[String]) -> crate::error::Result { + let project_dir = project_dir + .canonicalize() + .map_err(|e| crate::error::DicecutError::Io { + context: format!("canonicalizing project directory {}", project_dir.display()), + source: e, + })?; + + let mut files = Vec::new(); + let mut excluded_count = 0; + + for entry in WalkDir::new(&project_dir).min_depth(1) { + let entry = entry.map_err(|e| crate::error::DicecutError::Io { + context: format!("walking project directory: {}", e), + source: e + .into_io_error() + .unwrap_or_else(|| std::io::Error::other("walkdir error")), + })?; + + // Skip directories themselves (we only care about files) + if entry.file_type().is_dir() { + continue; + } + + let relative_path = entry + .path() + .strip_prefix(&project_dir) + .unwrap_or(entry.path()) + .to_path_buf(); + + if should_exclude(&relative_path, excludes) { + excluded_count += 1; + continue; + } + + let absolute_path = entry.path().to_path_buf(); + let is_binary = is_binary_file(&absolute_path); + + let content = if is_binary { + None + } else { + // If we can't read as UTF-8, treat as binary + std::fs::read_to_string(&absolute_path).ok() + }; + + files.push(ScannedFile { + relative_path, + absolute_path, + is_binary, + content, + }); + } + + Ok(ScanResult { + files, + excluded_count, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scan_project_basic() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("README.md"), "# Hello").unwrap(); + std::fs::create_dir(dir.path().join("src")).unwrap(); + std::fs::write(dir.path().join("src/main.rs"), "fn main() {}").unwrap(); + + let result = scan_project(dir.path(), &[]).unwrap(); + assert_eq!(result.files.len(), 2); + assert_eq!(result.excluded_count, 0); + } + + #[test] + fn test_scan_project_with_excludes() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("README.md"), "# Hello").unwrap(); + 
std::fs::create_dir(dir.path().join(".git")).unwrap(); + std::fs::write(dir.path().join(".git/config"), "").unwrap(); + + let excludes = vec![".git".to_string()]; + let result = scan_project(dir.path(), &excludes).unwrap(); + assert_eq!(result.files.len(), 1); + assert_eq!(result.excluded_count, 1); + assert_eq!(result.files[0].relative_path, PathBuf::from("README.md")); + } + + #[test] + fn test_scan_project_binary_detection() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("text.txt"), "hello").unwrap(); + std::fs::write( + dir.path().join("binary.bin"), + &(0..256).map(|i| i as u8).collect::>(), + ) + .unwrap(); + + let result = scan_project(dir.path(), &[]).unwrap(); + let text_file = result + .files + .iter() + .find(|f| f.relative_path.to_string_lossy() == "text.txt") + .unwrap(); + let binary_file = result + .files + .iter() + .find(|f| f.relative_path.to_string_lossy() == "binary.bin") + .unwrap(); + + assert!(!text_file.is_binary); + assert!(text_file.content.is_some()); + assert!(binary_file.is_binary); + assert!(binary_file.content.is_none()); + } +} diff --git a/src/extract/variants.rs b/src/extract/variants.rs new file mode 100644 index 0000000..e4259bd --- /dev/null +++ b/src/extract/variants.rs @@ -0,0 +1,329 @@ +use regex_lite::Regex; + +/// A case variant of a variable value, with its literal text and Tera expression. +#[derive(Debug, Clone, PartialEq)] +pub struct CaseVariant { + pub name: &'static str, + pub literal: String, + pub tera_expr: String, +} + +/// Split a string value into words for case variant generation. +/// +/// Handles kebab-case, snake_case, camelCase, PascalCase, dot.case, and space-separated. 
+pub fn split_into_words(value: &str) -> Vec { + if value.contains('-') { + return value.split('-').map(|s| s.to_lowercase()).collect(); + } + if value.contains('_') { + return value.split('_').map(|s| s.to_lowercase()).collect(); + } + if value.contains('.') { + return value.split('.').map(|s| s.to_lowercase()).collect(); + } + if value.contains(' ') { + return value.split_whitespace().map(|s| s.to_lowercase()).collect(); + } + + // camelCase / PascalCase splitting + let re = Regex::new(r"[A-Z][a-z]*|[a-z]+|[0-9]+").unwrap(); + let words: Vec = re + .find_iter(value) + .map(|m| m.as_str().to_lowercase()) + .collect(); + + if words.is_empty() { + vec![value.to_lowercase()] + } else { + words + } +} + +/// Detect if a value is "multi-word" in a way that supports case variants. +/// +/// Single words and space-separated phrases skip variant detection. +fn supports_case_variants(value: &str) -> bool { + let words = split_into_words(value); + if words.len() < 2 { + return false; + } + // Space-separated values (like author names) skip variant detection + if value.contains(' ') { + return false; + } + true +} + +fn to_kebab(words: &[String]) -> String { + words.join("-") +} + +fn to_snake(words: &[String]) -> String { + words.join("_") +} + +fn to_screaming_snake(words: &[String]) -> String { + words + .iter() + .map(|w| w.to_uppercase()) + .collect::>() + .join("_") +} + +fn to_screaming_kebab(words: &[String]) -> String { + words + .iter() + .map(|w| w.to_uppercase()) + .collect::>() + .join("-") +} + +fn to_pascal(words: &[String]) -> String { + words + .iter() + .map(|w| { + let mut chars = w.chars(); + match chars.next() { + Some(c) => { + let upper: String = c.to_uppercase().collect(); + upper + chars.as_str() + } + None => String::new(), + } + }) + .collect() +} + +fn to_camel(words: &[String]) -> String { + let pascal = to_pascal(words); + let mut chars = pascal.chars(); + match chars.next() { + Some(c) => { + let lower: String = c.to_lowercase().collect(); + 
/// Join words with `.` (`my.app`).
fn to_dot(words: &[String]) -> String {
    words.join(".")
}

/// Detect the canonical separator in the original value.
///
/// Checks for `-`, then `_`, then `.` and returns the first one present.
/// Values without any of them (PascalCase/camelCase, single words) are
/// treated as kebab canonical and yield `-`.
pub fn detect_separator(value: &str) -> &'static str {
    ["-", "_", "."]
        .iter()
        .copied()
        .find(|separator| value.contains(*separator))
        .unwrap_or("-")
}

/// Build a Tera expression for a variant, given the variable name and canonical separator.
///
/// The result is always wrapped in `{{ … }}`. A variant that matches the
/// canonical separator renders the bare variable; other separator-based
/// variants chain a `replace` filter (plus `upper` for the screaming forms).
/// `camel` refers to a separate `<var_name>_camel` variable, and unknown
/// variant names fall back to the bare variable.
fn tera_expr_for_variant(var_name: &str, variant_name: &str, canonical_sep: &str) -> String {
    // The variable with its canonical separator swapped for `target`, or the
    // bare variable when the value already uses `target`.
    let separated_by = |target: &str| -> String {
        if canonical_sep == target {
            var_name.to_string()
        } else {
            format!("{var_name} | replace(from=\"{canonical_sep}\", to=\"{target}\")")
        }
    };

    let inner = match variant_name {
        "kebab" => separated_by("-"),
        "snake" => separated_by("_"),
        "dot" => separated_by("."),
        "screaming_snake" => format!("{} | upper", separated_by("_")),
        "screaming_kebab" => format!("{} | upper", separated_by("-")),
        "pascal" => format!(
            "{var_name} | replace(from=\"{canonical_sep}\", to=\" \") | title | replace(from=\" \", to=\"\")"
        ),
        // No built-in camelCase filter in Tera, so we use a computed variable name
        "camel" => format!("{var_name}_camel"),
        _ => var_name.to_string(),
    };

    format!("{{{{ {inner} }}}}")
}
+/// Only returns variants whose literal differs from the canonical form. +/// Single-word values and space-separated phrases return only a verbatim replacement. +pub fn generate_variants(var_name: &str, value: &str) -> Vec { + if !supports_case_variants(value) { + return vec![CaseVariant { + name: "verbatim", + literal: value.to_string(), + tera_expr: format!("{{{{ {var_name} }}}}"), + }]; + } + + let words = split_into_words(value); + let canonical_sep = detect_separator(value); + + let candidates: Vec<(&str, String)> = vec![ + ("kebab", to_kebab(&words)), + ("snake", to_snake(&words)), + ("screaming_snake", to_screaming_snake(&words)), + ("screaming_kebab", to_screaming_kebab(&words)), + ("pascal", to_pascal(&words)), + ("camel", to_camel(&words)), + ("dot", to_dot(&words)), + ]; + + // Deduplicate: some variants produce the same literal (e.g., single-word) + let mut seen = std::collections::HashSet::new(); + let mut variants = Vec::new(); + + for (name, literal) in candidates { + if seen.insert(literal.clone()) { + let tera_expr = tera_expr_for_variant(var_name, name, canonical_sep); + variants.push(CaseVariant { + name, + literal, + tera_expr, + }); + } + } + + variants +} + +/// Build a computed Tera expression for a named variant variable. +/// +/// This is used in diecut.toml for computed variables like `project_name_snake`. 
/// Build a computed Tera expression for a named variant variable.
///
/// This is used in diecut.toml for computed variables like `project_name_snake`.
/// The result is a bare filter chain over `var_name`, without `{{ }}`
/// delimiters.
///
/// - `kebab`, `snake`, `dot`: swap the canonical separator for the target
///   one, or return the bare variable name when the value already uses it.
/// - `screaming_snake`, `screaming_kebab`: as above, then `upper`.
/// - `pascal`: turn the canonical separator into spaces, `title`-case, then
///   remove the spaces.
/// - `camel`: same chain as `pascal`. Tera has no built-in camelCase filter,
///   so this is an approximation that users may need to adjust.
/// - any other variant name: the bare variable name.
pub fn computed_expression(var_name: &str, variant_name: &str, canonical_sep: &str) -> String {
    // The variable with its canonical separator swapped for `target`, or the
    // bare variable when the value already uses `target`.
    let separated_by = |target: &str| -> String {
        if canonical_sep == target {
            var_name.to_string()
        } else {
            format!("{var_name} | replace(from=\"{canonical_sep}\", to=\"{target}\")")
        }
    };

    match variant_name {
        "kebab" => separated_by("-"),
        "snake" => separated_by("_"),
        "dot" => separated_by("."),
        "screaming_snake" => format!("{} | upper", separated_by("_")),
        "screaming_kebab" => format!("{} | upper", separated_by("-")),
        // Split on the value's actual separator. The camel chain used to
        // hard-code "-", which left `_` and `.` values unsplit.
        "pascal" | "camel" => format!(
            "{var_name} | replace(from=\"{canonical_sep}\", to=\" \") | title | replace(from=\" \", to=\"\")"
        ),
        _ => var_name.to_string(),
    }
}
= tera_expr_for_variant("project_name", "kebab", "-"); + assert_eq!(expr, "{{ project_name }}"); + } + + #[test] + fn test_tera_expr_snake_from_kebab() { + let expr = tera_expr_for_variant("project_name", "snake", "-"); + assert_eq!(expr, "{{ project_name | replace(from=\"-\", to=\"_\") }}"); + } +} diff --git a/src/lib.rs b/src/lib.rs index a57e60c..4091828 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,7 @@ pub mod adapter; pub mod answers; pub mod config; pub mod error; +pub mod extract; pub mod hooks; pub mod prompt; pub mod render; diff --git a/src/main.rs b/src/main.rs index 20cf462..f540fe9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -19,5 +19,13 @@ fn main() -> miette::Result<()> { template, output, data, defaults, overwrite, no_hooks, dry_run, verbose, ), Commands::List => commands::list::run(), + Commands::Extract { + source, + vars, + output, + in_place, + batch, + dry_run, + } => commands::extract::run(source, vars, output, in_place, batch, dry_run), } } diff --git a/tests/integration.rs b/tests/integration.rs index 4005080..1ad3a59 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use diecut::adapter; use diecut::config::load_config; +use diecut::extract::{execute_extraction, plan_extraction, ExtractOptions}; use diecut::prompt::PromptOptions; use diecut::render::{build_context, execute_plan, plan_render, walk_and_render}; use diecut::template::source::{resolve_source, resolve_source_full}; @@ -622,3 +623,295 @@ fn test_plan_generation_verbose_has_content() { "at least one rendered file should contain the resolved project name" ); } + +// ── Extract command tests ──────────────────────────────────────────────── + +#[test] +fn test_extract_batch_basic() { + // Create a simple project to extract from + let project = tempfile::tempdir().unwrap(); + std::fs::write(project.path().join("README.md"), "# my-app\nBy Jane Doe\n").unwrap(); + std::fs::create_dir(project.path().join("src")).unwrap(); + 
std::fs::write( + project.path().join("src/main.rs"), + "fn main() {\n println!(\"Welcome to my-app!\");\n}\n", + ) + .unwrap(); + std::fs::write( + project.path().join("Cargo.toml"), + "[package]\nname = \"my-app\"\nversion = \"0.1.0\"\n", + ) + .unwrap(); + + let output = tempfile::tempdir().unwrap(); + let output_path = output.path().join("extracted"); + + let options = ExtractOptions { + source_dir: project.path().to_path_buf(), + variables: vec![ + ("project_name".to_string(), "my-app".to_string()), + ("author".to_string(), "Jane Doe".to_string()), + ], + output_dir: Some(output_path.clone()), + in_place: false, + batch: true, + dry_run: false, + }; + + let plan = plan_extraction(&options).unwrap(); + execute_extraction(&plan, false).unwrap(); + + // Verify diecut.toml was created + assert!(output_path.join("diecut.toml").exists()); + let config_content = std::fs::read_to_string(output_path.join("diecut.toml")).unwrap(); + assert!(config_content.contains("[template]")); + assert!(config_content.contains("[variables.project_name]")); + assert!(config_content.contains("[variables.author]")); + + // Verify template directory structure + assert!(output_path.join("template").exists()); + + // Verify files with replacements got .die suffix + let template_dir = output_path.join("template"); + let has_die_files = walkdir::WalkDir::new(&template_dir) + .into_iter() + .filter_map(|e| e.ok()) + .any(|e| e.path().to_string_lossy().ends_with(".die")); + assert!(has_die_files, "should have files with .die suffix"); +} + +#[test] +fn test_extract_detects_case_variants() { + let project = tempfile::tempdir().unwrap(); + std::fs::write( + project.path().join("config.toml"), + "[package]\nname = \"my-app\"\nmodule = \"my_app\"\nclass = \"MyApp\"\nenv = \"MY_APP_PORT\"\n", + ) + .unwrap(); + + let output = tempfile::tempdir().unwrap(); + let output_path = output.path().join("extracted"); + + let options = ExtractOptions { + source_dir: project.path().to_path_buf(), + variables: 
/// Planning with `dry_run` set must yield a non-empty plan while leaving the
/// requested output directory uncreated.
#[test]
fn test_extract_dry_run_writes_nothing() {
    let source = tempfile::tempdir().unwrap();
    std::fs::write(source.path().join("hello.txt"), "hello my-app").unwrap();

    let scratch = tempfile::tempdir().unwrap();
    let destination = scratch.path().join("dry-run-output");

    let options = ExtractOptions {
        source_dir: source.path().to_path_buf(),
        variables: vec![("project_name".to_string(), "my-app".to_string())],
        output_dir: Some(destination.clone()),
        in_place: false,
        batch: true,
        dry_run: true,
    };

    // Plan only: the plan is never executed in this test.
    let plan = plan_extraction(&options).unwrap();
    assert!(!plan.files.is_empty());
    assert!(!plan.config_toml.is_empty());
    assert!(
        !destination.exists(),
        "dry run should not create output directory"
    );
}

/// A source directory that already contains `diecut.toml` is rejected.
#[test]
fn test_extract_rejects_already_template() {
    let source = tempfile::tempdir().unwrap();
    let manifest = source.path().join("diecut.toml");
    std::fs::write(manifest, "[template]\nname = \"existing\"").unwrap();

    let options = ExtractOptions {
        source_dir: source.path().to_path_buf(),
        variables: vec![("name".to_string(), "val".to_string())],
        output_dir: None,
        in_place: false,
        batch: true,
        dry_run: false,
    };

    assert!(plan_extraction(&options).is_err());
}

/// Planning with an empty variable list is an error.
#[test]
fn test_extract_rejects_no_variables() {
    let source = tempfile::tempdir().unwrap();
    std::fs::write(source.path().join("hello.txt"), "hello").unwrap();

    let options = ExtractOptions {
        source_dir: source.path().to_path_buf(),
        variables: Vec::new(),
        output_dir: None,
        in_place: false,
        batch: true,
        dry_run: false,
    };

    assert!(plan_extraction(&options).is_err());
}

/// A directory named after the variable value is rewritten to a
/// `{{ project_name }}` path component in the plan.
#[test]
fn test_extract_templates_path_components() {
    let source = tempfile::tempdir().unwrap();
    let nested = source.path().join("my-app");
    std::fs::create_dir(&nested).unwrap();
    std::fs::write(nested.join("main.rs"), "fn main() {}\n").unwrap();

    let scratch = tempfile::tempdir().unwrap();
    let destination = scratch.path().join("extracted");

    let options = ExtractOptions {
        source_dir: source.path().to_path_buf(),
        variables: vec![("project_name".to_string(), "my-app".to_string())],
        output_dir: Some(destination.clone()),
        in_place: false,
        batch: true,
        dry_run: false,
    };

    let plan = plan_extraction(&options).unwrap();

    let mut templated_paths = plan
        .files
        .iter()
        .map(|file| file.template_path.to_string_lossy())
        .filter(|path| path.contains("{{ project_name }}"));
    assert!(
        templated_paths.next().is_some(),
        "should template path components containing the variable value"
    );

    execute_extraction(&plan, false).unwrap();
}

/// Render the `basic-template` fixture into a project, extract that project
/// back into a template, and check the extracted layout and contents.
#[test]
fn test_extract_round_trip() {
    // Step 1: render a project from the fixture template.
    let resolved = adapter::resolve_template(&fixture_path("basic-template")).unwrap();

    let text = |s: &str| tera::Value::String(s.to_string());
    let variables: BTreeMap<String, tera::Value> = [
        ("project_name", text("my-app")),
        ("author", text("Jane Doe")),
        ("use_docker", tera::Value::Bool(false)),
        ("license", text("MIT")),
        ("project_slug", text("my-app")),
    ]
    .into_iter()
    .map(|(key, value)| (key.to_string(), value))
    .collect();

    let context = build_context(&variables);
    let generated = tempfile::tempdir().unwrap();
    walk_and_render(&resolved, generated.path(), &variables, &context).unwrap();

    // The rendered project lives under <generated>/my-app/.
    let project_dir = generated.path().join("my-app");
    assert!(project_dir.exists(), "generated project should exist");

    // Step 2: extract the rendered project back into a template.
    let scratch = tempfile::tempdir().unwrap();
    let template_root = scratch.path().join("extracted-template");

    let options = ExtractOptions {
        source_dir: project_dir.clone(),
        variables: vec![("project_name".to_string(), "my-app".to_string())],
        output_dir: Some(template_root.clone()),
        in_place: false,
        batch: true,
        dry_run: false,
    };

    let plan = plan_extraction(&options).unwrap();
    execute_extraction(&plan, false).unwrap();

    // Key structure of the extracted template.
    assert!(template_root.join("diecut.toml").exists());
    assert!(template_root.join("template").exists());

    let config = std::fs::read_to_string(template_root.join("diecut.toml")).unwrap();
    assert!(config.contains("project_name"));

    // Every regular file below template/.
    let template_files: Vec<_> = walkdir::WalkDir::new(template_root.join("template"))
        .into_iter()
        .filter_map(|entry| entry.ok())
        .filter(|entry| entry.file_type().is_file())
        .collect();
    assert!(!template_files.is_empty(), "should have template files");

    // Files carrying the .die suffix must contain template expressions.
    let die_files = template_files
        .iter()
        .filter(|entry| entry.path().to_string_lossy().ends_with(".die"));
    for entry in die_files {
        let content = std::fs::read_to_string(entry.path()).unwrap();
        assert!(
            content.contains("{{") || content.contains("{%"),
            "file {} should contain template syntax, got: {}",
            entry.path().display(),
            content
        );
    }
}
have template files"); + + // Files with .die suffix should contain template expressions + for entry in &template_files { + if entry.path().to_string_lossy().ends_with(".die") { + let content = std::fs::read_to_string(entry.path()).unwrap(); + assert!( + content.contains("{{") || content.contains("{%"), + "file {} should contain template syntax, got: {}", + entry.path().display(), + content + ); + } + } +} From 1ddb7d3995d00e308fe1aaf54ec45374b967e5b1 Mon Sep 17 00:00:00 2001 From: Robert Roskam Date: Fri, 27 Feb 2026 16:32:02 -0500 Subject: [PATCH 02/29] fix(extract): correct camelCase handling and computed variable expressions Add a custom `camelcase` Tera filter that properly lowercases the first word and title-cases the rest (e.g., "my-app" -> "myApp"). Register it via tera_with_filters() in the prompt engine and render walker. Fix computed variable expressions in generated diecut.toml to include {{ }} delimiters so they evaluate as Tera templates rather than being treated as literal text. 
--- src/extract/config_gen.rs | 2 +- src/extract/variants.rs | 9 +---- src/prompt/engine.rs | 2 +- src/render/context.rs | 81 ++++++++++++++++++++++++++++++++++++++- src/render/mod.rs | 2 +- src/render/walker.rs | 4 +- 6 files changed, 86 insertions(+), 14 deletions(-) diff --git a/src/extract/config_gen.rs b/src/extract/config_gen.rs index 5fd8222..91dea6c 100644 --- a/src/extract/config_gen.rs +++ b/src/extract/config_gen.rs @@ -84,7 +84,7 @@ pub fn generate_config_toml(options: &ConfigGenOptions) -> String { out.push_str("type = \"string\"\n"); out.push_str(&format!( "computed = {}\n", - escape_toml_string(&var.expression) + escape_toml_string(&format!("{{{{ {} }}}}", var.expression)) )); out.push('\n'); } diff --git a/src/extract/variants.rs b/src/extract/variants.rs index e4259bd..222a220 100644 --- a/src/extract/variants.rs +++ b/src/extract/variants.rs @@ -236,13 +236,8 @@ pub fn computed_expression(var_name: &str, variant_name: &str, canonical_sep: &s ("pascal", sep) => { format!("{var_name} | replace(from=\"{sep}\", to=\" \") | title | replace(from=\" \", to=\"\")") } - ("camel", _sep) => { - // Tera doesn't have a built-in camelCase, but we can chain: - // title-case then lowercase-first-char isn't directly expressible. - // Use a workaround: same as pascal — users may need to adjust. 
- format!( - "{var_name} | replace(from=\"-\", to=\" \") | title | replace(from=\" \", to=\"\")" - ) + ("camel", sep) => { + format!("{var_name} | camelcase(sep=\"{sep}\")") } ("kebab", sep) if sep != "-" => { format!("{var_name} | replace(from=\"{sep}\", to=\"-\")") diff --git a/src/prompt/engine.rs b/src/prompt/engine.rs index 4de7253..47fc847 100644 --- a/src/prompt/engine.rs +++ b/src/prompt/engine.rs @@ -96,7 +96,7 @@ fn evaluate_computed( computed_expr: &str, values: &BTreeMap, ) -> Result { - let mut tera = tera::Tera::default(); + let mut tera = crate::render::tera_with_filters(); tera.add_raw_template("__computed__", computed_expr) .map_err(|e| DicecutError::ComputedEvaluation { name: name.to_string(), diff --git a/src/render/context.rs b/src/render/context.rs index f29f678..4680c64 100644 --- a/src/render/context.rs +++ b/src/render/context.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use tera::{Context, Tera, Value}; @@ -10,14 +10,91 @@ pub fn build_context(variables: &BTreeMap) -> Context { context } +/// Create a Tera instance with custom filters registered. +/// +/// This should be used instead of `Tera::default()` anywhere templates or +/// computed expressions are evaluated, so that custom filters like `camelcase` +/// are available. +pub fn tera_with_filters() -> Tera { + let mut tera = Tera::default(); + tera.register_filter("camelcase", camelcase_filter); + tera +} + +/// Custom Tera filter: convert a separated string to camelCase. +/// +/// Usage: `{{ value | camelcase }}` or `{{ value | camelcase(sep="-") }}` +/// +/// Splits on the separator (default `-`), lowercases the first word, +/// title-cases the rest, and joins them. 
+fn camelcase_filter( + value: &Value, + args: &HashMap, +) -> Result { + let s = value + .as_str() + .ok_or_else(|| tera::Error::msg("camelcase filter requires a string value"))?; + + let sep = args + .get("sep") + .and_then(|v| v.as_str()) + .unwrap_or("-"); + + let words: Vec<&str> = s.split(sep).collect(); + if words.is_empty() { + return Ok(Value::String(String::new())); + } + + let mut result = words[0].to_lowercase(); + for word in &words[1..] { + let mut chars = word.chars(); + if let Some(first) = chars.next() { + result.extend(first.to_uppercase()); + result.push_str(&chars.as_str().to_lowercase()); + } + } + + Ok(Value::String(result)) +} + /// Evaluate a Tera boolean expression against a variable context. /// /// Returns `Ok(true)` if the expression evaluates to true, `Ok(false)` otherwise. /// Returns `Err` if the expression fails to parse or render. pub fn eval_bool_expr(expr: &str, context: &Context) -> std::result::Result { - let mut tera = Tera::default(); + let mut tera = tera_with_filters(); let template_str = format!("{{% if {expr} %}}true{{% else %}}false{{% endif %}}"); tera.add_raw_template("__when__", &template_str)?; let result = tera.render("__when__", context)?; Ok(result.trim() == "true") } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_camelcase_filter_kebab() { + let val = Value::String("my-cool-app".to_string()); + let args = HashMap::new(); + let result = camelcase_filter(&val, &args).unwrap(); + assert_eq!(result, Value::String("myCoolApp".to_string())); + } + + #[test] + fn test_camelcase_filter_custom_sep() { + let val = Value::String("my_cool_app".to_string()); + let mut args = HashMap::new(); + args.insert("sep".to_string(), Value::String("_".to_string())); + let result = camelcase_filter(&val, &args).unwrap(); + assert_eq!(result, Value::String("myCoolApp".to_string())); + } + + #[test] + fn test_camelcase_filter_single_word() { + let val = Value::String("hello".to_string()); + let args = HashMap::new(); + 
let result = camelcase_filter(&val, &args).unwrap(); + assert_eq!(result, Value::String("hello".to_string())); + } +} diff --git a/src/render/mod.rs b/src/render/mod.rs index 5674674..8a87f30 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,7 +2,7 @@ pub mod context; pub mod file; pub mod walker; -pub use context::{build_context, eval_bool_expr}; +pub use context::{build_context, eval_bool_expr, tera_with_filters}; pub use walker::{ execute_plan, plan_render, walk_and_render, GeneratedProject, GenerationPlan, PlannedFile, }; diff --git a/src/render/walker.rs b/src/render/walker.rs index caf9e26..97b1e96 100644 --- a/src/render/walker.rs +++ b/src/render/walker.rs @@ -2,7 +2,7 @@ use std::collections::BTreeMap; use std::path::{Path, PathBuf}; use globset::{Glob, GlobSet, GlobSetBuilder}; -use tera::{Context, Tera, Value}; +use tera::{Context, Value}; use walkdir::WalkDir; use crate::adapter::ResolvedTemplate; @@ -104,7 +104,7 @@ pub fn plan_render( source: e, })?; - let mut tera = Tera::default(); + let mut tera = crate::render::tera_with_filters(); let template_name = rel_str.to_string(); let parse_result = tera.add_raw_template(&template_name, &content); let render_result = parse_result.and_then(|_| tera.render(&template_name, context)); From 69099b85f4abb0b1030b782598d550cf3ba9e7c5 Mon Sep 17 00:00:00 2001 From: Robert Roskam Date: Fri, 27 Feb 2026 16:28:07 -0500 Subject: [PATCH 03/29] fix(extract): prevent substring collision in replacements Use word-boundary-aware matching so short variable values like "app" don't get replaced inside longer words like "application". A match is only accepted when the characters immediately before and after it are not word-like (alphanumeric, underscore, or hyphen). 
/// Whether a character is "word-like" for the purpose of boundary detection.
///
/// Alphanumeric, underscore, and hyphen are all considered word characters
/// because they appear as separators in identifiers (kebab-case, snake_case).
fn is_word_char(c: char) -> bool {
    c.is_alphanumeric() || c == '_' || c == '-'
}

/// Replace `literal` in `text` only at word boundaries.
///
/// A match is at a word boundary when the characters immediately before and
/// after it are not word-like (see [`is_word_char`]), or when the match touches
/// the start/end of `text`. Matches that fail the boundary test are copied
/// through unchanged and the search resumes one character later.
///
/// Returns the rewritten string together with the number of replacements made.
/// An empty `literal`, or one longer than `text`, yields `text` unchanged and a
/// count of zero.
fn replace_whole_word(text: &str, literal: &str, replacement: &str) -> (String, usize) {
    if literal.is_empty() || text.len() < literal.len() {
        return (text.to_string(), 0);
    }

    let mut result = String::with_capacity(text.len());
    let mut count = 0;
    // Byte offset of the first not-yet-copied character; always a char boundary.
    let mut cursor = 0;

    while let Some(offset) = text[cursor..].find(literal) {
        let match_start = cursor + offset;
        let match_end = match_start + literal.len();

        let ok_before = text[..match_start]
            .chars()
            .next_back()
            .map_or(true, |c| !is_word_char(c));
        let ok_after = text[match_end..]
            .chars()
            .next()
            .map_or(true, |c| !is_word_char(c));

        if ok_before && ok_after {
            result.push_str(&text[cursor..match_start]);
            result.push_str(replacement);
            count += 1;
            cursor = match_end;
        } else {
            // Not a word boundary: step over exactly one character of the
            // rejected match. Using the character's UTF-8 width (rather than a
            // fixed one byte) keeps `cursor` on a char boundary even when the
            // rejected match is a multi-byte character at the end of `text`.
            let step = text[match_start..]
                .chars()
                .next()
                .map_or(1, char::len_utf8);
            let next = match_start + step;
            result.push_str(&text[cursor..next]);
            cursor = next;
        }
    }

    result.push_str(&text[cursor..]);
    (result, count)
}
pub fn apply_replacements(content: &str, rules: &[ReplacementRule]) -> (String, usize) { if rules.is_empty() { @@ -34,9 +102,9 @@ pub fn apply_replacements(content: &str, rules: &[ReplacementRule]) -> (String, if rule.literal.is_empty() { continue; } - let count = result.matches(&rule.literal).count(); + let (replaced, count) = replace_whole_word(&result, &rule.literal, &rule.replacement); if count > 0 { - result = result.replace(&rule.literal, &rule.replacement); + result = replaced; total_count += count; } } @@ -140,4 +208,45 @@ mod tests { assert_eq!(count_occurrences("hello world", "missing"), 0); assert_eq!(count_occurrences("anything", ""), 0); } + + #[test] + fn test_no_substring_collision_suffix() { + let rules = vec![make_rule("app", "{{ name }}")]; + let (result, count) = apply_replacements("application startup", &rules); + assert_eq!(result, "application startup"); + assert_eq!(count, 0); + } + + #[test] + fn test_no_substring_collision_prefix() { + let rules = vec![make_rule("app", "{{ name }}")]; + let (result, count) = apply_replacements("webapp is cool", &rules); + assert_eq!(result, "webapp is cool"); + assert_eq!(count, 0); + } + + #[test] + fn test_standalone_match_with_punctuation() { + let rules = vec![make_rule("app", "{{ name }}")]; + let (result, count) = apply_replacements("run app. start app!", &rules); + assert_eq!(result, "run {{ name }}. 
start {{ name }}!"); + assert_eq!(count, 2); + } + + #[test] + fn test_match_at_string_boundaries() { + let rules = vec![make_rule("app", "{{ name }}")]; + let (result, count) = apply_replacements("app", &rules); + assert_eq!(result, "{{ name }}"); + assert_eq!(count, 1); + } + + #[test] + fn test_compound_literal_still_matches() { + // Multi-word literals like "my-app" should still match inside strings + let rules = vec![make_rule("my-app", "{{ name }}")]; + let (result, count) = apply_replacements("name = \"my-app\"", &rules); + assert_eq!(result, "name = \"{{ name }}\""); + assert_eq!(count, 1); + } } From f5ad9edc86bc074c1541d12b5f7f3caf12cf92cc Mon Sep 17 00:00:00 2001 From: Robert Roskam Date: Fri, 27 Feb 2026 16:26:38 -0500 Subject: [PATCH 04/29] refactor(extract): use computed variable names in template files Instead of repeating verbose inline filter chains like {{ project_name | replace(from="-", to=" ") | title | replace(from=" ", to="") }} in every template file, reference the computed variable names already defined in diecut.toml (e.g., {{ project_name_pascal }}). Extract is_canonical_variant() as a public helper to deduplicate the canonical-variant check between replacement rule building and computed variable generation. --- src/extract/mod.rs | 10 +++----- src/extract/variants.rs | 57 ++++++++++++++--------------------------- 2 files changed, 23 insertions(+), 44 deletions(-) diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 2364891..ba568bd 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -23,7 +23,9 @@ use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; use self::scan::{scan_project, ScannedFile}; -use self::variants::{computed_expression, detect_separator, generate_variants, CaseVariant}; +use self::variants::{ + computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, +}; /// A variable with its value and confirmed case variants. 
/// Check whether a variant is the canonical one (matches the input separator).
///
/// Only `kebab`, `snake` and `dot` can be canonical, each for its own
/// separator (`-`, `_`, `.`). Canonical variants use the bare `{{ var_name }}`
/// expression and do not get a computed variable in diecut.toml.
pub fn is_canonical_variant(variant_name: &str, canonical_sep: &str) -> bool {
    let own_separator = match variant_name {
        "kebab" => "-",
        "snake" => "_",
        "dot" => ".",
        _ => return false,
    };
    canonical_sep == own_separator
}

/// Build a Tera expression for a variant, given the variable name and canonical separator.
///
/// The `verbatim` variant and the canonical variant render as `{{ var_name }}`.
/// Every other variant references its computed variable, named
/// `<var_name>_<variant_name>` (e.g. `{{ var_name_snake }}`).
fn tera_expr_for_variant(var_name: &str, variant_name: &str, canonical_sep: &str) -> String {
    let uses_bare_name =
        variant_name == "verbatim" || is_canonical_variant(variant_name, canonical_sep);

    let identifier = if uses_bare_name {
        var_name.to_string()
    } else {
        format!("{var_name}_{variant_name}")
    };

    format!("{{{{ {identifier} }}}}")
}
@@ -319,6 +300,6 @@ mod tests { #[test] fn test_tera_expr_snake_from_kebab() { let expr = tera_expr_for_variant("project_name", "snake", "-"); - assert_eq!(expr, "{{ project_name | replace(from=\"-\", to=\"_\") }}"); + assert_eq!(expr, "{{ project_name_snake }}"); } } From 5ccdaa181f8ed10e567351ded2441b0a903a8939 Mon Sep 17 00:00:00 2001 From: Robert Roskam Date: Fri, 27 Feb 2026 19:04:32 -0500 Subject: [PATCH 05/29] feat(extract): add auto-detection of template variables Add 4-tier automatic variable detection for `diecut extract`: - Tier 1: Directory name (0.95 confidence) - Tier 2: Ecosystem configs - Cargo.toml, package.json, pyproject.toml, go.mod (0.85-0.90 confidence) - Tier 3: Git metadata - remote org, user.name (0.65-0.70 confidence) - Tier 4: Frequency analysis with Levenshtein merging (scored 0.30-1.0) Auto-detection runs when no --var flags are provided. Includes noise filtering for language keywords, common libraries, file format words, and stopwords. Scoring emphasizes variant diversity to prefer identifiers that appear in multiple case forms. 
--- Cargo.lock | 1 + Cargo.toml | 1 + src/cli.rs | 4 + src/commands/extract.rs | 5 + src/error.rs | 2 +- src/extract/auto_detect.rs | 1315 ++++++++++++++++++++++++++++++++++++ src/extract/mod.rs | 124 +++- src/main.rs | 3 +- tests/integration.rs | 145 ++++ 9 files changed, 1593 insertions(+), 7 deletions(-) create mode 100644 src/extract/auto_detect.rs diff --git a/Cargo.lock b/Cargo.lock index ce434bc..113ae0e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -462,6 +462,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "strsim", "tempfile", "tera", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index 3a5902b..64ef434 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ sha2 = "0.10" fs4 = "0.12" content_inspector = "0.2" indexmap = { version = "2.11.4", features = ["serde"] } +strsim = "0.11" [dev-dependencies] rstest = "0.23" diff --git a/src/cli.rs b/src/cli.rs index a051ff6..0301617 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -74,5 +74,9 @@ pub enum Commands { /// Show what would be extracted without writing files #[arg(long)] dry_run: bool, + + /// Auto-detect template variables from project metadata and content + #[arg(long)] + auto: bool, }, } diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 86fee13..6576387 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -13,9 +13,13 @@ pub fn run( in_place: bool, batch: bool, dry_run: bool, + auto: bool, ) -> Result<()> { let variables = parse_vars(&vars)?; + // Default auto to true when no vars are provided + let auto = auto || variables.is_empty(); + let options = ExtractOptions { source_dir: PathBuf::from(&source), variables, @@ -23,6 +27,7 @@ pub fn run( in_place, batch, dry_run, + auto, }; let plan = plan_extraction(&options)?; diff --git a/src/error.rs b/src/error.rs index d6a4ee4..ccac949 100644 --- a/src/error.rs +++ b/src/error.rs @@ -124,7 +124,7 @@ pub enum DicecutError { #[error("No variables provided for extraction")] #[diagnostic(help( - "Use --var key=value 
to specify which values should become template variables" + "Use --var key=value to specify variables, or --auto to detect them automatically" ))] ExtractNoVariables, diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs new file mode 100644 index 0000000..5cc76da --- /dev/null +++ b/src/extract/auto_detect.rs @@ -0,0 +1,1315 @@ +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::process::Command; + +use regex_lite::Regex; + +use super::scan::ScanResult; +use super::variants::split_into_words; + +/// Confidence tier indicating how a candidate variable was detected. +#[derive(Debug, Clone, PartialEq)] +pub enum ConfidenceTier { + DirectoryName, + ConfigFile, + GitMetadata, + FrequencyAnalysis, +} + +impl std::fmt::Display for ConfidenceTier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ConfidenceTier::DirectoryName => write!(f, "directory name"), + ConfidenceTier::ConfigFile => write!(f, "config file"), + ConfidenceTier::GitMetadata => write!(f, "git metadata"), + ConfidenceTier::FrequencyAnalysis => write!(f, "frequency analysis"), + } + } +} + +/// A candidate variable detected by auto-detection. +#[derive(Debug, Clone)] +pub struct DetectedCandidate { + pub suggested_name: String, + pub value: String, + pub tier: ConfidenceTier, + pub confidence: f64, + pub reason: String, + pub file_count: usize, + pub total_occurrences: usize, +} + +/// Result of running auto-detection. +#[derive(Debug)] +pub struct AutoDetectResult { + pub candidates: Vec, +} + +struct TokenCluster { + normalized: Vec, + literals: Vec, + total_occurrences: usize, + file_count: usize, + matches_dir_name: bool, + in_config_value: bool, +} + +// ── Entry point ────────────────────────────────────────────────────────── + +/// Run all 4 auto-detection tiers against a scanned project. 
+pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectResult { + let mut candidates = Vec::new(); + + // Tier 1: Directory name + candidates.extend(detect_directory_name(project_dir, scan_result)); + + // Tier 2: Ecosystem config files + candidates.extend(detect_config_files(project_dir, scan_result)); + + // Tier 3: Git metadata + candidates.extend(detect_git_metadata(project_dir, scan_result)); + + // Collect values already covered by tiers 1-3 + let covered_values: HashSet = candidates + .iter() + .map(|c| c.value.to_lowercase()) + .collect(); + + // Collect config values for frequency analysis boosting + let config_values: HashSet = candidates + .iter() + .filter(|c| c.tier == ConfidenceTier::ConfigFile) + .map(|c| c.value.to_lowercase()) + .collect(); + + let dir_name = project_dir + .file_name() + .map(|n| n.to_string_lossy().to_lowercase()) + .unwrap_or_default(); + + // Tier 4: Frequency analysis + candidates.extend(detect_frequency( + scan_result, + &covered_values, + &config_values, + &dir_name, + )); + + // Deduplicate by normalized word list, keeping highest confidence + deduplicate_candidates(&mut candidates); + + // Sort by confidence descending + candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + + AutoDetectResult { candidates } +} + +// ── Tier 1: Directory name ─────────────────────────────────────────────── + +const GENERIC_DIR_NAMES: &[&str] = &[ + "src", "app", "project", "tmp", "temp", "build", "dist", "out", "output", "lib", "bin", + "test", "tests", "example", "examples", "docs", "doc", "assets", "public", "static", + "vendor", "node_modules", "target", "pkg", "cmd", "internal", "api", "web", "server", + "client", "frontend", "backend", "service", "services", "workspace", "repo", "code", +]; + +fn detect_directory_name(project_dir: &Path, scan_result: &ScanResult) -> Vec { + let dir_name = match project_dir.file_name() { + Some(name) => name.to_string_lossy().to_string(), + None => return 
vec![], + }; + + if GENERIC_DIR_NAMES.contains(&dir_name.to_lowercase().as_str()) { + return vec![]; + } + + // Must have at least 2 chars + if dir_name.len() < 2 { + return vec![]; + } + + let (file_count, total_occurrences) = count_occurrences(&dir_name, scan_result); + + vec![DetectedCandidate { + suggested_name: "project_name".to_string(), + value: dir_name.clone(), + tier: ConfidenceTier::DirectoryName, + confidence: 0.95, + reason: format!("directory name \"{}\"", dir_name), + file_count, + total_occurrences, + }] +} + +// ── Tier 2: Ecosystem config files ─────────────────────────────────────── + +fn detect_config_files( + project_dir: &Path, + scan_result: &ScanResult, +) -> Vec { + let mut candidates = Vec::new(); + + if let Some(mut c) = parse_cargo_toml(project_dir, scan_result) { + candidates.append(&mut c); + } + if let Some(mut c) = parse_package_json(project_dir, scan_result) { + candidates.append(&mut c); + } + if let Some(mut c) = parse_pyproject_toml(project_dir, scan_result) { + candidates.append(&mut c); + } + if let Some(mut c) = parse_go_mod(project_dir, scan_result) { + candidates.append(&mut c); + } + + candidates +} + +fn parse_cargo_toml( + project_dir: &Path, + scan_result: &ScanResult, +) -> Option> { + let path = project_dir.join("Cargo.toml"); + let content = std::fs::read_to_string(&path).ok()?; + let parsed: toml::Value = content.parse().ok()?; + + let mut candidates = Vec::new(); + + if let Some(name) = parsed + .get("package") + .and_then(|p| p.get("name")) + .and_then(|n| n.as_str()) + { + let (file_count, total_occurrences) = count_occurrences(name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "project_name".to_string(), + value: name.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.90, + reason: "Cargo.toml [package].name".to_string(), + file_count, + total_occurrences, + }); + } + + if let Some(authors) = parsed + .get("package") + .and_then(|p| p.get("authors")) + .and_then(|a| 
a.as_array()) + { + if let Some(first) = authors.first().and_then(|a| a.as_str()) { + let author = strip_email(first); + if !author.is_empty() { + let (file_count, total_occurrences) = count_occurrences(&author, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "author".to_string(), + value: author.clone(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "Cargo.toml [package].authors[0]".to_string(), + file_count, + total_occurrences, + }); + } + } + } + + Some(candidates) +} + +fn parse_package_json( + project_dir: &Path, + scan_result: &ScanResult, +) -> Option> { + let path = project_dir.join("package.json"); + let content = std::fs::read_to_string(&path).ok()?; + let parsed: serde_json::Value = serde_json::from_str(&content).ok()?; + + let mut candidates = Vec::new(); + + if let Some(name) = parsed.get("name").and_then(|n| n.as_str()) { + // Strip npm scope @org/ + let clean_name = strip_npm_scope(name); + let (file_count, total_occurrences) = count_occurrences(clean_name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "project_name".to_string(), + value: clean_name.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.90, + reason: "package.json \"name\"".to_string(), + file_count, + total_occurrences, + }); + } + + if let Some(author) = parsed.get("author") { + let author_str = match author { + serde_json::Value::String(s) => Some(strip_email(s)), + serde_json::Value::Object(obj) => { + obj.get("name").and_then(|n| n.as_str()).map(String::from) + } + _ => None, + }; + if let Some(author_name) = author_str { + if !author_name.is_empty() { + let (file_count, total_occurrences) = + count_occurrences(&author_name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "author".to_string(), + value: author_name, + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "package.json \"author\"".to_string(), + file_count, + total_occurrences, + }); + } + } + } + + 
Some(candidates) +} + +fn parse_pyproject_toml( + project_dir: &Path, + scan_result: &ScanResult, +) -> Option> { + let path = project_dir.join("pyproject.toml"); + let content = std::fs::read_to_string(&path).ok()?; + let parsed: toml::Value = content.parse().ok()?; + + let mut candidates = Vec::new(); + + if let Some(name) = parsed + .get("project") + .and_then(|p| p.get("name")) + .and_then(|n| n.as_str()) + { + let (file_count, total_occurrences) = count_occurrences(name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "project_name".to_string(), + value: name.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.90, + reason: "pyproject.toml [project].name".to_string(), + file_count, + total_occurrences, + }); + } + + if let Some(authors) = parsed + .get("project") + .and_then(|p| p.get("authors")) + .and_then(|a| a.as_array()) + { + if let Some(first) = authors.first() { + let author_name = first + .get("name") + .and_then(|n| n.as_str()) + .or_else(|| first.as_str().map(|s| s)) + .map(|s| strip_email(s)); + if let Some(name) = author_name { + if !name.is_empty() { + let (file_count, total_occurrences) = count_occurrences(&name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "author".to_string(), + value: name, + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "pyproject.toml [project].authors[0].name".to_string(), + file_count, + total_occurrences, + }); + } + } + } + } + + Some(candidates) +} + +fn parse_go_mod( + project_dir: &Path, + scan_result: &ScanResult, +) -> Option> { + let path = project_dir.join("go.mod"); + let content = std::fs::read_to_string(&path).ok()?; + + let re = Regex::new(r"^module\s+(\S+)").unwrap(); + let module_path = re + .captures(&content)? + .get(1)? 
+ .as_str(); + + let segments: Vec<&str> = module_path.split('/').collect(); + + // Extract last path segment as project name + let name = segments.last().copied()?; + if name.is_empty() { + return None; + } + + let mut candidates = Vec::new(); + + let (file_count, total_occurrences) = count_occurrences(name, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "project_name".to_string(), + value: name.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.90, + reason: format!("go.mod module \"{}\"", module_path), + file_count, + total_occurrences, + }); + + // Extract org name (second-to-last segment for github.com/org/repo patterns) + if segments.len() >= 3 { + let org = segments[segments.len() - 2]; + if !org.is_empty() && org != name { + let (org_file_count, org_total_occurrences) = count_occurrences(org, scan_result); + if org_total_occurrences > 0 { + candidates.push(DetectedCandidate { + suggested_name: "org_name".to_string(), + value: org.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: format!("go.mod module org \"{}\"", org), + file_count: org_file_count, + total_occurrences: org_total_occurrences, + }); + } + } + } + + Some(candidates) +} + +// ── Tier 3: Git metadata ───────────────────────────────────────────────── + +fn detect_git_metadata( + project_dir: &Path, + scan_result: &ScanResult, +) -> Vec { + let mut candidates = Vec::new(); + + // Try to get remote origin URL + if let Some(url) = git_config_get(project_dir, "remote.origin.url") { + if let Some(org) = parse_org_from_url(&url) { + let (file_count, total_occurrences) = count_occurrences(&org, scan_result); + // Only include if org name actually appears in files + if total_occurrences > 0 { + candidates.push(DetectedCandidate { + suggested_name: "org_name".to_string(), + value: org.clone(), + tier: ConfidenceTier::GitMetadata, + confidence: 0.70, + reason: format!("git remote org \"{}\"", org), + file_count, + total_occurrences, + }); + 
/// Extracts the organisation (the first path segment) from a git remote URL.
///
/// Recognised forms:
/// - scp-like SSH syntax with any user: `git@github.com:org/repo.git`
/// - URLs with an `https`, `http`, `ssh`, `git` or `git+ssh` scheme, with
///   optional credentials and port: `ssh://git@host:2222/org/repo.git`
///
/// Returns `None` when the input is not one of these forms or has no path
/// segment after the host (for example `https://github.com`, a `file://`
/// URL, or a local filesystem path).
fn parse_org_from_url(url: &str) -> Option<String> {
    const REMOTE_SCHEMES: &[&str] = &["https", "http", "ssh", "git", "git+ssh"];

    let url = url.trim();

    let repo_path = match url.split_once("://") {
        Some((scheme, rest)) => {
            if !REMOTE_SCHEMES.contains(&scheme) {
                return None;
            }
            // Skip the authority (`[user[:pass]@]host[:port]`); what follows
            // the first slash is the repository path.
            rest.split_once('/')?.1
        }
        None => {
            // scp-like syntax carries no scheme: `user@host:org/repo`.
            let (authority, path) = url.split_once(':')?;
            // Requiring `user@` keeps Windows drive paths such as
            // `C:/repos/x` from being mistaken for a remote.
            if !authority.contains('@') || authority.contains('/') {
                return None;
            }
            path
        }
    };

    repo_path
        .split('/')
        .find(|segment| !segment.is_empty())
        .map(str::to_string)
}
HashMap = HashMap::new(); + + for (file_idx, file) in scan_result.files.iter().enumerate() { + if let Some(ref content) = file.content { + for mat in token_re.find_iter(content) { + let token = mat.as_str().to_string(); + token_file_map + .entry(token.clone()) + .or_default() + .insert(file_idx); + *token_counts.entry(token).or_insert(0) += 1; + } + } + } + + // Build clusters by normalized word list + let mut clusters: HashMap = HashMap::new(); + + for (token, count) in &token_counts { + let words = split_into_words(token); + + // Filter noise + if words.iter().all(|w| w.len() < 3) { + continue; + } + if is_noise_token(token, &words) { + continue; + } + + let normalized_key = words.join(" "); + + let file_count = token_file_map + .get(token) + .map(|s| s.len()) + .unwrap_or(0); + + // Skip single-occurrence-single-file tokens + if *count == 1 && file_count <= 1 { + continue; + } + + let matches_dir = normalized_key == split_into_words(dir_name).join(" ") + && !dir_name.is_empty(); + let in_config = config_values.contains(&token.to_lowercase()); + + let cluster = clusters.entry(normalized_key.clone()).or_insert_with(|| { + TokenCluster { + normalized: words.clone(), + literals: Vec::new(), + total_occurrences: 0, + file_count: 0, + matches_dir_name: false, + in_config_value: false, + } + }); + + if !cluster.literals.contains(token) { + cluster.literals.push(token.clone()); + } + cluster.total_occurrences += count; + // Merge file sets for accurate file_count + let files_for_token = token_file_map.get(token).map(|s| s.len()).unwrap_or(0); + if files_for_token > cluster.file_count { + cluster.file_count = files_for_token; + } + cluster.matches_dir_name = cluster.matches_dir_name || matches_dir; + cluster.in_config_value = cluster.in_config_value || in_config; + } + + // Merge near-misses using Levenshtein distance + merge_similar_clusters(&mut clusters); + + // Score and convert to candidates + let mut freq_candidates: Vec = Vec::new(); + + for (key, cluster) in 
&clusters { + // Skip if already covered by higher tiers + if cluster.literals.iter().any(|l| covered_values.contains(&l.to_lowercase())) { + continue; + } + + let score = score_cluster(cluster); + + // Filter low-scoring candidates + if score < 0.30 { + continue; + } + + let best_literal = &cluster.literals[0]; + let suggested_name = suggest_variable_name(&cluster.normalized, key); + + freq_candidates.push(DetectedCandidate { + suggested_name, + value: best_literal.clone(), + tier: ConfidenceTier::FrequencyAnalysis, + confidence: score, + reason: format!( + "{} occurrences across {} files, {} variant(s)", + cluster.total_occurrences, + cluster.file_count, + cluster.literals.len() + ), + file_count: cluster.file_count, + total_occurrences: cluster.total_occurrences, + }); + } + + // Sort by confidence, take top 5 + freq_candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + freq_candidates.truncate(5); + + freq_candidates +} + +fn score_cluster(cluster: &TokenCluster) -> f64 { + // Occurrence count (log-scaled, 0.0..1.0) + let occ_score = (cluster.total_occurrences as f64).ln_1p() / 10.0_f64.ln_1p(); + let occ_score = occ_score.min(1.0); + + // File spread (log-scaled, 0.0..1.0) + let file_score = (cluster.file_count as f64).ln_1p() / 10.0_f64.ln_1p(); + let file_score = file_score.min(1.0); + + // Variant diversity + let variant_score = match cluster.literals.len() { + 0 | 1 => 0.0, + 2 => 0.5, + 3 => 0.75, + _ => 1.0, + }; + + // Directory name match (binary) + let dir_score = if cluster.matches_dir_name { 1.0 } else { 0.0 }; + + // Config value match (binary) + let config_score = if cluster.in_config_value { 1.0 } else { 0.0 }; + + 0.15 * occ_score + 0.20 * file_score + 0.35 * variant_score + 0.20 * dir_score + 0.10 * config_score +} + +fn merge_similar_clusters(clusters: &mut HashMap) { + let keys: Vec = clusters.keys().cloned().collect(); + let mut merge_map: HashMap = HashMap::new(); + + for i in 0..keys.len() { + for j in (i + 
1)..keys.len() { + if merge_map.contains_key(&keys[j]) { + continue; + } + let dist = strsim::levenshtein(&keys[i], &keys[j]); + if dist <= 1 { + let size_i = clusters.get(&keys[i]).map(|c| c.total_occurrences).unwrap_or(0); + let size_j = clusters.get(&keys[j]).map(|c| c.total_occurrences).unwrap_or(0); + if size_i >= size_j { + merge_map.insert(keys[j].clone(), keys[i].clone()); + } else { + merge_map.insert(keys[i].clone(), keys[j].clone()); + } + } + } + } + + for (from, to) in &merge_map { + if let Some(removed) = clusters.remove(from) { + if let Some(target) = clusters.get_mut(to) { + for lit in removed.literals { + if !target.literals.contains(&lit) { + target.literals.push(lit); + } + } + target.total_occurrences += removed.total_occurrences; + if removed.file_count > target.file_count { + target.file_count = removed.file_count; + } + target.matches_dir_name = target.matches_dir_name || removed.matches_dir_name; + target.in_config_value = target.in_config_value || removed.in_config_value; + } + } + } +} + +fn suggest_variable_name(words: &[String], _key: &str) -> String { + if words.len() <= 3 { + words.join("_") + } else { + // Truncate long names + words[..3].join("_") + } +} + +// ── Noise filtering ────────────────────────────────────────────────────── + +fn is_noise_token(token: &str, words: &[String]) -> bool { + let lower = token.to_lowercase(); + + // Too short + if lower.len() < 3 { + return true; + } + + // Language keywords + if LANGUAGE_KEYWORDS.contains(&lower.as_str()) { + return true; + } + + // Common library names + if COMMON_LIBRARIES.contains(&lower.as_str()) { + return true; + } + + // Stopwords (individual words) + if words.len() == 1 && STOPWORDS.contains(&lower.as_str()) { + return true; + } + + // All words are stopwords, file-format words, or very short + if words.iter().all(|w| { + w.len() < 3 + || STOPWORDS.contains(&w.as_str()) + || FILE_FORMAT_WORDS.contains(&w.as_str()) + }) { + return true; + } + + false +} + +const 
/// Reserved words from several programming languages, all lower-case.
///
/// `is_noise_token` lower-cases the whole token and rejects it when it is
/// found here, so the check is case-insensitive and applies to the complete
/// token only, not to the individual words of a compound identifier.
///
/// The language groups are organisational. A few entries ("class", "with",
/// "yield", "interface") appear under more than one group, which is harmless
/// for a membership test.
const LANGUAGE_KEYWORDS: &[&str] = &[
    // Rust
    "async", "await", "break", "const", "continue", "crate", "dyn", "else", "enum", "extern",
    "false", "fn", "for", "if", "impl", "in", "let", "loop", "match", "mod", "move", "mut",
    "pub", "ref", "return", "self", "static", "struct", "super", "trait", "true", "type",
    "unsafe", "use", "where", "while", "yield",
    // JS/TS
    "abstract", "arguments", "boolean", "byte", "case", "catch", "char", "class", "debugger",
    "default", "delete", "do", "double", "eval", "export", "extends", "final", "finally",
    "float", "function", "goto", "implements", "import", "instanceof", "int", "interface",
    "long", "native", "new", "null", "package", "private", "protected", "public", "short",
    "switch", "synchronized", "this", "throw", "throws", "transient", "try", "typeof",
    "undefined", "var", "void", "volatile", "with",
    // Python
    "and", "as", "assert", "class", "def", "del", "elif", "except", "exec", "from",
    "global", "is", "lambda", "nonlocal", "not", "or", "pass", "print", "raise",
    "with", "yield",
    // Go
    // NOTE(review): "goroutine" is not a reserved word in Go — confirm it is listed on purpose.
    "chan", "defer", "fallthrough", "go", "goroutine", "interface", "map", "range",
    "select", "func",
];
/// Words too generic to be a project-specific value.
///
/// `is_noise_token` consults this list in two ways:
/// - a single-word token is rejected when its lower-cased form is listed;
/// - a multi-word token is rejected when every word is shorter than three
///   characters, listed here, or listed in `FILE_FORMAT_WORDS`.
///
/// The per-word check compares words as given, without lower-casing them.
/// NOTE(review): this presumably relies on `split_into_words` returning
/// lower-case words — confirm.
///
/// Some entries are repeated ("has", "some"); duplicates do not affect a
/// membership test.
const STOPWORDS: &[&str] = &[
    // English stopwords
    "the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was",
    "one", "our", "out", "get", "set", "has", "his", "how", "its", "let", "may", "new",
    "now", "old", "see", "way", "who", "did", "got", "has", "him", "into", "just",
    "like", "make", "many", "some", "than", "them", "then", "very", "when", "with",
    "have", "from", "been", "also", "each", "that", "this", "will", "your", "what",
    "which", "their", "about", "would", "there", "could", "other", "after", "first",
    "these", "those", "being", "where", "should", "because",
    // Short generic words common in code identifiers
    "my", "no", "is", "on", "in", "to", "by", "do", "up", "so", "or",
    "app", "run", "dry", "log", "cmd", "arg", "env", "dir", "key", "map",
    "max", "min", "raw", "ref", "src", "str", "tmp", "url", "var", "buf",
    "msg", "req", "res", "err", "pkg", "lib", "bin", "fmt", "ctx", "cfg",
    "opt", "val", "idx", "len", "ptr", "num", "std", "gen", "pre", "sub",
    // Programming type/concept words
    "string", "number", "bool", "boolean", "array", "object", "value", "result",
    "error", "option", "none", "some", "true", "false", "null", "undefined",
    "file", "path", "name", "type", "data", "info", "list", "item", "node",
    "index", "count", "size", "length", "config", "settings", "options",
    "input", "output", "source", "target", "test", "main", "init", "setup",
    "todo", "fixme", "hack", "note", "warning", "debug", "trace", "level",
    "mode", "flag", "status", "state", "cache", "hook", "hooks",
];
/// Reduces an author string to a plain display name.
///
/// - `"Jane Doe <jane@example.com>"` → `"Jane Doe"`
/// - `"jane@example.com"` → `"jane"` (local part of a bare address)
/// - `"<jane@example.com>"` → `"jane"` (no display name, so the address is used)
/// - `"Jane Doe"` → `"Jane Doe"`
///
/// Surrounding whitespace is trimmed. The result is empty only when the
/// input has neither a display name nor an address local part.
pub fn strip_email(s: &str) -> String {
    // Text before the first `@`; a string without `@` is returned whole.
    fn local_part(address: &str) -> &str {
        address
            .split_once('@')
            .map_or(address, |(local, _)| local)
            .trim()
    }

    match s.split_once('<') {
        Some((name, rest)) => {
            let name = name.trim();
            if name.is_empty() {
                // Only a bracketed address was given: fall back to its local
                // part instead of returning an empty name.
                let address = rest.split_once('>').map_or(rest, |(inside, _)| inside);
                local_part(address).to_string()
            } else {
                name.to_string()
            }
        }
        None => local_part(s).to_string(),
    }
}
──────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + use crate::extract::scan::ScannedFile; + use std::path::PathBuf; + + fn make_scan_result(files: Vec<(&str, &str)>) -> ScanResult { + ScanResult { + files: files + .into_iter() + .map(|(path, content)| ScannedFile { + relative_path: PathBuf::from(path), + absolute_path: PathBuf::from(path), + is_binary: false, + content: Some(content.to_string()), + }) + .collect(), + excluded_count: 0, + } + } + + // ── Tier 1 tests ───────────────────────────────────────────────── + + #[test] + fn test_tier1_basic_dir_name() { + let scan = make_scan_result(vec![ + ("README.md", "# my-widget\nA widget project"), + ("src/lib.rs", "// my-widget core"), + ]); + let dir = PathBuf::from("/projects/my-widget"); + let candidates = detect_directory_name(&dir, &scan); + + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].value, "my-widget"); + assert_eq!(candidates[0].suggested_name, "project_name"); + assert_eq!(candidates[0].confidence, 0.95); + assert!(candidates[0].total_occurrences >= 2); + } + + #[test] + fn test_tier1_generic_name_skipped() { + let scan = make_scan_result(vec![("main.rs", "fn main() {}")]); + let dir = PathBuf::from("/projects/src"); + let candidates = detect_directory_name(&dir, &scan); + assert!(candidates.is_empty()); + } + + #[test] + fn test_tier1_occurrence_counting() { + let scan = make_scan_result(vec![ + ("a.txt", "hello hello hello"), + ("b.txt", "hello world"), + ]); + let dir = PathBuf::from("/projects/hello"); + let candidates = detect_directory_name(&dir, &scan); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].file_count, 2); + assert!(candidates[0].total_occurrences >= 4); + } + + // ── Tier 2 tests ───────────────────────────────────────────────── + + #[test] + fn test_tier2_cargo_toml() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("Cargo.toml"), + "[package]\nname = 
\"data-pipeline\"\nauthors = [\"Alice \"]\n", + ) + .unwrap(); + + let scan = make_scan_result(vec![("src/main.rs", "data-pipeline runs here")]); + let candidates = parse_cargo_toml(dir.path(), &scan).unwrap(); + + assert!(candidates.iter().any(|c| c.value == "data-pipeline")); + assert!(candidates.iter().any(|c| c.value == "Alice")); + } + + #[test] + fn test_tier2_package_json_with_scope() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("package.json"), + r#"{"name": "@myorg/cool-widget", "author": "Bob Smith "}"#, + ) + .unwrap(); + + let scan = make_scan_result(vec![("index.js", "cool-widget stuff")]); + let candidates = parse_package_json(dir.path(), &scan).unwrap(); + + let name_candidate = candidates.iter().find(|c| c.suggested_name == "project_name").unwrap(); + assert_eq!(name_candidate.value, "cool-widget"); + + let author_candidate = candidates.iter().find(|c| c.suggested_name == "author").unwrap(); + assert_eq!(author_candidate.value, "Bob Smith"); + } + + #[test] + fn test_tier2_pyproject_toml() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("pyproject.toml"), + "[project]\nname = \"my-tool\"\n\n[[project.authors]]\nname = \"Charlie\"\n", + ) + .unwrap(); + + let scan = make_scan_result(vec![("setup.py", "my-tool setup")]); + let candidates = parse_pyproject_toml(dir.path(), &scan).unwrap(); + + assert!(candidates.iter().any(|c| c.value == "my-tool")); + assert!(candidates.iter().any(|c| c.value == "Charlie")); + } + + #[test] + fn test_tier2_go_mod() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("go.mod"), + "module github.com/acme/my-service\n\ngo 1.21\n", + ) + .unwrap(); + + let scan = make_scan_result(vec![("main.go", "package main // my-service by acme")]); + let candidates = parse_go_mod(dir.path(), &scan).unwrap(); + + let project = candidates.iter().find(|c| c.suggested_name == "project_name"); + assert!(project.is_some()); + 
assert_eq!(project.unwrap().value, "my-service"); + + let org = candidates.iter().find(|c| c.suggested_name == "org_name"); + assert!(org.is_some(), "should extract org from go.mod module path"); + assert_eq!(org.unwrap().value, "acme"); + } + + #[test] + fn test_tier2_missing_file() { + let dir = tempfile::tempdir().unwrap(); + let scan = make_scan_result(vec![]); + + assert!(parse_cargo_toml(dir.path(), &scan).is_none()); + assert!(parse_package_json(dir.path(), &scan).is_none()); + assert!(parse_pyproject_toml(dir.path(), &scan).is_none()); + assert!(parse_go_mod(dir.path(), &scan).is_none()); + } + + #[test] + fn test_tier2_malformed_cargo_toml() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("Cargo.toml"), "this is not valid toml {{{}}}").unwrap(); + let scan = make_scan_result(vec![]); + assert!(parse_cargo_toml(dir.path(), &scan).is_none()); + } + + // ── Tier 3 tests ───────────────────────────────────────────────── + + #[test] + fn test_parse_org_from_url_ssh() { + assert_eq!( + parse_org_from_url("git@github.com:acme-corp/my-repo.git"), + Some("acme-corp".to_string()) + ); + } + + #[test] + fn test_parse_org_from_url_https() { + assert_eq!( + parse_org_from_url("https://github.com/acme-corp/my-repo.git"), + Some("acme-corp".to_string()) + ); + } + + #[test] + fn test_strip_email_with_angle_brackets() { + assert_eq!(strip_email("Jane Doe "), "Jane Doe"); + } + + #[test] + fn test_strip_email_bare_email() { + assert_eq!(strip_email("jane@example.com"), "jane"); + } + + #[test] + fn test_strip_email_no_email() { + assert_eq!(strip_email("Jane Doe"), "Jane Doe"); + } + + // ── Tier 4 tests ───────────────────────────────────────────────── + + #[test] + fn test_frequency_finds_repeated_identifier() { + let scan = make_scan_result(vec![ + ("a.txt", "data-pipeline is great\ndata-pipeline rocks"), + ("b.txt", "use data_pipeline here\ndata_pipeline again"), + ("c.txt", "DataPipeline class\nDataPipeline impl"), + ("d.txt", 
"DATA_PIPELINE env var\nDATA_PIPELINE config"), + ]); + + let covered = HashSet::new(); + let config_vals = HashSet::new(); + let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + + assert!(!candidates.is_empty()); + // Should find "data-pipeline" cluster + let found = candidates.iter().any(|c| { + let words = split_into_words(&c.value); + words == vec!["data", "pipeline"] + }); + assert!(found, "should find data-pipeline cluster, got: {:?}", candidates); + } + + #[test] + fn test_frequency_filters_keywords() { + let scan = make_scan_result(vec![ + ("a.rs", "fn async_handler() {}"), + ("b.rs", "fn async_handler() {}"), + ("c.rs", "fn async_handler() {}"), + ]); + + let covered = HashSet::new(); + let config_vals = HashSet::new(); + let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + + // "async" alone should be filtered + for c in &candidates { + let lower = c.value.to_lowercase(); + assert!(!LANGUAGE_KEYWORDS.contains(&lower.as_str()) || c.value.contains('-') || c.value.contains('_')); + } + } + + #[test] + fn test_frequency_filters_short_tokens() { + let scan = make_scan_result(vec![ + ("a.txt", "ab cd ef gh"), + ("b.txt", "ab cd ef gh"), + ]); + + let covered = HashSet::new(); + let config_vals = HashSet::new(); + let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + + assert!(candidates.is_empty(), "short tokens should be filtered"); + } + + #[test] + fn test_frequency_skips_covered_values() { + let scan = make_scan_result(vec![ + ("a.txt", "my-widget rocks"), + ("b.txt", "my-widget is great"), + ("c.txt", "my_widget too"), + ]); + + let mut covered = HashSet::new(); + covered.insert("my-widget".to_string()); + let config_vals = HashSet::new(); + let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + + let has_widget = candidates.iter().any(|c| c.value.to_lowercase().contains("widget")); + assert!(!has_widget, "covered values should be skipped"); + } + + #[test] + fn 
test_score_cluster_multi_variant_boost() { + let single_variant = TokenCluster { + normalized: vec!["my".into(), "app".into()], + literals: vec!["my-app".into()], + total_occurrences: 10, + file_count: 5, + matches_dir_name: false, + in_config_value: false, + }; + + let multi_variant = TokenCluster { + normalized: vec!["my".into(), "app".into()], + literals: vec!["my-app".into(), "my_app".into(), "MyApp".into()], + total_occurrences: 10, + file_count: 5, + matches_dir_name: false, + in_config_value: false, + }; + + assert!(score_cluster(&multi_variant) > score_cluster(&single_variant)); + } + + #[test] + fn test_score_cluster_dir_name_boost() { + let no_dir = TokenCluster { + normalized: vec!["my".into(), "app".into()], + literals: vec!["my-app".into()], + total_occurrences: 5, + file_count: 3, + matches_dir_name: false, + in_config_value: false, + }; + + let with_dir = TokenCluster { + normalized: vec!["my".into(), "app".into()], + literals: vec!["my-app".into()], + total_occurrences: 5, + file_count: 3, + matches_dir_name: true, + in_config_value: false, + }; + + assert!(score_cluster(&with_dir) > score_cluster(&no_dir)); + } + + #[test] + fn test_levenshtein_merging() { + let mut clusters = HashMap::new(); + clusters.insert( + "data pipeline".to_string(), + TokenCluster { + normalized: vec!["data".into(), "pipeline".into()], + literals: vec!["data-pipeline".into()], + total_occurrences: 10, + file_count: 5, + matches_dir_name: false, + in_config_value: false, + }, + ); + clusters.insert( + "data pipelin".to_string(), // typo / near miss + TokenCluster { + normalized: vec!["data".into(), "pipelin".into()], + literals: vec!["data-pipelin".into()], + total_occurrences: 2, + file_count: 1, + matches_dir_name: false, + in_config_value: false, + }, + ); + + merge_similar_clusters(&mut clusters); + + // Should merge into one cluster + assert_eq!(clusters.len(), 1); + let remaining = clusters.values().next().unwrap(); + assert_eq!(remaining.total_occurrences, 12); + } + 
+ // ── Helper tests ───────────────────────────────────────────────── + + #[test] + fn test_deduplication_keeps_highest_confidence() { + let mut candidates = vec![ + DetectedCandidate { + suggested_name: "project_name".to_string(), + value: "my-app".to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.90, + reason: "Cargo.toml".to_string(), + file_count: 3, + total_occurrences: 10, + }, + DetectedCandidate { + suggested_name: "project_name".to_string(), + value: "my-app".to_string(), + tier: ConfidenceTier::DirectoryName, + confidence: 0.95, + reason: "directory name".to_string(), + file_count: 3, + total_occurrences: 10, + }, + ]; + + deduplicate_candidates(&mut candidates); + assert_eq!(candidates.len(), 1); + assert_eq!(candidates[0].confidence, 0.95); + } + + #[test] + fn test_deduplication_by_suggested_name() { + let mut candidates = vec![ + DetectedCandidate { + suggested_name: "author".to_string(), + value: "Alice Johnson".to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "package.json".to_string(), + file_count: 3, + total_occurrences: 5, + }, + DetectedCandidate { + suggested_name: "author".to_string(), + value: "Robert Roskam".to_string(), + tier: ConfidenceTier::GitMetadata, + confidence: 0.65, + reason: "git config".to_string(), + file_count: 0, + total_occurrences: 0, + }, + ]; + + deduplicate_candidates(&mut candidates); + assert_eq!(candidates.len(), 1, "should deduplicate by suggested_name"); + assert_eq!(candidates[0].value, "Alice Johnson", "should keep highest confidence"); + } + + #[test] + fn test_suggest_variable_name() { + assert_eq!( + suggest_variable_name(&["my".into(), "app".into()], "my app"), + "my_app" + ); + assert_eq!( + suggest_variable_name( + &["very".into(), "long".into(), "name".into(), "here".into()], + "very long name here" + ), + "very_long_name" + ); + } + + #[test] + fn test_strip_npm_scope() { + assert_eq!(strip_npm_scope("@myorg/cool-widget"), "cool-widget"); + 
assert_eq!(strip_npm_scope("plain-package"), "plain-package"); + } + + #[test] + fn test_auto_detect_integration() { + let dir = tempfile::tempdir().unwrap(); + let project_dir = dir.path().join("my-widget"); + std::fs::create_dir(&project_dir).unwrap(); + std::fs::write( + project_dir.join("README.md"), + "# my-widget\nWelcome to my-widget", + ) + .unwrap(); + std::fs::write( + project_dir.join("lib.rs"), + "pub mod my_widget;\nstruct MyWidget;", + ) + .unwrap(); + + let scan = crate::extract::scan::scan_project(&project_dir, &[]).unwrap(); + let result = auto_detect(&project_dir, &scan); + + assert!(!result.candidates.is_empty()); + let project_name = result.candidates.iter().find(|c| c.suggested_name == "project_name"); + assert!(project_name.is_some(), "should detect project_name"); + assert_eq!(project_name.unwrap().value, "my-widget"); + } +} diff --git a/src/extract/mod.rs b/src/extract/mod.rs index ba568bd..d06315b 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -1,3 +1,4 @@ +pub mod auto_detect; pub mod conditional; pub mod config_gen; pub mod exclude; @@ -22,6 +23,7 @@ use self::exclude::{detect_copy_without_render, detect_excludes}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; +use self::auto_detect::{auto_detect, DetectedCandidate}; use self::scan::{scan_project, ScannedFile}; use self::variants::{ computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, @@ -74,6 +76,7 @@ pub struct ExtractOptions { pub in_place: bool, pub batch: bool, pub dry_run: bool, + pub auto: bool, } /// Plan an extraction: scan the project, detect variants, build replacement rules. 
@@ -86,10 +89,6 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { }); } - if options.variables.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } - // Check if this is already a template if source_dir.join("diecut.toml").exists() { return Err(DicecutError::ExtractAlreadyTemplate { @@ -134,10 +133,47 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { scan_result.excluded_count ); + // Phase 2.5: Auto-detect variables if none provided and --auto is enabled + let variables = if options.variables.is_empty() && options.auto { + let detect_result = auto_detect(source_dir, &scan_result); + + if detect_result.candidates.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } + + let accepted = if options.batch { + let accepted: Vec<_> = detect_result + .candidates + .into_iter() + .filter(|c| c.confidence >= 0.50) + .collect(); + if accepted.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } + print_auto_detected_batch(&accepted); + accepted + } else { + let accepted = confirm_auto_detected_interactive(detect_result.candidates)?; + if accepted.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } + accepted + }; + + accepted + .into_iter() + .map(|c| (c.suggested_name, c.value)) + .collect() + } else if options.variables.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } else { + options.variables.clone() + }; + // Phase 3: Generate variants and count occurrences let mut extract_variables = Vec::new(); - for (var_name, var_value) in &options.variables { + for (var_name, var_value) in &variables { let all_variants = generate_variants(var_name, var_value); let mut occurrence_counts = Vec::new(); @@ -639,6 +675,84 @@ fn confirm_conditionals_interactive( Ok(confirmed) } +fn print_auto_detected_batch(candidates: &[DetectedCandidate]) { + eprintln!( + "\n{} Auto-detected variables {}", + style("──").dim(), + style("──────────────────────────────────").dim() + ); + for c in candidates { + 
eprintln!( + " {} {} = {:?} ({:.0}% confidence, {})", + style("✓").green(), + style(&c.suggested_name).bold(), + c.value, + c.confidence * 100.0, + c.tier + ); + eprintln!( + " {}", + style(&c.reason).dim() + ); + } +} + +fn confirm_auto_detected_interactive( + candidates: Vec, +) -> Result> { + eprintln!( + "\n{} Auto-detected variables {}", + style("──").dim(), + style("──────────────────────────────────").dim() + ); + + let mut accepted = Vec::new(); + + for candidate in candidates { + let default_accept = candidate.confidence >= 0.70; + eprintln!( + "\n {} = {:?} ({:.0}% confidence, {})", + style(&candidate.suggested_name).bold(), + candidate.value, + candidate.confidence * 100.0, + candidate.tier + ); + eprintln!(" {}", style(&candidate.reason).dim()); + if candidate.total_occurrences > 0 { + eprintln!( + " {} occurrences across {} files", + candidate.total_occurrences, + candidate.file_count + ); + } + + let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) + .with_default(default_accept) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if accept { + let name = Text::new("Variable name:") + .with_default(&candidate.suggested_name) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + let value = Text::new("Value:") + .with_default(&candidate.value) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + accepted.push(DetectedCandidate { + suggested_name: name, + value, + ..candidate + }); + } + } + + Ok(accepted) +} + fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { let templated: Vec<_> = files.iter().filter(|f| f.has_replacements).collect(); let copied: Vec<_> = files.iter().filter(|f| !f.has_replacements).collect(); diff --git a/src/main.rs b/src/main.rs index f540fe9..bb0d2ff 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,6 +26,7 @@ fn main() -> miette::Result<()> { in_place, batch, dry_run, - } => commands::extract::run(source, vars, output, in_place, batch, dry_run), 
+ auto, + } => commands::extract::run(source, vars, output, in_place, batch, dry_run, auto), } } diff --git a/tests/integration.rs b/tests/integration.rs index 1ad3a59..55e32e9 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -656,6 +656,7 @@ fn test_extract_batch_basic() { in_place: false, batch: true, dry_run: false, + auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -699,6 +700,7 @@ fn test_extract_detects_case_variants() { in_place: false, batch: true, dry_run: false, + auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -752,6 +754,7 @@ fn test_extract_dry_run_writes_nothing() { in_place: false, batch: true, dry_run: true, + auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -780,6 +783,7 @@ fn test_extract_rejects_already_template() { in_place: false, batch: true, dry_run: false, + auto: false, }; let result = plan_extraction(&options); @@ -798,6 +802,7 @@ fn test_extract_rejects_no_variables() { in_place: false, batch: true, dry_run: false, + auto: false, }; let result = plan_extraction(&options); @@ -820,6 +825,7 @@ fn test_extract_templates_path_components() { in_place: false, batch: true, dry_run: false, + auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -882,6 +888,7 @@ fn test_extract_round_trip() { in_place: false, batch: true, dry_run: false, + auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -915,3 +922,141 @@ fn test_extract_round_trip() { } } } + +// ── Auto-detect tests ──────────────────────────────────────────────────── + +#[test] +fn test_extract_auto_batch() { + let project = tempfile::tempdir().unwrap(); + let project_dir = project.path().join("data-pipeline"); + std::fs::create_dir(&project_dir).unwrap(); + std::fs::write( + project_dir.join("Cargo.toml"), + "[package]\nname = \"data-pipeline\"\nversion = \"0.1.0\"\n", + ) + .unwrap(); + std::fs::write( + project_dir.join("README.md"), + "# data-pipeline\nWelcome to data-pipeline\n", + ) + 
.unwrap(); + std::fs::create_dir(project_dir.join("src")).unwrap(); + std::fs::write( + project_dir.join("src/main.rs"), + "fn main() {\n println!(\"data-pipeline starting\");\n}\n", + ) + .unwrap(); + + let output = tempfile::tempdir().unwrap(); + let output_path = output.path().join("auto-extracted"); + + let options = ExtractOptions { + source_dir: project_dir.clone(), + variables: vec![], + output_dir: Some(output_path.clone()), + in_place: false, + batch: true, + dry_run: false, + auto: true, + }; + + let plan = plan_extraction(&options).unwrap(); + execute_extraction(&plan, false).unwrap(); + + let project_var = plan + .variables + .iter() + .find(|v| v.name == "project_name"); + assert!( + project_var.is_some(), + "should auto-detect project_name, got vars: {:?}", + plan.variables.iter().map(|v| &v.name).collect::>() + ); + assert_eq!(project_var.unwrap().value, "data-pipeline"); + + assert!(output_path.join("diecut.toml").exists()); + let config = std::fs::read_to_string(output_path.join("diecut.toml")).unwrap(); + assert!(config.contains("project_name")); +} + +#[test] +fn test_extract_auto_explicit_vars_priority() { + let project = tempfile::tempdir().unwrap(); + let project_dir = project.path().join("my-service"); + std::fs::create_dir(&project_dir).unwrap(); + std::fs::write( + project_dir.join("Cargo.toml"), + "[package]\nname = \"my-service\"\n", + ) + .unwrap(); + std::fs::write(project_dir.join("README.md"), "# my-service\n").unwrap(); + + let output = tempfile::tempdir().unwrap(); + let output_path = output.path().join("explicit-extracted"); + + let options = ExtractOptions { + source_dir: project_dir.clone(), + variables: vec![("app_name".to_string(), "my-service".to_string())], + output_dir: Some(output_path.clone()), + in_place: false, + batch: true, + dry_run: false, + auto: true, + }; + + let plan = plan_extraction(&options).unwrap(); + + let has_app_name = plan.variables.iter().any(|v| v.name == "app_name"); + let has_project_name = 
plan.variables.iter().any(|v| v.name == "project_name"); + assert!(has_app_name, "should use explicit var app_name"); + assert!(!has_project_name, "should not auto-detect project_name when explicit vars given"); +} + +#[test] +fn test_extract_auto_frequency_fallback() { + let project = tempfile::tempdir().unwrap(); + let project_dir = project.path().join("cool-widget"); + std::fs::create_dir(&project_dir).unwrap(); + std::fs::write( + project_dir.join("main.txt"), + "cool-widget is great\ncool_widget module\nCoolWidget class\n", + ) + .unwrap(); + std::fs::write( + project_dir.join("config.txt"), + "name = cool-widget\nmodule = cool_widget\n", + ) + .unwrap(); + std::fs::write( + project_dir.join("test.txt"), + "testing cool-widget\nCOOL_WIDGET env\n", + ) + .unwrap(); + + let output = tempfile::tempdir().unwrap(); + let output_path = output.path().join("freq-extracted"); + + let options = ExtractOptions { + source_dir: project_dir.clone(), + variables: vec![], + output_dir: Some(output_path.clone()), + in_place: false, + batch: true, + dry_run: false, + auto: true, + }; + + let plan = plan_extraction(&options).unwrap(); + + let has_relevant_var = plan.variables.iter().any(|v| { + v.value.contains("cool") || v.name.contains("cool") + }); + assert!( + has_relevant_var, + "should detect cool-widget related variable, got: {:?}", + plan.variables + .iter() + .map(|v| format!("{}={}", v.name, v.value)) + .collect::>() + ); +} From 3ec68001ec31057c8333812a6e9551e5c9895493 Mon Sep 17 00:00:00 2001 From: Robert Roskam Date: Fri, 27 Feb 2026 19:53:59 -0500 Subject: [PATCH 06/29] refactor(extract): make auto-detect default, rename --batch to -y/--yes Auto-detect now always runs when no --var is provided instead of requiring --auto. Renamed --batch to -y/--yes to align with CLI conventions. Added --min-confidence threshold flag. Name collisions from multiple detection sources are now preserved for interactive resolution instead of silently deduplicating. 
--- src/cli.rs | 14 +-- src/commands/extract.rs | 11 +- src/error.rs | 2 +- src/extract/auto_detect.rs | 31 ++--- src/extract/mod.rs | 231 +++++++++++++++++++++++++------------ src/main.rs | 6 +- tests/integration.rs | 84 ++++++++++---- 7 files changed, 242 insertions(+), 137 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 0301617..6c687a0 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -67,16 +67,16 @@ pub enum Commands { #[arg(long)] in_place: bool, - /// Skip all interactive prompts - #[arg(long)] - batch: bool, + /// Accept all defaults without prompting + #[arg(short = 'y', long)] + yes: bool, + + /// Minimum confidence threshold for auto-detected variables (0.0-1.0) + #[arg(long, default_value = "0.5")] + min_confidence: f64, /// Show what would be extracted without writing files #[arg(long)] dry_run: bool, - - /// Auto-detect template variables from project metadata and content - #[arg(long)] - auto: bool, }, } diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 6576387..faf0018 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -11,23 +11,20 @@ pub fn run( vars: Vec, output: Option, in_place: bool, - batch: bool, + yes: bool, + min_confidence: f64, dry_run: bool, - auto: bool, ) -> Result<()> { let variables = parse_vars(&vars)?; - // Default auto to true when no vars are provided - let auto = auto || variables.is_empty(); - let options = ExtractOptions { source_dir: PathBuf::from(&source), variables, output_dir: output.map(PathBuf::from), in_place, - batch, + yes, + min_confidence, dry_run, - auto, }; let plan = plan_extraction(&options)?; diff --git a/src/error.rs b/src/error.rs index ccac949..a612908 100644 --- a/src/error.rs +++ b/src/error.rs @@ -124,7 +124,7 @@ pub enum DicecutError { #[error("No variables provided for extraction")] #[diagnostic(help( - "Use --var key=value to specify variables, or --auto to detect them automatically" + "Use --var key=value to specify variables, or ensure the project has 
identifiable names in config files or directory name" ))] ExtractNoVariables, diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 5cc76da..881991e 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -824,12 +824,13 @@ fn strip_npm_scope(name: &str) -> &str { } fn deduplicate_candidates(candidates: &mut Vec) { + // Only deduplicate by value (same literal from multiple tiers → keep highest confidence). + // Name collisions (e.g., two different "author" candidates) are preserved + // for the interactive/yes layer to resolve. let mut seen_value: HashMap = HashMap::new(); - let mut seen_name: HashMap = HashMap::new(); let mut to_remove = Vec::new(); for (i, candidate) in candidates.iter().enumerate() { - // Deduplicate by value (same literal, different tiers) let value_key = candidate.value.to_lowercase(); if let Some(&prev_idx) = seen_value.get(&value_key) { if candidate.confidence > candidates[prev_idx].confidence { @@ -837,27 +838,10 @@ fn deduplicate_candidates(candidates: &mut Vec) { seen_value.insert(value_key, i); } else { to_remove.push(i); - continue; } } else { seen_value.insert(value_key, i); } - - // Deduplicate by suggested_name (e.g., two different "author" candidates) - let name_key = candidate.suggested_name.clone(); - if let Some(&prev_idx) = seen_name.get(&name_key) { - if to_remove.contains(&prev_idx) { - // Previous holder was already removed, replace it - seen_name.insert(name_key, i); - } else if candidate.confidence > candidates[prev_idx].confidence { - to_remove.push(prev_idx); - seen_name.insert(name_key, i); - } else { - to_remove.push(i); - } - } else { - seen_name.insert(name_key, i); - } } to_remove.sort_unstable(); @@ -1240,7 +1224,7 @@ mod tests { } #[test] - fn test_deduplication_by_suggested_name() { + fn test_name_collisions_preserved() { let mut candidates = vec![ DetectedCandidate { suggested_name: "author".to_string(), @@ -1263,8 +1247,11 @@ mod tests { ]; deduplicate_candidates(&mut 
candidates); - assert_eq!(candidates.len(), 1, "should deduplicate by suggested_name"); - assert_eq!(candidates[0].value, "Alice Johnson", "should keep highest confidence"); + assert_eq!( + candidates.len(), + 2, + "name collisions should be preserved for interactive resolution" + ); } #[test] diff --git a/src/extract/mod.rs b/src/extract/mod.rs index d06315b..96e7ecb 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -6,11 +6,11 @@ pub mod replace; pub mod scan; pub mod variants; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::path::{Path, PathBuf}; use console::style; -use inquire::{Confirm, Text}; +use inquire::{Confirm, Select, Text}; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; @@ -74,9 +74,9 @@ pub struct ExtractOptions { pub variables: Vec<(String, String)>, pub output_dir: Option, pub in_place: bool, - pub batch: bool, + pub yes: bool, + pub min_confidence: f64, pub dry_run: bool, - pub auto: bool, } /// Plan an extraction: scan the project, detect variants, build replacement rules. 
@@ -133,41 +133,44 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { scan_result.excluded_count ); - // Phase 2.5: Auto-detect variables if none provided and --auto is enabled - let variables = if options.variables.is_empty() && options.auto { + // Phase 2.5: Auto-detect variables (always runs), merge with explicit --var entries + let variables = { + let explicit_vars = options.variables.clone(); let detect_result = auto_detect(source_dir, &scan_result); - if detect_result.candidates.is_empty() { + // Filter candidates below min_confidence threshold + let candidates: Vec<_> = detect_result + .candidates + .into_iter() + .filter(|c| c.confidence >= options.min_confidence) + .collect(); + + if candidates.is_empty() && explicit_vars.is_empty() { return Err(DicecutError::ExtractNoVariables); } - let accepted = if options.batch { - let accepted: Vec<_> = detect_result - .candidates - .into_iter() - .filter(|c| c.confidence >= 0.50) - .collect(); - if accepted.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } - print_auto_detected_batch(&accepted); - accepted + // Resolve auto-detected candidates (merge with explicit vars) + let auto_vars = if candidates.is_empty() { + vec![] + } else if options.yes { + resolve_candidates_yes(&candidates, &explicit_vars) } else { - let accepted = confirm_auto_detected_interactive(detect_result.candidates)?; - if accepted.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } - accepted + confirm_auto_detected_interactive(candidates, &explicit_vars)? 
}; - accepted - .into_iter() - .map(|c| (c.suggested_name, c.value)) - .collect() - } else if options.variables.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } else { - options.variables.clone() + // Merge: explicit vars first (pre-accepted), then auto-detected additions + let mut merged = explicit_vars; + for (name, value) in auto_vars { + if !merged.iter().any(|(n, _)| n == &name) { + merged.push((name, value)); + } + } + + if merged.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } + + merged }; // Phase 3: Generate variants and count occurrences @@ -192,7 +195,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { } // Phase 4: Interactive variant confirmation - let confirmed_variables = if options.batch { + let confirmed_variables = if options.yes { // Batch mode: auto-accept all found variants extract_variables .into_iter() @@ -218,12 +221,12 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { }; // Phase 5: Interactive exclude confirmation - if !options.batch { + if !options.yes { excludes = confirm_excludes_interactive(excludes)?; } // Phase 6: Detect conditional files - let detected_conditionals = if options.batch { + let detected_conditionals = if options.yes { vec![] // Batch mode: no conditional files } else { let detected = detect_conditional_files(source_dir); @@ -301,7 +304,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { } // Phase 10: Interactive file confirmation - if !options.batch { + if !options.yes { confirm_files_interactive(&planned_files)?; } @@ -675,78 +678,160 @@ fn confirm_conditionals_interactive( Ok(confirmed) } -fn print_auto_detected_batch(candidates: &[DetectedCandidate]) { +fn resolve_candidates_yes( + candidates: &[DetectedCandidate], + explicit_vars: &[(String, String)], +) -> Vec<(String, String)> { eprintln!( "\n{} Auto-detected variables {}", style("──").dim(), style("──────────────────────────────────").dim() ); + + // Group candidates by suggested_name + 
let mut groups: BTreeMap> = BTreeMap::new(); for c in candidates { + groups.entry(c.suggested_name.clone()).or_default().push(c); + } + + let mut result = Vec::new(); + + for (name, mut group) in groups { + // Skip names already covered by explicit --var + if explicit_vars.iter().any(|(n, _)| n == &name) { + eprintln!( + " {} {} (explicit --var, skipping auto-detect)", + style("·").dim(), + style(&name).dim() + ); + continue; + } + + // For name collisions, pick highest confidence + group.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + let winner = group[0]; + eprintln!( " {} {} = {:?} ({:.0}% confidence, {})", style("✓").green(), - style(&c.suggested_name).bold(), - c.value, - c.confidence * 100.0, - c.tier - ); - eprintln!( - " {}", - style(&c.reason).dim() + style(&winner.suggested_name).bold(), + winner.value, + winner.confidence * 100.0, + winner.tier ); + eprintln!(" {}", style(&winner.reason).dim()); + + if group.len() > 1 { + eprintln!( + " {} {} other candidates for this name (picked highest confidence)", + style("⚠").yellow(), + group.len() - 1 + ); + } + + result.push((winner.suggested_name.clone(), winner.value.clone())); } + + result } fn confirm_auto_detected_interactive( candidates: Vec, -) -> Result> { + explicit_vars: &[(String, String)], +) -> Result> { eprintln!( "\n{} Auto-detected variables {}", style("──").dim(), style("──────────────────────────────────").dim() ); + // Group candidates by suggested_name + let mut groups: BTreeMap> = BTreeMap::new(); + for c in candidates { + groups.entry(c.suggested_name.clone()).or_default().push(c); + } + let mut accepted = Vec::new(); - for candidate in candidates { - let default_accept = candidate.confidence >= 0.70; - eprintln!( - "\n {} = {:?} ({:.0}% confidence, {})", - style(&candidate.suggested_name).bold(), - candidate.value, - candidate.confidence * 100.0, - candidate.tier - ); - eprintln!(" {}", style(&candidate.reason).dim()); - if candidate.total_occurrences > 0 { + for 
(name, mut group) in groups { + // Skip names already covered by explicit --var + if explicit_vars.iter().any(|(n, _)| n == &name) { eprintln!( - " {} occurrences across {} files", - candidate.total_occurrences, - candidate.file_count + "\n {} {} (provided via --var, skipping)", + style("·").dim(), + style(&name).dim() ); + continue; } - let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) - .with_default(default_accept) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; + // Sort by confidence descending + group.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); - if accept { - let name = Text::new("Variable name:") - .with_default(&candidate.suggested_name) + if group.len() == 1 { + // Single candidate — simple confirm + let candidate = &group[0]; + eprintln!( + "\n {} = {:?} ({:.0}% confidence, {})", + style(&candidate.suggested_name).bold(), + candidate.value, + candidate.confidence * 100.0, + candidate.tier + ); + eprintln!(" {}", style(&candidate.reason).dim()); + if candidate.total_occurrences > 0 { + eprintln!( + " {} occurrences across {} files", + candidate.total_occurrences, + candidate.file_count + ); + } + + let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) + .with_default(true) .prompt() .map_err(|_| DicecutError::PromptCancelled)?; - let value = Text::new("Value:") - .with_default(&candidate.value) + if accept { + accepted.push((candidate.suggested_name.clone(), candidate.value.clone())); + } + } else { + // Name collision — show selection prompt + eprintln!( + "\n {} Multiple candidates for {}:", + style("⚠").yellow(), + style(&name).bold() + ); + + let mut options: Vec = group + .iter() + .map(|c| { + format!( + "{:?} ({:.0}% confidence, {})", + c.value, + c.confidence * 100.0, + c.tier + ) + }) + .collect(); + options.push("Skip".to_string()); + + let selection = Select::new(&format!("Which value for \"{}\"?", name), options) .prompt() .map_err(|_| 
DicecutError::PromptCancelled)?; - accepted.push(DetectedCandidate { - suggested_name: name, - value, - ..candidate - }); + if selection != "Skip" { + // Find the matching candidate + if let Some(chosen) = group.iter().find(|c| { + format!( + "{:?} ({:.0}% confidence, {})", + c.value, + c.confidence * 100.0, + c.tier + ) == selection + }) { + accepted.push((chosen.suggested_name.clone(), chosen.value.clone())); + } + } } } diff --git a/src/main.rs b/src/main.rs index bb0d2ff..4999bb2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,9 +24,9 @@ fn main() -> miette::Result<()> { vars, output, in_place, - batch, + yes, + min_confidence, dry_run, - auto, - } => commands::extract::run(source, vars, output, in_place, batch, dry_run, auto), + } => commands::extract::run(source, vars, output, in_place, yes, min_confidence, dry_run), } } diff --git a/tests/integration.rs b/tests/integration.rs index 55e32e9..0310b92 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -654,9 +654,9 @@ fn test_extract_batch_basic() { ], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -698,9 +698,9 @@ fn test_extract_detects_case_variants() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -752,9 +752,9 @@ fn test_extract_dry_run_writes_nothing() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: true, - auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -781,9 +781,9 @@ fn test_extract_rejects_already_template() { variables: vec![("name".to_string(), "val".to_string())], output_dir: 
None, in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: false, }; let result = plan_extraction(&options); @@ -795,14 +795,16 @@ fn test_extract_rejects_no_variables() { let project = tempfile::tempdir().unwrap(); std::fs::write(project.path().join("hello.txt"), "hello").unwrap(); + // With min_confidence=1.0, no auto-detected candidates can pass, and no explicit + // vars are given, so extraction should fail with ExtractNoVariables let options = ExtractOptions { source_dir: project.path().to_path_buf(), variables: vec![], output_dir: None, in_place: false, - batch: true, + yes: true, + min_confidence: 1.0, dry_run: false, - auto: false, }; let result = plan_extraction(&options); @@ -823,9 +825,9 @@ fn test_extract_templates_path_components() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -886,9 +888,9 @@ fn test_extract_round_trip() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(extracted_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: false, }; let plan = plan_extraction(&options).unwrap(); @@ -926,7 +928,7 @@ fn test_extract_round_trip() { // ── Auto-detect tests ──────────────────────────────────────────────────── #[test] -fn test_extract_auto_batch() { +fn test_extract_auto_yes() { let project = tempfile::tempdir().unwrap(); let project_dir = project.path().join("data-pipeline"); std::fs::create_dir(&project_dir).unwrap(); @@ -955,9 +957,9 @@ fn test_extract_auto_batch() { variables: vec![], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: true, }; let plan = plan_extraction(&options).unwrap(); @@ -980,7 +982,7 @@ fn 
test_extract_auto_batch() { } #[test] -fn test_extract_auto_explicit_vars_priority() { +fn test_extract_auto_explicit_vars_merged() { let project = tempfile::tempdir().unwrap(); let project_dir = project.path().join("my-service"); std::fs::create_dir(&project_dir).unwrap(); @@ -999,17 +1001,17 @@ fn test_extract_auto_explicit_vars_priority() { variables: vec![("app_name".to_string(), "my-service".to_string())], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: true, }; let plan = plan_extraction(&options).unwrap(); let has_app_name = plan.variables.iter().any(|v| v.name == "app_name"); - let has_project_name = plan.variables.iter().any(|v| v.name == "project_name"); assert!(has_app_name, "should use explicit var app_name"); - assert!(!has_project_name, "should not auto-detect project_name when explicit vars given"); + // Auto-detect still runs and merges additional candidates + // (project_name may or may not appear depending on dedup with app_name's value) } #[test] @@ -1041,9 +1043,9 @@ fn test_extract_auto_frequency_fallback() { variables: vec![], output_dir: Some(output_path.clone()), in_place: false, - batch: true, + yes: true, + min_confidence: 0.5, dry_run: false, - auto: true, }; let plan = plan_extraction(&options).unwrap(); @@ -1060,3 +1062,37 @@ fn test_extract_auto_frequency_fallback() { .collect::>() ); } + +#[test] +fn test_extract_min_confidence_filters() { + let project = tempfile::tempdir().unwrap(); + let project_dir = project.path().join("tiny-app"); + std::fs::create_dir(&project_dir).unwrap(); + std::fs::write( + project_dir.join("Cargo.toml"), + "[package]\nname = \"tiny-app\"\nversion = \"0.1.0\"\n", + ) + .unwrap(); + std::fs::write( + project_dir.join("README.md"), + "# tiny-app\nWelcome to tiny-app\n", + ) + .unwrap(); + + // With a very high threshold, all auto-detected candidates should be filtered out + let options = ExtractOptions { + source_dir: 
project_dir.clone(), + variables: vec![], + output_dir: None, + in_place: false, + yes: true, + min_confidence: 0.99, + dry_run: true, + }; + + let result = plan_extraction(&options); + assert!( + result.is_err(), + "high min_confidence should filter out all candidates" + ); +} From 375616a3823ddc06b6766e7af90ef19663ee0b7e Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:20:02 -0500 Subject: [PATCH 07/29] fix: resolve cargo fmt and clippy warnings Run cargo fmt to fix formatting issues and fix two clippy lints: - Remove redundant closure in strip_email call - Remove identity map on first.as_str() --- src/extract/auto_detect.rs | 565 ++++++++++++++++++++++++++++++------- src/extract/mod.rs | 5 +- src/render/context.rs | 10 +- tests/integration.rs | 12 +- 4 files changed, 470 insertions(+), 122 deletions(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 881991e..b0306d9 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -70,10 +70,8 @@ pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectRe candidates.extend(detect_git_metadata(project_dir, scan_result)); // Collect values already covered by tiers 1-3 - let covered_values: HashSet = candidates - .iter() - .map(|c| c.value.to_lowercase()) - .collect(); + let covered_values: HashSet = + candidates.iter().map(|c| c.value.to_lowercase()).collect(); // Collect config values for frequency analysis boosting let config_values: HashSet = candidates @@ -107,10 +105,43 @@ pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectRe // ── Tier 1: Directory name ─────────────────────────────────────────────── const GENERIC_DIR_NAMES: &[&str] = &[ - "src", "app", "project", "tmp", "temp", "build", "dist", "out", "output", "lib", "bin", - "test", "tests", "example", "examples", "docs", "doc", "assets", "public", "static", - "vendor", "node_modules", "target", "pkg", "cmd", "internal", "api", "web", "server", - "client", 
"frontend", "backend", "service", "services", "workspace", "repo", "code", + "src", + "app", + "project", + "tmp", + "temp", + "build", + "dist", + "out", + "output", + "lib", + "bin", + "test", + "tests", + "example", + "examples", + "docs", + "doc", + "assets", + "public", + "static", + "vendor", + "node_modules", + "target", + "pkg", + "cmd", + "internal", + "api", + "web", + "server", + "client", + "frontend", + "backend", + "service", + "services", + "workspace", + "repo", + "code", ]; fn detect_directory_name(project_dir: &Path, scan_result: &ScanResult) -> Vec { @@ -143,10 +174,7 @@ fn detect_directory_name(project_dir: &Path, scan_result: &ScanResult) -> Vec Vec { +fn detect_config_files(project_dir: &Path, scan_result: &ScanResult) -> Vec { let mut candidates = Vec::new(); if let Some(mut c) = parse_cargo_toml(project_dir, scan_result) { @@ -252,8 +280,7 @@ fn parse_package_json( }; if let Some(author_name) = author_str { if !author_name.is_empty() { - let (file_count, total_occurrences) = - count_occurrences(&author_name, scan_result); + let (file_count, total_occurrences) = count_occurrences(&author_name, scan_result); candidates.push(DetectedCandidate { suggested_name: "author".to_string(), value: author_name, @@ -306,8 +333,8 @@ fn parse_pyproject_toml( let author_name = first .get("name") .and_then(|n| n.as_str()) - .or_else(|| first.as_str().map(|s| s)) - .map(|s| strip_email(s)); + .or_else(|| first.as_str()) + .map(strip_email); if let Some(name) = author_name { if !name.is_empty() { let (file_count, total_occurrences) = count_occurrences(&name, scan_result); @@ -328,18 +355,12 @@ fn parse_pyproject_toml( Some(candidates) } -fn parse_go_mod( - project_dir: &Path, - scan_result: &ScanResult, -) -> Option> { +fn parse_go_mod(project_dir: &Path, scan_result: &ScanResult) -> Option> { let path = project_dir.join("go.mod"); let content = std::fs::read_to_string(&path).ok()?; let re = Regex::new(r"^module\s+(\S+)").unwrap(); - let module_path = re - 
.captures(&content)? - .get(1)? - .as_str(); + let module_path = re.captures(&content)?.get(1)?.as_str(); let segments: Vec<&str> = module_path.split('/').collect(); @@ -386,10 +407,7 @@ fn parse_go_mod( // ── Tier 3: Git metadata ───────────────────────────────────────────────── -fn detect_git_metadata( - project_dir: &Path, - scan_result: &ScanResult, -) -> Vec { +fn detect_git_metadata(project_dir: &Path, scan_result: &ScanResult) -> Vec { let mut candidates = Vec::new(); // Try to get remote origin URL @@ -518,30 +536,27 @@ fn detect_frequency( let normalized_key = words.join(" "); - let file_count = token_file_map - .get(token) - .map(|s| s.len()) - .unwrap_or(0); + let file_count = token_file_map.get(token).map(|s| s.len()).unwrap_or(0); // Skip single-occurrence-single-file tokens if *count == 1 && file_count <= 1 { continue; } - let matches_dir = normalized_key == split_into_words(dir_name).join(" ") - && !dir_name.is_empty(); + let matches_dir = + normalized_key == split_into_words(dir_name).join(" ") && !dir_name.is_empty(); let in_config = config_values.contains(&token.to_lowercase()); - let cluster = clusters.entry(normalized_key.clone()).or_insert_with(|| { - TokenCluster { + let cluster = clusters + .entry(normalized_key.clone()) + .or_insert_with(|| TokenCluster { normalized: words.clone(), literals: Vec::new(), total_occurrences: 0, file_count: 0, matches_dir_name: false, in_config_value: false, - } - }); + }); if !cluster.literals.contains(token) { cluster.literals.push(token.clone()); @@ -564,7 +579,11 @@ fn detect_frequency( for (key, cluster) in &clusters { // Skip if already covered by higher tiers - if cluster.literals.iter().any(|l| covered_values.contains(&l.to_lowercase())) { + if cluster + .literals + .iter() + .any(|l| covered_values.contains(&l.to_lowercase())) + { continue; } @@ -624,7 +643,11 @@ fn score_cluster(cluster: &TokenCluster) -> f64 { // Config value match (binary) let config_score = if cluster.in_config_value { 1.0 } else { 
0.0 }; - 0.15 * occ_score + 0.20 * file_score + 0.35 * variant_score + 0.20 * dir_score + 0.10 * config_score + 0.15 * occ_score + + 0.20 * file_score + + 0.35 * variant_score + + 0.20 * dir_score + + 0.10 * config_score } fn merge_similar_clusters(clusters: &mut HashMap) { @@ -638,8 +661,14 @@ fn merge_similar_clusters(clusters: &mut HashMap) { } let dist = strsim::levenshtein(&keys[i], &keys[j]); if dist <= 1 { - let size_i = clusters.get(&keys[i]).map(|c| c.total_occurrences).unwrap_or(0); - let size_j = clusters.get(&keys[j]).map(|c| c.total_occurrences).unwrap_or(0); + let size_i = clusters + .get(&keys[i]) + .map(|c| c.total_occurrences) + .unwrap_or(0); + let size_j = clusters + .get(&keys[j]) + .map(|c| c.total_occurrences) + .unwrap_or(0); if size_i >= size_j { merge_map.insert(keys[j].clone(), keys[i].clone()); } else { @@ -704,9 +733,7 @@ fn is_noise_token(token: &str, words: &[String]) -> bool { // All words are stopwords, file-format words, or very short if words.iter().all(|w| { - w.len() < 3 - || STOPWORDS.contains(&w.as_str()) - || FILE_FORMAT_WORDS.contains(&w.as_str()) + w.len() < 3 || STOPWORDS.contains(&w.as_str()) || FILE_FORMAT_WORDS.contains(&w.as_str()) }) { return true; } @@ -715,69 +742,377 @@ fn is_noise_token(token: &str, words: &[String]) -> bool { } const FILE_FORMAT_WORDS: &[&str] = &[ - "toml", "json", "yaml", "yml", "xml", "csv", "html", "css", "md", "txt", - "log", "cfg", "ini", "env", "lock", "mod", "rs", "js", "ts", "py", "go", - "rb", "java", "kt", "swift", "cpp", "hpp", "vue", "jsx", "tsx", + "toml", "json", "yaml", "yml", "xml", "csv", "html", "css", "md", "txt", "log", "cfg", "ini", + "env", "lock", "mod", "rs", "js", "ts", "py", "go", "rb", "java", "kt", "swift", "cpp", "hpp", + "vue", "jsx", "tsx", ]; const LANGUAGE_KEYWORDS: &[&str] = &[ // Rust - "async", "await", "break", "const", "continue", "crate", "dyn", "else", "enum", "extern", - "false", "fn", "for", "if", "impl", "in", "let", "loop", "match", "mod", "move", 
"mut", - "pub", "ref", "return", "self", "static", "struct", "super", "trait", "true", "type", - "unsafe", "use", "where", "while", "yield", + "async", + "await", + "break", + "const", + "continue", + "crate", + "dyn", + "else", + "enum", + "extern", + "false", + "fn", + "for", + "if", + "impl", + "in", + "let", + "loop", + "match", + "mod", + "move", + "mut", + "pub", + "ref", + "return", + "self", + "static", + "struct", + "super", + "trait", + "true", + "type", + "unsafe", + "use", + "where", + "while", + "yield", // JS/TS - "abstract", "arguments", "boolean", "byte", "case", "catch", "char", "class", "debugger", - "default", "delete", "do", "double", "eval", "export", "extends", "final", "finally", - "float", "function", "goto", "implements", "import", "instanceof", "int", "interface", - "long", "native", "new", "null", "package", "private", "protected", "public", "short", - "switch", "synchronized", "this", "throw", "throws", "transient", "try", "typeof", - "undefined", "var", "void", "volatile", "with", + "abstract", + "arguments", + "boolean", + "byte", + "case", + "catch", + "char", + "class", + "debugger", + "default", + "delete", + "do", + "double", + "eval", + "export", + "extends", + "final", + "finally", + "float", + "function", + "goto", + "implements", + "import", + "instanceof", + "int", + "interface", + "long", + "native", + "new", + "null", + "package", + "private", + "protected", + "public", + "short", + "switch", + "synchronized", + "this", + "throw", + "throws", + "transient", + "try", + "typeof", + "undefined", + "var", + "void", + "volatile", + "with", // Python - "and", "as", "assert", "class", "def", "del", "elif", "except", "exec", "from", - "global", "is", "lambda", "nonlocal", "not", "or", "pass", "print", "raise", - "with", "yield", + "and", + "as", + "assert", + "class", + "def", + "del", + "elif", + "except", + "exec", + "from", + "global", + "is", + "lambda", + "nonlocal", + "not", + "or", + "pass", + "print", + "raise", + "with", + 
"yield", // Go - "chan", "defer", "fallthrough", "go", "goroutine", "interface", "map", "range", - "select", "func", + "chan", + "defer", + "fallthrough", + "go", + "goroutine", + "interface", + "map", + "range", + "select", + "func", ]; const COMMON_LIBRARIES: &[&str] = &[ - "react", "redux", "webpack", "babel", "eslint", "prettier", "jest", "mocha", "chai", - "express", "fastify", "next", "nuxt", "vue", "angular", "svelte", - "serde", "tokio", "actix", "axum", "clap", "anyhow", "thiserror", "tracing", - "reqwest", "hyper", "warp", "rocket", "diesel", "sqlx", - "django", "flask", "fastapi", "pytest", "numpy", "pandas", "scipy", - "spring", "hibernate", "junit", "maven", "gradle", - "gin", "echo", "fiber", "gorm", - "lodash", "axios", "moment", "dayjs", "ramda", "underscore", - "tailwind", "bootstrap", "material", - "typescript", "javascript", "python", "golang", "rustlang", + "react", + "redux", + "webpack", + "babel", + "eslint", + "prettier", + "jest", + "mocha", + "chai", + "express", + "fastify", + "next", + "nuxt", + "vue", + "angular", + "svelte", + "serde", + "tokio", + "actix", + "axum", + "clap", + "anyhow", + "thiserror", + "tracing", + "reqwest", + "hyper", + "warp", + "rocket", + "diesel", + "sqlx", + "django", + "flask", + "fastapi", + "pytest", + "numpy", + "pandas", + "scipy", + "spring", + "hibernate", + "junit", + "maven", + "gradle", + "gin", + "echo", + "fiber", + "gorm", + "lodash", + "axios", + "moment", + "dayjs", + "ramda", + "underscore", + "tailwind", + "bootstrap", + "material", + "typescript", + "javascript", + "python", + "golang", + "rustlang", ]; const STOPWORDS: &[&str] = &[ // English stopwords - "the", "and", "for", "are", "but", "not", "you", "all", "can", "had", "her", "was", - "one", "our", "out", "get", "set", "has", "his", "how", "its", "let", "may", "new", - "now", "old", "see", "way", "who", "did", "got", "has", "him", "into", "just", - "like", "make", "many", "some", "than", "them", "then", "very", "when", "with", - "have", 
"from", "been", "also", "each", "that", "this", "will", "your", "what", - "which", "their", "about", "would", "there", "could", "other", "after", "first", - "these", "those", "being", "where", "should", "because", + "the", + "and", + "for", + "are", + "but", + "not", + "you", + "all", + "can", + "had", + "her", + "was", + "one", + "our", + "out", + "get", + "set", + "has", + "his", + "how", + "its", + "let", + "may", + "new", + "now", + "old", + "see", + "way", + "who", + "did", + "got", + "has", + "him", + "into", + "just", + "like", + "make", + "many", + "some", + "than", + "them", + "then", + "very", + "when", + "with", + "have", + "from", + "been", + "also", + "each", + "that", + "this", + "will", + "your", + "what", + "which", + "their", + "about", + "would", + "there", + "could", + "other", + "after", + "first", + "these", + "those", + "being", + "where", + "should", + "because", // Short generic words common in code identifiers - "my", "no", "is", "on", "in", "to", "by", "do", "up", "so", "or", - "app", "run", "dry", "log", "cmd", "arg", "env", "dir", "key", "map", - "max", "min", "raw", "ref", "src", "str", "tmp", "url", "var", "buf", - "msg", "req", "res", "err", "pkg", "lib", "bin", "fmt", "ctx", "cfg", - "opt", "val", "idx", "len", "ptr", "num", "std", "gen", "pre", "sub", + "my", + "no", + "is", + "on", + "in", + "to", + "by", + "do", + "up", + "so", + "or", + "app", + "run", + "dry", + "log", + "cmd", + "arg", + "env", + "dir", + "key", + "map", + "max", + "min", + "raw", + "ref", + "src", + "str", + "tmp", + "url", + "var", + "buf", + "msg", + "req", + "res", + "err", + "pkg", + "lib", + "bin", + "fmt", + "ctx", + "cfg", + "opt", + "val", + "idx", + "len", + "ptr", + "num", + "std", + "gen", + "pre", + "sub", // Programming type/concept words - "string", "number", "bool", "boolean", "array", "object", "value", "result", - "error", "option", "none", "some", "true", "false", "null", "undefined", - "file", "path", "name", "type", "data", "info", "list", 
"item", "node", - "index", "count", "size", "length", "config", "settings", "options", - "input", "output", "source", "target", "test", "main", "init", "setup", - "todo", "fixme", "hack", "note", "warning", "debug", "trace", "level", - "mode", "flag", "status", "state", "cache", "hook", "hooks", + "string", + "number", + "bool", + "boolean", + "array", + "object", + "value", + "result", + "error", + "option", + "none", + "some", + "true", + "false", + "null", + "undefined", + "file", + "path", + "name", + "type", + "data", + "info", + "list", + "item", + "node", + "index", + "count", + "size", + "length", + "config", + "settings", + "options", + "input", + "output", + "source", + "target", + "test", + "main", + "init", + "setup", + "todo", + "fixme", + "hack", + "note", + "warning", + "debug", + "trace", + "level", + "mode", + "flag", + "status", + "state", + "cache", + "hook", + "hooks", ]; // ── Helpers ────────────────────────────────────────────────────────────── @@ -943,10 +1278,16 @@ mod tests { let scan = make_scan_result(vec![("index.js", "cool-widget stuff")]); let candidates = parse_package_json(dir.path(), &scan).unwrap(); - let name_candidate = candidates.iter().find(|c| c.suggested_name == "project_name").unwrap(); + let name_candidate = candidates + .iter() + .find(|c| c.suggested_name == "project_name") + .unwrap(); assert_eq!(name_candidate.value, "cool-widget"); - let author_candidate = candidates.iter().find(|c| c.suggested_name == "author").unwrap(); + let author_candidate = candidates + .iter() + .find(|c| c.suggested_name == "author") + .unwrap(); assert_eq!(author_candidate.value, "Bob Smith"); } @@ -978,7 +1319,9 @@ mod tests { let scan = make_scan_result(vec![("main.go", "package main // my-service by acme")]); let candidates = parse_go_mod(dir.path(), &scan).unwrap(); - let project = candidates.iter().find(|c| c.suggested_name == "project_name"); + let project = candidates + .iter() + .find(|c| c.suggested_name == "project_name"); 
assert!(project.is_some()); assert_eq!(project.unwrap().value, "my-service"); @@ -1001,7 +1344,11 @@ mod tests { #[test] fn test_tier2_malformed_cargo_toml() { let dir = tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("Cargo.toml"), "this is not valid toml {{{}}}").unwrap(); + std::fs::write( + dir.path().join("Cargo.toml"), + "this is not valid toml {{{}}}", + ) + .unwrap(); let scan = make_scan_result(vec![]); assert!(parse_cargo_toml(dir.path(), &scan).is_none()); } @@ -1060,7 +1407,11 @@ mod tests { let words = split_into_words(&c.value); words == vec!["data", "pipeline"] }); - assert!(found, "should find data-pipeline cluster, got: {:?}", candidates); + assert!( + found, + "should find data-pipeline cluster, got: {:?}", + candidates + ); } #[test] @@ -1078,16 +1429,17 @@ mod tests { // "async" alone should be filtered for c in &candidates { let lower = c.value.to_lowercase(); - assert!(!LANGUAGE_KEYWORDS.contains(&lower.as_str()) || c.value.contains('-') || c.value.contains('_')); + assert!( + !LANGUAGE_KEYWORDS.contains(&lower.as_str()) + || c.value.contains('-') + || c.value.contains('_') + ); } } #[test] fn test_frequency_filters_short_tokens() { - let scan = make_scan_result(vec![ - ("a.txt", "ab cd ef gh"), - ("b.txt", "ab cd ef gh"), - ]); + let scan = make_scan_result(vec![("a.txt", "ab cd ef gh"), ("b.txt", "ab cd ef gh")]); let covered = HashSet::new(); let config_vals = HashSet::new(); @@ -1109,7 +1461,9 @@ mod tests { let config_vals = HashSet::new(); let candidates = detect_frequency(&scan, &covered, &config_vals, ""); - let has_widget = candidates.iter().any(|c| c.value.to_lowercase().contains("widget")); + let has_widget = candidates + .iter() + .any(|c| c.value.to_lowercase().contains("widget")); assert!(!has_widget, "covered values should be skipped"); } @@ -1295,7 +1649,10 @@ mod tests { let result = auto_detect(&project_dir, &scan); assert!(!result.candidates.is_empty()); - let project_name = result.candidates.iter().find(|c| 
c.suggested_name == "project_name"); + let project_name = result + .candidates + .iter() + .find(|c| c.suggested_name == "project_name"); assert!(project_name.is_some(), "should detect project_name"); assert_eq!(project_name.unwrap().value, "my-widget"); } diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 96e7ecb..8fb9790 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -15,6 +15,7 @@ use inquire::{Confirm, Select, Text}; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; +use self::auto_detect::{auto_detect, DetectedCandidate}; use self::conditional::{detect_conditional_files, patterns_for_variable, DetectedConditional}; use self::config_gen::{ generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, @@ -23,7 +24,6 @@ use self::exclude::{detect_copy_without_render, detect_excludes}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; -use self::auto_detect::{auto_detect, DetectedCandidate}; use self::scan::{scan_project, ScannedFile}; use self::variants::{ computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, @@ -781,8 +781,7 @@ fn confirm_auto_detected_interactive( if candidate.total_occurrences > 0 { eprintln!( " {} occurrences across {} files", - candidate.total_occurrences, - candidate.file_count + candidate.total_occurrences, candidate.file_count ); } diff --git a/src/render/context.rs b/src/render/context.rs index 4680c64..f530022 100644 --- a/src/render/context.rs +++ b/src/render/context.rs @@ -27,18 +27,12 @@ pub fn tera_with_filters() -> Tera { /// /// Splits on the separator (default `-`), lowercases the first word, /// title-cases the rest, and joins them. 
-fn camelcase_filter( - value: &Value, - args: &HashMap, -) -> Result { +fn camelcase_filter(value: &Value, args: &HashMap) -> Result { let s = value .as_str() .ok_or_else(|| tera::Error::msg("camelcase filter requires a string value"))?; - let sep = args - .get("sep") - .and_then(|v| v.as_str()) - .unwrap_or("-"); + let sep = args.get("sep").and_then(|v| v.as_str()).unwrap_or("-"); let words: Vec<&str> = s.split(sep).collect(); if words.is_empty() { diff --git a/tests/integration.rs b/tests/integration.rs index 0310b92..243935c 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -965,10 +965,7 @@ fn test_extract_auto_yes() { let plan = plan_extraction(&options).unwrap(); execute_extraction(&plan, false).unwrap(); - let project_var = plan - .variables - .iter() - .find(|v| v.name == "project_name"); + let project_var = plan.variables.iter().find(|v| v.name == "project_name"); assert!( project_var.is_some(), "should auto-detect project_name, got vars: {:?}", @@ -1050,9 +1047,10 @@ fn test_extract_auto_frequency_fallback() { let plan = plan_extraction(&options).unwrap(); - let has_relevant_var = plan.variables.iter().any(|v| { - v.value.contains("cool") || v.name.contains("cool") - }); + let has_relevant_var = plan + .variables + .iter() + .any(|v| v.value.contains("cool") || v.name.contains("cool")); assert!( has_relevant_var, "should detect cool-widget related variable, got: {:?}", From cc93d6164c177540b1624de47ccfa222258f2044 Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:27:06 -0500 Subject: [PATCH 08/29] fix(extract): resolve merge chains in cluster deduplication --- src/extract/auto_detect.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index b0306d9..115e8c8 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -678,7 +678,19 @@ fn merge_similar_clusters(clusters: &mut HashMap) { } } - for (from, to) in &merge_map { + 
// Resolve merge chains: if A→B and B→C, then A→C + let resolved: HashMap = merge_map + .keys() + .map(|k| { + let mut target = merge_map[k].clone(); + while let Some(next) = merge_map.get(&target) { + target = next.clone(); + } + (k.clone(), target) + }) + .collect(); + + for (from, to) in &resolved { if let Some(removed) = clusters.remove(from) { if let Some(target) = clusters.get_mut(to) { for lit in removed.literals { From bdd420bf24d2c215e80512f076d8fc7ac3ba97df Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:31:37 -0500 Subject: [PATCH 09/29] refactor(extract): use enum for PlannedExtractFile content --- src/commands/extract.rs | 10 ++-- src/extract/mod.rs | 102 +++++++++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/commands/extract.rs b/src/commands/extract.rs index faf0018..93738fe 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -63,15 +63,19 @@ fn print_dry_run(plan: &diecut::extract::ExtractionPlan) { style(plan.output_dir.display()).cyan() ); - let templated: Vec<_> = plan.files.iter().filter(|f| f.has_replacements).collect(); - let copied: Vec<_> = plan.files.iter().filter(|f| !f.has_replacements).collect(); + let templated: Vec<_> = plan.files.iter().filter(|f| f.has_replacements()).collect(); + let copied: Vec<_> = plan + .files + .iter() + .filter(|f| !f.has_replacements()) + .collect(); eprintln!("\nTemplated files ({}):", templated.len()); for file in &templated { eprintln!( " {} ({} replacements)", file.template_path.display(), - file.replacement_count + file.replacement_count() ); } diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 8fb9790..66bc88b 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -39,21 +39,47 @@ pub struct ExtractVariable { pub occurrence_counts: Vec<(String, usize, usize)>, } +/// The content of an extracted template file. 
+#[derive(Debug, Clone)] +pub enum ExtractedContent { + /// A text file with optional template replacements applied. + Text { + content: String, + replacement_count: usize, + }, + /// A binary file copied verbatim. + Binary(Vec), +} + /// A file that will be part of the extracted template. #[derive(Debug, Clone)] pub struct PlannedExtractFile { /// Relative path in the output template (may contain template expressions). pub template_path: PathBuf, - /// Content (with replacements applied), or None for binary files. - pub content: Option, - /// Original bytes for binary files. - pub binary_content: Option>, + /// The file content (text with replacements, or binary bytes). + pub content: ExtractedContent, +} + +impl PlannedExtractFile { /// Whether this file had template replacements applied. - pub has_replacements: bool, - /// Number of replacements made. - pub replacement_count: usize, + pub fn has_replacements(&self) -> bool { + matches!(&self.content, ExtractedContent::Text { replacement_count, .. } if *replacement_count > 0) + } + /// Whether this is a binary file. - pub is_binary: bool, + pub fn is_binary(&self) -> bool { + matches!(&self.content, ExtractedContent::Binary(_)) + } + + /// Number of replacements made (0 for binary files). + pub fn replacement_count(&self) -> usize { + match &self.content { + ExtractedContent::Text { + replacement_count, .. + } => *replacement_count, + ExtractedContent::Binary(_) => 0, + } + } } /// The full extraction plan, ready to be executed or reviewed. 
@@ -273,18 +299,13 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { })?; planned_files.push(PlannedExtractFile { template_path, - content: None, - binary_content: Some(binary_content), - has_replacements: false, - replacement_count: 0, - is_binary: true, + content: ExtractedContent::Binary(binary_content), }); } else if let Some(ref content) = file.content { let (replaced, count) = apply_replacements(content, &rules); - let has_replacements = count > 0; // Add .die suffix if file has template replacements - let final_path = if has_replacements { + let final_path = if count > 0 { let mut p = template_path.as_os_str().to_string_lossy().to_string(); p.push_str(DEFAULT_TEMPLATES_SUFFIX); PathBuf::from(p) @@ -294,11 +315,10 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { planned_files.push(PlannedExtractFile { template_path: final_path, - content: Some(replaced), - binary_content: None, - has_replacements, - replacement_count: count, - is_binary: false, + content: ExtractedContent::Text { + content: replaced, + replacement_count: count, + }, }); } } @@ -413,22 +433,28 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> })?; } - if let Some(ref content) = file.content { - std::fs::write(&dest, content).map_err(|e| DicecutError::Io { - context: format!("writing file {}", dest.display()), - source: e, - })?; - if file.has_replacements { - rendered_count += 1; - } else { + match &file.content { + ExtractedContent::Text { + content, + replacement_count, + } => { + std::fs::write(&dest, content).map_err(|e| DicecutError::Io { + context: format!("writing file {}", dest.display()), + source: e, + })?; + if *replacement_count > 0 { + rendered_count += 1; + } else { + copied_count += 1; + } + } + ExtractedContent::Binary(bytes) => { + std::fs::write(&dest, bytes).map_err(|e| DicecutError::Io { + context: format!("writing binary file {}", dest.display()), + source: e, + })?; copied_count += 1; } - } else if let Some(ref 
bytes) = file.binary_content { - std::fs::write(&dest, bytes).map_err(|e| DicecutError::Io { - context: format!("writing binary file {}", dest.display()), - source: e, - })?; - copied_count += 1; } } @@ -838,9 +864,9 @@ fn confirm_auto_detected_interactive( } fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { - let templated: Vec<_> = files.iter().filter(|f| f.has_replacements).collect(); - let copied: Vec<_> = files.iter().filter(|f| !f.has_replacements).collect(); - let binary_count = files.iter().filter(|f| f.is_binary).count(); + let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); + let copied: Vec<_> = files.iter().filter(|f| !f.has_replacements()).collect(); + let binary_count = files.iter().filter(|f| f.is_binary()).count(); eprintln!( "\n{} Files to template {}", @@ -855,7 +881,7 @@ fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { eprintln!( " {:<40} {} replacements", file.template_path.display(), - file.replacement_count + file.replacement_count() ); } From 917ea7706c1cd4dacb89f69ee2779458667d2910 Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:32:23 -0500 Subject: [PATCH 10/29] fix(extract): replace partial_cmp().unwrap() with total_cmp() for NaN safety --- src/extract/auto_detect.rs | 4 ++-- src/extract/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 115e8c8..3b38c34 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -97,7 +97,7 @@ pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectRe deduplicate_candidates(&mut candidates); // Sort by confidence descending - candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + candidates.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); AutoDetectResult { candidates } } @@ -614,7 +614,7 @@ fn detect_frequency( } // Sort by confidence, take top 5 - 
freq_candidates.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + freq_candidates.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); freq_candidates.truncate(5); freq_candidates diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 66bc88b..150f08c 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -734,7 +734,7 @@ fn resolve_candidates_yes( } // For name collisions, pick highest confidence - group.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); let winner = group[0]; eprintln!( @@ -791,7 +791,7 @@ fn confirm_auto_detected_interactive( } // Sort by confidence descending - group.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap()); + group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); if group.len() == 1 { // Single candidate — simple confirm From a35eb2fd58ce5fb7142a729f14b513c6e53c3bc9 Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:35:49 -0500 Subject: [PATCH 11/29] fix(extract): use dedicated error variant for malformed --var arguments --- src/commands/extract.rs | 2 +- src/error.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 93738fe..04c23b3 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -45,7 +45,7 @@ fn parse_vars(vars: &[String]) -> diecut::error::Result> { for var in vars { let (key, value) = var .split_once('=') - .ok_or_else(|| DicecutError::ExtractNoVariables)?; + .ok_or_else(|| DicecutError::ExtractInvalidVar { input: var.clone() })?; parsed.push((key.trim().to_string(), value.trim().to_string())); } diff --git a/src/error.rs b/src/error.rs index a612908..3ad5597 100644 --- a/src/error.rs +++ b/src/error.rs @@ -128,6 +128,10 @@ pub enum DicecutError { ))] ExtractNoVariables, + #[error("Invalid --var argument: {input} (expected key=value)")] + #[diagnostic(help("Use --var key=value format, e.g., 
--var project_name=my-app"))] + ExtractInvalidVar { input: String }, + #[error("Output directory already exists: {path}")] #[diagnostic(help( "Choose a different output path with -o, or remove the existing directory" From 9bb5be9425b01bb8b0d3949dc67470bf1aaee9d9 Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:35:46 -0500 Subject: [PATCH 12/29] fix(extract): disable git terminal prompts during auto-detection --- src/extract/auto_detect.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 3b38c34..3224e46 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -454,6 +454,7 @@ fn git_config_get(project_dir: &Path, key: &str) -> Option { .arg("--get") .arg(key) .current_dir(project_dir) + .env("GIT_TERMINAL_PROMPT", "0") .output() .ok()?; From b490f8c5dd79b6bdec43ef1bdbb1b42d7763840f Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:35:48 -0500 Subject: [PATCH 13/29] refactor(extract): consolidate duplicate count_occurrences functions --- src/extract/auto_detect.rs | 2 +- src/extract/mod.rs | 34 +++------------------------------- 2 files changed, 4 insertions(+), 32 deletions(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 3224e46..dfec675 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -1130,7 +1130,7 @@ const STOPWORDS: &[&str] = &[ // ── Helpers ────────────────────────────────────────────────────────────── -fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize) { +pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize) { let mut file_count = 0; let mut total = 0; diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 150f08c..4388207 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -15,7 +15,7 @@ use inquire::{Confirm, Select, Text}; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use 
self::auto_detect::{auto_detect, DetectedCandidate}; +use self::auto_detect::{auto_detect, count_occurrences, DetectedCandidate}; use self::conditional::{detect_conditional_files, patterns_for_variable, DetectedConditional}; use self::config_gen::{ generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, @@ -24,7 +24,7 @@ use self::exclude::{detect_copy_without_render, detect_excludes}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; -use self::scan::{scan_project, ScannedFile}; +use self::scan::scan_project; use self::variants::{ computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, }; @@ -207,8 +207,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { let mut occurrence_counts = Vec::new(); for variant in &all_variants { - let (file_count, total_hits) = - count_variant_occurrences(&variant.literal, &scan_result.files); + let (file_count, total_hits) = count_occurrences(&variant.literal, &scan_result); occurrence_counts.push((variant.name.to_string(), file_count, total_hits)); } @@ -517,33 +516,6 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> // ── Interactive helpers ────────────────────────────────────────────────── -fn count_variant_occurrences(literal: &str, files: &[ScannedFile]) -> (usize, usize) { - let mut file_count = 0; - let mut total_hits = 0; - - for file in files { - if let Some(ref content) = file.content { - let hits = content.matches(literal).count(); - if hits > 0 { - file_count += 1; - total_hits += hits; - } - } - } - - // Also check path components - for file in files { - let path_str = file.relative_path.to_string_lossy(); - let hits = path_str.matches(literal).count(); - if hits > 0 { - // Don't double-count file_count if already counted from content - total_hits += hits; - } - } - - (file_count, total_hits) -} - fn confirm_variants_interactive(variables: 
Vec) -> Result> { let mut confirmed = Vec::new(); From c163f04a5f1f1e9437832d82c9cc09785ede1610 Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 21:35:58 -0500 Subject: [PATCH 14/29] perf(extract): use LazyLock for Regex compilation Replace Regex::new() calls inside function bodies with std::sync::LazyLock statics so regexes are compiled once instead of on every invocation. Bumps MSRV to 1.80. --- Cargo.toml | 2 +- src/extract/auto_detect.rs | 19 ++++++++++++------- src/extract/variants.rs | 8 ++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 64ef434..ecca649 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ name = "diecut" version = "0.3.4" edition = "2021" license = "MIT" -rust-version = "1.75" +rust-version = "1.80" description = "A single binary project template generator" [lib] diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index dfec675..a9a2612 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -1,9 +1,19 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use std::process::Command; +use std::sync::LazyLock; use regex_lite::Regex; +static GO_MOD_RE: LazyLock = LazyLock::new(|| Regex::new(r"^module\s+(\S+)").unwrap()); + +static TOKEN_RE: LazyLock = LazyLock::new(|| { + Regex::new( + r"[a-zA-Z][a-zA-Z0-9]*(?:[-_.][a-zA-Z0-9]+)+|[A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+(?:[A-Z][a-z]+)+|[A-Z]{2,}(?:_[A-Z]{2,})+", + ) + .unwrap() +}); + use super::scan::ScanResult; use super::variants::split_into_words; @@ -359,8 +369,7 @@ fn parse_go_mod(project_dir: &Path, scan_result: &ScanResult) -> Option = module_path.split('/').collect(); @@ -501,16 +510,12 @@ fn detect_frequency( dir_name: &str, ) -> Vec { // Tokenize all text file content - let token_re = Regex::new( - r"[a-zA-Z][a-zA-Z0-9]*(?:[-_.][a-zA-Z0-9]+)+|[A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+(?:[A-Z][a-z]+)+|[A-Z]{2,}(?:_[A-Z]{2,})+" - ).unwrap(); - let mut token_file_map: HashMap> = 
HashMap::new(); let mut token_counts: HashMap = HashMap::new(); for (file_idx, file) in scan_result.files.iter().enumerate() { if let Some(ref content) = file.content { - for mat in token_re.find_iter(content) { + for mat in TOKEN_RE.find_iter(content) { let token = mat.as_str().to_string(); token_file_map .entry(token.clone()) diff --git a/src/extract/variants.rs b/src/extract/variants.rs index 8458b29..525b475 100644 --- a/src/extract/variants.rs +++ b/src/extract/variants.rs @@ -1,5 +1,10 @@ +use std::sync::LazyLock; + use regex_lite::Regex; +static CAMEL_SPLIT_RE: LazyLock = + LazyLock::new(|| Regex::new(r"[A-Z][a-z]*|[a-z]+|[0-9]+").unwrap()); + /// A case variant of a variable value, with its literal text and Tera expression. #[derive(Debug, Clone, PartialEq)] pub struct CaseVariant { @@ -26,8 +31,7 @@ pub fn split_into_words(value: &str) -> Vec { } // camelCase / PascalCase splitting - let re = Regex::new(r"[A-Z][a-z]*|[a-z]+|[0-9]+").unwrap(); - let words: Vec = re + let words: Vec = CAMEL_SPLIT_RE .find_iter(value) .map(|m| m.as_str().to_lowercase()) .collect(); From 761fcd582d1cfc0be347970e4bf00e2338687e2b Mon Sep 17 00:00:00 2001 From: rroskam Date: Fri, 27 Feb 2026 22:49:31 -0500 Subject: [PATCH 15/29] fix(extract): address code audit findings - Guard against infinite loop in merge-chain resolution by tracking visited nodes when walking the merge map - Count path-only occurrences in file_count so confidence scoring doesn't miss values that appear only in file paths - Rewrite apply_replacements as a single-pass algorithm that collects all match positions first, preventing later rules from corrupting Tera expressions inserted by earlier rules - Propagate IO errors (e.g. 
permission denied) from scan instead of silently dropping unreadable files; only downgrade to binary on InvalidData (UTF-8 decode failure) --- src/extract/auto_detect.rs | 18 +++++- src/extract/replace.rs | 121 ++++++++++++++++++------------------- src/extract/scan.rs | 17 ++++-- 3 files changed, 88 insertions(+), 68 deletions(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index a9a2612..0777fb9 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -685,11 +685,17 @@ fn merge_similar_clusters(clusters: &mut HashMap) { } // Resolve merge chains: if A→B and B→C, then A→C + // Use a visited set to guard against cycles. let resolved: HashMap = merge_map .keys() .map(|k| { let mut target = merge_map[k].clone(); + let mut seen = HashSet::new(); + seen.insert(k.clone()); while let Some(next) = merge_map.get(&target) { + if !seen.insert(next.clone()) { + break; + } target = next.clone(); } (k.clone(), target) @@ -1140,17 +1146,25 @@ pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize let mut total = 0; for file in &scan_result.files { + let mut counted_file = false; + if let Some(ref content) = file.content { let hits = content.matches(value).count(); if hits > 0 { file_count += 1; + counted_file = true; total += hits; } } - // Also check path + let path_str = file.relative_path.to_string_lossy(); let path_hits = path_str.matches(value).count(); - total += path_hits; + if path_hits > 0 { + total += path_hits; + if !counted_file { + file_count += 1; + } + } } (file_count, total) diff --git a/src/extract/replace.rs b/src/extract/replace.rs index 42914ec..95e36ad 100644 --- a/src/extract/replace.rs +++ b/src/extract/replace.rs @@ -27,87 +27,86 @@ fn is_word_char(c: char) -> bool { c.is_alphanumeric() || c == '_' || c == '-' } -/// Replace `literal` in `text` only at word boundaries. +/// Apply replacement rules to a string, longest-match-first, in a single pass. 
/// -/// A match is at a word boundary when the characters immediately before and -/// after the match are not word-like (alphanumeric, `_`, or `-`), or the -/// match is at the start/end of the string. +/// All match positions are identified first against the original text, then +/// applied in one pass so that replacement output is never re-scanned by later +/// rules. Uses word-boundary-aware matching to prevent replacing substrings +/// inside longer words (e.g., "app" inside "application"). /// -/// Multi-word literals (containing a separator like `-`, `_`, or `.`) always -/// use boundary-aware replacement since false positives are unlikely but still -/// possible in paths and compound tokens. -fn replace_whole_word(text: &str, literal: &str, replacement: &str) -> (String, usize) { - let literal_len = literal.len(); - let text_len = text.len(); - - if literal_len == 0 || text_len < literal_len { - return (text.to_string(), 0); - } - - let mut result = String::with_capacity(text.len()); - let mut count = 0; - let mut start = 0; - - while start <= text_len - literal_len { - match text[start..].find(literal) { - Some(pos) => { - let match_start = start + pos; - let match_end = match_start + literal_len; - - let ok_before = match_start == 0 - || !is_word_char(text[..match_start].chars().next_back().unwrap()); - let ok_after = match_end == text_len - || !is_word_char(text[match_end..].chars().next().unwrap()); - - if ok_before && ok_after { - result.push_str(&text[start..match_start]); - result.push_str(replacement); - count += 1; - start = match_end; - } else { - // Not a word boundary — advance past the start of this match +/// Returns the modified string and the number of replacements made. +pub fn apply_replacements(content: &str, rules: &[ReplacementRule]) -> (String, usize) { + if rules.is_empty() { + return (content.to_string(), 0); + } + + // Collect all (start, end, replacement_index) matches across all rules. 
+ let mut matches: Vec<(usize, usize, usize)> = Vec::new(); + + for (rule_idx, rule) in rules.iter().enumerate() { + if rule.literal.is_empty() { + continue; + } + let literal = &rule.literal; + let literal_len = literal.len(); + let text_len = content.len(); + + if text_len < literal_len { + continue; + } + + let mut start = 0; + while start <= text_len - literal_len { + match content[start..].find(literal) { + Some(pos) => { + let match_start = start + pos; + let match_end = match_start + literal_len; + + let ok_before = match_start == 0 + || !is_word_char(content[..match_start].chars().next_back().unwrap()); + let ok_after = match_end == text_len + || !is_word_char(content[match_end..].chars().next().unwrap()); + + if ok_before && ok_after { + matches.push((match_start, match_end, rule_idx)); + } + let next = match_start - + text[match_start..] + + content[match_start..] .char_indices() .nth(1) .map(|(i, _)| i) .unwrap_or(1); - result.push_str(&text[start..next]); start = next; } + None => break, } - None => break, } } - result.push_str(&text[start..]); - (result, count) -} - -/// Apply replacement rules to a string, longest-match-first. -/// -/// Uses word-boundary-aware matching to prevent replacing substrings -/// inside longer words (e.g., "app" inside "application"). -/// -/// Returns the modified string and the number of replacements made. -pub fn apply_replacements(content: &str, rules: &[ReplacementRule]) -> (String, usize) { - if rules.is_empty() { + if matches.is_empty() { return (content.to_string(), 0); } - let mut result = content.to_string(); + // Sort by start position; on tie, prefer the longer match (lower rule index + // already means longer literal due to build_replacement_rules sorting). + matches.sort_by(|a, b| a.0.cmp(&b.0).then(b.1.cmp(&a.1))); + + // Greedily select non-overlapping matches. 
+ let mut result = String::with_capacity(content.len()); let mut total_count = 0; + let mut cursor = 0; - for rule in rules { - if rule.literal.is_empty() { - continue; - } - let (replaced, count) = replace_whole_word(&result, &rule.literal, &rule.replacement); - if count > 0 { - result = replaced; - total_count += count; + for (m_start, m_end, rule_idx) in &matches { + if *m_start < cursor { + continue; // overlaps with a previously accepted match } + result.push_str(&content[cursor..*m_start]); + result.push_str(&rules[*rule_idx].replacement); + total_count += 1; + cursor = *m_end; } + result.push_str(&content[cursor..]); (result, total_count) } diff --git a/src/extract/scan.rs b/src/extract/scan.rs index 278fd75..bd5bb6f 100644 --- a/src/extract/scan.rs +++ b/src/extract/scan.rs @@ -64,13 +64,20 @@ pub fn scan_project(project_dir: &Path, excludes: &[String]) -> crate::error::Re } let absolute_path = entry.path().to_path_buf(); - let is_binary = is_binary_file(&absolute_path); - let content = if is_binary { - None + let (is_binary, content) = if is_binary_file(&absolute_path) { + (true, None) } else { - // If we can't read as UTF-8, treat as binary - std::fs::read_to_string(&absolute_path).ok() + match std::fs::read_to_string(&absolute_path) { + Ok(s) => (false, Some(s)), + Err(e) if e.kind() == std::io::ErrorKind::InvalidData => (true, None), + Err(e) => { + return Err(crate::error::DicecutError::Io { + context: format!("reading file {}", absolute_path.display()), + source: e, + }); + } + } }; files.push(ScannedFile { From fb22906355eda9bb9b680438fbb6d24682630409 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 11:07:57 -0500 Subject: [PATCH 16/29] fix(extract): handle nested excludes and symlinks to directories detect_excludes only checked if exclude patterns existed at the project root, missing patterns like node_modules at deeper levels (e.g. docs/node_modules/). 
Always include all DEFAULT_EXCLUDES since should_exclude already handles nested matching via path components. Also skip symlinks that resolve to directories during scan. pnpm's node_modules/.pnpm uses symlinks to directories, and walkdir reports these as non-directory entries, causing read_to_string to fail with "Is a directory". --- src/extract/exclude.rs | 26 +++++++------------------- src/extract/scan.rs | 30 +++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index 8c4c082..8f3254b 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -62,22 +62,11 @@ const DEFAULT_COPY_WITHOUT_RENDER: &[&str] = &[ ]; /// Detect which default exclude patterns actually exist in the project. -pub fn detect_excludes(project_dir: &Path) -> Vec { - let mut found = Vec::new(); - - for pattern in DEFAULT_EXCLUDES { - let clean = pattern.trim_end_matches('/'); - // Skip glob patterns — they're always included - if clean.contains('*') { - found.push(pattern.to_string()); - continue; - } - if project_dir.join(clean).exists() { - found.push(pattern.to_string()); - } - } - - found +/// +/// All DEFAULT_EXCLUDES are always included because patterns like `node_modules` +/// can appear at any depth (e.g. `docs/node_modules/`), not just the project root. +pub fn detect_excludes(_project_dir: &Path) -> Vec { + DEFAULT_EXCLUDES.iter().map(|s| s.to_string()).collect() } /// Detect which copy-without-render patterns are relevant based on files present. 
@@ -191,14 +180,13 @@ mod tests { #[test] fn test_detect_excludes() { let dir = tempfile::tempdir().unwrap(); - std::fs::create_dir(dir.path().join(".git")).unwrap(); - std::fs::write(dir.path().join(".DS_Store"), "").unwrap(); let found = detect_excludes(dir.path()); + // All DEFAULT_EXCLUDES are always included regardless of what exists on disk assert!(found.iter().any(|e| e.contains(".git"))); assert!(found.iter().any(|e| e == ".DS_Store")); - // Glob patterns should always be included assert!(found.iter().any(|e| e == "*.pyc")); + assert!(found.iter().any(|e| e.contains("node_modules"))); } #[test] diff --git a/src/extract/scan.rs b/src/extract/scan.rs index bd5bb6f..544aa87 100644 --- a/src/extract/scan.rs +++ b/src/extract/scan.rs @@ -47,10 +47,14 @@ pub fn scan_project(project_dir: &Path, excludes: &[String]) -> crate::error::Re .unwrap_or_else(|| std::io::Error::other("walkdir error")), })?; - // Skip directories themselves (we only care about files) + // Skip directories (including symlinks to directories, e.g. pnpm's + // node_modules/.pnpm uses symlinks that point to directories). 
if entry.file_type().is_dir() { continue; } + if entry.path_is_symlink() && entry.path().is_dir() { + continue; + } let relative_path = entry .path() @@ -124,6 +128,30 @@ mod tests { assert_eq!(result.files[0].relative_path, PathBuf::from("README.md")); } + #[cfg(unix)] + #[test] + fn test_scan_project_skips_symlinks_to_directories() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write(dir.path().join("real.txt"), "hello").unwrap(); + + // Create a subdirectory and a symlink pointing to it + let subdir = dir.path().join("subdir"); + std::fs::create_dir(&subdir).unwrap(); + std::fs::write(subdir.join("nested.txt"), "nested").unwrap(); + std::os::unix::fs::symlink(&subdir, dir.path().join("link-to-dir")).unwrap(); + + let result = scan_project(dir.path(), &[]).unwrap(); + // Should find real.txt and subdir/nested.txt, but NOT choke on link-to-dir + let paths: Vec = result + .files + .iter() + .map(|f| f.relative_path.to_string_lossy().to_string()) + .collect(); + assert!(paths.contains(&"real.txt".to_string())); + assert!(paths.contains(&"subdir/nested.txt".to_string())); + assert!(!paths.iter().any(|p| p.contains("link-to-dir"))); + } + #[test] fn test_scan_project_binary_detection() { let dir = tempfile::tempdir().unwrap(); From e28b004dbd49a2115972f4304e7219dd9fc7b619 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 11:25:15 -0500 Subject: [PATCH 17/29] fix(extract): exclude .worktrees/ from template extraction Git worktrees are working copies, not part of the project source. Without this, extract would template duplicate files from any active worktrees in the project. 
--- src/extract/exclude.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index 8f3254b..b86752f 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -28,6 +28,7 @@ const DEFAULT_EXCLUDES: &[&str] = &[ ".nuxt", ".output", ".turbo", + ".worktrees", ".diecut-answers.toml", ]; From 5cc96722a376903f7f0ebce80267da79b75b7909 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 12:56:35 -0500 Subject: [PATCH 18/29] feat(extract): stub content files instead of copying verbatim Classify text files with 0 template replacements as boilerplate (config, dotfiles, CI) or content (prose, source). Boilerplate is copied in full; content files are stubbed to minimal placeholders so templates preserve structure without project-specific prose. Interactive confirmation now shows three categories: Templated, Boilerplate, and Stubbed. --- src/extract/mod.rs | 146 +++++++++++++++++++++++--------- src/extract/stub.rs | 199 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 38 deletions(-) create mode 100644 src/extract/stub.rs diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 4388207..9f0c8e4 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -4,6 +4,7 @@ pub mod config_gen; pub mod exclude; pub mod replace; pub mod scan; +pub mod stub; pub mod variants; use std::collections::{BTreeMap, HashMap}; @@ -20,11 +21,12 @@ use self::conditional::{detect_conditional_files, patterns_for_variable, Detecte use self::config_gen::{ generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, }; -use self::exclude::{detect_copy_without_render, detect_excludes}; +use self::exclude::{all_default_excludes, detect_copy_without_render, relevant_config_excludes}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; use self::scan::scan_project; +use self::stub::{classify_file, generate_stub, 
FileRole}; use self::variants::{ computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, }; @@ -58,6 +60,8 @@ pub struct PlannedExtractFile { pub template_path: PathBuf, /// The file content (text with replacements, or binary bytes). pub content: ExtractedContent, + /// Whether this file was stubbed (content replaced with a minimal placeholder). + pub stubbed: bool, } impl PlannedExtractFile { @@ -144,15 +148,15 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { }); } - // Phase 1: Detect excludes - let mut excludes = detect_excludes(source_dir); + // Phase 1: All default excludes for scanning (safety — never walks into .git/, node_modules/, etc.) + let scan_excludes = all_default_excludes(); // Phase 2: Scan project eprintln!( "\n{}", style(format!("Scanning {}...", source_dir.display())).bold() ); - let scan_result = scan_project(source_dir, &excludes)?; + let scan_result = scan_project(source_dir, &scan_excludes)?; eprintln!( " {} files found, {} excluded", scan_result.files.len(), @@ -245,11 +249,6 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { confirm_variants_interactive(extract_variables)? 
}; - // Phase 5: Interactive exclude confirmation - if !options.yes { - excludes = confirm_excludes_interactive(excludes)?; - } - // Phase 6: Detect conditional files let detected_conditionals = if options.yes { vec![] // Batch mode: no conditional files @@ -299,29 +298,64 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { planned_files.push(PlannedExtractFile { template_path, content: ExtractedContent::Binary(binary_content), + stubbed: false, }); } else if let Some(ref content) = file.content { let (replaced, count) = apply_replacements(content, &rules); - // Add .die suffix if file has template replacements - let final_path = if count > 0 { + if count > 0 { + // Has template replacements — keep content, add .die suffix let mut p = template_path.as_os_str().to_string_lossy().to_string(); p.push_str(DEFAULT_TEMPLATES_SUFFIX); - PathBuf::from(p) + planned_files.push(PlannedExtractFile { + template_path: PathBuf::from(p), + content: ExtractedContent::Text { + content: replaced, + replacement_count: count, + }, + stubbed: false, + }); } else { - template_path - }; - - planned_files.push(PlannedExtractFile { - template_path: final_path, - content: ExtractedContent::Text { - content: replaced, - replacement_count: count, - }, - }); + // No replacements — classify as boilerplate or content + match classify_file(&file.relative_path) { + FileRole::Boilerplate => { + planned_files.push(PlannedExtractFile { + template_path, + content: ExtractedContent::Text { + content: replaced, + replacement_count: 0, + }, + stubbed: false, + }); + } + FileRole::Content => { + let stub = generate_stub(&file.relative_path); + planned_files.push(PlannedExtractFile { + template_path, + content: ExtractedContent::Text { + content: stub, + replacement_count: 0, + }, + stubbed: true, + }); + } + } + } } } + // Phase 9.5: Compute config-appropriate excludes from planned template files + // Only patterns that match files actually in the template are worth writing to diecut.toml + let 
template_paths: Vec = planned_files + .iter() + .map(|f| f.template_path.clone()) + .collect(); + let mut config_excludes = relevant_config_excludes(&template_paths); + + if !options.yes { + config_excludes = confirm_excludes_interactive(config_excludes)?; + } + // Phase 10: Interactive file confirmation if !options.yes { confirm_files_interactive(&planned_files)?; @@ -390,7 +424,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { .unwrap_or_else(|| "template".to_string()), prompted_variables: prompted_vars, computed_variables: computed_vars, - exclude_patterns: excludes.clone(), + exclude_patterns: config_excludes.clone(), copy_without_render: copy_without_render.clone(), conditional_entries: conditional_entries.clone(), }); @@ -401,7 +435,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { config_toml, variables: confirmed_variables, conditional_entries, - exclude_patterns: excludes, + exclude_patterns: config_excludes, copy_without_render, }) } @@ -420,6 +454,7 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> // Write template files let mut rendered_count = 0; let mut copied_count = 0; + let mut stubbed_count = 0; for file in &plan.files { let dest = template_dir.join(&file.template_path); @@ -443,6 +478,8 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> })?; if *replacement_count > 0 { rendered_count += 1; + } else if file.stubbed { + stubbed_count += 1; } else { copied_count += 1; } @@ -500,8 +537,8 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> computed_count ); eprintln!( - " {} files templated, {} files copied", - rendered_count, copied_count + " {} files templated, {} files copied, {} files stubbed", + rendered_count, copied_count, stubbed_count ); if !plan.conditional_entries.is_empty() { eprintln!( @@ -628,12 +665,16 @@ fn confirm_excludes_interactive(mut excludes: Vec) -> Result style("──").dim(), 
style("─────────────────────────────────────────────").dim() ); - eprintln!(" Auto-detected:"); - for e in &excludes { - eprintln!(" {}", e); + if excludes.is_empty() { + eprintln!(" No exclude patterns needed for this template."); + } else { + eprintln!(" Patterns matching template files:"); + for e in &excludes { + eprintln!(" {}", e); + } } - let extra = Text::new("Add any others? (comma-separated, enter to accept)") + let extra = Text::new("Add extra exclude patterns? (comma-separated, enter to skip)") .with_default("") .prompt() .map_err(|_| DicecutError::PromptCancelled)?; @@ -837,31 +878,60 @@ fn confirm_auto_detected_interactive( fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); - let copied: Vec<_> = files.iter().filter(|f| !f.has_replacements()).collect(); + let boilerplate: Vec<_> = files + .iter() + .filter(|f| !f.has_replacements() && !f.stubbed && !f.is_binary()) + .collect(); + let stubbed: Vec<_> = files.iter().filter(|f| f.stubbed).collect(); let binary_count = files.iter().filter(|f| f.is_binary()).count(); eprintln!( - "\n{} Files to template {}", + "\n{} File plan {}", style("──").dim(), - style("────────────────────────────────────").dim() + style("──────────────────────────────────────────").dim() ); + + // Templated files eprintln!( - " Will get {} suffix (template replacements made):", + "\n {} ({} files, {} suffix):", + style("Templated").bold(), + templated.len(), DEFAULT_TEMPLATES_SUFFIX ); for file in &templated { eprintln!( - " {:<40} {} replacements", + " {:<50} {} replacements", file.template_path.display(), file.replacement_count() ); } + // Boilerplate files eprintln!( - "\n Copied verbatim: {} files (including {} binary)", - copied.len(), - binary_count + "\n {} (copied in full, {} files{}):", + style("Boilerplate").bold(), + boilerplate.len() + binary_count, + if binary_count > 0 { + format!(", {} binary", binary_count) + } 
else { + String::new() + } ); + for file in &boilerplate { + eprintln!(" {}", file.template_path.display()); + } + + // Stubbed files + if !stubbed.is_empty() { + eprintln!( + "\n {} (structure only, {} files):", + style("Stubbed").bold(), + stubbed.len() + ); + for file in &stubbed { + eprintln!(" {}", file.template_path.display()); + } + } let proceed = Confirm::new("Proceed?") .with_default(true) diff --git a/src/extract/stub.rs b/src/extract/stub.rs new file mode 100644 index 0000000..cf19e19 --- /dev/null +++ b/src/extract/stub.rs @@ -0,0 +1,199 @@ +use std::path::Path; + +/// Whether a file is boilerplate (copy in full) or content (stub to placeholder). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FileRole { + /// Config, dotfiles, CI — copy verbatim into the template. + Boilerplate, + /// Prose, docs, source — stub to minimal placeholder. + Content, +} + +/// Filenames (case-insensitive) that are always boilerplate. +const BOILERPLATE_FILENAMES: &[&str] = &[ + ".gitignore", + ".gitattributes", + ".editorconfig", + ".prettierrc", + ".npmrc", + ".nvmrc", + ".gitkeep", + "makefile", + "dockerfile", + "justfile", + "license", + "licence", + "procfile", +]; + +/// Extensions (case-insensitive, without dot) that are always boilerplate. +const BOILERPLATE_EXTENSIONS: &[&str] = &[ + "toml", "yaml", "yml", "json", "jsonc", "json5", "xml", "sh", "bash", "zsh", "bat", "cmd", + "ps1", "cfg", "ini", "conf", +]; + +/// Directory prefixes — files under these dirs are boilerplate. +const BOILERPLATE_DIR_PREFIXES: &[&str] = &[".github/", ".gitlab/", ".circleci/", ".vscode/"]; + +/// Classify a file as boilerplate or content based on its relative path. +/// +/// Only called for text files with 0 template replacements. 
+pub fn classify_file(path: &Path) -> FileRole { + let path_str = path.to_string_lossy(); + + // Check directory prefix + for prefix in BOILERPLATE_DIR_PREFIXES { + if path_str.starts_with(prefix) { + return FileRole::Boilerplate; + } + } + + // Check filename (case-insensitive) + if let Some(filename) = path.file_name().and_then(|n| n.to_str()) { + let lower = filename.to_lowercase(); + if BOILERPLATE_FILENAMES.contains(&lower.as_str()) { + return FileRole::Boilerplate; + } + } + + // Check extension (case-insensitive) + if let Some(ext) = path.extension().and_then(|e| e.to_str()) { + let lower = ext.to_lowercase(); + if BOILERPLATE_EXTENSIONS.contains(&lower.as_str()) { + return FileRole::Boilerplate; + } + } + + FileRole::Content +} + +/// Generate a minimal stub for a content file. +/// +/// - `.md` files get `# {Title}\n` where Title is derived from the filename. +/// - Everything else gets an empty string. +pub fn generate_stub(path: &Path) -> String { + let is_md = path + .extension() + .and_then(|e| e.to_str()) + .is_some_and(|e| e.eq_ignore_ascii_case("md")); + + if is_md { + let title = path + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("Untitled"); + // Title-case: capitalize first letter, leave rest as-is + let title = title_case(title); + format!("# {title}\n") + } else { + String::new() + } +} + +/// Convert a filename stem like "craft" or "SKILL" into title case. +/// +/// Splits on `-` and `_`, capitalizes each word's first letter. 
+fn title_case(s: &str) -> String { + s.split(['-', '_']) + .filter(|w| !w.is_empty()) + .map(|word| { + let mut chars = word.chars(); + match chars.next() { + Some(first) => { + let rest: String = chars.collect::().to_lowercase(); + format!("{}{rest}", first.to_uppercase()) + } + None => String::new(), + } + }) + .collect::>() + .join(" ") +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + // ── classify_file ──────────────────────────────────────────────── + + #[rstest] + #[case(".gitignore", FileRole::Boilerplate)] + #[case(".editorconfig", FileRole::Boilerplate)] + #[case("Makefile", FileRole::Boilerplate)] + #[case("Dockerfile", FileRole::Boilerplate)] + #[case("LICENSE", FileRole::Boilerplate)] + #[case("Procfile", FileRole::Boilerplate)] + fn classify_boilerplate_filenames(#[case] filename: &str, #[case] expected: FileRole) { + assert_eq!(classify_file(Path::new(filename)), expected); + } + + #[rstest] + #[case("Cargo.toml", FileRole::Boilerplate)] + #[case("config.yaml", FileRole::Boilerplate)] + #[case("settings.yml", FileRole::Boilerplate)] + #[case("package.json", FileRole::Boilerplate)] + #[case("tsconfig.json", FileRole::Boilerplate)] + #[case("setup.cfg", FileRole::Boilerplate)] + #[case("build.sh", FileRole::Boilerplate)] + #[case("deploy.ps1", FileRole::Boilerplate)] + #[case("app.conf", FileRole::Boilerplate)] + fn classify_boilerplate_extensions(#[case] filename: &str, #[case] expected: FileRole) { + assert_eq!(classify_file(Path::new(filename)), expected); + } + + #[rstest] + #[case(".github/workflows/ci.yml", FileRole::Boilerplate)] + #[case(".github/CODEOWNERS", FileRole::Boilerplate)] + #[case(".gitlab/ci/deploy.yml", FileRole::Boilerplate)] + #[case(".circleci/config.yml", FileRole::Boilerplate)] + #[case(".vscode/settings.json", FileRole::Boilerplate)] + fn classify_boilerplate_directories(#[case] path: &str, #[case] expected: FileRole) { + assert_eq!(classify_file(Path::new(path)), expected); + } + + #[rstest] + 
#[case("README.md")] + #[case("docs/guide.md")] + #[case("src/main.rs")] + #[case("src/lib.py")] + #[case("index.html")] + #[case("app.css")] + #[case("skills/convention-mining/SKILL.md")] + fn classify_content(#[case] path: &str) { + assert_eq!(classify_file(Path::new(path)), FileRole::Content); + } + + // ── generate_stub ──────────────────────────────────────────────── + + #[rstest] + #[case("README.md", "# Readme\n")] + #[case("craft.md", "# Craft\n")] + #[case("SKILL.md", "# Skill\n")] + #[case("getting-started.md", "# Getting Started\n")] + #[case("my_notes.md", "# My Notes\n")] + fn stub_md_files(#[case] filename: &str, #[case] expected: &str) { + assert_eq!(generate_stub(Path::new(filename)), expected); + } + + #[rstest] + #[case("src/main.rs")] + #[case("index.html")] + #[case("app.css")] + #[case("data.txt")] + fn stub_non_md_files(#[case] filename: &str) { + assert_eq!(generate_stub(Path::new(filename)), ""); + } + + // ── title_case ─────────────────────────────────────────────────── + + #[rstest] + #[case("craft", "Craft")] + #[case("SKILL", "Skill")] + #[case("getting-started", "Getting Started")] + #[case("my_notes", "My Notes")] + #[case("README", "Readme")] + fn test_title_case(#[case] input: &str, #[case] expected: &str) { + assert_eq!(title_case(input), expected); + } +} From fc75d152439210180a2fbb89915d2d8a4eb743d6 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 12:57:38 -0500 Subject: [PATCH 19/29] fix(extract): commit missing exclude.rs refactor The rename of detect_excludes to all_default_excludes and the new relevant_config_excludes function were already referenced by mod.rs but the file itself was not staged in the previous commit. 
--- src/extract/exclude.rs | 60 +++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index b86752f..f228830 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -62,14 +62,30 @@ const DEFAULT_COPY_WITHOUT_RENDER: &[&str] = &[ "composer.lock", ]; -/// Detect which default exclude patterns actually exist in the project. +/// Return all default exclude patterns for use during scanning. /// -/// All DEFAULT_EXCLUDES are always included because patterns like `node_modules` -/// can appear at any depth (e.g. `docs/node_modules/`), not just the project root. -pub fn detect_excludes(_project_dir: &Path) -> Vec { +/// All DEFAULT_EXCLUDES are always used during the scan phase because patterns +/// like `node_modules` can appear at any depth (e.g. `docs/node_modules/`). +pub fn all_default_excludes() -> Vec { DEFAULT_EXCLUDES.iter().map(|s| s.to_string()).collect() } +/// Return only the DEFAULT_EXCLUDES patterns that match at least one file in the +/// template output. These are the patterns worth writing to `diecut.toml`'s +/// `[files] exclude` — directory patterns like `.git/` or `node_modules/` that +/// were filtered during scan are omitted since those files never appear in the +/// template. +pub fn relevant_config_excludes(template_files: &[std::path::PathBuf]) -> Vec { + let all = all_default_excludes(); + all.into_iter() + .filter(|pattern| { + template_files + .iter() + .any(|f| should_exclude(f, std::slice::from_ref(pattern))) + }) + .collect() +} + /// Detect which copy-without-render patterns are relevant based on files present. 
pub fn detect_copy_without_render( _project_dir: &Path, @@ -179,17 +195,43 @@ mod tests { } #[test] - fn test_detect_excludes() { - let dir = tempfile::tempdir().unwrap(); - - let found = detect_excludes(dir.path()); - // All DEFAULT_EXCLUDES are always included regardless of what exists on disk + fn test_all_default_excludes() { + let found = all_default_excludes(); + // All DEFAULT_EXCLUDES are always included assert!(found.iter().any(|e| e.contains(".git"))); assert!(found.iter().any(|e| e == ".DS_Store")); assert!(found.iter().any(|e| e == "*.pyc")); assert!(found.iter().any(|e| e.contains("node_modules"))); } + #[test] + fn test_relevant_config_excludes_empty_when_no_matches() { + // Typical template files won't match any DEFAULT_EXCLUDES + let files = vec![ + PathBuf::from("src/main.rs"), + PathBuf::from("README.md"), + PathBuf::from("Cargo.toml"), + ]; + let relevant = relevant_config_excludes(&files); + assert!(relevant.is_empty()); + } + + #[test] + fn test_relevant_config_excludes_finds_matching_patterns() { + let files = vec![ + PathBuf::from("src/main.py"), + PathBuf::from("src/__pycache__/main.pyc"), + PathBuf::from(".DS_Store"), + ]; + let relevant = relevant_config_excludes(&files); + assert!(relevant.contains(&"*.pyc".to_string())); + assert!(relevant.contains(&".DS_Store".to_string())); + assert!(relevant.contains(&"__pycache__".to_string())); + // Directory excludes that don't match should not appear + assert!(!relevant.contains(&".git".to_string())); + assert!(!relevant.contains(&"node_modules".to_string())); + } + #[test] fn test_detect_copy_without_render() { let files = vec![ From 257fea509cdf6bc88f282ddcc827daeedbe589d9 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 13:42:10 -0500 Subject: [PATCH 20/29] feat(extract): drop deep content files, add --stub-depth flag Content files deeper than N path components (default 2) are now dropped entirely instead of being stubbed. 
Shallow content files like README.md or docs/guide.md are still stubbed as before. The threshold is configurable via --stub-depth. --- src/cli.rs | 4 +++ src/commands/extract.rs | 26 ++++++++++++++++--- src/extract/mod.rs | 28 ++++++++++++++++----- src/extract/stub.rs | 55 +++++++++++++++++++++++++++++------------ src/main.rs | 12 ++++++++- tests/integration.rs | 11 +++++++++ 6 files changed, 109 insertions(+), 27 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 6c687a0..fde16cb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -75,6 +75,10 @@ pub enum Commands { #[arg(long, default_value = "0.5")] min_confidence: f64, + /// Max path depth for stubbing content files (deeper files are dropped) + #[arg(long, default_value = "2")] + stub_depth: usize, + /// Show what would be extracted without writing files #[arg(long)] dry_run: bool, diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 04c23b3..6251044 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -6,6 +6,7 @@ use diecut::error::DicecutError; use diecut::extract::{execute_extraction, plan_extraction, ExtractOptions}; use miette::Result; +#[allow(clippy::too_many_arguments)] pub fn run( source: String, vars: Vec, @@ -13,6 +14,7 @@ pub fn run( in_place: bool, yes: bool, min_confidence: f64, + stub_depth: usize, dry_run: bool, ) -> Result<()> { let variables = parse_vars(&vars)?; @@ -24,6 +26,7 @@ pub fn run( in_place, yes, min_confidence, + stub_depth, dry_run, }; @@ -64,11 +67,12 @@ fn print_dry_run(plan: &diecut::extract::ExtractionPlan) { ); let templated: Vec<_> = plan.files.iter().filter(|f| f.has_replacements()).collect(); - let copied: Vec<_> = plan + let boilerplate: Vec<_> = plan .files .iter() - .filter(|f| !f.has_replacements()) + .filter(|f| !f.has_replacements() && !f.stubbed) .collect(); + let stubbed: Vec<_> = plan.files.iter().filter(|f| f.stubbed).collect(); eprintln!("\nTemplated files ({}):", templated.len()); for file in &templated { @@ -79,11 +83,25 @@ fn 
print_dry_run(plan: &diecut::extract::ExtractionPlan) { ); } - eprintln!("\nCopied verbatim ({}):", copied.len()); - for file in &copied { + eprintln!("\nBoilerplate ({}):", boilerplate.len()); + for file in &boilerplate { eprintln!(" {}", file.template_path.display()); } + if !stubbed.is_empty() { + eprintln!("\nStubbed ({}):", stubbed.len()); + for file in &stubbed { + eprintln!(" {}", file.template_path.display()); + } + } + + if plan.dropped_count > 0 { + eprintln!("\nDropped ({}):", plan.dropped_count); + for path in &plan.dropped_paths { + eprintln!(" {}", path.display()); + } + } + eprintln!("\nVariables:"); for var in &plan.variables { eprintln!(" {} = {:?}", var.name, var.value); diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 9f0c8e4..c9a6e34 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -96,6 +96,8 @@ pub struct ExtractionPlan { pub conditional_entries: Vec, pub exclude_patterns: Vec, pub copy_without_render: Vec, + pub dropped_count: usize, + pub dropped_paths: Vec, } /// Options for the extraction process. 
@@ -106,6 +108,7 @@ pub struct ExtractOptions { pub in_place: bool, pub yes: bool, pub min_confidence: f64, + pub stub_depth: usize, pub dry_run: bool, } @@ -285,6 +288,8 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { // Phase 9: Apply replacements to files let mut planned_files = Vec::new(); + let mut dropped_count = 0; + let mut dropped_paths = Vec::new(); for file in &scan_result.files { let template_path = apply_path_replacements(&file.relative_path, &rules); @@ -316,8 +321,8 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { stubbed: false, }); } else { - // No replacements — classify as boilerplate or content - match classify_file(&file.relative_path) { + // No replacements — classify as boilerplate, content, or dropped + match classify_file(&file.relative_path, options.stub_depth) { FileRole::Boilerplate => { planned_files.push(PlannedExtractFile { template_path, @@ -339,6 +344,10 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { stubbed: true, }); } + FileRole::Dropped => { + dropped_count += 1; + dropped_paths.push(file.relative_path.clone()); + } } } } @@ -358,7 +367,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { // Phase 10: Interactive file confirmation if !options.yes { - confirm_files_interactive(&planned_files)?; + confirm_files_interactive(&planned_files, dropped_count)?; } // Phase 11: Build conditional entries @@ -437,6 +446,8 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { conditional_entries, exclude_patterns: config_excludes, copy_without_render, + dropped_count, + dropped_paths, }) } @@ -537,8 +548,8 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> computed_count ); eprintln!( - " {} files templated, {} files copied, {} files stubbed", - rendered_count, copied_count, stubbed_count + " {} files templated, {} files copied, {} files stubbed, {} files dropped", + rendered_count, copied_count, stubbed_count, plan.dropped_count ); if 
!plan.conditional_entries.is_empty() { eprintln!( @@ -876,7 +887,7 @@ fn confirm_auto_detected_interactive( Ok(accepted) } -fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { +fn confirm_files_interactive(files: &[PlannedExtractFile], dropped_count: usize) -> Result<()> { let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); let boilerplate: Vec<_> = files .iter() @@ -933,6 +944,11 @@ fn confirm_files_interactive(files: &[PlannedExtractFile]) -> Result<()> { } } + // Dropped files + if dropped_count > 0 { + eprintln!("\n {} ({} files):", style("Dropped").bold(), dropped_count); + } + let proceed = Confirm::new("Proceed?") .with_default(true) .prompt() diff --git a/src/extract/stub.rs b/src/extract/stub.rs index cf19e19..8c6ce47 100644 --- a/src/extract/stub.rs +++ b/src/extract/stub.rs @@ -1,12 +1,14 @@ use std::path::Path; -/// Whether a file is boilerplate (copy in full) or content (stub to placeholder). +/// Whether a file is boilerplate (copy in full), content (stub), or too deep (drop). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum FileRole { /// Config, dotfiles, CI — copy verbatim into the template. Boilerplate, /// Prose, docs, source — stub to minimal placeholder. Content, + /// Content deeper than stub_depth — drop entirely. + Dropped, } /// Filenames (case-insensitive) that are always boilerplate. @@ -35,10 +37,11 @@ const BOILERPLATE_EXTENSIONS: &[&str] = &[ /// Directory prefixes — files under these dirs are boilerplate. const BOILERPLATE_DIR_PREFIXES: &[&str] = &[".github/", ".gitlab/", ".circleci/", ".vscode/"]; -/// Classify a file as boilerplate or content based on its relative path. +/// Classify a file as boilerplate, content, or dropped based on its relative path. /// /// Only called for text files with 0 template replacements. -pub fn classify_file(path: &Path) -> FileRole { +/// Files deeper than `stub_depth` path components are dropped entirely. 
+pub fn classify_file(path: &Path, stub_depth: usize) -> FileRole { let path_str = path.to_string_lossy(); // Check directory prefix @@ -64,7 +67,12 @@ pub fn classify_file(path: &Path) -> FileRole { } } - FileRole::Content + let depth = path.components().count(); + if depth > stub_depth { + FileRole::Dropped + } else { + FileRole::Content + } } /// Generate a minimal stub for a content file. @@ -125,7 +133,7 @@ mod tests { #[case("LICENSE", FileRole::Boilerplate)] #[case("Procfile", FileRole::Boilerplate)] fn classify_boilerplate_filenames(#[case] filename: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(filename)), expected); + assert_eq!(classify_file(Path::new(filename), 2), expected); } #[rstest] @@ -139,7 +147,7 @@ mod tests { #[case("deploy.ps1", FileRole::Boilerplate)] #[case("app.conf", FileRole::Boilerplate)] fn classify_boilerplate_extensions(#[case] filename: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(filename)), expected); + assert_eq!(classify_file(Path::new(filename), 2), expected); } #[rstest] @@ -149,19 +157,34 @@ mod tests { #[case(".circleci/config.yml", FileRole::Boilerplate)] #[case(".vscode/settings.json", FileRole::Boilerplate)] fn classify_boilerplate_directories(#[case] path: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(path)), expected); + assert_eq!(classify_file(Path::new(path), 2), expected); } #[rstest] - #[case("README.md")] - #[case("docs/guide.md")] - #[case("src/main.rs")] - #[case("src/lib.py")] - #[case("index.html")] - #[case("app.css")] - #[case("skills/convention-mining/SKILL.md")] - fn classify_content(#[case] path: &str) { - assert_eq!(classify_file(Path::new(path)), FileRole::Content); + #[case("README.md", 2)] + #[case("docs/guide.md", 2)] + #[case("src/main.rs", 2)] + #[case("src/lib.py", 2)] + #[case("index.html", 2)] + #[case("app.css", 2)] + #[case("skills/convention-mining/SKILL.md", 3)] // depth 3, stub_depth 3 → Content + fn 
classify_content(#[case] path: &str, #[case] stub_depth: usize) { + assert_eq!( + classify_file(Path::new(path), stub_depth), + FileRole::Content + ); + } + + #[rstest] + #[case("skills/convention-mining/SKILL.md", 2)] // depth 3 > stub_depth 2 + #[case("skills/writing-skills/craft.md", 2)] // depth 3 > stub_depth 2 + #[case("a/b/c/deep.md", 2)] // depth 4 > stub_depth 2 + #[case("docs/guide.md", 1)] // depth 2 > stub_depth 1 + fn classify_dropped(#[case] path: &str, #[case] stub_depth: usize) { + assert_eq!( + classify_file(Path::new(path), stub_depth), + FileRole::Dropped + ); } // ── generate_stub ──────────────────────────────────────────────── diff --git a/src/main.rs b/src/main.rs index 4999bb2..11dec94 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,7 +26,17 @@ fn main() -> miette::Result<()> { in_place, yes, min_confidence, + stub_depth, dry_run, - } => commands::extract::run(source, vars, output, in_place, yes, min_confidence, dry_run), + } => commands::extract::run( + source, + vars, + output, + in_place, + yes, + min_confidence, + stub_depth, + dry_run, + ), } } diff --git a/tests/integration.rs b/tests/integration.rs index 243935c..bee61fc 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -656,6 +656,7 @@ fn test_extract_batch_basic() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -700,6 +701,7 @@ fn test_extract_detects_case_variants() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -754,6 +756,7 @@ fn test_extract_dry_run_writes_nothing() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: true, }; @@ -783,6 +786,7 @@ fn test_extract_rejects_already_template() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -804,6 +808,7 @@ fn test_extract_rejects_no_variables() { in_place: false, yes: true, min_confidence: 1.0, + stub_depth: 2, dry_run: false, }; @@ -827,6 +832,7 @@ fn 
test_extract_templates_path_components() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -890,6 +896,7 @@ fn test_extract_round_trip() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -959,6 +966,7 @@ fn test_extract_auto_yes() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -1000,6 +1008,7 @@ fn test_extract_auto_explicit_vars_merged() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -1042,6 +1051,7 @@ fn test_extract_auto_frequency_fallback() { in_place: false, yes: true, min_confidence: 0.5, + stub_depth: 2, dry_run: false, }; @@ -1085,6 +1095,7 @@ fn test_extract_min_confidence_filters() { in_place: false, yes: true, min_confidence: 0.99, + stub_depth: 2, dry_run: true, }; From b0d69da6d716ea37495b8c2b6b9743124daed4a1 Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 14:15:25 -0500 Subject: [PATCH 21/29] refactor: autodetect --- src/extract/auto_detect.rs | 103 +++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index 0777fb9..ae0fc7b 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -230,6 +230,25 @@ fn parse_cargo_toml( }); } + if let Some(version) = parsed + .get("package") + .and_then(|p| p.get("version")) + .and_then(|v| v.as_str()) + { + if !version.is_empty() { + let (file_count, total_occurrences) = count_occurrences(version, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "version".to_string(), + value: version.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "Cargo.toml [package].version".to_string(), + file_count, + total_occurrences, + }); + } + } + if let Some(authors) = parsed .get("package") .and_then(|p| p.get("authors")) @@ -280,6 +299,21 @@ fn parse_package_json( }); } + if let 
Some(version) = parsed.get("version").and_then(|v| v.as_str()) { + if !version.is_empty() { + let (file_count, total_occurrences) = count_occurrences(version, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "version".to_string(), + value: version.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "package.json \"version\"".to_string(), + file_count, + total_occurrences, + }); + } + } + if let Some(author) = parsed.get("author") { let author_str = match author { serde_json::Value::String(s) => Some(strip_email(s)), @@ -334,6 +368,25 @@ fn parse_pyproject_toml( }); } + if let Some(version) = parsed + .get("project") + .and_then(|p| p.get("version")) + .and_then(|v| v.as_str()) + { + if !version.is_empty() { + let (file_count, total_occurrences) = count_occurrences(version, scan_result); + candidates.push(DetectedCandidate { + suggested_name: "version".to_string(), + value: version.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence: 0.85, + reason: "pyproject.toml [project].version".to_string(), + file_count, + total_occurrences, + }); + } + } + if let Some(authors) = parsed .get("project") .and_then(|p| p.get("authors")) @@ -1287,7 +1340,7 @@ mod tests { let dir = tempfile::tempdir().unwrap(); std::fs::write( dir.path().join("Cargo.toml"), - "[package]\nname = \"data-pipeline\"\nauthors = [\"Alice \"]\n", + "[package]\nname = \"data-pipeline\"\nversion = \"0.3.1\"\nauthors = [\"Alice \"]\n", ) .unwrap(); @@ -1295,6 +1348,9 @@ mod tests { let candidates = parse_cargo_toml(dir.path(), &scan).unwrap(); assert!(candidates.iter().any(|c| c.value == "data-pipeline")); + assert!(candidates + .iter() + .any(|c| c.value == "0.3.1" && c.suggested_name == "version" && c.confidence == 0.85)); assert!(candidates.iter().any(|c| c.value == "Alice")); } @@ -1303,7 +1359,7 @@ mod tests { let dir = tempfile::tempdir().unwrap(); std::fs::write( dir.path().join("package.json"), - r#"{"name": "@myorg/cool-widget", "author": 
"Bob Smith "}"#, + r#"{"name": "@myorg/cool-widget", "version": "2.1.0", "author": "Bob Smith "}"#, ) .unwrap(); @@ -1316,6 +1372,13 @@ mod tests { .unwrap(); assert_eq!(name_candidate.value, "cool-widget"); + let version_candidate = candidates + .iter() + .find(|c| c.suggested_name == "version") + .unwrap(); + assert_eq!(version_candidate.value, "2.1.0"); + assert_eq!(version_candidate.confidence, 0.85); + let author_candidate = candidates .iter() .find(|c| c.suggested_name == "author") @@ -1328,7 +1391,7 @@ mod tests { let dir = tempfile::tempdir().unwrap(); std::fs::write( dir.path().join("pyproject.toml"), - "[project]\nname = \"my-tool\"\n\n[[project.authors]]\nname = \"Charlie\"\n", + "[project]\nname = \"my-tool\"\nversion = \"1.0.0\"\n\n[[project.authors]]\nname = \"Charlie\"\n", ) .unwrap(); @@ -1336,6 +1399,9 @@ mod tests { let candidates = parse_pyproject_toml(dir.path(), &scan).unwrap(); assert!(candidates.iter().any(|c| c.value == "my-tool")); + assert!(candidates + .iter() + .any(|c| c.value == "1.0.0" && c.suggested_name == "version" && c.confidence == 0.85)); assert!(candidates.iter().any(|c| c.value == "Charlie")); } @@ -1385,6 +1451,37 @@ mod tests { assert!(parse_cargo_toml(dir.path(), &scan).is_none()); } + #[test] + fn test_tier2_version_missing() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("Cargo.toml"), + "[package]\nname = \"no-version-crate\"\n", + ) + .unwrap(); + std::fs::write( + dir.path().join("package.json"), + r#"{"name": "no-version-pkg"}"#, + ) + .unwrap(); + std::fs::write( + dir.path().join("pyproject.toml"), + "[project]\nname = \"no-version-py\"\n", + ) + .unwrap(); + + let scan = make_scan_result(vec![]); + + let cargo = parse_cargo_toml(dir.path(), &scan).unwrap(); + assert!(!cargo.iter().any(|c| c.suggested_name == "version")); + + let pkg = parse_package_json(dir.path(), &scan).unwrap(); + assert!(!pkg.iter().any(|c| c.suggested_name == "version")); + + let pyproj = 
parse_pyproject_toml(dir.path(), &scan).unwrap(); + assert!(!pyproj.iter().any(|c| c.suggested_name == "version")); + } + // ── Tier 3 tests ───────────────────────────────────────────────── #[test] From 5657b71f4e001cefbe05a1db1d645f9c4d40e8eb Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 14:40:48 -0500 Subject: [PATCH 22/29] refactor(extract): simplify auto-detect and extract interactive UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract 6 interactive UI functions from mod.rs into interactive.rs - Deduplicate config parsers with push_config_candidate helper - Replace Tier 4 frequency analysis (~770 lines of noise-filter lists) with a ~60-line multi-variant heuristic requiring ≥2 case forms - Remove strsim dependency (no longer needed) --- Cargo.lock | 1 - Cargo.toml | 1 - src/extract/auto_detect.rs | 1002 +++++++----------------------------- src/extract/interactive.rs | 411 +++++++++++++++ src/extract/mod.rs | 412 +-------------- 5 files changed, 597 insertions(+), 1230 deletions(-) create mode 100644 src/extract/interactive.rs diff --git a/Cargo.lock b/Cargo.lock index 113ae0e..ce434bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -462,7 +462,6 @@ dependencies = [ "serde", "serde_json", "sha2", - "strsim", "tempfile", "tera", "thiserror", diff --git a/Cargo.toml b/Cargo.toml index ecca649..09bd987 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,6 @@ sha2 = "0.10" fs4 = "0.12" content_inspector = "0.2" indexmap = { version = "2.11.4", features = ["serde"] } -strsim = "0.11" [dev-dependencies] rstest = "0.23" diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs index ae0fc7b..193bbac 100644 --- a/src/extract/auto_detect.rs +++ b/src/extract/auto_detect.rs @@ -55,15 +55,6 @@ pub struct AutoDetectResult { pub candidates: Vec, } -struct TokenCluster { - normalized: Vec, - literals: Vec, - total_occurrences: usize, - file_count: usize, - matches_dir_name: bool, - in_config_value: 
bool, -} - // ── Entry point ────────────────────────────────────────────────────────── /// Run all 4 auto-detection tiers against a scanned project. @@ -83,25 +74,8 @@ pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectRe let covered_values: HashSet = candidates.iter().map(|c| c.value.to_lowercase()).collect(); - // Collect config values for frequency analysis boosting - let config_values: HashSet = candidates - .iter() - .filter(|c| c.tier == ConfidenceTier::ConfigFile) - .map(|c| c.value.to_lowercase()) - .collect(); - - let dir_name = project_dir - .file_name() - .map(|n| n.to_string_lossy().to_lowercase()) - .unwrap_or_default(); - // Tier 4: Frequency analysis - candidates.extend(detect_frequency( - scan_result, - &covered_values, - &config_values, - &dir_name, - )); + candidates.extend(detect_frequency(scan_result, &covered_values)); // Deduplicate by normalized word list, keeping highest confidence deduplicate_candidates(&mut candidates); @@ -203,6 +177,26 @@ fn detect_config_files(project_dir: &Path, scan_result: &ScanResult) -> Vec, + value: &str, + suggested_name: &str, + confidence: f64, + reason: &str, + scan_result: &ScanResult, +) { + let (file_count, total_occurrences) = count_occurrences(value, scan_result); + candidates.push(DetectedCandidate { + suggested_name: suggested_name.to_string(), + value: value.to_string(), + tier: ConfidenceTier::ConfigFile, + confidence, + reason: reason.to_string(), + file_count, + total_occurrences, + }); +} + fn parse_cargo_toml( project_dir: &Path, scan_result: &ScanResult, @@ -218,16 +212,14 @@ fn parse_cargo_toml( .and_then(|p| p.get("name")) .and_then(|n| n.as_str()) { - let (file_count, total_occurrences) = count_occurrences(name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "project_name".to_string(), - value: name.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.90, - reason: "Cargo.toml [package].name".to_string(), - file_count, - 
total_occurrences, - }); + push_config_candidate( + &mut candidates, + name, + "project_name", + 0.90, + "Cargo.toml [package].name", + scan_result, + ); } if let Some(version) = parsed @@ -236,16 +228,14 @@ fn parse_cargo_toml( .and_then(|v| v.as_str()) { if !version.is_empty() { - let (file_count, total_occurrences) = count_occurrences(version, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "version".to_string(), - value: version.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "Cargo.toml [package].version".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + version, + "version", + 0.85, + "Cargo.toml [package].version", + scan_result, + ); } } @@ -257,16 +247,14 @@ fn parse_cargo_toml( if let Some(first) = authors.first().and_then(|a| a.as_str()) { let author = strip_email(first); if !author.is_empty() { - let (file_count, total_occurrences) = count_occurrences(&author, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "author".to_string(), - value: author.clone(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "Cargo.toml [package].authors[0]".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + &author, + "author", + 0.85, + "Cargo.toml [package].authors[0]", + scan_result, + ); } } } @@ -285,32 +273,27 @@ fn parse_package_json( let mut candidates = Vec::new(); if let Some(name) = parsed.get("name").and_then(|n| n.as_str()) { - // Strip npm scope @org/ let clean_name = strip_npm_scope(name); - let (file_count, total_occurrences) = count_occurrences(clean_name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "project_name".to_string(), - value: clean_name.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.90, - reason: "package.json \"name\"".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + 
clean_name, + "project_name", + 0.90, + "package.json \"name\"", + scan_result, + ); } if let Some(version) = parsed.get("version").and_then(|v| v.as_str()) { if !version.is_empty() { - let (file_count, total_occurrences) = count_occurrences(version, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "version".to_string(), - value: version.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "package.json \"version\"".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + version, + "version", + 0.85, + "package.json \"version\"", + scan_result, + ); } } @@ -324,16 +307,14 @@ fn parse_package_json( }; if let Some(author_name) = author_str { if !author_name.is_empty() { - let (file_count, total_occurrences) = count_occurrences(&author_name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "author".to_string(), - value: author_name, - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "package.json \"author\"".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + &author_name, + "author", + 0.85, + "package.json \"author\"", + scan_result, + ); } } } @@ -356,16 +337,14 @@ fn parse_pyproject_toml( .and_then(|p| p.get("name")) .and_then(|n| n.as_str()) { - let (file_count, total_occurrences) = count_occurrences(name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "project_name".to_string(), - value: name.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.90, - reason: "pyproject.toml [project].name".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + name, + "project_name", + 0.90, + "pyproject.toml [project].name", + scan_result, + ); } if let Some(version) = parsed @@ -374,16 +353,14 @@ fn parse_pyproject_toml( .and_then(|v| v.as_str()) { if !version.is_empty() { - let (file_count, total_occurrences) = 
count_occurrences(version, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "version".to_string(), - value: version.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "pyproject.toml [project].version".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + version, + "version", + 0.85, + "pyproject.toml [project].version", + scan_result, + ); } } @@ -400,16 +377,14 @@ fn parse_pyproject_toml( .map(strip_email); if let Some(name) = author_name { if !name.is_empty() { - let (file_count, total_occurrences) = count_occurrences(&name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "author".to_string(), - value: name, - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "pyproject.toml [project].authors[0].name".to_string(), - file_count, - total_occurrences, - }); + push_config_candidate( + &mut candidates, + &name, + "author", + 0.85, + "pyproject.toml [project].authors[0].name", + scan_result, + ); } } } @@ -434,32 +409,29 @@ fn parse_go_mod(project_dir: &Path, scan_result: &ScanResult) -> Option= 3 { let org = segments[segments.len() - 2]; if !org.is_empty() && org != name { - let (org_file_count, org_total_occurrences) = count_occurrences(org, scan_result); + let (_, org_total_occurrences) = count_occurrences(org, scan_result); if org_total_occurrences > 0 { - candidates.push(DetectedCandidate { - suggested_name: "org_name".to_string(), - value: org.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: format!("go.mod module org \"{}\"", org), - file_count: org_file_count, - total_occurrences: org_total_occurrences, - }); + push_config_candidate( + &mut candidates, + org, + "org_name", + 0.85, + &format!("go.mod module org \"{}\"", org), + scan_result, + ); } } } @@ -559,8 +531,6 @@ fn parse_org_from_url(url: &str) -> Option { fn detect_frequency( scan_result: &ScanResult, covered_values: &HashSet, - 
config_values: &HashSet, - dir_name: &str, ) -> Vec { // Tokenize all text file content let mut token_file_map: HashMap> = HashMap::new(); @@ -579,64 +549,58 @@ fn detect_frequency( } } - // Build clusters by normalized word list - let mut clusters: HashMap = HashMap::new(); + // Group tokens by normalized word list to find multi-variant clusters + struct Cluster { + literals: Vec, + total_occurrences: usize, + files: HashSet, + } + + let mut clusters: HashMap = HashMap::new(); for (token, count) in &token_counts { let words = split_into_words(token); - - // Filter noise - if words.iter().all(|w| w.len() < 3) { - continue; - } - if is_noise_token(token, &words) { - continue; - } - let normalized_key = words.join(" "); - let file_count = token_file_map.get(token).map(|s| s.len()).unwrap_or(0); - - // Skip single-occurrence-single-file tokens - if *count == 1 && file_count <= 1 { + // Token must be at least 4 chars + if token.len() < 4 { continue; } - let matches_dir = - normalized_key == split_into_words(dir_name).join(" ") && !dir_name.is_empty(); - let in_config = config_values.contains(&token.to_lowercase()); - - let cluster = clusters - .entry(normalized_key.clone()) - .or_insert_with(|| TokenCluster { - normalized: words.clone(), - literals: Vec::new(), - total_occurrences: 0, - file_count: 0, - matches_dir_name: false, - in_config_value: false, - }); + let cluster = clusters.entry(normalized_key).or_insert_with(|| Cluster { + literals: Vec::new(), + total_occurrences: 0, + files: HashSet::new(), + }); if !cluster.literals.contains(token) { cluster.literals.push(token.clone()); } cluster.total_occurrences += count; - // Merge file sets for accurate file_count - let files_for_token = token_file_map.get(token).map(|s| s.len()).unwrap_or(0); - if files_for_token > cluster.file_count { - cluster.file_count = files_for_token; + if let Some(file_set) = token_file_map.get(token) { + cluster.files.extend(file_set); } - cluster.matches_dir_name = 
cluster.matches_dir_name || matches_dir; - cluster.in_config_value = cluster.in_config_value || in_config; } - // Merge near-misses using Levenshtein distance - merge_similar_clusters(&mut clusters); - - // Score and convert to candidates + // Filter and convert to candidates let mut freq_candidates: Vec = Vec::new(); - for (key, cluster) in &clusters { + for cluster in clusters.values() { + // Must have ≥2 distinct case variants (the key multi-variant heuristic) + if cluster.literals.len() < 2 { + continue; + } + + // Must have ≥3 total occurrences + if cluster.total_occurrences < 3 { + continue; + } + + // Must appear in ≥2 files + if cluster.files.len() < 2 { + continue; + } + // Skip if already covered by higher tiers if cluster .literals @@ -646,552 +610,42 @@ fn detect_frequency( continue; } - let score = score_cluster(cluster); - - // Filter low-scoring candidates - if score < 0.30 { - continue; - } - let best_literal = &cluster.literals[0]; - let suggested_name = suggest_variable_name(&cluster.normalized, key); + let words = split_into_words(best_literal); + let suggested_name = if words.len() <= 3 { + words.join("_") + } else { + words[..3].join("_") + }; + let file_count = cluster.files.len(); freq_candidates.push(DetectedCandidate { suggested_name, value: best_literal.clone(), tier: ConfidenceTier::FrequencyAnalysis, - confidence: score, + confidence: 0.60, reason: format!( "{} occurrences across {} files, {} variant(s)", cluster.total_occurrences, - cluster.file_count, + file_count, cluster.literals.len() ), - file_count: cluster.file_count, + file_count, total_occurrences: cluster.total_occurrences, }); } - // Sort by confidence, take top 5 - freq_candidates.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); + // Sort by file_count * total_occurrences descending, take top 5 + freq_candidates.sort_by(|a, b| { + let score_a = a.file_count * a.total_occurrences; + let score_b = b.file_count * b.total_occurrences; + score_b.cmp(&score_a) + }); 
freq_candidates.truncate(5); freq_candidates } -fn score_cluster(cluster: &TokenCluster) -> f64 { - // Occurrence count (log-scaled, 0.0..1.0) - let occ_score = (cluster.total_occurrences as f64).ln_1p() / 10.0_f64.ln_1p(); - let occ_score = occ_score.min(1.0); - - // File spread (log-scaled, 0.0..1.0) - let file_score = (cluster.file_count as f64).ln_1p() / 10.0_f64.ln_1p(); - let file_score = file_score.min(1.0); - - // Variant diversity - let variant_score = match cluster.literals.len() { - 0 | 1 => 0.0, - 2 => 0.5, - 3 => 0.75, - _ => 1.0, - }; - - // Directory name match (binary) - let dir_score = if cluster.matches_dir_name { 1.0 } else { 0.0 }; - - // Config value match (binary) - let config_score = if cluster.in_config_value { 1.0 } else { 0.0 }; - - 0.15 * occ_score - + 0.20 * file_score - + 0.35 * variant_score - + 0.20 * dir_score - + 0.10 * config_score -} - -fn merge_similar_clusters(clusters: &mut HashMap) { - let keys: Vec = clusters.keys().cloned().collect(); - let mut merge_map: HashMap = HashMap::new(); - - for i in 0..keys.len() { - for j in (i + 1)..keys.len() { - if merge_map.contains_key(&keys[j]) { - continue; - } - let dist = strsim::levenshtein(&keys[i], &keys[j]); - if dist <= 1 { - let size_i = clusters - .get(&keys[i]) - .map(|c| c.total_occurrences) - .unwrap_or(0); - let size_j = clusters - .get(&keys[j]) - .map(|c| c.total_occurrences) - .unwrap_or(0); - if size_i >= size_j { - merge_map.insert(keys[j].clone(), keys[i].clone()); - } else { - merge_map.insert(keys[i].clone(), keys[j].clone()); - } - } - } - } - - // Resolve merge chains: if A→B and B→C, then A→C - // Use a visited set to guard against cycles. 
- let resolved: HashMap = merge_map - .keys() - .map(|k| { - let mut target = merge_map[k].clone(); - let mut seen = HashSet::new(); - seen.insert(k.clone()); - while let Some(next) = merge_map.get(&target) { - if !seen.insert(next.clone()) { - break; - } - target = next.clone(); - } - (k.clone(), target) - }) - .collect(); - - for (from, to) in &resolved { - if let Some(removed) = clusters.remove(from) { - if let Some(target) = clusters.get_mut(to) { - for lit in removed.literals { - if !target.literals.contains(&lit) { - target.literals.push(lit); - } - } - target.total_occurrences += removed.total_occurrences; - if removed.file_count > target.file_count { - target.file_count = removed.file_count; - } - target.matches_dir_name = target.matches_dir_name || removed.matches_dir_name; - target.in_config_value = target.in_config_value || removed.in_config_value; - } - } - } -} - -fn suggest_variable_name(words: &[String], _key: &str) -> String { - if words.len() <= 3 { - words.join("_") - } else { - // Truncate long names - words[..3].join("_") - } -} - -// ── Noise filtering ────────────────────────────────────────────────────── - -fn is_noise_token(token: &str, words: &[String]) -> bool { - let lower = token.to_lowercase(); - - // Too short - if lower.len() < 3 { - return true; - } - - // Language keywords - if LANGUAGE_KEYWORDS.contains(&lower.as_str()) { - return true; - } - - // Common library names - if COMMON_LIBRARIES.contains(&lower.as_str()) { - return true; - } - - // Stopwords (individual words) - if words.len() == 1 && STOPWORDS.contains(&lower.as_str()) { - return true; - } - - // All words are stopwords, file-format words, or very short - if words.iter().all(|w| { - w.len() < 3 || STOPWORDS.contains(&w.as_str()) || FILE_FORMAT_WORDS.contains(&w.as_str()) - }) { - return true; - } - - false -} - -const FILE_FORMAT_WORDS: &[&str] = &[ - "toml", "json", "yaml", "yml", "xml", "csv", "html", "css", "md", "txt", "log", "cfg", "ini", - "env", "lock", "mod", 
"rs", "js", "ts", "py", "go", "rb", "java", "kt", "swift", "cpp", "hpp", - "vue", "jsx", "tsx", -]; - -const LANGUAGE_KEYWORDS: &[&str] = &[ - // Rust - "async", - "await", - "break", - "const", - "continue", - "crate", - "dyn", - "else", - "enum", - "extern", - "false", - "fn", - "for", - "if", - "impl", - "in", - "let", - "loop", - "match", - "mod", - "move", - "mut", - "pub", - "ref", - "return", - "self", - "static", - "struct", - "super", - "trait", - "true", - "type", - "unsafe", - "use", - "where", - "while", - "yield", - // JS/TS - "abstract", - "arguments", - "boolean", - "byte", - "case", - "catch", - "char", - "class", - "debugger", - "default", - "delete", - "do", - "double", - "eval", - "export", - "extends", - "final", - "finally", - "float", - "function", - "goto", - "implements", - "import", - "instanceof", - "int", - "interface", - "long", - "native", - "new", - "null", - "package", - "private", - "protected", - "public", - "short", - "switch", - "synchronized", - "this", - "throw", - "throws", - "transient", - "try", - "typeof", - "undefined", - "var", - "void", - "volatile", - "with", - // Python - "and", - "as", - "assert", - "class", - "def", - "del", - "elif", - "except", - "exec", - "from", - "global", - "is", - "lambda", - "nonlocal", - "not", - "or", - "pass", - "print", - "raise", - "with", - "yield", - // Go - "chan", - "defer", - "fallthrough", - "go", - "goroutine", - "interface", - "map", - "range", - "select", - "func", -]; - -const COMMON_LIBRARIES: &[&str] = &[ - "react", - "redux", - "webpack", - "babel", - "eslint", - "prettier", - "jest", - "mocha", - "chai", - "express", - "fastify", - "next", - "nuxt", - "vue", - "angular", - "svelte", - "serde", - "tokio", - "actix", - "axum", - "clap", - "anyhow", - "thiserror", - "tracing", - "reqwest", - "hyper", - "warp", - "rocket", - "diesel", - "sqlx", - "django", - "flask", - "fastapi", - "pytest", - "numpy", - "pandas", - "scipy", - "spring", - "hibernate", - "junit", - "maven", - 
"gradle", - "gin", - "echo", - "fiber", - "gorm", - "lodash", - "axios", - "moment", - "dayjs", - "ramda", - "underscore", - "tailwind", - "bootstrap", - "material", - "typescript", - "javascript", - "python", - "golang", - "rustlang", -]; - -const STOPWORDS: &[&str] = &[ - // English stopwords - "the", - "and", - "for", - "are", - "but", - "not", - "you", - "all", - "can", - "had", - "her", - "was", - "one", - "our", - "out", - "get", - "set", - "has", - "his", - "how", - "its", - "let", - "may", - "new", - "now", - "old", - "see", - "way", - "who", - "did", - "got", - "has", - "him", - "into", - "just", - "like", - "make", - "many", - "some", - "than", - "them", - "then", - "very", - "when", - "with", - "have", - "from", - "been", - "also", - "each", - "that", - "this", - "will", - "your", - "what", - "which", - "their", - "about", - "would", - "there", - "could", - "other", - "after", - "first", - "these", - "those", - "being", - "where", - "should", - "because", - // Short generic words common in code identifiers - "my", - "no", - "is", - "on", - "in", - "to", - "by", - "do", - "up", - "so", - "or", - "app", - "run", - "dry", - "log", - "cmd", - "arg", - "env", - "dir", - "key", - "map", - "max", - "min", - "raw", - "ref", - "src", - "str", - "tmp", - "url", - "var", - "buf", - "msg", - "req", - "res", - "err", - "pkg", - "lib", - "bin", - "fmt", - "ctx", - "cfg", - "opt", - "val", - "idx", - "len", - "ptr", - "num", - "std", - "gen", - "pre", - "sub", - // Programming type/concept words - "string", - "number", - "bool", - "boolean", - "array", - "object", - "value", - "result", - "error", - "option", - "none", - "some", - "true", - "false", - "null", - "undefined", - "file", - "path", - "name", - "type", - "data", - "info", - "list", - "item", - "node", - "index", - "count", - "size", - "length", - "config", - "settings", - "options", - "input", - "output", - "source", - "target", - "test", - "main", - "init", - "setup", - "todo", - "fixme", - "hack", - 
"note", - "warning", - "debug", - "trace", - "level", - "mode", - "flag", - "status", - "state", - "cache", - "hook", - "hooks", -]; - // ── Helpers ────────────────────────────────────────────────────────────── pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize) { @@ -1527,11 +981,10 @@ mod tests { ]); let covered = HashSet::new(); - let config_vals = HashSet::new(); - let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + let candidates = detect_frequency(&scan, &covered); assert!(!candidates.is_empty()); - // Should find "data-pipeline" cluster + // Should find "data-pipeline" cluster (multi-variant) let found = candidates.iter().any(|c| { let words = split_into_words(&c.value); words == vec!["data", "pipeline"] @@ -1543,36 +996,12 @@ mod tests { ); } - #[test] - fn test_frequency_filters_keywords() { - let scan = make_scan_result(vec![ - ("a.rs", "fn async_handler() {}"), - ("b.rs", "fn async_handler() {}"), - ("c.rs", "fn async_handler() {}"), - ]); - - let covered = HashSet::new(); - let config_vals = HashSet::new(); - let candidates = detect_frequency(&scan, &covered, &config_vals, ""); - - // "async" alone should be filtered - for c in &candidates { - let lower = c.value.to_lowercase(); - assert!( - !LANGUAGE_KEYWORDS.contains(&lower.as_str()) - || c.value.contains('-') - || c.value.contains('_') - ); - } - } - #[test] fn test_frequency_filters_short_tokens() { let scan = make_scan_result(vec![("a.txt", "ab cd ef gh"), ("b.txt", "ab cd ef gh")]); let covered = HashSet::new(); - let config_vals = HashSet::new(); - let candidates = detect_frequency(&scan, &covered, &config_vals, ""); + let candidates = detect_frequency(&scan, &covered); assert!(candidates.is_empty(), "short tokens should be filtered"); } @@ -1587,8 +1016,7 @@ mod tests { let mut covered = HashSet::new(); covered.insert("my-widget".to_string()); - let config_vals = HashSet::new(); - let candidates = detect_frequency(&scan, &covered, &config_vals, 
""); + let candidates = detect_frequency(&scan, &covered); let has_widget = candidates .iter() @@ -1597,83 +1025,22 @@ mod tests { } #[test] - fn test_score_cluster_multi_variant_boost() { - let single_variant = TokenCluster { - normalized: vec!["my".into(), "app".into()], - literals: vec!["my-app".into()], - total_occurrences: 10, - file_count: 5, - matches_dir_name: false, - in_config_value: false, - }; - - let multi_variant = TokenCluster { - normalized: vec!["my".into(), "app".into()], - literals: vec!["my-app".into(), "my_app".into(), "MyApp".into()], - total_occurrences: 10, - file_count: 5, - matches_dir_name: false, - in_config_value: false, - }; - - assert!(score_cluster(&multi_variant) > score_cluster(&single_variant)); - } - - #[test] - fn test_score_cluster_dir_name_boost() { - let no_dir = TokenCluster { - normalized: vec!["my".into(), "app".into()], - literals: vec!["my-app".into()], - total_occurrences: 5, - file_count: 3, - matches_dir_name: false, - in_config_value: false, - }; - - let with_dir = TokenCluster { - normalized: vec!["my".into(), "app".into()], - literals: vec!["my-app".into()], - total_occurrences: 5, - file_count: 3, - matches_dir_name: true, - in_config_value: false, - }; + fn test_frequency_requires_multi_variant() { + // Single variant only — should NOT be detected even with many occurrences + let scan = make_scan_result(vec![ + ("a.txt", "async_handler async_handler async_handler"), + ("b.txt", "async_handler async_handler"), + ("c.txt", "async_handler"), + ]); - assert!(score_cluster(&with_dir) > score_cluster(&no_dir)); - } + let covered = HashSet::new(); + let candidates = detect_frequency(&scan, &covered); - #[test] - fn test_levenshtein_merging() { - let mut clusters = HashMap::new(); - clusters.insert( - "data pipeline".to_string(), - TokenCluster { - normalized: vec!["data".into(), "pipeline".into()], - literals: vec!["data-pipeline".into()], - total_occurrences: 10, - file_count: 5, - matches_dir_name: false, - 
in_config_value: false, - }, - ); - clusters.insert( - "data pipelin".to_string(), // typo / near miss - TokenCluster { - normalized: vec!["data".into(), "pipelin".into()], - literals: vec!["data-pipelin".into()], - total_occurrences: 2, - file_count: 1, - matches_dir_name: false, - in_config_value: false, - }, + assert!( + candidates.is_empty(), + "single-variant tokens should be filtered, got: {:?}", + candidates ); - - merge_similar_clusters(&mut clusters); - - // Should merge into one cluster - assert_eq!(clusters.len(), 1); - let remaining = clusters.values().next().unwrap(); - assert_eq!(remaining.total_occurrences, 12); } // ── Helper tests ───────────────────────────────────────────────── @@ -1737,21 +1104,6 @@ mod tests { ); } - #[test] - fn test_suggest_variable_name() { - assert_eq!( - suggest_variable_name(&["my".into(), "app".into()], "my app"), - "my_app" - ); - assert_eq!( - suggest_variable_name( - &["very".into(), "long".into(), "name".into(), "here".into()], - "very long name here" - ), - "very_long_name" - ); - } - #[test] fn test_strip_npm_scope() { assert_eq!(strip_npm_scope("@myorg/cool-widget"), "cool-widget"); diff --git a/src/extract/interactive.rs b/src/extract/interactive.rs new file mode 100644 index 0000000..6b6c55d --- /dev/null +++ b/src/extract/interactive.rs @@ -0,0 +1,411 @@ +use std::collections::BTreeMap; + +use console::style; +use inquire::{Confirm, Select, Text}; + +use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; +use crate::error::{DicecutError, Result}; + +use super::auto_detect::DetectedCandidate; +use super::conditional::DetectedConditional; +use super::variants::generate_variants; +use super::{ExtractVariable, PlannedExtractFile}; + +pub fn confirm_variants_interactive( + variables: Vec, +) -> Result> { + let mut confirmed = Vec::new(); + + for mut var in variables { + eprintln!( + "\n{} {} = {:?} {}", + style("──").dim(), + style(&var.name).bold(), + var.value, + style("──────────────────────────────────────").dim() 
+ ); + + if var.variants.len() == 1 && var.variants[0].name == "verbatim" { + // Simple value — just show occurrence count + let (file_count, total_hits) = var + .occurrence_counts + .first() + .map(|(_, fc, th)| (*fc, *th)) + .unwrap_or((0, 0)); + if total_hits > 0 { + eprintln!( + " Found in {} files ({} occurrences)", + file_count, total_hits + ); + } else { + eprintln!( + " {} Value not found in any file (will still be added to config)", + style("⚠").yellow() + ); + } + confirmed.push(var); + continue; + } + + // Show detected variants with counts + eprintln!(" Detected case variants:"); + let mut found_any = false; + for (i, variant) in var.variants.iter().enumerate() { + let (_, file_count, total_hits) = &var.occurrence_counts[i]; + let mark = if *total_hits > 0 { + found_any = true; + style("✓").green().to_string() + } else { + style("✗").dim().to_string() + }; + let hits_str = if *total_hits > 0 { + format!( + "{} {} across {} {}", + total_hits, + if *total_hits == 1 { "hit" } else { "hits" }, + file_count, + if *file_count == 1 { "file" } else { "files" } + ) + } else { + "not found".to_string() + }; + eprintln!( + " {} {:<16} {:<20} {}", + mark, + variant.literal, + variant.name, + style(&hits_str).dim() + ); + } + + if !found_any { + eprintln!( + " {} No occurrences found for any variant (will still be added to config)", + style("⚠").yellow() + ); + // Keep just the first variant + var.variants.truncate(1); + confirmed.push(var); + continue; + } + + let keep = Confirm::new("Keep detected variants?") + .with_default(true) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if keep { + // Remove variants with zero occurrences + let counts = var.occurrence_counts.clone(); + var.variants.retain(|v| { + counts + .iter() + .any(|(name, _, hits)| name == v.name && *hits > 0) + }); + if var.variants.is_empty() { + let all = generate_variants(&var.name, &var.value); + if let Some(first) = all.into_iter().next() { + var.variants.push(first); + } + } + 
} else { + // Keep only the canonical variant + var.variants.truncate(1); + } + + confirmed.push(var); + } + + Ok(confirmed) +} + +pub fn confirm_excludes_interactive(mut excludes: Vec) -> Result> { + eprintln!( + "\n{} Excludes {}", + style("──").dim(), + style("─────────────────────────────────────────────").dim() + ); + if excludes.is_empty() { + eprintln!(" No exclude patterns needed for this template."); + } else { + eprintln!(" Patterns matching template files:"); + for e in &excludes { + eprintln!(" {}", e); + } + } + + let extra = Text::new("Add extra exclude patterns? (comma-separated, enter to skip)") + .with_default("") + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if !extra.is_empty() { + for pattern in extra.split(',') { + let trimmed = pattern.trim().to_string(); + if !trimmed.is_empty() { + excludes.push(trimmed); + } + } + } + + Ok(excludes) +} + +pub fn confirm_conditionals_interactive( + detected: Vec, +) -> Result> { + eprintln!( + "\n{} Conditional files {}", + style("──").dim(), + style("────────────────────────────────────").dim() + ); + eprintln!(" These look optional. 
Make them conditional?"); + + let mut confirmed = Vec::new(); + for cond in detected { + let prompt = format!(" {} → {}", cond.pattern, cond.variable); + let include = Confirm::new(&prompt) + .with_default(false) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if include { + confirmed.push(cond); + } + } + + Ok(confirmed) +} + +pub fn resolve_candidates_yes( + candidates: &[DetectedCandidate], + explicit_vars: &[(String, String)], +) -> Vec<(String, String)> { + eprintln!( + "\n{} Auto-detected variables {}", + style("──").dim(), + style("──────────────────────────────────").dim() + ); + + // Group candidates by suggested_name + let mut groups: BTreeMap> = BTreeMap::new(); + for c in candidates { + groups.entry(c.suggested_name.clone()).or_default().push(c); + } + + let mut result = Vec::new(); + + for (name, mut group) in groups { + // Skip names already covered by explicit --var + if explicit_vars.iter().any(|(n, _)| n == &name) { + eprintln!( + " {} {} (explicit --var, skipping auto-detect)", + style("·").dim(), + style(&name).dim() + ); + continue; + } + + // For name collisions, pick highest confidence + group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); + let winner = group[0]; + + eprintln!( + " {} {} = {:?} ({:.0}% confidence, {})", + style("✓").green(), + style(&winner.suggested_name).bold(), + winner.value, + winner.confidence * 100.0, + winner.tier + ); + eprintln!(" {}", style(&winner.reason).dim()); + + if group.len() > 1 { + eprintln!( + " {} {} other candidates for this name (picked highest confidence)", + style("⚠").yellow(), + group.len() - 1 + ); + } + + result.push((winner.suggested_name.clone(), winner.value.clone())); + } + + result +} + +pub fn confirm_auto_detected_interactive( + candidates: Vec, + explicit_vars: &[(String, String)], +) -> Result> { + eprintln!( + "\n{} Auto-detected variables {}", + style("──").dim(), + style("──────────────────────────────────").dim() + ); + + // Group candidates by suggested_name 
+ let mut groups: BTreeMap> = BTreeMap::new(); + for c in candidates { + groups.entry(c.suggested_name.clone()).or_default().push(c); + } + + let mut accepted = Vec::new(); + + for (name, mut group) in groups { + // Skip names already covered by explicit --var + if explicit_vars.iter().any(|(n, _)| n == &name) { + eprintln!( + "\n {} {} (provided via --var, skipping)", + style("·").dim(), + style(&name).dim() + ); + continue; + } + + // Sort by confidence descending + group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); + + if group.len() == 1 { + // Single candidate — simple confirm + let candidate = &group[0]; + eprintln!( + "\n {} = {:?} ({:.0}% confidence, {})", + style(&candidate.suggested_name).bold(), + candidate.value, + candidate.confidence * 100.0, + candidate.tier + ); + eprintln!(" {}", style(&candidate.reason).dim()); + if candidate.total_occurrences > 0 { + eprintln!( + " {} occurrences across {} files", + candidate.total_occurrences, candidate.file_count + ); + } + + let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) + .with_default(true) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if accept { + accepted.push((candidate.suggested_name.clone(), candidate.value.clone())); + } + } else { + // Name collision — show selection prompt + eprintln!( + "\n {} Multiple candidates for {}:", + style("⚠").yellow(), + style(&name).bold() + ); + + let mut options: Vec = group + .iter() + .map(|c| { + format!( + "{:?} ({:.0}% confidence, {})", + c.value, + c.confidence * 100.0, + c.tier + ) + }) + .collect(); + options.push("Skip".to_string()); + + let selection = Select::new(&format!("Which value for \"{}\"?", name), options) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if selection != "Skip" { + // Find the matching candidate + if let Some(chosen) = group.iter().find(|c| { + format!( + "{:?} ({:.0}% confidence, {})", + c.value, + c.confidence * 100.0, + c.tier + ) == selection + }) { + 
accepted.push((chosen.suggested_name.clone(), chosen.value.clone())); + } + } + } + } + + Ok(accepted) +} + +pub fn confirm_files_interactive(files: &[PlannedExtractFile], dropped_count: usize) -> Result<()> { + let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); + let boilerplate: Vec<_> = files + .iter() + .filter(|f| !f.has_replacements() && !f.stubbed && !f.is_binary()) + .collect(); + let stubbed: Vec<_> = files.iter().filter(|f| f.stubbed).collect(); + let binary_count = files.iter().filter(|f| f.is_binary()).count(); + + eprintln!( + "\n{} File plan {}", + style("──").dim(), + style("──────────────────────────────────────────").dim() + ); + + // Templated files + eprintln!( + "\n {} ({} files, {} suffix):", + style("Templated").bold(), + templated.len(), + DEFAULT_TEMPLATES_SUFFIX + ); + for file in &templated { + eprintln!( + " {:<50} {} replacements", + file.template_path.display(), + file.replacement_count() + ); + } + + // Boilerplate files + eprintln!( + "\n {} (copied in full, {} files{}):", + style("Boilerplate").bold(), + boilerplate.len() + binary_count, + if binary_count > 0 { + format!(", {} binary", binary_count) + } else { + String::new() + } + ); + for file in &boilerplate { + eprintln!(" {}", file.template_path.display()); + } + + // Stubbed files + if !stubbed.is_empty() { + eprintln!( + "\n {} (structure only, {} files):", + style("Stubbed").bold(), + stubbed.len() + ); + for file in &stubbed { + eprintln!(" {}", file.template_path.display()); + } + } + + // Dropped files + if dropped_count > 0 { + eprintln!("\n {} ({} files):", style("Dropped").bold(), dropped_count); + } + + let proceed = Confirm::new("Proceed?") + .with_default(true) + .prompt() + .map_err(|_| DicecutError::PromptCancelled)?; + + if !proceed { + return Err(DicecutError::PromptCancelled); + } + + Ok(()) +} diff --git a/src/extract/mod.rs b/src/extract/mod.rs index c9a6e34..a3b524d 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ 
-2,26 +2,31 @@ pub mod auto_detect; pub mod conditional; pub mod config_gen; pub mod exclude; +pub mod interactive; pub mod replace; pub mod scan; pub mod stub; pub mod variants; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::path::{Path, PathBuf}; use console::style; -use inquire::{Confirm, Select, Text}; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use self::auto_detect::{auto_detect, count_occurrences, DetectedCandidate}; -use self::conditional::{detect_conditional_files, patterns_for_variable, DetectedConditional}; +use self::auto_detect::{auto_detect, count_occurrences}; +use self::conditional::{detect_conditional_files, patterns_for_variable}; use self::config_gen::{ generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, }; use self::exclude::{all_default_excludes, detect_copy_without_render, relevant_config_excludes}; +use self::interactive::{ + confirm_auto_detected_interactive, confirm_conditionals_interactive, + confirm_excludes_interactive, confirm_files_interactive, confirm_variants_interactive, + resolve_candidates_yes, +}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; @@ -561,402 +566,3 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> Ok(()) } - -// ── Interactive helpers ────────────────────────────────────────────────── - -fn confirm_variants_interactive(variables: Vec) -> Result> { - let mut confirmed = Vec::new(); - - for mut var in variables { - eprintln!( - "\n{} {} = {:?} {}", - style("──").dim(), - style(&var.name).bold(), - var.value, - style("──────────────────────────────────────").dim() - ); - - if var.variants.len() == 1 && var.variants[0].name == "verbatim" { - // Simple value — just show occurrence count - let (file_count, total_hits) = var - .occurrence_counts - .first() - .map(|(_, fc, th)| (*fc, *th)) - 
.unwrap_or((0, 0)); - if total_hits > 0 { - eprintln!( - " Found in {} files ({} occurrences)", - file_count, total_hits - ); - } else { - eprintln!( - " {} Value not found in any file (will still be added to config)", - style("⚠").yellow() - ); - } - confirmed.push(var); - continue; - } - - // Show detected variants with counts - eprintln!(" Detected case variants:"); - let mut found_any = false; - for (i, variant) in var.variants.iter().enumerate() { - let (_, file_count, total_hits) = &var.occurrence_counts[i]; - let mark = if *total_hits > 0 { - found_any = true; - style("✓").green().to_string() - } else { - style("✗").dim().to_string() - }; - let hits_str = if *total_hits > 0 { - format!( - "{} {} across {} {}", - total_hits, - if *total_hits == 1 { "hit" } else { "hits" }, - file_count, - if *file_count == 1 { "file" } else { "files" } - ) - } else { - "not found".to_string() - }; - eprintln!( - " {} {:<16} {:<20} {}", - mark, - variant.literal, - variant.name, - style(&hits_str).dim() - ); - } - - if !found_any { - eprintln!( - " {} No occurrences found for any variant (will still be added to config)", - style("⚠").yellow() - ); - // Keep just the first variant - var.variants.truncate(1); - confirmed.push(var); - continue; - } - - let keep = Confirm::new("Keep detected variants?") - .with_default(true) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if keep { - // Remove variants with zero occurrences - let counts = var.occurrence_counts.clone(); - var.variants.retain(|v| { - counts - .iter() - .any(|(name, _, hits)| name == v.name && *hits > 0) - }); - if var.variants.is_empty() { - let all = generate_variants(&var.name, &var.value); - if let Some(first) = all.into_iter().next() { - var.variants.push(first); - } - } - } else { - // Keep only the canonical variant - var.variants.truncate(1); - } - - confirmed.push(var); - } - - Ok(confirmed) -} - -fn confirm_excludes_interactive(mut excludes: Vec) -> Result> { - eprintln!( - "\n{} Excludes 
{}", - style("──").dim(), - style("─────────────────────────────────────────────").dim() - ); - if excludes.is_empty() { - eprintln!(" No exclude patterns needed for this template."); - } else { - eprintln!(" Patterns matching template files:"); - for e in &excludes { - eprintln!(" {}", e); - } - } - - let extra = Text::new("Add extra exclude patterns? (comma-separated, enter to skip)") - .with_default("") - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if !extra.is_empty() { - for pattern in extra.split(',') { - let trimmed = pattern.trim().to_string(); - if !trimmed.is_empty() { - excludes.push(trimmed); - } - } - } - - Ok(excludes) -} - -fn confirm_conditionals_interactive( - detected: Vec, -) -> Result> { - eprintln!( - "\n{} Conditional files {}", - style("──").dim(), - style("────────────────────────────────────").dim() - ); - eprintln!(" These look optional. Make them conditional?"); - - let mut confirmed = Vec::new(); - for cond in detected { - let prompt = format!(" {} → {}", cond.pattern, cond.variable); - let include = Confirm::new(&prompt) - .with_default(false) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if include { - confirmed.push(cond); - } - } - - Ok(confirmed) -} - -fn resolve_candidates_yes( - candidates: &[DetectedCandidate], - explicit_vars: &[(String, String)], -) -> Vec<(String, String)> { - eprintln!( - "\n{} Auto-detected variables {}", - style("──").dim(), - style("──────────────────────────────────").dim() - ); - - // Group candidates by suggested_name - let mut groups: BTreeMap> = BTreeMap::new(); - for c in candidates { - groups.entry(c.suggested_name.clone()).or_default().push(c); - } - - let mut result = Vec::new(); - - for (name, mut group) in groups { - // Skip names already covered by explicit --var - if explicit_vars.iter().any(|(n, _)| n == &name) { - eprintln!( - " {} {} (explicit --var, skipping auto-detect)", - style("·").dim(), - style(&name).dim() - ); - continue; - } - - // For name 
collisions, pick highest confidence - group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); - let winner = group[0]; - - eprintln!( - " {} {} = {:?} ({:.0}% confidence, {})", - style("✓").green(), - style(&winner.suggested_name).bold(), - winner.value, - winner.confidence * 100.0, - winner.tier - ); - eprintln!(" {}", style(&winner.reason).dim()); - - if group.len() > 1 { - eprintln!( - " {} {} other candidates for this name (picked highest confidence)", - style("⚠").yellow(), - group.len() - 1 - ); - } - - result.push((winner.suggested_name.clone(), winner.value.clone())); - } - - result -} - -fn confirm_auto_detected_interactive( - candidates: Vec, - explicit_vars: &[(String, String)], -) -> Result> { - eprintln!( - "\n{} Auto-detected variables {}", - style("──").dim(), - style("──────────────────────────────────").dim() - ); - - // Group candidates by suggested_name - let mut groups: BTreeMap> = BTreeMap::new(); - for c in candidates { - groups.entry(c.suggested_name.clone()).or_default().push(c); - } - - let mut accepted = Vec::new(); - - for (name, mut group) in groups { - // Skip names already covered by explicit --var - if explicit_vars.iter().any(|(n, _)| n == &name) { - eprintln!( - "\n {} {} (provided via --var, skipping)", - style("·").dim(), - style(&name).dim() - ); - continue; - } - - // Sort by confidence descending - group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); - - if group.len() == 1 { - // Single candidate — simple confirm - let candidate = &group[0]; - eprintln!( - "\n {} = {:?} ({:.0}% confidence, {})", - style(&candidate.suggested_name).bold(), - candidate.value, - candidate.confidence * 100.0, - candidate.tier - ); - eprintln!(" {}", style(&candidate.reason).dim()); - if candidate.total_occurrences > 0 { - eprintln!( - " {} occurrences across {} files", - candidate.total_occurrences, candidate.file_count - ); - } - - let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) - .with_default(true) - 
.prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if accept { - accepted.push((candidate.suggested_name.clone(), candidate.value.clone())); - } - } else { - // Name collision — show selection prompt - eprintln!( - "\n {} Multiple candidates for {}:", - style("⚠").yellow(), - style(&name).bold() - ); - - let mut options: Vec = group - .iter() - .map(|c| { - format!( - "{:?} ({:.0}% confidence, {})", - c.value, - c.confidence * 100.0, - c.tier - ) - }) - .collect(); - options.push("Skip".to_string()); - - let selection = Select::new(&format!("Which value for \"{}\"?", name), options) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if selection != "Skip" { - // Find the matching candidate - if let Some(chosen) = group.iter().find(|c| { - format!( - "{:?} ({:.0}% confidence, {})", - c.value, - c.confidence * 100.0, - c.tier - ) == selection - }) { - accepted.push((chosen.suggested_name.clone(), chosen.value.clone())); - } - } - } - } - - Ok(accepted) -} - -fn confirm_files_interactive(files: &[PlannedExtractFile], dropped_count: usize) -> Result<()> { - let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); - let boilerplate: Vec<_> = files - .iter() - .filter(|f| !f.has_replacements() && !f.stubbed && !f.is_binary()) - .collect(); - let stubbed: Vec<_> = files.iter().filter(|f| f.stubbed).collect(); - let binary_count = files.iter().filter(|f| f.is_binary()).count(); - - eprintln!( - "\n{} File plan {}", - style("──").dim(), - style("──────────────────────────────────────────").dim() - ); - - // Templated files - eprintln!( - "\n {} ({} files, {} suffix):", - style("Templated").bold(), - templated.len(), - DEFAULT_TEMPLATES_SUFFIX - ); - for file in &templated { - eprintln!( - " {:<50} {} replacements", - file.template_path.display(), - file.replacement_count() - ); - } - - // Boilerplate files - eprintln!( - "\n {} (copied in full, {} files{}):", - style("Boilerplate").bold(), - boilerplate.len() + binary_count, - 
if binary_count > 0 { - format!(", {} binary", binary_count) - } else { - String::new() - } - ); - for file in &boilerplate { - eprintln!(" {}", file.template_path.display()); - } - - // Stubbed files - if !stubbed.is_empty() { - eprintln!( - "\n {} (structure only, {} files):", - style("Stubbed").bold(), - stubbed.len() - ); - for file in &stubbed { - eprintln!(" {}", file.template_path.display()); - } - } - - // Dropped files - if dropped_count > 0 { - eprintln!("\n {} ({} files):", style("Dropped").bold(), dropped_count); - } - - let proceed = Confirm::new("Proceed?") - .with_default(true) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if !proceed { - return Err(DicecutError::PromptCancelled); - } - - Ok(()) -} From 23491a81cd0757680e2f469e59aae75e20a09d5c Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 14:44:51 -0500 Subject: [PATCH 23/29] fix(extract): apply stub-depth to templated files too Files deeper than stub_depth were only dropped when they had 0 template replacements. Deep files with incidental replacements (e.g. a project name appearing in a nested reference doc) were still kept as .die templates. Now the depth check applies regardless of replacement count. 
--- src/extract/mod.rs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/extract/mod.rs b/src/extract/mod.rs index a3b524d..b1fe890 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -314,17 +314,23 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { let (replaced, count) = apply_replacements(content, &rules); if count > 0 { - // Has template replacements — keep content, add .die suffix - let mut p = template_path.as_os_str().to_string_lossy().to_string(); - p.push_str(DEFAULT_TEMPLATES_SUFFIX); - planned_files.push(PlannedExtractFile { - template_path: PathBuf::from(p), - content: ExtractedContent::Text { - content: replaced, - replacement_count: count, - }, - stubbed: false, - }); + // Has template replacements — but still drop if too deep + let depth = file.relative_path.components().count(); + if depth > options.stub_depth { + dropped_count += 1; + dropped_paths.push(file.relative_path.clone()); + } else { + let mut p = template_path.as_os_str().to_string_lossy().to_string(); + p.push_str(DEFAULT_TEMPLATES_SUFFIX); + planned_files.push(PlannedExtractFile { + template_path: PathBuf::from(p), + content: ExtractedContent::Text { + content: replaced, + replacement_count: count, + }, + stubbed: false, + }); + } } else { // No replacements — classify as boilerplate, content, or dropped match classify_file(&file.relative_path, options.stub_depth) { From 0760e235f6643e0cea26c94152ef0dcf87b0499f Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 16:46:58 -0500 Subject: [PATCH 24/29] fix(extract): filter deep files before auto-detect Non-boilerplate files deeper than stub_depth are now removed from the scan result before frequency analysis runs. This prevents detecting variables that only appear in files that would be dropped anyway. 
--- src/extract/mod.rs | 53 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/src/extract/mod.rs b/src/extract/mod.rs index b1fe890..13c2727 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -164,11 +164,28 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { "\n{}", style(format!("Scanning {}...", source_dir.display())).bold() ); - let scan_result = scan_project(source_dir, &scan_excludes)?; + let mut scan_result = scan_project(source_dir, &scan_excludes)?; + + // Drop non-boilerplate files deeper than stub_depth before auto-detect sees them. + // This prevents frequency analysis from detecting variables that only appear in + // files that would be dropped anyway. + let pre_filter_count = scan_result.files.len(); + scan_result.files.retain(|f| { + let depth = f.relative_path.components().count(); + depth <= options.stub_depth + || classify_file(&f.relative_path, options.stub_depth) == FileRole::Boilerplate + }); + let depth_dropped = pre_filter_count - scan_result.files.len(); + eprintln!( - " {} files found, {} excluded", + " {} files found, {} excluded{}", scan_result.files.len(), - scan_result.excluded_count + scan_result.excluded_count, + if depth_dropped > 0 { + format!(", {} too deep", depth_dropped) + } else { + String::new() + } ); // Phase 2.5: Auto-detect variables (always runs), merge with explicit --var entries @@ -293,7 +310,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { // Phase 9: Apply replacements to files let mut planned_files = Vec::new(); - let mut dropped_count = 0; + let mut dropped_count = depth_dropped; let mut dropped_paths = Vec::new(); for file in &scan_result.files { @@ -314,23 +331,17 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { let (replaced, count) = apply_replacements(content, &rules); if count > 0 { - // Has template replacements — but still drop if too deep - let depth = file.relative_path.components().count(); - 
if depth > options.stub_depth { - dropped_count += 1; - dropped_paths.push(file.relative_path.clone()); - } else { - let mut p = template_path.as_os_str().to_string_lossy().to_string(); - p.push_str(DEFAULT_TEMPLATES_SUFFIX); - planned_files.push(PlannedExtractFile { - template_path: PathBuf::from(p), - content: ExtractedContent::Text { - content: replaced, - replacement_count: count, - }, - stubbed: false, - }); - } + // Has template replacements — add .die suffix + let mut p = template_path.as_os_str().to_string_lossy().to_string(); + p.push_str(DEFAULT_TEMPLATES_SUFFIX); + planned_files.push(PlannedExtractFile { + template_path: PathBuf::from(p), + content: ExtractedContent::Text { + content: replaced, + replacement_count: count, + }, + stubbed: false, + }); } else { // No replacements — classify as boilerplate, content, or dropped match classify_file(&file.relative_path, options.stub_depth) { From 2f6240490e1095c4c2df54a4f7560b9f2ed2a80e Mon Sep 17 00:00:00 2001 From: rroskam Date: Sat, 28 Feb 2026 17:26:09 -0500 Subject: [PATCH 25/29] refactor: improve extraction --- src/extract/exclude.rs | 58 ++++++++++++++++++++++++++++++++++++++ src/extract/interactive.rs | 17 ++++++++++- src/extract/mod.rs | 18 +++++++++++- 3 files changed, 91 insertions(+), 2 deletions(-) diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index f228830..d8bac71 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -29,6 +29,8 @@ const DEFAULT_EXCLUDES: &[&str] = &[ ".output", ".turbo", ".worktrees", + ".claude/worktrees", + ".astro", ".diecut-answers.toml", ]; @@ -119,6 +121,27 @@ pub fn detect_copy_without_render( found } +/// Check if a file should be copied without rendering (lock files, binary-like assets). +/// +/// These files are included in the template but should never have replacements +/// applied during extraction — they're copied verbatim. 
+pub fn is_copy_without_render(path: &Path) -> bool { + for pattern in DEFAULT_COPY_WITHOUT_RENDER { + if let Some(ext) = pattern.strip_prefix("*.") { + if let Some(file_ext) = path.extension() { + if file_ext.to_string_lossy().eq_ignore_ascii_case(ext) { + return true; + } + } + } else if let Some(file_name) = path.file_name() { + if file_name.to_string_lossy() == *pattern { + return true; + } + } + } + false +} + /// Check if a path should be excluded based on the exclude patterns. pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { let path_str = relative_path.to_string_lossy(); @@ -232,6 +255,41 @@ mod tests { assert!(!relevant.contains(&"node_modules".to_string())); } + #[test] + fn test_should_exclude_claude_worktrees() { + let excludes = all_default_excludes(); + assert!(should_exclude( + Path::new(".claude/worktrees/agent-abc/Cargo.toml"), + &excludes + )); + // .claude/settings.local.json should NOT be excluded + assert!(!should_exclude( + Path::new(".claude/settings.local.json"), + &excludes + )); + } + + #[test] + fn test_should_exclude_astro() { + let excludes = all_default_excludes(); + assert!(should_exclude( + Path::new("docs/.astro/data-store.json"), + &excludes + )); + assert!(should_exclude(Path::new(".astro/settings.json"), &excludes)); + } + + #[test] + fn test_is_copy_without_render() { + assert!(is_copy_without_render(Path::new("Cargo.lock"))); + assert!(is_copy_without_render(Path::new("pnpm-lock.yaml"))); + assert!(is_copy_without_render(Path::new("package-lock.json"))); + assert!(is_copy_without_render(Path::new("logo.png"))); + assert!(is_copy_without_render(Path::new("deep/nested/file.lock"))); + assert!(!is_copy_without_render(Path::new("src/main.rs"))); + assert!(!is_copy_without_render(Path::new("README.md"))); + } + #[test] fn test_detect_copy_without_render() { let files = vec![ diff --git a/src/extract/interactive.rs b/src/extract/interactive.rs index 6b6c55d..2f9b8e9 100644 --- a/src/extract/interactive.rs 
+++ b/src/extract/interactive.rs @@ -6,7 +6,7 @@ use inquire::{Confirm, Select, Text}; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use super::auto_detect::DetectedCandidate; +use super::auto_detect::{ConfidenceTier, DetectedCandidate}; use super::conditional::DetectedConditional; use super::variants::generate_variants; use super::{ExtractVariable, PlannedExtractFile}; @@ -194,6 +194,7 @@ pub fn resolve_candidates_yes( } let mut result = Vec::new(); + let mut skipped_freq = 0; for (name, mut group) in groups { // Skip names already covered by explicit --var @@ -210,6 +211,12 @@ pub fn resolve_candidates_yes( group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); let winner = group[0]; + // Skip frequency-analysis candidates in -y mode — too noisy for auto-accept + if winner.tier == ConfidenceTier::FrequencyAnalysis { + skipped_freq += 1; + continue; + } + eprintln!( " {} {} = {:?} ({:.0}% confidence, {})", style("✓").green(), @@ -231,6 +238,14 @@ pub fn resolve_candidates_yes( result.push((winner.suggested_name.clone(), winner.value.clone())); } + if skipped_freq > 0 { + eprintln!( + " {} {} frequency-detected candidate(s) skipped (use interactive mode to review)", + style("·").dim(), + skipped_freq + ); + } + result } diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 13c2727..f9d1318 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -21,7 +21,10 @@ use self::conditional::{detect_conditional_files, patterns_for_variable}; use self::config_gen::{ generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, }; -use self::exclude::{all_default_excludes, detect_copy_without_render, relevant_config_excludes}; +use self::exclude::{ + all_default_excludes, detect_copy_without_render, is_copy_without_render, + relevant_config_excludes, +}; use self::interactive::{ confirm_auto_detected_interactive, confirm_conditionals_interactive, confirm_excludes_interactive, 
confirm_files_interactive, confirm_variants_interactive, @@ -328,6 +331,19 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { stubbed: false, }); } else if let Some(ref content) = file.content { + // Lock files and other copy-without-render files: skip replacement + if is_copy_without_render(&file.relative_path) { + planned_files.push(PlannedExtractFile { + template_path, + content: ExtractedContent::Text { + content: content.clone(), + replacement_count: 0, + }, + stubbed: false, + }); + continue; + } + let (replaced, count) = apply_replacements(content, &rules); if count > 0 { From 674be8ca3b667449b48b9d2ebdc8414bde1b34a0 Mon Sep 17 00:00:00 2001 From: rroskam Date: Wed, 4 Mar 2026 12:05:38 -0500 Subject: [PATCH 26/29] refactor(extract): trim to engine-only for PR 1 Remove auto-detect, interactive prompts, and conditional files to reduce PR scope. These features are preserved on feat/extract-auto-detect for a follow-up PR. - Delete auto_detect.rs (1,140 lines), interactive.rs (426 lines), conditional.rs (170 lines) - Remove --yes and --min-confidence CLI flags - Move count_occurrences to scan.rs (test-only) - Remove 4 auto-detect integration tests - Strip dead params and deduplicate DEFAULT_EXCLUDES --- src/cli.rs | 8 - src/commands/extract.rs | 8 +- src/extract/auto_detect.rs | 1140 ------------------------------------ src/extract/conditional.rs | 170 ------ src/extract/exclude.rs | 11 +- src/extract/interactive.rs | 426 -------------- src/extract/mod.rs | 159 +---- src/extract/replace.rs | 15 +- src/extract/scan.rs | 30 + src/main.rs | 13 +- tests/integration.rs | 206 +------ 11 files changed, 77 insertions(+), 2109 deletions(-) delete mode 100644 src/extract/auto_detect.rs delete mode 100644 src/extract/conditional.rs delete mode 100644 src/extract/interactive.rs diff --git a/src/cli.rs b/src/cli.rs index fde16cb..92bc8e1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -67,14 +67,6 @@ pub enum Commands { #[arg(long)] in_place: bool, - /// Accept all 
defaults without prompting - #[arg(short = 'y', long)] - yes: bool, - - /// Minimum confidence threshold for auto-detected variables (0.0-1.0) - #[arg(long, default_value = "0.5")] - min_confidence: f64, - /// Max path depth for stubbing content files (deeper files are dropped) #[arg(long, default_value = "2")] stub_depth: usize, diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 6251044..b7f7e9f 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -6,14 +6,11 @@ use diecut::error::DicecutError; use diecut::extract::{execute_extraction, plan_extraction, ExtractOptions}; use miette::Result; -#[allow(clippy::too_many_arguments)] pub fn run( source: String, vars: Vec, output: Option, in_place: bool, - yes: bool, - min_confidence: f64, stub_depth: usize, dry_run: bool, ) -> Result<()> { @@ -24,10 +21,7 @@ pub fn run( variables, output_dir: output.map(PathBuf::from), in_place, - yes, - min_confidence, stub_depth, - dry_run, }; let plan = plan_extraction(&options)?; @@ -37,7 +31,7 @@ pub fn run( return Ok(()); } - execute_extraction(&plan, in_place)?; + execute_extraction(&plan)?; Ok(()) } diff --git a/src/extract/auto_detect.rs b/src/extract/auto_detect.rs deleted file mode 100644 index 193bbac..0000000 --- a/src/extract/auto_detect.rs +++ /dev/null @@ -1,1140 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::path::Path; -use std::process::Command; -use std::sync::LazyLock; - -use regex_lite::Regex; - -static GO_MOD_RE: LazyLock = LazyLock::new(|| Regex::new(r"^module\s+(\S+)").unwrap()); - -static TOKEN_RE: LazyLock = LazyLock::new(|| { - Regex::new( - r"[a-zA-Z][a-zA-Z0-9]*(?:[-_.][a-zA-Z0-9]+)+|[A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+(?:[A-Z][a-z]+)+|[A-Z]{2,}(?:_[A-Z]{2,})+", - ) - .unwrap() -}); - -use super::scan::ScanResult; -use super::variants::split_into_words; - -/// Confidence tier indicating how a candidate variable was detected. 
-#[derive(Debug, Clone, PartialEq)] -pub enum ConfidenceTier { - DirectoryName, - ConfigFile, - GitMetadata, - FrequencyAnalysis, -} - -impl std::fmt::Display for ConfidenceTier { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - ConfidenceTier::DirectoryName => write!(f, "directory name"), - ConfidenceTier::ConfigFile => write!(f, "config file"), - ConfidenceTier::GitMetadata => write!(f, "git metadata"), - ConfidenceTier::FrequencyAnalysis => write!(f, "frequency analysis"), - } - } -} - -/// A candidate variable detected by auto-detection. -#[derive(Debug, Clone)] -pub struct DetectedCandidate { - pub suggested_name: String, - pub value: String, - pub tier: ConfidenceTier, - pub confidence: f64, - pub reason: String, - pub file_count: usize, - pub total_occurrences: usize, -} - -/// Result of running auto-detection. -#[derive(Debug)] -pub struct AutoDetectResult { - pub candidates: Vec, -} - -// ── Entry point ────────────────────────────────────────────────────────── - -/// Run all 4 auto-detection tiers against a scanned project. 
-pub fn auto_detect(project_dir: &Path, scan_result: &ScanResult) -> AutoDetectResult { - let mut candidates = Vec::new(); - - // Tier 1: Directory name - candidates.extend(detect_directory_name(project_dir, scan_result)); - - // Tier 2: Ecosystem config files - candidates.extend(detect_config_files(project_dir, scan_result)); - - // Tier 3: Git metadata - candidates.extend(detect_git_metadata(project_dir, scan_result)); - - // Collect values already covered by tiers 1-3 - let covered_values: HashSet = - candidates.iter().map(|c| c.value.to_lowercase()).collect(); - - // Tier 4: Frequency analysis - candidates.extend(detect_frequency(scan_result, &covered_values)); - - // Deduplicate by normalized word list, keeping highest confidence - deduplicate_candidates(&mut candidates); - - // Sort by confidence descending - candidates.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); - - AutoDetectResult { candidates } -} - -// ── Tier 1: Directory name ─────────────────────────────────────────────── - -const GENERIC_DIR_NAMES: &[&str] = &[ - "src", - "app", - "project", - "tmp", - "temp", - "build", - "dist", - "out", - "output", - "lib", - "bin", - "test", - "tests", - "example", - "examples", - "docs", - "doc", - "assets", - "public", - "static", - "vendor", - "node_modules", - "target", - "pkg", - "cmd", - "internal", - "api", - "web", - "server", - "client", - "frontend", - "backend", - "service", - "services", - "workspace", - "repo", - "code", -]; - -fn detect_directory_name(project_dir: &Path, scan_result: &ScanResult) -> Vec { - let dir_name = match project_dir.file_name() { - Some(name) => name.to_string_lossy().to_string(), - None => return vec![], - }; - - if GENERIC_DIR_NAMES.contains(&dir_name.to_lowercase().as_str()) { - return vec![]; - } - - // Must have at least 2 chars - if dir_name.len() < 2 { - return vec![]; - } - - let (file_count, total_occurrences) = count_occurrences(&dir_name, scan_result); - - vec![DetectedCandidate { - suggested_name: 
"project_name".to_string(), - value: dir_name.clone(), - tier: ConfidenceTier::DirectoryName, - confidence: 0.95, - reason: format!("directory name \"{}\"", dir_name), - file_count, - total_occurrences, - }] -} - -// ── Tier 2: Ecosystem config files ─────────────────────────────────────── - -fn detect_config_files(project_dir: &Path, scan_result: &ScanResult) -> Vec { - let mut candidates = Vec::new(); - - if let Some(mut c) = parse_cargo_toml(project_dir, scan_result) { - candidates.append(&mut c); - } - if let Some(mut c) = parse_package_json(project_dir, scan_result) { - candidates.append(&mut c); - } - if let Some(mut c) = parse_pyproject_toml(project_dir, scan_result) { - candidates.append(&mut c); - } - if let Some(mut c) = parse_go_mod(project_dir, scan_result) { - candidates.append(&mut c); - } - - candidates -} - -fn push_config_candidate( - candidates: &mut Vec, - value: &str, - suggested_name: &str, - confidence: f64, - reason: &str, - scan_result: &ScanResult, -) { - let (file_count, total_occurrences) = count_occurrences(value, scan_result); - candidates.push(DetectedCandidate { - suggested_name: suggested_name.to_string(), - value: value.to_string(), - tier: ConfidenceTier::ConfigFile, - confidence, - reason: reason.to_string(), - file_count, - total_occurrences, - }); -} - -fn parse_cargo_toml( - project_dir: &Path, - scan_result: &ScanResult, -) -> Option> { - let path = project_dir.join("Cargo.toml"); - let content = std::fs::read_to_string(&path).ok()?; - let parsed: toml::Value = content.parse().ok()?; - - let mut candidates = Vec::new(); - - if let Some(name) = parsed - .get("package") - .and_then(|p| p.get("name")) - .and_then(|n| n.as_str()) - { - push_config_candidate( - &mut candidates, - name, - "project_name", - 0.90, - "Cargo.toml [package].name", - scan_result, - ); - } - - if let Some(version) = parsed - .get("package") - .and_then(|p| p.get("version")) - .and_then(|v| v.as_str()) - { - if !version.is_empty() { - push_config_candidate( 
- &mut candidates, - version, - "version", - 0.85, - "Cargo.toml [package].version", - scan_result, - ); - } - } - - if let Some(authors) = parsed - .get("package") - .and_then(|p| p.get("authors")) - .and_then(|a| a.as_array()) - { - if let Some(first) = authors.first().and_then(|a| a.as_str()) { - let author = strip_email(first); - if !author.is_empty() { - push_config_candidate( - &mut candidates, - &author, - "author", - 0.85, - "Cargo.toml [package].authors[0]", - scan_result, - ); - } - } - } - - Some(candidates) -} - -fn parse_package_json( - project_dir: &Path, - scan_result: &ScanResult, -) -> Option> { - let path = project_dir.join("package.json"); - let content = std::fs::read_to_string(&path).ok()?; - let parsed: serde_json::Value = serde_json::from_str(&content).ok()?; - - let mut candidates = Vec::new(); - - if let Some(name) = parsed.get("name").and_then(|n| n.as_str()) { - let clean_name = strip_npm_scope(name); - push_config_candidate( - &mut candidates, - clean_name, - "project_name", - 0.90, - "package.json \"name\"", - scan_result, - ); - } - - if let Some(version) = parsed.get("version").and_then(|v| v.as_str()) { - if !version.is_empty() { - push_config_candidate( - &mut candidates, - version, - "version", - 0.85, - "package.json \"version\"", - scan_result, - ); - } - } - - if let Some(author) = parsed.get("author") { - let author_str = match author { - serde_json::Value::String(s) => Some(strip_email(s)), - serde_json::Value::Object(obj) => { - obj.get("name").and_then(|n| n.as_str()).map(String::from) - } - _ => None, - }; - if let Some(author_name) = author_str { - if !author_name.is_empty() { - push_config_candidate( - &mut candidates, - &author_name, - "author", - 0.85, - "package.json \"author\"", - scan_result, - ); - } - } - } - - Some(candidates) -} - -fn parse_pyproject_toml( - project_dir: &Path, - scan_result: &ScanResult, -) -> Option> { - let path = project_dir.join("pyproject.toml"); - let content = 
std::fs::read_to_string(&path).ok()?; - let parsed: toml::Value = content.parse().ok()?; - - let mut candidates = Vec::new(); - - if let Some(name) = parsed - .get("project") - .and_then(|p| p.get("name")) - .and_then(|n| n.as_str()) - { - push_config_candidate( - &mut candidates, - name, - "project_name", - 0.90, - "pyproject.toml [project].name", - scan_result, - ); - } - - if let Some(version) = parsed - .get("project") - .and_then(|p| p.get("version")) - .and_then(|v| v.as_str()) - { - if !version.is_empty() { - push_config_candidate( - &mut candidates, - version, - "version", - 0.85, - "pyproject.toml [project].version", - scan_result, - ); - } - } - - if let Some(authors) = parsed - .get("project") - .and_then(|p| p.get("authors")) - .and_then(|a| a.as_array()) - { - if let Some(first) = authors.first() { - let author_name = first - .get("name") - .and_then(|n| n.as_str()) - .or_else(|| first.as_str()) - .map(strip_email); - if let Some(name) = author_name { - if !name.is_empty() { - push_config_candidate( - &mut candidates, - &name, - "author", - 0.85, - "pyproject.toml [project].authors[0].name", - scan_result, - ); - } - } - } - } - - Some(candidates) -} - -fn parse_go_mod(project_dir: &Path, scan_result: &ScanResult) -> Option> { - let path = project_dir.join("go.mod"); - let content = std::fs::read_to_string(&path).ok()?; - - let module_path = GO_MOD_RE.captures(&content)?.get(1)?.as_str(); - - let segments: Vec<&str> = module_path.split('/').collect(); - - // Extract last path segment as project name - let name = segments.last().copied()?; - if name.is_empty() { - return None; - } - - let mut candidates = Vec::new(); - - push_config_candidate( - &mut candidates, - name, - "project_name", - 0.90, - &format!("go.mod module \"{}\"", module_path), - scan_result, - ); - - // Extract org name (second-to-last segment for github.com/org/repo patterns) - if segments.len() >= 3 { - let org = segments[segments.len() - 2]; - if !org.is_empty() && org != name { - 
let (_, org_total_occurrences) = count_occurrences(org, scan_result); - if org_total_occurrences > 0 { - push_config_candidate( - &mut candidates, - org, - "org_name", - 0.85, - &format!("go.mod module org \"{}\"", org), - scan_result, - ); - } - } - } - - Some(candidates) -} - -// ── Tier 3: Git metadata ───────────────────────────────────────────────── - -fn detect_git_metadata(project_dir: &Path, scan_result: &ScanResult) -> Vec { - let mut candidates = Vec::new(); - - // Try to get remote origin URL - if let Some(url) = git_config_get(project_dir, "remote.origin.url") { - if let Some(org) = parse_org_from_url(&url) { - let (file_count, total_occurrences) = count_occurrences(&org, scan_result); - // Only include if org name actually appears in files - if total_occurrences > 0 { - candidates.push(DetectedCandidate { - suggested_name: "org_name".to_string(), - value: org.clone(), - tier: ConfidenceTier::GitMetadata, - confidence: 0.70, - reason: format!("git remote org \"{}\"", org), - file_count, - total_occurrences, - }); - } - } - } - - // Try to get user name - if let Some(user_name) = git_config_get(project_dir, "user.name") { - if !user_name.is_empty() { - let (file_count, total_occurrences) = count_occurrences(&user_name, scan_result); - candidates.push(DetectedCandidate { - suggested_name: "author".to_string(), - value: user_name.clone(), - tier: ConfidenceTier::GitMetadata, - confidence: 0.65, - reason: format!("git config user.name \"{}\"", user_name), - file_count, - total_occurrences, - }); - } - } - - candidates -} - -fn git_config_get(project_dir: &Path, key: &str) -> Option { - let output = Command::new("git") - .arg("config") - .arg("--get") - .arg(key) - .current_dir(project_dir) - .env("GIT_TERMINAL_PROMPT", "0") - .output() - .ok()?; - - if !output.status.success() { - return None; - } - - let value = String::from_utf8(output.stdout).ok()?.trim().to_string(); - if value.is_empty() { - None - } else { - Some(value) - } -} - -fn 
parse_org_from_url(url: &str) -> Option { - // SSH: git@github.com:org/repo.git - if let Some(rest) = url.strip_prefix("git@") { - let after_colon = rest.split(':').nth(1)?; - let org = after_colon.split('/').next()?; - if !org.is_empty() { - return Some(org.to_string()); - } - } - - // HTTPS: https://github.com/org/repo.git - if url.starts_with("https://") || url.starts_with("http://") { - let parts: Vec<&str> = url.split('/').collect(); - // https://host/org/repo → parts[3] is org - if parts.len() >= 4 && !parts[3].is_empty() { - return Some(parts[3].to_string()); - } - } - - None -} - -// ── Tier 4: Frequency analysis ─────────────────────────────────────────── - -fn detect_frequency( - scan_result: &ScanResult, - covered_values: &HashSet, -) -> Vec { - // Tokenize all text file content - let mut token_file_map: HashMap> = HashMap::new(); - let mut token_counts: HashMap = HashMap::new(); - - for (file_idx, file) in scan_result.files.iter().enumerate() { - if let Some(ref content) = file.content { - for mat in TOKEN_RE.find_iter(content) { - let token = mat.as_str().to_string(); - token_file_map - .entry(token.clone()) - .or_default() - .insert(file_idx); - *token_counts.entry(token).or_insert(0) += 1; - } - } - } - - // Group tokens by normalized word list to find multi-variant clusters - struct Cluster { - literals: Vec, - total_occurrences: usize, - files: HashSet, - } - - let mut clusters: HashMap = HashMap::new(); - - for (token, count) in &token_counts { - let words = split_into_words(token); - let normalized_key = words.join(" "); - - // Token must be at least 4 chars - if token.len() < 4 { - continue; - } - - let cluster = clusters.entry(normalized_key).or_insert_with(|| Cluster { - literals: Vec::new(), - total_occurrences: 0, - files: HashSet::new(), - }); - - if !cluster.literals.contains(token) { - cluster.literals.push(token.clone()); - } - cluster.total_occurrences += count; - if let Some(file_set) = token_file_map.get(token) { - 
cluster.files.extend(file_set); - } - } - - // Filter and convert to candidates - let mut freq_candidates: Vec = Vec::new(); - - for cluster in clusters.values() { - // Must have ≥2 distinct case variants (the key multi-variant heuristic) - if cluster.literals.len() < 2 { - continue; - } - - // Must have ≥3 total occurrences - if cluster.total_occurrences < 3 { - continue; - } - - // Must appear in ≥2 files - if cluster.files.len() < 2 { - continue; - } - - // Skip if already covered by higher tiers - if cluster - .literals - .iter() - .any(|l| covered_values.contains(&l.to_lowercase())) - { - continue; - } - - let best_literal = &cluster.literals[0]; - let words = split_into_words(best_literal); - let suggested_name = if words.len() <= 3 { - words.join("_") - } else { - words[..3].join("_") - }; - - let file_count = cluster.files.len(); - freq_candidates.push(DetectedCandidate { - suggested_name, - value: best_literal.clone(), - tier: ConfidenceTier::FrequencyAnalysis, - confidence: 0.60, - reason: format!( - "{} occurrences across {} files, {} variant(s)", - cluster.total_occurrences, - file_count, - cluster.literals.len() - ), - file_count, - total_occurrences: cluster.total_occurrences, - }); - } - - // Sort by file_count * total_occurrences descending, take top 5 - freq_candidates.sort_by(|a, b| { - let score_a = a.file_count * a.total_occurrences; - let score_b = b.file_count * b.total_occurrences; - score_b.cmp(&score_a) - }); - freq_candidates.truncate(5); - - freq_candidates -} - -// ── Helpers ────────────────────────────────────────────────────────────── - -pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize) { - let mut file_count = 0; - let mut total = 0; - - for file in &scan_result.files { - let mut counted_file = false; - - if let Some(ref content) = file.content { - let hits = content.matches(value).count(); - if hits > 0 { - file_count += 1; - counted_file = true; - total += hits; - } - } - - let path_str = 
file.relative_path.to_string_lossy(); - let path_hits = path_str.matches(value).count(); - if path_hits > 0 { - total += path_hits; - if !counted_file { - file_count += 1; - } - } - } - - (file_count, total) -} - -pub fn strip_email(s: &str) -> String { - // "Jane Doe " → "Jane Doe" - if let Some(idx) = s.find('<') { - s[..idx].trim().to_string() - } else if s.contains('@') { - // Bare email — use part before @ - s.split('@').next().unwrap_or("").trim().to_string() - } else { - s.trim().to_string() - } -} - -fn strip_npm_scope(name: &str) -> &str { - if let Some(rest) = name.strip_prefix('@') { - rest.split('/').nth(1).unwrap_or(name) - } else { - name - } -} - -fn deduplicate_candidates(candidates: &mut Vec) { - // Only deduplicate by value (same literal from multiple tiers → keep highest confidence). - // Name collisions (e.g., two different "author" candidates) are preserved - // for the interactive/yes layer to resolve. - let mut seen_value: HashMap = HashMap::new(); - let mut to_remove = Vec::new(); - - for (i, candidate) in candidates.iter().enumerate() { - let value_key = candidate.value.to_lowercase(); - if let Some(&prev_idx) = seen_value.get(&value_key) { - if candidate.confidence > candidates[prev_idx].confidence { - to_remove.push(prev_idx); - seen_value.insert(value_key, i); - } else { - to_remove.push(i); - } - } else { - seen_value.insert(value_key, i); - } - } - - to_remove.sort_unstable(); - to_remove.dedup(); - for idx in to_remove.into_iter().rev() { - candidates.remove(idx); - } -} - -// ── Tests ──────────────────────────────────────────────────────────────── - -#[cfg(test)] -mod tests { - use super::*; - use crate::extract::scan::ScannedFile; - use std::path::PathBuf; - - fn make_scan_result(files: Vec<(&str, &str)>) -> ScanResult { - ScanResult { - files: files - .into_iter() - .map(|(path, content)| ScannedFile { - relative_path: PathBuf::from(path), - absolute_path: PathBuf::from(path), - is_binary: false, - content: 
Some(content.to_string()), - }) - .collect(), - excluded_count: 0, - } - } - - // ── Tier 1 tests ───────────────────────────────────────────────── - - #[test] - fn test_tier1_basic_dir_name() { - let scan = make_scan_result(vec![ - ("README.md", "# my-widget\nA widget project"), - ("src/lib.rs", "// my-widget core"), - ]); - let dir = PathBuf::from("/projects/my-widget"); - let candidates = detect_directory_name(&dir, &scan); - - assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].value, "my-widget"); - assert_eq!(candidates[0].suggested_name, "project_name"); - assert_eq!(candidates[0].confidence, 0.95); - assert!(candidates[0].total_occurrences >= 2); - } - - #[test] - fn test_tier1_generic_name_skipped() { - let scan = make_scan_result(vec![("main.rs", "fn main() {}")]); - let dir = PathBuf::from("/projects/src"); - let candidates = detect_directory_name(&dir, &scan); - assert!(candidates.is_empty()); - } - - #[test] - fn test_tier1_occurrence_counting() { - let scan = make_scan_result(vec![ - ("a.txt", "hello hello hello"), - ("b.txt", "hello world"), - ]); - let dir = PathBuf::from("/projects/hello"); - let candidates = detect_directory_name(&dir, &scan); - assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].file_count, 2); - assert!(candidates[0].total_occurrences >= 4); - } - - // ── Tier 2 tests ───────────────────────────────────────────────── - - #[test] - fn test_tier2_cargo_toml() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("Cargo.toml"), - "[package]\nname = \"data-pipeline\"\nversion = \"0.3.1\"\nauthors = [\"Alice \"]\n", - ) - .unwrap(); - - let scan = make_scan_result(vec![("src/main.rs", "data-pipeline runs here")]); - let candidates = parse_cargo_toml(dir.path(), &scan).unwrap(); - - assert!(candidates.iter().any(|c| c.value == "data-pipeline")); - assert!(candidates - .iter() - .any(|c| c.value == "0.3.1" && c.suggested_name == "version" && c.confidence == 0.85)); - 
assert!(candidates.iter().any(|c| c.value == "Alice")); - } - - #[test] - fn test_tier2_package_json_with_scope() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("package.json"), - r#"{"name": "@myorg/cool-widget", "version": "2.1.0", "author": "Bob Smith "}"#, - ) - .unwrap(); - - let scan = make_scan_result(vec![("index.js", "cool-widget stuff")]); - let candidates = parse_package_json(dir.path(), &scan).unwrap(); - - let name_candidate = candidates - .iter() - .find(|c| c.suggested_name == "project_name") - .unwrap(); - assert_eq!(name_candidate.value, "cool-widget"); - - let version_candidate = candidates - .iter() - .find(|c| c.suggested_name == "version") - .unwrap(); - assert_eq!(version_candidate.value, "2.1.0"); - assert_eq!(version_candidate.confidence, 0.85); - - let author_candidate = candidates - .iter() - .find(|c| c.suggested_name == "author") - .unwrap(); - assert_eq!(author_candidate.value, "Bob Smith"); - } - - #[test] - fn test_tier2_pyproject_toml() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("pyproject.toml"), - "[project]\nname = \"my-tool\"\nversion = \"1.0.0\"\n\n[[project.authors]]\nname = \"Charlie\"\n", - ) - .unwrap(); - - let scan = make_scan_result(vec![("setup.py", "my-tool setup")]); - let candidates = parse_pyproject_toml(dir.path(), &scan).unwrap(); - - assert!(candidates.iter().any(|c| c.value == "my-tool")); - assert!(candidates - .iter() - .any(|c| c.value == "1.0.0" && c.suggested_name == "version" && c.confidence == 0.85)); - assert!(candidates.iter().any(|c| c.value == "Charlie")); - } - - #[test] - fn test_tier2_go_mod() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("go.mod"), - "module github.com/acme/my-service\n\ngo 1.21\n", - ) - .unwrap(); - - let scan = make_scan_result(vec![("main.go", "package main // my-service by acme")]); - let candidates = parse_go_mod(dir.path(), &scan).unwrap(); - - let project = candidates - 
.iter() - .find(|c| c.suggested_name == "project_name"); - assert!(project.is_some()); - assert_eq!(project.unwrap().value, "my-service"); - - let org = candidates.iter().find(|c| c.suggested_name == "org_name"); - assert!(org.is_some(), "should extract org from go.mod module path"); - assert_eq!(org.unwrap().value, "acme"); - } - - #[test] - fn test_tier2_missing_file() { - let dir = tempfile::tempdir().unwrap(); - let scan = make_scan_result(vec![]); - - assert!(parse_cargo_toml(dir.path(), &scan).is_none()); - assert!(parse_package_json(dir.path(), &scan).is_none()); - assert!(parse_pyproject_toml(dir.path(), &scan).is_none()); - assert!(parse_go_mod(dir.path(), &scan).is_none()); - } - - #[test] - fn test_tier2_malformed_cargo_toml() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("Cargo.toml"), - "this is not valid toml {{{}}}", - ) - .unwrap(); - let scan = make_scan_result(vec![]); - assert!(parse_cargo_toml(dir.path(), &scan).is_none()); - } - - #[test] - fn test_tier2_version_missing() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write( - dir.path().join("Cargo.toml"), - "[package]\nname = \"no-version-crate\"\n", - ) - .unwrap(); - std::fs::write( - dir.path().join("package.json"), - r#"{"name": "no-version-pkg"}"#, - ) - .unwrap(); - std::fs::write( - dir.path().join("pyproject.toml"), - "[project]\nname = \"no-version-py\"\n", - ) - .unwrap(); - - let scan = make_scan_result(vec![]); - - let cargo = parse_cargo_toml(dir.path(), &scan).unwrap(); - assert!(!cargo.iter().any(|c| c.suggested_name == "version")); - - let pkg = parse_package_json(dir.path(), &scan).unwrap(); - assert!(!pkg.iter().any(|c| c.suggested_name == "version")); - - let pyproj = parse_pyproject_toml(dir.path(), &scan).unwrap(); - assert!(!pyproj.iter().any(|c| c.suggested_name == "version")); - } - - // ── Tier 3 tests ───────────────────────────────────────────────── - - #[test] - fn test_parse_org_from_url_ssh() { - assert_eq!( - 
parse_org_from_url("git@github.com:acme-corp/my-repo.git"), - Some("acme-corp".to_string()) - ); - } - - #[test] - fn test_parse_org_from_url_https() { - assert_eq!( - parse_org_from_url("https://github.com/acme-corp/my-repo.git"), - Some("acme-corp".to_string()) - ); - } - - #[test] - fn test_strip_email_with_angle_brackets() { - assert_eq!(strip_email("Jane Doe "), "Jane Doe"); - } - - #[test] - fn test_strip_email_bare_email() { - assert_eq!(strip_email("jane@example.com"), "jane"); - } - - #[test] - fn test_strip_email_no_email() { - assert_eq!(strip_email("Jane Doe"), "Jane Doe"); - } - - // ── Tier 4 tests ───────────────────────────────────────────────── - - #[test] - fn test_frequency_finds_repeated_identifier() { - let scan = make_scan_result(vec![ - ("a.txt", "data-pipeline is great\ndata-pipeline rocks"), - ("b.txt", "use data_pipeline here\ndata_pipeline again"), - ("c.txt", "DataPipeline class\nDataPipeline impl"), - ("d.txt", "DATA_PIPELINE env var\nDATA_PIPELINE config"), - ]); - - let covered = HashSet::new(); - let candidates = detect_frequency(&scan, &covered); - - assert!(!candidates.is_empty()); - // Should find "data-pipeline" cluster (multi-variant) - let found = candidates.iter().any(|c| { - let words = split_into_words(&c.value); - words == vec!["data", "pipeline"] - }); - assert!( - found, - "should find data-pipeline cluster, got: {:?}", - candidates - ); - } - - #[test] - fn test_frequency_filters_short_tokens() { - let scan = make_scan_result(vec![("a.txt", "ab cd ef gh"), ("b.txt", "ab cd ef gh")]); - - let covered = HashSet::new(); - let candidates = detect_frequency(&scan, &covered); - - assert!(candidates.is_empty(), "short tokens should be filtered"); - } - - #[test] - fn test_frequency_skips_covered_values() { - let scan = make_scan_result(vec![ - ("a.txt", "my-widget rocks"), - ("b.txt", "my-widget is great"), - ("c.txt", "my_widget too"), - ]); - - let mut covered = HashSet::new(); - covered.insert("my-widget".to_string()); - let 
candidates = detect_frequency(&scan, &covered); - - let has_widget = candidates - .iter() - .any(|c| c.value.to_lowercase().contains("widget")); - assert!(!has_widget, "covered values should be skipped"); - } - - #[test] - fn test_frequency_requires_multi_variant() { - // Single variant only — should NOT be detected even with many occurrences - let scan = make_scan_result(vec![ - ("a.txt", "async_handler async_handler async_handler"), - ("b.txt", "async_handler async_handler"), - ("c.txt", "async_handler"), - ]); - - let covered = HashSet::new(); - let candidates = detect_frequency(&scan, &covered); - - assert!( - candidates.is_empty(), - "single-variant tokens should be filtered, got: {:?}", - candidates - ); - } - - // ── Helper tests ───────────────────────────────────────────────── - - #[test] - fn test_deduplication_keeps_highest_confidence() { - let mut candidates = vec![ - DetectedCandidate { - suggested_name: "project_name".to_string(), - value: "my-app".to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.90, - reason: "Cargo.toml".to_string(), - file_count: 3, - total_occurrences: 10, - }, - DetectedCandidate { - suggested_name: "project_name".to_string(), - value: "my-app".to_string(), - tier: ConfidenceTier::DirectoryName, - confidence: 0.95, - reason: "directory name".to_string(), - file_count: 3, - total_occurrences: 10, - }, - ]; - - deduplicate_candidates(&mut candidates); - assert_eq!(candidates.len(), 1); - assert_eq!(candidates[0].confidence, 0.95); - } - - #[test] - fn test_name_collisions_preserved() { - let mut candidates = vec![ - DetectedCandidate { - suggested_name: "author".to_string(), - value: "Alice Johnson".to_string(), - tier: ConfidenceTier::ConfigFile, - confidence: 0.85, - reason: "package.json".to_string(), - file_count: 3, - total_occurrences: 5, - }, - DetectedCandidate { - suggested_name: "author".to_string(), - value: "Robert Roskam".to_string(), - tier: ConfidenceTier::GitMetadata, - confidence: 0.65, - reason: 
"git config".to_string(), - file_count: 0, - total_occurrences: 0, - }, - ]; - - deduplicate_candidates(&mut candidates); - assert_eq!( - candidates.len(), - 2, - "name collisions should be preserved for interactive resolution" - ); - } - - #[test] - fn test_strip_npm_scope() { - assert_eq!(strip_npm_scope("@myorg/cool-widget"), "cool-widget"); - assert_eq!(strip_npm_scope("plain-package"), "plain-package"); - } - - #[test] - fn test_auto_detect_integration() { - let dir = tempfile::tempdir().unwrap(); - let project_dir = dir.path().join("my-widget"); - std::fs::create_dir(&project_dir).unwrap(); - std::fs::write( - project_dir.join("README.md"), - "# my-widget\nWelcome to my-widget", - ) - .unwrap(); - std::fs::write( - project_dir.join("lib.rs"), - "pub mod my_widget;\nstruct MyWidget;", - ) - .unwrap(); - - let scan = crate::extract::scan::scan_project(&project_dir, &[]).unwrap(); - let result = auto_detect(&project_dir, &scan); - - assert!(!result.candidates.is_empty()); - let project_name = result - .candidates - .iter() - .find(|c| c.suggested_name == "project_name"); - assert!(project_name.is_some(), "should detect project_name"); - assert_eq!(project_name.unwrap().value, "my-widget"); - } -} diff --git a/src/extract/conditional.rs b/src/extract/conditional.rs deleted file mode 100644 index 67e7346..0000000 --- a/src/extract/conditional.rs +++ /dev/null @@ -1,170 +0,0 @@ -use std::path::Path; - -/// A known optional file pattern that can be made conditional in the template. -#[derive(Debug, Clone)] -pub struct ConditionalPattern { - /// Glob pattern to match files. - pub pattern: &'static str, - /// Variable name to control inclusion. - pub variable: &'static str, - /// Human-readable description. - pub description: &'static str, -} - -/// Curated list of known optional file patterns. 
-const KNOWN_PATTERNS: &[ConditionalPattern] = &[ - ConditionalPattern { - pattern: ".github/**", - variable: "use_github_actions", - description: "GitHub Actions CI", - }, - ConditionalPattern { - pattern: ".gitlab-ci.yml", - variable: "use_gitlab_ci", - description: "GitLab CI", - }, - ConditionalPattern { - pattern: "Dockerfile", - variable: "use_docker", - description: "Docker support", - }, - ConditionalPattern { - pattern: "docker-compose.yml", - variable: "use_docker", - description: "Docker support", - }, - ConditionalPattern { - pattern: "docker-compose.yaml", - variable: "use_docker", - description: "Docker support", - }, - ConditionalPattern { - pattern: ".pre-commit-config.yaml", - variable: "use_pre_commit", - description: "Pre-commit hooks", - }, - ConditionalPattern { - pattern: "Makefile", - variable: "use_make", - description: "Make build system", - }, - ConditionalPattern { - pattern: "Justfile", - variable: "use_just", - description: "Just command runner", - }, - ConditionalPattern { - pattern: ".editorconfig", - variable: "use_editorconfig", - description: "EditorConfig", - }, - ConditionalPattern { - pattern: "renovate.json", - variable: "use_renovate", - description: "Renovate dependency updates", - }, - ConditionalPattern { - pattern: ".renovaterc", - variable: "use_renovate", - description: "Renovate dependency updates", - }, - ConditionalPattern { - pattern: ".github/dependabot.yml", - variable: "use_dependabot", - description: "Dependabot", - }, - ConditionalPattern { - pattern: ".husky/**", - variable: "use_husky", - description: "Git hooks (JS)", - }, -]; - -/// A detected conditional file in the project. -#[derive(Debug, Clone)] -pub struct DetectedConditional { - /// The pattern that matched. - pub pattern: String, - /// The variable name to control this pattern. - pub variable: String, - /// Human-readable description. - pub description: String, -} - -/// Detect which known optional file patterns exist in the project. 
-/// -/// Groups by variable name — e.g., multiple Docker files share `use_docker`. -pub fn detect_conditional_files(project_dir: &Path) -> Vec { - let mut detected = Vec::new(); - let mut seen_variables = std::collections::HashSet::new(); - - for known in KNOWN_PATTERNS { - let exists = if known.pattern.contains("**") { - // Directory pattern — check if the directory exists - let dir_part = known.pattern.split("/**").next().unwrap_or(known.pattern); - project_dir.join(dir_part).exists() - } else { - project_dir.join(known.pattern).exists() - }; - - if exists && seen_variables.insert(known.variable) { - detected.push(DetectedConditional { - pattern: known.pattern.to_string(), - variable: known.variable.to_string(), - description: known.description.to_string(), - }); - } - } - - detected -} - -/// Get all patterns for a given variable name from the known patterns list. -pub fn patterns_for_variable(variable: &str) -> Vec<&'static str> { - KNOWN_PATTERNS - .iter() - .filter(|p| p.variable == variable) - .map(|p| p.pattern) - .collect() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_detect_conditional_files_github() { - let dir = tempfile::tempdir().unwrap(); - std::fs::create_dir_all(dir.path().join(".github/workflows")).unwrap(); - - let detected = detect_conditional_files(dir.path()); - assert_eq!(detected.len(), 1); - assert_eq!(detected[0].variable, "use_github_actions"); - } - - #[test] - fn test_detect_conditional_files_docker() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("Dockerfile"), "FROM alpine").unwrap(); - std::fs::write(dir.path().join("docker-compose.yml"), "version: '3'").unwrap(); - - let detected = detect_conditional_files(dir.path()); - // Should deduplicate by variable name - assert_eq!(detected.len(), 1); - assert_eq!(detected[0].variable, "use_docker"); - } - - #[test] - fn test_detect_conditional_files_empty() { - let dir = tempfile::tempdir().unwrap(); - let detected = 
detect_conditional_files(dir.path()); - assert!(detected.is_empty()); - } - - #[test] - fn test_patterns_for_variable() { - let docker_patterns = patterns_for_variable("use_docker"); - assert!(docker_patterns.contains(&"Dockerfile")); - assert!(docker_patterns.contains(&"docker-compose.yml")); - } -} diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index d8bac71..0f75060 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -3,15 +3,12 @@ use std::path::Path; /// Default directories and files to exclude from template extraction. const DEFAULT_EXCLUDES: &[&str] = &[ ".git", - ".git/", ".hg", ".svn", "node_modules", - "node_modules/", ".DS_Store", "Thumbs.db", "__pycache__", - "__pycache__/", "*.pyc", ".tox", ".nox", @@ -19,7 +16,6 @@ const DEFAULT_EXCLUDES: &[&str] = &[ ".ruff_cache", ".pytest_cache", "target", - "target/", ".venv", ".env", "dist", @@ -89,10 +85,7 @@ pub fn relevant_config_excludes(template_files: &[std::path::PathBuf]) -> Vec Vec { +pub fn detect_copy_without_render(files: &[std::path::PathBuf]) -> Vec { let mut found = Vec::new(); for pattern in DEFAULT_COPY_WITHOUT_RENDER { @@ -297,7 +290,7 @@ mod tests { PathBuf::from("font.woff2"), PathBuf::from("README.md"), ]; - let found = detect_copy_without_render(Path::new("."), &files); + let found = detect_copy_without_render(&files); assert!(found.contains(&"*.png".to_string())); assert!(found.contains(&"*.woff2".to_string())); assert!(!found.contains(&"*.jpg".to_string())); diff --git a/src/extract/interactive.rs b/src/extract/interactive.rs deleted file mode 100644 index 2f9b8e9..0000000 --- a/src/extract/interactive.rs +++ /dev/null @@ -1,426 +0,0 @@ -use std::collections::BTreeMap; - -use console::style; -use inquire::{Confirm, Select, Text}; - -use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; -use crate::error::{DicecutError, Result}; - -use super::auto_detect::{ConfidenceTier, DetectedCandidate}; -use super::conditional::DetectedConditional; -use 
super::variants::generate_variants; -use super::{ExtractVariable, PlannedExtractFile}; - -pub fn confirm_variants_interactive( - variables: Vec, -) -> Result> { - let mut confirmed = Vec::new(); - - for mut var in variables { - eprintln!( - "\n{} {} = {:?} {}", - style("──").dim(), - style(&var.name).bold(), - var.value, - style("──────────────────────────────────────").dim() - ); - - if var.variants.len() == 1 && var.variants[0].name == "verbatim" { - // Simple value — just show occurrence count - let (file_count, total_hits) = var - .occurrence_counts - .first() - .map(|(_, fc, th)| (*fc, *th)) - .unwrap_or((0, 0)); - if total_hits > 0 { - eprintln!( - " Found in {} files ({} occurrences)", - file_count, total_hits - ); - } else { - eprintln!( - " {} Value not found in any file (will still be added to config)", - style("⚠").yellow() - ); - } - confirmed.push(var); - continue; - } - - // Show detected variants with counts - eprintln!(" Detected case variants:"); - let mut found_any = false; - for (i, variant) in var.variants.iter().enumerate() { - let (_, file_count, total_hits) = &var.occurrence_counts[i]; - let mark = if *total_hits > 0 { - found_any = true; - style("✓").green().to_string() - } else { - style("✗").dim().to_string() - }; - let hits_str = if *total_hits > 0 { - format!( - "{} {} across {} {}", - total_hits, - if *total_hits == 1 { "hit" } else { "hits" }, - file_count, - if *file_count == 1 { "file" } else { "files" } - ) - } else { - "not found".to_string() - }; - eprintln!( - " {} {:<16} {:<20} {}", - mark, - variant.literal, - variant.name, - style(&hits_str).dim() - ); - } - - if !found_any { - eprintln!( - " {} No occurrences found for any variant (will still be added to config)", - style("⚠").yellow() - ); - // Keep just the first variant - var.variants.truncate(1); - confirmed.push(var); - continue; - } - - let keep = Confirm::new("Keep detected variants?") - .with_default(true) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - 
- if keep { - // Remove variants with zero occurrences - let counts = var.occurrence_counts.clone(); - var.variants.retain(|v| { - counts - .iter() - .any(|(name, _, hits)| name == v.name && *hits > 0) - }); - if var.variants.is_empty() { - let all = generate_variants(&var.name, &var.value); - if let Some(first) = all.into_iter().next() { - var.variants.push(first); - } - } - } else { - // Keep only the canonical variant - var.variants.truncate(1); - } - - confirmed.push(var); - } - - Ok(confirmed) -} - -pub fn confirm_excludes_interactive(mut excludes: Vec) -> Result> { - eprintln!( - "\n{} Excludes {}", - style("──").dim(), - style("─────────────────────────────────────────────").dim() - ); - if excludes.is_empty() { - eprintln!(" No exclude patterns needed for this template."); - } else { - eprintln!(" Patterns matching template files:"); - for e in &excludes { - eprintln!(" {}", e); - } - } - - let extra = Text::new("Add extra exclude patterns? (comma-separated, enter to skip)") - .with_default("") - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if !extra.is_empty() { - for pattern in extra.split(',') { - let trimmed = pattern.trim().to_string(); - if !trimmed.is_empty() { - excludes.push(trimmed); - } - } - } - - Ok(excludes) -} - -pub fn confirm_conditionals_interactive( - detected: Vec, -) -> Result> { - eprintln!( - "\n{} Conditional files {}", - style("──").dim(), - style("────────────────────────────────────").dim() - ); - eprintln!(" These look optional. 
Make them conditional?"); - - let mut confirmed = Vec::new(); - for cond in detected { - let prompt = format!(" {} → {}", cond.pattern, cond.variable); - let include = Confirm::new(&prompt) - .with_default(false) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if include { - confirmed.push(cond); - } - } - - Ok(confirmed) -} - -pub fn resolve_candidates_yes( - candidates: &[DetectedCandidate], - explicit_vars: &[(String, String)], -) -> Vec<(String, String)> { - eprintln!( - "\n{} Auto-detected variables {}", - style("──").dim(), - style("──────────────────────────────────").dim() - ); - - // Group candidates by suggested_name - let mut groups: BTreeMap> = BTreeMap::new(); - for c in candidates { - groups.entry(c.suggested_name.clone()).or_default().push(c); - } - - let mut result = Vec::new(); - let mut skipped_freq = 0; - - for (name, mut group) in groups { - // Skip names already covered by explicit --var - if explicit_vars.iter().any(|(n, _)| n == &name) { - eprintln!( - " {} {} (explicit --var, skipping auto-detect)", - style("·").dim(), - style(&name).dim() - ); - continue; - } - - // For name collisions, pick highest confidence - group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); - let winner = group[0]; - - // Skip frequency-analysis candidates in -y mode — too noisy for auto-accept - if winner.tier == ConfidenceTier::FrequencyAnalysis { - skipped_freq += 1; - continue; - } - - eprintln!( - " {} {} = {:?} ({:.0}% confidence, {})", - style("✓").green(), - style(&winner.suggested_name).bold(), - winner.value, - winner.confidence * 100.0, - winner.tier - ); - eprintln!(" {}", style(&winner.reason).dim()); - - if group.len() > 1 { - eprintln!( - " {} {} other candidates for this name (picked highest confidence)", - style("⚠").yellow(), - group.len() - 1 - ); - } - - result.push((winner.suggested_name.clone(), winner.value.clone())); - } - - if skipped_freq > 0 { - eprintln!( - " {} {} frequency-detected candidate(s) skipped (use 
interactive mode to review)", - style("·").dim(), - skipped_freq - ); - } - - result -} - -pub fn confirm_auto_detected_interactive( - candidates: Vec, - explicit_vars: &[(String, String)], -) -> Result> { - eprintln!( - "\n{} Auto-detected variables {}", - style("──").dim(), - style("──────────────────────────────────").dim() - ); - - // Group candidates by suggested_name - let mut groups: BTreeMap> = BTreeMap::new(); - for c in candidates { - groups.entry(c.suggested_name.clone()).or_default().push(c); - } - - let mut accepted = Vec::new(); - - for (name, mut group) in groups { - // Skip names already covered by explicit --var - if explicit_vars.iter().any(|(n, _)| n == &name) { - eprintln!( - "\n {} {} (provided via --var, skipping)", - style("·").dim(), - style(&name).dim() - ); - continue; - } - - // Sort by confidence descending - group.sort_by(|a, b| b.confidence.total_cmp(&a.confidence)); - - if group.len() == 1 { - // Single candidate — simple confirm - let candidate = &group[0]; - eprintln!( - "\n {} = {:?} ({:.0}% confidence, {})", - style(&candidate.suggested_name).bold(), - candidate.value, - candidate.confidence * 100.0, - candidate.tier - ); - eprintln!(" {}", style(&candidate.reason).dim()); - if candidate.total_occurrences > 0 { - eprintln!( - " {} occurrences across {} files", - candidate.total_occurrences, candidate.file_count - ); - } - - let accept = Confirm::new(&format!("Accept \"{}\"?", candidate.suggested_name)) - .with_default(true) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if accept { - accepted.push((candidate.suggested_name.clone(), candidate.value.clone())); - } - } else { - // Name collision — show selection prompt - eprintln!( - "\n {} Multiple candidates for {}:", - style("⚠").yellow(), - style(&name).bold() - ); - - let mut options: Vec = group - .iter() - .map(|c| { - format!( - "{:?} ({:.0}% confidence, {})", - c.value, - c.confidence * 100.0, - c.tier - ) - }) - .collect(); - 
options.push("Skip".to_string()); - - let selection = Select::new(&format!("Which value for \"{}\"?", name), options) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if selection != "Skip" { - // Find the matching candidate - if let Some(chosen) = group.iter().find(|c| { - format!( - "{:?} ({:.0}% confidence, {})", - c.value, - c.confidence * 100.0, - c.tier - ) == selection - }) { - accepted.push((chosen.suggested_name.clone(), chosen.value.clone())); - } - } - } - } - - Ok(accepted) -} - -pub fn confirm_files_interactive(files: &[PlannedExtractFile], dropped_count: usize) -> Result<()> { - let templated: Vec<_> = files.iter().filter(|f| f.has_replacements()).collect(); - let boilerplate: Vec<_> = files - .iter() - .filter(|f| !f.has_replacements() && !f.stubbed && !f.is_binary()) - .collect(); - let stubbed: Vec<_> = files.iter().filter(|f| f.stubbed).collect(); - let binary_count = files.iter().filter(|f| f.is_binary()).count(); - - eprintln!( - "\n{} File plan {}", - style("──").dim(), - style("──────────────────────────────────────────").dim() - ); - - // Templated files - eprintln!( - "\n {} ({} files, {} suffix):", - style("Templated").bold(), - templated.len(), - DEFAULT_TEMPLATES_SUFFIX - ); - for file in &templated { - eprintln!( - " {:<50} {} replacements", - file.template_path.display(), - file.replacement_count() - ); - } - - // Boilerplate files - eprintln!( - "\n {} (copied in full, {} files{}):", - style("Boilerplate").bold(), - boilerplate.len() + binary_count, - if binary_count > 0 { - format!(", {} binary", binary_count) - } else { - String::new() - } - ); - for file in &boilerplate { - eprintln!(" {}", file.template_path.display()); - } - - // Stubbed files - if !stubbed.is_empty() { - eprintln!( - "\n {} (structure only, {} files):", - style("Stubbed").bold(), - stubbed.len() - ); - for file in &stubbed { - eprintln!(" {}", file.template_path.display()); - } - } - - // Dropped files - if dropped_count > 0 { - eprintln!("\n {} 
({} files):", style("Dropped").bold(), dropped_count); - } - - let proceed = Confirm::new("Proceed?") - .with_default(true) - .prompt() - .map_err(|_| DicecutError::PromptCancelled)?; - - if !proceed { - return Err(DicecutError::PromptCancelled); - } - - Ok(()) -} diff --git a/src/extract/mod.rs b/src/extract/mod.rs index f9d1318..767d7ae 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -1,8 +1,5 @@ -pub mod auto_detect; -pub mod conditional; pub mod config_gen; pub mod exclude; -pub mod interactive; pub mod replace; pub mod scan; pub mod stub; @@ -16,24 +13,17 @@ use console::style; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use self::auto_detect::{auto_detect, count_occurrences}; -use self::conditional::{detect_conditional_files, patterns_for_variable}; use self::config_gen::{ - generate_config_toml, ComputedVariable, ConditionalEntry, ConfigGenOptions, PromptedVariable, + generate_config_toml, ComputedVariable, ConfigGenOptions, PromptedVariable, }; use self::exclude::{ all_default_excludes, detect_copy_without_render, is_copy_without_render, relevant_config_excludes, }; -use self::interactive::{ - confirm_auto_detected_interactive, confirm_conditionals_interactive, - confirm_excludes_interactive, confirm_files_interactive, confirm_variants_interactive, - resolve_candidates_yes, -}; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; -use self::scan::scan_project; +use self::scan::{count_occurrences, scan_project}; use self::stub::{classify_file, generate_stub, FileRole}; use self::variants::{ computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, @@ -101,7 +91,6 @@ pub struct ExtractionPlan { pub files: Vec, pub config_toml: String, pub variables: Vec, - pub conditional_entries: Vec, pub exclude_patterns: Vec, pub copy_without_render: Vec, pub dropped_count: usize, @@ -114,10 +103,7 @@ pub struct 
ExtractOptions { pub variables: Vec<(String, String)>, pub output_dir: Option, pub in_place: bool, - pub yes: bool, - pub min_confidence: f64, pub stub_depth: usize, - pub dry_run: bool, } /// Plan an extraction: scan the project, detect variants, build replacement rules. @@ -191,45 +177,11 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { } ); - // Phase 2.5: Auto-detect variables (always runs), merge with explicit --var entries - let variables = { - let explicit_vars = options.variables.clone(); - let detect_result = auto_detect(source_dir, &scan_result); - - // Filter candidates below min_confidence threshold - let candidates: Vec<_> = detect_result - .candidates - .into_iter() - .filter(|c| c.confidence >= options.min_confidence) - .collect(); - - if candidates.is_empty() && explicit_vars.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } - - // Resolve auto-detected candidates (merge with explicit vars) - let auto_vars = if candidates.is_empty() { - vec![] - } else if options.yes { - resolve_candidates_yes(&candidates, &explicit_vars) - } else { - confirm_auto_detected_interactive(candidates, &explicit_vars)? 
- }; - - // Merge: explicit vars first (pre-accepted), then auto-detected additions - let mut merged = explicit_vars; - for (name, value) in auto_vars { - if !merged.iter().any(|(n, _)| n == &name) { - merged.push((name, value)); - } - } - - if merged.is_empty() { - return Err(DicecutError::ExtractNoVariables); - } - - merged - }; + // Validate that at least one --var was provided + let variables = options.variables.clone(); + if variables.is_empty() { + return Err(DicecutError::ExtractNoVariables); + } // Phase 3: Generate variants and count occurrences let mut extract_variables = Vec::new(); @@ -251,43 +203,26 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { }); } - // Phase 4: Interactive variant confirmation - let confirmed_variables = if options.yes { - // Batch mode: auto-accept all found variants - extract_variables - .into_iter() - .map(|mut var| { - var.variants.retain(|v| { - var.occurrence_counts - .iter() - .any(|(name, _, hits)| name == v.name && *hits > 0) - || v.name == "verbatim" - }); - // Always keep at least the verbatim/canonical variant - if var.variants.is_empty() { - let all = generate_variants(&var.name, &var.value); - if let Some(first) = all.into_iter().next() { - var.variants.push(first); - } + // Phase 4: Auto-accept found variants (keep those with occurrences + verbatim) + let confirmed_variables: Vec = extract_variables + .into_iter() + .map(|mut var| { + var.variants.retain(|v| { + var.occurrence_counts + .iter() + .any(|(name, _, hits)| name == v.name && *hits > 0) + || v.name == "verbatim" + }); + // Always keep at least the verbatim/canonical variant + if var.variants.is_empty() { + let all = generate_variants(&var.name, &var.value); + if let Some(first) = all.into_iter().next() { + var.variants.push(first); } - var - }) - .collect() - } else { - confirm_variants_interactive(extract_variables)? 
- }; - - // Phase 6: Detect conditional files - let detected_conditionals = if options.yes { - vec![] // Batch mode: no conditional files - } else { - let detected = detect_conditional_files(source_dir); - if detected.is_empty() { - vec![] - } else { - confirm_conditionals_interactive(detected)? - } - }; + } + var + }) + .collect(); // Phase 7: Build replacement rules let mut rules = Vec::new(); @@ -309,7 +244,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { .iter() .map(|f| f.relative_path.clone()) .collect(); - let copy_without_render = detect_copy_without_render(source_dir, &file_paths); + let copy_without_render = detect_copy_without_render(&file_paths); // Phase 9: Apply replacements to files let mut planned_files = Vec::new(); @@ -397,34 +332,9 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { .iter() .map(|f| f.template_path.clone()) .collect(); - let mut config_excludes = relevant_config_excludes(&template_paths); - - if !options.yes { - config_excludes = confirm_excludes_interactive(config_excludes)?; - } - - // Phase 10: Interactive file confirmation - if !options.yes { - confirm_files_interactive(&planned_files, dropped_count)?; - } - - // Phase 11: Build conditional entries - let conditional_entries: Vec = detected_conditionals - .iter() - .map(|d| { - let patterns = patterns_for_variable(&d.variable) - .into_iter() - .map(|p| p.to_string()) - .collect(); - ConditionalEntry { - patterns, - variable: d.variable.clone(), - description: d.description.clone(), - } - }) - .collect(); + let config_excludes = relevant_config_excludes(&template_paths); - // Phase 12: Generate config + // Generate config let canonical_seps: HashMap = confirmed_variables .iter() .map(|v| (v.name.clone(), detect_separator(&v.value))) @@ -473,7 +383,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { computed_variables: computed_vars, exclude_patterns: config_excludes.clone(), copy_without_render: copy_without_render.clone(), - 
conditional_entries: conditional_entries.clone(), + conditional_entries: vec![], }); Ok(ExtractionPlan { @@ -481,7 +391,6 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { files: planned_files, config_toml, variables: confirmed_variables, - conditional_entries, exclude_patterns: config_excludes, copy_without_render, dropped_count, @@ -490,7 +399,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { } /// Execute an extraction plan: write files and config to the output directory. -pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> { +pub fn execute_extraction(plan: &ExtractionPlan) -> Result<()> { let output_dir = &plan.output_dir; let template_dir = output_dir.join("template"); @@ -589,12 +498,6 @@ pub fn execute_extraction(plan: &ExtractionPlan, _in_place: bool) -> Result<()> " {} files templated, {} files copied, {} files stubbed, {} files dropped", rendered_count, copied_count, stubbed_count, plan.dropped_count ); - if !plan.conditional_entries.is_empty() { - eprintln!( - " {} conditional patterns added", - plan.conditional_entries.len() - ); - } eprintln!(" Review diecut.toml to fine-tune"); Ok(()) diff --git a/src/extract/replace.rs b/src/extract/replace.rs index 95e36ad..5cbe93c 100644 --- a/src/extract/replace.rs +++ b/src/extract/replace.rs @@ -133,18 +133,17 @@ pub fn apply_path_replacements(path: &Path, rules: &[ReplacementRule]) -> PathBu components.iter().collect() } -/// Count occurrences of a literal in a string. 
-pub fn count_occurrences(content: &str, literal: &str) -> usize { - if literal.is_empty() { - return 0; - } - content.matches(literal).count() -} - #[cfg(test)] mod tests { use super::*; + fn count_occurrences(content: &str, literal: &str) -> usize { + if literal.is_empty() { + return 0; + } + content.matches(literal).count() + } + fn make_rule(literal: &str, replacement: &str) -> ReplacementRule { ReplacementRule { literal: literal.to_string(), diff --git a/src/extract/scan.rs b/src/extract/scan.rs index 544aa87..088d6dd 100644 --- a/src/extract/scan.rs +++ b/src/extract/scan.rs @@ -98,6 +98,36 @@ pub fn scan_project(project_dir: &Path, excludes: &[String]) -> crate::error::Re }) } +/// Count how many files contain `value` and the total number of hits across all files. +pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize) { + let mut file_count = 0; + let mut total = 0; + + for file in &scan_result.files { + let mut counted_file = false; + + if let Some(ref content) = file.content { + let hits = content.matches(value).count(); + if hits > 0 { + file_count += 1; + counted_file = true; + total += hits; + } + } + + let path_str = file.relative_path.to_string_lossy(); + let path_hits = path_str.matches(value).count(); + if path_hits > 0 { + total += path_hits; + if !counted_file { + file_count += 1; + } + } + } + + (file_count, total) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/main.rs b/src/main.rs index 11dec94..0e6ef6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,19 +24,8 @@ fn main() -> miette::Result<()> { vars, output, in_place, - yes, - min_confidence, stub_depth, dry_run, - } => commands::extract::run( - source, - vars, - output, - in_place, - yes, - min_confidence, - stub_depth, - dry_run, - ), + } => commands::extract::run(source, vars, output, in_place, stub_depth, dry_run), } } diff --git a/tests/integration.rs b/tests/integration.rs index bee61fc..f56835d 100644 --- a/tests/integration.rs +++ 
b/tests/integration.rs @@ -654,14 +654,11 @@ fn test_extract_batch_basic() { ], output_dir: Some(output_path.clone()), in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: false, }; let plan = plan_extraction(&options).unwrap(); - execute_extraction(&plan, false).unwrap(); + execute_extraction(&plan).unwrap(); // Verify diecut.toml was created assert!(output_path.join("diecut.toml").exists()); @@ -699,10 +696,7 @@ fn test_extract_detects_case_variants() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: false, }; let plan = plan_extraction(&options).unwrap(); @@ -731,7 +725,7 @@ fn test_extract_detects_case_variants() { "should detect screaming_snake variant" ); - execute_extraction(&plan, false).unwrap(); + execute_extraction(&plan).unwrap(); // The config should have computed variables for variants let config = std::fs::read_to_string(output_path.join("diecut.toml")).unwrap(); @@ -754,10 +748,7 @@ fn test_extract_dry_run_writes_nothing() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: true, }; let plan = plan_extraction(&options).unwrap(); @@ -784,10 +775,7 @@ fn test_extract_rejects_already_template() { variables: vec![("name".to_string(), "val".to_string())], output_dir: None, in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: false, }; let result = plan_extraction(&options); @@ -799,17 +787,13 @@ fn test_extract_rejects_no_variables() { let project = tempfile::tempdir().unwrap(); std::fs::write(project.path().join("hello.txt"), "hello").unwrap(); - // With min_confidence=1.0, no auto-detected candidates can pass, and no explicit - // vars are given, so extraction should fail with ExtractNoVariables + // No --var provided → should 
fail with ExtractNoVariables let options = ExtractOptions { source_dir: project.path().to_path_buf(), variables: vec![], output_dir: None, in_place: false, - yes: true, - min_confidence: 1.0, stub_depth: 2, - dry_run: false, }; let result = plan_extraction(&options); @@ -830,10 +814,7 @@ fn test_extract_templates_path_components() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: false, }; let plan = plan_extraction(&options).unwrap(); @@ -849,7 +830,7 @@ fn test_extract_templates_path_components() { "should template path components containing the variable value" ); - execute_extraction(&plan, false).unwrap(); + execute_extraction(&plan).unwrap(); } #[test] @@ -894,14 +875,11 @@ fn test_extract_round_trip() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(extracted_path.clone()), in_place: false, - yes: true, - min_confidence: 0.5, stub_depth: 2, - dry_run: false, }; let plan = plan_extraction(&options).unwrap(); - execute_extraction(&plan, false).unwrap(); + execute_extraction(&plan).unwrap(); // Verify the extracted template has the key structure assert!(extracted_path.join("diecut.toml").exists()); @@ -931,177 +909,3 @@ fn test_extract_round_trip() { } } } - -// ── Auto-detect tests ──────────────────────────────────────────────────── - -#[test] -fn test_extract_auto_yes() { - let project = tempfile::tempdir().unwrap(); - let project_dir = project.path().join("data-pipeline"); - std::fs::create_dir(&project_dir).unwrap(); - std::fs::write( - project_dir.join("Cargo.toml"), - "[package]\nname = \"data-pipeline\"\nversion = \"0.1.0\"\n", - ) - .unwrap(); - std::fs::write( - project_dir.join("README.md"), - "# data-pipeline\nWelcome to data-pipeline\n", - ) - .unwrap(); - std::fs::create_dir(project_dir.join("src")).unwrap(); - std::fs::write( - project_dir.join("src/main.rs"), - "fn main() 
{\n println!(\"data-pipeline starting\");\n}\n", - ) - .unwrap(); - - let output = tempfile::tempdir().unwrap(); - let output_path = output.path().join("auto-extracted"); - - let options = ExtractOptions { - source_dir: project_dir.clone(), - variables: vec![], - output_dir: Some(output_path.clone()), - in_place: false, - yes: true, - min_confidence: 0.5, - stub_depth: 2, - dry_run: false, - }; - - let plan = plan_extraction(&options).unwrap(); - execute_extraction(&plan, false).unwrap(); - - let project_var = plan.variables.iter().find(|v| v.name == "project_name"); - assert!( - project_var.is_some(), - "should auto-detect project_name, got vars: {:?}", - plan.variables.iter().map(|v| &v.name).collect::>() - ); - assert_eq!(project_var.unwrap().value, "data-pipeline"); - - assert!(output_path.join("diecut.toml").exists()); - let config = std::fs::read_to_string(output_path.join("diecut.toml")).unwrap(); - assert!(config.contains("project_name")); -} - -#[test] -fn test_extract_auto_explicit_vars_merged() { - let project = tempfile::tempdir().unwrap(); - let project_dir = project.path().join("my-service"); - std::fs::create_dir(&project_dir).unwrap(); - std::fs::write( - project_dir.join("Cargo.toml"), - "[package]\nname = \"my-service\"\n", - ) - .unwrap(); - std::fs::write(project_dir.join("README.md"), "# my-service\n").unwrap(); - - let output = tempfile::tempdir().unwrap(); - let output_path = output.path().join("explicit-extracted"); - - let options = ExtractOptions { - source_dir: project_dir.clone(), - variables: vec![("app_name".to_string(), "my-service".to_string())], - output_dir: Some(output_path.clone()), - in_place: false, - yes: true, - min_confidence: 0.5, - stub_depth: 2, - dry_run: false, - }; - - let plan = plan_extraction(&options).unwrap(); - - let has_app_name = plan.variables.iter().any(|v| v.name == "app_name"); - assert!(has_app_name, "should use explicit var app_name"); - // Auto-detect still runs and merges additional candidates - // 
(project_name may or may not appear depending on dedup with app_name's value) -} - -#[test] -fn test_extract_auto_frequency_fallback() { - let project = tempfile::tempdir().unwrap(); - let project_dir = project.path().join("cool-widget"); - std::fs::create_dir(&project_dir).unwrap(); - std::fs::write( - project_dir.join("main.txt"), - "cool-widget is great\ncool_widget module\nCoolWidget class\n", - ) - .unwrap(); - std::fs::write( - project_dir.join("config.txt"), - "name = cool-widget\nmodule = cool_widget\n", - ) - .unwrap(); - std::fs::write( - project_dir.join("test.txt"), - "testing cool-widget\nCOOL_WIDGET env\n", - ) - .unwrap(); - - let output = tempfile::tempdir().unwrap(); - let output_path = output.path().join("freq-extracted"); - - let options = ExtractOptions { - source_dir: project_dir.clone(), - variables: vec![], - output_dir: Some(output_path.clone()), - in_place: false, - yes: true, - min_confidence: 0.5, - stub_depth: 2, - dry_run: false, - }; - - let plan = plan_extraction(&options).unwrap(); - - let has_relevant_var = plan - .variables - .iter() - .any(|v| v.value.contains("cool") || v.name.contains("cool")); - assert!( - has_relevant_var, - "should detect cool-widget related variable, got: {:?}", - plan.variables - .iter() - .map(|v| format!("{}={}", v.name, v.value)) - .collect::>() - ); -} - -#[test] -fn test_extract_min_confidence_filters() { - let project = tempfile::tempdir().unwrap(); - let project_dir = project.path().join("tiny-app"); - std::fs::create_dir(&project_dir).unwrap(); - std::fs::write( - project_dir.join("Cargo.toml"), - "[package]\nname = \"tiny-app\"\nversion = \"0.1.0\"\n", - ) - .unwrap(); - std::fs::write( - project_dir.join("README.md"), - "# tiny-app\nWelcome to tiny-app\n", - ) - .unwrap(); - - // With a very high threshold, all auto-detected candidates should be filtered out - let options = ExtractOptions { - source_dir: project_dir.clone(), - variables: vec![], - output_dir: None, - in_place: false, - yes: true, 
- min_confidence: 0.99, - stub_depth: 2, - dry_run: true, - }; - - let result = plan_extraction(&options); - assert!( - result.is_err(), - "high min_confidence should filter out all candidates" - ); -} From 094222a71726d5cb71ed27ea7e3077bb6ce269f3 Mon Sep 17 00:00:00 2001 From: rroskam Date: Wed, 4 Mar 2026 23:19:48 -0500 Subject: [PATCH 27/29] refactor(extract): trim to verbatim-only for PR review Defer variants, stub/drop, copy-without-render, camelcase filter, and config_gen module to a follow-up PR. Inline minimal config generation. Remove --stub-depth flag. 991 lines changed (down from 2,539). --- Cargo.toml | 2 +- src/cli.rs | 4 - src/commands/extract.rs | 30 +--- src/extract/config_gen.rs | 206 ------------------------- src/extract/exclude.rs | 185 +--------------------- src/extract/mod.rs | 312 ++++++++------------------------------ src/extract/replace.rs | 116 -------------- src/extract/scan.rs | 83 ---------- src/extract/stub.rs | 222 --------------------------- src/extract/variants.rs | 309 ------------------------------------- src/main.rs | 3 +- src/prompt/engine.rs | 2 +- src/render/context.rs | 75 +-------- src/render/mod.rs | 2 +- src/render/walker.rs | 4 +- tests/integration.rs | 138 ----------------- 16 files changed, 76 insertions(+), 1617 deletions(-) delete mode 100644 src/extract/config_gen.rs delete mode 100644 src/extract/stub.rs delete mode 100644 src/extract/variants.rs diff --git a/Cargo.toml b/Cargo.toml index 09bd987..3a5902b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,7 @@ name = "diecut" version = "0.3.4" edition = "2021" license = "MIT" -rust-version = "1.80" +rust-version = "1.75" description = "A single binary project template generator" [lib] diff --git a/src/cli.rs b/src/cli.rs index 92bc8e1..d87f2e8 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -67,10 +67,6 @@ pub enum Commands { #[arg(long)] in_place: bool, - /// Max path depth for stubbing content files (deeper files are dropped) - #[arg(long, default_value = "2")] - 
stub_depth: usize, - /// Show what would be extracted without writing files #[arg(long)] dry_run: bool, diff --git a/src/commands/extract.rs b/src/commands/extract.rs index b7f7e9f..37a31be 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -11,7 +11,6 @@ pub fn run( vars: Vec, output: Option, in_place: bool, - stub_depth: usize, dry_run: bool, ) -> Result<()> { let variables = parse_vars(&vars)?; @@ -21,7 +20,6 @@ pub fn run( variables, output_dir: output.map(PathBuf::from), in_place, - stub_depth, }; let plan = plan_extraction(&options)?; @@ -61,12 +59,11 @@ fn print_dry_run(plan: &diecut::extract::ExtractionPlan) { ); let templated: Vec<_> = plan.files.iter().filter(|f| f.has_replacements()).collect(); - let boilerplate: Vec<_> = plan + let copied: Vec<_> = plan .files .iter() - .filter(|f| !f.has_replacements() && !f.stubbed) + .filter(|f| !f.has_replacements()) .collect(); - let stubbed: Vec<_> = plan.files.iter().filter(|f| f.stubbed).collect(); eprintln!("\nTemplated files ({}):", templated.len()); for file in &templated { @@ -77,33 +74,14 @@ fn print_dry_run(plan: &diecut::extract::ExtractionPlan) { ); } - eprintln!("\nBoilerplate ({}):", boilerplate.len()); - for file in &boilerplate { + eprintln!("\nCopied ({}):", copied.len()); + for file in &copied { eprintln!(" {}", file.template_path.display()); } - if !stubbed.is_empty() { - eprintln!("\nStubbed ({}):", stubbed.len()); - for file in &stubbed { - eprintln!(" {}", file.template_path.display()); - } - } - - if plan.dropped_count > 0 { - eprintln!("\nDropped ({}):", plan.dropped_count); - for path in &plan.dropped_paths { - eprintln!(" {}", path.display()); - } - } - eprintln!("\nVariables:"); for var in &plan.variables { eprintln!(" {} = {:?}", var.name, var.value); - for variant in &var.variants { - if variant.name != "verbatim" { - eprintln!(" {} → {}", variant.name, variant.literal); - } - } } eprintln!("\nGenerated diecut.toml:"); diff --git a/src/extract/config_gen.rs 
b/src/extract/config_gen.rs deleted file mode 100644 index 91dea6c..0000000 --- a/src/extract/config_gen.rs +++ /dev/null @@ -1,206 +0,0 @@ -/// A prompted variable entry for the generated config. -pub struct PromptedVariable { - pub name: String, - pub default_value: String, - pub prompt: String, -} - -/// A computed variable entry for the generated config. -pub struct ComputedVariable { - pub name: String, - pub expression: String, -} - -/// A conditional file entry for the generated config. -#[derive(Debug, Clone)] -pub struct ConditionalEntry { - pub patterns: Vec, - pub variable: String, - pub description: String, -} - -/// Options for generating the diecut.toml config file. -pub struct ConfigGenOptions { - pub template_name: String, - pub prompted_variables: Vec, - pub computed_variables: Vec, - pub exclude_patterns: Vec, - pub copy_without_render: Vec, - pub conditional_entries: Vec, -} - -/// Generate a diecut.toml config string with comments for readability. -/// -/// Uses manual TOML string building because the `toml` crate can't serialize comments, -/// and users need to read and edit this file. 
-pub fn generate_config_toml(options: &ConfigGenOptions) -> String { - let mut out = String::new(); - - // [template] section - out.push_str("[template]\n"); - out.push_str(&format!( - "name = {}\n", - escape_toml_string(&options.template_name) - )); - out.push_str("version = \"1.0.0\"\n"); - out.push_str("# description = \"A project template\"\n"); - out.push('\n'); - - // [variables] section — prompted variables first - if !options.prompted_variables.is_empty() || !options.computed_variables.is_empty() { - out.push_str("# ── Variables ──────────────────────────────────────────\n"); - out.push_str("# Prompted variables are asked during `diecut new`.\n"); - out.push_str("# Computed variables are auto-derived and never prompted.\n"); - out.push('\n'); - } - - for var in &options.prompted_variables { - out.push_str(&format!("[variables.{}]\n", var.name)); - out.push_str("type = \"string\"\n"); - out.push_str(&format!("prompt = {}\n", escape_toml_string(&var.prompt))); - out.push_str(&format!( - "default = {}\n", - escape_toml_string(&var.default_value) - )); - out.push('\n'); - } - - // Conditional file boolean variables - for entry in &options.conditional_entries { - out.push_str(&format!("# {} ({})\n", entry.variable, entry.description)); - out.push_str(&format!("[variables.{}]\n", entry.variable)); - out.push_str("type = \"bool\"\n"); - out.push_str(&format!( - "prompt = {}\n", - escape_toml_string(&format!("Include {}?", entry.description.to_lowercase())) - )); - out.push_str("default = true\n"); - out.push('\n'); - } - - // Computed variables - for var in &options.computed_variables { - out.push_str(&format!("[variables.{}]\n", var.name)); - out.push_str("type = \"string\"\n"); - out.push_str(&format!( - "computed = {}\n", - escape_toml_string(&format!("{{{{ {} }}}}", var.expression)) - )); - out.push('\n'); - } - - // [files] section - out.push_str("# ── Files ─────────────────────────────────────────────\n"); - out.push_str("[files]\n"); - - if 
!options.exclude_patterns.is_empty() { - out.push_str("exclude = [\n"); - for pattern in &options.exclude_patterns { - out.push_str(&format!(" {},\n", escape_toml_string(pattern))); - } - out.push_str("]\n"); - } - - if !options.copy_without_render.is_empty() { - out.push_str("copy_without_render = [\n"); - for pattern in &options.copy_without_render { - out.push_str(&format!(" {},\n", escape_toml_string(pattern))); - } - out.push_str("]\n"); - } - - out.push('\n'); - - // [[files.conditional]] entries - for entry in &options.conditional_entries { - for pattern in &entry.patterns { - out.push_str(&format!("# {}\n", entry.description)); - out.push_str("[[files.conditional]]\n"); - out.push_str(&format!("pattern = {}\n", escape_toml_string(pattern))); - out.push_str(&format!("when = {}\n", escape_toml_string(&entry.variable))); - out.push('\n'); - } - } - - // [hooks] section - out.push_str("# ── Hooks ─────────────────────────────────────────────\n"); - out.push_str("# [hooks]\n"); - out.push_str("# post_create = \"echo 'Project created!'\"\n"); - - out -} - -/// Escape a string for TOML output. 
-fn escape_toml_string(s: &str) -> String { - toml::Value::String(s.to_string()).to_string() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_generate_config_basic() { - let options = ConfigGenOptions { - template_name: "my-template".to_string(), - prompted_variables: vec![PromptedVariable { - name: "project_name".to_string(), - default_value: "my-app".to_string(), - prompt: "Project name".to_string(), - }], - computed_variables: vec![ComputedVariable { - name: "project_name_snake".to_string(), - expression: "project_name | replace(from=\"-\", to=\"_\")".to_string(), - }], - exclude_patterns: vec![".git/".to_string()], - copy_without_render: vec!["*.png".to_string()], - conditional_entries: vec![], - }; - - let toml = generate_config_toml(&options); - - assert!(toml.contains("[template]")); - assert!(toml.contains("name = \"my-template\"")); - assert!(toml.contains("[variables.project_name]")); - assert!(toml.contains("type = \"string\"")); - assert!(toml.contains("[variables.project_name_snake]")); - assert!(toml.contains("computed =")); - assert!(toml.contains("[files]")); - assert!(toml.contains("\".git/\"")); - assert!(toml.contains("\"*.png\"")); - } - - #[test] - fn test_generate_config_with_conditionals() { - let options = ConfigGenOptions { - template_name: "test".to_string(), - prompted_variables: vec![], - computed_variables: vec![], - exclude_patterns: vec![], - copy_without_render: vec![], - conditional_entries: vec![ConditionalEntry { - patterns: vec![".github/**".to_string()], - variable: "use_github_actions".to_string(), - description: "GitHub Actions CI".to_string(), - }], - }; - - let toml = generate_config_toml(&options); - - assert!(toml.contains("[variables.use_github_actions]")); - assert!(toml.contains("type = \"bool\"")); - assert!(toml.contains("default = true")); - assert!(toml.contains("[[files.conditional]]")); - assert!(toml.contains("pattern = \".github/**\"")); - assert!(toml.contains("when = 
\"use_github_actions\"")); - } - - #[test] - fn test_escape_toml_string() { - assert_eq!(escape_toml_string("hello"), "\"hello\""); - // toml crate uses multi-line strings for values containing quotes - let escaped = escape_toml_string("it's \"fine\""); - assert!(escaped.contains("it's")); - assert!(escaped.contains("fine")); - } -} diff --git a/src/extract/exclude.rs b/src/extract/exclude.rs index 0f75060..2f6beff 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -30,36 +30,6 @@ const DEFAULT_EXCLUDES: &[&str] = &[ ".diecut-answers.toml", ]; -/// Patterns for files that should be copied without rendering (binary-like or problematic). -const DEFAULT_COPY_WITHOUT_RENDER: &[&str] = &[ - "*.png", - "*.jpg", - "*.jpeg", - "*.gif", - "*.ico", - "*.svg", - "*.webp", - "*.woff", - "*.woff2", - "*.ttf", - "*.eot", - "*.otf", - "*.zip", - "*.tar", - "*.gz", - "*.bz2", - "*.xz", - "*.pdf", - "*.lock", - "package-lock.json", - "yarn.lock", - "pnpm-lock.yaml", - "Cargo.lock", - "Gemfile.lock", - "poetry.lock", - "composer.lock", -]; - /// Return all default exclude patterns for use during scanning. /// /// All DEFAULT_EXCLUDES are always used during the scan phase because patterns @@ -68,73 +38,6 @@ pub fn all_default_excludes() -> Vec { DEFAULT_EXCLUDES.iter().map(|s| s.to_string()).collect() } -/// Return only the DEFAULT_EXCLUDES patterns that match at least one file in the -/// template output. These are the patterns worth writing to `diecut.toml`'s -/// `[files] exclude` — directory patterns like `.git/` or `node_modules/` that -/// were filtered during scan are omitted since those files never appear in the -/// template. 
-pub fn relevant_config_excludes(template_files: &[std::path::PathBuf]) -> Vec { - let all = all_default_excludes(); - all.into_iter() - .filter(|pattern| { - template_files - .iter() - .any(|f| should_exclude(f, std::slice::from_ref(pattern))) - }) - .collect() -} - -/// Detect which copy-without-render patterns are relevant based on files present. -pub fn detect_copy_without_render(files: &[std::path::PathBuf]) -> Vec { - let mut found = Vec::new(); - - for pattern in DEFAULT_COPY_WITHOUT_RENDER { - if pattern.starts_with('*') { - // Extension pattern — check if any file matches - let ext = pattern.trim_start_matches("*."); - if files.iter().any(|f| { - f.extension() - .map(|e| e.to_string_lossy().eq_ignore_ascii_case(ext)) - .unwrap_or(false) - }) { - found.push(pattern.to_string()); - } - } else { - // Exact filename — check if present - if files.iter().any(|f| { - f.file_name() - .map(|n| n.to_string_lossy() == *pattern) - .unwrap_or(false) - }) { - found.push(pattern.to_string()); - } - } - } - - found -} - -/// Check if a file should be copied without rendering (lock files, binary-like assets). -/// -/// These files are included in the template but should never have replacements -/// applied during extraction — they're copied verbatim. -pub fn is_copy_without_render(path: &Path) -> bool { - for pattern in DEFAULT_COPY_WITHOUT_RENDER { - if let Some(ext) = pattern.strip_prefix("*.") { - if let Some(file_ext) = path.extension() { - if file_ext.to_string_lossy().eq_ignore_ascii_case(ext) { - return true; - } - } - } else if let Some(file_name) = path.file_name() { - if file_name.to_string_lossy() == *pattern { - return true; - } - } - } - false -} - /// Check if a path should be excluded based on the exclude patterns. 
pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { let path_str = relative_path.to_string_lossy(); @@ -175,7 +78,7 @@ pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { #[cfg(test)] mod tests { use super::*; - use std::path::PathBuf; + use std::path::Path; #[test] fn test_should_exclude_git() { @@ -209,90 +112,4 @@ mod tests { assert!(!should_exclude(Path::new("src/main.rs"), &excludes)); assert!(!should_exclude(Path::new("README.md"), &excludes)); } - - #[test] - fn test_all_default_excludes() { - let found = all_default_excludes(); - // All DEFAULT_EXCLUDES are always included - assert!(found.iter().any(|e| e.contains(".git"))); - assert!(found.iter().any(|e| e == ".DS_Store")); - assert!(found.iter().any(|e| e == "*.pyc")); - assert!(found.iter().any(|e| e.contains("node_modules"))); - } - - #[test] - fn test_relevant_config_excludes_empty_when_no_matches() { - // Typical template files won't match any DEFAULT_EXCLUDES - let files = vec![ - PathBuf::from("src/main.rs"), - PathBuf::from("README.md"), - PathBuf::from("Cargo.toml"), - ]; - let relevant = relevant_config_excludes(&files); - assert!(relevant.is_empty()); - } - - #[test] - fn test_relevant_config_excludes_finds_matching_patterns() { - let files = vec![ - PathBuf::from("src/main.py"), - PathBuf::from("src/__pycache__/main.pyc"), - PathBuf::from(".DS_Store"), - ]; - let relevant = relevant_config_excludes(&files); - assert!(relevant.contains(&"*.pyc".to_string())); - assert!(relevant.contains(&".DS_Store".to_string())); - assert!(relevant.contains(&"__pycache__".to_string())); - // Directory excludes that don't match should not appear - assert!(!relevant.contains(&".git".to_string())); - assert!(!relevant.contains(&"node_modules".to_string())); - } - - #[test] - fn test_should_exclude_claude_worktrees() { - let excludes = all_default_excludes(); - assert!(should_exclude( - Path::new(".claude/worktrees/agent-abc/Cargo.toml"), - &excludes - )); - // 
.claude/settings.local.json should NOT be excluded - assert!(!should_exclude( - Path::new(".claude/settings.local.json"), - &excludes - )); - } - - #[test] - fn test_should_exclude_astro() { - let excludes = all_default_excludes(); - assert!(should_exclude( - Path::new("docs/.astro/data-store.json"), - &excludes - )); - assert!(should_exclude(Path::new(".astro/settings.json"), &excludes)); - } - - #[test] - fn test_is_copy_without_render() { - assert!(is_copy_without_render(Path::new("Cargo.lock"))); - assert!(is_copy_without_render(Path::new("pnpm-lock.yaml"))); - assert!(is_copy_without_render(Path::new("package-lock.json"))); - assert!(is_copy_without_render(Path::new("logo.png"))); - assert!(is_copy_without_render(Path::new("deep/nested/file.lock"))); - assert!(!is_copy_without_render(Path::new("src/main.rs"))); - assert!(!is_copy_without_render(Path::new("README.md"))); - } - - #[test] - fn test_detect_copy_without_render() { - let files = vec![ - PathBuf::from("logo.png"), - PathBuf::from("font.woff2"), - PathBuf::from("README.md"), - ]; - let found = detect_copy_without_render(&files); - assert!(found.contains(&"*.png".to_string())); - assert!(found.contains(&"*.woff2".to_string())); - assert!(!found.contains(&"*.jpg".to_string())); - } } diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 767d7ae..29f7cd5 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -1,11 +1,7 @@ -pub mod config_gen; pub mod exclude; pub mod replace; pub mod scan; -pub mod stub; -pub mod variants; -use std::collections::HashMap; use std::path::{Path, PathBuf}; use console::style; @@ -13,30 +9,17 @@ use console::style; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use self::config_gen::{ - generate_config_toml, ComputedVariable, ConfigGenOptions, PromptedVariable, -}; -use self::exclude::{ - all_default_excludes, detect_copy_without_render, is_copy_without_render, - relevant_config_excludes, -}; +use 
self::exclude::all_default_excludes; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; -use self::scan::{count_occurrences, scan_project}; -use self::stub::{classify_file, generate_stub, FileRole}; -use self::variants::{ - computed_expression, detect_separator, generate_variants, is_canonical_variant, CaseVariant, -}; +use self::scan::scan_project; -/// A variable with its value and confirmed case variants. +/// A variable with its value. #[derive(Debug, Clone)] pub struct ExtractVariable { pub name: String, pub value: String, - pub variants: Vec, - /// Per-variant occurrence counts: (variant_name, file_count, total_hits). - pub occurrence_counts: Vec<(String, usize, usize)>, } /// The content of an extracted template file. @@ -58,8 +41,6 @@ pub struct PlannedExtractFile { pub template_path: PathBuf, /// The file content (text with replacements, or binary bytes). pub content: ExtractedContent, - /// Whether this file was stubbed (content replaced with a minimal placeholder). - pub stubbed: bool, } impl PlannedExtractFile { @@ -91,10 +72,6 @@ pub struct ExtractionPlan { pub files: Vec, pub config_toml: String, pub variables: Vec, - pub exclude_patterns: Vec, - pub copy_without_render: Vec, - pub dropped_count: usize, - pub dropped_paths: Vec, } /// Options for the extraction process. @@ -103,10 +80,9 @@ pub struct ExtractOptions { pub variables: Vec<(String, String)>, pub output_dir: Option, pub in_place: bool, - pub stub_depth: usize, } -/// Plan an extraction: scan the project, detect variants, build replacement rules. +/// Plan an extraction: scan the project, build replacement rules, apply replacements. pub fn plan_extraction(options: &ExtractOptions) -> Result { let source_dir = &options.source_dir; @@ -145,36 +121,18 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { }); } - // Phase 1: All default excludes for scanning (safety — never walks into .git/, node_modules/, etc.) 
+ // Scan project let scan_excludes = all_default_excludes(); - - // Phase 2: Scan project eprintln!( "\n{}", style(format!("Scanning {}...", source_dir.display())).bold() ); - let mut scan_result = scan_project(source_dir, &scan_excludes)?; - - // Drop non-boilerplate files deeper than stub_depth before auto-detect sees them. - // This prevents frequency analysis from detecting variables that only appear in - // files that would be dropped anyway. - let pre_filter_count = scan_result.files.len(); - scan_result.files.retain(|f| { - let depth = f.relative_path.components().count(); - depth <= options.stub_depth - || classify_file(&f.relative_path, options.stub_depth) == FileRole::Boilerplate - }); - let depth_dropped = pre_filter_count - scan_result.files.len(); + let scan_result = scan_project(source_dir, &scan_excludes)?; eprintln!( - " {} files found, {} excluded{}", + " {} files found, {} excluded", scan_result.files.len(), scan_result.excluded_count, - if depth_dropped > 0 { - format!(", {} too deep", depth_dropped) - } else { - String::new() - } ); // Validate that at least one --var was provided @@ -183,73 +141,29 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { return Err(DicecutError::ExtractNoVariables); } - // Phase 3: Generate variants and count occurrences - let mut extract_variables = Vec::new(); - - for (var_name, var_value) in &variables { - let all_variants = generate_variants(var_name, var_value); - - let mut occurrence_counts = Vec::new(); - for variant in &all_variants { - let (file_count, total_hits) = count_occurrences(&variant.literal, &scan_result); - occurrence_counts.push((variant.name.to_string(), file_count, total_hits)); - } - - extract_variables.push(ExtractVariable { - name: var_name.clone(), - value: var_value.clone(), - variants: all_variants, - occurrence_counts, - }); - } - - // Phase 4: Auto-accept found variants (keep those with occurrences + verbatim) - let confirmed_variables: Vec = extract_variables - 
.into_iter() - .map(|mut var| { - var.variants.retain(|v| { - var.occurrence_counts - .iter() - .any(|(name, _, hits)| name == v.name && *hits > 0) - || v.name == "verbatim" - }); - // Always keep at least the verbatim/canonical variant - if var.variants.is_empty() { - let all = generate_variants(&var.name, &var.value); - if let Some(first) = all.into_iter().next() { - var.variants.push(first); - } - } - var + // Build extract variables (verbatim only) + let extract_variables: Vec = variables + .iter() + .map(|(name, value)| ExtractVariable { + name: name.clone(), + value: value.clone(), }) .collect(); - // Phase 7: Build replacement rules - let mut rules = Vec::new(); - for var in &confirmed_variables { - for variant in &var.variants { - rules.push(ReplacementRule { - literal: variant.literal.clone(), - replacement: variant.tera_expr.clone(), - variable: var.name.clone(), - variant: variant.name.to_string(), - }); - } - } - build_replacement_rules(&mut rules); - - // Phase 8: Detect copy_without_render patterns - let file_paths: Vec = scan_result - .files + // Build replacement rules — one rule per variable, verbatim only + let mut rules: Vec = extract_variables .iter() - .map(|f| f.relative_path.clone()) + .map(|var| ReplacementRule { + literal: var.value.clone(), + replacement: format!("{{{{ {} }}}}", var.name), + variable: var.name.clone(), + variant: "verbatim".to_string(), + }) .collect(); - let copy_without_render = detect_copy_without_render(&file_paths); + build_replacement_rules(&mut rules); - // Phase 9: Apply replacements to files + // Apply replacements to files let mut planned_files = Vec::new(); - let mut dropped_count = depth_dropped; - let mut dropped_paths = Vec::new(); for file in &scan_result.files { let template_path = apply_path_replacements(&file.relative_path, &rules); @@ -263,22 +177,8 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { planned_files.push(PlannedExtractFile { template_path, content: 
ExtractedContent::Binary(binary_content), - stubbed: false, }); } else if let Some(ref content) = file.content { - // Lock files and other copy-without-render files: skip replacement - if is_copy_without_render(&file.relative_path) { - planned_files.push(PlannedExtractFile { - template_path, - content: ExtractedContent::Text { - content: content.clone(), - replacement_count: 0, - }, - stubbed: false, - }); - continue; - } - let (replaced, count) = apply_replacements(content, &rules); if count > 0 { @@ -291,110 +191,33 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { content: replaced, replacement_count: count, }, - stubbed: false, }); } else { - // No replacements — classify as boilerplate, content, or dropped - match classify_file(&file.relative_path, options.stub_depth) { - FileRole::Boilerplate => { - planned_files.push(PlannedExtractFile { - template_path, - content: ExtractedContent::Text { - content: replaced, - replacement_count: 0, - }, - stubbed: false, - }); - } - FileRole::Content => { - let stub = generate_stub(&file.relative_path); - planned_files.push(PlannedExtractFile { - template_path, - content: ExtractedContent::Text { - content: stub, - replacement_count: 0, - }, - stubbed: true, - }); - } - FileRole::Dropped => { - dropped_count += 1; - dropped_paths.push(file.relative_path.clone()); - } - } - } - } - } - - // Phase 9.5: Compute config-appropriate excludes from planned template files - // Only patterns that match files actually in the template are worth writing to diecut.toml - let template_paths: Vec = planned_files - .iter() - .map(|f| f.template_path.clone()) - .collect(); - let config_excludes = relevant_config_excludes(&template_paths); - - // Generate config - let canonical_seps: HashMap = confirmed_variables - .iter() - .map(|v| (v.name.clone(), detect_separator(&v.value))) - .collect(); - - let prompted_vars: Vec = confirmed_variables - .iter() - .map(|v| PromptedVariable { - name: v.name.clone(), - default_value: 
v.value.clone(), - prompt: v.name.replace(['_', '-'], " "), - }) - .collect(); - - let mut computed_vars = Vec::new(); - for var in &confirmed_variables { - let canonical_sep = canonical_seps.get(&var.name).copied().unwrap_or("-"); - for variant in &var.variants { - // Skip the canonical variant (it uses the variable directly) - if variant.name == "verbatim" { - continue; - } - // Skip the variant that matches the canonical separator - if is_canonical_variant(variant.name, canonical_sep) { - continue; - } - - let computed_name = format!("{}_{}", var.name, variant.name); - let expression = computed_expression(&var.name, variant.name, canonical_sep); - // Don't add if expression is just the variable name - if expression != var.name { - computed_vars.push(ComputedVariable { - name: computed_name, - expression, + // No replacements — copy verbatim + planned_files.push(PlannedExtractFile { + template_path, + content: ExtractedContent::Text { + content: replaced, + replacement_count: 0, + }, }); } } } - let config_toml = generate_config_toml(&ConfigGenOptions { - template_name: source_dir - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| "template".to_string()), - prompted_variables: prompted_vars, - computed_variables: computed_vars, - exclude_patterns: config_excludes.clone(), - copy_without_render: copy_without_render.clone(), - conditional_entries: vec![], - }); + // Generate minimal config TOML inline + let template_name = source_dir + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| "template".to_string()); + + let config_toml = generate_minimal_config(&template_name, &extract_variables); Ok(ExtractionPlan { output_dir, files: planned_files, config_toml, - variables: confirmed_variables, - exclude_patterns: config_excludes, - copy_without_render, - dropped_count, - dropped_paths, + variables: extract_variables, }) } @@ -412,7 +235,6 @@ pub fn execute_extraction(plan: &ExtractionPlan) -> Result<()> { // 
Write template files let mut rendered_count = 0; let mut copied_count = 0; - let mut stubbed_count = 0; for file in &plan.files { let dest = template_dir.join(&file.template_path); @@ -436,8 +258,6 @@ pub fn execute_extraction(plan: &ExtractionPlan) -> Result<()> { })?; if *replacement_count > 0 { rendered_count += 1; - } else if file.stubbed { - stubbed_count += 1; } else { copied_count += 1; } @@ -460,45 +280,39 @@ pub fn execute_extraction(plan: &ExtractionPlan) -> Result<()> { })?; // Summary - let prompted_count = plan.variables.len(); - let computed_count = plan - .variables - .iter() - .flat_map(|v| &v.variants) - .filter(|variant| { - variant.name != "verbatim" - && !matches!( - ( - variant.name, - detect_separator( - plan.variables - .iter() - .find(|v2| v2.variants.contains(variant)) - .map(|v2| v2.value.as_str()) - .unwrap_or("") - ) - ), - ("kebab", "-") | ("snake", "_") | ("dot", ".") - ) - }) - .count(); - eprintln!( "\n{} Template extracted to {}", style("✓").green().bold(), style(output_dir.display()).cyan() ); eprintln!( - " {} variables ({} prompted, {} computed)", - prompted_count + computed_count, - prompted_count, - computed_count - ); - eprintln!( - " {} files templated, {} files copied, {} files stubbed, {} files dropped", - rendered_count, copied_count, stubbed_count, plan.dropped_count + " {} variables, {} files templated, {} files copied", + plan.variables.len(), + rendered_count, + copied_count, ); eprintln!(" Review diecut.toml to fine-tune"); Ok(()) } + +fn generate_minimal_config(template_name: &str, variables: &[ExtractVariable]) -> String { + let escape = |s: &str| toml::Value::String(s.to_string()).to_string(); + let mut out = String::new(); + + out.push_str(&format!("[template]\nname = {}\n", escape(template_name))); + out.push_str("version = \"1.0.0\"\n\n"); + + for var in variables { + out.push_str(&format!("[variables.{}]\n", var.name)); + out.push_str(&format!( + "type = \"string\"\nprompt = {}\n", + 
escape(&var.name.replace(['_', '-'], " ")) + )); + out.push_str(&format!("default = {}\n\n", escape(&var.value))); + } + + out.push_str("[files]\n# exclude = []\n# copy_without_render = []\n\n"); + out.push_str("# [hooks]\n# post_create = \"echo 'Project created!'\"\n"); + out +} diff --git a/src/extract/replace.rs b/src/extract/replace.rs index 5cbe93c..419405a 100644 --- a/src/extract/replace.rs +++ b/src/extract/replace.rs @@ -132,119 +132,3 @@ pub fn apply_path_replacements(path: &Path, rules: &[ReplacementRule]) -> PathBu components.iter().collect() } - -#[cfg(test)] -mod tests { - use super::*; - - fn count_occurrences(content: &str, literal: &str) -> usize { - if literal.is_empty() { - return 0; - } - content.matches(literal).count() - } - - fn make_rule(literal: &str, replacement: &str) -> ReplacementRule { - ReplacementRule { - literal: literal.to_string(), - replacement: replacement.to_string(), - variable: "test".to_string(), - variant: "test".to_string(), - } - } - - #[test] - fn test_apply_replacements_basic() { - let rules = vec![make_rule("my-app", "{{ project_name }}")]; - let (result, count) = apply_replacements("Welcome to my-app!", &rules); - assert_eq!(result, "Welcome to {{ project_name }}!"); - assert_eq!(count, 1); - } - - #[test] - fn test_apply_replacements_multiple() { - let rules = vec![make_rule("my-app", "{{ project_name }}")]; - let (result, count) = apply_replacements("my-app is great, use my-app", &rules); - assert_eq!( - result, - "{{ project_name }} is great, use {{ project_name }}" - ); - assert_eq!(count, 2); - } - - #[test] - fn test_longest_match_first() { - let mut rules = vec![ - make_rule("my", "{{ org }}"), - make_rule("my-app", "{{ project_name }}"), - ]; - build_replacement_rules(&mut rules); - - // "my-app" should match before "my" - assert_eq!(rules[0].literal, "my-app"); - assert_eq!(rules[1].literal, "my"); - } - - #[test] - fn test_apply_replacements_empty_rules() { - let (result, count) = apply_replacements("hello 
world", &[]); - assert_eq!(result, "hello world"); - assert_eq!(count, 0); - } - - #[test] - fn test_apply_path_replacements() { - let rules = vec![make_rule("my-app", "{{ project_name }}")]; - let path = Path::new("my-app/src/main.rs"); - let result = apply_path_replacements(path, &rules); - assert_eq!(result, PathBuf::from("{{ project_name }}/src/main.rs")); - } - - #[test] - fn test_count_occurrences() { - assert_eq!(count_occurrences("my-app and my-app", "my-app"), 2); - assert_eq!(count_occurrences("hello world", "missing"), 0); - assert_eq!(count_occurrences("anything", ""), 0); - } - - #[test] - fn test_no_substring_collision_suffix() { - let rules = vec![make_rule("app", "{{ name }}")]; - let (result, count) = apply_replacements("application startup", &rules); - assert_eq!(result, "application startup"); - assert_eq!(count, 0); - } - - #[test] - fn test_no_substring_collision_prefix() { - let rules = vec![make_rule("app", "{{ name }}")]; - let (result, count) = apply_replacements("webapp is cool", &rules); - assert_eq!(result, "webapp is cool"); - assert_eq!(count, 0); - } - - #[test] - fn test_standalone_match_with_punctuation() { - let rules = vec![make_rule("app", "{{ name }}")]; - let (result, count) = apply_replacements("run app. start app!", &rules); - assert_eq!(result, "run {{ name }}. 
start {{ name }}!"); - assert_eq!(count, 2); - } - - #[test] - fn test_match_at_string_boundaries() { - let rules = vec![make_rule("app", "{{ name }}")]; - let (result, count) = apply_replacements("app", &rules); - assert_eq!(result, "{{ name }}"); - assert_eq!(count, 1); - } - - #[test] - fn test_compound_literal_still_matches() { - // Multi-word literals like "my-app" should still match inside strings - let rules = vec![make_rule("my-app", "{{ name }}")]; - let (result, count) = apply_replacements("name = \"my-app\"", &rules); - assert_eq!(result, "name = \"{{ name }}\""); - assert_eq!(count, 1); - } -} diff --git a/src/extract/scan.rs b/src/extract/scan.rs index 088d6dd..c5ea216 100644 --- a/src/extract/scan.rs +++ b/src/extract/scan.rs @@ -127,86 +127,3 @@ pub fn count_occurrences(value: &str, scan_result: &ScanResult) -> (usize, usize (file_count, total) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_scan_project_basic() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("README.md"), "# Hello").unwrap(); - std::fs::create_dir(dir.path().join("src")).unwrap(); - std::fs::write(dir.path().join("src/main.rs"), "fn main() {}").unwrap(); - - let result = scan_project(dir.path(), &[]).unwrap(); - assert_eq!(result.files.len(), 2); - assert_eq!(result.excluded_count, 0); - } - - #[test] - fn test_scan_project_with_excludes() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("README.md"), "# Hello").unwrap(); - std::fs::create_dir(dir.path().join(".git")).unwrap(); - std::fs::write(dir.path().join(".git/config"), "").unwrap(); - - let excludes = vec![".git".to_string()]; - let result = scan_project(dir.path(), &excludes).unwrap(); - assert_eq!(result.files.len(), 1); - assert_eq!(result.excluded_count, 1); - assert_eq!(result.files[0].relative_path, PathBuf::from("README.md")); - } - - #[cfg(unix)] - #[test] - fn test_scan_project_skips_symlinks_to_directories() { - let dir = 
tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("real.txt"), "hello").unwrap(); - - // Create a subdirectory and a symlink pointing to it - let subdir = dir.path().join("subdir"); - std::fs::create_dir(&subdir).unwrap(); - std::fs::write(subdir.join("nested.txt"), "nested").unwrap(); - std::os::unix::fs::symlink(&subdir, dir.path().join("link-to-dir")).unwrap(); - - let result = scan_project(dir.path(), &[]).unwrap(); - // Should find real.txt and subdir/nested.txt, but NOT choke on link-to-dir - let paths: Vec = result - .files - .iter() - .map(|f| f.relative_path.to_string_lossy().to_string()) - .collect(); - assert!(paths.contains(&"real.txt".to_string())); - assert!(paths.contains(&"subdir/nested.txt".to_string())); - assert!(!paths.iter().any(|p| p.contains("link-to-dir"))); - } - - #[test] - fn test_scan_project_binary_detection() { - let dir = tempfile::tempdir().unwrap(); - std::fs::write(dir.path().join("text.txt"), "hello").unwrap(); - std::fs::write( - dir.path().join("binary.bin"), - &(0..256).map(|i| i as u8).collect::>(), - ) - .unwrap(); - - let result = scan_project(dir.path(), &[]).unwrap(); - let text_file = result - .files - .iter() - .find(|f| f.relative_path.to_string_lossy() == "text.txt") - .unwrap(); - let binary_file = result - .files - .iter() - .find(|f| f.relative_path.to_string_lossy() == "binary.bin") - .unwrap(); - - assert!(!text_file.is_binary); - assert!(text_file.content.is_some()); - assert!(binary_file.is_binary); - assert!(binary_file.content.is_none()); - } -} diff --git a/src/extract/stub.rs b/src/extract/stub.rs deleted file mode 100644 index 8c6ce47..0000000 --- a/src/extract/stub.rs +++ /dev/null @@ -1,222 +0,0 @@ -use std::path::Path; - -/// Whether a file is boilerplate (copy in full), content (stub), or too deep (drop). -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum FileRole { - /// Config, dotfiles, CI — copy verbatim into the template. 
- Boilerplate, - /// Prose, docs, source — stub to minimal placeholder. - Content, - /// Content deeper than stub_depth — drop entirely. - Dropped, -} - -/// Filenames (case-insensitive) that are always boilerplate. -const BOILERPLATE_FILENAMES: &[&str] = &[ - ".gitignore", - ".gitattributes", - ".editorconfig", - ".prettierrc", - ".npmrc", - ".nvmrc", - ".gitkeep", - "makefile", - "dockerfile", - "justfile", - "license", - "licence", - "procfile", -]; - -/// Extensions (case-insensitive, without dot) that are always boilerplate. -const BOILERPLATE_EXTENSIONS: &[&str] = &[ - "toml", "yaml", "yml", "json", "jsonc", "json5", "xml", "sh", "bash", "zsh", "bat", "cmd", - "ps1", "cfg", "ini", "conf", -]; - -/// Directory prefixes — files under these dirs are boilerplate. -const BOILERPLATE_DIR_PREFIXES: &[&str] = &[".github/", ".gitlab/", ".circleci/", ".vscode/"]; - -/// Classify a file as boilerplate, content, or dropped based on its relative path. -/// -/// Only called for text files with 0 template replacements. -/// Files deeper than `stub_depth` path components are dropped entirely. 
-pub fn classify_file(path: &Path, stub_depth: usize) -> FileRole { - let path_str = path.to_string_lossy(); - - // Check directory prefix - for prefix in BOILERPLATE_DIR_PREFIXES { - if path_str.starts_with(prefix) { - return FileRole::Boilerplate; - } - } - - // Check filename (case-insensitive) - if let Some(filename) = path.file_name().and_then(|n| n.to_str()) { - let lower = filename.to_lowercase(); - if BOILERPLATE_FILENAMES.contains(&lower.as_str()) { - return FileRole::Boilerplate; - } - } - - // Check extension (case-insensitive) - if let Some(ext) = path.extension().and_then(|e| e.to_str()) { - let lower = ext.to_lowercase(); - if BOILERPLATE_EXTENSIONS.contains(&lower.as_str()) { - return FileRole::Boilerplate; - } - } - - let depth = path.components().count(); - if depth > stub_depth { - FileRole::Dropped - } else { - FileRole::Content - } -} - -/// Generate a minimal stub for a content file. -/// -/// - `.md` files get `# {Title}\n` where Title is derived from the filename. -/// - Everything else gets an empty string. -pub fn generate_stub(path: &Path) -> String { - let is_md = path - .extension() - .and_then(|e| e.to_str()) - .is_some_and(|e| e.eq_ignore_ascii_case("md")); - - if is_md { - let title = path - .file_stem() - .and_then(|s| s.to_str()) - .unwrap_or("Untitled"); - // Title-case: capitalize first letter, leave rest as-is - let title = title_case(title); - format!("# {title}\n") - } else { - String::new() - } -} - -/// Convert a filename stem like "craft" or "SKILL" into title case. -/// -/// Splits on `-` and `_`, capitalizes each word's first letter. 
-fn title_case(s: &str) -> String { - s.split(['-', '_']) - .filter(|w| !w.is_empty()) - .map(|word| { - let mut chars = word.chars(); - match chars.next() { - Some(first) => { - let rest: String = chars.collect::().to_lowercase(); - format!("{}{rest}", first.to_uppercase()) - } - None => String::new(), - } - }) - .collect::>() - .join(" ") -} - -#[cfg(test)] -mod tests { - use super::*; - use rstest::rstest; - - // ── classify_file ──────────────────────────────────────────────── - - #[rstest] - #[case(".gitignore", FileRole::Boilerplate)] - #[case(".editorconfig", FileRole::Boilerplate)] - #[case("Makefile", FileRole::Boilerplate)] - #[case("Dockerfile", FileRole::Boilerplate)] - #[case("LICENSE", FileRole::Boilerplate)] - #[case("Procfile", FileRole::Boilerplate)] - fn classify_boilerplate_filenames(#[case] filename: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(filename), 2), expected); - } - - #[rstest] - #[case("Cargo.toml", FileRole::Boilerplate)] - #[case("config.yaml", FileRole::Boilerplate)] - #[case("settings.yml", FileRole::Boilerplate)] - #[case("package.json", FileRole::Boilerplate)] - #[case("tsconfig.json", FileRole::Boilerplate)] - #[case("setup.cfg", FileRole::Boilerplate)] - #[case("build.sh", FileRole::Boilerplate)] - #[case("deploy.ps1", FileRole::Boilerplate)] - #[case("app.conf", FileRole::Boilerplate)] - fn classify_boilerplate_extensions(#[case] filename: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(filename), 2), expected); - } - - #[rstest] - #[case(".github/workflows/ci.yml", FileRole::Boilerplate)] - #[case(".github/CODEOWNERS", FileRole::Boilerplate)] - #[case(".gitlab/ci/deploy.yml", FileRole::Boilerplate)] - #[case(".circleci/config.yml", FileRole::Boilerplate)] - #[case(".vscode/settings.json", FileRole::Boilerplate)] - fn classify_boilerplate_directories(#[case] path: &str, #[case] expected: FileRole) { - assert_eq!(classify_file(Path::new(path), 2), expected); - } - - 
#[rstest] - #[case("README.md", 2)] - #[case("docs/guide.md", 2)] - #[case("src/main.rs", 2)] - #[case("src/lib.py", 2)] - #[case("index.html", 2)] - #[case("app.css", 2)] - #[case("skills/convention-mining/SKILL.md", 3)] // depth 3, stub_depth 3 → Content - fn classify_content(#[case] path: &str, #[case] stub_depth: usize) { - assert_eq!( - classify_file(Path::new(path), stub_depth), - FileRole::Content - ); - } - - #[rstest] - #[case("skills/convention-mining/SKILL.md", 2)] // depth 3 > stub_depth 2 - #[case("skills/writing-skills/craft.md", 2)] // depth 3 > stub_depth 2 - #[case("a/b/c/deep.md", 2)] // depth 4 > stub_depth 2 - #[case("docs/guide.md", 1)] // depth 2 > stub_depth 1 - fn classify_dropped(#[case] path: &str, #[case] stub_depth: usize) { - assert_eq!( - classify_file(Path::new(path), stub_depth), - FileRole::Dropped - ); - } - - // ── generate_stub ──────────────────────────────────────────────── - - #[rstest] - #[case("README.md", "# Readme\n")] - #[case("craft.md", "# Craft\n")] - #[case("SKILL.md", "# Skill\n")] - #[case("getting-started.md", "# Getting Started\n")] - #[case("my_notes.md", "# My Notes\n")] - fn stub_md_files(#[case] filename: &str, #[case] expected: &str) { - assert_eq!(generate_stub(Path::new(filename)), expected); - } - - #[rstest] - #[case("src/main.rs")] - #[case("index.html")] - #[case("app.css")] - #[case("data.txt")] - fn stub_non_md_files(#[case] filename: &str) { - assert_eq!(generate_stub(Path::new(filename)), ""); - } - - // ── title_case ─────────────────────────────────────────────────── - - #[rstest] - #[case("craft", "Craft")] - #[case("SKILL", "Skill")] - #[case("getting-started", "Getting Started")] - #[case("my_notes", "My Notes")] - #[case("README", "Readme")] - fn test_title_case(#[case] input: &str, #[case] expected: &str) { - assert_eq!(title_case(input), expected); - } -} diff --git a/src/extract/variants.rs b/src/extract/variants.rs deleted file mode 100644 index 525b475..0000000 --- 
a/src/extract/variants.rs +++ /dev/null @@ -1,309 +0,0 @@ -use std::sync::LazyLock; - -use regex_lite::Regex; - -static CAMEL_SPLIT_RE: LazyLock = - LazyLock::new(|| Regex::new(r"[A-Z][a-z]*|[a-z]+|[0-9]+").unwrap()); - -/// A case variant of a variable value, with its literal text and Tera expression. -#[derive(Debug, Clone, PartialEq)] -pub struct CaseVariant { - pub name: &'static str, - pub literal: String, - pub tera_expr: String, -} - -/// Split a string value into words for case variant generation. -/// -/// Handles kebab-case, snake_case, camelCase, PascalCase, dot.case, and space-separated. -pub fn split_into_words(value: &str) -> Vec { - if value.contains('-') { - return value.split('-').map(|s| s.to_lowercase()).collect(); - } - if value.contains('_') { - return value.split('_').map(|s| s.to_lowercase()).collect(); - } - if value.contains('.') { - return value.split('.').map(|s| s.to_lowercase()).collect(); - } - if value.contains(' ') { - return value.split_whitespace().map(|s| s.to_lowercase()).collect(); - } - - // camelCase / PascalCase splitting - let words: Vec = CAMEL_SPLIT_RE - .find_iter(value) - .map(|m| m.as_str().to_lowercase()) - .collect(); - - if words.is_empty() { - vec![value.to_lowercase()] - } else { - words - } -} - -/// Detect if a value is "multi-word" in a way that supports case variants. -/// -/// Single words and space-separated phrases skip variant detection. 
-fn supports_case_variants(value: &str) -> bool { - let words = split_into_words(value); - if words.len() < 2 { - return false; - } - // Space-separated values (like author names) skip variant detection - if value.contains(' ') { - return false; - } - true -} - -fn to_kebab(words: &[String]) -> String { - words.join("-") -} - -fn to_snake(words: &[String]) -> String { - words.join("_") -} - -fn to_screaming_snake(words: &[String]) -> String { - words - .iter() - .map(|w| w.to_uppercase()) - .collect::>() - .join("_") -} - -fn to_screaming_kebab(words: &[String]) -> String { - words - .iter() - .map(|w| w.to_uppercase()) - .collect::>() - .join("-") -} - -fn to_pascal(words: &[String]) -> String { - words - .iter() - .map(|w| { - let mut chars = w.chars(); - match chars.next() { - Some(c) => { - let upper: String = c.to_uppercase().collect(); - upper + chars.as_str() - } - None => String::new(), - } - }) - .collect() -} - -fn to_camel(words: &[String]) -> String { - let pascal = to_pascal(words); - let mut chars = pascal.chars(); - match chars.next() { - Some(c) => { - let lower: String = c.to_lowercase().collect(); - lower + chars.as_str() - } - None => String::new(), - } -} - -fn to_dot(words: &[String]) -> String { - words.join(".") -} - -/// Detect the canonical separator in the original value. -pub fn detect_separator(value: &str) -> &'static str { - if value.contains('-') { - "-" - } else if value.contains('_') { - "_" - } else if value.contains('.') { - "." - } else { - // PascalCase/camelCase — treat as kebab canonical - "-" - } -} - -/// Check whether a variant is the canonical one (matches the input separator). -/// -/// Canonical variants use the bare `{{ var_name }}` expression and do not get -/// a computed variable in diecut.toml. 
-pub fn is_canonical_variant(variant_name: &str, canonical_sep: &str) -> bool { - matches!( - (variant_name, canonical_sep), - ("kebab", "-") | ("snake", "_") | ("dot", ".") - ) -} - -/// Build a Tera expression for a variant, given the variable name and canonical separator. -/// -/// Canonical variants use `{{ var_name }}` directly. Non-canonical variants reference -/// their computed variable (e.g., `{{ var_name_snake }}`), which is defined in diecut.toml. -fn tera_expr_for_variant(var_name: &str, variant_name: &str, canonical_sep: &str) -> String { - if variant_name == "verbatim" || is_canonical_variant(variant_name, canonical_sep) { - return format!("{{{{ {var_name} }}}}"); - } - // Non-canonical variants reference their computed variable name - format!("{{{{ {var_name}_{variant_name} }}}}") -} - -/// Generate all case variants for a given variable value. -/// -/// Returns the canonical variant first, followed by alternatives. -/// Only returns variants whose literal differs from the canonical form. -/// Single-word values and space-separated phrases return only a verbatim replacement. 
-pub fn generate_variants(var_name: &str, value: &str) -> Vec { - if !supports_case_variants(value) { - return vec![CaseVariant { - name: "verbatim", - literal: value.to_string(), - tera_expr: format!("{{{{ {var_name} }}}}"), - }]; - } - - let words = split_into_words(value); - let canonical_sep = detect_separator(value); - - let candidates: Vec<(&str, String)> = vec![ - ("kebab", to_kebab(&words)), - ("snake", to_snake(&words)), - ("screaming_snake", to_screaming_snake(&words)), - ("screaming_kebab", to_screaming_kebab(&words)), - ("pascal", to_pascal(&words)), - ("camel", to_camel(&words)), - ("dot", to_dot(&words)), - ]; - - // Deduplicate: some variants produce the same literal (e.g., single-word) - let mut seen = std::collections::HashSet::new(); - let mut variants = Vec::new(); - - for (name, literal) in candidates { - if seen.insert(literal.clone()) { - let tera_expr = tera_expr_for_variant(var_name, name, canonical_sep); - variants.push(CaseVariant { - name, - literal, - tera_expr, - }); - } - } - - variants -} - -/// Build a computed Tera expression for a named variant variable. -/// -/// This is used in diecut.toml for computed variables like `project_name_snake`. 
-pub fn computed_expression(var_name: &str, variant_name: &str, canonical_sep: &str) -> String { - match (variant_name, canonical_sep) { - ("snake", sep) if sep != "_" => { - format!("{var_name} | replace(from=\"{sep}\", to=\"_\")") - } - ("screaming_snake", sep) => { - if sep == "_" { - format!("{var_name} | upper") - } else { - format!("{var_name} | replace(from=\"{sep}\", to=\"_\") | upper") - } - } - ("screaming_kebab", sep) => { - if sep == "-" { - format!("{var_name} | upper") - } else { - format!("{var_name} | replace(from=\"{sep}\", to=\"-\") | upper") - } - } - ("pascal", sep) => { - format!("{var_name} | replace(from=\"{sep}\", to=\" \") | title | replace(from=\" \", to=\"\")") - } - ("camel", sep) => { - format!("{var_name} | camelcase(sep=\"{sep}\")") - } - ("kebab", sep) if sep != "-" => { - format!("{var_name} | replace(from=\"{sep}\", to=\"-\")") - } - ("dot", sep) if sep != "." => { - format!("{var_name} | replace(from=\"{sep}\", to=\".\")") - } - _ => var_name.to_string(), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use rstest::rstest; - - #[rstest] - #[case("my-app", vec!["my", "app"])] - #[case("my_app", vec!["my", "app"])] - #[case("MyApp", vec!["my", "app"])] - #[case("myApp", vec!["my", "app"])] - #[case("my.app", vec!["my", "app"])] - #[case("my app", vec!["my", "app"])] - #[case("single", vec!["single"])] - fn test_split_into_words(#[case] input: &str, #[case] expected: Vec<&str>) { - assert_eq!(split_into_words(input), expected); - } - - #[test] - fn test_generate_variants_kebab() { - let variants = generate_variants("project_name", "my-app"); - let names: Vec<&str> = variants.iter().map(|v| v.name).collect(); - assert!(names.contains(&"kebab")); - assert!(names.contains(&"snake")); - assert!(names.contains(&"pascal")); - - let kebab = variants.iter().find(|v| v.name == "kebab").unwrap(); - assert_eq!(kebab.literal, "my-app"); - - let snake = variants.iter().find(|v| v.name == "snake").unwrap(); - assert_eq!(snake.literal, 
"my_app"); - - let pascal = variants.iter().find(|v| v.name == "pascal").unwrap(); - assert_eq!(pascal.literal, "MyApp"); - } - - #[test] - fn test_generate_variants_single_word() { - let variants = generate_variants("name", "hello"); - assert_eq!(variants.len(), 1); - assert_eq!(variants[0].name, "verbatim"); - assert_eq!(variants[0].literal, "hello"); - } - - #[test] - fn test_generate_variants_space_separated() { - let variants = generate_variants("author", "Jane Doe"); - assert_eq!(variants.len(), 1); - assert_eq!(variants[0].name, "verbatim"); - assert_eq!(variants[0].literal, "Jane Doe"); - } - - #[test] - fn test_generate_variants_screaming_snake() { - let variants = generate_variants("project_name", "my-app"); - let ss = variants - .iter() - .find(|v| v.name == "screaming_snake") - .unwrap(); - assert_eq!(ss.literal, "MY_APP"); - } - - #[test] - fn test_tera_expr_kebab_canonical() { - let expr = tera_expr_for_variant("project_name", "kebab", "-"); - assert_eq!(expr, "{{ project_name }}"); - } - - #[test] - fn test_tera_expr_snake_from_kebab() { - let expr = tera_expr_for_variant("project_name", "snake", "-"); - assert_eq!(expr, "{{ project_name_snake }}"); - } -} diff --git a/src/main.rs b/src/main.rs index 0e6ef6a..6a13faa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,8 +24,7 @@ fn main() -> miette::Result<()> { vars, output, in_place, - stub_depth, dry_run, - } => commands::extract::run(source, vars, output, in_place, stub_depth, dry_run), + } => commands::extract::run(source, vars, output, in_place, dry_run), } } diff --git a/src/prompt/engine.rs b/src/prompt/engine.rs index 47fc847..4de7253 100644 --- a/src/prompt/engine.rs +++ b/src/prompt/engine.rs @@ -96,7 +96,7 @@ fn evaluate_computed( computed_expr: &str, values: &BTreeMap, ) -> Result { - let mut tera = crate::render::tera_with_filters(); + let mut tera = tera::Tera::default(); tera.add_raw_template("__computed__", computed_expr) .map_err(|e| DicecutError::ComputedEvaluation { name: 
name.to_string(), diff --git a/src/render/context.rs b/src/render/context.rs index f530022..f29f678 100644 --- a/src/render/context.rs +++ b/src/render/context.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; use tera::{Context, Tera, Value}; @@ -10,85 +10,14 @@ pub fn build_context(variables: &BTreeMap) -> Context { context } -/// Create a Tera instance with custom filters registered. -/// -/// This should be used instead of `Tera::default()` anywhere templates or -/// computed expressions are evaluated, so that custom filters like `camelcase` -/// are available. -pub fn tera_with_filters() -> Tera { - let mut tera = Tera::default(); - tera.register_filter("camelcase", camelcase_filter); - tera -} - -/// Custom Tera filter: convert a separated string to camelCase. -/// -/// Usage: `{{ value | camelcase }}` or `{{ value | camelcase(sep="-") }}` -/// -/// Splits on the separator (default `-`), lowercases the first word, -/// title-cases the rest, and joins them. -fn camelcase_filter(value: &Value, args: &HashMap) -> Result { - let s = value - .as_str() - .ok_or_else(|| tera::Error::msg("camelcase filter requires a string value"))?; - - let sep = args.get("sep").and_then(|v| v.as_str()).unwrap_or("-"); - - let words: Vec<&str> = s.split(sep).collect(); - if words.is_empty() { - return Ok(Value::String(String::new())); - } - - let mut result = words[0].to_lowercase(); - for word in &words[1..] { - let mut chars = word.chars(); - if let Some(first) = chars.next() { - result.extend(first.to_uppercase()); - result.push_str(&chars.as_str().to_lowercase()); - } - } - - Ok(Value::String(result)) -} - /// Evaluate a Tera boolean expression against a variable context. /// /// Returns `Ok(true)` if the expression evaluates to true, `Ok(false)` otherwise. /// Returns `Err` if the expression fails to parse or render. 
pub fn eval_bool_expr(expr: &str, context: &Context) -> std::result::Result { - let mut tera = tera_with_filters(); + let mut tera = Tera::default(); let template_str = format!("{{% if {expr} %}}true{{% else %}}false{{% endif %}}"); tera.add_raw_template("__when__", &template_str)?; let result = tera.render("__when__", context)?; Ok(result.trim() == "true") } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_camelcase_filter_kebab() { - let val = Value::String("my-cool-app".to_string()); - let args = HashMap::new(); - let result = camelcase_filter(&val, &args).unwrap(); - assert_eq!(result, Value::String("myCoolApp".to_string())); - } - - #[test] - fn test_camelcase_filter_custom_sep() { - let val = Value::String("my_cool_app".to_string()); - let mut args = HashMap::new(); - args.insert("sep".to_string(), Value::String("_".to_string())); - let result = camelcase_filter(&val, &args).unwrap(); - assert_eq!(result, Value::String("myCoolApp".to_string())); - } - - #[test] - fn test_camelcase_filter_single_word() { - let val = Value::String("hello".to_string()); - let args = HashMap::new(); - let result = camelcase_filter(&val, &args).unwrap(); - assert_eq!(result, Value::String("hello".to_string())); - } -} diff --git a/src/render/mod.rs b/src/render/mod.rs index 8a87f30..5674674 100644 --- a/src/render/mod.rs +++ b/src/render/mod.rs @@ -2,7 +2,7 @@ pub mod context; pub mod file; pub mod walker; -pub use context::{build_context, eval_bool_expr, tera_with_filters}; +pub use context::{build_context, eval_bool_expr}; pub use walker::{ execute_plan, plan_render, walk_and_render, GeneratedProject, GenerationPlan, PlannedFile, }; diff --git a/src/render/walker.rs b/src/render/walker.rs index 97b1e96..caf9e26 100644 --- a/src/render/walker.rs +++ b/src/render/walker.rs @@ -2,7 +2,7 @@ use std::collections::BTreeMap; use std::path::{Path, PathBuf}; use globset::{Glob, GlobSet, GlobSetBuilder}; -use tera::{Context, Value}; +use tera::{Context, Tera, Value}; use 
walkdir::WalkDir; use crate::adapter::ResolvedTemplate; @@ -104,7 +104,7 @@ pub fn plan_render( source: e, })?; - let mut tera = crate::render::tera_with_filters(); + let mut tera = Tera::default(); let template_name = rel_str.to_string(); let parse_result = tera.add_raw_template(&template_name, &content); let render_result = parse_result.and_then(|_| tera.render(&template_name, context)); diff --git a/tests/integration.rs b/tests/integration.rs index f56835d..024d6e4 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -654,7 +654,6 @@ fn test_extract_batch_basic() { ], output_dir: Some(output_path.clone()), in_place: false, - stub_depth: 2, }; let plan = plan_extraction(&options).unwrap(); @@ -679,62 +678,6 @@ fn test_extract_batch_basic() { assert!(has_die_files, "should have files with .die suffix"); } -#[test] -fn test_extract_detects_case_variants() { - let project = tempfile::tempdir().unwrap(); - std::fs::write( - project.path().join("config.toml"), - "[package]\nname = \"my-app\"\nmodule = \"my_app\"\nclass = \"MyApp\"\nenv = \"MY_APP_PORT\"\n", - ) - .unwrap(); - - let output = tempfile::tempdir().unwrap(); - let output_path = output.path().join("extracted"); - - let options = ExtractOptions { - source_dir: project.path().to_path_buf(), - variables: vec![("project_name".to_string(), "my-app".to_string())], - output_dir: Some(output_path.clone()), - in_place: false, - stub_depth: 2, - }; - - let plan = plan_extraction(&options).unwrap(); - - // Should detect variants used in the file - let var = plan - .variables - .iter() - .find(|v| v.name == "project_name") - .unwrap(); - let variant_names: Vec<&str> = var.variants.iter().map(|v| v.name).collect(); - assert!( - variant_names.contains(&"kebab"), - "should detect kebab variant" - ); - assert!( - variant_names.contains(&"snake"), - "should detect snake variant" - ); - assert!( - variant_names.contains(&"pascal"), - "should detect pascal variant" - ); - assert!( - 
variant_names.contains(&"screaming_snake"), - "should detect screaming_snake variant" - ); - - execute_extraction(&plan).unwrap(); - - // The config should have computed variables for variants - let config = std::fs::read_to_string(output_path.join("diecut.toml")).unwrap(); - assert!( - config.contains("project_name_snake"), - "should have snake computed var" - ); -} - #[test] fn test_extract_dry_run_writes_nothing() { let project = tempfile::tempdir().unwrap(); @@ -748,7 +691,6 @@ fn test_extract_dry_run_writes_nothing() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - stub_depth: 2, }; let plan = plan_extraction(&options).unwrap(); @@ -775,7 +717,6 @@ fn test_extract_rejects_already_template() { variables: vec![("name".to_string(), "val".to_string())], output_dir: None, in_place: false, - stub_depth: 2, }; let result = plan_extraction(&options); @@ -793,7 +734,6 @@ fn test_extract_rejects_no_variables() { variables: vec![], output_dir: None, in_place: false, - stub_depth: 2, }; let result = plan_extraction(&options); @@ -814,7 +754,6 @@ fn test_extract_templates_path_components() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, - stub_depth: 2, }; let plan = plan_extraction(&options).unwrap(); @@ -832,80 +771,3 @@ fn test_extract_templates_path_components() { execute_extraction(&plan).unwrap(); } - -#[test] -fn test_extract_round_trip() { - // Step 1: Generate a project from an existing template - let template_dir = fixture_path("basic-template"); - let resolved = adapter::resolve_template(&template_dir).unwrap(); - - let mut variables = BTreeMap::new(); - variables.insert( - "project_name".to_string(), - tera::Value::String("my-app".to_string()), - ); - variables.insert( - "author".to_string(), - tera::Value::String("Jane Doe".to_string()), - ); - variables.insert("use_docker".to_string(), 
tera::Value::Bool(false)); - variables.insert( - "license".to_string(), - tera::Value::String("MIT".to_string()), - ); - variables.insert( - "project_slug".to_string(), - tera::Value::String("my-app".to_string()), - ); - - let context = build_context(&variables); - let generated = tempfile::tempdir().unwrap(); - walk_and_render(&resolved, generated.path(), &variables, &context).unwrap(); - - // The generated project has files under generated/my-app/ - let project_dir = generated.path().join("my-app"); - assert!(project_dir.exists(), "generated project should exist"); - - // Step 2: Extract it back into a template - let extracted = tempfile::tempdir().unwrap(); - let extracted_path = extracted.path().join("extracted-template"); - - let options = ExtractOptions { - source_dir: project_dir.clone(), - variables: vec![("project_name".to_string(), "my-app".to_string())], - output_dir: Some(extracted_path.clone()), - in_place: false, - stub_depth: 2, - }; - - let plan = plan_extraction(&options).unwrap(); - execute_extraction(&plan).unwrap(); - - // Verify the extracted template has the key structure - assert!(extracted_path.join("diecut.toml").exists()); - assert!(extracted_path.join("template").exists()); - - let config = std::fs::read_to_string(extracted_path.join("diecut.toml")).unwrap(); - assert!(config.contains("project_name")); - - // Verify template files exist and contain template syntax - let template_files: Vec<_> = walkdir::WalkDir::new(extracted_path.join("template")) - .into_iter() - .filter_map(|e| e.ok()) - .filter(|e| e.file_type().is_file()) - .collect(); - assert!(!template_files.is_empty(), "should have template files"); - - // Files with .die suffix should contain template expressions - for entry in &template_files { - if entry.path().to_string_lossy().ends_with(".die") { - let content = std::fs::read_to_string(entry.path()).unwrap(); - assert!( - content.contains("{{") || content.contains("{%"), - "file {} should contain template syntax, got: {}", - 
entry.path().display(), - content - ); - } - } -} From e9e76b8f823d3a2c4fc1191e8b81086afc445574 Mon Sep 17 00:00:00 2001 From: rroskam Date: Wed, 4 Mar 2026 23:24:17 -0500 Subject: [PATCH 28/29] refactor(extract): load excludes from embedded file with override Move exclude patterns to default_excludes.txt (Rust + macOS only). Add --exclude-from flag to use a custom exclude file. Replace all_default_excludes() with load_excludes(Option<&Path>). --- src/cli.rs | 4 ++ src/commands/extract.rs | 2 + src/extract/default_excludes.txt | 12 ++++ src/extract/exclude.rs | 103 +++++++++++-------------------- src/extract/mod.rs | 5 +- src/main.rs | 3 +- tests/integration.rs | 5 ++ 7 files changed, 63 insertions(+), 71 deletions(-) create mode 100644 src/extract/default_excludes.txt diff --git a/src/cli.rs b/src/cli.rs index d87f2e8..58594f4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -67,6 +67,10 @@ pub enum Commands { #[arg(long)] in_place: bool, + /// File with exclude patterns (one per line, # comments) + #[arg(long, value_name = "FILE")] + exclude_from: Option, + /// Show what would be extracted without writing files #[arg(long)] dry_run: bool, diff --git a/src/commands/extract.rs b/src/commands/extract.rs index 37a31be..5b6efc6 100644 --- a/src/commands/extract.rs +++ b/src/commands/extract.rs @@ -11,6 +11,7 @@ pub fn run( vars: Vec, output: Option, in_place: bool, + exclude_from: Option, dry_run: bool, ) -> Result<()> { let variables = parse_vars(&vars)?; @@ -20,6 +21,7 @@ pub fn run( variables, output_dir: output.map(PathBuf::from), in_place, + exclude_file: exclude_from.map(PathBuf::from), }; let plan = plan_extraction(&options)?; diff --git a/src/extract/default_excludes.txt b/src/extract/default_excludes.txt new file mode 100644 index 0000000..4219dd1 --- /dev/null +++ b/src/extract/default_excludes.txt @@ -0,0 +1,12 @@ +# Version control +.git + +# Rust +target +Cargo.lock + +# macOS +.DS_Store + +# Diecut +.diecut-answers.toml diff --git a/src/extract/exclude.rs 
b/src/extract/exclude.rs index 2f6beff..b9e7b55 100644 --- a/src/extract/exclude.rs +++ b/src/extract/exclude.rs @@ -1,41 +1,20 @@ use std::path::Path; -/// Default directories and files to exclude from template extraction. -const DEFAULT_EXCLUDES: &[&str] = &[ - ".git", - ".hg", - ".svn", - "node_modules", - ".DS_Store", - "Thumbs.db", - "__pycache__", - "*.pyc", - ".tox", - ".nox", - ".mypy_cache", - ".ruff_cache", - ".pytest_cache", - "target", - ".venv", - ".env", - "dist", - "build", - ".next", - ".nuxt", - ".output", - ".turbo", - ".worktrees", - ".claude/worktrees", - ".astro", - ".diecut-answers.toml", -]; +const DEFAULT_EXCLUDES: &str = include_str!("default_excludes.txt"); -/// Return all default exclude patterns for use during scanning. -/// -/// All DEFAULT_EXCLUDES are always used during the scan phase because patterns -/// like `node_modules` can appear at any depth (e.g. `docs/node_modules/`). -pub fn all_default_excludes() -> Vec { - DEFAULT_EXCLUDES.iter().map(|s| s.to_string()).collect() +/// Load exclude patterns from a file, or use the built-in defaults. +pub fn load_excludes(override_file: Option<&Path>) -> Vec { + let text = match override_file { + Some(path) => { + std::fs::read_to_string(path).unwrap_or_else(|_| DEFAULT_EXCLUDES.to_string()) + } + None => DEFAULT_EXCLUDES.to_string(), + }; + text.lines() + .map(|l| l.trim()) + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .map(|l| l.to_string()) + .collect() } /// Check if a path should be excluded based on the exclude patterns. 
@@ -45,19 +24,15 @@ pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { for pattern in excludes { let clean = pattern.trim_end_matches('/'); - if clean.contains('*') { - // Glob-style matching: *.pyc matches any .pyc file - if let Some(ext) = clean.strip_prefix("*.") { - if let Some(file_ext) = relative_path.extension() { - if file_ext.to_string_lossy().eq_ignore_ascii_case(ext) { - return true; - } + if let Some(ext) = clean.strip_prefix("*.") { + if let Some(file_ext) = relative_path.extension() { + if file_ext.to_string_lossy().eq_ignore_ascii_case(ext) { + return true; } } continue; } - // Exact directory/file match at any level for component in relative_path.components() { if let std::path::Component::Normal(os_str) = component { if os_str.to_string_lossy() == clean { @@ -66,7 +41,6 @@ pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { } } - // Full path match if path_str == clean || path_str.starts_with(&format!("{clean}/")) { return true; } @@ -78,38 +52,31 @@ pub fn should_exclude(relative_path: &Path, excludes: &[String]) -> bool { #[cfg(test)] mod tests { use super::*; - use std::path::Path; #[test] - fn test_should_exclude_git() { - let excludes = vec![".git/".to_string()]; - assert!(should_exclude(Path::new(".git/config"), &excludes)); - assert!(should_exclude(Path::new(".git/HEAD"), &excludes)); + fn test_load_defaults() { + let excludes = load_excludes(None); + assert!(excludes.contains(&".git".to_string())); + assert!(excludes.contains(&"target".to_string())); + assert!(excludes.contains(&".DS_Store".to_string())); + assert!(!excludes.iter().any(|e| e.starts_with('#'))); } #[test] - fn test_should_exclude_node_modules() { - let excludes = vec!["node_modules".to_string()]; - assert!(should_exclude( - Path::new("node_modules/express/index.js"), - &excludes - )); + fn test_should_exclude_matches() { + let excludes = vec![".git".to_string(), "*.pyc".to_string()]; + 
assert!(should_exclude(Path::new(".git/HEAD"), &excludes)); + assert!(should_exclude(Path::new("pkg/foo.pyc"), &excludes)); + assert!(!should_exclude(Path::new("src/main.rs"), &excludes)); } #[test] - fn test_should_exclude_glob() { - let excludes = vec!["*.pyc".to_string()]; - assert!(should_exclude( - Path::new("module/__pycache__/foo.pyc"), - &excludes - )); - assert!(!should_exclude(Path::new("module/foo.py"), &excludes)); - } + fn test_override_file() { + let dir = tempfile::tempdir().unwrap(); + let file = dir.path().join("excludes.txt"); + std::fs::write(&file, "# custom\nvendor\n*.log\n").unwrap(); - #[test] - fn test_should_not_exclude_normal_file() { - let excludes = vec![".git/".to_string(), "node_modules".to_string()]; - assert!(!should_exclude(Path::new("src/main.rs"), &excludes)); - assert!(!should_exclude(Path::new("README.md"), &excludes)); + let excludes = load_excludes(Some(&file)); + assert_eq!(excludes, vec!["vendor", "*.log"]); } } diff --git a/src/extract/mod.rs b/src/extract/mod.rs index 29f7cd5..094f27c 100644 --- a/src/extract/mod.rs +++ b/src/extract/mod.rs @@ -9,7 +9,7 @@ use console::style; use crate::config::schema::DEFAULT_TEMPLATES_SUFFIX; use crate::error::{DicecutError, Result}; -use self::exclude::all_default_excludes; +use self::exclude::load_excludes; use self::replace::{ apply_path_replacements, apply_replacements, build_replacement_rules, ReplacementRule, }; @@ -80,6 +80,7 @@ pub struct ExtractOptions { pub variables: Vec<(String, String)>, pub output_dir: Option, pub in_place: bool, + pub exclude_file: Option, } /// Plan an extraction: scan the project, build replacement rules, apply replacements. 
@@ -122,7 +123,7 @@ pub fn plan_extraction(options: &ExtractOptions) -> Result { } // Scan project - let scan_excludes = all_default_excludes(); + let scan_excludes = load_excludes(options.exclude_file.as_deref()); eprintln!( "\n{}", style(format!("Scanning {}...", source_dir.display())).bold() diff --git a/src/main.rs b/src/main.rs index 6a13faa..e2c0167 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,7 +24,8 @@ fn main() -> miette::Result<()> { vars, output, in_place, + exclude_from, dry_run, - } => commands::extract::run(source, vars, output, in_place, dry_run), + } => commands::extract::run(source, vars, output, in_place, exclude_from, dry_run), } } diff --git a/tests/integration.rs b/tests/integration.rs index 024d6e4..ca73855 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -654,6 +654,7 @@ fn test_extract_batch_basic() { ], output_dir: Some(output_path.clone()), in_place: false, + exclude_file: None, }; let plan = plan_extraction(&options).unwrap(); @@ -691,6 +692,7 @@ fn test_extract_dry_run_writes_nothing() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, + exclude_file: None, }; let plan = plan_extraction(&options).unwrap(); @@ -717,6 +719,7 @@ fn test_extract_rejects_already_template() { variables: vec![("name".to_string(), "val".to_string())], output_dir: None, in_place: false, + exclude_file: None, }; let result = plan_extraction(&options); @@ -734,6 +737,7 @@ fn test_extract_rejects_no_variables() { variables: vec![], output_dir: None, in_place: false, + exclude_file: None, }; let result = plan_extraction(&options); @@ -754,6 +758,7 @@ fn test_extract_templates_path_components() { variables: vec![("project_name".to_string(), "my-app".to_string())], output_dir: Some(output_path.clone()), in_place: false, + exclude_file: None, }; let plan = plan_extraction(&options).unwrap(); From 13c313ad3454734f34bdd908cbbaac1773d6717c Mon Sep 17 00:00:00 2001 From: rroskam 
Date: Wed, 4 Mar 2026 23:42:18 -0500 Subject: [PATCH 29/29] test(extract): add unit tests for replace.rs Cover word-boundary matching, longest-match-first ordering, overlap resolution, no-rescan guarantee, Unicode handling, and path replacements. --- src/extract/replace.rs | 274 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) diff --git a/src/extract/replace.rs b/src/extract/replace.rs index 419405a..1ea9092 100644 --- a/src/extract/replace.rs +++ b/src/extract/replace.rs @@ -132,3 +132,277 @@ pub fn apply_path_replacements(path: &Path, rules: &[ReplacementRule]) -> PathBu components.iter().collect() } + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + use std::path::Path; + + /// Helper to build a single rule with minimal boilerplate. + fn rule(literal: &str, replacement: &str) -> ReplacementRule { + ReplacementRule { + literal: literal.to_string(), + replacement: replacement.to_string(), + variable: "var".to_string(), + variant: "verbatim".to_string(), + } + } + + /// Helper to build and sort a rule set, ready for apply_replacements. 
+ fn sorted(rules: Vec) -> Vec { + let mut rules = rules; + build_replacement_rules(&mut rules); + rules + } + + // ── is_word_char ────────────────────────────────────────────── + + #[rstest] + #[case('a', true)] + #[case('Z', true)] + #[case('0', true)] + #[case('_', true)] + #[case('-', true)] + #[case(' ', false)] + #[case('.', false)] + #[case('/', false)] + #[case('{', false)] + #[case('é', true)] // alphanumeric per char::is_alphanumeric + fn word_char(#[case] c: char, #[case] expected: bool) { + assert_eq!(is_word_char(c), expected, "is_word_char({c:?})"); + } + + // ── build_replacement_rules (sorting) ───────────────────────── + + #[test] + fn sorts_longest_literal_first() { + let mut rules = vec![ + rule("app", "{{ x }}"), + rule("my-app", "{{ y }}"), + rule("a", "{{ z }}"), + ]; + build_replacement_rules(&mut rules); + + let lengths: Vec = rules.iter().map(|r| r.literal.len()).collect(); + assert_eq!(lengths, vec![6, 3, 1]); + } + + // ── apply_replacements: basic ───────────────────────────────── + + #[test] + fn no_rules_returns_original() { + let (out, count) = apply_replacements("hello world", &[]); + assert_eq!(out, "hello world"); + assert_eq!(count, 0); + } + + #[test] + fn no_match_returns_original() { + let rules = sorted(vec![rule("missing", "{{ x }}")]); + let (out, count) = apply_replacements("hello world", &rules); + assert_eq!(out, "hello world"); + assert_eq!(count, 0); + } + + #[test] + fn simple_replacement() { + let rules = sorted(vec![rule("my-app", "{{ project_name }}")]); + let (out, count) = apply_replacements("name = \"my-app\"", &rules); + assert_eq!(out, "name = \"{{ project_name }}\""); + assert_eq!(count, 1); + } + + #[test] + fn multiple_occurrences() { + let rules = sorted(vec![rule("foo", "{{ x }}")]); + let (out, count) = apply_replacements("foo and foo again foo", &rules); + assert_eq!(out, "{{ x }} and {{ x }} again {{ x }}"); + assert_eq!(count, 3); + } + + #[test] + fn empty_content() { + let rules = 
sorted(vec![rule("x", "{{ x }}")]); + let (out, count) = apply_replacements("", &rules); + assert_eq!(out, ""); + assert_eq!(count, 0); + } + + #[test] + fn empty_literal_is_skipped() { + let rules = vec![rule("", "{{ x }}")]; + let (out, count) = apply_replacements("hello", &rules); + assert_eq!(out, "hello"); + assert_eq!(count, 0); + } + + // ── apply_replacements: word boundaries ─────────────────────── + + #[test] + fn no_match_inside_longer_word() { + let rules = sorted(vec![rule("app", "{{ name }}")]); + let (out, count) = apply_replacements("the application is great", &rules); + assert_eq!(out, "the application is great"); + assert_eq!(count, 0); + } + + #[test] + fn no_match_with_prefix_attached() { + let rules = sorted(vec![rule("app", "{{ name }}")]); + let (out, count) = apply_replacements("myapp works", &rules); + assert_eq!(out, "myapp works"); + assert_eq!(count, 0); + } + + #[test] + fn no_match_with_suffix_attached() { + let rules = sorted(vec![rule("app", "{{ name }}")]); + let (out, count) = apply_replacements("apps are great", &rules); + assert_eq!(out, "apps are great"); + assert_eq!(count, 0); + } + + #[rstest] + #[case("app is here", "{{ n }} is here", 1)] + #[case("use app", "use {{ n }}", 1)] + #[case("app", "{{ n }}", 1)] + #[case("(app)", "({{ n }})", 1)] + #[case("\"app\"", "\"{{ n }}\"", 1)] + #[case("app.config", "{{ n }}.config", 1)] + #[case("/app/", "/{{ n }}/", 1)] + fn boundary_at_non_word_chars( + #[case] input: &str, + #[case] expected: &str, + #[case] expected_count: usize, + ) { + let rules = sorted(vec![rule("app", "{{ n }}")]); + let (out, count) = apply_replacements(input, &rules); + assert_eq!(out, expected, "input: {input:?}"); + assert_eq!(count, expected_count); + } + + #[test] + fn hyphen_is_word_boundary_blocker() { + // "my-app" contains "app" but hyphen is a word char, so "app" alone + // should NOT match inside "my-app". 
+ let rules = sorted(vec![rule("app", "{{ name }}")]); + let (out, count) = apply_replacements("my-app", &rules); + assert_eq!(out, "my-app"); + assert_eq!(count, 0); + } + + #[test] + fn underscore_is_word_boundary_blocker() { + let rules = sorted(vec![rule("app", "{{ name }}")]); + let (out, count) = apply_replacements("my_app", &rules); + assert_eq!(out, "my_app"); + assert_eq!(count, 0); + } + + // ── apply_replacements: longest-match-first / overlap ───────── + + #[test] + fn longest_match_wins() { + let rules = sorted(vec![ + rule("my-app", "{{ full }}"), + rule("my", "{{ prefix }}"), + ]); + let (out, count) = apply_replacements("name: my-app", &rules); + assert_eq!(out, "name: {{ full }}"); + assert_eq!(count, 1); + } + + #[test] + fn shorter_rule_still_matches_elsewhere() { + let rules = sorted(vec![ + rule("my-app", "{{ full }}"), + rule("my", "{{ prefix }}"), + ]); + let (out, count) = apply_replacements("my-app by my", &rules); + assert_eq!(out, "{{ full }} by {{ prefix }}"); + assert_eq!(count, 2); + } + + #[test] + fn adjacent_non_overlapping_matches() { + // Two rules that match at adjacent positions separated by a dot. + let rules = sorted(vec![rule("foo", "{{ a }}"), rule("bar", "{{ b }}")]); + let (out, count) = apply_replacements("foo.bar", &rules); + assert_eq!(out, "{{ a }}.{{ b }}"); + assert_eq!(count, 2); + } + + // ── apply_replacements: no re-scanning ──────────────────────── + + #[test] + fn replacement_output_is_not_rescanned() { + // If re-scanning occurred, the "x" in "{{ x }}" could trigger rule 2. + let rules = sorted(vec![rule("foo", "{{ x }}"), rule("x", "WRONG")]); + let (out, count) = apply_replacements("foo", &rules); + assert_eq!(out, "{{ x }}"); + assert_eq!(count, 1); + } + + // ── apply_replacements: unicode ─────────────────────────────── + + #[test] + fn unicode_content_preserved() { + // CJK chars are alphanumeric (is_word_char → true), so the literal + // must appear at a non-word boundary to match. 
+ let rules = sorted(vec![rule("my-app", "{{ name }}")]); + let (out, count) = apply_replacements("プロジェクト: my-app です", &rules); + assert_eq!(out, "プロジェクト: {{ name }} です"); + assert_eq!(count, 1); + } + + #[test] + fn cjk_neighbors_block_boundary() { + // CJK characters are alphanumeric → word chars, so a literal + // flanked by them is not at a word boundary. + let rules = sorted(vec![rule("名前", "{{ name }}")]); + let (out, count) = apply_replacements("私の名前はアプリです", &rules); + assert_eq!(out, "私の名前はアプリです"); + assert_eq!(count, 0); + } + + #[test] + fn multibyte_boundary_respected() { + // "café" contains "é" which is alphanumeric → word char. + // Rule for "caf" should NOT match inside "café". + let rules = sorted(vec![rule("caf", "{{ x }}")]); + let (out, count) = apply_replacements("café", &rules); + assert_eq!(out, "café"); + assert_eq!(count, 0); + } + + // ── apply_path_replacements ─────────────────────────────────── + + #[test] + fn replaces_in_path_components() { + let rules = sorted(vec![rule("my-app", "{{ name }}")]); + let path = Path::new("src/my-app/main.rs"); + let result = apply_path_replacements(path, &rules); + assert_eq!(result, PathBuf::from("src/{{ name }}/main.rs")); + } + + #[test] + fn replaces_in_filename() { + let rules = sorted(vec![rule("my-app", "{{ name }}")]); + let path = Path::new("my-app.toml"); + let result = apply_path_replacements(path, &rules); + assert_eq!(result, PathBuf::from("{{ name }}.toml")); + } + + #[test] + fn no_match_across_path_separator() { + // "src/app" should not match "src/app" as a single literal — each + // component is replaced independently. + let rules = sorted(vec![rule("src/app", "{{ x }}")]); + let path = Path::new("src/app/main.rs"); + let result = apply_path_replacements(path, &rules); + // Should be unchanged because "/" is a component separator, not part + // of any single component. + assert_eq!(result, PathBuf::from("src/app/main.rs")); + } +}