feat(swe): improve dataset generation pipeline with validation and progress monitoring (#13)

echobt · web-flow · commit 990a76563125 · 2026-02-17T19:49:48.000+04:00
* refactor(model): replace default model openai/gpt-5.2-codex:nitro with moonshotai/kimi-k2.5:nitro

* feat(swe): improve dataset quality, speed, and validation pipeline

Overhaul the synthetic dataset generator to produce higher-quality
benchmarks faster and with better validation:

Quality improvements:
- Add min_description_length filter (30 chars) to reject PRs with empty
  or very short descriptions, preventing blank/useless benchmark tasks
- Strip repository names, PR numbers, and GitHub URLs from generated
  prompts via post-processing in prompt_rewriter to avoid leaking
  project identity into benchmark tasks
- Add test script validation (validate_test_scripts) that checks that
  shell scripts are non-empty and start with a shebang line, and that
  referenced test files actually exist in the submission set — with
  retry loop support

Observability:
- Add new progress module with ProgressCounters (shared atomics) and
  ProgressMonitor (background tokio task) that logs pipeline stats
  (filtered/extracted/scored/accepted) every 30s with ETA percentage
- Wire progress monitor into SweOrchestrator::run lifecycle

Build config:
- Simplify .cargo/config.toml linker to use cc instead of clang+mold
  for broader compatibility
diff --git a/README.md b/README.md
@@ -317,7 +317,7 @@ swe-forge swe mine [OPTIONS]
 
 Options:
   -o, --output <DIR>          Output directory [default: ./swe-datasets]
-  -m, --model <MODEL>         OpenRouter model [default: openai/gpt-5.2-codex:nitro]
+  -m, --model <MODEL>         OpenRouter model [default: moonshotai/kimi-k2.5:nitro]
   -n, --max-tasks <N>         Number of tasks to generate [default: 1]
   -d, --difficulty <LEVEL>    Filter: easy, medium, hard [optional]
       --min-stars <N>         Minimum repo stars [default: 20]
@@ -410,7 +410,7 @@ GitHub API allows 5000 requests/hour per token. The pipeline processes candidate
 
 ### Model selection
 
-The default model is `openai/gpt-5.2-codex:nitro` via OpenRouter. Any OpenRouter-compatible model that supports function calling can be used:
+The default model is `moonshotai/kimi-k2.5:nitro` via OpenRouter. Any OpenRouter-compatible model that supports function calling can be used:
 
 ```bash
 cargo run -- swe mine --model anthropic/claude-sonnet-4 --max-tasks 5
diff --git a/src/cli/AGENTS.md b/src/cli/AGENTS.md
@@ -26,7 +26,7 @@ Clap-based CLI interface. Defines all commands, argument parsing, and command di
 ## Rules
 
 - Use `anyhow::Result` for command handler return types
-- Default model constant: `DEFAULT_MODEL = "openai/gpt-5.2-codex:nitro"`
+- Default model constant: `DEFAULT_MODEL = "moonshotai/kimi-k2.5:nitro"`
 - Default output dirs: `./generated-datasets` (generate), `./generated-swe` (swe mine)
 - Global `--log-level` arg controls tracing filter
 - API keys come from env vars or CLI args (env var takes precedence)
diff --git a/src/cli/commands.rs b/src/cli/commands.rs
@@ -20,7 +20,7 @@ use std::sync::Arc;
 use tracing::{info, warn};
 
 /// Default model to use for generation.
-const DEFAULT_MODEL: &str = "openai/gpt-5.2-codex:nitro";
+const DEFAULT_MODEL: &str = "moonshotai/kimi-k2.5:nitro";
 
 /// Default output directory for generated datasets.
 const DEFAULT_OUTPUT_DIR: &str = "./generated-datasets";
@@ -32,7 +32,7 @@ const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
 #[command(about = "Generate SWE-derived benchmark datasets for LLM evaluation")]
 #[command(version)]
 #[command(
-    long_about = "swe_forge generates SWE-derived terminal/CLI benchmark tasks from mined GitHub PRs.\n\nTasks are validated and exported as workspace artifacts (workspace.yaml + prompt.md).\n\nExample usage:\n  swe_forge generate --count 5 --model openai/gpt-5.2-codex:nitro --output ./generated-datasets"
+    long_about = "swe_forge generates SWE-derived terminal/CLI benchmark tasks from mined GitHub PRs.\n\nTasks are validated and exported as workspace artifacts (workspace.yaml + prompt.md).\n\nExample usage:\n  swe_forge generate --count 5 --model moonshotai/kimi-k2.5:nitro --output ./generated-datasets"
 )]
 pub struct Cli {
     /// The subcommand to execute.
@@ -1802,7 +1802,7 @@ mod tests {
     fn test_generation_output_serialization() {
         let output = GenerationOutput {
             status: "success".to_string(),
-            model: "openai/gpt-5.2-codex:nitro".to_string(),
+            model: "moonshotai/kimi-k2.5:nitro".to_string(),
             tasks: vec![GeneratedTaskOutput {
                 task_id: "swe_forge-task-001".to_string(),
                 category: "debugging".to_string(),
@@ -1820,7 +1820,7 @@ mod tests {
 
         // Verify key fields are present in output
         assert!(json.contains("\"status\": \"success\""));
-        assert!(json.contains("\"model\": \"openai/gpt-5.2-codex:nitro\""));
+        assert!(json.contains("\"model\": \"moonshotai/kimi-k2.5:nitro\""));
         assert!(json.contains("\"task_id\": \"swe_forge-task-001\""));
         assert!(json.contains("\"category\": \"debugging\""));
         assert!(json.contains("\"total_duration_ms\": 5000"));
@@ -1893,7 +1893,7 @@ mod tests {
     fn test_evaluation_output_serialization() {
         let output = EvaluationOutput {
             status: "success".to_string(),
-            model: "openai/gpt-5.2-codex:nitro".to_string(),
+            model: "moonshotai/kimi-k2.5:nitro".to_string(),
             total_tasks: 3,
             successful_tasks: 2,
             success_rate: 0.667,
@@ -1935,7 +1935,7 @@ mod tests {
 
         // Verify key fields are present in output
         assert!(json.contains("\"status\": \"success\""));
-        assert!(json.contains("\"model\": \"openai/gpt-5.2-codex:nitro\""));
+        assert!(json.contains("\"model\": \"moonshotai/kimi-k2.5:nitro\""));
         assert!(json.contains("\"total_tasks\": 3"));
         assert!(json.contains("\"successful_tasks\": 2"));
         assert!(json.contains("\"success_rate\": 0.667"));
diff --git a/src/llm/AGENTS.md b/src/llm/AGENTS.md
@@ -33,6 +33,6 @@ LLM integration layer providing an OpenAI-compatible API client with function ca
 
 - Always use `tools` + `tool_choice: "required"` for structured output — never parse free-form text
 - Provider trait objects must be `Send + Sync` (used as `Arc<dyn LlmProvider>`)
-- Default model: `openai/gpt-5.2-codex:nitro` (set in `src/cli/commands.rs`)
+- Default model: `moonshotai/kimi-k2.5:nitro` (set in `src/cli/commands.rs`)
 - Cost tracking is optional but should be used when available
 - Cache keys are content hashes (`sha2`) — not message indices
diff --git a/src/llm/mod.rs b/src/llm/mod.rs
@@ -35,10 +35,10 @@
 //!
 //! // Add providers
 //! let provider = Arc::new(OpenRouterProvider::new("api-key".to_string()));
-//! router.add_provider(provider, "openai/gpt-5.2-codex:nitro");
+//! router.add_provider(provider, "moonshotai/kimi-k2.5:nitro");
 //!
 //! // Add model capabilities for cost optimization
-//! router.add_model_capabilities(ModelCapabilities::new("openai/gpt-5.2-codex:nitro")
+//! router.add_model_capabilities(ModelCapabilities::new("moonshotai/kimi-k2.5:nitro")
 //!     .with_pricing(0.5, 1.5)
 //!     .with_coding_score(0.8));
 //! ```
diff --git a/src/llm/providers/openrouter.rs b/src/llm/providers/openrouter.rs
@@ -20,7 +20,7 @@ use crate::llm::{
 const OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
 
 /// Default model to use if none specified.
-const DEFAULT_MODEL: &str = "openai/gpt-5.2-codex:nitro";
+const DEFAULT_MODEL: &str = "moonshotai/kimi-k2.5:nitro";
 
 /// Maximum number of retry attempts for transient failures.
 const MAX_RETRIES: u32 = 3;
@@ -49,7 +49,7 @@ pub struct OpenRouterProvider {
 impl OpenRouterProvider {
     /// Create a new OpenRouter provider with the given API key.
     ///
-    /// Uses the default model (`openai/gpt-5.2-codex:nitro`) and base URL.
+    /// Uses the default model (`moonshotai/kimi-k2.5:nitro`) and base URL.
     ///
     /// # Arguments
     ///
@@ -695,7 +695,7 @@ mod tests {
     #[test]
     fn test_api_request_serialization_with_reasoning_effort() {
         let request = ApiRequest {
-            model: "openai/gpt-5.2-codex:nitro".to_string(),
+            model: "moonshotai/kimi-k2.5:nitro".to_string(),
             messages: vec![Message::user("Hello")],
             temperature: Some(0.7),
             max_tokens: Some(16000),
diff --git a/src/swe/filters.rs b/src/swe/filters.rs
@@ -8,6 +8,9 @@ pub struct FilterConfig {
     pub min_added_lines: usize,
     pub max_added_lines: usize,
     pub allowed_languages: Vec<String>,
+    /// Minimum combined length of the trimmed PR title + body, in bytes, to accept a candidate.
+    /// PRs with empty or very short descriptions are unlikely to produce good benchmark tasks.
+    pub min_description_length: usize,
 }
 
 impl Default for FilterConfig {
@@ -26,6 +29,7 @@ impl Default for FilterConfig {
                 "rust".to_string(),
                 "java".to_string(),
             ],
+            min_description_length: 30,
         }
     }
 }
@@ -51,13 +55,27 @@ impl SweepFilter {
         Self::new(FilterConfig::default())
     }
 
+    /// Evaluate whether a PR candidate should be kept for further processing.
+    ///
+    /// # Arguments
+    ///
+    /// * `language` - Primary language of the repository
+    /// * `stars` - Repository star count (0 means unknown)
+    /// * `files_changed` - Number of files changed in the PR
+    /// * `added_lines` - Number of lines added
+    /// * `changed_files` - List of changed file paths
+    /// * `title` - PR title
+    /// * `body` - PR body/description
+    #[allow(clippy::too_many_arguments)]
     pub fn keep_candidate(
         &self,
         language: &str,
         stars: u32,
         files_changed: usize,
         added_lines: usize,
         changed_files: &[String],
+        title: &str,
+        body: &str,
     ) -> FilterResult {
         let mut reasons = Vec::new();
         let mut score = 1.0f64;
@@ -119,6 +137,16 @@ impl SweepFilter {
             score -= 0.3;
         }
 
+        // Reject PRs with empty or very short descriptions
+        let description_len = title.trim().len() + body.trim().len();
+        if description_len < self.config.min_description_length {
+            reasons.push(format!(
+                "PR description too short ({description_len} chars, minimum {})",
+                self.config.min_description_length
+            ));
+            score -= 0.4;
+        }
+
         let accepted = reasons.is_empty();
         if accepted {
             reasons.push("candidate accepted".to_string());
diff --git a/src/swe/mod.rs b/src/swe/mod.rs
@@ -21,6 +21,7 @@ pub mod harness;
 pub mod orchestrator;
 pub mod pipeline;
 pub mod pr_cache;
+pub mod progress;
 pub mod prompt_rewriter;
 pub mod quality;
 pub mod test_generator;
@@ -33,6 +34,7 @@ pub use harness::{run_harness, HarnessConfig, HarnessResult, HarnessSummary};
 pub use orchestrator::{SweOrchestrator, SweOrchestratorConfig, SweRunResult};
 pub use pipeline::{SwePipeline, SwePipelineEvent, SwePipelineRunResult};
 pub use pr_cache::{OptionalCache, PrCache, PrCacheEntry};
+pub use progress::{ProgressCounters, ProgressMonitor, ProgressSnapshot};
 pub use prompt_rewriter::PromptRewriter;
 pub use quality::{QualityAssessment, QualityConfig, QualityScorer};
 pub use test_generator::{TestFile, TestGenerator};
diff --git a/src/swe/orchestrator.rs b/src/swe/orchestrator.rs
@@ -6,9 +6,12 @@ use std::sync::Arc;
 
 use serde::{Deserialize, Serialize};
 
+use std::time::Duration;
+
 use crate::export::{DatasetConfig, DatasetManager, HfUploadConfig};
 use crate::llm::LlmProvider;
 use crate::swe::pipeline::{DatasetHandle, ExportConfig, SwePipelineConfig};
+use crate::swe::progress::{ProgressCounters, ProgressMonitor};
 use crate::swe::{SwePipelineRunResult, SweTask};
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -182,6 +185,10 @@ impl SweOrchestrator {
             Some(Arc::new(DatasetManager::new(ds_config).await?))
         };
 
+        // Start background progress monitor (logs every 30 seconds)
+        let progress_counters = ProgressCounters::new();
+        let monitor = ProgressMonitor::start(progress_counters, max_tasks, Duration::from_secs(30));
+
         let pipeline = crate::swe::pipeline::SwePipeline::new(&pipeline_config, self.llm.clone())?;
         let run: SwePipelineRunResult = pipeline
             .run_full(
@@ -192,6 +199,8 @@ impl SweOrchestrator {
             )
             .await?;
 
+        monitor.stop().await;
+
         // Finalize dataset: flush remaining shard, write combined parquet, upload splits
         if let Some(ref ds) = dataset_handle {
             match ds.finalize().await {
diff --git a/src/swe/pipeline.rs b/src/swe/pipeline.rs
@@ -361,6 +361,8 @@ impl SwePipeline {
                         enriched.files_changed,
                         added_lines,
                         &enriched.changed_files,
+                        &enriched.title,
+                        &enriched.body,
                     );
                     filtered_count.fetch_add(1, Ordering::Relaxed);
                     if !filter_result.accepted {
@@ -534,12 +536,7 @@ impl SwePipeline {
                         .await
                     {
                         Ok(rewritten) => {
-                            task.prompt = format!(
-                                "{repo} (#{pr}): {title}\n\n{rewritten}",
-                                repo = enriched.repository,
-                                pr = enriched.number,
-                                title = enriched.title,
-                            );
+                            task.prompt = rewritten;
                         }
                         Err(err) => {
                             tracing::warn!(task_id = %task.id, error = %err, "Prompt rewrite failed");
diff --git a/src/swe/progress.rs b/src/swe/progress.rs
diff --git a/src/swe/prompt_rewriter.rs b/src/swe/prompt_rewriter.rs
diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
diff --git a/test-run/VALIDATION_REPORT.md b/test-run/VALIDATION_REPORT.md