Skip to content

Commit 990a765

Browse files
authored
feat(swe): improve dataset generation pipeline with validation and progress monitoring (#13)
* refactor(model): replace openai/gpt-5.2-codex:nitro with moonshotai/kimi-k2.5:nitro

* feat(swe): improve dataset quality, speed, and validation pipeline

Overhaul the synthetic dataset generator to produce higher-quality benchmarks faster and with better validation:

Quality improvements:
- Add min_description_length filter (30 chars) to reject PRs with empty or very short descriptions, preventing blank/useless benchmark tasks
- Strip repository names, PR numbers, and GitHub URLs from generated prompts via post-processing in prompt_rewriter to avoid leaking project identity into benchmark tasks
- Add test script validation (validate_test_scripts) that checks shell scripts have shebang lines, are non-empty, and that referenced test files actually exist in the submission set — with retry loop support

Observability:
- Add new progress module with ProgressCounters (shared atomics) and ProgressMonitor (background tokio task) that logs pipeline stats (filtered/extracted/scored/accepted) every 30s with ETA percentage
- Wire progress monitor into SweOrchestrator::run lifecycle

Build config:
- Simplify .cargo/config.toml linker to use cc instead of clang+mold for broader compatibility
1 parent 9cc14a3 commit 990a765

14 files changed

Lines changed: 375 additions & 26 deletions

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ swe-forge swe mine [OPTIONS]
317317
318318
Options:
319319
-o, --output <DIR> Output directory [default: ./swe-datasets]
320-
-m, --model <MODEL> OpenRouter model [default: openai/gpt-5.2-codex:nitro]
320+
-m, --model <MODEL> OpenRouter model [default: moonshotai/kimi-k2.5:nitro]
321321
-n, --max-tasks <N> Number of tasks to generate [default: 1]
322322
-d, --difficulty <LEVEL> Filter: easy, medium, hard [optional]
323323
--min-stars <N> Minimum repo stars [default: 20]
@@ -410,7 +410,7 @@ GitHub API allows 5000 requests/hour per token. The pipeline processes candidate
410410

411411
### Model selection
412412

413-
The default model is `openai/gpt-5.2-codex:nitro` via OpenRouter. Any OpenRouter-compatible model that supports function calling can be used:
413+
The default model is `moonshotai/kimi-k2.5:nitro` via OpenRouter. Any OpenRouter-compatible model that supports function calling can be used:
414414

415415
```bash
416416
cargo run -- swe mine --model anthropic/claude-sonnet-4 --max-tasks 5

src/cli/AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Clap-based CLI interface. Defines all commands, argument parsing, and command di
2626
## Rules
2727

2828
- Use `anyhow::Result` for command handler return types
29-
- Default model constant: `DEFAULT_MODEL = "openai/gpt-5.2-codex:nitro"`
29+
- Default model constant: `DEFAULT_MODEL = "moonshotai/kimi-k2.5:nitro"`
3030
- Default output dirs: `./generated-datasets` (generate), `./generated-swe` (swe mine)
3131
- Global `--log-level` arg controls tracing filter
3232
- API keys come from env vars or CLI args (env var takes precedence)

src/cli/commands.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use std::sync::Arc;
2020
use tracing::{info, warn};
2121

2222
/// Default model to use for generation.
23-
const DEFAULT_MODEL: &str = "openai/gpt-5.2-codex:nitro";
23+
const DEFAULT_MODEL: &str = "moonshotai/kimi-k2.5:nitro";
2424

2525
/// Default output directory for generated datasets.
2626
const DEFAULT_OUTPUT_DIR: &str = "./generated-datasets";
@@ -32,7 +32,7 @@ const DEFAULT_SWE_OUTPUT_DIR: &str = "./generated-swe";
3232
#[command(about = "Generate SWE-derived benchmark datasets for LLM evaluation")]
3333
#[command(version)]
3434
#[command(
35-
long_about = "swe_forge generates SWE-derived terminal/CLI benchmark tasks from mined GitHub PRs.\n\nTasks are validated and exported as workspace artifacts (workspace.yaml + prompt.md).\n\nExample usage:\n swe_forge generate --count 5 --model openai/gpt-5.2-codex:nitro --output ./generated-datasets"
35+
long_about = "swe_forge generates SWE-derived terminal/CLI benchmark tasks from mined GitHub PRs.\n\nTasks are validated and exported as workspace artifacts (workspace.yaml + prompt.md).\n\nExample usage:\n swe_forge generate --count 5 --model moonshotai/kimi-k2.5:nitro --output ./generated-datasets"
3636
)]
3737
pub struct Cli {
3838
/// The subcommand to execute.
@@ -1802,7 +1802,7 @@ mod tests {
18021802
fn test_generation_output_serialization() {
18031803
let output = GenerationOutput {
18041804
status: "success".to_string(),
1805-
model: "openai/gpt-5.2-codex:nitro".to_string(),
1805+
model: "moonshotai/kimi-k2.5:nitro".to_string(),
18061806
tasks: vec![GeneratedTaskOutput {
18071807
task_id: "swe_forge-task-001".to_string(),
18081808
category: "debugging".to_string(),
@@ -1820,7 +1820,7 @@ mod tests {
18201820

18211821
// Verify key fields are present in output
18221822
assert!(json.contains("\"status\": \"success\""));
1823-
assert!(json.contains("\"model\": \"openai/gpt-5.2-codex:nitro\""));
1823+
assert!(json.contains("\"model\": \"moonshotai/kimi-k2.5:nitro\""));
18241824
assert!(json.contains("\"task_id\": \"swe_forge-task-001\""));
18251825
assert!(json.contains("\"category\": \"debugging\""));
18261826
assert!(json.contains("\"total_duration_ms\": 5000"));
@@ -1893,7 +1893,7 @@ mod tests {
18931893
fn test_evaluation_output_serialization() {
18941894
let output = EvaluationOutput {
18951895
status: "success".to_string(),
1896-
model: "openai/gpt-5.2-codex:nitro".to_string(),
1896+
model: "moonshotai/kimi-k2.5:nitro".to_string(),
18971897
total_tasks: 3,
18981898
successful_tasks: 2,
18991899
success_rate: 0.667,
@@ -1935,7 +1935,7 @@ mod tests {
19351935

19361936
// Verify key fields are present in output
19371937
assert!(json.contains("\"status\": \"success\""));
1938-
assert!(json.contains("\"model\": \"openai/gpt-5.2-codex:nitro\""));
1938+
assert!(json.contains("\"model\": \"moonshotai/kimi-k2.5:nitro\""));
19391939
assert!(json.contains("\"total_tasks\": 3"));
19401940
assert!(json.contains("\"successful_tasks\": 2"));
19411941
assert!(json.contains("\"success_rate\": 0.667"));

src/llm/AGENTS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,6 @@ LLM integration layer providing an OpenAI-compatible API client with function ca
3333

3434
- Always use `tools` + `tool_choice: "required"` for structured output — never parse free-form text
3535
- Provider trait objects must be `Send + Sync` (used as `Arc<dyn LlmProvider>`)
36-
- Default model: `openai/gpt-5.2-codex:nitro` (set in `src/cli/commands.rs`)
36+
- Default model: `moonshotai/kimi-k2.5:nitro` (set in `src/cli/commands.rs`)
3737
- Cost tracking is optional but should be used when available
3838
- Cache keys are content hashes (`sha2`) — not message indices

src/llm/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@
3535
//!
3636
//! // Add providers
3737
//! let provider = Arc::new(OpenRouterProvider::new("api-key".to_string()));
38-
//! router.add_provider(provider, "openai/gpt-5.2-codex:nitro");
38+
//! router.add_provider(provider, "moonshotai/kimi-k2.5:nitro");
3939
//!
4040
//! // Add model capabilities for cost optimization
41-
//! router.add_model_capabilities(ModelCapabilities::new("openai/gpt-5.2-codex:nitro")
41+
//! router.add_model_capabilities(ModelCapabilities::new("moonshotai/kimi-k2.5:nitro")
4242
//! .with_pricing(0.5, 1.5)
4343
//! .with_coding_score(0.8));
4444
//! ```

src/llm/providers/openrouter.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use crate::llm::{
2020
const OPENROUTER_BASE_URL: &str = "https://openrouter.ai/api/v1";
2121

2222
/// Default model to use if none specified.
23-
const DEFAULT_MODEL: &str = "openai/gpt-5.2-codex:nitro";
23+
const DEFAULT_MODEL: &str = "moonshotai/kimi-k2.5:nitro";
2424

2525
/// Maximum number of retry attempts for transient failures.
2626
const MAX_RETRIES: u32 = 3;
@@ -49,7 +49,7 @@ pub struct OpenRouterProvider {
4949
impl OpenRouterProvider {
5050
/// Create a new OpenRouter provider with the given API key.
5151
///
52-
/// Uses the default model (`openai/gpt-5.2-codex:nitro`) and base URL.
52+
/// Uses the default model (`moonshotai/kimi-k2.5:nitro`) and base URL.
5353
///
5454
/// # Arguments
5555
///
@@ -695,7 +695,7 @@ mod tests {
695695
#[test]
696696
fn test_api_request_serialization_with_reasoning_effort() {
697697
let request = ApiRequest {
698-
model: "openai/gpt-5.2-codex:nitro".to_string(),
698+
model: "moonshotai/kimi-k2.5:nitro".to_string(),
699699
messages: vec![Message::user("Hello")],
700700
temperature: Some(0.7),
701701
max_tokens: Some(16000),

src/swe/filters.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ pub struct FilterConfig {
88
pub min_added_lines: usize,
99
pub max_added_lines: usize,
1010
pub allowed_languages: Vec<String>,
11+
/// Minimum combined length of the trimmed PR title + trimmed body, measured in UTF-8 bytes (`str::len`), to accept a candidate.
12+
/// PRs with empty or very short descriptions are unlikely to produce good benchmark tasks.
13+
pub min_description_length: usize,
1114
}
1215

1316
impl Default for FilterConfig {
@@ -26,6 +29,7 @@ impl Default for FilterConfig {
2629
"rust".to_string(),
2730
"java".to_string(),
2831
],
32+
min_description_length: 30,
2933
}
3034
}
3135
}
@@ -51,13 +55,27 @@ impl SweepFilter {
5155
Self::new(FilterConfig::default())
5256
}
5357

58+
/// Evaluate whether a PR candidate should be kept for further processing.
59+
///
60+
/// # Arguments
61+
///
62+
/// * `language` - Primary language of the repository
63+
/// * `stars` - Repository star count (0 means unknown)
64+
/// * `files_changed` - Number of files changed in the PR
65+
/// * `added_lines` - Number of lines added
66+
/// * `changed_files` - List of changed file paths
67+
/// * `title` - PR title
68+
/// * `body` - PR body/description
69+
#[allow(clippy::too_many_arguments)]
5470
pub fn keep_candidate(
5571
&self,
5672
language: &str,
5773
stars: u32,
5874
files_changed: usize,
5975
added_lines: usize,
6076
changed_files: &[String],
77+
title: &str,
78+
body: &str,
6179
) -> FilterResult {
6280
let mut reasons = Vec::new();
6381
let mut score = 1.0f64;
@@ -119,6 +137,16 @@ impl SweepFilter {
119137
score -= 0.3;
120138
}
121139

140+
// Reject PRs with empty or very short descriptions
141+
let description_len = title.trim().len() + body.trim().len();
142+
if description_len < self.config.min_description_length {
143+
reasons.push(format!(
144+
"PR description too short ({description_len} chars, minimum {})",
145+
self.config.min_description_length
146+
));
147+
score -= 0.4;
148+
}
149+
122150
let accepted = reasons.is_empty();
123151
if accepted {
124152
reasons.push("candidate accepted".to_string());

src/swe/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub mod harness;
2121
pub mod orchestrator;
2222
pub mod pipeline;
2323
pub mod pr_cache;
24+
pub mod progress;
2425
pub mod prompt_rewriter;
2526
pub mod quality;
2627
pub mod test_generator;
@@ -33,6 +34,7 @@ pub use harness::{run_harness, HarnessConfig, HarnessResult, HarnessSummary};
3334
pub use orchestrator::{SweOrchestrator, SweOrchestratorConfig, SweRunResult};
3435
pub use pipeline::{SwePipeline, SwePipelineEvent, SwePipelineRunResult};
3536
pub use pr_cache::{OptionalCache, PrCache, PrCacheEntry};
37+
pub use progress::{ProgressCounters, ProgressMonitor, ProgressSnapshot};
3638
pub use prompt_rewriter::PromptRewriter;
3739
pub use quality::{QualityAssessment, QualityConfig, QualityScorer};
3840
pub use test_generator::{TestFile, TestGenerator};

src/swe/orchestrator.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@ use std::sync::Arc;
66

77
use serde::{Deserialize, Serialize};
88

9+
use std::time::Duration;
10+
911
use crate::export::{DatasetConfig, DatasetManager, HfUploadConfig};
1012
use crate::llm::LlmProvider;
1113
use crate::swe::pipeline::{DatasetHandle, ExportConfig, SwePipelineConfig};
14+
use crate::swe::progress::{ProgressCounters, ProgressMonitor};
1215
use crate::swe::{SwePipelineRunResult, SweTask};
1316

1417
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -182,6 +185,10 @@ impl SweOrchestrator {
182185
Some(Arc::new(DatasetManager::new(ds_config).await?))
183186
};
184187

188+
// Start background progress monitor (logs every 30 seconds)
189+
let progress_counters = ProgressCounters::new();
190+
let monitor = ProgressMonitor::start(progress_counters, max_tasks, Duration::from_secs(30));
191+
185192
let pipeline = crate::swe::pipeline::SwePipeline::new(&pipeline_config, self.llm.clone())?;
186193
let run: SwePipelineRunResult = pipeline
187194
.run_full(
@@ -192,6 +199,8 @@ impl SweOrchestrator {
192199
)
193200
.await?;
194201

202+
monitor.stop().await;
203+
195204
// Finalize dataset: flush remaining shard, write combined parquet, upload splits
196205
if let Some(ref ds) = dataset_handle {
197206
match ds.finalize().await {

src/swe/pipeline.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,8 @@ impl SwePipeline {
361361
enriched.files_changed,
362362
added_lines,
363363
&enriched.changed_files,
364+
&enriched.title,
365+
&enriched.body,
364366
);
365367
filtered_count.fetch_add(1, Ordering::Relaxed);
366368
if !filter_result.accepted {
@@ -534,12 +536,7 @@ impl SwePipeline {
534536
.await
535537
{
536538
Ok(rewritten) => {
537-
task.prompt = format!(
538-
"{repo} (#{pr}): {title}\n\n{rewritten}",
539-
repo = enriched.repository,
540-
pr = enriched.number,
541-
title = enriched.title,
542-
);
539+
task.prompt = rewritten;
543540
}
544541
Err(err) => {
545542
tracing::warn!(task_id = %task.id, error = %err, "Prompt rewrite failed");

0 commit comments

Comments
 (0)