Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
353 changes: 353 additions & 0 deletions benchmark_validation_report.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/swe/docker_sandbox.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ impl DockerSandbox {

// Clone the repository
let clone_cmd = format!(
"git clone --depth 50 https://github.com/{}.git /repo 2>&1",
"git clone --depth 500 https://github.com/{}.git /repo 2>&1",
repo
);
let clone = sandbox.exec(&clone_cmd, 180_000).await;
Expand Down
53 changes: 52 additions & 1 deletion src/swe/filters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ impl SweepFilter {
language: &str,
stars: u32,
files_changed: usize,
_added_lines: usize,
added_lines: usize,
changed_files: &[String],
) -> FilterResult {
let mut reasons = Vec::new();
let mut score = 1.0f64;
Expand Down Expand Up @@ -97,6 +98,27 @@ impl SweepFilter {
score -= 0.25;
}

if added_lines > 0 && added_lines < self.config.min_added_lines {
reasons.push(format!(
"added lines {} below minimum {}",
added_lines, self.config.min_added_lines
));
score -= 0.2;
}

if added_lines > self.config.max_added_lines {
reasons.push(format!(
"added lines {} above maximum {}",
added_lines, self.config.max_added_lines
));
score -= 0.2;
}

if !changed_files.is_empty() && Self::is_docs_only_change(changed_files) {
reasons.push("all changed files are documentation/config only".to_string());
score -= 0.3;
}

let accepted = reasons.is_empty();
if accepted {
reasons.push("candidate accepted".to_string());
Expand All @@ -108,4 +130,33 @@ impl SweepFilter {
reasons,
}
}

/// Returns `true` when every path in `files` looks like documentation, plain
/// configuration, or a static asset rather than source code.
///
/// A path qualifies when, compared case-insensitively on its final `/`-separated
/// component, either:
///
/// * it has a real extension (text after the last `.`) from the documentation /
///   config / image list below, or
/// * it is one of the well-known repository files (`README`, `LICENSE`,
///   `.gitignore`, ...), optionally followed by a `.` or `-` suffix such as
///   `LICENSE-MIT` or `README.zh-CN`.
///
/// The well-known name must end at a word boundary, so `readme_parser.py` or
/// `licensed.rs` are treated as code. A name with no `.` at all is never treated
/// as an extension, so an extensionless file called `json` is treated as code.
///
/// NOTE(review): a file such as `authors.py` still matches via the well-known
/// name `authors`; tighten further if that turns out to matter in practice.
///
/// An empty `files` slice yields `true` (nothing contradicts the predicate), so
/// callers that care should check for emptiness first.
fn is_docs_only_change(files: &[String]) -> bool {
    const DOC_EXTENSIONS: &[&str] = &[
        "md", "txt", "yml", "yaml", "json", "toml", "ini", "cfg", "rst", "adoc", "csv", "svg",
        "png", "jpg", "jpeg", "gif", "ico",
    ];
    const DOC_NAMES: &[&str] = &[
        "readme",
        "changelog",
        "license",
        "licence",
        "contributing",
        "authors",
        "codeowners",
        "code_of_conduct",
        ".gitignore",
        ".editorconfig",
        ".prettierrc",
        ".eslintignore",
    ];

    files.iter().all(|file| {
        let lower = file.to_lowercase();
        let basename = lower.rsplit('/').next().unwrap_or(&lower);

        // Only text after an actual '.' counts as an extension; `rsplit_once`
        // returns `None` for dot-less names instead of echoing the whole name.
        let has_doc_extension = matches!(
            basename.rsplit_once('.'),
            Some((_, ext)) if DOC_EXTENSIONS.contains(&ext)
        );

        // The well-known name must be the whole basename or be followed by a
        // '.'/'-' separator, which rules out identifiers that merely share a prefix.
        let has_doc_name = DOC_NAMES.iter().any(|name| match basename.strip_prefix(*name) {
            Some(rest) => matches!(rest.chars().next(), None | Some('.' | '-')),
            None => false,
        });

        has_doc_extension || has_doc_name
    })
}
}
73 changes: 69 additions & 4 deletions src/swe/harness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,28 @@ async fn docker_rm(container: &str) {
.await;
}

/// Writes `content` to `/repo/<path>` inside the running container `container`.
///
/// The bytes are streamed over the stdin of
/// `docker exec -i -w /repo <container> bash -c 'cat > "$1"' bash /repo/<path>`.
/// The destination is handed to bash as a positional argument rather than being
/// spliced into the script text, so quotes or other shell metacharacters in
/// `path` cannot change the command that runs.
///
/// # Errors
///
/// Returns an error when the `docker` process cannot be spawned or awaited,
/// when the in-container command exits unsuccessfully (the message carries its
/// stderr), or when writing `content` to the child's stdin fails.
async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
    use tokio::io::AsyncWriteExt;

    let target = format!("/repo/{}", path);
    let mut child = Command::new("docker")
        .args([
            "exec",
            "-i",
            "-w",
            "/repo",
            container,
            "bash",
            "-c",
            // `$1` is the destination path; the argument after the script is `$0`.
            "cat > \"$1\"",
            "bash",
            &target,
        ])
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::piped())
        .spawn()?;

    // Take ownership of stdin so it is closed as soon as the write finishes,
    // and hold on to any write error instead of returning early: the child must
    // still be awaited, and its stderr usually explains why the pipe broke.
    let write_result = match child.stdin.take() {
        Some(mut stdin) => match stdin.write_all(content.as_bytes()).await {
            Ok(()) => stdin.shutdown().await,
            Err(e) => Err(e),
        },
        None => Ok(()),
    };

    let output = child.wait_with_output().await?;
    if !output.status.success() {
        anyhow::bail!("write failed: {}", String::from_utf8_lossy(&output.stderr));
    }
    write_result?;
    Ok(())
}

fn container_name(task_id: &str) -> String {
let safe = task_id.replace('/', "-").replace(' ', "_");
format!("swe-harness-{safe}")
Expand Down Expand Up @@ -186,6 +208,14 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
let agent_dir_abs =
std::fs::canonicalize(&config.agent_dir).unwrap_or_else(|_| config.agent_dir.clone());

// Auto-select Docker image based on task language unless overridden
let docker_image = if config.docker_image == "python:3.12-slim" && task.language != "unknown" {
super::docker_sandbox::image_for_language(&task.language).to_string()
} else {
config.docker_image.clone()
};
info!(task_id = %task.id, language = %task.language, image = %docker_image, "Selected Docker image");

// Remove stale container if exists
docker_rm(&cname).await;

Expand All @@ -202,7 +232,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
&format!("{}:/agent:ro", agent_dir_abs.display()),
"-w",
"/repo",
&config.docker_image,
&docker_image,
"sleep",
"7200",
])
Expand Down Expand Up @@ -243,7 +273,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult

// Clone repo
let clone_cmd = format!(
"git clone --depth 100 https://github.com/{}.git /repo 2>&1",
"git clone --depth 500 https://github.com/{}.git /repo 2>&1",
task.repo
);
let (code, _, err) = docker_exec(&cname, &clone_cmd, 180).await;
Expand All @@ -261,8 +291,26 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
)
.await;
if code != 0 {
result.error = Some(format!("Checkout failed: {}", truncate(&err, 500)));
return result;
info!(task_id = %task.id, "Shallow clone missed commit, fetching full history...");
let (fcode, _, _ferr) =
docker_exec(&cname, "cd /repo && git fetch --unshallow 2>&1", 300).await;
if fcode != 0 {
result.error = Some(format!(
"Checkout failed (even after unshallow): {}",
truncate(&err, 500)
));
return result;
}
let (code2, _, err2) = docker_exec(
&cname,
&format!("cd /repo && git checkout {} --force 2>&1", task.base_commit),
60,
)
.await;
if code2 != 0 {
result.error = Some(format!("Checkout failed: {}", truncate(&err2, 500)));
return result;
}
}
}

Expand All @@ -289,6 +337,23 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
warn!(task_id = %task.id, "Agent requirements install returned non-zero (continuing)");
}

// Copy test files into container
if let Some(test_files_json) = task.meta.get("test_files") {
if let Ok(files) =
serde_json::from_str::<Vec<super::test_generator::TestFile>>(test_files_json)
{
for tf in &files {
let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
docker_exec(&cname, &mkdir_cmd, 10).await;
let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;
if let Err(e) = write_result {
warn!(task_id = %task.id, path = %tf.path, "Failed to copy test file: {}", e);
}
}
info!(task_id = %task.id, "Copied {} test files into container", files.len());
}
}

// 2. SANITY CHECK: fail_to_pass must fail, pass_to_pass must pass
info!(task_id = %task.id, "Running sanity checks...");

Expand Down
1 change: 1 addition & 0 deletions src/swe/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ impl SwePipeline {
enriched.stars,
enriched.files_changed,
added_lines,
&enriched.changed_files,
);
filtered_count.fetch_add(1, Ordering::Relaxed);
if !filter_result.accepted {
Expand Down
4 changes: 2 additions & 2 deletions src/swe/quality.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub struct QualityConfig {
impl Default for QualityConfig {
fn default() -> Self {
Self {
min_quality_score: 0.1,
min_quality_score: 0.25,
}
}
}
Expand Down Expand Up @@ -335,7 +335,7 @@ impl QualityScorer {
};

let score = classification.score.clamp(0.0, 1.0);
let passed = score >= self.config.min_quality_score;
let passed = score >= self.config.min_quality_score && classification.quality_good;

tracing::info!(
task_id = %task.id,
Expand Down
57 changes: 52 additions & 5 deletions src/swe/test_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::swe::docker_sandbox::DockerSandbox;
use crate::swe::SweTask;

const MAX_AGENT_TURNS: usize = 200;
const MAX_VALIDATION_RETRIES: usize = 2;
const MAX_VALIDATION_RETRIES: usize = 3;

const SYSTEM_PROMPT: &str = r#"You are a test engineer writing verification tests for GitHub pull requests for the SWE-bench benchmark.

Expand All @@ -34,6 +34,8 @@ WORKFLOW:
3. Find existing test suites covering code ADJACENT to the PR changes -- add them as pass_to_pass.
4. Write NEW test files that exercise the BEHAVIOR introduced by the PR.
5. Run your tests via `shell` to validate: fail_to_pass MUST fail, pass_to_pass MUST pass on base.
5b. VERIFY pass_to_pass: Run each pass_to_pass command via `shell` and confirm exit code 0.
If it fails, choose a different existing test or use a build command instead.
6. Call `submit_tests` with everything.

MANDATORY RULES FOR TEST QUALITY:
Expand All @@ -58,6 +60,11 @@ MANDATORY RULES FOR TEST QUALITY:
- If the project has a test suite, find relevant existing test commands and include them.
- If the PR changes function_a() in a module, test that function_b() still works (pass_to_pass).
- If the PR changes a class method, verify other methods on the same class are unaffected.
- CRITICAL: pass_to_pass commands MUST use EXISTING test infrastructure that already works
on the base commit. Run the command yourself via `shell` BEFORE submitting to verify it passes.
- Do NOT create new test files for pass_to_pass. Use the project's existing test commands.
- If no existing tests exist adjacent to the PR, use a simple build command (e.g., `cargo build`,
`npm run build`, `go build ./...`) as pass_to_pass instead.

4. ROBUSTNESS & EDGE CASES (derive from the PR diff):
- If the PR adds input validation: test with null, empty, oversized, malformed inputs.
Expand Down Expand Up @@ -316,6 +323,29 @@ impl TestGenerator {
}
}

if submit.fail_to_pass.is_empty() {
if validation_retries < MAX_VALIDATION_RETRIES {
validation_retries += 1;
tracing::warn!(
task_id = %task.id,
retry = validation_retries,
"Rejecting empty fail_to_pass"
);
messages.push(Message::tool_result(
&tc.id,
"REJECTED: fail_to_pass must contain at least one test command. \
Write a test that FAILS on the base commit and PASSES after the PR patch is applied.".to_string(),
));
continue;
}
messages.push(Message::tool_result(
&tc.id,
"REJECTED: fail_to_pass is still empty after retries."
.to_string(),
));
continue;
}

// --- Heuristic: reject string-matching tests ---
if let Some(rejection) = reject_string_matching_tests(&all_files) {
if validation_retries < MAX_VALIDATION_RETRIES {
Expand All @@ -338,8 +368,15 @@ impl TestGenerator {
}
tracing::warn!(
task_id = %task.id,
"String-matching tests after max retries, accepting anyway"
"String-matching tests after max retries, REJECTING"
);
messages.push(Message::tool_result(
&tc.id,
"REJECTED: Your tests still use forbidden source-reading patterns after multiple retries. \
Rewrite completely: import modules, call functions, check return values. \
Do NOT read source files.".to_string(),
));
continue;
}

// --- Dual-commit validation: apply patch, re-run tests ---
Expand Down Expand Up @@ -369,8 +406,16 @@ impl TestGenerator {
}
tracing::warn!(
task_id = %task.id,
"Dual-commit validation failed after max retries, accepting with warning"
"Dual-commit validation failed after max retries, REJECTING"
);
messages.push(Message::tool_result(
&tc.id,
format!(
"REJECTED: {reason}\n\nYour tests failed dual-commit validation after multiple retries. \
Rewrite your tests completely."
),
));
continue;
}
ValidationResult::Accepted => {
tracing::info!(
Expand Down Expand Up @@ -463,10 +508,12 @@ impl TestGenerator {
if apply_3way.exit_code != 0 {
tracing::warn!(
stderr = %apply_3way.stderr,
"Patch apply failed, skipping dual-commit validation"
"Patch apply failed, rejecting task"
);
sandbox.exec("git checkout -- . 2>/dev/null", 10_000).await;
return ValidationResult::Accepted;
return ValidationResult::Rejected(
"PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
);
}
}

Expand Down
2 changes: 2 additions & 0 deletions test-run/easy/batocera-linux/batocera.linux-15418/checks.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
python -m unittest tests/test_yquake2_riscv_config.py
python -m compileall -q python-src
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# batocera-linux/batocera.linux-15418 (original PR)

batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet

(no description)
5 changes: 5 additions & 0 deletions test-run/easy/batocera-linux/batocera.linux-15418/prompt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# batocera-linux/batocera.linux-15418

batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet

Disable or withhold the yquake2 package/build for RISC-V boards so it is not offered or built on those platforms until support is ready.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# This test must FAIL on base commit, PASS after fix
python -m unittest tests/test_yquake2_riscv_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# This test must PASS on base commit AND after fix
python -m compileall -q python-src
Loading
Loading