CortexLM · echobt · Feb 17, 2026 · Feb 17, 2026 · Feb 17, 2026
diff --git a/benchmark_validation_report.md b/benchmark_validation_report.md
diff --git a/src/swe/docker_sandbox.rs b/src/swe/docker_sandbox.rs
@@ -105,7 +105,7 @@ impl DockerSandbox {
 
         // Clone the repository
         let clone_cmd = format!(
-            "git clone --depth 50 https://github.com/{}.git /repo 2>&1",
+            "git clone --depth 500 https://github.com/{}.git /repo 2>&1",
             repo
         );
         let clone = sandbox.exec(&clone_cmd, 180_000).await;

diff --git a/src/swe/filters.rs b/src/swe/filters.rs
@@ -56,7 +56,8 @@ impl SweepFilter {
         language: &str,
         stars: u32,
         files_changed: usize,
-        _added_lines: usize,
+        added_lines: usize,
+        changed_files: &[String],
     ) -> FilterResult {
         let mut reasons = Vec::new();
         let mut score = 1.0f64;
@@ -97,6 +98,27 @@ impl SweepFilter {
             score -= 0.25;
         }
 
+        if added_lines > 0 && added_lines < self.config.min_added_lines {
+            reasons.push(format!(
+                "added lines {} below minimum {}",
+                added_lines, self.config.min_added_lines
+            ));
+            score -= 0.2;
+        }
+
+        if added_lines > self.config.max_added_lines {
+            reasons.push(format!(
+                "added lines {} above maximum {}",
+                added_lines, self.config.max_added_lines
+            ));
+            score -= 0.2;
+        }
+
+        if !changed_files.is_empty() && Self::is_docs_only_change(changed_files) {
+            reasons.push("all changed files are documentation/config only".to_string());
+            score -= 0.3;
+        }
+
         let accepted = reasons.is_empty();
         if accepted {
             reasons.push("candidate accepted".to_string());
@@ -108,4 +130,33 @@ impl SweepFilter {
             reasons,
         }
     }
+
+    /// Returns `true` when every path in `files` looks like documentation,
+    /// configuration, or a static asset rather than source code.
+    ///
+    /// Matching is case-insensitive and looks only at the final path component
+    /// (the text after the last `/`):
+    ///
+    /// * a file with an extension is classified by that extension alone, so
+    ///   `license_checker.rs` is code even though its name starts with `license`;
+    /// * a file without an extension (including dotfiles such as `.gitignore`)
+    ///   is classified by whether its name starts with a well-known
+    ///   documentation/config name, so `LICENSE-MIT` and `CODEOWNERS` match.
+    ///
+    /// An empty slice returns `false`: with no files there is nothing to call
+    /// a docs-only change.
+    fn is_docs_only_change(files: &[String]) -> bool {
+        const DOC_EXTENSIONS: [&str; 17] = [
+            "md", "txt", "yml", "yaml", "json", "toml", "ini", "cfg", "rst", "adoc", "csv", "svg",
+            "png", "jpg", "jpeg", "gif", "ico",
+        ];
+        const DOC_NAMES: [&str; 12] = [
+            "readme",
+            "changelog",
+            "license",
+            "licence",
+            "contributing",
+            "authors",
+            "codeowners",
+            "code_of_conduct",
+            ".gitignore",
+            ".editorconfig",
+            ".prettierrc",
+            ".eslintignore",
+        ];
+
+        // `all` on an empty iterator is vacuously true; reject that case explicitly.
+        if files.is_empty() {
+            return false;
+        }
+
+        files.iter().all(|f| {
+            let lower = f.to_lowercase();
+            let basename = lower.rsplit('/').next().unwrap_or(&lower);
+
+            // `Path::extension` returns `None` for dot-less names and for plain
+            // dotfiles, so "json" or ".gitignore" are not mistaken for extensions.
+            match std::path::Path::new(basename)
+                .extension()
+                .and_then(|e| e.to_str())
+            {
+                Some(ext) => DOC_EXTENSIONS.contains(&ext),
+                None => DOC_NAMES.iter().any(|n| basename.starts_with(n)),
+            }
+        })
+    }
 }
diff --git a/src/swe/harness.rs b/src/swe/harness.rs
@@ -143,6 +143,28 @@ async fn docker_rm(container: &str) {
         .await;
 }
 
+/// Writes `content` to `/repo/<path>` inside `container` by piping it to
+/// `cat` through `docker exec -i`.
+///
+/// `path` is handed to the shell as the positional parameter `$1` instead of
+/// being spliced into the command string, so quotes or other shell
+/// metacharacters in the path cannot alter the command that runs.
+///
+/// # Errors
+///
+/// Returns an error if `docker` cannot be spawned, if the command exits with a
+/// non-zero status (the message carries its stderr), or if writing `content`
+/// to the child's stdin fails.
+async fn docker_write_file(container: &str, path: &str, content: &str) -> Result<()> {
+    use tokio::io::AsyncWriteExt;
+    // `bash -c SCRIPT NAME ARG` binds NAME to $0 and ARG to $1.
+    const WRITE_SCRIPT: &str = "cat > \"/repo/$1\"";
+    let mut child = Command::new("docker")
+        .args([
+            "exec",
+            "-i",
+            "-w",
+            "/repo",
+            container,
+            "bash",
+            "-c",
+            WRITE_SCRIPT,
+            "bash",
+            path,
+        ])
+        .stdin(std::process::Stdio::piped())
+        .stdout(std::process::Stdio::null())
+        .stderr(std::process::Stdio::piped())
+        .spawn()?;
+
+    // Take ownership of stdin so it is closed once the write is done. Keep any
+    // write error for later instead of returning early: the child must still
+    // be waited on, and its stderr usually explains the failure better.
+    let write_result = match child.stdin.take() {
+        Some(mut stdin) => match stdin.write_all(content.as_bytes()).await {
+            Ok(()) => stdin.shutdown().await,
+            Err(e) => Err(e),
+        },
+        None => Ok(()),
+    };
+
+    let output = child.wait_with_output().await?;
+    if !output.status.success() {
+        anyhow::bail!("write failed: {}", String::from_utf8_lossy(&output.stderr));
+    }
+    write_result?;
+    Ok(())
+}
+
 fn container_name(task_id: &str) -> String {
     let safe = task_id.replace('/', "-").replace(' ', "_");
     format!("swe-harness-{safe}")
@@ -186,6 +208,14 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
     let agent_dir_abs =
         std::fs::canonicalize(&config.agent_dir).unwrap_or_else(|_| config.agent_dir.clone());
 
+    // Auto-select Docker image based on task language unless overridden
+    let docker_image = if config.docker_image == "python:3.12-slim" && task.language != "unknown" {
+        super::docker_sandbox::image_for_language(&task.language).to_string()
+    } else {
+        config.docker_image.clone()
+    };
+    info!(task_id = %task.id, language = %task.language, image = %docker_image, "Selected Docker image");
+
     // Remove stale container if exists
     docker_rm(&cname).await;
 
@@ -202,7 +232,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
             &format!("{}:/agent:ro", agent_dir_abs.display()),
             "-w",
             "/repo",
-            &config.docker_image,
+            &docker_image,
             "sleep",
             "7200",
         ])
@@ -243,7 +273,7 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
 
     // Clone repo
     let clone_cmd = format!(
-        "git clone --depth 100 https://github.com/{}.git /repo 2>&1",
+        "git clone --depth 500 https://github.com/{}.git /repo 2>&1",
         task.repo
     );
     let (code, _, err) = docker_exec(&cname, &clone_cmd, 180).await;
@@ -261,8 +291,26 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         )
         .await;
         if code != 0 {
-            result.error = Some(format!("Checkout failed: {}", truncate(&err, 500)));
-            return result;
+            info!(task_id = %task.id, "Shallow clone missed commit, fetching full history...");
+            let (fcode, _, _ferr) =
+                docker_exec(&cname, "cd /repo && git fetch --unshallow 2>&1", 300).await;
+            if fcode != 0 {
+                result.error = Some(format!(
+                    "Checkout failed (even after unshallow): {}",
+                    truncate(&err, 500)
+                ));
+                return result;
+            }
+            let (code2, _, err2) = docker_exec(
+                &cname,
+                &format!("cd /repo && git checkout {} --force 2>&1", task.base_commit),
+                60,
+            )
+            .await;
+            if code2 != 0 {
+                result.error = Some(format!("Checkout failed: {}", truncate(&err2, 500)));
+                return result;
+            }
         }
     }
 
@@ -289,6 +337,23 @@ async fn evaluate_task(task: &SweTask, config: &HarnessConfig) -> HarnessResult
         warn!(task_id = %task.id, "Agent requirements install returned non-zero (continuing)");
     }
 
+    // Copy test files into container
+    if let Some(test_files_json) = task.meta.get("test_files") {
+        if let Ok(files) =
+            serde_json::from_str::<Vec<super::test_generator::TestFile>>(test_files_json)
+        {
+            for tf in &files {
+                let mkdir_cmd = format!("mkdir -p \"$(dirname '/repo/{}')\"", tf.path);
+                docker_exec(&cname, &mkdir_cmd, 10).await;
+                let write_result = docker_write_file(&cname, &tf.path, &tf.content).await;
+                if let Err(e) = write_result {
+                    warn!(task_id = %task.id, path = %tf.path, "Failed to copy test file: {}", e);
+                }
+            }
+            info!(task_id = %task.id, "Copied {} test files into container", files.len());
+        }
+    }
+
     // 2. SANITY CHECK: fail_to_pass must fail, pass_to_pass must pass
     info!(task_id = %task.id, "Running sanity checks...");
 

diff --git a/src/swe/pipeline.rs b/src/swe/pipeline.rs
@@ -360,6 +360,7 @@ impl SwePipeline {
                         enriched.stars,
                         enriched.files_changed,
                         added_lines,
+                        &enriched.changed_files,
                     );
                     filtered_count.fetch_add(1, Ordering::Relaxed);
                     if !filter_result.accepted {

diff --git a/src/swe/quality.rs b/src/swe/quality.rs
@@ -14,7 +14,7 @@ pub struct QualityConfig {
 impl Default for QualityConfig {
     fn default() -> Self {
         Self {
-            min_quality_score: 0.1,
+            min_quality_score: 0.25,
         }
     }
 }
@@ -335,7 +335,7 @@ impl QualityScorer {
         };
 
         let score = classification.score.clamp(0.0, 1.0);
-        let passed = score >= self.config.min_quality_score;
+        let passed = score >= self.config.min_quality_score && classification.quality_good;
 
         tracing::info!(
             task_id = %task.id,

diff --git a/src/swe/test_generator.rs b/src/swe/test_generator.rs
@@ -15,7 +15,7 @@ use crate::swe::docker_sandbox::DockerSandbox;
 use crate::swe::SweTask;
 
 const MAX_AGENT_TURNS: usize = 200;
-const MAX_VALIDATION_RETRIES: usize = 2;
+const MAX_VALIDATION_RETRIES: usize = 3;
 
 const SYSTEM_PROMPT: &str = r#"You are a test engineer writing verification tests for GitHub pull requests for the SWE-bench benchmark.
 
@@ -34,6 +34,8 @@ WORKFLOW:
 3. Find existing test suites covering code ADJACENT to the PR changes -- add them as pass_to_pass.
 4. Write NEW test files that exercise the BEHAVIOR introduced by the PR.
 5. Run your tests via `shell` to validate: fail_to_pass MUST fail, pass_to_pass MUST pass on base.
+5b. VERIFY pass_to_pass: Run each pass_to_pass command via `shell` and confirm exit code 0.
+    If it fails, choose a different existing test or use a build command instead.
 6. Call `submit_tests` with everything.
 
 MANDATORY RULES FOR TEST QUALITY:
@@ -58,6 +60,11 @@ MANDATORY RULES FOR TEST QUALITY:
    - If the project has a test suite, find relevant existing test commands and include them.
    - If the PR changes function_a() in a module, test that function_b() still works (pass_to_pass).
    - If the PR changes a class method, verify other methods on the same class are unaffected.
+   - CRITICAL: pass_to_pass commands MUST use EXISTING test infrastructure that already works
+     on the base commit. Run the command yourself via `shell` BEFORE submitting to verify it passes.
+   - Do NOT create new test files for pass_to_pass. Use the project's existing test commands.
+   - If no existing tests exist adjacent to the PR, use a simple build command (e.g., `cargo build`,
+     `npm run build`, `go build ./...`) as pass_to_pass instead.
 
 4. ROBUSTNESS & EDGE CASES (derive from the PR diff):
    - If the PR adds input validation: test with null, empty, oversized, malformed inputs.
@@ -316,6 +323,29 @@ impl TestGenerator {
                                 }
                             }
 
+                            if submit.fail_to_pass.is_empty() {
+                                if validation_retries < MAX_VALIDATION_RETRIES {
+                                    validation_retries += 1;
+                                    tracing::warn!(
+                                        task_id = %task.id,
+                                        retry = validation_retries,
+                                        "Rejecting empty fail_to_pass"
+                                    );
+                                    messages.push(Message::tool_result(
+                                        &tc.id,
+                                        "REJECTED: fail_to_pass must contain at least one test command. \
+                                         Write a test that FAILS on the base commit and PASSES after the PR patch is applied.".to_string(),
+                                    ));
+                                    continue;
+                                }
+                                messages.push(Message::tool_result(
+                                    &tc.id,
+                                    "REJECTED: fail_to_pass is still empty after retries."
+                                        .to_string(),
+                                ));
+                                continue;
+                            }
+
                             // --- Heuristic: reject string-matching tests ---
                             if let Some(rejection) = reject_string_matching_tests(&all_files) {
                                 if validation_retries < MAX_VALIDATION_RETRIES {
@@ -338,8 +368,15 @@ impl TestGenerator {
                                 }
                                 tracing::warn!(
                                     task_id = %task.id,
-                                    "String-matching tests after max retries, accepting anyway"
+                                    "String-matching tests after max retries, REJECTING"
                                 );
+                                messages.push(Message::tool_result(
+                                    &tc.id,
+                                    "REJECTED: Your tests still use forbidden source-reading patterns after multiple retries. \
+                                     Rewrite completely: import modules, call functions, check return values. \
+                                     Do NOT read source files.".to_string(),
+                                ));
+                                continue;
                             }
 
                             // --- Dual-commit validation: apply patch, re-run tests ---
@@ -369,8 +406,16 @@ impl TestGenerator {
                                     }
                                     tracing::warn!(
                                         task_id = %task.id,
-                                        "Dual-commit validation failed after max retries, accepting with warning"
+                                        "Dual-commit validation failed after max retries, REJECTING"
                                     );
+                                    messages.push(Message::tool_result(
+                                        &tc.id,
+                                        format!(
+                                            "REJECTED: {reason}\n\nYour tests failed dual-commit validation after multiple retries. \
+                                             Rewrite your tests completely."
+                                        ),
+                                    ));
+                                    continue;
                                 }
                                 ValidationResult::Accepted => {
                                     tracing::info!(
@@ -463,10 +508,12 @@ impl TestGenerator {
             if apply_3way.exit_code != 0 {
                 tracing::warn!(
                     stderr = %apply_3way.stderr,
-                    "Patch apply failed, skipping dual-commit validation"
+                    "Patch apply failed, rejecting task"
                 );
                 sandbox.exec("git checkout -- . 2>/dev/null", 10_000).await;
-                return ValidationResult::Accepted;
+                return ValidationResult::Rejected(
+                    "PR patch could not be applied to the base commit. The test cannot be validated.".to_string()
+                );
             }
         }
 

diff --git a/test-run/easy/batocera-linux/batocera.linux-15418/checks.txt b/test-run/easy/batocera-linux/batocera.linux-15418/checks.txt
@@ -0,0 +1,2 @@
+python -m unittest tests/test_yquake2_riscv_config.py
+python -m compileall -q python-src
diff --git a/test-run/easy/batocera-linux/batocera.linux-15418/original_pr.md b/test-run/easy/batocera-linux/batocera.linux-15418/original_pr.md
@@ -0,0 +1,5 @@
+# batocera-linux/batocera.linux-15418 (original PR)
+
+batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet
+
+(no description)
diff --git a/test-run/easy/batocera-linux/batocera.linux-15418/prompt.md b/test-run/easy/batocera-linux/batocera.linux-15418/prompt.md
@@ -0,0 +1,5 @@
+# batocera-linux/batocera.linux-15418
+
+batocera-linux/batocera.linux (#15418): no yquake2 for riscv boards... yet
+
+Disable or withhold the yquake2 package/build for RISC-V boards so it is not offered or built on those platforms until support is ready.
diff --git a/test-run/easy/batocera-linux/batocera.linux-15418/tests/fail_to_pass_1.sh b/test-run/easy/batocera-linux/batocera.linux-15418/tests/fail_to_pass_1.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# This test must FAIL on base commit, PASS after fix
+python -m unittest tests/test_yquake2_riscv_config.py
diff --git a/test-run/easy/batocera-linux/batocera.linux-15418/tests/pass_to_pass_1.sh b/test-run/easy/batocera-linux/batocera.linux-15418/tests/pass_to_pass_1.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# This test must PASS on base commit AND after fix
+python -m compileall -q python-src
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python -m unittest tests/test_yquake2_riscv_config.py
		python -m compileall -q python-src