fix: remove all hardcoded command fallbacks

Atlas · Atlas · commit 72e3de7e7d4f · 2026-03-28T14:29:57.000Z
- Add DiscoveryError exception for LLM-unavailable scenarios
- Remove per-language install/test command fallbacks (pip/pytest, npm, cargo) in workspace.py; raise DiscoveryError instead
- Remove hardcoded `pip install -e .` in agentic.py; Python packages now continue to LLM discovery instead of returning early
- Remove file-count difficulty heuristics in pipeline.py (pre-classification returns None; deep-stage difficulty_score is set to 0)
- Export now raises DiscoveryError, and the pipeline logs a warning and skips classification, instead of using hardcoded fallbacks
- Update test to expect None instead of 'easy' when no classifier
diff --git a/src/swe_forge/discovery/agentic.py b/src/swe_forge/discovery/agentic.py
@@ -108,10 +108,9 @@ async def discover_install_commands(
                 f.endswith("setup.py") or f == "setup.py" for f in filenames
             )
             if has_pyproject or has_setup:
-                discovered.install_commands = ["pip install -e ."]
+                # Don't hardcode commands - let LLM discover them
                 discovered.discovery_source = "python-package"
-                discovered.confidence = "high"
-                return discovered
+                # Continue to LLM discovery below instead of returning early
 
         # Step 1: Read CI/CD for tested commands
         ci_commands = self._extract_ci_install_commands(files, language)
diff --git a/src/swe_forge/exceptions.py b/src/swe_forge/exceptions.py
@@ -0,0 +1,16 @@
+"""Custom exceptions for swe-forge."""
+
+
+class DiscoveryError(Exception):
+    """Raised when command discovery fails and no LLM is available.
+
+    This exception is raised when:
+    - No install commands were discovered via LLM
+    - No test commands were discovered via LLM
+    - LLM client is not configured for command discovery
+
+    The pipeline should fail gracefully rather than falling back to
+    hardcoded commands.
+    """
+
+    pass
diff --git a/src/swe_forge/export/workspace.py b/src/swe_forge/export/workspace.py
@@ -42,15 +42,13 @@ def export_task_to_workspace(
         f"{docker_username}/swe-forge-tasks:{task.id}" if docker_username else None
     )
 
-    # Get install commands from config or defaults
+    # Get install commands from config - NO FALLBACKS
     install_commands = task.install_config.get("install_commands", [])
     if not install_commands:
-        if task.language == "python":
-            install_commands = ["pip install -e .", "pip install pytest"]
-        elif task.language in ("javascript", "typescript"):
-            install_commands = ["npm install", "npm test"]
-        elif task.language == "rust":
-            install_commands = ["cargo build", "cargo test"]
+        raise DiscoveryError(
+            f"No install commands discovered for task {task.id}. "
+            "Ensure LLM-based discovery is configured (OPENROUTER_API_KEY)."
+        )
 
     # Get test commands - fallback to test_patch extraction if empty
     fail_to_pass = list(task.fail_to_pass) if task.fail_to_pass else []
@@ -62,14 +60,12 @@ def export_task_to_workspace(
         if test_files:
             fail_to_pass = [f"pytest {f} -v" for f in test_files]
 
-    # Default test commands as last resort
+    # No fallback test commands - require LLM discovery
     if not fail_to_pass:
-        if task.language == "python":
-            fail_to_pass = ["pytest tests/ -v"]
-        elif task.language in ("javascript", "typescript"):
-            fail_to_pass = ["npm test"]
-        elif task.language == "rust":
-            fail_to_pass = ["cargo test"]
+        raise DiscoveryError(
+            f"No test commands discovered for task {task.id}. "
+            "Ensure TestGenerator is configured and ran successfully."
+        )
 
     # Build workspace data
     workspace_data: dict[str, Any] = {
diff --git a/src/swe_forge/swe/pipeline.py b/src/swe_forge/swe/pipeline.py
@@ -301,12 +301,13 @@ async def _preclassify_stage(
                 response: TriageResponse = await classifier.classify_triage(pr_info)
                 difficulty = response.difficulty
             else:
-                if enriched.files_changed <= 2:
-                    difficulty = "easy"
-                elif enriched.files_changed <= 5:
-                    difficulty = "medium"
-                else:
-                    difficulty = "hard"
+                # NO HARDCODED HEURISTICS - skip if no classifier
+                logger.warning(
+                    "No classifier for %s#%d, skipping pre-classification",
+                    enriched.repo,
+                    enriched.number,
+                )
+                return None
 
             if difficulty == "easy":
                 metrics.preclassify_easy += 1
@@ -395,15 +396,13 @@ async def _deep_stage(
                         else:
                             metrics.difficulty_hard += 1
                     else:
-                        if enriched.files_changed <= 2:
-                            task.difficulty_score = 1
-                            metrics.difficulty_easy += 1
-                        elif enriched.files_changed <= 5:
-                            task.difficulty_score = 2
-                            metrics.difficulty_medium += 1
-                        else:
-                            task.difficulty_score = 3
-                            metrics.difficulty_hard += 1
+                        # NO HARDCODED HEURISTICS - skip difficulty scoring
+                        task.difficulty_score = 0
+                        logger.warning(
+                            "No difficulty classifier for %s#%d, skipping scoring",
+                            enriched.repo,
+                            enriched.number,
+                        )
 
                     await self._emit_event(
                         event_queue,
diff --git a/tests/test_swe/test_pipeline.py b/tests/test_swe/test_pipeline.py
@@ -443,7 +443,7 @@ async def test_difficulty_classifier_integration(self, mock_gh_client):
 
     @pytest.mark.asyncio
     async def test_difficulty_classifier_fallback(self, mock_gh_client):
-        """Verify heuristics used when llm_client=None."""
+        """Verify NO HARDCODED HEURISTICS when llm_client=None."""
         from swe_forge.swe.enricher import EnrichedPullRequest
 
         config = SwePipelineConfig()
@@ -470,8 +470,10 @@ async def test_difficulty_classifier_fallback(self, mock_gh_client):
 
         result = await pipeline._preclassify_stage(enriched, semaphore, metrics)
 
-        assert result == "easy"
-        assert metrics.preclassify_easy == 1
+        # NO HARDCODED HEURISTICS - returns None when no classifier
+        assert result is None
+        # No classification happened
+        assert metrics.preclassify_easy == 0
 
     @pytest.mark.asyncio
     async def test_difficulty_classifier_score_mapping(self, mock_gh_client):