ProjectTech4DevAI
diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
Lines changed: 4 additions & 4 deletions
diff --git a/backend/app/crud/evaluations/processing.py b/backend/app/crud/evaluations/processing.py
Lines changed: 106 additions & 2 deletions
diff --git a/backend/app/models/llm/constants.py b/backend/app/models/llm/constants.py
Lines changed: 7 additions & 0 deletions
diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
Lines changed: 3 additions & 0 deletions
diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
Lines changed: 59 additions & 0 deletions
@@ -106,12 +106,12 @@ def build_evaluation_jsonl(
         body: dict[str, Any] = {
             "model": config.model,
             "instructions": config.instructions,
-            "temperature": config.temperature
-            if config.temperature is not None
-            else 0.01,
             "input": question,  # Add input from dataset
         }
 
+        if "temperature" in config.model_fields_set:
+            body["temperature"] = config.temperature
+
         # Add reasoning only if provided
         if config.reasoning:
             body["reasoning"] = {"effort": config.reasoning}
@@ -189,7 +189,7 @@ def start_evaluation_batch(
             "description": f"Evaluation: {eval_run.run_name}",
             "completion_window": "24h",
             # Store complete config for reference
-            "evaluation_config": config.model_dump(exclude_none=True),
+            "evaluation_config": config.model_dump(exclude_unset=True),
         }
 
         # Step 5: Start batch job using generic infrastructure
 
@@ -37,13 +37,82 @@
     create_langfuse_dataset_run,
     update_traces_with_cosine_scores,
 )
-from app.crud.job import get_batch_job
+from app.crud.job import get_batch_job, update_batch_job
 from app.models import EvaluationRun
+from app.models.batch_job import BatchJob, BatchJobUpdate
 from app.utils import get_langfuse_client, get_openai_client
 
 logger = logging.getLogger(__name__)
 
 
+def _extract_batch_error_message(
+    provider: OpenAIBatchProvider,
+    error_file_id: str,
+    batch_job: BatchJob,
+    session: Session,
+) -> str:
+    """
+    Summarize a batch error file and persist the summary on the batch job.
+
+    Downloads the error file via ``provider.download_file``, parses it as
+    JSONL, tallies the error message of every entry, and reports the most
+    frequent one. The resulting text is written to the batch job's
+    ``error_message`` through ``update_batch_job``.
+
+    For each entry the message is read from ``response.body.error.message``.
+    When that path is missing, null, or not an object (for example
+    ``"response": null``), the entry's top-level ``error.message`` is used
+    instead. Entries without a usable message are counted as
+    "Unknown error". Lines that are not valid JSON objects are skipped.
+
+    Args:
+        provider: OpenAI batch provider instance
+        error_file_id: OpenAI error file ID
+        batch_job: BatchJob to update with error message
+        session: Database session
+
+    Returns:
+        Human-readable error message with the top error and counts, or a
+        generic fallback message when the file cannot be downloaded or
+        yields no parsable entries.
+
+    Raises:
+        Exception: Anything raised by ``update_batch_job``; persistence
+            failures are deliberately not caught here.
+    """
+    try:
+        error_content = provider.download_file(error_file_id)
+        lines = error_content.strip().split("\n")
+
+        error_counts: dict[str, int] = {}
+        for line in lines:
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                # Blank or malformed line: ignore it and keep tallying.
+                continue
+            if not isinstance(entry, dict):
+                continue
+
+            # Walk response -> body -> error one level at a time. A key that
+            # is present but null (or not an object) must not abort the whole
+            # extraction, which is what a chained ``.get(..., {})`` would do.
+            error_obj = entry
+            for key in ("response", "body", "error"):
+                if isinstance(error_obj, dict):
+                    error_obj = error_obj.get(key)
+                else:
+                    error_obj = None
+
+            # Fall back to the entry's top-level "error" object when the
+            # nested path did not lead to one.
+            if not isinstance(error_obj, dict):
+                top_level_error = entry.get("error")
+                error_obj = (
+                    top_level_error if isinstance(top_level_error, dict) else {}
+                )
+
+            message = error_obj.get("message")
+            if not isinstance(message, str) or not message:
+                message = "Unknown error"
+            error_counts[message] = error_counts.get(message, 0) + 1
+
+        if error_counts:
+            # On ties, max() keeps the message that was seen first.
+            top_error = max(error_counts, key=error_counts.get)
+            top_count = error_counts[top_error]
+            total = sum(error_counts.values())
+            error_msg = f"{top_error} ({top_count}/{total} requests)"
+        else:
+            error_msg = "Batch completed with errors but could not parse error file"
+
+    except Exception as e:
+        # Best-effort: a download or parsing failure must not prevent the
+        # run from being marked as failed, so degrade to a generic message.
+        logger.error(
+            f"[_extract_batch_error_message] Failed to extract errors | batch_job_id={batch_job.id} | {e}",
+            exc_info=True,
+        )
+        error_msg = (
+            f"Batch completed with all requests failed (error_file_id: {error_file_id})"
+        )
+
+    # Update batch_job with extracted error message (outside try/except
+    # so persistence failures propagate to the caller)
+    batch_job_update = BatchJobUpdate(error_message=error_msg)
+    update_batch_job(
+        session=session, batch_job=batch_job, batch_job_update=batch_job_update
+    )
+
+    logger.info(
+        f"[_extract_batch_error_message] Extracted error | batch_job_id={batch_job.id} | {error_msg}"
+    )
+
+    return error_msg
+
+
 def parse_evaluation_output(
     raw_results: list[dict[str, Any]], dataset_items: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
@@ -560,14 +629,49 @@ async def check_and_process_evaluation(
 
         # IMPORTANT: Poll OpenAI to get the latest status before checking
         provider = OpenAIBatchProvider(client=openai_client)
-        poll_batch_status(session=session, provider=provider, batch_job=batch_job)
+        status_result = poll_batch_status(
+            session=session, provider=provider, batch_job=batch_job
+        )
 
         # Refresh batch_job to get the updated provider_status
         session.refresh(batch_job)
         provider_status = batch_job.provider_status
 
         # Handle different provider statuses
         if provider_status == "completed":
+            # Check if batch completed but all requests failed
+            # (output_file_id is absent, error_file_id is present)
+            if not status_result.get(
+                "provider_output_file_id", batch_job.provider_output_file_id
+            ) and status_result.get("error_file_id"):
+                error_msg = _extract_batch_error_message(
+                    provider=provider,
+                    error_file_id=status_result["error_file_id"],
+                    batch_job=batch_job,
+                    session=session,
+                )
+
+                eval_run = update_evaluation_run(
+                    session=session,
+                    eval_run=eval_run,
+                    status="failed",
+                    error_message=error_msg,
+                )
+
+                logger.error(
+                    f"[check_and_process_evaluation] {log_prefix} Batch completed with all requests failed | {error_msg}"
+                )
+
+                return {
+                    "run_id": eval_run.id,
+                    "run_name": eval_run.run_name,
+                    "previous_status": previous_status,
+                    "current_status": "failed",
+                    "provider_status": provider_status,
+                    "action": "failed",
+                    "error": error_msg,
+                }
+
             # Process the completed evaluation
             await process_completed_evaluation(
                 eval_run=eval_run,
 
@@ -22,6 +22,13 @@
         "o1",
         "o1-preview",
         "o1-mini",
+        "gpt-5.4-pro",
+        "gpt-5.4-mini",
+        "gpt-5.4-nano",
+        "gpt-5",
+        "gpt-4-turbo",
+        "gpt-4",
+        "gpt-3.5-turbo",
     ],
 }
 
 
@@ -251,6 +251,7 @@ def validate_params(self):
             provider = self.provider
             provider_was_auto_assigned = True
 
+        user_provided_temperature = "temperature" in self.params
         validated = model_class.model_validate(self.params)
 
         if provider is not None:
@@ -288,6 +289,8 @@ def validate_params(self):
                         )
 
         self.params = validated.model_dump(exclude_none=True)
+        if not user_provided_temperature:
+            self.params.pop("temperature", None)
         return self
 
 
 
@@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
             assert request_dict["body"]["input"] == f"Question {i}"
             assert request_dict["body"]["model"] == "gpt-4o"
 
+    def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
+        """An explicitly provided temperature is forwarded in the JSONL body."""
+        item = {
+            "id": "item1",
+            "input": {"question": "Test question"},
+            "expected_output": {"answer": "Test answer"},
+            "metadata": {},
+        }
+        params = TextLLMParams(model="gpt-4o", temperature=0.5)
+
+        requests = build_evaluation_jsonl([item], params)
+
+        # Exactly one request is produced for the single dataset item.
+        assert len(requests) == 1
+        request_body = requests[0]["body"]
+        assert "temperature" in request_body
+        assert request_body["temperature"] == 0.5
+
+    def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
+        """A config built without a temperature yields a body with no temperature key."""
+        item = {
+            "id": "item1",
+            "input": {"question": "Test question"},
+            "expected_output": {"answer": "Test answer"},
+            "metadata": {},
+        }
+        # Only the model is passed, so temperature is never explicitly set.
+        params = TextLLMParams(model="gpt-4o")
+
+        requests = build_evaluation_jsonl([item], params)
+
+        assert len(requests) == 1
+        request_body = requests[0]["body"]
+        assert "temperature" not in request_body
+
+    def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
+        self,
+    ) -> None:
+        """An explicit temperature of 0.0 is still emitted even though it is falsy."""
+        item = {
+            "id": "item1",
+            "input": {"question": "Test question"},
+            "expected_output": {"answer": "Test answer"},
+            "metadata": {},
+        }
+        params = TextLLMParams(model="gpt-4o", temperature=0.0)
+
+        requests = build_evaluation_jsonl([item], params)
+
+        assert len(requests) == 1
+        request_body = requests[0]["body"]
+        assert "temperature" in request_body
+        assert request_body["temperature"] == 0.0
+
 
 class TestGetEvaluationRunStatus:
     """Test GET /evaluations/{evaluation_id} endpoint."""