Merge branch 'main' into enhancement/evaluation-errors

AkhileshNegi · web-flow · commit dba07bc1c8d9 · 2026-04-01T10:14:55.000+05:30
diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
@@ -106,12 +106,12 @@ def build_evaluation_jsonl(
         body: dict[str, Any] = {
             "model": config.model,
             "instructions": config.instructions,
-            "temperature": config.temperature
-            if config.temperature is not None
-            else 0.01,
             "input": question,  # Add input from dataset
         }
 
+        if "temperature" in config.model_fields_set:
+            body["temperature"] = config.temperature
+
         # Add reasoning only if provided
         if config.reasoning:
             body["reasoning"] = {"effort": config.reasoning}
@@ -189,7 +189,7 @@ def start_evaluation_batch(
             "description": f"Evaluation: {eval_run.run_name}",
             "completion_window": "24h",
             # Store complete config for reference
-            "evaluation_config": config.model_dump(exclude_none=True),
+            "evaluation_config": config.model_dump(exclude_unset=True),
         }
 
         # Step 5: Start batch job using generic infrastructure
diff --git a/backend/app/models/llm/constants.py b/backend/app/models/llm/constants.py
@@ -22,6 +22,13 @@
         "o1",
         "o1-preview",
         "o1-mini",
+        "gpt-5.4-pro",
+        "gpt-5.4-mini",
+        "gpt-5.4-nano",
+        "gpt-5",
+        "gpt-4-turbo",
+        "gpt-4",
+        "gpt-3.5-turbo",
     ],
 }
 
diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
@@ -251,6 +251,7 @@ def validate_params(self):
             provider = self.provider
             provider_was_auto_assigned = True
 
+        user_provided_temperature = "temperature" in self.params
         validated = model_class.model_validate(self.params)
 
         if provider is not None:
@@ -288,6 +289,8 @@ def validate_params(self):
                         )
 
         self.params = validated.model_dump(exclude_none=True)
+        if not user_provided_temperature:
+            self.params.pop("temperature", None)
         return self
 
 
diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
@@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
             assert request_dict["body"]["input"] == f"Question {i}"
             assert request_dict["body"]["model"] == "gpt-4o"
 
+    def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
+        """When temperature is explicitly set, it should appear in the JSONL body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        config = TextLLMParams(model="gpt-4o", temperature=0.5)
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" in jsonl_data[0]["body"]
+        assert jsonl_data[0]["body"]["temperature"] == 0.5
+
+    def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
+        """When temperature is not explicitly set, it should NOT appear in the JSONL body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        # Only `model` is passed, so `temperature` is absent from config.model_fields_set
+        config = TextLLMParams(model="gpt-4o")
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" not in jsonl_data[0]["body"]
+
+    def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
+        self,
+    ) -> None:
+        """When temperature is explicitly set to 0.0, it should still appear in the body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        config = TextLLMParams(model="gpt-4o", temperature=0.0)
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" in jsonl_data[0]["body"]
+        assert jsonl_data[0]["body"]["temperature"] == 0.0
+
 
 class TestGetEvaluationRunStatus:
     """Test GET /evaluations/{evaluation_id} endpoint."""
diff --git a/backend/app/tests/models/__init__.py b/backend/app/tests/models/__init__.py
diff --git a/backend/app/tests/models/llm/__init__.py b/backend/app/tests/models/llm/__init__.py
diff --git a/backend/app/tests/models/llm/test_request.py b/backend/app/tests/models/llm/test_request.py
@@ -0,0 +1,111 @@
+import pytest
+from pydantic import ValidationError
+
+from app.models.llm.request import KaapiCompletionConfig
+
+
+class TestKaapiCompletionConfigTemperature:
+    """Test temperature handling in KaapiCompletionConfig.validate_params."""
+
+    def test_temperature_preserved_when_user_provides_it(self) -> None:
+        """When user explicitly provides temperature, it should be in params."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+                "temperature": 0.7,
+            },
+        )
+
+        assert "temperature" in config.params
+        assert config.params["temperature"] == 0.7
+
+    def test_temperature_excluded_when_user_does_not_provide_it(self) -> None:
+        """When user does not provide temperature, it should NOT be in params
+        even though TextLLMParams has a default of 0.1."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+            },
+        )
+
+        assert "temperature" not in config.params
+
+    def test_temperature_zero_preserved_when_explicitly_set(self) -> None:
+        """When user explicitly sets temperature to 0.0, it should be preserved."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+                "temperature": 0.0,
+            },
+        )
+
+        assert "temperature" in config.params
+        assert config.params["temperature"] == 0.0
+
+
+class TestNewSupportedModels:
+    """Test that newly added models are accepted for the openai/text provider."""
+
+    @pytest.mark.parametrize(
+        "model",
+        [
+            "gpt-5.4-pro",
+            "gpt-5.4-mini",
+            "gpt-5.4-nano",
+            "gpt-5",
+            "gpt-4-turbo",
+            "gpt-4",
+            "gpt-3.5-turbo",
+        ],
+    )
+    def test_new_model_accepted(self, model: str) -> None:
+        """New models should be accepted for the openai text provider."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={"model": model},
+        )
+
+        assert config.params["model"] == model
+
+    @pytest.mark.parametrize(
+        "model",
+        [
+            "gpt-4o",
+            "gpt-4o-mini",
+            "gpt-4.1",
+            "gpt-4.1-mini",
+            "gpt-4.1-nano",
+            "gpt-5.4",
+            "gpt-5.1",
+            "gpt-5-mini",
+            "gpt-5-nano",
+            "o1",
+            "o1-preview",
+            "o1-mini",
+        ],
+    )
+    def test_existing_models_still_accepted(self, model: str) -> None:
+        """Previously supported models should still be accepted."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={"model": model},
+        )
+
+        assert config.params["model"] == model
+
+    def test_unsupported_model_rejected(self) -> None:
+        """An unsupported model should raise a validation error."""
+        with pytest.raises(ValidationError, match="not supported"):
+            KaapiCompletionConfig(
+                provider="openai",
+                type="text",
+                params={"model": "unsupported-model-xyz"},
+            )