Skip to content

Commit dba07bc

Browse files
authored
Merge branch 'main' into enhancement/evaluation-errors
2 parents 6e47cb2 + 5e853ad commit dba07bc

File tree

7 files changed

+184 additions
-4 deletions

backend/app/crud/evaluations/batch.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -106,12 +106,12 @@ def build_evaluation_jsonl(
106106
body: dict[str, Any] = {
107107
"model": config.model,
108108
"instructions": config.instructions,
109-
"temperature": config.temperature
110-
if config.temperature is not None
111-
else 0.01,
112109
"input": question, # Add input from dataset
113110
}
114111

112+
if "temperature" in config.model_fields_set:
113+
body["temperature"] = config.temperature
114+
115115
# Add reasoning only if provided
116116
if config.reasoning:
117117
body["reasoning"] = {"effort": config.reasoning}
@@ -189,7 +189,7 @@ def start_evaluation_batch(
189189
"description": f"Evaluation: {eval_run.run_name}",
190190
"completion_window": "24h",
191191
# Store complete config for reference
192-
"evaluation_config": config.model_dump(exclude_none=True),
192+
"evaluation_config": config.model_dump(exclude_unset=True),
193193
}
194194

195195
# Step 5: Start batch job using generic infrastructure

backend/app/models/llm/constants.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,13 @@
2222
"o1",
2323
"o1-preview",
2424
"o1-mini",
25+
"gpt-5.4-pro",
26+
"gpt-5.4-mini",
27+
"gpt-5.4-nano",
28+
"gpt-5",
29+
"gpt-4-turbo",
30+
"gpt-4",
31+
"gpt-3.5-turbo",
2532
],
2633
}
2734

backend/app/models/llm/request.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -251,6 +251,7 @@ def validate_params(self):
251251
provider = self.provider
252252
provider_was_auto_assigned = True
253253

254+
user_provided_temperature = "temperature" in self.params
254255
validated = model_class.model_validate(self.params)
255256

256257
if provider is not None:
@@ -288,6 +289,8 @@ def validate_params(self):
288289
)
289290

290291
self.params = validated.model_dump(exclude_none=True)
292+
if not user_provided_temperature:
293+
self.params.pop("temperature", None)
291294
return self
292295

293296

backend/app/tests/api/routes/test_evaluation.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
731731
assert request_dict["body"]["input"] == f"Question {i}"
732732
assert request_dict["body"]["model"] == "gpt-4o"
733733

734+
def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
735+
"""When temperature is explicitly set, it should appear in the JSONL body."""
736+
dataset_items = [
737+
{
738+
"id": "item1",
739+
"input": {"question": "Test question"},
740+
"expected_output": {"answer": "Test answer"},
741+
"metadata": {},
742+
}
743+
]
744+
745+
config = TextLLMParams(model="gpt-4o", temperature=0.5)
746+
747+
jsonl_data = build_evaluation_jsonl(dataset_items, config)
748+
749+
assert len(jsonl_data) == 1
750+
assert "temperature" in jsonl_data[0]["body"]
751+
assert jsonl_data[0]["body"]["temperature"] == 0.5
752+
753+
def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
754+
"""When temperature is not explicitly set, it should NOT appear in the JSONL body."""
755+
dataset_items = [
756+
{
757+
"id": "item1",
758+
"input": {"question": "Test question"},
759+
"expected_output": {"answer": "Test answer"},
760+
"metadata": {},
761+
}
762+
]
763+
764+
# Only model provided — temperature not in model_fields_set
765+
config = TextLLMParams(model="gpt-4o")
766+
767+
jsonl_data = build_evaluation_jsonl(dataset_items, config)
768+
769+
assert len(jsonl_data) == 1
770+
assert "temperature" not in jsonl_data[0]["body"]
771+
772+
def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
773+
self,
774+
) -> None:
775+
"""When temperature is explicitly set to 0.0, it should still appear in the body."""
776+
dataset_items = [
777+
{
778+
"id": "item1",
779+
"input": {"question": "Test question"},
780+
"expected_output": {"answer": "Test answer"},
781+
"metadata": {},
782+
}
783+
]
784+
785+
config = TextLLMParams(model="gpt-4o", temperature=0.0)
786+
787+
jsonl_data = build_evaluation_jsonl(dataset_items, config)
788+
789+
assert len(jsonl_data) == 1
790+
assert "temperature" in jsonl_data[0]["body"]
791+
assert jsonl_data[0]["body"]["temperature"] == 0.0
792+
734793

735794
class TestGetEvaluationRunStatus:
736795
"""Test GET /evaluations/{evaluation_id} endpoint."""

backend/app/tests/models/__init__.py

Whitespace-only changes.

backend/app/tests/models/llm/__init__.py

Whitespace-only changes.
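
(File path not captured in this view — presumably a new test module under backend/app/tests/models/llm/ that imports app.models.llm.request; confirm against the commit.)

Lines changed: 111 additions & 0 deletions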
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,111 @@
1+
import pytest
2+
from pydantic import ValidationError
3+
4+
from app.models.llm.request import KaapiCompletionConfig
5+
6+
7+
class TestKaapiCompletionConfigTemperature:
8+
"""Test temperature handling in KaapiCompletionConfig.validate_params."""
9+
10+
def test_temperature_preserved_when_user_provides_it(self) -> None:
11+
"""When user explicitly provides temperature, it should be in params."""
12+
config = KaapiCompletionConfig(
13+
provider="openai",
14+
type="text",
15+
params={
16+
"model": "gpt-4o",
17+
"temperature": 0.7,
18+
},
19+
)
20+
21+
assert "temperature" in config.params
22+
assert config.params["temperature"] == 0.7
23+
24+
def test_temperature_excluded_when_user_does_not_provide_it(self) -> None:
25+
"""When user does not provide temperature, it should NOT be in params
26+
even though TextLLMParams has a default of 0.1."""
27+
config = KaapiCompletionConfig(
28+
provider="openai",
29+
type="text",
30+
params={
31+
"model": "gpt-4o",
32+
},
33+
)
34+
35+
assert "temperature" not in config.params
36+
37+
def test_temperature_zero_preserved_when_explicitly_set(self) -> None:
38+
"""When user explicitly sets temperature to 0.0, it should be preserved."""
39+
config = KaapiCompletionConfig(
40+
provider="openai",
41+
type="text",
42+
params={
43+
"model": "gpt-4o",
44+
"temperature": 0.0,
45+
},
46+
)
47+
48+
assert "temperature" in config.params
49+
assert config.params["temperature"] == 0.0
50+
51+
52+
class TestNewSupportedModels:
53+
"""Test that newly added models are accepted for openai/text provider."""
54+
55+
@pytest.mark.parametrize(
56+
"model",
57+
[
58+
"gpt-5.4-pro",
59+
"gpt-5.4-mini",
60+
"gpt-5.4-nano",
61+
"gpt-5",
62+
"gpt-4-turbo",
63+
"gpt-4",
64+
"gpt-3.5-turbo",
65+
],
66+
)
67+
def test_new_model_accepted(self, model: str) -> None:
68+
"""New models should be accepted for openai text provider."""
69+
config = KaapiCompletionConfig(
70+
provider="openai",
71+
type="text",
72+
params={"model": model},
73+
)
74+
75+
assert config.params["model"] == model
76+
77+
@pytest.mark.parametrize(
78+
"model",
79+
[
80+
"gpt-4o",
81+
"gpt-4o-mini",
82+
"gpt-4.1",
83+
"gpt-4.1-mini",
84+
"gpt-4.1-nano",
85+
"gpt-5.4",
86+
"gpt-5.1",
87+
"gpt-5-mini",
88+
"gpt-5-nano",
89+
"o1",
90+
"o1-preview",
91+
"o1-mini",
92+
],
93+
)
94+
def test_existing_models_still_accepted(self, model: str) -> None:
95+
"""Previously supported models should still be accepted."""
96+
config = KaapiCompletionConfig(
97+
provider="openai",
98+
type="text",
99+
params={"model": model},
100+
)
101+
102+
assert config.params["model"] == model
103+
104+
def test_unsupported_model_rejected(self) -> None:
105+
"""An unsupported model should raise a validation error."""
106+
with pytest.raises(ValidationError, match="not supported"):
107+
KaapiCompletionConfig(
108+
provider="openai",
109+
type="text",
110+
params={"model": "unsupported-model-xyz"},
111+
)

0 commit comments

Comments
 (0)