Skip to content

Commit 292be60

Browse files
authored
Merge branch 'main' into enhancement/dynamic_batching
2 parents 3246e88 + ed9d789 commit 292be60

File tree

11 files changed

+603
-7
lines changed

11 files changed

+603
-7
lines changed

backend/app/crud/evaluations/batch.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -106,12 +106,12 @@ def build_evaluation_jsonl(
106106
body: dict[str, Any] = {
107107
"model": config.model,
108108
"instructions": config.instructions,
109-
"temperature": config.temperature
110-
if config.temperature is not None
111-
else 0.01,
112109
"input": question, # Add input from dataset
113110
}
114111

112+
if "temperature" in config.model_fields_set:
113+
body["temperature"] = config.temperature
114+
115115
# Add reasoning only if provided
116116
if config.reasoning:
117117
body["reasoning"] = {"effort": config.reasoning}
@@ -189,7 +189,7 @@ def start_evaluation_batch(
189189
"description": f"Evaluation: {eval_run.run_name}",
190190
"completion_window": "24h",
191191
# Store complete config for reference
192-
"evaluation_config": config.model_dump(exclude_none=True),
192+
"evaluation_config": config.model_dump(exclude_unset=True),
193193
}
194194

195195
# Step 5: Start batch job using generic infrastructure

backend/app/crud/evaluations/processing.py

Lines changed: 106 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,13 +37,82 @@
3737
create_langfuse_dataset_run,
3838
update_traces_with_cosine_scores,
3939
)
40-
from app.crud.job import get_batch_job
40+
from app.crud.job import get_batch_job, update_batch_job
4141
from app.models import EvaluationRun
42+
from app.models.batch_job import BatchJob, BatchJobUpdate
4243
from app.utils import get_langfuse_client, get_openai_client
4344

4445
logger = logging.getLogger(__name__)
4546

4647

48+
def _as_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` unchanged when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _error_message_from_entry(entry: Any) -> str:
    """
    Pull a human-readable error message out of one parsed error-file entry.

    Looks at ``response.body.error.message`` first and then at a top-level
    ``error.message``. Any level that is missing, ``null`` or not a JSON
    object is treated as absent instead of raising.

    Args:
        entry: Result of ``json.loads`` for a single JSONL line

    Returns:
        The message as a string, or "Unknown error" when none is found
    """
    record = _as_dict(entry)
    body = _as_dict(_as_dict(record.get("response")).get("body"))

    # NOTE(review): the top-level "error" fallback assumes the provider reports
    # request-level failures there when no response exists - confirm against
    # the batch output format documentation.
    for candidate in (body.get("error"), record.get("error")):
        message = _as_dict(candidate).get("message")
        if message:
            return str(message)

    return "Unknown error"


def _extract_batch_error_message(
    provider: OpenAIBatchProvider,
    error_file_id: str,
    batch_job: BatchJob,
    session: Session,
) -> str:
    """
    Download the error file, parse its JSONL entries, and extract the most
    common error message. Updates batch_job.error_message.

    Lines that are blank or not valid JSON are skipped. Entries whose nested
    fields are ``null`` or of an unexpected type are counted as
    "Unknown error" rather than aborting the whole summary.

    Args:
        provider: OpenAI batch provider instance (``download_file`` is used)
        error_file_id: OpenAI error file ID
        batch_job: BatchJob to update with error message
        session: Database session

    Returns:
        Human-readable error message with the top error and counts

    Raises:
        Whatever ``update_batch_job`` raises: persisting the message happens
        outside the try/except so those failures reach the caller.
    """
    try:
        error_content = provider.download_file(error_file_id)

        error_counts: dict[str, int] = {}
        for line in error_content.strip().split("\n"):
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            message = _error_message_from_entry(entry)
            error_counts[message] = error_counts.get(message, 0) + 1

        if error_counts:
            # max() keeps the first-seen message when counts are tied.
            top_error = max(error_counts, key=error_counts.get)
            top_count = error_counts[top_error]
            total = sum(error_counts.values())
            error_msg = f"{top_error} ({top_count}/{total} requests)"
        else:
            error_msg = "Batch completed with errors but could not parse error file"

    except Exception as e:
        # Best effort: a failed download/parse must not prevent the run from
        # being marked as failed, so fall back to a generic message.
        logger.error(
            f"[_extract_batch_error_message] Failed to extract errors | batch_job_id={batch_job.id} | {e}",
            exc_info=True,
        )
        error_msg = (
            f"Batch completed with all requests failed (error_file_id: {error_file_id})"
        )

    # Update batch_job with extracted error message (outside try/except
    # so persistence failures propagate to the caller)
    batch_job_update = BatchJobUpdate(error_message=error_msg)
    update_batch_job(
        session=session, batch_job=batch_job, batch_job_update=batch_job_update
    )

    logger.info(
        f"[_extract_batch_error_message] Extracted error | batch_job_id={batch_job.id} | {error_msg}"
    )

    return error_msg
114+
115+
47116
def parse_evaluation_output(
48117
raw_results: list[dict[str, Any]], dataset_items: list[dict[str, Any]]
49118
) -> list[dict[str, Any]]:
@@ -560,14 +629,49 @@ async def check_and_process_evaluation(
560629

561630
# IMPORTANT: Poll OpenAI to get the latest status before checking
562631
provider = OpenAIBatchProvider(client=openai_client)
563-
poll_batch_status(session=session, provider=provider, batch_job=batch_job)
632+
status_result = poll_batch_status(
633+
session=session, provider=provider, batch_job=batch_job
634+
)
564635

565636
# Refresh batch_job to get the updated provider_status
566637
session.refresh(batch_job)
567638
provider_status = batch_job.provider_status
568639

569640
# Handle different provider statuses
570641
if provider_status == "completed":
642+
# Check if batch completed but all requests failed
643+
# (output_file_id is absent, error_file_id is present)
644+
if not status_result.get(
645+
"provider_output_file_id", batch_job.provider_output_file_id
646+
) and status_result.get("error_file_id"):
647+
error_msg = _extract_batch_error_message(
648+
provider=provider,
649+
error_file_id=status_result["error_file_id"],
650+
batch_job=batch_job,
651+
session=session,
652+
)
653+
654+
eval_run = update_evaluation_run(
655+
session=session,
656+
eval_run=eval_run,
657+
status="failed",
658+
error_message=error_msg,
659+
)
660+
661+
logger.error(
662+
f"[check_and_process_evaluation] {log_prefix} Batch completed with all requests failed | {error_msg}"
663+
)
664+
665+
return {
666+
"run_id": eval_run.id,
667+
"run_name": eval_run.run_name,
668+
"previous_status": previous_status,
669+
"current_status": "failed",
670+
"provider_status": provider_status,
671+
"action": "failed",
672+
"error": error_msg,
673+
}
674+
571675
# Process the completed evaluation
572676
await process_completed_evaluation(
573677
eval_run=eval_run,

backend/app/models/llm/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@
2222
"o1",
2323
"o1-preview",
2424
"o1-mini",
25+
"gpt-5.4-pro",
26+
"gpt-5.4-mini",
27+
"gpt-5.4-nano",
28+
"gpt-5",
29+
"gpt-4-turbo",
30+
"gpt-4",
31+
"gpt-3.5-turbo",
2532
],
2633
}
2734

backend/app/models/llm/request.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ def validate_params(self):
251251
provider = self.provider
252252
provider_was_auto_assigned = True
253253

254+
user_provided_temperature = "temperature" in self.params
254255
validated = model_class.model_validate(self.params)
255256

256257
if provider is not None:
@@ -288,6 +289,8 @@ def validate_params(self):
288289
)
289290

290291
self.params = validated.model_dump(exclude_none=True)
292+
if not user_provided_temperature:
293+
self.params.pop("temperature", None)
291294
return self
292295

293296

backend/app/tests/api/routes/test_evaluation.py

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
731731
assert request_dict["body"]["input"] == f"Question {i}"
732732
assert request_dict["body"]["model"] == "gpt-4o"
733733

734+
def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
    """An explicitly supplied temperature is carried into the JSONL request body."""
    single_item = {
        "id": "item1",
        "input": {"question": "Test question"},
        "expected_output": {"answer": "Test answer"},
        "metadata": {},
    }
    params = TextLLMParams(model="gpt-4o", temperature=0.5)

    requests = build_evaluation_jsonl([single_item], params)

    assert len(requests) == 1
    request_body = requests[0]["body"]
    assert "temperature" in request_body
    assert request_body["temperature"] == 0.5
752+
753+
def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
    """A config created without a temperature must not emit one in the JSONL body."""
    single_item = {
        "id": "item1",
        "input": {"question": "Test question"},
        "expected_output": {"answer": "Test answer"},
        "metadata": {},
    }
    # Built from the model name alone, so temperature is absent from
    # model_fields_set.
    params = TextLLMParams(model="gpt-4o")

    requests = build_evaluation_jsonl([single_item], params)

    assert len(requests) == 1
    request_body = requests[0]["body"]
    assert "temperature" not in request_body
771+
772+
def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
    self,
) -> None:
    """An explicit temperature of 0.0 is falsy but must still reach the body."""
    single_item = {
        "id": "item1",
        "input": {"question": "Test question"},
        "expected_output": {"answer": "Test answer"},
        "metadata": {},
    }
    params = TextLLMParams(model="gpt-4o", temperature=0.0)

    requests = build_evaluation_jsonl([single_item], params)

    assert len(requests) == 1
    request_body = requests[0]["body"]
    assert "temperature" in request_body
    assert request_body["temperature"] == 0.0
792+
734793

735794
class TestGetEvaluationRunStatus:
736795
"""Test GET /evaluations/{evaluation_id} endpoint."""

0 commit comments

Comments
 (0)