Skip to content

Commit 44b70ac

Browse files
authored
Evaluation: Add question ID (#553)
1 parent 6ac9391 commit 44b70ac

File tree

4 files changed

+212
-9
lines changed

4 files changed

+212
-9
lines changed

backend/app/crud/evaluations/langfuse.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def create_langfuse_dataset_run(
8888
ground_truth = result["ground_truth"]
8989
response_id = result.get("response_id")
9090
usage_raw = result.get("usage")
91+
question_id = result.get("question_id")
9192

9293
dataset_item = dataset_items_map.get(item_id)
9394
if not dataset_item:
@@ -105,6 +106,8 @@ def create_langfuse_dataset_run(
105106
}
106107
if response_id:
107108
metadata["response_id"] = response_id
109+
if question_id:
110+
metadata["question_id"] = question_id
108111

109112
# Create trace with basic info
110113
langfuse.trace(
@@ -250,7 +253,7 @@ def upload_dataset_to_langfuse(
250253
f"duplication_factor={duplication_factor}"
251254
)
252255

253-
def upload_item(item: dict[str, str], duplicate_num: int) -> bool:
256+
def upload_item(item: dict[str, str], duplicate_num: int, question_id: str) -> bool:
254257
try:
255258
langfuse.create_dataset_item(
256259
dataset_name=dataset_name,
@@ -260,6 +263,7 @@ def upload_item(item: dict[str, str], duplicate_num: int) -> bool:
260263
"original_question": item["question"],
261264
"duplicate_number": duplicate_num + 1,
262265
"duplication_factor": duplication_factor,
266+
"question_id": question_id,
263267
},
264268
)
265269
return True
@@ -275,19 +279,22 @@ def upload_item(item: dict[str, str], duplicate_num: int) -> bool:
275279
# Create or get dataset in Langfuse
276280
dataset = langfuse.create_dataset(name=dataset_name)
277281

278-
upload_tasks = [
279-
(item, duplicate_num)
280-
for item in items
281-
for duplicate_num in range(duplication_factor)
282-
]
282+
# Generate question_id for each unique question before duplication
283+
# All duplicates of the same question share the same question_id
284+
# Using 1-based integer IDs for easier sorting and grouping
285+
upload_tasks = []
286+
for idx, item in enumerate(items, start=1):
287+
question_id = idx
288+
for duplicate_num in range(duplication_factor):
289+
upload_tasks.append((item, duplicate_num, question_id))
283290

284291
# Upload items concurrently using ThreadPoolExecutor
285292
total_uploaded = 0
286293
with ThreadPoolExecutor(max_workers=4) as executor:
287294
# Submit all upload tasks and collect the futures
288295
futures = []
289-
for item, dup_num in upload_tasks:
290-
future = executor.submit(upload_item, item, dup_num)
296+
for item, dup_num, question_id in upload_tasks:
297+
future = executor.submit(upload_item, item, dup_num, question_id)
291298
futures.append(future)
292299

293300
for future in as_completed(futures):
@@ -416,6 +423,7 @@ def fetch_trace_scores_from_langfuse(
416423
"question": "",
417424
"llm_answer": "",
418425
"ground_truth_answer": "",
426+
"question_id": "",
419427
"scores": [],
420428
}
421429

@@ -433,11 +441,12 @@ def fetch_trace_scores_from_langfuse(
433441
elif isinstance(trace.output, str):
434442
trace_data["llm_answer"] = trace.output
435443

436-
# Get ground truth from metadata
444+
# Get ground truth and question_id from metadata
437445
if trace.metadata and isinstance(trace.metadata, dict):
438446
trace_data["ground_truth_answer"] = trace.metadata.get(
439447
"ground_truth", ""
440448
)
449+
trace_data["question_id"] = trace.metadata.get("question_id", "")
441450

442451
# Add scores from this trace
443452
if trace.scores:

backend/app/crud/evaluations/processing.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ def parse_evaluation_output(
154154
question = dataset_item["input"].get("question", "")
155155
ground_truth = dataset_item["expected_output"].get("answer", "")
156156

157+
# Extract question_id from dataset item metadata
158+
question_id = dataset_item.get("metadata", {}).get("question_id")
159+
157160
results.append(
158161
{
159162
"item_id": item_id,
@@ -162,6 +165,7 @@ def parse_evaluation_output(
162165
"ground_truth": ground_truth,
163166
"response_id": response_id,
164167
"usage": usage,
168+
"question_id": question_id,
165169
}
166170
)
167171

backend/app/tests/crud/evaluations/test_langfuse.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,91 @@ def test_create_langfuse_dataset_run_with_cost_tracking(self) -> None:
274274
mock_langfuse.flush.assert_called_once()
275275
assert mock_langfuse.trace.call_count == 2
276276

277+
def test_create_langfuse_dataset_run_with_question_id(self) -> None:
278+
"""Test that question_id is included in trace metadata."""
279+
mock_langfuse = MagicMock()
280+
mock_dataset = MagicMock()
281+
mock_generation = MagicMock()
282+
283+
mock_item1 = MagicMock()
284+
mock_item1.id = "item_1"
285+
mock_item1.observe.return_value.__enter__.return_value = "trace_id_1"
286+
287+
mock_dataset.items = [mock_item1]
288+
mock_langfuse.get_dataset.return_value = mock_dataset
289+
mock_langfuse.generation.return_value = mock_generation
290+
291+
results = [
292+
{
293+
"item_id": "item_1",
294+
"question": "What is 2+2?",
295+
"generated_output": "4",
296+
"ground_truth": "4",
297+
"response_id": "resp_123",
298+
"usage": {
299+
"input_tokens": 10,
300+
"output_tokens": 5,
301+
"total_tokens": 15,
302+
},
303+
"question_id": 1,
304+
},
305+
]
306+
307+
trace_id_mapping = create_langfuse_dataset_run(
308+
langfuse=mock_langfuse,
309+
dataset_name="test_dataset",
310+
run_name="test_run",
311+
results=results,
312+
model="gpt-4o",
313+
)
314+
315+
assert len(trace_id_mapping) == 1
316+
317+
# Verify trace was called with question_id in metadata
318+
trace_call = mock_langfuse.trace.call_args
319+
assert trace_call.kwargs["metadata"]["question_id"] == 1
320+
321+
# Verify generation was called with question_id in metadata
322+
generation_call = mock_langfuse.generation.call_args
323+
assert generation_call.kwargs["metadata"]["question_id"] == 1
324+
325+
def test_create_langfuse_dataset_run_without_question_id(self) -> None:
326+
"""Test that traces work without question_id (backwards compatibility)."""
327+
mock_langfuse = MagicMock()
328+
mock_dataset = MagicMock()
329+
330+
mock_item1 = MagicMock()
331+
mock_item1.id = "item_1"
332+
mock_item1.observe.return_value.__enter__.return_value = "trace_id_1"
333+
334+
mock_dataset.items = [mock_item1]
335+
mock_langfuse.get_dataset.return_value = mock_dataset
336+
337+
# Results without question_id
338+
results = [
339+
{
340+
"item_id": "item_1",
341+
"question": "What is 2+2?",
342+
"generated_output": "4",
343+
"ground_truth": "4",
344+
"response_id": "resp_123",
345+
"usage": None,
346+
},
347+
]
348+
349+
trace_id_mapping = create_langfuse_dataset_run(
350+
langfuse=mock_langfuse,
351+
dataset_name="test_dataset",
352+
run_name="test_run",
353+
results=results,
354+
)
355+
356+
assert len(trace_id_mapping) == 1
357+
358+
# Verify trace was called without question_id in metadata
359+
trace_call = mock_langfuse.trace.call_args
360+
assert "question_id" not in trace_call.kwargs["metadata"]
361+
277362

278363
class TestUpdateTracesWithCosineScores:
279364
"""Test updating Langfuse traces with cosine similarity scores."""
@@ -411,6 +496,80 @@ def test_upload_dataset_to_langfuse_duplication_metadata(self, valid_items):
411496
assert duplicate_numbers.count(2) == 3
412497
assert duplicate_numbers.count(3) == 3
413498

499+
def test_upload_dataset_to_langfuse_question_id_in_metadata(self, valid_items):
500+
"""Test that question_id is included in metadata as integer."""
501+
mock_langfuse = MagicMock()
502+
mock_dataset = MagicMock()
503+
mock_dataset.id = "dataset_123"
504+
mock_langfuse.create_dataset.return_value = mock_dataset
505+
506+
upload_dataset_to_langfuse(
507+
langfuse=mock_langfuse,
508+
items=valid_items,
509+
dataset_name="test_dataset",
510+
duplication_factor=1,
511+
)
512+
513+
calls = mock_langfuse.create_dataset_item.call_args_list
514+
assert len(calls) == 3
515+
516+
question_ids = []
517+
for call_args in calls:
518+
metadata = call_args.kwargs.get("metadata", {})
519+
assert "question_id" in metadata
520+
assert metadata["question_id"] is not None
521+
# Verify it's an integer (1-based index)
522+
assert isinstance(metadata["question_id"], int)
523+
question_ids.append(metadata["question_id"])
524+
525+
# Verify sequential IDs starting from 1
526+
assert sorted(question_ids) == [1, 2, 3]
527+
528+
def test_upload_dataset_to_langfuse_same_question_id_for_duplicates(
529+
self, valid_items
530+
):
531+
"""Test that all duplicates of the same question share the same question_id."""
532+
mock_langfuse = MagicMock()
533+
mock_dataset = MagicMock()
534+
mock_dataset.id = "dataset_123"
535+
mock_langfuse.create_dataset.return_value = mock_dataset
536+
537+
upload_dataset_to_langfuse(
538+
langfuse=mock_langfuse,
539+
items=valid_items,
540+
dataset_name="test_dataset",
541+
duplication_factor=3,
542+
)
543+
544+
calls = mock_langfuse.create_dataset_item.call_args_list
545+
assert len(calls) == 9 # 3 items * 3 duplicates
546+
547+
# Group calls by original_question
548+
question_ids_by_question: dict[str, set[int]] = {}
549+
for call_args in calls:
550+
metadata = call_args.kwargs.get("metadata", {})
551+
original_question = metadata.get("original_question")
552+
question_id = metadata.get("question_id")
553+
554+
# Verify question_id is an integer
555+
assert isinstance(question_id, int)
556+
557+
if original_question not in question_ids_by_question:
558+
question_ids_by_question[original_question] = set()
559+
question_ids_by_question[original_question].add(question_id)
560+
561+
# Verify each question has exactly one unique question_id across all duplicates
562+
for question, question_ids in question_ids_by_question.items():
563+
assert (
564+
len(question_ids) == 1
565+
), f"Question '{question}' has multiple question_ids: {question_ids}"
566+
567+
# Verify different questions have different question_ids (1, 2, 3)
568+
all_unique_ids: set[int] = set()
569+
for qid_set in question_ids_by_question.values():
570+
all_unique_ids.update(qid_set)
571+
assert all_unique_ids == {1, 2, 3} # 3 unique questions = IDs 1, 2, 3
572+
414573
def test_upload_dataset_to_langfuse_empty_items(self) -> None:
415574
"""Test with empty items list."""
416575
mock_langfuse = MagicMock()

backend/app/tests/crud/evaluations/test_processing.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def test_parse_evaluation_output_basic(self) -> None:
5252
"id": "item1",
5353
"input": {"question": "What is 2+2?"},
5454
"expected_output": {"answer": "4"},
55+
"metadata": {"question_id": 1},
5556
}
5657
]
5758

@@ -64,6 +65,36 @@ def test_parse_evaluation_output_basic(self) -> None:
6465
assert results[0]["ground_truth"] == "4"
6566
assert results[0]["response_id"] == "resp_123"
6667
assert results[0]["usage"]["total_tokens"] == 15
68+
assert results[0]["question_id"] == 1
69+
70+
def test_parse_evaluation_output_without_question_id(self) -> None:
71+
"""Test parsing dataset items without question_id (backwards compatibility)."""
72+
raw_results = [
73+
{
74+
"custom_id": "item1",
75+
"response": {
76+
"body": {
77+
"id": "resp_123",
78+
"output": "Answer text",
79+
"usage": {"total_tokens": 10},
80+
}
81+
},
82+
}
83+
]
84+
85+
dataset_items = [
86+
{
87+
"id": "item1",
88+
"input": {"question": "Test question?"},
89+
"expected_output": {"answer": "Test answer"},
90+
# No metadata / question_id
91+
}
92+
]
93+
94+
results = parse_evaluation_output(raw_results, dataset_items)
95+
96+
assert len(results) == 1
97+
assert results[0]["question_id"] is None
6798

6899
def test_parse_evaluation_output_simple_string(self) -> None:
69100
"""Test parsing with simple string output."""

0 commit comments

Comments (0)