@@ -274,6 +274,91 @@ def test_create_langfuse_dataset_run_with_cost_tracking(self) -> None:
274274 mock_langfuse .flush .assert_called_once ()
275275 assert mock_langfuse .trace .call_count == 2
276276
def test_create_langfuse_dataset_run_with_question_id(self) -> None:
    """Check that a result's question_id reaches trace and generation metadata.

    A single mocked dataset item is matched by one result dict carrying
    ``question_id = 1``; the metadata keyword passed to both
    ``langfuse.trace`` and ``langfuse.generation`` must contain that value.
    """
    # One dataset item whose observe() context manager yields a trace id.
    dataset_item = MagicMock()
    dataset_item.id = "item_1"
    dataset_item.observe.return_value.__enter__.return_value = "trace_id_1"

    dataset = MagicMock()
    dataset.items = [dataset_item]

    langfuse_client = MagicMock()
    langfuse_client.get_dataset.return_value = dataset
    langfuse_client.generation.return_value = MagicMock()

    token_usage = {
        "input_tokens": 10,
        "output_tokens": 5,
        "total_tokens": 15,
    }
    single_result = {
        "item_id": "item_1",
        "question": "What is 2+2?",
        "generated_output": "4",
        "ground_truth": "4",
        "response_id": "resp_123",
        "usage": token_usage,
        "question_id": 1,
    }

    mapping = create_langfuse_dataset_run(
        langfuse=langfuse_client,
        dataset_name="test_dataset",
        run_name="test_run",
        results=[single_result],
        model="gpt-4o",
    )

    assert len(mapping) == 1

    # The most recent trace() call must carry the question_id.
    trace_metadata = langfuse_client.trace.call_args.kwargs["metadata"]
    assert trace_metadata["question_id"] == 1

    # The most recent generation() call must carry it as well.
    generation_metadata = langfuse_client.generation.call_args.kwargs["metadata"]
    assert generation_metadata["question_id"] == 1
324+
def test_create_langfuse_dataset_run_without_question_id(self) -> None:
    """Check that results lacking question_id still produce a trace.

    Backwards-compatibility case: the result dict has no ``question_id``
    key, and the metadata passed to ``langfuse.trace`` must not contain one.
    """
    # One dataset item whose observe() context manager yields a trace id.
    dataset_item = MagicMock()
    dataset_item.id = "item_1"
    dataset_item.observe.return_value.__enter__.return_value = "trace_id_1"

    dataset = MagicMock()
    dataset.items = [dataset_item]

    langfuse_client = MagicMock()
    langfuse_client.get_dataset.return_value = dataset

    # Deliberately no "question_id" key in this result.
    legacy_result = {
        "item_id": "item_1",
        "question": "What is 2+2?",
        "generated_output": "4",
        "ground_truth": "4",
        "response_id": "resp_123",
        "usage": None,
    }

    mapping = create_langfuse_dataset_run(
        langfuse=langfuse_client,
        dataset_name="test_dataset",
        run_name="test_run",
        results=[legacy_result],
    )

    assert len(mapping) == 1

    # The most recent trace() call must not mention question_id at all.
    trace_metadata = langfuse_client.trace.call_args.kwargs["metadata"]
    assert "question_id" not in trace_metadata
361+
277362
278363class TestUpdateTracesWithCosineScores :
279364 """Test updating Langfuse traces with cosine similarity scores."""
@@ -411,6 +496,80 @@ def test_upload_dataset_to_langfuse_duplication_metadata(self, valid_items):
411496 assert duplicate_numbers .count (2 ) == 3
412497 assert duplicate_numbers .count (3 ) == 3
413498
def test_upload_dataset_to_langfuse_question_id_in_metadata(self, valid_items):
    """Check that every uploaded item carries an integer question_id.

    With ``duplication_factor=1`` the test expects three
    ``create_dataset_item`` calls whose metadata question_ids are exactly
    1, 2 and 3.
    """
    created_dataset = MagicMock()
    created_dataset.id = "dataset_123"
    langfuse_client = MagicMock()
    langfuse_client.create_dataset.return_value = created_dataset

    upload_dataset_to_langfuse(
        langfuse=langfuse_client,
        items=valid_items,
        dataset_name="test_dataset",
        duplication_factor=1,
    )

    item_calls = langfuse_client.create_dataset_item.call_args_list
    assert len(item_calls) == 3

    # Pull the metadata keyword out of every create_dataset_item call.
    all_metadata = [
        item_call.kwargs.get("metadata", {}) for item_call in item_calls
    ]
    assert all("question_id" in entry for entry in all_metadata)

    collected_ids = [entry["question_id"] for entry in all_metadata]
    assert all(qid is not None for qid in collected_ids)
    # Each ID must be an integer (1-based index).
    assert all(isinstance(qid, int) for qid in collected_ids)

    # IDs are sequential, starting from 1.
    assert sorted(collected_ids) == [1, 2, 3]
527+
def test_upload_dataset_to_langfuse_same_question_id_for_duplicates(
    self, valid_items
):
    """Check that duplicated items keep the question_id of their source question.

    With ``duplication_factor=3`` the test expects nine
    ``create_dataset_item`` calls; grouping them by the
    ``original_question`` metadata value must yield exactly one question_id
    per group, and the groups together must cover the IDs 1, 2 and 3.
    """
    created_dataset = MagicMock()
    created_dataset.id = "dataset_123"
    langfuse_client = MagicMock()
    langfuse_client.create_dataset.return_value = created_dataset

    upload_dataset_to_langfuse(
        langfuse=langfuse_client,
        items=valid_items,
        dataset_name="test_dataset",
        duplication_factor=3,
    )

    item_calls = langfuse_client.create_dataset_item.call_args_list
    assert len(item_calls) == 9  # 3 items * 3 duplicates

    # Bucket the observed question_ids under their original question text.
    question_ids_by_question: dict[str, set[int]] = {}
    for item_call in item_calls:
        item_metadata = item_call.kwargs.get("metadata", {})
        qid = item_metadata.get("question_id")

        # Every question_id must be an integer.
        assert isinstance(qid, int)

        source_question = item_metadata.get("original_question")
        question_ids_by_question.setdefault(source_question, set()).add(qid)

    # All duplicates of one question must share a single question_id.
    for question, question_ids in question_ids_by_question.items():
        assert (
            len(question_ids) == 1
        ), f"Question '{ question } ' has multiple question_ids: { question_ids } "

    # Distinct questions must map to distinct IDs (1, 2, 3).
    combined_ids: set[int] = set().union(*question_ids_by_question.values())
    assert combined_ids == {1, 2, 3}  # 3 unique questions = IDs 1, 2, 3
572+
414573 def test_upload_dataset_to_langfuse_empty_items (self ) -> None :
415574 """Test with empty items list."""
416575 mock_langfuse = MagicMock ()
0 commit comments