 from sqlmodel import Session, select
 
 from app.crud.evaluations.processing import (
+    _extract_batch_error_message,
     check_and_process_evaluation,
     parse_evaluation_output,
     process_completed_embedding_batch,
@@ -653,11 +654,12 @@ async def test_check_and_process_evaluation_completed(
             db, project_id=test_dataset.project_id, use_kaapi_schema=True
         )
 
-        # Create batch job
+        # Create batch job with output file (successful completion)
         batch_job = BatchJob(
             provider="openai",
             provider_batch_id="batch_abc",
             provider_status="completed",
+            provider_output_file_id="output-file-123",
             job_type=BatchJobType.EVALUATION,
             total_items=2,
             status="submitted",
@@ -688,6 +690,12 @@ async def test_check_and_process_evaluation_completed(
         db.refresh(eval_run)
 
         mock_get_batch.return_value = batch_job
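+        # Simulate poll_batch_status reporting a completed batch with an output file to parse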
+        mock_poll.return_value = {
+            "provider_status": "completed",
+            "provider_output_file_id": "output-file-123",
+            "error_file_id": None,
+            "request_counts": {"total": 2, "completed": 2, "failed": 0},
+        }
         mock_process.return_value = eval_run
 
         mock_openai = MagicMock()
@@ -756,6 +764,111 @@ async def test_check_and_process_evaluation_failed(
         db.refresh(eval_run)
 
         mock_get_batch.return_value = batch_job
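+        # Simulate poll_batch_status reporting that the batch itself failed at the provider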
+        mock_poll.return_value = {
+            "provider_status": "failed",
+            "provider_output_file_id": None,
+            "error_file_id": None,
+            "error_message": "Provider error",
+            "request_counts": {"total": 2, "completed": 0, "failed": 2},
+        }
+
+        mock_openai = MagicMock()
+        mock_langfuse = MagicMock()
+
+        result = await check_and_process_evaluation(
+            eval_run=eval_run,
+            session=db,
+            openai_client=mock_openai,
+            langfuse=mock_langfuse,
+        )
+
+        assert result["action"] == "failed"
+        assert result["current_status"] == "failed"
+        db.refresh(eval_run)
+        assert eval_run.status == "failed"
+
+    @pytest.mark.asyncio
+    @patch("app.crud.evaluations.processing.get_batch_job")
+    @patch("app.crud.evaluations.processing.poll_batch_status")
+    @patch("app.crud.evaluations.processing.OpenAIBatchProvider")
+    async def test_check_and_process_evaluation_completed_all_requests_failed(
+        self,
+        mock_provider_cls,
+        mock_poll,
+        mock_get_batch,
+        db: Session,
+        test_dataset,
+    ):
+        """Test batch completed but all requests failed — both batch_job and eval_run get error_message."""
+        config = create_test_config(
+            db, project_id=test_dataset.project_id, use_kaapi_schema=True
+        )
+
+        # Create batch job: completed status but NO provider_output_file_id
+        batch_job = BatchJob(
+            provider="openai",
+            provider_batch_id="batch_all_fail",
+            provider_status="completed",
+            job_type=BatchJobType.EVALUATION,
+            total_items=9,
+            status="submitted",
+            organization_id=test_dataset.organization_id,
+            project_id=test_dataset.project_id,
+            inserted_at=now(),
+            updated_at=now(),
+        )
+        db.add(batch_job)
+        db.commit()
+        db.refresh(batch_job)
+
+        eval_run = create_evaluation_run(
+            session=db,
+            run_name="test_run_all_fail",
+            dataset_name=test_dataset.name,
+            dataset_id=test_dataset.id,
+            config_id=config.id,
+            config_version=1,
+            organization_id=test_dataset.organization_id,
+            project_id=test_dataset.project_id,
+        )
+        eval_run.batch_job_id = batch_job.id
+        eval_run.status = "processing"
+        db.add(eval_run)
+        db.commit()
+        db.refresh(eval_run)
+
+        mock_get_batch.return_value = batch_job
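+        # Provider reports "completed", but every request failed: no output file, only an error file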
+        mock_poll.return_value = {
+            "provider_status": "completed",
+            "provider_output_file_id": None,
+            "error_file_id": "error-file-abc",
+            "request_counts": {"total": 9, "completed": 0, "failed": 9},
+        }
+
+        # Mock the provider instance returned by OpenAIBatchProvider(client=...)
+        # to return realistic error file content
+        error_lines = "\n".join(
+            [
+                json.dumps(
+                    {
+                        "id": f"batch_req_{i}",
+                        "custom_id": f"id-{i}",
+                        "response": {
+                            "status_code": 400,
+                            "body": {
+                                "error": {
+                                    "message": "Unsupported parameter: 'temperature' is not supported with this model.",
+                                }
+                            },
+                        },
+                        "error": None,
+                    }
+                )
+                for i in range(9)
+            ]
+        )
+        mock_provider_instance = mock_provider_cls.return_value
+        mock_provider_instance.download_file.return_value = error_lines
 
         mock_openai = MagicMock()
         mock_langfuse = MagicMock()
@@ -769,8 +882,123 @@ async def test_check_and_process_evaluation_failed(
 
         assert result["action"] == "failed"
         assert result["current_status"] == "failed"
+        assert "temperature" in result["error"]
+        assert "(9/9 requests)" in result["error"]
+
+        # Verify eval_run updated with error
         db.refresh(eval_run)
         assert eval_run.status == "failed"
+        assert "temperature" in eval_run.error_message
+
+        # Verify batch_job updated with error
+        db.refresh(batch_job)
+        assert "temperature" in batch_job.error_message
+        assert "(9/9 requests)" in batch_job.error_message
+
+
+class TestExtractBatchErrorMessage:
+    """Test extracting error messages from OpenAI error files."""
+
+    def test_single_unique_error(self) -> None:
+        """Test error file where all requests have the same error."""
+        error_lines = []
+        for i in range(5):
+            error_lines.append(
+                json.dumps(
+                    {
+                        "id": f"batch_req_{i}",
+                        "custom_id": f"id-{i}",
+                        "response": {
+                            "status_code": 400,
+                            "body": {
+                                "error": {
+                                    "message": "Unsupported parameter: 'temperature' is not supported with this model.",
+                                    "type": "invalid_request_error",
+                                }
+                            },
+                        },
+                        "error": None,
+                    }
+                )
+            )
+        error_content = "\n".join(error_lines)
+
+        mock_provider = MagicMock()
+        mock_provider.download_file.return_value = error_content
+
+        mock_session = MagicMock()
+        mock_batch_job = MagicMock()
+        mock_batch_job.id = 1
+
+        result = _extract_batch_error_message(
+            provider=mock_provider,
+            error_file_id="error-file-123",
+            batch_job=mock_batch_job,
+            session=mock_session,
+        )
+
+        assert "Unsupported parameter" in result
+        assert "(5/5 requests)" in result
+        mock_provider.download_file.assert_called_once_with("error-file-123")
+
+    def test_multiple_unique_errors_picks_most_common(self) -> None:
+        """Test error file with mixed errors; picks the most frequent one."""
+        error_lines = []
+        # 3 requests with temperature error
+        for i in range(3):
+            error_lines.append(
+                json.dumps(
+                    {
+                        "id": f"batch_req_{i}",
+                        "custom_id": f"id-{i}",
+                        "response": {
+                            "status_code": 400,
+                            "body": {
+                                "error": {
+                                    "message": "Unsupported parameter: 'temperature'",
+                                }
+                            },
+                        },
+                        "error": None,
+                    }
+                )
+            )
+        # 1 request with rate limit error
+        error_lines.append(
+            json.dumps(
+                {
+                    "id": "batch_req_3",
+                    "custom_id": "id-3",
+                    "response": {
+                        "status_code": 429,
+                        "body": {
+                            "error": {
+                                "message": "Rate limit exceeded",
+                            }
+                        },
+                    },
+                    "error": None,
+                }
+            )
+        )
+        error_content = "\n".join(error_lines)
+
+        mock_provider = MagicMock()
+        mock_provider.download_file.return_value = error_content
+
+        mock_session = MagicMock()
+        mock_batch_job = MagicMock()
+        mock_batch_job.id = 1
+
+        result = _extract_batch_error_message(
+            provider=mock_provider,
+            error_file_id="error-file-123",
+            batch_job=mock_batch_job,
+            session=mock_session,
+        )
+
+        assert "Unsupported parameter: 'temperature'" in result
+        assert "(3/4 requests)" in result
 
 
 class TestPollAllPendingEvaluations: