51 changes: 38 additions & 13 deletions src/unitxt/llm_as_judge.py
@@ -63,6 +63,8 @@ class LLMJudge(BulkInstanceMetric):
generate_summaries: bool = True
format = "formats.chat_api"
include_prompts_in_result: bool = False
response_variable_name_field: Optional[str] = None
response_variable_name: str = "response"
criteria_field: str = None
criteria: Criteria = None
logger = get_logger()
@@ -103,6 +105,16 @@ def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
for td in task_data
]

def get_response_variable_names(self, task_data: List[Dict[str, Any]]) -> List[str]:
if self.response_variable_name_field is None:
return [self.response_variable_name] * len(task_data)
try:
return [td[self.response_variable_name_field] for td in task_data]
except KeyError as e:
raise UnitxtError(
f"The response variable name field `{self.response_variable_name_field}` was not found in the task data instance."
) from e

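For illustration only, a standalone sketch of the lookup above (the per-instance key name `output_kind` is hypothetical and not part of this change):

from typing import Any, Dict, List, Optional

def resolve_response_variable_names(
    task_data: List[Dict[str, Any]],
    response_variable_name_field: Optional[str] = None,
    response_variable_name: str = "response",
) -> List[str]:
    # No field configured: every instance uses the class-level default name.
    if response_variable_name_field is None:
        return [response_variable_name] * len(task_data)
    # Field configured: each task_data entry must carry that key.
    return [td[response_variable_name_field] for td in task_data]

# With a hypothetical per-instance field:
resolve_response_variable_names(
    [{"output_kind": "summary"}, {"output_kind": "translation"}],
    response_variable_name_field="output_kind",
)  # -> ["summary", "translation"]
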
def perform_evaluation_step(
self,
instances: list,
@@ -184,6 +196,8 @@ def prepare(self):
"response": str,
"criteria_description": str,
"display_options_instruction": str,
"response_variable_name": str,
"response_variable_name_title": str,
},
reference_fields={},
prediction_type=str,
@@ -202,6 +216,7 @@ def prepare(self):
"criteria_description": str,
"score_option_instruction": str,
"options": list,
"response_variable_name": str,
},
reference_fields={},
prediction_type=str,
@@ -341,6 +356,7 @@ def compute(
criterias = self.get_criterias(task_data, evaluations_count)
self.set_main_score(criterias)
contexts = self.get_contexts(task_data)
response_variable_names = self.get_response_variable_names(task_data)
if self.check_positional_bias:
criterias += [
CriteriaWithOptions(
@@ -352,6 +368,7 @@
for criteria in criterias
]
contexts += contexts
response_variable_names += response_variable_names
predictions += predictions

parsed_criterias = [
@@ -373,13 +390,16 @@
"response": prediction,
"display_options_instruction": display_options_instruction,
"criteria_description": criteria_description,
"response_variable_name": response_variable_name,
"response_variable_name_title": response_variable_name.capitalize(),
"data_classification_policy": ["public"],
}
for context, prediction, criteria_description, display_options_instruction in zip(
for context, prediction, criteria_description, display_options_instruction, response_variable_name in zip(
contexts,
predictions,
criteria_description_list,
display_options_instruction_list,
response_variable_names
)
]
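For illustration (all values below are hypothetical, not taken from this change), a single instance built by the comprehension above could look like:

# Hypothetical example of one assessment instance, assuming the per-instance
# response variable name is "summary".
example_assessment_instance = {
    "context_variables": {"source_document": "..."},
    "response": "The article argues that ...",
    "display_options_instruction": 'Choose "Yes" if ...\nChoose "No" if ...\n',
    "criteria_description": "Is the summary faithful to the source document?",
    "response_variable_name": "summary",
    "response_variable_name_title": "Summary",
    "data_classification_policy": ["public"],
}
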
assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
@@ -416,12 +436,14 @@ def compute(
"criteria_description": criteria_description,
"score_option_instruction": score_option_instruction,
"options": criteria_option_names,
"response_variable_name": response_variable_name,
"data_classification_policy": ["public"],
}
for criteria_description, score_option_instruction, criteria_option_names in zip(
for criteria_description, score_option_instruction, criteria_option_names, response_variable_name in zip(
criteria_description_list,
score_option_instruction_list,
criteria_option_names_list,
response_variable_names
)
]

@@ -477,6 +499,8 @@ def prepare(self):
"option_b": str,
"criteria_name": str,
"criteria_description": str,
"response_variable_name": str,
"response_variable_name_title": str,
},
reference_fields={},
prediction_type=str,
@@ -494,6 +518,7 @@ def prepare(self):
input_fields={
"score_option_instruction": str,
"options": list,
"response_variable_name": str,
},
reference_fields={},
prediction_type=str,
@@ -754,9 +779,11 @@ def compute(

criterias = self.get_criterias(task_data, instances_count)
contexts = self.get_contexts(task_data)
response_variable_names = self.get_response_variable_names(task_data)
if self.check_positional_bias:
criterias.extend(criterias)
contexts.extend(contexts)
response_variable_names.extend(response_variable_names)
for response_pairs, option_pairs in zip(
response_pairs_list, option_pairs_list
):
@@ -776,6 +803,8 @@ def compute(
"option_b": option_pair[1],
"criteria_name": criterias[i].name,
"criteria_description": criterias[i].description,
"response_variable_name": response_variable_names[i],
"response_variable_name_title": response_variable_names[i].capitalize(),
"data_classification_policy": ["public"],
}
for i, (response_pairs, option_pairs) in enumerate(
@@ -838,31 +867,27 @@ def compute(
)
self.logger.info("The summary was generated successfully.")

score_option_instruction_list = [
"".join(
score_option_instructions_list = [
["".join(
[
f'Choose "{option}" if Response {option} is better quality.\n'
for option in option_pair
]
)
) for option_pair in option_pairs]
for option_pairs in option_pairs_list
for option_pair in option_pairs
]

option_selection_instances = [
{
"options": [f"Response {option}" for option in option_pair],
"score_option_instruction": score_option_instruction,
"response_variable_name": response_variable_names[i],
"data_classification_policy": ["public"],
}
for option_pair, score_option_instruction in zip(
[
option_pair
for option_pairs in option_pairs_list
for option_pair in option_pairs
],
score_option_instruction_list,
for i, (score_option_instructions, option_pairs) in enumerate(
zip(score_option_instructions_list, option_pairs_list)
)
for score_option_instruction, option_pair in zip(score_option_instructions, option_pairs)
]

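A standalone sketch (with hypothetical example data) of the regrouping above: instructions are now built per instance, so each instance's instruction list can be zipped against its own option pairs and response variable name:

# Hypothetical data: two instances, the first compared against two other systems.
option_pairs_list = [[("A", "B"), ("A", "C")], [("A", "B")]]
response_variable_names = ["summary", "response"]

score_option_instructions_list = [
    [
        "".join(
            f'Choose "{option}" if Response {option} is better quality.\n'
            for option in option_pair
        )
        for option_pair in option_pairs
    ]
    for option_pairs in option_pairs_list
]

option_selection_instances = [
    {
        "options": [f"Response {option}" for option in option_pair],
        "score_option_instruction": score_option_instruction,
        "response_variable_name": response_variable_names[i],
        "data_classification_policy": ["public"],
    }
    for i, (score_option_instructions, option_pairs) in enumerate(
        zip(score_option_instructions_list, option_pairs_list)
    )
    for score_option_instruction, option_pair in zip(score_option_instructions, option_pairs)
]
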
previous_messages = [
26 changes: 13 additions & 13 deletions src/unitxt/llm_as_judge_chat_templates.py
@@ -3,20 +3,20 @@
direct_template_dict = {
"assessment": InputOutputTemplate(
input_format="""
You are presented with a response generated subject to a context.
The context includes information relevant to the nature or generation of the response.
You will assess the quality of the response subject to an evaluation criteria.
You are presented with a {response_variable_name} generated subject to a context.
The context includes information relevant to the nature or generation of the {response_variable_name}.
You will assess the quality of the {response_variable_name} subject to an evaluation criteria.
###Context:
{context_variables}

###Response:
###{response_variable_name_title}:
{response}

###Evaluation criteria:
{criteria_description}
{display_options_instruction}

Briefly assess the quality of the response subject to the evaluation criteria.
Briefly assess the quality of the {response_variable_name} subject to the evaluation criteria.
Focus on the evaluation criteria during assessment, do not provide a general assessment.
Assessment:

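For illustration, a minimal rendering sketch (the template text is abridged and all values are hypothetical) of how the new placeholders resolve when the response variable name is "summary":

# Abridged from the direct assessment template above; not the full template.
abridged_template = (
    "You are presented with a {response_variable_name} generated subject to a context.\n"
    "###{response_variable_name_title}:\n"
    "{response}\n"
)
print(abridged_template.format(
    response_variable_name="summary",
    response_variable_name_title="Summary",
    response="The article argues that ...",
))
# You are presented with a summary generated subject to a context.
# ###Summary:
# The article argues that ...
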
@@ -29,7 +29,7 @@
Summary:"""
),
"answer": InputOutputTemplate(
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the {response_variable_name}.
###Evaluation criteria:
{criteria_description}
{score_option_instruction}
@@ -41,8 +41,8 @@

pairwise_template_dict = {
"assessment": InputOutputTemplate(
input_format="""You are provided a pair of responses (Response {option_a} and Response {option_b}) generated subject to a context.
You will choose the better quality response subject to the evaluation criteria.
input_format="""You are provided a pair of {response_variable_name}s ({response_variable_name_title} {option_a} and {response_variable_name_title} {option_b}) generated subject to a context.
You will choose the better quality {response_variable_name} subject to the evaluation criteria.

This is the context:
{context_variables}
@@ -51,25 +51,25 @@
{criteria_name}
{criteria_description}

Response {option_a}:
{response_variable_name_title} {option_a}:
{response_a}
Response {option_b}:
{response_variable_name_title} {option_b}:
{response_b}

Keeping the evaluation criteria in mind, briefly assess which response is better.
Keeping the evaluation criteria in mind, briefly assess which {response_variable_name} is better.
Focus on the evaluation criteria during assessment, do not provide a general assessment.
Assessment:

Let's think step by step """
),
"summarization": InputOutputTemplate(
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which response won.
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which {response_variable_name} won.

Assessment: {assessment}
Summary:"""
),
"answer": InputOutputTemplate(
input_format="""Now considering the evaluation criteria, which response is better quality? Only include the chosen response.
input_format="""Now considering the evaluation criteria, which {response_variable_name} is better quality? Only include the chosen {response_variable_name}.
{score_option_instruction}
Answer: """,
postprocessors=["processors.match_closest_option"],
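Finally, a rendering sketch for the pairwise header lines above (values hypothetical), showing how the title form produced by .capitalize() in llm_as_judge.py labels each candidate:

# Hypothetical rendering of the pairwise response headers, assuming the
# response variable name is "summary".
pairwise_header = "{response_variable_name_title} {option}:\n{response}"
for option, response in [("A", "First candidate summary ..."), ("B", "Second candidate summary ...")]:
    print(pairwise_header.format(
        response_variable_name_title="summary".capitalize(),  # -> "Summary"
        option=option,
        response=response,
    ))
# Summary A:
# First candidate summary ...
# Summary B:
# Second candidate summary ...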