diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py
index 027df76cb1..0234e09525 100644
--- a/src/unitxt/llm_as_judge.py
+++ b/src/unitxt/llm_as_judge.py
@@ -63,6 +63,8 @@ class LLMJudge(BulkInstanceMetric):
     generate_summaries: bool = True
     format = "formats.chat_api"
     include_prompts_in_result: bool = False
+    response_variable_name_field: Optional[str] = None
+    response_variable_name: str = "response"
     criteria_field: str = None
     criteria: Criteria = None
     logger = get_logger()
@@ -103,6 +105,16 @@ def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
             for td in task_data
         ]

+    def get_response_variable_names(self, task_data: List[Dict[str, Any]]) -> List[str]:
+        if self.response_variable_name_field is None:
+            return [self.response_variable_name] * len(task_data)
+        try:
+            return [td[self.response_variable_name_field] for td in task_data]
+        except KeyError as e:
+            raise UnitxtError(
+                f"The response variable name field `{self.response_variable_name_field}` was not found in the task data instance."
+            ) from e
+
     def perform_evaluation_step(
         self,
         instances: list,
@@ -184,6 +196,8 @@ def prepare(self):
                 "response": str,
                 "criteria_description": str,
                 "display_options_instruction": str,
+                "response_variable_name": str,
+                "response_variable_name_title": str,
             },
             reference_fields={},
             prediction_type=str,
@@ -202,6 +216,7 @@ def prepare(self):
                 "criteria_description": str,
                 "score_option_instruction": str,
                 "options": list,
+                "response_variable_name": str,
             },
             reference_fields={},
             prediction_type=str,
@@ -341,6 +356,7 @@ def compute(
         criterias = self.get_criterias(task_data, evaluations_count)
         self.set_main_score(criterias)
         contexts = self.get_contexts(task_data)
+        response_variable_names = self.get_response_variable_names(task_data)
         if self.check_positional_bias:
             criterias += [
                 CriteriaWithOptions(
@@ -352,6 +368,7 @@ def compute(
                 for criteria in criterias
             ]
             contexts += contexts
+            response_variable_names += response_variable_names
             predictions += predictions

         parsed_criterias = [
@@ -373,13 +390,16 @@ def compute(
                 "response": prediction,
                 "display_options_instruction": display_options_instruction,
                 "criteria_description": criteria_description,
+                "response_variable_name": response_variable_name,
+                "response_variable_name_title": response_variable_name.capitalize(),
                 "data_classification_policy": ["public"],
             }
-            for context, prediction, criteria_description, display_options_instruction in zip(
+            for context, prediction, criteria_description, display_options_instruction, response_variable_name in zip(
                 contexts,
                 predictions,
                 criteria_description_list,
                 display_options_instruction_list,
+                response_variable_names
             )
         ]
         assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
@@ -416,12 +436,14 @@ def compute(
                 "criteria_description": criteria_description,
                 "score_option_instruction": score_option_instruction,
                 "options": criteria_option_names,
+                "response_variable_name": response_variable_name,
                 "data_classification_policy": ["public"],
             }
-            for criteria_description, score_option_instruction, criteria_option_names in zip(
+            for criteria_description, score_option_instruction, criteria_option_names, response_variable_name in zip(
                 criteria_description_list,
                 score_option_instruction_list,
                 criteria_option_names_list,
+                response_variable_names
             )
         ]

@@ -477,6 +499,8 @@ def prepare(self):
                 "option_b": str,
                 "criteria_name": str,
                 "criteria_description": str,
+                "response_variable_name": str,
+                "response_variable_name_title": str,
             },
             reference_fields={},
             prediction_type=str,
@@ -494,6 +518,7 @@ def prepare(self):
             input_fields={
                 "score_option_instruction": str,
                 "options": list,
+                "response_variable_name": str,
             },
             reference_fields={},
             prediction_type=str,
@@ -754,9 +779,11 @@ def compute(

         criterias = self.get_criterias(task_data, instances_count)
         contexts = self.get_contexts(task_data)
+        response_variable_names = self.get_response_variable_names(task_data)
         if self.check_positional_bias:
             criterias.extend(criterias)
             contexts.extend(contexts)
+            response_variable_names.extend(response_variable_names)
         for response_pairs, option_pairs in zip(
             response_pairs_list, option_pairs_list
         ):
@@ -776,6 +803,8 @@ def compute(
                 "option_b": option_pair[1],
                 "criteria_name": criterias[i].name,
                 "criteria_description": criterias[i].description,
+                "response_variable_name": response_variable_names[i],
+                "response_variable_name_title": response_variable_names[i].capitalize(),
                 "data_classification_policy": ["public"],
             }
             for i, (response_pairs, option_pairs) in enumerate(
@@ -838,31 +867,27 @@ def compute(
             )
             self.logger.info("The summary was generated successfully.")

-        score_option_instruction_list = [
-            "".join(
+        score_option_instructions_list = [
+            ["".join(
                 [
                     f'Choose "{option}" if Response {option} is better quality.\n'
                     for option in option_pair
                 ]
-            )
+            ) for option_pair in option_pairs]
             for option_pairs in option_pairs_list
-            for option_pair in option_pairs
         ]

         option_selection_instances = [
             {
                 "options": [f"Response {option}" for option in option_pair],
                 "score_option_instruction": score_option_instruction,
+                "response_variable_name": response_variable_names[i],
                 "data_classification_policy": ["public"],
             }
-            for option_pair, score_option_instruction in zip(
-                [
-                    option_pair
-                    for option_pairs in option_pairs_list
-                    for option_pair in option_pairs
-                ],
-                score_option_instruction_list,
+            for i, (score_option_instructions, option_pairs) in enumerate(
+                zip(score_option_instructions_list, option_pairs_list)
             )
+            for score_option_instruction, option_pair in zip(score_option_instructions, option_pairs)
         ]

         previous_messages = [
diff --git a/src/unitxt/llm_as_judge_chat_templates.py b/src/unitxt/llm_as_judge_chat_templates.py
index 18a8acd4d9..9bb47a5961 100644
--- a/src/unitxt/llm_as_judge_chat_templates.py
+++ b/src/unitxt/llm_as_judge_chat_templates.py
@@ -3,20 +3,20 @@
 direct_template_dict = {
     "assessment": InputOutputTemplate(
         input_format="""
-You are presented with a response generated subject to a context.
-The context includes information relevant to the nature or generation of the response.
-You will assess the quality of the response subject to an evaluation criteria.
+You are presented with a {response_variable_name} generated subject to a context.
+The context includes information relevant to the nature or generation of the {response_variable_name}.
+You will assess the quality of the {response_variable_name} subject to an evaluation criteria.
 ###Context:
 {context_variables}

-###Response:
+###{response_variable_name_title}:
 {response}

 ###Evaluation criteria:
 {criteria_description}
 {display_options_instruction}

-Briefly assess the quality of the response subject to the evaluation criteria.
+Briefly assess the quality of the {response_variable_name} subject to the evaluation criteria.
 Focus on the evaluation criteria during assessment, do not provide a general assessment.
 Assessment:

@@ -29,7 +29,7 @@
 Summary:"""
     ),
     "answer": InputOutputTemplate(
-        input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
+        input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the {response_variable_name}.
 ###Evaluation criteria:
 {criteria_description}
 {score_option_instruction}
@@ -41,8 +41,8 @@

 pairwise_template_dict = {
     "assessment": InputOutputTemplate(
-        input_format="""You are provided a pair of responses (Response {option_a} and Response {option_b}) generated subject to a context.
-You will choose the better quality response subject to the evaluation criteria.
+        input_format="""You are provided a pair of {response_variable_name}s ({response_variable_name_title} {option_a} and {response_variable_name_title} {option_b}) generated subject to a context.
+You will choose the better quality {response_variable_name} subject to the evaluation criteria.
 This is the context:
 {context_variables}

@@ -51,25 +51,25 @@
 {criteria_name}
 {criteria_description}

-Response {option_a}:
+{response_variable_name_title} {option_a}:
 {response_a}

-Response {option_b}:
+{response_variable_name_title} {option_b}:
 {response_b}

-Keeping the evaluation criteria in mind, briefly assess which response is better.
+Keeping the evaluation criteria in mind, briefly assess which {response_variable_name} is better.
 Focus on the evaluation criteria during assessment, do not provide a general assessment.
 Assessment:
 Lets think step by step """
     ),
     "summarization": InputOutputTemplate(
-        input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which response won.
+        input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which {response_variable_name} won.
 Assessment: {assessment}
 Summary:"""
     ),
     "answer": InputOutputTemplate(
-        input_format="""Now considering the evaluation criteria, which response is better quality? Only include the chosen response.
+        input_format="""Now considering the evaluation criteria, which {response_variable_name} is better quality? Only include the chosen {response_variable_name}.
 {score_option_instruction}
 Answer: """,
         postprocessors=["processors.match_closest_option"],