51 changes: 38 additions & 13 deletions src/unitxt/llm_as_judge.py
@@ -63,6 +63,8 @@ class LLMJudge(BulkInstanceMetric):
generate_summaries: bool = True
format = "formats.chat_api"
include_prompts_in_result: bool = False
response_variable_name_field: Optional[str] = None
response_variable_name: str = "response"
criteria_field: str = None
criteria: Criteria = None
logger = get_logger()
@@ -103,6 +105,16 @@ def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
for td in task_data
]

def get_response_variable_names(self, task_data: List[Dict[str, Any]]) -> List[str]:
if self.response_variable_name_field is None:
return [self.response_variable_name] * len(task_data)
try:
return [td[self.response_variable_name_field] for td in task_data]
except KeyError as e:
raise UnitxtError(
f"The response variable name field `{self.response_variable_name_field}` was not found in the task data instance."
) from e

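For illustration only, a standalone sketch of the lookup above (the per-instance key name `output_kind` is hypothetical and not part of this change):

from typing import Any, Dict, List, Optional

def resolve_response_variable_names(
    task_data: List[Dict[str, Any]],
    response_variable_name_field: Optional[str] = None,
    response_variable_name: str = "response",
) -> List[str]:
    # No field configured: every instance uses the class-level default name.
    if response_variable_name_field is None:
        return [response_variable_name] * len(task_data)
    # Field configured: each task_data entry must carry that key.
    return [td[response_variable_name_field] for td in task_data]

# With a hypothetical per-instance field:
resolve_response_variable_names(
    [{"output_kind": "summary"}, {"output_kind": "translation"}],
    response_variable_name_field="output_kind",
)  # -> ["summary", "translation"]
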
def perform_evaluation_step(
self,
instances: list,
@@ -184,6 +196,8 @@ def prepare(self):
"response": str,
"criteria_description": str,
"display_options_instruction": str,
"response_variable_name": str,
"response_variable_name_title": str,
},
reference_fields={},
prediction_type=str,
@@ -202,6 +216,7 @@ def prepare(self):
"criteria_description": str,
"score_option_instruction": str,
"options": list,
"response_variable_name": str,
},
reference_fields={},
prediction_type=str,
@@ -341,6 +356,7 @@ def compute(
criterias = self.get_criterias(task_data, evaluations_count)
self.set_main_score(criterias)
contexts = self.get_contexts(task_data)
response_variable_names = self.get_response_variable_names(task_data)
if self.check_positional_bias:
criterias += [
CriteriaWithOptions(
@@ -352,6 +368,7 @@
for criteria in criterias
]
contexts += contexts
response_variable_names += response_variable_names
predictions += predictions

parsed_criterias = [
@@ -373,13 +390,16 @@
"response": prediction,
"display_options_instruction": display_options_instruction,
"criteria_description": criteria_description,
"response_variable_name": response_variable_name,
"response_variable_name_title": response_variable_name.capitalize(),
"data_classification_policy": ["public"],
}
for context, prediction, criteria_description, display_options_instruction in zip(
for context, prediction, criteria_description, display_options_instruction, response_variable_name in zip(
contexts,
predictions,
criteria_description_list,
display_options_instruction_list,
response_variable_names
)
]
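For illustration (all values below are hypothetical, not taken from this change), a single instance built by the comprehension above could look like:

# Hypothetical example of one assessment instance, assuming the per-instance
# response variable name is "summary".
example_assessment_instance = {
    "context_variables": {"source_document": "..."},
    "response": "The article argues that ...",
    "display_options_instruction": 'Choose "Yes" if ...\nChoose "No" if ...\n',
    "criteria_description": "Is the summary faithful to the source document?",
    "response_variable_name": "summary",
    "response_variable_name_title": "Summary",
    "data_classification_policy": ["public"],
}
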
assessment_prompts, assessment_outputs, _ = self.perform_evaluation_step(
@@ -416,12 +436,14 @@ def compute(
"criteria_description": criteria_description,
"score_option_instruction": score_option_instruction,
"options": criteria_option_names,
"response_variable_name": response_variable_name,
"data_classification_policy": ["public"],
}
for criteria_description, score_option_instruction, criteria_option_names in zip(
for criteria_description, score_option_instruction, criteria_option_names, response_variable_name in zip(
criteria_description_list,
score_option_instruction_list,
criteria_option_names_list,
response_variable_names
)
]

@@ -477,6 +499,8 @@ def prepare(self):
"option_b": str,
"criteria_name": str,
"criteria_description": str,
"response_variable_name": str,
"response_variable_name_title": str,
},
reference_fields={},
prediction_type=str,
@@ -494,6 +518,7 @@ def prepare(self):
input_fields={
"score_option_instruction": str,
"options": list,
"response_variable_name": str,
},
reference_fields={},
prediction_type=str,
@@ -754,9 +779,11 @@ def compute(

criterias = self.get_criterias(task_data, instances_count)
contexts = self.get_contexts(task_data)
response_variable_names = self.get_response_variable_names(task_data)
if self.check_positional_bias:
criterias.extend(criterias)
contexts.extend(contexts)
response_variable_names.extend(response_variable_names)
for response_pairs, option_pairs in zip(
response_pairs_list, option_pairs_list
):
@@ -776,6 +803,8 @@ def compute(
"option_b": option_pair[1],
"criteria_name": criterias[i].name,
"criteria_description": criterias[i].description,
"response_variable_name": response_variable_names[i],
"response_variable_name_title": response_variable_names[i].capitalize(),
"data_classification_policy": ["public"],
}
for i, (response_pairs, option_pairs) in enumerate(
@@ -838,31 +867,27 @@ def compute(
)
self.logger.info("The summary was generated successfully.")

score_option_instruction_list = [
"".join(
score_option_instructions_list = [
["".join(
[
f'Choose "{option}" if Response {option} is better quality.\n'
for option in option_pair
]
)
) for option_pair in option_pairs]
for option_pairs in option_pairs_list
for option_pair in option_pairs
]

option_selection_instances = [
{
"options": [f"Response {option}" for option in option_pair],
"score_option_instruction": score_option_instruction,
"response_variable_name": response_variable_names[i],
"data_classification_policy": ["public"],
}
for option_pair, score_option_instruction in zip(
[
option_pair
for option_pairs in option_pairs_list
for option_pair in option_pairs
],
score_option_instruction_list,
for i, (score_option_instructions, option_pairs) in enumerate(
zip(score_option_instructions_list, option_pairs_list)
)
for score_option_instruction, option_pair in zip(score_option_instructions, option_pairs)
]

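A standalone sketch (with hypothetical example data) of the regrouping above: instructions are now built per instance, so each instance's instruction list can be zipped against its own option pairs and response variable name:

# Hypothetical data: two instances, the first compared against two other systems.
option_pairs_list = [[("A", "B"), ("A", "C")], [("A", "B")]]
response_variable_names = ["summary", "response"]

score_option_instructions_list = [
    [
        "".join(
            f'Choose "{option}" if Response {option} is better quality.\n'
            for option in option_pair
        )
        for option_pair in option_pairs
    ]
    for option_pairs in option_pairs_list
]

option_selection_instances = [
    {
        "options": [f"Response {option}" for option in option_pair],
        "score_option_instruction": score_option_instruction,
        "response_variable_name": response_variable_names[i],
        "data_classification_policy": ["public"],
    }
    for i, (score_option_instructions, option_pairs) in enumerate(
        zip(score_option_instructions_list, option_pairs_list)
    )
    for score_option_instruction, option_pair in zip(score_option_instructions, option_pairs)
]
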
previous_messages = [
26 changes: 13 additions & 13 deletions src/unitxt/llm_as_judge_chat_templates.py
@@ -3,20 +3,20 @@
direct_template_dict = {
"assessment": InputOutputTemplate(
input_format="""
You are presented with a response generated subject to a context.
The context includes information relevant to the nature or generation of the response.
You will assess the quality of the response subject to an evaluation criteria.
You are presented with a {response_variable_name} generated subject to a context.
The context includes information relevant to the nature or generation of the {response_variable_name}.
You will assess the quality of the {response_variable_name} subject to an evaluation criteria.
###Context:
{context_variables}

###Response:
###{response_variable_name_title}:
{response}

###Evaluation criteria:
{criteria_description}
{display_options_instruction}

Briefly assess the quality of the response subject to the evaluation criteria.
Briefly assess the quality of the {response_variable_name} subject to the evaluation criteria.
Focus on the evaluation criteria during assessment, do not provide a general assessment.
Assessment:

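For illustration, a minimal rendering sketch (the template text is abridged and all values are hypothetical) of how the new placeholders resolve when the response variable name is "summary":

# Abridged from the direct assessment template above; not the full template.
abridged_template = (
    "You are presented with a {response_variable_name} generated subject to a context.\n"
    "###{response_variable_name_title}:\n"
    "{response}\n"
)
print(abridged_template.format(
    response_variable_name="summary",
    response_variable_name_title="Summary",
    response="The article argues that ...",
))
# You are presented with a summary generated subject to a context.
# ###Summary:
# The article argues that ...
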
@@ -29,7 +29,7 @@
Summary:"""
),
"answer": InputOutputTemplate(
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the response.
input_format="""Now consider the evaluation criteria and choose a final answer. Only include the chosen answer in the {response_variable_name}.
###Evaluation criteria:
{criteria_description}
{score_option_instruction}
@@ -41,8 +41,8 @@

pairwise_template_dict = {
"assessment": InputOutputTemplate(
input_format="""You are provided a pair of responses (Response {option_a} and Response {option_b}) generated subject to a context.
You will choose the better quality response subject to the evaluation criteria.
input_format="""You are provided a pair of {response_variable_name}s ({response_variable_name_title} {option_a} and {response_variable_name_title} {option_b}) generated subject to a context.
You will choose the better quality {response_variable_name} subject to the evaluation criteria.

This is the context:
{context_variables}
@@ -51,25 +51,25 @@
{criteria_name}
{criteria_description}

Response {option_a}:
{response_variable_name_title} {option_a}:
{response_a}
Response {option_b}:
{response_variable_name_title} {option_b}:
{response_b}

Keeping the evaluation criteria in mind, briefly assess which response is better.
Keeping the evaluation criteria in mind, briefly assess which {response_variable_name} is better.
Focus on the evaluation criteria during assessment, do not provide a general assessment.
Assessment:

Let's think step by step """
),
"summarization": InputOutputTemplate(
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which response won.
input_format="""Transform the following assessment into a concise summary that focuses on the key details, excluding references to the assessment itself. The summary must clearly state which {response_variable_name} won.

Assessment: {assessment}
Summary:"""
),
"answer": InputOutputTemplate(
input_format="""Now considering the evaluation criteria, which response is better quality? Only include the chosen response.
input_format="""Now considering the evaluation criteria, which {response_variable_name} is better quality? Only include the chosen {response_variable_name}.
{score_option_instruction}
Answer: """,
postprocessors=["processors.match_closest_option"],
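Finally, a rendering sketch for the pairwise header lines above (values hypothetical), showing how the title form produced by .capitalize() in llm_as_judge.py labels each candidate:

# Hypothetical rendering of the pairwise response headers, assuming the
# response variable name is "summary".
pairwise_header = "{response_variable_name_title} {option}:\n{response}"
for option, response in [("A", "First candidate summary ..."), ("B", "Second candidate summary ...")]:
    print(pairwise_header.format(
        response_variable_name_title="summary".capitalize(),  # -> "Summary"
        option=option,
        response=response,
    ))
# Summary A:
# First candidate summary ...
# Summary B:
# Second candidate summary ...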