-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathevaluation.py
More file actions
193 lines (166 loc) · 7.85 KB
/
evaluation.py
File metadata and controls
193 lines (166 loc) · 7.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import json
import os
import time
import tiktoken
import argparse
from os.path import join,exists
from openai import OpenAI
from tqdm import tqdm
# Tokenizer handle for the "cl100k_base" encoding, created once at import time.
# NOTE(review): `encoder` is not referenced anywhere else in the visible code.
encoder = tiktoken.get_encoding("cl100k_base")
# Instruction preamble for the evaluator model. run_evaluation prepends it to the
# first user message of each instance; follow-up questions are sent without it.
# NOTE(review): the text ends with a stray ''' and contains one curly closing quote
# (”). Both are part of the prompt sent to the model, so they are left untouched
# here: changing them would change the evaluation prompt.
SYS_MSG ="Based on the provided Input (if any) and Generated Text, answer the ensuing Questions with either a YES or NO choice. Your selection should be based on your judgment as well as the following rules:\n\n- YES: Select 'YES' if the generated text entirely fulfills the condition specified in the question. However, note that even minor inaccuracies exclude the text from receiving a 'YES' rating. As an illustration. consider a question that asks. \"Does each sentence in the generated text use a second person?” If even one sentence does not use the second person, the answer should NOT be 'YES'. To qualify for a 'YES' rating, the generated text must be entirely accurate and relevant to the question\n\n- NO: Opt for 'NO' if the generated text fails to meet the question's requirements or provides no information that could be utilized to answer the question. For instance, if the question asks. \"Is the second sentence in the generated text a compound sentence?\" and the generated text only has one sentence. it offers no relevant information to answer the question. Consequently, the answer should be 'NO'.'''"
def load_jsonl(file_path):
    """
    General function to load a jsonl file
    file_path: str, path to a file holding one JSON document per line
    return: list, the decoded value of every non-blank line, in file order

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being handed to json.loads, which would raise on them.
    The file is read as UTF-8 so the result does not depend on the platform's
    default encoding.
    Raises json.JSONDecodeError on a malformed line and OSError when the file
    cannot be opened.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
def bool_ratio(fpath):
    """
    Calculate and print the true/false ratio for eval results
    fpath: str, path to a jsonl results file; every entry is expected to carry
           'id', 'decomposed_questions' and 'eval' (a list of True/False/None)
    return: None, the tally and the ratio are only printed

    Entries without an 'eval' list are reported and skipped. Entries whose
    'eval' length differs from the number of questions, or that contain None,
    are reported but still tallied; a None verdict is falsy and therefore
    counted under "false".
    """
    _data = load_jsonl(fpath)
    count = {"true": 0, "false": 0}
    for entry in _data:
        verdicts = entry.get("eval", None)
        if verdicts is None:
            print("Wrong output")
            print(entry['id'])
            # Nothing to tally: len() / iteration on None would raise TypeError.
            continue
        if len(entry['decomposed_questions']) != len(verdicts):
            print("Wrong length")
            print(entry['id'])
        if None in verdicts:
            print("None in eval")
            print(entry['id'])
        passed = sum(1 for eva_value in verdicts if eva_value)
        count["true"] += passed
        count["false"] += len(verdicts) - passed
    print("-------- True False Table --------")
    print(count)
    total = sum(count.values())
    if total:
        print(f"Percentage of True: {count['true']/total}")
    else:
        # An empty results file has no verdicts; avoid dividing by zero.
        print("Percentage of True: N/A (no evaluated questions)")
    return
def _parse_yes_no(generation):
    """Map one evaluator reply to True (yes), False (no) or None (undecidable)."""
    lowered = generation.lower()
    # A reply that opens with the verdict is taken at face value.
    if lowered.startswith("yes"):
        return True
    if lowered.startswith("no"):
        return False
    # Otherwise accept an upper-case verdict only when exactly one of them occurs.
    has_yes = "YES" in generation
    has_no = "NO" in generation
    if has_yes and not has_no:
        return True
    if has_no and not has_yes:
        return False
    return None


def _ask_with_retry(client, eval_model, message, temperature, max_retries, retry_wait):
    """Request one chat completion, sleeping and retrying after any failure."""
    failures = 0
    while True:
        try:
            completion = client.chat.completions.create(
                model=eval_model,
                messages=message,
                temperature=temperature,
            )
            generation = completion.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is the retry boundary around the remote call.
            failures += 1
            print("ERROR!")
            print(e)
            if max_retries is not None and failures > max_retries:
                raise
            print("Retry!")
            time.sleep(retry_wait)
            continue
        # The API may return no text content; treat that as an empty reply so it
        # is classified as undecidable instead of raising on `.lower()`.
        return generation or ""


def run_evaluation(client, in_path, o_dir, eval_model="gpt-4-0314", temperature=0,
                   max_retries=None, retry_wait=20):
    """
    Main function to run decomposed questions evaluation on models' outputs
    client: OpenAI client exposing chat.completions.create
    in_path: str, path to the model output file (jsonl; each entry provides
             'id', 'input', 'output' and 'decomposed_questions')
    o_dir: str, path to the output folder
    eval_model: str, default "gpt-4-0314", model name to be used for evaluation
    temperature: float, default 0, temperature to be used for evaluation
    max_retries: int or None, default None, number of retries allowed per request
                 before the last error is re-raised; None retries forever
    retry_wait: float, default 20, seconds to sleep before each retry
    return: str, path of the written results file

    Results go to <o_dir>/<eval_model>/<model name>_DecomposeEval.json, one JSON
    object per line. If that file already exists, entries found in it (matched by
    'id') are reused, so an interrupted run can be resumed. Each evaluated entry
    gains an 'eval' list of True/False/None, one item per question asked; the
    list stops after the first reply that cannot be read as YES or NO.
    """
    _data = load_jsonl(in_path)
    # The model name is the text before the first '_' of the second path component
    # (e.g. "outputs/gpt4_answers.jsonl" -> "gpt4"). A bare file name has no second
    # component, so fall back to the file name itself instead of raising IndexError.
    path_parts = in_path.split('/')
    name_source = path_parts[1] if len(path_parts) > 1 else os.path.basename(in_path)
    _model_name = name_source.split('_')[0]
    # create output folder (and any missing parents) if it does not exist
    _o_dir = join(o_dir, eval_model)
    os.makedirs(_o_dir, exist_ok=True)
    _opath = join(_o_dir, f"{_model_name}_DecomposeEval.json")
    # load previous results if they exist
    if exists(_opath):
        previous = {}
        for record in load_jsonl(_opath):
            # keep the first record per id, like a left-to-right list search would
            previous.setdefault(record['id'], record)
        _data = [previous.get(instance['id'], instance) for instance in _data]
    print(f"--------Evaluating output from {in_path}--------")
    print(f"--------Evaluation Using {eval_model}--------")
    # `with` closes the results file even if an entry raises midway.
    with open(_opath, 'w', encoding='utf-8') as result_writer:
        for entry in tqdm(_data):
            # skip the API calls if eval exists; just carry the entry over
            if entry.get('eval', None) is not None:
                result_writer.write(json.dumps(entry) + '\n')
                result_writer.flush()
                continue
            input_task = entry['input']
            output = entry['output']
            if output is None:  # skip if result hasn't been generated
                continue
            message = []
            bool_results = []
            for question in entry['decomposed_questions']:
                # Only the first turn carries the instructions and the text under
                # evaluation; later questions continue the same conversation.
                if message:
                    content = f"{question}\n"
                elif input_task:
                    content = f"{SYS_MSG}\n\nInput:\n\"{input_task}\"\n\nGenerated Text:\n\"{output}\"\n\nQuestion:\n{question}\n"
                else:
                    content = f"{SYS_MSG}\n\nGenerated Text:\n\"{output}\"\n\nQuestion:\n{question}\n"
                message.append({"role": "user", "content": content})
                generation = _ask_with_retry(
                    client, eval_model, message, temperature, max_retries, retry_wait)
                message.append({"role": "assistant", "content": generation})
                verdict = _parse_yes_no(generation)
                bool_results.append(verdict)
                if verdict is None:
                    # when no answer occurs, dump the conversation and move on
                    # to the next instance
                    for msg in message:
                        print(msg['content'])
                    print("NO YES or NO answer!" + generation)
                    break
            # save eval results as List[bool]; flush so progress survives a crash
            entry['eval'] = bool_results
            result_writer.write(json.dumps(entry) + '\n')
            result_writer.flush()
    # run true false ratio calculation
    bool_ratio(_opath)
    return _opath
def main_run(args):
    """
    Entry point behind the command line: evaluate one results file
    args: parsed arguments providing api_key, input, output_dir, model and temperature
    return: None; prints a message and does nothing else when the input file is missing
    """
    client = OpenAI(api_key=args.api_key)
    results_file, output_dir = args.input, args.output_dir
    eval_model, temperature = args.model, args.temperature
    if exists(results_file):
        # run evaluation for the given results file
        run_evaluation(client, results_file, output_dir, eval_model, temperature)
    else:
        print(f"results_dir {results_file} not exists")
if __name__ == "__main__":
    # Command-line interface: --input and --output_dir are mandatory, the rest
    # fall back to the defaults below.
    cli = argparse.ArgumentParser()
    cli.add_argument("--api_key", type=str, default=None)
    cli.add_argument(
        "--model",
        type=str,
        default="gpt-4-0314",
        help="model name to be used for evaluation",
    )
    cli.add_argument(
        "--input",
        type=str,
        required=True,
        help="path to the results file",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="path to the output folder",
    )
    cli.add_argument(
        "--temperature",
        type=float,
        default=0,
        help="temperature to be used for evaluation",
    )
    main_run(cli.parse_args())