Commit fd211aa

2 parents add4651 + f860ced

File tree

1 file changed: +169 -0

analysis/bcb_subset.py

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
import pickle
import json
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from glob import glob
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features, Value, Sequence, DatasetDict

from utils import *

VERSION = "v0.1.0_hf"


def update_model_info(model_info):
    # Flag each model for "direct completion": a model whose tokenizer has no
    # chat template cannot be prompted in chat format.
    for model, info in model_info.items():
        if "https://huggingface.co/" in info["link"]:
            hf_model = info["link"].split("https://huggingface.co/")[-1]
            print(hf_model)
            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
            model_info[model]["direct_complete"] = tokenizer.chat_template is None
        else:
            model_info[model]["direct_complete"] = False

    return model_info
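
# Illustrative only: `model_info` is pulled in by `from utils import *` and is
# not defined in this commit. The code above assumes entries shaped roughly like:
#   model_info = {
#       "org/model-id": {
#           "name": "Model-Name",
#           "link": "https://huggingface.co/org/model-id",
#           "prompted": True,
#       },
#   }
# update_model_info() adds a "direct_complete" flag to each entry.
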

def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False):
    # Embed one text column with multi-process encoding and save the result
    # as a Dataset of (id, embedding) pairs, on the Hub or on disk.
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(data[col_name], pool=pool)
    # Shut the worker pool down once encoding is done.
    model.stop_multi_process_pool(pool)
    qids = data[id_name]
    features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))})
    embed_dict = {
        id_name: qids,
        "embeddings": embeddings
    }
    embed_ds = Dataset.from_dict(embed_dict, features=features)
    if push_to_hub:
        embed_ds.push_to_hub(f"bigcode/{save_path}")
    else:
        embed_ds.save_to_disk(save_path)
    return embed_ds
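
# Illustrative usage with a hypothetical two-row dataset (the column and id
# names here are assumptions, not part of this commit):
#   demo = Dataset.from_dict({"qid": ["0", "1"],
#                             "question": ["sort a dict by value", "read a csv"]})
#   embed_sentences(demo, "question", "qid", model, "demo-embeddings")
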

def get_top_docs(query_embs, doc_emb, docs):
    # Score every query against every doc in one matrix product; the encoder
    # used below (all-mpnet-base-v2) outputs unit-length vectors, so the dot
    # product equals cosine similarity. Keep only each query's best doc.
    scores = np.dot(query_embs, doc_emb.T)
    top_doc_indices = np.argmax(scores, axis=1)
    top_scores = scores[np.arange(len(scores)), top_doc_indices]
    results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))]
    return results
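
# Toy sanity check (illustrative values only):
#   q = np.array([[1.0, 0.0], [0.0, 1.0]])
#   d = np.array([[0.9, 0.1], [0.1, 0.9], [0.7, 0.7]])
#   get_top_docs(q, d, ["a", "b", "c"])  # -> [(0, 'a', 0.9), (1, 'b', 0.9)]
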

def filter_top_k_percent(results, k_percent):
    # Keep results scoring above the (100 - k)th percentile. Note that
    # `results` holds (index, doc, score) triples, so unpack all three.
    all_scores = [score for _, _, score in results]
    threshold = np.percentile(all_scores, 100 - k_percent)
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


def filter_top_threshold(results, threshold):
    # Keep results scoring above a fixed similarity threshold.
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results
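
# Worked example (illustrative): for scores [0.9, 0.8, 0.7, 0.6] and
# k_percent=25, np.percentile(..., 75) gives 0.825, so only the 0.9 result
# survives filter_top_k_percent.
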

def read_task_perf(tids, task="complete"):
    # Read each model's eval results file and compute its mean pass rate over
    # the given task ids; return (model name, score) pairs, best first.
    model_results = dict()
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue
        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
        model = model.replace("/", "--")
        # if info["link"].startswith("https://huggingface.co/"):
        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
        try:
            if info["prompted"] and not info["direct_complete"]:
                # Prefer calibrated results for prompted models, falling back
                # to the uncalibrated file if none exist.
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                if files:
                    file = files[0]
                else:
                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
            else:
                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
        except IndexError:
            # No results file for this model; skip it.
            continue
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
            status = 1 if perfs[0]["status"] == "pass" else 0
            task_perf[task_id] = status
        model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in tids])
    return sorted(model_results.items(), key=lambda x: x[1], reverse=True)
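
# Assumed layout of a *_eval_results.json file, inferred from the reads above
# (not documented in this commit):
#   {"eval": {"BigCodeBench/0": [{"status": "pass", ...}, ...], ...}}
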

if __name__ == "__main__":
    # Load BigCodeBench and the anonymized Stack Exchange dump, then embed
    # both with the same sentence encoder.
    bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split=VERSION)
    se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

    model_info = update_model_info(model_info)

    se_embed = embed_sentences(se, "question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True)
    bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True)

    solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete")

    # Retrieve the closest BigCodeBench task for every Stack Exchange question.
    query_embs = np.array(se_embed["embeddings"])
    doc_emb = np.array(bcb_embed["embeddings"])
    docs = bcb_embed["task_id"]
    retrieval_results = get_top_docs(query_embs, doc_emb, docs)

    # Push retrieval results to the Hub, then reload them for filtering below.
    Dataset.from_dict({"qid": [i for i, _, _ in retrieval_results], "tid": [doc for _, doc, _ in retrieval_results], "score": [score for _, _, score in retrieval_results]}).push_to_hub("bigcode/se_bcb_results")

    retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train")

    # For each task, keep the single best-matching SE question with
    # similarity above 0.7.
    top_results = dict()
    for sample in tqdm(retrieval_ds):
        i, doc, score = sample["qid"], sample["tid"], sample["score"]
        if score > 0.7:
            if doc not in top_results or score > top_results[doc][2]:
                top_results[doc] = (i, doc, score)

    top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()}

    # A task counts as "hard" when it imports more than two libraries, has a
    # canonical solution longer than 426 characters, is solved by fewer than
    # 50% of models, and has a close SE match from above.
    hard_lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2}
    hard_length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426}
    hard_rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50}

    hard_tid = top_id.keys() & hard_length_filter & hard_rate_filter.keys() & hard_lib_filter

    hard_bcb = bcb.filter(lambda x: x["task_id"] in hard_tid)
    hard_bcb_tid = hard_bcb["task_id"]
    # qid doubles as the row index into `se` (it comes from enumerate() in
    # get_top_docs), so select() retrieves the matched questions.
    hard_se_qid = [top_id[_id][0] for _id in hard_bcb_tid]
    hard_se_q = se.select(hard_se_qid)
    hard_se_scores = [top_id[_id][1] for _id in hard_bcb_tid]
    # Bundle the hard subset with its matched SE question and similarity score.
    hard_bcb_dict = {
        "task_id": hard_bcb_tid,
        "complete_prompt": hard_bcb["complete_prompt"],
        "instruct_prompt": hard_bcb["instruct_prompt"],
        "canonical_solution": hard_bcb["canonical_solution"],
        "code_prompt": hard_bcb["code_prompt"],
        "test": hard_bcb["test"],
        "entry_point": hard_bcb["entry_point"],
        "doc_struct": hard_bcb["doc_struct"],
        "libs": hard_bcb["libs"],
        "q_idx": hard_se_qid,
        "question": hard_se_q["question"],
        "score": hard_se_scores,
        "_id": hard_bcb_tid
    }
    hard_bcb = Dataset.from_dict(hard_bcb_dict)
    DatasetDict({VERSION: hard_bcb}).push_to_hub("bigcode/bigcodebench-hard")

    # Score every model on the hard subset and rank by the average of its
    # complete and instruct pass rates.
    hard_complete_results = read_task_perf(hard_tid)
    hard_instruct_results = read_task_perf(hard_tid, task="instruct")

    complete_res_dict = {model: score for model, score in hard_complete_results}
    instruct_res_dict = {model: score for model, score in hard_instruct_results}
    avg_res_dict = {model: (complete_res_dict[model] + instruct_res_dict[model]) / 2 for model in complete_res_dict if model in instruct_res_dict}

    for model, score in sorted(avg_res_dict.items(), key=lambda x: x[1], reverse=True):
        print(model, round(score * 100, 1))
