import pickle
import json
import numpy as np
from tqdm import tqdm
from ast import literal_eval
from glob import glob
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, Features, Value, Sequence, DatasetDict

from utils import *

VERSION = "v0.1.0_hf"
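# High-level flow, as inferred from the code below: embed Stack Exchange
# questions and BigCodeBench prompts, retrieve the closest task for every
# question, keep tasks that look genuinely hard, and re-score models on the
# resulting subset, which is pushed as bigcode/bigcodebench-hard.

# update_model_info() probes each Hugging Face model's tokenizer: models whose
# tokenizer has no chat template are flagged "direct_complete" (plain completion
# prompting); all others keep chat-style prompting.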
def update_model_info(model_info):
    for model, info in model_info.items():
        if "https://huggingface.co/" in info["link"]:
            hf_model = info["link"].split("https://huggingface.co/")[-1]
            print(hf_model)
            tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True)
            if tokenizer.chat_template is None:
                model_info[model]["direct_complete"] = True
            else:
                model_info[model]["direct_complete"] = False
        else:
            model_info[model]["direct_complete"] = False

    return model_info


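# embed_sentences() encodes one dataset column with a multi-process pool and
# stores the vectors either on the Hub (under the bigcode org, so pushing
# assumes write access to bigcode/<save_path>) or locally via save_to_disk().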
def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False):
    pool = model.start_multi_process_pool()
    embeddings = model.encode_multi_process(data[col_name], pool=pool)
    # Shut the worker processes down once encoding is finished.
    model.stop_multi_process_pool(pool)
    qids = data[id_name]
    features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))})
    embed_dict = {
        id_name: qids,
        "embeddings": embeddings
    }
    embed_ds = Dataset.from_dict(embed_dict, features=features)
    if push_to_hub:
        embed_ds.push_to_hub(f"bigcode/{save_path}")
    else:
        embed_ds.save_to_disk(save_path)
    return embed_ds


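# get_top_docs() scores every query against every document with a dot product;
# all-mpnet-base-v2 produces normalized embeddings, so this behaves like cosine
# similarity. For each query index i, only the single best task id and its
# score are kept.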
def get_top_docs(query_embs, doc_emb, docs):
    scores = np.dot(query_embs, doc_emb.T)
    top_doc_indices = np.argmax(scores, axis=1)
    top_scores = scores[np.arange(len(scores)), top_doc_indices]
    results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))]

    return results


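# Two alternative ways to trim the retrieval results. Note that the main block
# below calls neither helper; it applies a fixed 0.7 score threshold inline
# instead.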
def filter_top_k_percent(results, k_percent):
    # results holds (query_idx, task_id, score) triples, so unpack all three.
    all_scores = [score for _, _, score in results]
    threshold = np.percentile(all_scores, 100 - k_percent)
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


def filter_top_threshold(results, threshold):
    filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold]
    return filtered_results


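# read_task_perf() scans results/<model>--bigcodebench-<task>*-0-1-sanitized*
# eval JSONs (calibrated files are preferred for prompted models), marks each
# task as pass/fail from its first sample, and returns models ranked by mean
# pass rate over the given task ids. It relies on the global model_info loaded
# from utils.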
def read_task_perf(tids, task="complete"):
    model_results = dict()
    result_files = []
    for model, info in model_info.items():
        if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]):
            continue
        task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)}
        model = model.replace("/", "--")
        # if info["link"].startswith("https://huggingface.co/"):
        #     model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
        try:
            if info["prompted"] and not info["direct_complete"]:
                files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json")
                if files:
                    file = files[0]
                else:
                    file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
            else:
                file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0]
        except Exception:
            # Skip models whose result files are missing.
            continue
        with open(file, "r") as f:
            data = json.load(f)
        for task_id, perfs in data["eval"].items():
            status = 1 if perfs[0]["status"] == "pass" else 0
            task_perf[task_id] = status
        model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in tids])
    return sorted(model_results.items(), key=lambda x: x[1], reverse=True)


if __name__ == "__main__":
    bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split=VERSION)
    se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train")
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

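    # Flag direct-completion models, then embed the Stack Exchange questions and
    # the BigCodeBench prompts; both embedding sets are pushed to the Hub (this
    # assumes write access to the bigcode organization).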
    model_info = update_model_info(model_info)

    se_embed = embed_sentences(se, "question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True)
    bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True)

    solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete")

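    # Retrieval: for every Stack Exchange question, find the single most similar
    # BigCodeBench task.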
    query_embs = np.array(se_embed["embeddings"])
    doc_emb = np.array(bcb_embed["embeddings"])
    docs = bcb_embed["task_id"]
    retrieval_results = get_top_docs(query_embs, doc_emb, docs)

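    # Push the raw (question, task, score) triples to the Hub and read them back,
    # presumably so the expensive retrieval step can be reused across runs.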
    Dataset.from_dict({
        "qid": [i for i, _, _ in retrieval_results],
        "tid": [doc for _, doc, _ in retrieval_results],
        "score": [score for _, _, score in retrieval_results],
    }).push_to_hub("bigcode/se_bcb_results")

    retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train")

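    # For each task, keep only the highest-scoring question, and only if its
    # similarity exceeds 0.7.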
    top_results = dict()
    for sample in tqdm(retrieval_ds):
        i, doc, score = sample["qid"], sample["tid"], sample["score"]
        if score > 0.7:
            if doc not in top_results:
                top_results[doc] = (i, doc, score)
            else:
                if score > top_results[doc][2]:
                    top_results[doc] = (i, doc, score)

    top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()}

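    # Hardness filters: tasks that use more than two libraries, have a canonical
    # solution longer than 426 characters, and have a solve rate below 50 on the
    # complete split.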
    hard_lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2}
    hard_length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426}
    hard_rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50}

    hard_tid = top_id.keys() & hard_length_filter & hard_rate_filter.keys() & hard_lib_filter

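    # Assemble the hard subset: pair each surviving task with its best-matching
    # Stack Exchange question and push the result as bigcode/bigcodebench-hard.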
    hard_bcb = bcb.filter(lambda x: x["task_id"] in hard_tid)
    hard_bcb_tid = hard_bcb["task_id"]
    hard_se_qid = [top_id[_id][0] for _id in hard_bcb_tid]
    hard_se_q = se.select(hard_se_qid)
    hard_se_scores = [top_id[_id][1] for _id in hard_bcb_tid]
    hard_bcb_dict = {
        "task_id": hard_bcb_tid,
        "complete_prompt": hard_bcb["complete_prompt"],
        "instruct_prompt": hard_bcb["instruct_prompt"],
        "canonical_solution": hard_bcb["canonical_solution"],
        "code_prompt": hard_bcb["code_prompt"],
        "test": hard_bcb["test"],
        "entry_point": hard_bcb["entry_point"],
        "doc_struct": hard_bcb["doc_struct"],
        "libs": hard_bcb["libs"],
        "q_idx": hard_se_qid,
        "question": hard_se_q["question"],
        "score": hard_se_scores,
        "_id": hard_bcb_tid
    }
    hard_bcb = Dataset.from_dict(hard_bcb_dict)
    DatasetDict({VERSION: hard_bcb}).push_to_hub("bigcode/bigcodebench-hard")

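    # Score every model on the hard subset for both the complete and instruct
    # tasks, then rank by the average of the two.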
    hard_complete_results = read_task_perf(hard_tid)
    hard_instruct_results = read_task_perf(hard_tid, task="instruct")

    complete_res_dict = {model: score for model, score in hard_complete_results}
    instruct_res_dict = {model: score for model, score in hard_instruct_results}
    avg_res_dict = {model: (complete_res_dict[model] + instruct_res_dict[model]) / 2 for model in complete_res_dict if model in instruct_res_dict}

    for model, score in sorted(avg_res_dict.items(), key=lambda x: x[1], reverse=True):
        print(model, round(score * 100, 1))