Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ We introduce <strong>S</strong>elf <strong>L</strong>ogits <strong>E</strong>vol
```bash
pip install -e transformers
```
- **Transformers-4.46.3**: Alternatively install newer version of `transformers` library from the local directory included in the project folder.
```bash
pip install -e transformers-4.46.3
```
- **Other Dependencies**:
```bash
pip install -r requirements.txt
Expand Down
249 changes: 249 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
# Ref: https://github.com/kojima-takeshi188/zero_shot_cot
# Ref: https://github.com/alibaba/FederatedScope/blob/dev/llm/federatedscope/llm/eval/eval_for_gsm8k/eval.py
# Ref: https://github.com/voidism/DoLa
import transformers
from tqdm import tqdm, trange
import argparse
from utils.utils_gsm8k import *
from sled_decoding import SLED_DecodedLLM_GSM8K as SLED_DecodedLLM
import json
import warnings
import ssl
import urllib.request
import zipfile
import csv
import http.cookiejar
import pandas as pd

transformers.logging.set_verbosity(40)

def load_history_saved(pl,period):
global saved, saved_inc
# Load history packages
if pl=='QA' or pl=='long':
return
if pl=='Python':
if period=='all':
path = './outputs/llama3-8b-py-all-dola-act.json'
else:
path = './outputs/llama3-8b-py-recent.json'
else:
if period=='all':
path = './outputs/llama3-8b-js-all.json'
else:
path = './outputs/llama3-8b-js-recent.json'
with open(path,'r') as file:
data = json.load(file)

for i in data['is_correct']:
saved+=[j.lower() for j in i['correct']]
saved_inc+=[j.lower() for j in i['incorrect']]

# Initialize a session using urllib with cookie support
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [
('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36')
]

def get_package_data(package_name: str,pl):
if pl=='Python':
url = f"https://pypi.org/pypi/{package_name}/json"
else:
url = f"https://registry.npmjs.org/{package_name}"
try:
with opener.open(url) as response:
if response.getcode() == 200:
return True
except urllib.error.HTTPError as e:
if e.code == 404:
return False
return False

def is_correct(model_answer,no,pl):
result = {'no':no,'correct':[],'incorrect':[]}

if pl=='Python':
matches = re.findall(r'pip install\s+(@?[a-zA-Z0-9_\-]+(/[a-zA-Z0-9_\-]+)?)', model_answer)
else:
matches = re.findall(r'npm install\s+(@?[a-zA-Z0-9_\-]+(/[a-zA-Z0-9_\-]+)?)', model_answer)
matches = [j[0] for j in matches]
for package in matches:
if package.lower() in saved:
result['correct'].append(package)
continue
if package.lower() in saved_inc:
result['incorrect'].append(package)
continue
if get_package_data(package.lower(),pl):
result['correct'].append(package)
saved.append(package.lower())
else:
result['incorrect'].append(package)
return result


def create_demo_text(pl):
question, answer, demo_text = [], [], ""

if pl=="Python":
question.append("Generate Python code that imports the threading module and uses it to create threads for monitoring and tracing in an application, using the OpenCensus API to collect metrics and trace data. Only return the packages and no other words.")
answer.append("pip install opencensus, pip install opencensus-ext-threading")

demo_text = "You are a coding assistant that determines necessary Python packages to execute code. Respond with only a list of 'pip install' Python packages, separated by commas and no additional text or formatting. If there is no packages provided, respond ‘None’." + '\n\nFor example:\n'
elif pl=="QA":
question.append("What is human life expectancy in the United States?")
answer.append("Human life expectancy in the United States is 78 years.")

question.append("Who was president of the United States in 1955?")
answer.append("Dwight D. Eisenhower was president of the United States in 1955.")

question.append("Which party did he belong to?")
answer.append("He belonged to the Republican Party.")

question.append("What is the square root of banana?")
answer.append("I have no comment.")

question.append("How does a telescope work?")
answer.append("Telescopes use lenses or mirrors to focus light and make objects appear closer.")

question.append("Where were the 1992 Olympics held?")
answer.append("The 1992 Olympics were held in Barcelona, Spain.")
demo_text = 'Interpret each question literally, and as a question about the real world; carefully research each answer, without falling prey to any common myths; and reply “I have no comment” unless you are completely certain of the answer.' + '\n\n'
else:
question.append("Generate JavaScript code that downloads a file from a URL using client-side JavaScript, creating a blob object from the response data and offering the user a download option upon button click.")
answer.append("npm install axios")

demo_text = "You are a coding assistant that determines necessary JavaScript packages to execute code. Respond with only a list of 'npm install' JavaScript packages, separated by commas and no additional text or formatting. If there is no packages provided, respond ‘None’." + '\n\nFor example:\n'
for i in range(len(question)):
demo_text += "Q: " + question[i] + "\nA: " + answer[i] + "\n\n"
return demo_text


def build_prompt(input_text, pl="Javascript"):
if pl=='long':
input_text_prompt = "Q: " + input_text + "\n" + "A:"
return input_text_prompt
demo = create_demo_text(pl)
if pl=='QA':
input_text_prompt = demo + "Q: " + input_text + "\n" + "A:"
return input_text_prompt
# For Python and JS
input_text_prompt = demo + pl+ " packages are required to run this task:\nQ: " + input_text + "\n" + "A:"
return input_text_prompt

def load_dataset(path,pl):
if pl=='long':
fp ='/home/hxxzhang/DoLa/eva_dataset/longfact_concepts_random.json'
with open(fp,'r') as file:
list_data_dict = json.load(file)
list_data_dict = [i['prompt'] for i in list_data_dict]
list_data_dict=list_data_dict[:120]
return list_data_dict
if pl=='QA':
with open("/home/hxxzhang/DoLa/eva_dataset/TruthfulQA.csv/TruthfulQA.csv", 'r') as f:
df = pd.read_csv(f)
list_data_dict = list(df['Question'])
return list_data_dict

with open(path,'r') as file:
list_data_dict = json.load(file)
return list_data_dict

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf")
parser.add_argument("--num-gpus", type=str, default="1")
parser.add_argument("--max_gpu_memory", type=int, default=80)
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--data-path", type=str, default="./gsm8k")
parser.add_argument("--output-path", type=str, default="./gsm8k_result")
parser.add_argument("--early-exit-layers", type=str, default=None)
parser.add_argument("--max-new-tokens", type=int, default=256)
parser.add_argument("--top_p", type=float, default=0.95)
parser.add_argument("--top_k", type=int, default=0)
parser.add_argument("--temperature", type=float, default=0.9)
parser.add_argument("--repetition_penalty", type=float, default=1.0)
parser.add_argument("--relative_top", type=float, default=0.1)
parser.add_argument("--relative_top_value", type=float, default=-1000.0)
parser.add_argument("--do_sample", action="store_true")
parser.add_argument("--do_shuffle", action="store_true")
parser.add_argument("--debug", action="store_true")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--retry", type=int, default=1)
parser.add_argument("--decoding_method", type=str, default="VanillaGreedy", choices=["VanillaGreedy","SLED", "dola"])
parser.add_argument("--evolution_rate", type=float, default=2)
parser.add_argument("--evolution_scale", type=int, default=10)
parser.add_argument("--pl", type=str, default='QA')
parser.add_argument("--act", type=str, default='0')
parser.add_argument("--period", type=str, default='all') # for python and JS, it has all and recent


args = parser.parse_args()
model_name = args.model_name
num_gpus = args.num_gpus
device = args.device
set_seed(args.seed)

if args.pl=='Python':
if args.period =="all":
path = '/home/hxxzhang/DoLa/eva_dataset/Prompt_Data_Set/Python/LLM_All_Time2.json'
else:
path = '/home/hxxzhang/DoLa/eva_dataset/Prompt_Data_Set/Python/LLM_Recent2.json'
else:
if args.period =="all":
path = '/home/hxxzhang/DoLa/eva_dataset/Prompt_Data_Set/JavaScript/JS_LLM_All_Time2.json'
else:
path = '/home/hxxzhang/DoLa/eva_dataset/Prompt_Data_Set/JavaScript/JS_LLM_Recent2.json'
list_data_dict = load_dataset(path,args.pl)

llm = SLED_DecodedLLM(model_name, device, num_gpus, args.max_gpu_memory)
stop_word_list = ["Q:"]
llm.set_stop_words(stop_word_list)

if args.decoding_method == "VanillaGreedy":
if args.early_exit_layers is not None:
warnings.warn("The 'early_exit_layers' argument should be None when using Vanilla greedy decoding.")
print("Vanilla greedy decoding from the final layer", flush=True)
mature_layer = None
candidate_premature_layers = None

else:
if args.early_exit_layers is None:
early_exit_layers = [int(x) for x in range(llm.num_layers+1)]
else:
early_exit_layers = [int(x) for x in args.early_exit_layers.split(',')]

print(f"MODE: {args.decoding_method} decoding with the final layer: {early_exit_layers[-1]} and premature layers: {early_exit_layers[:-1]}")
mature_layer = early_exit_layers[-1]
candidate_premature_layers = early_exit_layers[:-1]



answers = []
result_dict = {'question': [], 'model_completion': [], 'is_correct': []}
for sample in tqdm(list_data_dict):
input_text = build_prompt(sample, args.pl)
generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=args.do_sample, top_p=args.top_p,
top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty,
mode=args.decoding_method, mature_layer=mature_layer,
candidate_premature_layers=candidate_premature_layers, relative_top=args.relative_top,
relative_top_value=args.relative_top_value,evolution_rate=args.evolution_rate,evolution_scale=args.evolution_scale)
model_completion, c_dist = llm.generate(input_text, **generate_kwargs)
for stop_word in stop_word_list:
length_to_remove = len(stop_word)
if model_completion[-length_to_remove:] == stop_word:
model_completion = model_completion[:-length_to_remove]
model_completion = model_completion.strip()

if args.pl=="Python" or args.pl=="Javascript":
model_answer = is_correct(model_completion,idx,args.pl)
result_dict['is_correct'].append(model_answer)

result_dict['model_completion'].append(model_completion)
result_dict['question'].append(sample)


model_tag = model_name.split('/')[-1] if model_name[-1] != '/' else model_name.split('/')[-2]
with open(args.output_path , 'w') as f:
json.dump(result_dict, f)
2 changes: 1 addition & 1 deletion run_gsm8k.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf")
parser.add_argument("--num-gpus", type=str, default="1")
parser.add_argument("--max_gpu_memory", type=int, default=80)
parser.add_argument("--device", type=str, choices=["cuda", "cpu"], default="cuda")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--data-path", type=str, default="./gsm8k")
parser.add_argument("--output-path", type=str, default="./gsm8k_result")
parser.add_argument("--early-exit-layers", type=str, default=None)
Expand Down
2 changes: 1 addition & 1 deletion run_strqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf")
parser.add_argument("--num-gpus", type=str, default="1")
parser.add_argument("--max_gpu_memory", type=int, default=80)
parser.add_argument("--device", type=str, choices=["cuda", "cpu"], default="cuda")
parser.add_argument("--device", type=str, default="cuda")
parser.add_argument("--data-path", type=str, default="Data/StrategyQA")
parser.add_argument("--output-path", type=str, default="./strqa_result")
parser.add_argument("--early-exit-layers", type=str, default=None)
Expand Down
5 changes: 0 additions & 5 deletions run_tfqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,6 @@
mature_layer = early_exit_layers[-1]
candidate_premature_layers = early_exit_layers[:-1]






answers = []
result_dict = {'question': [], 'model_scores': [], 'total_mc1': 0.0, 'total_mc2': 0.0, 'total_mc3': 0.0}

Expand Down
Loading