Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,15 @@ cython_debug/

/rllm/datasets/cora/
/rllm/datasets/rel-movielens1m/
/.idea/
/.idea/

/examples/LIFT/classification/data/test.json
/examples/LIFT/classification/data/train.json
/examples/LIFT/classification/data/val.json
/examples/LIFT/classification/data/pred.json


/examples/LIFT/regression/data/test.json
/examples/LIFT/regression/data/train.json
/examples/LIFT/regression/data/val.json
/examples/LIFT/regression/data/pred.json
Empty file.
3 changes: 3 additions & 0 deletions examples/LIFT/classification/train.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
call conda activate rllm
python train.py
pause
142 changes: 142 additions & 0 deletions examples/LIFT/classification/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Paper: LIFT: Language-Interfaced FineTuning for Non-Language Machine Learning Tasks (NeurIPS 2022)
# macro_f1: 0.1293655952583329
# micro_f1: 0.29685807150595883
# Total time: 875.0060040950775s


import pandas as pd
from functools import partial
import sys
import re
import argparse
import time

sys.path.append("../src")
from utils.helper import data2text,write_jsonl
import models.lora_gptj as GPTJ
# from run_exps_helper import *
import torch
from sklearn.preprocessing import MultiLabelBinarizer

sys.path.append("../../../")
from rllm.utils import macro_f1_score, micro_f1_score, get_llm_chat_cost

time_start = time.time()

def data2text(row, label = True, init = '', end = ''):
    """Serialise one movie row as a single JSON line for LIFT fine-tuning.

    The prompt text is ``init`` + ``' Title:'`` + the row's title (with single
    and double quotes removed) + ``end``, terminated by ``'###'``. Only the
    title is used; the Director, Cast, Runtime, Plot, Languages, Certificate
    and Year columns were disabled in the original prompt and stay disabled.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'Title'``
            entry and, when ``label`` is true, a ``'Genre'`` entry.
        label: When true, add the row's genre (suffixed with ``'@@@'``) as the
            ``"completion"`` field; when false, emit the prompt only.
        init: Text placed before the movie description.
        end: Text placed after the movie description.

    Returns:
        A JSON object rendered as text, e.g.
        ``{"prompt":"... Title:X...###", "completion":"Drama@@@"}``.
    """
    import json  # local import: the script's top-level imports do not include json

    title = str(row['Title']).replace("'", "").replace('"', '')
    prompt = f"{init} Title:{title}{end}"

    record = {"prompt": f"{prompt}###"}
    if label:
        record["completion"] = f"{row['Genre']}@@@"

    # json.dumps escapes backslashes and control characters, which the previous
    # hand-formatted string did not (a title containing a backslash produced an
    # unparseable line). The separators reproduce the original
    # `"key":"value", "key":"value"` layout, and ensure_ascii=False keeps
    # non-ASCII characters as they are instead of \u-escaping them.
    return json.dumps(record, ensure_ascii=False, separators=(', ', ':'))

def df2propmts(df, data2text_func, init='', end='', label=True):
    """Render every row of ``df`` with ``data2text_func`` and collect the results.

    Args:
        df: DataFrame whose rows are passed one at a time to ``data2text_func``.
        data2text_func: Callable invoked as
            ``data2text_func(row, label=label, init=init, end=end)``.
        init: Forwarded to ``data2text_func``.
        end: Forwarded to ``data2text_func``.
        label: Forwarded to ``data2text_func``.

    Returns:
        A list with one rendered entry per row, in row order; an empty list
        for an empty frame.
    """
    # Iterating rows directly (rather than DataFrame.apply(axis=1).tolist())
    # keeps the result a plain list even for an empty frame, where apply can
    # hand back a DataFrame that has no .tolist().
    return [
        data2text_func(row, label=label, init=init, end=end)
        for _, row in df.iterrows()
    ]

# Command-line options. Only --gpu_id is read by the code visible in this
# script; the remaining flags are parsed but not consumed here.
parser = argparse.ArgumentParser(description='')
parser.add_argument("-g", "--gpu_id", default=0, type=int)
parser.add_argument("--local_rank", default=-1, type=int)
parser.add_argument("--seed", default=12345, type=int)
parser.add_argument("-p", "--is_permuted", action="store_true")

parser.add_argument("-v", "--eval", default=0, type=int)
args = parser.parse_args()

# Select the requested GPU when CUDA is usable, otherwise fall back to CPU.
# torch.cuda.set_device must only run in the CUDA branch: calling it without
# a usable CUDA runtime raises and would defeat the CPU fallback.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.gpu_id}')
    torch.cuda.set_device(args.gpu_id)
else:
    device = 'cpu'

# Load the MovieLens-1M classification tables. The movie splits feed the
# prompts below; `users` and `ratings` are read but not referenced again in
# the code visible here.
users = pd.read_csv('../../../rllm/datasets/rel-movielens1m/classification/users.csv')
train = pd.read_csv('../../../rllm/datasets/rel-movielens1m/classification/movies/train.csv')
val = pd.read_csv('../../../rllm/datasets/rel-movielens1m/classification/movies/validation.csv')
test = pd.read_csv('../../../rllm/datasets/rel-movielens1m/classification/movies/test.csv')
ratings = pd.read_csv('../../../rllm/datasets/rel-movielens1m/classification/ratings.csv')

# Text wrapped around every movie description: an instruction prefix and the
# question (with the closed list of allowed genres) as suffix.
init='Given information about a movie: '
end = 'What is the genres it may belong to? Note: 1. Give the answer as following format: genre_1|genre_2|...|genre_n 2. The answers must only be chosen from followings:Documentary, Adventure, Comedy, Horror, War, Sci-Fi, Drama, Mystery, Western, Action, Children\'s, Musical, Thriller, Crime, Film-Noir, Romance, Animation, Fantasy'

# One JSON-lines prompt per movie, for each split (train, then val, then test).
train_prompts, val_prompts, test_prompts = (
    df2propmts(split, data2text, init, end) for split in (train, val, test)
)

# Persist each split as newline-joined records.
# NOTE(review): the files are named 'train.json' etc. here but are read back
# from 'data/...' later; presumably write_jsonl prefixes that directory - confirm.
for file_name, prompt_lines in (
    ('train.json', train_prompts),
    ('val.json', val_prompts),
    ('test.json', test_prompts),
):
    write_jsonl('\n'.join(prompt_lines), file_name)

# Ground-truth genre strings for the validation and test splits.
y_val, y_test = val['Genre'], test['Genre']

# Per-movie genre lists of the test split, and the distinct genres they contain.
movie_genres = test["Genre"].str.split("|")
all_genres = list({genre for genres in movie_genres for genre in genres})
print(y_test)

# `extract_prompts` and `query` are called below but are not imported at the
# top of this script (the `run_exps_helper` star import there is commented
# out), which would raise NameError. Import them explicitly at the point of use.
# NOTE(review): assumes run_exps_helper exposes both names - confirm.
from run_exps_helper import extract_prompts, query

# Alternative checkpoint left by the author: model_name='hivemind/gpt-j-6B-8bit'
gpt = GPTJ.LoRaQGPTJ(adapter=True, device=device)
train_configs={'learning_rate': 1e-5, 'batch_size': 2, 'epochs':1, 'weight_decay': 0.01, 'warmup_steps': 6}
gpt.finetune('data/train.json', 'data/val.json', train_configs, saving_checkpoint=False)

# Read the test prompts back from disk and run inference in batches of 8.
test_prompts = extract_prompts('data/test.json')
pred = query(gpt, test_prompts,bs=8)

# Turn every raw '|'-separated prediction into a list of recognised genres.
# Falsy predictions (e.g. an empty string) and labels outside `all_genres`
# contribute nothing, but each prediction still yields exactly one entry so
# the predictions stay aligned row-for-row with the ground truth.
y_pred = pd.Series([
    [label for label in raw.split('|') if label in all_genres] if raw else []
    for raw in pred
])

# Binarise truth and predictions over the same fixed class list so that the
# columns of both indicator matrices refer to the same genres.
mlb = MultiLabelBinarizer(classes=all_genres)
real_genres_matrix = mlb.fit_transform(movie_genres)
pred_genres_matrix = mlb.fit_transform(y_pred)

macro_f1 = macro_f1_score(real_genres_matrix, pred_genres_matrix)
micro_f1 = micro_f1_score(real_genres_matrix, pred_genres_matrix)

# Wall-clock time measured from `time_start`, taken near the top of the script.
time_end = time.time()

print(f"macro_f1: {macro_f1}")
print(f"micro_f1: {micro_f1}")
print(f"Total time: {time_end - time_start}s")
# print(f"Total USD$: {total_cost}")
Empty file.
3 changes: 3 additions & 0 deletions examples/LIFT/regression/train.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
call conda activate rllm
python train.py
pause
153 changes: 153 additions & 0 deletions examples/LIFT/regression/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Paper: LIFT: Language-Interfaced FineTuning for Non-Language Machine Learning Tasks (NeurIPS 2022)
# mae_loss: 1.044205298013245
# Total time: 4518.223552465439s



import pandas as pd
from functools import partial
import sys
import time

sys.path.append("../src")
from utils.helper import data2text,write_jsonl
import models.lora_gptj as GPTJ
from run_exps_helper import *
import torch
from sklearn.preprocessing import MultiLabelBinarizer

sys.path.append("../../../")
from rllm.utils import mae, get_llm_chat_cost

time_start = time.time()

def _describe_movie(movie_table, movie_id):
    """Return ``'Title: <title> Genre: <genre>'`` for ``movie_id``, with quotes stripped."""
    info = movie_table[movie_table["MovielensID"] == movie_id]
    title = str(info['Title'].values[0]).replace("'", "").replace('"', '')
    genre = str(info['Genre'].values[0]).replace("'", "").replace('"', '')
    return f"Title: {title} Genre: {genre}"


def df2prompts(df:pd.DataFrame, init = '',end = '',prompts_each_user = 1, n_given_rows = 5,sample_users=100,label =True, movie_table=None):
    """Build rating-prediction prompts from a table of user ratings.

    For every selected user, ``prompts_each_user`` prompts are generated. Each
    prompt lists ``n_given_rows`` of that user's ratings (drawn with
    replacement) as numbered ``Title / Genre / Rating`` entries, followed by
    one further rating row of the same user (also drawn with replacement)
    whose title and genre are shown and whose rating is the target.

    Args:
        df: Ratings with ``'UserID'``, ``'MovieID'`` and ``'Rating'`` columns.
        init: Text placed before the rating history.
        end: Text placed after the movie to predict.
        prompts_each_user: Number of prompts generated per selected user.
        n_given_rows: Number of history rows shown per prompt.
        sample_users: Number of users drawn at random without replacement;
            capped at the number of distinct users. A falsy value (``None``
            or ``0``) selects every user.
        label: When true, each entry is a JSON line with ``"prompt"`` and
            ``"completion"`` (the target rating followed by ``'@@@'``). When
            false, each entry is the bare prompt text followed by ``'###'``.
        movie_table: DataFrame with ``'MovielensID'``, ``'Title'`` and
            ``'Genre'`` columns. Defaults to the module-level ``movies`` table.

    Returns:
        A list of prompt strings, one per (user, repetition).

    Raises:
        IndexError: If a rated movie id is missing from ``movie_table``.
    """
    import json  # local import: the script's top-level imports do not include json

    if movie_table is None:
        movie_table = movies  # module-level table loaded by the script

    grouped = df.groupby('UserID')
    if sample_users:
        user_ids = pd.Series(df['UserID'].unique())
        # Sampling without replacement raises when more users are requested
        # than exist, so cap the request at the population size.
        picked = user_ids.sample(n=min(sample_users, len(user_ids)), replace=False)
        selected_groups = [grouped.get_group(user_id) for user_id in picked]
    else:
        selected_groups = [group for _, group in grouped]

    jsonl = []
    for group in selected_groups:
        for _repeat in range(prompts_each_user):
            given_rows = group.sample(n=n_given_rows, replace=True)
            infer_rows = group.sample(n=1, replace=True)

            parts = [init]
            for position, (_index, row) in enumerate(given_rows.iterrows(), start=1):
                rating = str(row['Rating']).replace("'", "").replace('"', '')
                parts.append(
                    f"{position}) {_describe_movie(movie_table, row['MovieID'])} Rating: {rating}; "
                )

            parts.append('Now I want you to predict the user\'s ratings for the following movie(s): ')

            # Describe the movie of the row being predicted. The lookup must
            # use this row's own MovieID: reusing the last history row's movie
            # would pair the wrong title with the completion rating.
            for _index, row in infer_rows.iterrows():
                parts.append(f"{_describe_movie(movie_table, row['MovieID'])}; ")

            parts.append(end)
            prompt = ''.join(parts)

            if label:
                completion = "|".join(str(row['Rating']) for _index, row in infer_rows.iterrows())
                # json.dumps escapes backslashes/control characters; the
                # separators keep the original `"k":"v", "k":"v"` layout.
                final_prompt = json.dumps(
                    {"prompt": f"{prompt}###", "completion": f"{completion}@@@"},
                    ensure_ascii=False,
                    separators=(', ', ':'),
                )
            else:
                final_prompt = f"{prompt}###"
            jsonl.append(final_prompt)
    return jsonl





# argparse is used below but is not imported explicitly at the top of this
# script; import it here instead of relying on a star import to provide it.
import argparse

# Command-line options. Only --gpu_id is read by the code visible in this
# script; the remaining flags are parsed but not consumed here.
parser = argparse.ArgumentParser(description='')
parser.add_argument("-g", "--gpu_id", default=0, type=int)
parser.add_argument("--local_rank", default=-1, type=int)
parser.add_argument("--seed", default=12345, type=int)
parser.add_argument("-p", "--is_permuted", action="store_true")

parser.add_argument("-v", "--eval", default=0, type=int)
args = parser.parse_args()

# Select the requested GPU when CUDA is usable, otherwise fall back to CPU.
# torch.cuda.set_device must only run in the CUDA branch: calling it without
# a usable CUDA runtime raises and would defeat the CPU fallback.
if torch.cuda.is_available():
    device = torch.device(f'cuda:{args.gpu_id}')
    torch.cuda.set_device(args.gpu_id)
else:
    device = 'cpu'

# Load the MovieLens-1M regression tables. The rating splits drive the
# prompts and `movies` supplies titles/genres; `users` is read but not
# referenced again in the code visible here.
users = pd.read_csv('../../../rllm/datasets/rel-movielens1m/regression/users.csv')
train = pd.read_csv('../../../rllm/datasets/rel-movielens1m/regression/ratings/train.csv')
val = pd.read_csv('../../../rllm/datasets/rel-movielens1m/regression/ratings/validation.csv')
test = pd.read_csv('../../../rllm/datasets/rel-movielens1m/regression/ratings/test.csv')
movies = pd.read_csv('../../../rllm/datasets/rel-movielens1m/regression/movies.csv')

# Text wrapped around every prompt: the instruction prefix and the question.
init= 'Given a user\'s past movie ratings in the format: Title, Genres, Rating (Note: Ratings range from 1 to 5)'
end = 'What\'s the rating that the user will give to the movie? Give a single number as rating.'

# Training prompts come from a random sample of 1000 users; validation and
# test prompts cover every user in their split.
train_prompts = df2prompts(train, init=init, end=end, sample_users=1000)
val_prompts = df2prompts(val, init=init, end=end, sample_users=None)
test_prompts = df2prompts(test, init=init, end=end, sample_users=None)

# Persist each split as newline-joined records.
# NOTE(review): the files are named 'train.json' etc. here but are read back
# from 'data/...' later; presumably write_jsonl prefixes that directory - confirm.
for file_name, prompt_lines in (
    ('train.json', train_prompts),
    ('val.json', val_prompts),
    ('test.json', test_prompts),
):
    write_jsonl('\n'.join(prompt_lines), file_name)

# Raw rating columns of the validation and test splits.
y_val, y_test = val['Rating'], test['Rating']






# Fine-tuning hyper-parameters handed to gpt.finetune.
train_configs = dict(
    learning_rate=1e-5,
    batch_size=1,
    epochs=1,
    weight_decay=0.01,
    warmup_steps=6,
)

# Alternative checkpoint left by the author: model_name='hivemind/gpt-j-6B-8bit'
gpt = GPTJ.LoRaQGPTJ(adapter=True, device=device)
gpt.finetune('data/train.json', 'data/val.json', train_configs, saving_checkpoint=False)

# Read the test prompts and their reference completions back from disk, then
# run inference in batches of 8 and show the raw model outputs.
test_prompts, test_completions = (
    extract_prompts('data/test.json'),
    extract_completion('data/test.json'),
)

pred = query(gpt, test_prompts, bs=8)
print(pred)

# Convert each raw model output to an integer rating. Outputs that cannot be
# parsed are scored as 0 (best-effort fallback kept from the original). Only
# conversion errors are caught, so KeyboardInterrupt/SystemExit still
# propagate instead of being silently turned into a 0 rating.
pred_l=[]
for x in pred:
    try:
        pred_l.append(int(x))
    except (TypeError, ValueError):
        pred_l.append(0)

y_pred = pd.DataFrame({'Rating':pred_l})['Rating']

# The reference rating is the first character of each stored completion, so
# this relies on ratings being single digits.
y_test = pd.DataFrame({'Rating':[int(x[0]) for x in test_completions]})['Rating']

mae_loss = mae(y_test, y_pred)

# Wall-clock time measured from `time_start`, taken near the top of the script.
time_end = time.time()

print(f"mae_loss: {mae_loss}")

print(f"Total time: {time_end - time_start}s")
# print(f"Total USD$: {total_cost}")
Loading