rllm-team · dajiaohuang · Mar 22, 2024 · Mar 28, 2024 · Mar 28, 2024 · Apr 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -161,4 +161,15 @@ cython_debug/
 
 /rllm/datasets/cora/
 /rllm/datasets/rel-movielens1m/
-/.idea/
+/.idea/
+
+/examples/LIFT/classification/data/test.json
+/examples/LIFT/classification/data/train.json
+/examples/LIFT/classification/data/val.json
+/examples/LIFT/classification/data/pred.json
+
+
+/examples/LIFT/regression/data/test.json
+/examples/LIFT/regression/data/train.json
+/examples/LIFT/regression/data/val.json
+/examples/LIFT/regression/data/pred.json
diff --git a/examples/LIFT/classification/data/some_train_data b/examples/LIFT/classification/data/some_train_data
diff --git a/examples/LIFT/classification/train.bat b/examples/LIFT/classification/train.bat
@@ -0,0 +1,3 @@
+call conda activate rllm
+python train.py
+pause
diff --git a/examples/LIFT/classification/train.py b/examples/LIFT/classification/train.py
@@ -0,0 +1,142 @@
+# Paper: LIFT: Language-Interfaced FineTuning for Non-Language Machine Learning Tasks (NeurIPS 2022)
+# macro_f1: 0.1293655952583329
+# micro_f1: 0.29685807150595883
+# Total time: 875.0060040950775s
+
+
+import pandas as pd
+from functools import partial
+import sys
+import re
+import argparse
+import time
+
+sys.path.append("../src")
+from utils.helper import data2text,write_jsonl
+import models.lora_gptj as GPTJ
+# from run_exps_helper import *
+import torch
+from sklearn.preprocessing import MultiLabelBinarizer
+
+sys.path.append("../../../")
+from rllm.utils import macro_f1_score, micro_f1_score, get_llm_chat_cost
+# Wall-clock start: the "Total time" printed at the end covers data prep, fine-tuning and inference.
+time_start = time.time()
+
+def data2text(row, label = True, init = '', end = ''):
+    """Render one movie row as a JSON line: {"prompt": ...} plus "completion" when labelled.
+
+    Overrides the `data2text` imported from utils.helper. The prompt is
+    `init` + ' Title:<title>' + `end` + '###', with quote characters removed from
+    the title; when `label` is true the completion is row['Genre'] + '@@@'.
+    """
+    import json  # function-scope import so the file's import block stays untouched
+
+    title = str(row['Title']).replace("'", "").replace('"', '')
+    record = {"prompt": f"{init} Title:{title}{end}###"}
+    if label:
+        record["completion"] = f"{row['Genre']}@@@"
+    # The previous %-formatting wrote backslashes, control characters and quotes in
+    # the genre unescaped, producing invalid JSON; json.dumps escapes them. These
+    # separators and ensure_ascii=False keep the output byte-identical for the
+    # inputs that already worked.
+    return json.dumps(record, ensure_ascii=False, separators=(', ', ':'))
+
+def df2propmts(df, data2text_func, init='', end='', label=True):
+    """Return a list with `data2text_func(row, label=..., init=..., end=...)` for every row of `df`."""
+    return df.apply(partial(data2text_func, label=label, init=init, end=end), axis=1).tolist()
+
+parser = argparse.ArgumentParser(description='')
+parser.add_argument("-g", "--gpu_id", default=0, type=int)
+parser.add_argument("--local_rank", default=-1, type=int)
+parser.add_argument("--seed", default=12345, type=int)
+parser.add_argument("-p", "--is_permuted", action="store_true")
+parser.add_argument("-v", "--eval", default=0, type=int)
+args = parser.parse_args()
+# Fall back to CPU without touching the CUDA API: set_device() raises on a CUDA-less host.
+device = torch.device(f'cuda:{args.gpu_id}') if torch.cuda.is_available() else 'cpu'
+if torch.cuda.is_available():
+    torch.cuda.set_device(args.gpu_id)
+
+# Load the rel-movielens1m classification tables (paths are relative to the working directory).
+users = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/classification/users.csv')
+train = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/classification/movies/train.csv')
+val = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/classification/movies/validation.csv')
+test = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/classification/movies/test.csv')
+ratings = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/classification/ratings.csv')
+
+# Fixed text placed before (`init`) and after (`end`) the movie fields in every prompt.
+init='Given information about a movie: '
+end = 'What is the genres it may belong to? Note: 1. Give the answer as following format: genre_1|genre_2|...|genre_n 2. The answers must only be chosen from followings:Documentary, Adventure, Comedy, Horror, War, Sci-Fi, Drama, Mystery, Western, Action, Children\'s, Musical, Thriller, Crime, Film-Noir, Romance, Animation, Fantasy'
+
+# One JSON line per movie; `label` defaults to True, so all three splits carry completions.
+train_prompts = df2propmts(train, data2text, init, end)
+val_prompts = df2propmts(val, data2text, init, end)
+test_prompts = df2propmts(test, data2text, init, end)
+
+# NOTE(review): finetune() below reads 'data/train.json' etc.; presumably write_jsonl writes under data/ - confirm.
+write_jsonl('\n'.join(train_prompts),'train.json')
+write_jsonl('\n'.join(val_prompts),'val.json')
+write_jsonl('\n'.join(test_prompts),'test.json')
+y_val = val['Genre']
+y_test = test['Genre']
+# Label vocabulary: every genre that occurs in the test split (set iteration order is arbitrary).
+movie_genres = test["Genre"].str.split("|")
+all_genres = list(set([genre for genres in movie_genres for genre in genres]))
+print(y_test)
+
+# `extract_prompts` and `query` are used below but nothing above defines or imports them
+# (the run_exps_helper star import is commented out), so the script died with a NameError
+# right after fine-tuning. Import just those two names; the sibling regression script
+# apparently takes the same helpers from this module. NOTE(review): confirm the module path.
+from run_exps_helper import extract_prompts, query
+
+# Fine-tune the LoRaQGPTJ wrapper on the prompt files prepared earlier in this script.
+# (A previous variant passed model_name='hivemind/gpt-j-6B-8bit'.)
+gpt = GPTJ.LoRaQGPTJ(adapter=True, device=device)
+train_configs = {
+    'learning_rate': 1e-5,
+    'batch_size': 2,
+    'epochs': 1,
+    'weight_decay': 0.01,
+    'warmup_steps': 6,
+}
+gpt.finetune('data/train.json', 'data/val.json', train_configs, saving_checkpoint=False)
+
+# Ask the fine-tuned model for a completion of every test prompt.
+# `test_prompts` is re-read from disk here, replacing the in-memory list built earlier.
+test_prompts = extract_prompts('data/test.json')
+pred = query(gpt, test_prompts, bs=8)
+
+# Each prediction is split on '|' and only genres present in the test labels are kept,
+# so the binarizer below never meets an unknown class; an empty prediction becomes [].
+# A set makes the membership test O(1) instead of scanning the list for every genre.
+# NOTE(review): genres are compared verbatim; stray whitespace in model output drops them.
+known_genres = set(all_genres)
+y_pred = pd.Series([
+    [genre for genre in row.split('|') if genre in known_genres] if row else []
+    for row in pred
+])
+
+# Encode truth and predictions over the same fixed class list so their columns line up.
+# The binarizer is fitted once; with `classes` given, transform() uses that same order.
+mlb = MultiLabelBinarizer(classes=all_genres)
+real_genres_matrix = mlb.fit_transform(movie_genres)
+pred_genres_matrix = mlb.transform(y_pred)
+
+# F1 scores computed by the rllm.utils helpers imported at the top of the script.
+macro_f1 = macro_f1_score(real_genres_matrix, pred_genres_matrix)
+micro_f1 = micro_f1_score(real_genres_matrix, pred_genres_matrix)
+
+time_end = time.time()
+
+# Report the metrics and the wall-clock time elapsed since `time_start`.
+print(f"macro_f1: {macro_f1}")
+print(f"micro_f1: {micro_f1}")
+print(f"Total time: {time_end - time_start}s")
+# print(f"Total USD$: {total_cost}")
diff --git a/examples/LIFT/regression/data/some_train_data b/examples/LIFT/regression/data/some_train_data
diff --git a/examples/LIFT/regression/train.bat b/examples/LIFT/regression/train.bat
@@ -0,0 +1,3 @@
+call conda activate rllm
+python train.py
+pause
diff --git a/examples/LIFT/regression/train.py b/examples/LIFT/regression/train.py
@@ -0,0 +1,153 @@
+# Paper: LIFT: Language-Interfaced FineTuning for Non-Language Machine Learning Tasks (NeurIPS 2022)
+# mae_loss: 1.044205298013245
+# Total time: 4518.223552465439s
+
+
+
+import pandas as pd
+from functools import partial
+import sys
+import time
+
+sys.path.append("../src")
+from utils.helper import data2text,write_jsonl
+import models.lora_gptj as GPTJ
+from run_exps_helper import *
+import torch
+from sklearn.preprocessing import MultiLabelBinarizer
+
+sys.path.append("../../../")
+from rllm.utils import mae, get_llm_chat_cost
+# Wall-clock start: the "Total time" printed at the end covers data prep, fine-tuning and inference.
+time_start = time.time()
+
+def df2prompts(df:pd.DataFrame, init = '',end = '',prompts_each_user = 1, n_given_rows = 5,sample_users=100,label =True):
+    """Build few-shot rating-prediction prompts from a ratings table grouped by 'UserID'.
+
+    For every selected user, `prompts_each_user` prompts are made: `n_given_rows`
+    rated movies (sampled with replacement) serve as context and one more sampled
+    row is the movie to predict. Titles and genres come from the module-level
+    `movies` frame, matched on 'MovielensID'. `sample_users` users are drawn at
+    random; a falsy value uses every user. Returns a list of strings: JSON lines
+    with "prompt"/"completion" when `label` is true, else the bare prompt + '###'.
+    """
+    def describe(movie_id):
+        # 'Title: ... Genre: ...' for one movie, with quote characters stripped.
+        info = movies[movies["MovielensID"] == movie_id]
+        title = str(info['Title'].values[0]).replace("'", "").replace('"', '')
+        genre = str(info['Genre'].values[0]).replace("'", "").replace('"', '')
+        return 'Title: ' + title + ' Genre: ' + genre
+
+    grouped = df.groupby('UserID')
+    if sample_users:
+        selected_users = grouped['UserID'].unique().sample(n=sample_users, replace=False)
+        selected_groups = [grouped.get_group(user[0]) for user in selected_users]
+    else:
+        selected_groups = [group for _, group in grouped]
+
+    jsonl = []
+    for group in selected_groups:
+        for _ in range(prompts_each_user):
+            given_rows = group.sample(n = n_given_rows,replace= True )
+            infer_rows = group.sample(n = 1,replace= True)
+            prompt = init
+            for n, (_, row) in enumerate(given_rows.iterrows(), start=1):
+                rating = str(row['Rating']).replace("'", "").replace('"', '')
+                prompt += str(n) + ') ' + describe(row['MovieID']) + ' Rating: ' + rating + '; '
+            prompt += 'Now I want you to predict the user\'s ratings for the following movie(s): '
+            for _, row in infer_rows.iterrows():
+                # Bug fix: describe the movie whose rating is the completion. The old
+                # loop reused `movie_info` of the last context row, so the prompt asked
+                # about a different movie than the one the label belonged to.
+                prompt += describe(row['MovieID']) + '; '
+            prompt += end
+            if label:
+                completion = "|".join([str(row['Rating']) for index,row in infer_rows.iterrows()])
+                final_prompt = "{\"prompt\":\"%s###\", \"completion\":\"%s@@@\"}" % (prompt, completion)
+            else:
+                final_prompt = f"{prompt}###"
+            jsonl.append(final_prompt)
+    return jsonl
+
+
+
+
+
+parser = argparse.ArgumentParser(description='')
+parser.add_argument("-g", "--gpu_id", default=0, type=int)
+parser.add_argument("--local_rank", default=-1, type=int)
+parser.add_argument("--seed", default=12345, type=int)
+parser.add_argument("-p", "--is_permuted", action="store_true")
+parser.add_argument("-v", "--eval", default=0, type=int)
+args = parser.parse_args()
+# Fall back to CPU without touching the CUDA API: set_device() raises on a CUDA-less host.
+device = torch.device(f'cuda:{args.gpu_id}') if torch.cuda.is_available() else 'cpu'
+if torch.cuda.is_available():
+    torch.cuda.set_device(args.gpu_id)
+
+# Load the rel-movielens1m regression tables; `movies` is read as a global inside df2prompts.
+users = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/regression/users.csv')
+train = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/regression/ratings/train.csv')
+val = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/regression/ratings/validation.csv')
+test = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/regression/ratings/test.csv')
+movies = pd.read_csv(
+    '../../../rllm/datasets/rel-movielens1m/regression/movies.csv')
+
+init= 'Given a user\'s past movie ratings in the format: Title, Genres, Rating (Note: Ratings range from 1 to 5)'
+#end = 'What\'s the rating that the user will give to the movie(s)? Give a single number as rating if there\'s only one movie, else return like this: rating_for_movie1|rating_for_movie_2|...|rating_for_movie_n. Do not say anything else.'
+end = 'What\'s the rating that the user will give to the movie? Give a single number as rating.'
+# Train prompts use 1000 randomly sampled users; val/test use every user (sample_users=None).
+train_prompts = df2prompts(train, init, end,sample_users=1000)
+val_prompts = df2prompts(val, init, end,sample_users=None)
+test_prompts = df2prompts(test, init, end,sample_users=None)
+
+# NOTE(review): finetune() below reads 'data/train.json' etc.; presumably write_jsonl writes under data/ - confirm.
+write_jsonl('\n'.join(train_prompts),'train.json')
+write_jsonl('\n'.join(val_prompts),'val.json')
+write_jsonl('\n'.join(test_prompts),'test.json')
+y_val = val['Rating']
+y_test = test['Rating']
+
+
+
+
+
+
+# Fine-tune the LoRaQGPTJ wrapper on the prompt files, then query it on the test prompts.
+gpt = GPTJ.LoRaQGPTJ(adapter=True, device=device)
+train_configs={'learning_rate': 1e-5, 'batch_size': 1, 'epochs':1,  'weight_decay': 0.01, 'warmup_steps': 6}
+gpt.finetune('data/train.json', 'data/val.json', train_configs, saving_checkpoint=False)
+
+test_prompts = extract_prompts('data/test.json')
+test_completions = extract_completion('data/test.json')
+
+pred= query(gpt, test_prompts,bs=8)
+print(pred)
+
+
+def _to_rating(text):
+    """Parse one model output as an integer rating; unparsable output counts as 0."""
+    try:
+        return int(text)
+    except (TypeError, ValueError):
+        # Only conversion failures are tolerated. The former bare `except:` also
+        # caught KeyboardInterrupt/SystemExit and could hide unrelated bugs.
+        return 0
+
+
+pred_l = [_to_rating(x) for x in pred]
+y_pred = pd.DataFrame({'Rating':pred_l})['Rating']
+
+# Ground truth is re-read from test.json: the first character of each completion.
+y_test = pd.DataFrame({'Rating':[int(x[0]) for x in test_completions]})['Rating']
+
+mae_loss = mae(y_test, y_pred)
+
+time_end = time.time()
+
+print(f"mae_loss: {mae_loss}")
+print(f"Total time: {time_end - time_start}s")