forked from yuqingcuiyuki/recommendation-system
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel_lightfm_tuneN.py
94 lines (68 loc) · 3.02 KB
/
model_lightfm_tuneN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os

import numpy as np
import pandas as pd
from lightfm import LightFM
from scipy.sparse import coo_matrix
# Load the training interactions. The rest of the script reads the columns
# 'user_id', 'recording_msid' and 'rating' from this frame.
# NOTE: the path is relative, so the script has to be launched from the
# directory that contains 'Downloads/'.
train_mat = pd.read_parquet('Downloads/train_mat.parquet')

# Quick sanity check of what was loaded.
print(train_mat.head())
print(train_mat.shape)

# Shrink the frame by dropping the listed rating values in a single pass
# (one boolean mask instead of one full DataFrame copy per value).
# Append further values (e.g. 5, 6, 7) to filter more aggressively.
# NOTE(review): the log message says "rating > 4", but only the exact values
# listed here are removed; any other value (e.g. 0 or a fractional rating)
# is kept -- confirm against the actual rating scale.
excluded_ratings = [1, 2, 3, 4]
print('reduce size of df by rating > 4')
train_mat = train_mat[~train_mat['rating'].isin(excluded_ratings)]

# Keep only recordings that still appear in at least 5 rows.
# NOTE(review): the log message says "<= 5 users", but the code keeps
# counts >= 5 (i.e. removes < 5) and counts rows, not distinct users --
# confirm which of the two is intended.
print('remove recording_msid that listened <= 5 users')
msid_counts = train_mat['recording_msid'].value_counts()
valid_msid_values = msid_counts[msid_counts >= 5].index
train_mat = train_mat[train_mat['recording_msid'].isin(valid_msid_values)]
print('reduced shape', train_mat.shape)
# Give every distinct user and every distinct recording a dense, zero-based
# index (numbered in order of first appearance) so that they can be used as
# row / column coordinates of the interaction matrix.
print('create mapping')
user_id_mapping = {
    user: row for row, user in enumerate(train_mat['user_id'].unique())
}
recording_msid_mapping = {
    msid: col
    for col, msid in enumerate(train_mat['recording_msid'].unique())
}

# Assemble the interaction matrix in COO form: one entry per row of
# train_mat, valued by its rating, positioned at (user index, recording index).
print('start creating sparse matrix')
row_indices = train_mat['user_id'].map(user_id_mapping)
col_indices = train_mat['recording_msid'].map(recording_msid_mapping)
matrix = coo_matrix((train_mat['rating'], (row_indices, col_indices)))
print(matrix.shape)
print('matrix done, start modeling')
# Hyperparameter sweep: train one WARP LightFM model per embedding size and
# store, for every user, the 100 highest-scoring recordings.
no_components = [40, 50, 100]
i_alpha = 1e-4
u_alpha = 1e-4

# Everything below is independent of the hyperparameters, so compute it once
# instead of once per model (or once per user).
n_users, n_items = matrix.shape
# Position i holds the original id whose matrix index is i; this relies on
# the matrix indices having been assigned in .unique() order.
user_id_map = train_mat['user_id'].unique()
recording_msid_map = train_mat['recording_msid'].unique()
all_item_ids = np.arange(n_items)

# to_parquet does not create missing directories; make sure the target
# exists up front rather than failing after the first full training run.
os.makedirs('lightfm_results_N', exist_ok=True)

for num in no_components:
    model = LightFM(
        loss='warp',
        random_state=1004,
        item_alpha=i_alpha,
        user_alpha=u_alpha,
        no_components=num,
    )
    model.fit(matrix, epochs=10)
    print('model fit done')

    print('start prediciton')
    # One [user_id, [top recording ids]] entry per user.
    results = []
    for user_id in range(n_users):
        print('at user', user_id)
        # Score every item for this user.
        scores = model.predict(user_id, all_item_ids)
        # Indices of the 100 highest scores, best first.
        top_items = np.argsort(-scores)[:100]
        results.append(
            [user_id_map[user_id], [recording_msid_map[i] for i in top_items]]
        )
    print('prediction done')

    print('store as df')
    result_df = pd.DataFrame(results, columns=['user_id', 'predictions'])
    print(result_df.head())
    print(f'save as pq for parameters: no_components={num}, item_alpha={i_alpha}, user_alpha={u_alpha}')
    result_df.to_parquet(f'lightfm_results_N/train_pred_{num}_{i_alpha}_{u_alpha}.parquet')

print('done')