diff --git "a/Week15_\353\263\265\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" "b/Week15_\353\263\265\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" new file mode 100644 index 0000000..812eae7 --- /dev/null +++ "b/Week15_\353\263\265\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" @@ -0,0 +1,1253 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 320 + }, + "id": "_2wx-a72aIKD", + "outputId": "1ed99227-dd6f-4fc2-cbaa-f9dfc4bc92c8" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(4658, 8)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " train_id name item_condition_id \\\n", + "0 0 MLB Cincinnati Reds T Shirt Size XL 3 \n", + "1 1 Razer BlackWidow Chroma Keyboard 3 \n", + "2 2 AVA-VIV Blouse 1 \n", + "\n", + " category_name brand_name price \\\n", + "0 Men/Tops/T-shirts NaN 10.0 \n", + "1 Electronics/Computers & Tablets/Components & P... Razer 52.0 \n", + "2 Women/Tops & Blouses/Blouse Target 10.0 \n", + "\n", + " shipping item_description \n", + "0 1 No description yet \n", + "1 0 This keyboard is in great condition and works ... \n", + "2 1 Adorable top with a hint of lace and a key hol... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train_idnameitem_condition_idcategory_namebrand_namepriceshippingitem_description
00MLB Cincinnati Reds T Shirt Size XL3Men/Tops/T-shirtsNaN10.01No description yet
11Razer BlackWidow Chroma Keyboard3Electronics/Computers & Tablets/Components & P...Razer52.00This keyboard is in great condition and works ...
22AVA-VIV Blouse1Women/Tops & Blouses/BlouseTarget10.01Adorable top with a hint of lace and a key hol...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "mercari_df", + "summary": "{\n \"name\": \"mercari_df\",\n \"rows\": 4658,\n \"fields\": [\n {\n \"column\": \"train_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1344,\n \"min\": 0,\n \"max\": 4657,\n \"num_unique_values\": 4658,\n \"samples\": [\n 1231,\n 3915,\n 2536\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4606,\n \"samples\": [\n \"Watermelon Shisha hookah pen\",\n \"Lularoe large irma lot black,grey,olive\",\n \"For Love & Lemons, (XS) Dress\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_condition_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 1,\n 5,\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"category_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 501,\n \"samples\": [\n \"Women/Tops & Blouses/Wrap\",\n \"Women/Sweaters/Full Zip\",\n \"Men/Tops/Henley\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"brand_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 501,\n \"samples\": [\n \"Pyrex\",\n \"Marucci\",\n \"O'Neill\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 42.37784448649254,\n \"min\": 0.0,\n \"max\": 1506.0,\n \"num_unique_values\": 169,\n \"samples\": [\n 111.0,\n 51.0,\n 245.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"shipping\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n 
\"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"item_description\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4322,\n \"samples\": [\n \"Brand new! Size small.\",\n \"Silver tone, silver tone glass cabochon with the most memorable quote from the film. Mesh bag and chain included. Free shipping.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 3 + } + ], + "source": [ + "from sklearn.linear_model import Ridge, LogisticRegression\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "import pandas as pd\n", + "\n", + "# Now read the decompressed tsv file\n", + "mercari_df= pd.read_csv('/content/mercari_train.tsv', sep='\\t')\n", + "print(mercari_df.shape)\n", + "mercari_df.head(3)" + ] + }, + { + "cell_type": "code", + "source": [ + "print(mercari_df.info())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cvx9BNIyfRl-", + "outputId": "9be3bc06-235d-4f80-ce10-e8cf30317bb1" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 4658 entries, 0 to 4657\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 train_id 4658 non-null int64 \n", + " 1 name 4658 non-null object \n", + " 2 item_condition_id 4658 non-null int64 \n", + " 3 category_name 4636 non-null object \n", + " 4 brand_name 2650 non-null object \n", + " 5 price 4658 non-null float64\n", + " 6 shipping 4658 non-null int64 \n", + " 7 item_description 4658 non-null object \n", + "dtypes: float64(1), int64(3), object(4)\n", + "memory usage: 291.3+ KB\n", + "None\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import matplotlib.pyplot as plt\n", + "import seaborn as 
sns\n", + "y_train_df = mercari_df['price']\n", + "plt.figure(figsize=(6,4))\n", + "sns.histplot(y_train_df,bins=100)\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 258 + }, + "id": "UlSIB-r_fLko", + "outputId": "19b038e2-dfbb-404f-dd6d-c4042df7015b" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "y_train_df = np.log1p(y_train_df)\n", + "sns.histplot(y_train_df, bins=50)\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 287 + }, + "id": "xwAZnvSff-2U", + "outputId": "9199cdb3-a219-4e18-9be2-7befdb9673c0" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "mercari_df['price'] = np.log1p(mercari_df['price'])\n", + "mercari_df['price'].head(3)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 178 + }, + "id": "Wgj15ZEbgeBY", + "outputId": "8643b00f-9208-47e3-ea67-937d31af688c" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 2.397895\n", + "1 3.970292\n", + "2 2.397895\n", + "Name: price, dtype: float64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
02.397895
13.970292
22.397895
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('Shipping 값 유형:\\n',mercari_df['shipping'].value_counts())\n", + "print('item_condition_id 값 유형: \\n',mercari_df ['item_condition_id' ].value_counts())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wDSzgEWPg2CP", + "outputId": "01f864fe-34d2-48db-fd68-bc6f1a4163a4" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Shipping 값 유형:\n", + " shipping\n", + "0 2539\n", + "1 2119\n", + "Name: count, dtype: int64\n", + "item_condition_id 값 유형: \n", + " item_condition_id\n", + "1 2064\n", + "3 1338\n", + "2 1138\n", + "4 111\n", + "5 7\n", + "Name: count, dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "boolean_cond = mercari_df['item_description'] == 'No description yet'\n", + "mercari_df[boolean_cond]['item_description'].count()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BCXU1-SihUUv", + "outputId": "72c2d7a7-dc60-4b24-9386-ca1fe4ac7990" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "np.int64(4658)" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def split_cat(category_name):\n", + " try:\n", + " return category_name.split('/')\n", + " except:\n", + " return ['Other_Null' , 'Other_Null' , 'Other_Null']\n", + "\n", + "# 대, 중, 소 카테고리를 분리하여 새로운 칼럼 생성\n", + "mercari_df['cat_dae'], mercari_df['cat_jung'], mercari_df['cat_so'] = \\\n", + " zip(*mercari_df['category_name'].apply(lambda x : split_cat(x)))\n", + "\n", + "print('대분류 유형:\\n', mercari_df['cat_dae'].value_counts())\n", + "print('중분류 개수:', mercari_df['cat_jung'].nunique())\n", + "print('소분류 개수:', mercari_df['cat_so'].nunique())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + 
# %% Replace missing values with explicit placeholder strings
# Placeholder written into each column wherever the value is null.
null_placeholders = {
    'brand_name': 'None',
    'category_name': 'Other_Null',
    'item_description': 'No description yet',
}
for column_name, placeholder in null_placeholders.items():
    mercari_df[column_name] = mercari_df[column_name].fillna(value=placeholder)

# Per-column count of remaining nulls.
mercari_df.isna().sum()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
train_id0
name0
item_condition_id0
category_name0
brand_name0
price0
shipping0
item_description0
cat_dae0
cat_jung0
cat_so0
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('brand_name 의 유형 건수 :', mercari_df['brand_name'].nunique())\n", + "print('brand_name sample 5건 : \\n', mercari_df['brand_name'].value_counts()[:5])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EzcWnQDAhzX3", + "outputId": "74b58a9a-6351-47a8-8ea3-58879fdfd7cf" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "brand_name 의 유형 건수 : 502\n", + "brand_name sample 5건 : \n", + " brand_name\n", + "None 2008\n", + "Nike 156\n", + "Victoria's Secret 152\n", + "PINK 150\n", + "LuLaRoe 91\n", + "Name: count, dtype: int64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('name 의 종류 개수 :', mercari_df['name'].nunique())\n", + "print('name sample 7건: \\n', mercari_df['name'][:7])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6_x-kBydh2M-", + "outputId": "7af948f9-36b9-4d92-e577-46b3645559bc" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "name 의 종류 개수 : 4606\n", + "name sample 7건: \n", + " 0 MLB Cincinnati Reds T Shirt Size XL\n", + "1 Razer BlackWidow Chroma Keyboard\n", + "2 AVA-VIV Blouse\n", + "3 Leather Horse Statues\n", + "4 24K GOLD plated rose\n", + "5 Bundled items requested for Ruie\n", + "6 Acacia pacific tides santorini top\n", + "Name: name, dtype: object\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pd.set_option('max_colwidth', 200)\n", + "# item_description의 평균 문자열 크기\n", + "print('item_description 평균 문자열 크기:', mercari_df['item_description'].str.len().mean())\n", + "mercari_df['item_description'][:2]\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 165 + }, + "id": "7mxDIMLoiSbs", + "outputId": "8084eb84-a525-435c-c31b-c19841c804e7" + }, + "execution_count": 19, + 
"outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "item_description 평균 문자열 크기: 18.0\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 No description yet\n", + "1 No description yet\n", + "Name: item_description, dtype: object" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_description
0No description yet
1No description yet
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# name은 CountVectorizer, item_description은 TfidfVectorizer 사용\n", + "cnt_vec = CountVectorizer()\n", + "X_name = cnt_vec.fit_transform(mercari_df.name)\n", + "\n", + "tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1,3), stop_words='english')\n", + "X_descp = tfidf_descp.fit_transform(mercari_df['item_description'])\n", + "\n", + "print('name vector shape:', X_name.shape)\n", + "print('item_description vector shape:', X_descp.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9d8_mipzie84", + "outputId": "59b83970-c8dd-4194-d531-936d57e8b10a" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "name vector shape: (4658, 4922)\n", + "item_description vector shape: (4658, 1)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.preprocessing import LabelBinarizer\n", + "\n", + "# 나머지 피처들을 희소 행렬 원-핫 인코딩(LabelBinarizer) 변환\n", + "lb_brand_name= LabelBinarizer(sparse_output=True)\n", + "X_brand = lb_brand_name.fit_transform(mercari_df['brand_name'])\n", + "\n", + "lb_item_cond_id = LabelBinarizer(sparse_output=True)\n", + "X_item_cond_id = lb_item_cond_id.fit_transform(mercari_df['item_condition_id'])\n", + "\n", + "lb_shipping= LabelBinarizer(sparse_output=True)\n", + "X_shipping = lb_shipping.fit_transform(mercari_df['shipping'])\n", + "\n", + "# 카테고리 분리된 피처들을 인코딩\n", + "lb_cat_dae = LabelBinarizer(sparse_output=True)\n", + "X_cat_dae = lb_cat_dae.fit_transform(mercari_df['cat_dae'])\n", + "\n", + "lb_cat_jung = LabelBinarizer(sparse_output=True)\n", + "X_cat_jung = lb_cat_jung.fit_transform(mercari_df['cat_jung'])\n", + "\n", + "lb_cat_so = LabelBinarizer(sparse_output=True)\n", + "X_cat_so = lb_cat_so.fit_transform(mercari_df['cat_so'])" + ], + "metadata": { + "id": "v7MwUmx2ibEZ" + }, + "execution_count": 21, + 
"outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 변환된 데이터 세트의 타입과 shape 확인\n", + "print(type(X_brand), type(X_item_cond_id), type(X_shipping))\n", + "print('X_brand shape: {0}, X_item_cond_id shape: {1}'.format(X_brand.shape, X_item_cond_id.shape))\n", + "print('X_shipping shape: {0}, X_cat_dae shape: {1}'.format(X_shipping.shape, X_cat_dae.shape))\n", + "print('X_cat_jung shape: {0}, X_cat_so shape: {1}'.format(X_cat_jung.shape, X_cat_so.shape))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xAV62qI0il6D", + "outputId": "75ea496a-6387-41f1-8bbe-2de229d911d3" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " \n", + "X_brand shape: (4658, 502), X_item_cond_id shape: (4658, 5)\n", + "X_shipping shape: (4658, 1), X_cat_dae shape: (4658, 11)\n", + "X_cat_jung shape: (4658, 98), X_cat_so shape: (4658, 388)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from scipy.sparse import hstack\n", + "import gc\n", + "\n", + "sparse_matrix_list = (X_name, X_descp, X_brand, X_item_cond_id,\n", + " X_shipping, X_cat_dae, X_cat_jung, X_cat_so)\n", + "\n", + "# hstack을 이용해 희소 행렬들을 결합\n", + "X_features_sparse = hstack(sparse_matrix_list).tocsr()\n", + "print(type(X_features_sparse), X_features_sparse.shape)\n", + "\n", + "# 데이터 결합 후 불필요한 메모리 해제를 위해 del과 gc 사용 가능 (실습 시 유의)\n", + "# del X_features_sparse\n", + "# gc.collect()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QsYR-HF6ip6I", + "outputId": "a8c9c0e6-8fbb-4649-8815-13ddedce0ba7" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " (4658, 5928)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def rmsle(y , y_pred):\n", + " # 언더플로 방지를 위해 log1p 사용\n", + " return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y_pred), 2)))\n", + "\n", + "def evaluate_org_price(y_test , preds):\n", + 
" # 로그 변환된 값을 다시 원래 가격으로 변환 (expm1 사용)\n", + " preds_expm = np.expm1(preds)\n", + " y_test_expm = np.expm1(y_test)\n", + "\n", + " # RMSLE 계산\n", + " rmsle_result = rmsle(y_test_expm, preds_expm)\n", + " return rmsle_result" + ], + "metadata": { + "id": "ISIZo8AwirDm" + }, + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import gc\n", + "from scipy.sparse import hstack\n", + "def model_train_predict(model, matrix_list):\n", + " # scipy.sparse 모듈의 hstack을 이용해 희소 행렬 결합\n", + " X = hstack(matrix_list).tocsr()\n", + " X_train, X_test, y_train, y_test = train_test_split(X, mercari_df['price'],\n", + " test_size=0.2, random_state=156)\n", + " # 모델 학습 및 예측\n", + " model.fit(X_train, y_train)\n", + " preds = model.predict(X_test)\n", + " del X, X_train, X_test, y_train\n", + " gc.collect()\n", + " return preds, y_test" + ], + "metadata": { + "id": "xV9ZjzAoi5B-" + }, + "execution_count": 27, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "linear_model = Ridge(solver = \"lsqr\", fit_intercept=False)\n", + "sparse_matrix_list = (X_name, X_brand, X_item_cond_id,\n", + " X_shipping, X_cat_dae, X_cat_jung, X_cat_so)\n", + "linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)\n", + "print('Item Description을 제외했을 때 rmsle 값:', evaluate_org_price(y_test, linear_preds))\n", + "sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,\n", + " X_shipping, X_cat_dae, X_cat_jung, X_cat_so)\n", + "linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)\n", + "print('Item Description을 포함한 rmsle 값:', evaluate_org_price(y_test, linear_preds))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3UUUiTqNjECX", + "outputId": "6f5d8176-2500-4b8d-89ef-875864d16bca" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Item Description을 제외했을 때 rmsle 값: 
# %% LightGBM on the full feature set (description included)
from lightgbm import LGBMRegressor

sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)

# verbose=-1 silences LightGBM's info and warning log. Without it this cell
# saved about 80 repeated "No further splits with positive gain" lines.
# NOTE(review): with these hyperparameters the saved run scores 0.722 against
# 0.619 for Ridge on this 4,658-row sample; they may need tuning (confirm).
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125,
                           random_state=156, verbose=-1)
lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)
print('LightGBM rmsle:', evaluate_org_price(y_test, lgbm_preds))
positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: 
-inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] 
[Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits with positive gain, best gain: -inf\n", + "[LightGBM] [Warning] No further splits 
# %% Weighted blend of the LightGBM and Ridge predictions
# Blend weights for the two models (they sum to 1).
lgbm_weight = 0.45
ridge_weight = 0.55

# Both prediction arrays are on the log1p price scale.
preds = lgbm_weight * lgbm_preds + ridge_weight * linear_preds
ensemble_score = evaluate_org_price(y_test, preds)
print('LightGBM과 Ridge를 ensemble한 최종 rmsle 값:', ensemble_score)
"execute_result", + "data": { + "text/plain": [ + " budget genres \\\n", + "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", + "\n", + " homepage id \\\n", + "0 http://www.avatarmovie.com/ 19995 \n", + "\n", + " keywords original_language \\\n", + "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", + "\n", + " original_title overview \\\n", + "0 Avatar In the 22nd century, a paraplegic Marine is di... \n", + "\n", + " popularity production_companies \\\n", + "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... \n", + "\n", + " production_countries release_date revenue \\\n", + "0 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2009-12-10 2787965087 \n", + "\n", + " runtime spoken_languages status \\\n", + "0 162.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", + "\n", + " tagline title vote_average vote_count \n", + "0 Enter the World of Pandora. Avatar 7.2 11800 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companiesproduction_countriesrelease_daterevenueruntimespoken_languagesstatustaglinetitlevote_averagevote_count
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...2009-12-102787965087162.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.211800
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "movies", + "summary": "{\n \"name\": \"movies\",\n \"rows\": 4803,\n \"fields\": [\n {\n \"column\": \"budget\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40722391,\n \"min\": 0,\n \"max\": 380000000,\n \"num_unique_values\": 436,\n \"samples\": [\n 439000,\n 68000000,\n 700000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"genres\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1175,\n \"samples\": [\n \"[{\\\"id\\\": 14, \\\"name\\\": \\\"Fantasy\\\"}, {\\\"id\\\": 12, \\\"name\\\": \\\"Adventure\\\"}, {\\\"id\\\": 16, \\\"name\\\": \\\"Animation\\\"}]\",\n \"[{\\\"id\\\": 28, \\\"name\\\": \\\"Action\\\"}, {\\\"id\\\": 35, \\\"name\\\": \\\"Comedy\\\"}, {\\\"id\\\": 80, \\\"name\\\": \\\"Crime\\\"}, {\\\"id\\\": 18, \\\"name\\\": \\\"Drama\\\"}]\",\n \"[{\\\"id\\\": 12, \\\"name\\\": \\\"Adventure\\\"}, {\\\"id\\\": 16, \\\"name\\\": \\\"Animation\\\"}, {\\\"id\\\": 10751, \\\"name\\\": \\\"Family\\\"}, {\\\"id\\\": 14, \\\"name\\\": \\\"Fantasy\\\"}, {\\\"id\\\": 878, \\\"name\\\": \\\"Science Fiction\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"homepage\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1691,\n \"samples\": [\n \"https://www.warnerbros.com/running-scared\",\n \"http://www.51birchstreet.com/index.php\",\n \"http://movies2.foxjapan.com/glee/\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 88694,\n \"min\": 5,\n \"max\": 459488,\n \"num_unique_values\": 4803,\n \"samples\": [\n 8427,\n 13006,\n 18041\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"keywords\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4222,\n \"samples\": [\n 
\"[{\\\"id\\\": 782, \\\"name\\\": \\\"assassin\\\"}, {\\\"id\\\": 1872, \\\"name\\\": \\\"loss of father\\\"}, {\\\"id\\\": 2908, \\\"name\\\": \\\"secret society\\\"}, {\\\"id\\\": 3045, \\\"name\\\": \\\"mission of murder\\\"}, {\\\"id\\\": 9748, \\\"name\\\": \\\"revenge\\\"}]\",\n \"[{\\\"id\\\": 2987, \\\"name\\\": \\\"gang war\\\"}, {\\\"id\\\": 4942, \\\"name\\\": \\\"victim of murder\\\"}, {\\\"id\\\": 5332, \\\"name\\\": \\\"greed\\\"}, {\\\"id\\\": 6062, \\\"name\\\": \\\"hostility\\\"}, {\\\"id\\\": 156212, \\\"name\\\": \\\"spaghetti western\\\"}]\",\n \"[{\\\"id\\\": 703, \\\"name\\\": \\\"detective\\\"}, {\\\"id\\\": 1299, \\\"name\\\": \\\"monster\\\"}, {\\\"id\\\": 6101, \\\"name\\\": \\\"engine\\\"}, {\\\"id\\\": 10988, \\\"name\\\": \\\"based on tv series\\\"}, {\\\"id\\\": 15162, \\\"name\\\": \\\"dog\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"original_language\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 37,\n \"samples\": [\n \"xx\",\n \"ta\",\n \"es\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"original_title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4801,\n \"samples\": [\n \"I Spy\",\n \"Love Letters\",\n \"Sleepover\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"overview\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4800,\n \"samples\": [\n \"When the Switchblade, the most sophisticated prototype stealth fighter created yet, is stolen from the U.S. government, one of the United States' top spies, Alex Scott, is called to action. What he doesn't expect is to get teamed up with a cocky civilian, World Class Boxing Champion Kelly Robinson, on a dangerous top secret espionage mission. 
Their assignment: using equal parts skill and humor, catch Arnold Gundars, one of the world's most successful arms dealers.\",\n \"When \\\"street smart\\\" rapper Christopher \\\"C-Note\\\" Hawkins (Big Boi) applies for a membership to all-white Carolina Pines Country Club, the establishment's proprietors are hardly ready to oblige him.\",\n \"As their first year of high school looms ahead, best friends Julie, Hannah, Yancy and Farrah have one last summer sleepover. Little do they know they're about to embark on the adventure of a lifetime. Desperate to shed their nerdy status, they take part in a night-long scavenger hunt that pits them against their popular archrivals. Everything under the sun goes on -- from taking Yancy's father's car to sneaking into nightclubs!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"popularity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31.816649749537806,\n \"min\": 0.0,\n \"max\": 875.581305,\n \"num_unique_values\": 4802,\n \"samples\": [\n 13.267631,\n 0.010909,\n 5.842299\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"production_companies\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3697,\n \"samples\": [\n \"[{\\\"name\\\": \\\"Paramount Pictures\\\", \\\"id\\\": 4}, {\\\"name\\\": \\\"Cherry Alley Productions\\\", \\\"id\\\": 2232}]\",\n \"[{\\\"name\\\": \\\"Twentieth Century Fox Film Corporation\\\", \\\"id\\\": 306}, {\\\"name\\\": \\\"Dune Entertainment\\\", \\\"id\\\": 444}, {\\\"name\\\": \\\"Regency Enterprises\\\", \\\"id\\\": 508}, {\\\"name\\\": \\\"Guy Walks into a Bar Productions\\\", \\\"id\\\": 2645}, {\\\"name\\\": \\\"Deep River Productions\\\", \\\"id\\\": 2646}, {\\\"name\\\": \\\"Friendly Films (II)\\\", \\\"id\\\": 81136}]\",\n \"[{\\\"name\\\": \\\"Twentieth Century Fox Film Corporation\\\", \\\"id\\\": 306}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"production_countries\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 469,\n \"samples\": [\n \"[{\\\"iso_3166_1\\\": \\\"ES\\\", \\\"name\\\": \\\"Spain\\\"}, {\\\"iso_3166_1\\\": \\\"GB\\\", \\\"name\\\": \\\"United Kingdom\\\"}, {\\\"iso_3166_1\\\": \\\"US\\\", \\\"name\\\": \\\"United States of America\\\"}, {\\\"iso_3166_1\\\": \\\"FR\\\", \\\"name\\\": \\\"France\\\"}]\",\n \"[{\\\"iso_3166_1\\\": \\\"US\\\", \\\"name\\\": \\\"United States of America\\\"}, {\\\"iso_3166_1\\\": \\\"CA\\\", \\\"name\\\": \\\"Canada\\\"}, {\\\"iso_3166_1\\\": \\\"DE\\\", \\\"name\\\": \\\"Germany\\\"}]\",\n \"[{\\\"iso_3166_1\\\": \\\"DE\\\", \\\"name\\\": \\\"Germany\\\"}, {\\\"iso_3166_1\\\": \\\"ES\\\", \\\"name\\\": \\\"Spain\\\"}, {\\\"iso_3166_1\\\": \\\"GB\\\", \\\"name\\\": \\\"United Kingdom\\\"}, {\\\"iso_3166_1\\\": \\\"US\\\", \\\"name\\\": \\\"United States of America\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"release_date\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 3280,\n \"samples\": [\n \"1966-10-16\",\n \"1987-07-31\",\n \"1993-09-23\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"revenue\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 162857100,\n \"min\": 0,\n \"max\": 2787965087,\n \"num_unique_values\": 3297,\n \"samples\": [\n 11833696,\n 10462500,\n 17807569\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"runtime\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22.611934588844207,\n \"min\": 0.0,\n \"max\": 338.0,\n \"num_unique_values\": 156,\n \"samples\": [\n 74.0,\n 85.0,\n 170.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spoken_languages\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 544,\n \"samples\": [\n \"[{\\\"iso_639_1\\\": \\\"es\\\", \\\"name\\\": 
\\\"Espa\\\\u00f1ol\\\"}, {\\\"iso_639_1\\\": \\\"en\\\", \\\"name\\\": \\\"English\\\"}, {\\\"iso_639_1\\\": \\\"fr\\\", \\\"name\\\": \\\"Fran\\\\u00e7ais\\\"}, {\\\"iso_639_1\\\": \\\"hu\\\", \\\"name\\\": \\\"Magyar\\\"}]\",\n \"[{\\\"iso_639_1\\\": \\\"en\\\", \\\"name\\\": \\\"English\\\"}, {\\\"iso_639_1\\\": \\\"it\\\", \\\"name\\\": \\\"Italiano\\\"}, {\\\"iso_639_1\\\": \\\"pt\\\", \\\"name\\\": \\\"Portugu\\\\u00eas\\\"}]\",\n \"[{\\\"iso_639_1\\\": \\\"de\\\", \\\"name\\\": \\\"Deutsch\\\"}, {\\\"iso_639_1\\\": \\\"it\\\", \\\"name\\\": \\\"Italiano\\\"}, {\\\"iso_639_1\\\": \\\"la\\\", \\\"name\\\": \\\"Latin\\\"}, {\\\"iso_639_1\\\": \\\"pl\\\", \\\"name\\\": \\\"Polski\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"status\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Released\",\n \"Post Production\",\n \"Rumored\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"tagline\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3944,\n \"samples\": [\n \"When you're 17, every day is war.\",\n \"An Unspeakable Horror. A Creative Genius. 
Captured For Eternity.\",\n \"May the schwartz be with you\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4800,\n \"samples\": [\n \"I Spy\",\n \"Who's Your Caddy?\",\n \"Sleepover\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.1946121628478925,\n \"min\": 0.0,\n \"max\": 10.0,\n \"num_unique_values\": 71,\n \"samples\": [\n 5.1,\n 7.2,\n 4.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1234,\n \"min\": 0,\n \"max\": 13752,\n \"num_unique_values\": 1609,\n \"samples\": [\n 7604,\n 3428,\n 225\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 5 + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import warnings; warnings.filterwarnings(action='ignore')\n", + "\n", + "\n", + "movies = pd.read_csv('/content/tmdb_5000_movies.csv')\n", + "\n", + "print(movies.shape)\n", + "movies.head(1)" + ] + }, + { + "cell_type": "code", + "source": [ + "movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity',\n", + "'keywords', 'overview']]" + ], + "metadata": { + "id": "6Cgppo4BnFFi" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pd.set_option('max_colwidth', 100)\n", + "movies_df[['genres','keywords']][:1]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 185 + }, + "id": "3KKHVHvUnQH0", + "outputId": "5bdfeabb-8243-46d1-cdd4-787482e5100c" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " genres \\\n", + "0 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": 
\"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {... \n", + "\n", + " keywords \n", + "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\": 2964, \"name\": \"future\"}, {\"id\": 3386, \"name\": \"sp... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
genreskeywords
0[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {...[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\": 2964, \"name\": \"future\"}, {\"id\": 3386, \"name\": \"sp...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"movies_df[['genres','keywords']][:1]\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"genres\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"[{\\\"id\\\": 28, \\\"name\\\": \\\"Action\\\"}, {\\\"id\\\": 12, \\\"name\\\": \\\"Adventure\\\"}, {\\\"id\\\": 14, \\\"name\\\": \\\"Fantasy\\\"}, {\\\"id\\\": 878, \\\"name\\\": \\\"Science Fiction\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"keywords\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"[{\\\"id\\\": 1463, \\\"name\\\": \\\"culture clash\\\"}, {\\\"id\\\": 2964, \\\"name\\\": \\\"future\\\"}, {\\\"id\\\": 3386, \\\"name\\\": \\\"space war\\\"}, {\\\"id\\\": 3388, \\\"name\\\": \\\"space colony\\\"}, {\\\"id\\\": 3679, \\\"name\\\": \\\"society\\\"}, {\\\"id\\\": 3801, \\\"name\\\": \\\"space travel\\\"}, {\\\"id\\\": 9685, \\\"name\\\": \\\"futuristic\\\"}, {\\\"id\\\": 9840, \\\"name\\\": \\\"romance\\\"}, {\\\"id\\\": 9882, \\\"name\\\": \\\"space\\\"}, {\\\"id\\\": 9951, \\\"name\\\": \\\"alien\\\"}, {\\\"id\\\": 10148, \\\"name\\\": \\\"tribe\\\"}, {\\\"id\\\": 10158, \\\"name\\\": \\\"alien planet\\\"}, {\\\"id\\\": 10987, \\\"name\\\": \\\"cgi\\\"}, {\\\"id\\\": 11399, \\\"name\\\": \\\"marine\\\"}, {\\\"id\\\": 13065, \\\"name\\\": \\\"soldier\\\"}, {\\\"id\\\": 14643, \\\"name\\\": \\\"battle\\\"}, {\\\"id\\\": 14720, \\\"name\\\": \\\"love affair\\\"}, {\\\"id\\\": 165431, \\\"name\\\": \\\"anti war\\\"}, {\\\"id\\\": 193554, \\\"name\\\": \\\"power relations\\\"}, {\\\"id\\\": 206690, \\\"name\\\": \\\"mind and soul\\\"}, {\\\"id\\\": 209714, \\\"name\\\": \\\"3d\\\"}]\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + 
"source": [ + "from ast import literal_eval\n", + "movies_df['genres'] = movies_df['genres'].apply(literal_eval)\n", + "movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)\n" + ], + "metadata": { + "id": "TYnnJFTYncem" + }, + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "movies_df['genres']= movies_df['genres'].apply(lambda x:[y['name']for y in x])\n", + "movies_df['keywords']=movies_df['keywords'].apply(lambda x:[y['name']for y in x])\n", + "movies_df[['genres','keywords']][:1]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 150 + }, + "id": "2m3xn1A7niFD", + "outputId": "20f1c989-85bb-4e40-893c-9932f432a1ef" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " genres \\\n", + "0 [Action, Adventure, Fantasy, Science Fiction] \n", + "\n", + " keywords \n", + "0 [culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
genreskeywords
0[Action, Adventure, Fantasy, Science Fiction][culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"movies_df[['genres','keywords']][:1]\",\n \"rows\": 1,\n \"fields\": [\n {\n \"column\": \"genres\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"keywords\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "# CountVectorizer를 적용하기 위해 공백문자로 word 단위가 구분되는 문자열로 변환.\n", + "movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))\n", + "count_vect = CountVectorizer(min_df=0.0, ngram_range=(1, 2))\n", + "genre_mat = count_vect.fit_transform(movies_df['genres_literal'])\n", + "print(genre_mat.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UEMGP2nqn4to", + "outputId": "73aefc4e-97f0-4873-d9fd-835c71f669e7" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(4803, 276)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics.pairwise import cosine_similarity\n", + "\n", + "genre_sim = cosine_similarity(genre_mat, genre_mat)\n", + "print(genre_sim.shape)\n", + "print(genre_sim[:2])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9qIQThEHoR6q", + "outputId": "db8677c9-cc0a-402a-f7eb-8f3a7c245560" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(4803, 4803)\n", + "[[1. 0.59628479 0.4472136 ... 0. 0. 0. ]\n", + " [0.59628479 1. 0.4 ... 0. 0. 0. 
]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]\n", + "print(genre_sim_sorted_ind[:1 ])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wVhjkaPdoiXX", + "outputId": "20780968-be62-48bb-c68e-4687dae2bd2b" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[ 0 46 3494 ... 3331 3333 2031]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def find_sim_movie(df, sorted_ind, title_name, top_n=10):\n", + " title_movie = df[df['title'] == title_name]\n", + " title_index = title_movie.index.values\n", + " similar_indexes = sorted_ind[title_index, :(top_n)]\n", + " print(similar_indexes)\n", + " similar_indexes = similar_indexes.reshape(-1)\n", + " return df.iloc[similar_indexes]" + ], + "metadata": { + "id": "-x8iVisDp468" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather',10)\n", + "similar_movies[['title','vote_average']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 436 + }, + "id": "ONHSBk0hqK3n", + "outputId": "2635dde7-081c-453f-fbcd-db698dd7e5c7" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[[1881 3378 3866 1370 1464 588 3887 3594 2839 892]]\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title vote_average\n", + "1881 The Shawshank Redemption 8.5\n", + "3378 Auto Focus 6.1\n", + "3866 City of God 8.1\n", + "1370 21 6.5\n", + "1464 Black Water Transit 0.0\n", + "588 Wall Street: Money Never Sleeps 5.8\n", + "3887 Trainspotting 7.8\n", + "3594 Spring Breakers 5.0\n", + "2839 Rounders 6.9\n", + "892 Casino 7.8" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlevote_average
1881The Shawshank Redemption8.5
3378Auto Focus6.1
3866City of God8.1
1370216.5
1464Black Water Transit0.0
588Wall Street: Money Never Sleeps5.8
3887Trainspotting7.8
3594Spring Breakers5.0
2839Rounders6.9
892Casino7.8
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"similar_movies[['title','vote_average']]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Rounders\",\n \"Auto Focus\",\n \"Wall Street: Money Never Sleeps\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.4636242498490803,\n \"min\": 0.0,\n \"max\": 8.5,\n \"num_unique_values\": 9,\n \"samples\": [\n 5.0,\n 6.1,\n 5.8\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movies_df[['title','vote_average','vote_count']].sort_values('vote_average',ascending = False)[:10]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 591 + }, + "id": "Z_RJ2Q8MqUwV", + "outputId": "dfb622a2-7f0a-41e8-c4af-74b973634ec3" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title vote_average vote_count\n", + "4662 Little Big Top 10.0 1\n", + "3519 Stiff Upper Lips 10.0 1\n", + "4045 Dancer, Texas Pop. 81 10.0 1\n", + "4247 Me You and Five Bucks 10.0 2\n", + "3992 Sardaarji 9.5 2\n", + "2386 One Man's Hero 9.3 2\n", + "1881 The Shawshank Redemption 8.5 8205\n", + "2970 There Goes My Baby 8.5 2\n", + "3337 The Godfather 8.4 5893\n", + "2796 The Prisoner of Zenda 8.4 11" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlevote_averagevote_count
4662Little Big Top10.01
3519Stiff Upper Lips10.01
4045Dancer, Texas Pop. 8110.01
4247Me You and Five Bucks10.02
3992Sardaarji9.52
2386One Man's Hero9.32
1881The Shawshank Redemption8.58205
2970There Goes My Baby8.52
3337The Godfather8.45893
2796The Prisoner of Zenda8.411
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"movies_df[['title','vote_average','vote_count']]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"The Godfather\",\n \"Stiff Upper Lips\",\n \"One Man's Hero\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7366591251499343,\n \"min\": 8.4,\n \"max\": 10.0,\n \"num_unique_values\": 5,\n \"samples\": [\n 9.5,\n 8.4,\n 9.3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3020,\n \"min\": 1,\n \"max\": 8205,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 11,\n 8205\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "C = movies_df['vote_average'].mean()\n", + "m = movies_df['vote_count'].quantile(0.6)\n", + "print('C:',round(C,3), 'm:',round(m,3))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qRhpm_7AqnkS", + "outputId": "95ce35f0-6125-493a-f841-3ea4d8806425" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "C: 6.092 m: 370.2\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "percentile = 0.6\n", + "m = movies_df['vote_count'].quantile(percentile)\n", + "C = movies_df['vote_average'].mean()\n", + "\n", + "def weighted_vote_average(record):\n", + " v = record['vote_count']\n", + " R = record['vote_average']\n", + "\n", + " return ((v/(v+m)) * R) + ((m/(m+v)) * C)\n", + "movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)" + ], + "metadata": { + "id": 
"l_OOoSF3rONS" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values(\n", + "'weighted_vote', ascending=False)[:10]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 539 + }, + "id": "-6HwHdejriJZ", + "outputId": "4b89278a-ff9c-4ebf-8475-a1db8cc8b328" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title vote_average weighted_vote vote_count\n", + "1881 The Shawshank Redemption 8.5 8.396052 8205\n", + "3337 The Godfather 8.4 8.263591 5893\n", + "662 Fight Club 8.3 8.216455 9413\n", + "3232 Pulp Fiction 8.3 8.207102 8428\n", + "65 The Dark Knight 8.2 8.136930 12002\n", + "1818 Schindler's List 8.3 8.126069 4329\n", + "3865 Whiplash 8.3 8.123248 4254\n", + "809 Forrest Gump 8.2 8.105954 7927\n", + "2294 Spirited Away 8.3 8.105867 3840\n", + "2731 The Godfather: Part II 8.3 8.079586 3338" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlevote_averageweighted_votevote_count
1881The Shawshank Redemption8.58.3960528205
3337The Godfather8.48.2635915893
662Fight Club8.38.2164559413
3232Pulp Fiction8.38.2071028428
65The Dark Knight8.28.13693012002
1818Schindler's List8.38.1260694329
3865Whiplash8.38.1232484254
809Forrest Gump8.28.1059547927
2294Spirited Away8.38.1058673840
2731The Godfather: Part II8.38.0795863338
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"'weighted_vote', ascending=False)[:10]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Spirited Away\",\n \"The Godfather\",\n \"Schindler's List\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_average\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.08755950357709151,\n \"min\": 8.2,\n \"max\": 8.5,\n \"num_unique_values\": 4,\n \"samples\": [\n 8.4,\n 8.2,\n 8.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"weighted_vote\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.09696608479450805,\n \"min\": 8.07958629828635,\n \"max\": 8.39605162693645,\n \"num_unique_values\": 10,\n \"samples\": [\n 8.105867158639835,\n 8.263590802034972,\n 8.126068673669016\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"vote_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2866,\n \"min\": 3338,\n \"max\": 12002,\n \"num_unique_values\": 10,\n \"samples\": [\n 3840,\n 5893,\n 4329\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def find_sim_movie(df, sorted_ind, title_name, top_n=10):\n", + " title_movie = df[df['title'] == title_name]\n", + " title_index = title_movie.index.values\n", + " similar_indexes = sorted_ind[title_index, :(top_n*2)]\n", + " similar_indexes = similar_indexes.reshape(-1)\n", + " similar_indexes = similar_indexes[similar_indexes != title_index]\n", + " return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]\n", + "similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather',10)\n", + 
"similar_movies[['title', 'vote_average', 'weighted_vote']]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 539 + }, + "id": "Xp6Muja8rpPs", + "outputId": "2114ecb6-6400-446a-bb37-b1f8744e651e" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title vote_average weighted_vote\n", + "1881 The Shawshank Redemption 8.5 8.396052\n", + "2731 The Godfather: Part II 8.3 8.079586\n", + "1847 GoodFellas 8.2 7.976937\n", + "3866 City of God 8.1 7.759693\n", + "1663 Once Upon a Time in America 8.2 7.657811\n", + "3887 Trainspotting 7.8 7.591009\n", + "883 Catch Me If You Can 7.7 7.557097\n", + "892 Casino 7.8 7.423040\n", + "4041 This Is England 7.4 6.739664\n", + "1149 American Hustle 6.8 6.717525" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlevote_averageweighted_vote
1881The Shawshank Redemption8.58.396052
2731The Godfather: Part II8.38.079586
1847GoodFellas8.27.976937
3866City of God8.17.759693
1663Once Upon a Time in America8.27.657811
3887Trainspotting7.87.591009
883Catch Me If You Can7.77.557097
892Casino7.87.423040
4041This Is England7.46.739664
1149American Hustle6.86.717525
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
# Load the movie metadata and the per-user ratings from CSV files.
# NOTE(review): '/content/...' is an absolute path (looks like the Colab default
# working directory) -- the files must be uploaded there before this cell runs.
movies = pd.read_csv('/content/movies.csv')
ratings = pd.read_csv('/content/ratings.csv')
# Print (rows, columns) of each table as a quick sanity check of the load.
print(movies.shape)
print(ratings.shape)

# Keep only the three columns used below.
ratings = ratings[['userId', 'movieId', 'rating']]
# User x movie matrix of ratings: one row per userId, one column per movieId;
# user/movie pairs without a rating come out as NaN.
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')
ratings_matrix.head(3)
"mzMannI-sexe", + "outputId": "5ef6ab6c-dbac-4704-d9cf-edde49b4c707" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "movieId 1 2 3 4 5 6 7 8 \\\n", + "userId \n", + "1 4.0 NaN 4.0 NaN NaN 4.0 NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "movieId 9 10 ... 193565 193567 193571 193573 193579 193581 \\\n", + "userId ... \n", + "1 NaN NaN ... NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN ... NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN ... NaN NaN NaN NaN NaN NaN \n", + "\n", + "movieId 193583 193585 193587 193609 \n", + "userId \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "\n", + "[3 rows x 9724 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieId12345678910...193565193567193571193573193579193581193583193585193587193609
userId
14.0NaN4.0NaNNaN4.0NaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

3 rows × 9724 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
# Attach the columns of `movies` to every rating row by joining on movieId.
rating_movies = pd.merge(ratings, movies, on='movieId')
# Rebuild the user x item matrix with movie titles (instead of movieId) as columns.
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')
# Unrated cells become 0; later cells treat 0 as "no rating".
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(3)
(1986) À nous la liberté (Freedom for Us) (1931) \n", + "userId \n", + "1 4.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "\n", + "[3 rows x 9719 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
title'71 (2014)'Hellboy': The Seeds of Creation (2004)'Round Midnight (1986)'Salem's Lot (2004)'Til There Was You (1997)'Tis the Season for Love (2015)'burbs, The (1989)'night Mother (1986)(500) Days of Summer (2009)*batteries not included (1987)...Zulu (2013)[REC] (2007)[REC]² (2009)[REC]³ 3 Génesis (2012)anohana: The Flower We Saw That Day - The Movie (2013)eXistenZ (1999)xXx (2002)xXx: State of the Union (2005)¡Three Amigos! (1986)À nous la liberté (Freedom for Us) (1931)
userId
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.04.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

3 rows × 9719 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
# Transpose to title x user so that each row is one movie's rating vector,
# which is the layout passed to cosine_similarity in the next cell.
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head(3)
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userId12345678910...601602603604605606607608609610
title
'71 (2014)0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.04.0
'Hellboy': The Seeds of Creation (2004)0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
'Round Midnight (1986)0.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

3 rows × 610 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of ratings_matrix_T (one row per title),
# giving a square title x title matrix.
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
# Label both axes with the movie titles so similarities can be looked up by name.
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns,columns=ratings_matrix.columns)

print(item_sim_df.shape)
item_sim_df.head(3)
'burbs, The (1989) \\\n", + "title \n", + "'71 (2014) 0.000000 \n", + "'Hellboy': The Seeds of Creation (2004) 0.000000 \n", + "'Round Midnight (1986) 0.176777 \n", + "\n", + "title 'night Mother (1986) \\\n", + "title \n", + "'71 (2014) 0.0 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 \n", + "'Round Midnight (1986) 0.0 \n", + "\n", + "title (500) Days of Summer (2009) \\\n", + "title \n", + "'71 (2014) 0.141653 \n", + "'Hellboy': The Seeds of Creation (2004) 0.000000 \n", + "'Round Midnight (1986) 0.000000 \n", + "\n", + "title *batteries not included (1987) ... \\\n", + "title ... \n", + "'71 (2014) 0.0 ... \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 ... \n", + "'Round Midnight (1986) 0.0 ... \n", + "\n", + "title Zulu (2013) [REC] (2007) \\\n", + "title \n", + "'71 (2014) 0.0 0.342055 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 0.000000 \n", + "'Round Midnight (1986) 0.0 0.000000 \n", + "\n", + "title [REC]² (2009) \\\n", + "title \n", + "'71 (2014) 0.543305 \n", + "'Hellboy': The Seeds of Creation (2004) 0.000000 \n", + "'Round Midnight (1986) 0.000000 \n", + "\n", + "title [REC]³ 3 Génesis (2012) \\\n", + "title \n", + "'71 (2014) 0.707107 \n", + "'Hellboy': The Seeds of Creation (2004) 0.000000 \n", + "'Round Midnight (1986) 0.000000 \n", + "\n", + "title anohana: The Flower We Saw That Day - The Movie (2013) \\\n", + "title \n", + "'71 (2014) 0.0 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 \n", + "'Round Midnight (1986) 0.0 \n", + "\n", + "title eXistenZ (1999) xXx (2002) \\\n", + "title \n", + "'71 (2014) 0.0 0.139431 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 0.000000 \n", + "'Round Midnight (1986) 0.0 0.000000 \n", + "\n", + "title xXx: State of the Union (2005) \\\n", + "title \n", + "'71 (2014) 0.327327 \n", + "'Hellboy': The Seeds of Creation (2004) 0.000000 \n", + "'Round Midnight (1986) 0.000000 \n", + "\n", + "title ¡Three Amigos! 
(1986) \\\n", + "title \n", + "'71 (2014) 0.0 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 \n", + "'Round Midnight (1986) 0.0 \n", + "\n", + "title À nous la liberté (Freedom for Us) (1931) \n", + "title \n", + "'71 (2014) 0.0 \n", + "'Hellboy': The Seeds of Creation (2004) 0.0 \n", + "'Round Midnight (1986) 0.0 \n", + "\n", + "[3 rows x 9719 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
title'71 (2014)'Hellboy': The Seeds of Creation (2004)'Round Midnight (1986)'Salem's Lot (2004)'Til There Was You (1997)'Tis the Season for Love (2015)'burbs, The (1989)'night Mother (1986)(500) Days of Summer (2009)*batteries not included (1987)...Zulu (2013)[REC] (2007)[REC]² (2009)[REC]³ 3 Génesis (2012)anohana: The Flower We Saw That Day - The Movie (2013)eXistenZ (1999)xXx (2002)xXx: State of the Union (2005)¡Three Amigos! (1986)À nous la liberté (Freedom for Us) (1931)
title
'71 (2014)1.00.0000000.0000000.00.00.00.0000000.00.1416530.0...0.00.3420550.5433050.7071070.00.00.1394310.3273270.00.0
'Hellboy': The Seeds of Creation (2004)0.01.0000000.7071070.00.00.00.0000000.00.0000000.0...0.00.0000000.0000000.0000000.00.00.0000000.0000000.00.0
'Round Midnight (1986)0.00.7071071.0000000.00.00.00.1767770.00.0000000.0...0.00.0000000.0000000.0000000.00.00.0000000.0000000.00.0
\n", + "

3 rows × 9719 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
# Six highest similarity scores in the "Godfather, The (1972)" column;
# the first entry is the movie itself (similarity 1.0).
item_sim_df["Godfather, The (1972)"].sort_values(ascending=False)[:6]
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Godfather, The (1972)
title
Godfather, The (1972)1.000000
Godfather: Part II, The (1974)0.821773
Goodfellas (1990)0.664841
One Flew Over the Cuckoo's Nest (1975)0.620536
Star Wars: Episode IV - A New Hope (1977)0.595317
Fargo (1996)0.588614
\n", + "

def predict_rating(ratings_arr, item_sim_arr):
    """Predict ratings as a similarity-weighted average over all items.

    For user ``u`` and item ``j`` the prediction is
    ``sum_i ratings_arr[u, i] * item_sim_arr[i, j] / sum_i |item_sim_arr[i, j]|``.

    Parameters
    ----------
    ratings_arr : numpy.ndarray
        Ratings with one column per item (users x items in the calling cell).
    item_sim_arr : numpy.ndarray
        Square item x item similarity matrix.

    Returns
    -------
    numpy.ndarray
        Array of predicted ratings, broadcast against a (1, n_items) normaliser.
        A column whose similarities are all zero yields NaN (0 / 0), as before.
    """
    # The dot product mixes *column* j of the similarity matrix into prediction j,
    # so the normaliser must also be the column sum (axis=0). Summing over axis=1
    # gives the same numbers only when the similarity matrix is symmetric.
    # keepdims=True preserves the (1, n_items) shape of the original divisor.
    sim_norm = np.abs(item_sim_arr).sum(axis=0, keepdims=True)
    ratings_pred = ratings_arr.dot(item_sim_arr) / sim_norm
    return ratings_pred
0.006923 0.011665 \n", + "\n", + "title [REC]² (2009) [REC]³ 3 Génesis (2012) \\\n", + "userId \n", + "1 0.133962 0.128574 \n", + "2 0.020119 0.015745 \n", + "3 0.011800 0.012225 \n", + "\n", + "title anohana: The Flower We Saw That Day - The Movie (2013) \\\n", + "userId \n", + "1 0.006179 \n", + "2 0.049983 \n", + "3 0.000000 \n", + "\n", + "title eXistenZ (1999) xXx (2002) xXx: State of the Union (2005) \\\n", + "userId \n", + "1 0.212070 0.192921 0.136024 \n", + "2 0.014876 0.021616 0.024528 \n", + "3 0.008194 0.007017 0.009229 \n", + "\n", + "title ¡Three Amigos! (1986) À nous la liberté (Freedom for Us) (1931) \n", + "userId \n", + "1 0.292955 0.720347 \n", + "2 0.017563 0.000000 \n", + "3 0.010420 0.084501 \n", + "\n", + "[3 rows x 9719 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
title'71 (2014)'Hellboy': The Seeds of Creation (2004)'Round Midnight (1986)'Salem's Lot (2004)'Til There Was You (1997)'Tis the Season for Love (2015)'burbs, The (1989)'night Mother (1986)(500) Days of Summer (2009)*batteries not included (1987)...Zulu (2013)[REC] (2007)[REC]² (2009)[REC]³ 3 Génesis (2012)anohana: The Flower We Saw That Day - The Movie (2013)eXistenZ (1999)xXx (2002)xXx: State of the Union (2005)¡Three Amigos! (1986)À nous la liberté (Freedom for Us) (1931)
userId
10.0703450.5778550.3216960.2270550.2069580.1946150.2498830.1025420.1570840.178197...0.1136080.1817380.1339620.1285740.0061790.2120700.1929210.1360240.2929550.720347
20.0182600.0427440.0188610.0000000.0000000.0359950.0134130.0023140.0322130.014863...0.0156400.0208550.0201190.0157450.0499830.0148760.0216160.0245280.0175630.000000
30.0118840.0302790.0644370.0037620.0037490.0027220.0146250.0020850.0056660.006272...0.0069230.0116650.0118000.0122250.0000000.0081940.0070170.0092290.0104200.084501
\n", + "

3 rows × 9719 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
def get_mse(pred, actual):
    """Mean squared error between ``pred`` and ``actual`` over the rated cells only.

    Cells where ``actual`` is 0 are treated as "not rated" and ignored.
    Relies on ``mean_squared_error`` imported from ``sklearn.metrics`` in this cell.
    """
    # Locate the rated cells once and reuse the index for both arrays.
    rated = actual.nonzero()
    pred = pred[rated].flatten()
    actual = actual[rated].flatten()
    return mean_squared_error(pred, actual)


def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings from only the ``n`` most similar items of each item.

    Parameters
    ----------
    ratings_arr : numpy.ndarray
        2-D ratings array, users x items (0 meaning "not rated" in the caller).
    item_sim_arr : numpy.ndarray
        Square item x item similarity matrix.
    n : int, default 20
        Number of nearest items (the item itself included) used per prediction.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``ratings_arr``. A column whose
        selected similarities sum to zero is left at 0 instead of NaN.
    """
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Positions of the n largest similarities to item `col`, largest first.
        # Kept as a plain 1-D index array: wrapping it in a list produced a
        # (1, n) fancy index and a (1, 1) result assigned into a scalar slot.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n - 1:-1]

        # Use the same column for the weights as for the neighbour selection.
        weights = item_sim_arr[top_n_items, col]
        norm = np.sum(np.abs(weights))
        if norm == 0:
            continue  # no usable neighbours; prediction stays 0

        # One matrix-vector product fills the whole column for every user,
        # replacing the per-user Python loop.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / norm

    return pred
# Ratings row of userId 9 (one value per movie title).
user_rating_id = ratings_matrix.loc[9,:]
# Keep only the movies this user actually rated (> 0) and show the ten highest ratings.
user_rating_id[user_rating_id>0].sort_values(ascending = False)[:10]
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
9
title
Adaptation (2002)5.0
Austin Powers in Goldmember (2002)5.0
Back to the Future (1985)5.0
Citizen Kane (1941)5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)5.0
Lord of the Rings: The Two Towers, The (2002)5.0
Producers, The (1968)5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)5.0
Elling (2001)4.0
King of Comedy, The (1983)4.0
\n", + "

def get_unseen_movies(ratings_matrix, userId):
    """List the movies that ``userId`` has not rated.

    Parameters
    ----------
    ratings_matrix : pandas.DataFrame
        User x movie ratings; rows are user ids, columns are movies, and a value
        of 0 (or less) means "not rated".
    userId : hashable
        Row label of the user in ``ratings_matrix``.

    Returns
    -------
    list
        Column labels of ``ratings_matrix`` with no positive rating from the user,
        in column order.
    """
    user_rating = ratings_matrix.loc[userId, :]
    # A set makes each membership test O(1) instead of scanning a list.
    already_seen = set(user_rating[user_rating > 0].index)

    # Take the candidate movies from the matrix that was passed in. The previous
    # version read `rating_movies.columns` (a notebook global holding the merged
    # long-format table), which yields column names rather than movie titles.
    movies_list = ratings_matrix.columns.tolist()
    return [movie for movie in movies_list if movie not in already_seen]


def recomm_movie_by_userid(pred_df, userid, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted scores for ``userid`` among ``unseen_list``.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted ratings, users x movies.
    userid : hashable
        Row label of the user in ``pred_df``.
    unseen_list : list
        Column labels to rank (typically the output of ``get_unseen_movies``).
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Predicted scores indexed by movie, sorted from highest to lowest.
    """
    user_scores = pred_df.loc[userid, unseen_list]
    recomm_movies = user_scores.sort_values(ascending=False)[:top_n]
    return recomm_movies
(2003) 0.800990\n", + "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001) 0.765159\n", + "Gladiator (2000) 0.740956\n", + "Matrix, The (1999) 0.732693\n", + "Pirates of the Caribbean: The Curse of the Black Pearl (2003) 0.689591\n", + "Lord of the Rings: The Return of the King, The (2003) 0.676711" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pred_score
title
Shrek (2001)0.866202
Spider-Man (2002)0.857854
Last Samurai, The (2003)0.817473
Indiana Jones and the Temple of Doom (1984)0.816626
Matrix Reloaded, The (2003)0.800990
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)0.765159
Gladiator (2000)0.740956
Matrix, The (1999)0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003)0.689591
Lord of the Rings: The Return of the King, The (2003)0.676711
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "recomm_movies", + "summary": "{\n \"name\": \"recomm_movies\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Pirates of the Caribbean: The Curse of the Black Pearl (2003)\",\n \"Spider-Man (2002)\",\n \"Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.06614432811511851,\n \"min\": 0.6767108283499336,\n \"max\": 0.8662018746933645,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.6895905595608812,\n 0.8578535950426878,\n 0.7651586070058114\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 49 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Rg4nszX5x_H1" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "# 1. 
RMSE를 계산하는 함수 정의\n", + "# 실제 평점이 있는 요소에 대해서만 오차를 계산합니다.\n", + "def get_rmse(R, P, Q, non_zeros):\n", + " error = 0\n", + " # 예측된 평점 행렬 계산 (P와 Q의 전치행렬 곱)\n", + " full_pred_matrix = np.dot(P, Q.T)\n", + "\n", + " # 실제 평점 행렬에서 0이 아닌(평점이 매겨진) 위치의 인덱스 추출\n", + " x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]\n", + " y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]\n", + " R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]\n", + "\n", + " # 예측 행렬에서도 동일한 위치의 값만 추출\n", + " full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]\n", + "\n", + " # RMSE 계산\n", + " mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)\n", + " rmse = np.sqrt(mse)\n", + "\n", + " return rmse\n", + "\n", + "# 2. 행렬 분해 메인 함수 정의 (SGD 방식)\n", + "def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):\n", + " num_users, num_items = R.shape\n", + "\n", + " # P와 Q 매트릭스를 정규 분포를 가진 랜덤한 값으로 초기화\n", + " np.random.seed(1)\n", + " P = np.random.normal(scale=1./K, size=(num_users, K))\n", + " Q = np.random.normal(scale=1./K, size=(num_items, K))\n", + "\n", + " # 실제 평점이 있는(0보다 큰) 위치와 값을 리스트에 저장\n", + " non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]\n", + "\n", + " # SGD 기법을 이용해 P와 Q를 반복적으로 업데이트\n", + " for step in range(steps):\n", + " for i, j, r in non_zeros:\n", + " # 실제 평점과 예측 평점의 차이(오차) 계산\n", + " eij = r - np.dot(P[i, :], Q[j, :].T)\n", + "\n", + " # 규제(Regularization)를 포함한 업데이트 공식 적용\n", + " P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])\n", + " Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])\n", + "\n", + " # 10회 반복마다 RMSE 출력\n", + " rmse = get_rmse(R, P, Q, non_zeros)\n", + " if (step % 10) == 0 :\n", + " print(\"### iteration step : \", step ,\" rmse : \", rmse)\n", + "\n", + " return P, Q\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "id": "Vv9t4RXuK4XX" + }, + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "code", + 
"source": [ + "ratings = ratings[['userId','movieId','rating']]\n", + "ratings_matrix = ratings.pivot_table('rating',index='userId',columns='movieId')\n", + "\n", + "ratings_movies = pd.merge(ratings, movies, on='movieId')\n", + "ratings_matrix = ratings_movies.pivot_table('rating', index='userId', columns='title')\n", + "P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda = 0.01)\n", + "pred_matrix=np.dot(P,Q.T)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d7MRsW9DyRWy", + "outputId": "5f569b0f-f148-4cb8-fe0e-9a731ecfd8d9" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "### iteration step : 0 rmse : 2.9023619751336867\n", + "### iteration step : 10 rmse : 0.7335768591017927\n", + "### iteration step : 20 rmse : 0.5115539026853442\n", + "### iteration step : 30 rmse : 0.37261628282537446\n", + "### iteration step : 40 rmse : 0.2960818299181014\n", + "### iteration step : 50 rmse : 0.2520353192341642\n", + "### iteration step : 60 rmse : 0.22487503275269854\n", + "### iteration step : 70 rmse : 0.2068545530233154\n", + "### iteration step : 80 rmse : 0.19413418783028685\n", + "### iteration step : 90 rmse : 0.18470082002720406\n", + "### iteration step : 100 rmse : 0.17742927527209104\n", + "### iteration step : 110 rmse : 0.1716522696470749\n", + "### iteration step : 120 rmse : 0.16695181946871726\n", + "### iteration step : 130 rmse : 0.16305292191997542\n", + "### iteration step : 140 rmse : 0.15976691929679646\n", + "### iteration step : 150 rmse : 0.1569598699945732\n", + "### iteration step : 160 rmse : 0.15453398186715425\n", + "### iteration step : 170 rmse : 0.15241618551077643\n", + "### iteration step : 180 rmse : 0.1505508073962831\n", + "### iteration step : 190 rmse : 0.1488947091323209\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "ratings_pred_matrix = pd.DataFrame(data=pred_matrix, 
index= ratings_matrix.index,\n", + " columns = ratings_matrix.columns)\n", + "ratings_pred_matrix.head(3)\n", + "\n", + "# Correctly generate unseen_list within this cell\n", + "user_id_to_recommend = 9\n", + "user_rating_data = ratings_matrix.loc[user_id_to_recommend,:]\n", + "already_seen_movies_by_user = user_rating_data[user_rating_data > 0].index.tolist()\n", + "all_available_movies = ratings_matrix.columns.tolist() # Get all movie titles from ratings_matrix\n", + "unseen_list = [movie for movie in all_available_movies if movie not in already_seen_movies_by_user]\n", + "\n", + "recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, user_id_to_recommend, unseen_list, top_n=10)\n", + "recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])\n", + "recomm_movies" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 446 + }, + "id": "rM-3tXAVz_5t", + "outputId": "d02db0fe-7651-4c96-e50b-7a99ec0d7abc" + }, + "execution_count": 54, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " pred_score\n", + "title \n", + "Rear Window (1954) 5.704612\n", + "South Park: Bigger, Longer and Uncut (1999) 5.451100\n", + "Rounders (1998) 5.298393\n", + "Blade Runner (1982) 5.244951\n", + "Roger & Me (1989) 5.191962\n", + "Gattaca (1997) 5.183179\n", + "Ben-Hur (1959) 5.130463\n", + "Rosencrantz and Guildenstern Are Dead (1990) 5.087375\n", + "Big Lebowski, The (1998) 5.038690\n", + "Star Wars: Episode V - The Empire Strikes Back (1980) 4.989601" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pred_score
title
Rear Window (1954)5.704612
South Park: Bigger, Longer and Uncut (1999)5.451100
Rounders (1998)5.298393
Blade Runner (1982)5.244951
Roger & Me (1989)5.191962
Gattaca (1997)5.183179
Ben-Hur (1959)5.130463
Rosencrantz and Guildenstern Are Dead (1990)5.087375
Big Lebowski, The (1998)5.038690
Star Wars: Episode V - The Empire Strikes Back (1980)4.989601
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "recomm_movies", + "summary": "{\n \"name\": \"recomm_movies\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Big Lebowski, The (1998)\",\n \"South Park: Bigger, Longer and Uncut (1999)\",\n \"Gattaca (1997)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pred_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.21272885538651393,\n \"min\": 4.989601238872484,\n \"max\": 5.704612469838172,\n \"num_unique_values\": 10,\n \"samples\": [\n 5.0386897288205725,\n 5.451100205772531,\n 5.183178550884765\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 54 + } + ] + } + ] +} \ No newline at end of file diff --git "a/Week16_\354\230\210\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" "b/Week16_\354\230\210\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" new file mode 100644 index 0000000..e9689d9 --- /dev/null +++ "b/Week16_\354\230\210\354\212\265\352\263\274\354\240\234_\354\227\204\354\247\200\353\257\274.ipynb" @@ -0,0 +1,961 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "3NtjyiLRC2s_" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# 1. 
원본 평점 행렬 R 정의 (세 번째 행 끝에 쉼표 추가)\n", + "R = np.array([[4, np.nan, np.nan, 2, np.nan],\n", + " [np.nan, 5, np.nan, 3, 1],\n", + " [np.nan, np.nan, 3, 4, 4], # 여기에 쉼표가 있어야 합니다.\n", + " [5, 2, 1, 2, np.nan]])\n", + "\n", + "num_users, num_items = R.shape\n", + "K = 3 # 잠재 요인 수\n", + "\n", + "np.random.seed(1)\n", + "\n", + "# 2. P 행렬: (사용자 수, K)\n", + "P = np.random.normal(scale=1./K, size=(num_users, K))\n", + "\n", + "# 3. Q 행렬: (아이템 수, K)로 수정 (num_users -> num_items)\n", + "# 나중에 예측 평점을 구할 때 P와 Q의 전치행렬(Q.T)을 곱하게 됩니다.\n", + "Q = np.random.normal(scale=1./K, size=(num_items, K))\n" + ] + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "# 1. RMSE를 계산하는 함수 정의\n", + "# 실제 평점이 있는 요소에 대해서만 오차를 계산합니다.\n", + "def get_rmse(R, P, Q, non_zeros):\n", + " error = 0\n", + " # 예측된 평점 행렬 계산 (P와 Q의 전치행렬 곱)\n", + " full_pred_matrix = np.dot(P, Q.T)\n", + "\n", + " # 실제 평점 행렬에서 0이 아닌(평점이 매겨진) 위치의 인덱스 추출\n", + " x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]\n", + " y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]\n", + " R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]\n", + "\n", + " # 예측 행렬에서도 동일한 위치의 값만 추출\n", + " full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_ind, y_non_zero_ind]\n", + "\n", + " # RMSE 계산\n", + " mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)\n", + " rmse = np.sqrt(mse)\n", + "\n", + " return rmse\n", + "\n", + "# 2. 
행렬 분해 메인 함수 정의 (SGD 방식)\n", + "def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):\n", + " num_users, num_items = R.shape\n", + "\n", + " # P와 Q 매트릭스를 정규 분포를 가진 랜덤한 값으로 초기화\n", + " np.random.seed(1)\n", + " P = np.random.normal(scale=1./K, size=(num_users, K))\n", + " Q = np.random.normal(scale=1./K, size=(num_items, K))\n", + "\n", + " # 실제 평점이 있는(0보다 큰) 위치와 값을 리스트에 저장\n", + " non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]\n", + "\n", + " # SGD 기법을 이용해 P와 Q를 반복적으로 업데이트\n", + " for step in range(steps):\n", + " for i, j, r in non_zeros:\n", + " # 실제 평점과 예측 평점의 차이(오차) 계산\n", + " eij = r - np.dot(P[i, :], Q[j, :].T)\n", + "\n", + " # 규제(Regularization)를 포함한 업데이트 공식 적용\n", + " P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])\n", + " Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])\n", + "\n", + " # 10회 반복마다 RMSE 출력\n", + " rmse = get_rmse(R, P, Q, non_zeros)\n", + " if (step % 10) == 0 :\n", + " print(\"### iteration step : \", step ,\" rmse : \", rmse)\n", + "\n", + " return P, Q\n", + "\n", + "# 3. 
실험용 데이터 설정 및 실행\n", + "# 원본 평점 행렬 R (결측치는 0으로 표시) [cite: 59, 198]\n", + "R = np.array([[4, 0, 0, 2, 0],\n", + " [0, 5, 0, 3, 1],\n", + " [0, 0, 3, 4, 4],\n", + " [5, 2, 1, 2, 0]])\n", + "\n", + "# 행렬 분해 수행 (잠재 요인 K=3)\n", + "P, Q = matrix_factorization(R, K=3, steps=200, learning_rate=0.01, r_lambda = 0.01)\n", + "\n", + "# 최종 예측 행렬 결과 확인\n", + "pred_matrix = np.dot(P, Q.T)\n", + "print(\"\\n##### 예측 행렬 결과 #####\")\n", + "print(np.round(pred_matrix, 2))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vv9t4RXuK4XX", + "outputId": "7098cad6-8f08-4951-dca2-35c971cb66f5" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "### iteration step : 0 rmse : 3.2388050277987723\n", + "### iteration step : 10 rmse : 2.916635469036195\n", + "### iteration step : 20 rmse : 2.198706863150832\n", + "### iteration step : 30 rmse : 1.3045525302788117\n", + "### iteration step : 40 rmse : 0.7565127655009479\n", + "### iteration step : 50 rmse : 0.4876723101369648\n", + "### iteration step : 60 rmse : 0.3562311584670397\n", + "### iteration step : 70 rmse : 0.27952329795334957\n", + "### iteration step : 80 rmse : 0.22656292990066265\n", + "### iteration step : 90 rmse : 0.18700753399789358\n", + "### iteration step : 100 rmse : 0.1564340384819247\n", + "### iteration step : 110 rmse : 0.13234649522249217\n", + "### iteration step : 120 rmse : 0.11311333721232043\n", + "### iteration step : 130 rmse : 0.09759058709002262\n", + "### iteration step : 140 rmse : 0.0849441568587214\n", + "### iteration step : 150 rmse : 0.07455141311978046\n", + "### iteration step : 160 rmse : 0.06594094420477092\n", + "### iteration step : 170 rmse : 0.05875268710429461\n", + "### iteration step : 180 rmse : 0.05270988708098207\n", + "### iteration step : 190 rmse : 0.047598581976266506\n", + "\n", + "##### 예측 행렬 결과 #####\n", + "[[ 3.96 0.59 1.39 2.07 1.77]\n", + " [ 6.87 4.98 0.99 2.94 1.01]\n", + " [ 6.68 
-0.17 3. 3.94 3.99]\n", + " [ 4.95 2. 0.98 2.08 1.07]]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "pip install scikit-surprise" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lE7W2QxHL7Mw", + "outputId": "67005ebf-2452-483b-ee99-082c2491b617" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting scikit-surprise\n", + " Downloading scikit_surprise-1.1.4.tar.gz (154 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/154.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.4/154.4 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-surprise) (1.5.3)\n", + "Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.12/dist-packages (from scikit-surprise) (2.0.2)\n", + "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-surprise) (1.16.3)\n", + "Building wheels for collected packages: scikit-surprise\n", + " Building wheel for scikit-surprise (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2555930 sha256=85df86b3f4d812156c8d65262fa8e78b8b383934888a93c9c6979cec4fcc74b8\n", + " Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1fbaab6061854e6cfbb81a0ae52c92a502a7fa454b\n", + "Successfully built scikit-surprise\n", + "Installing collected packages: scikit-surprise\n", + "Successfully installed scikit-surprise-1.1.4\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from surprise import SVD\n", + "from surprise import Dataset\n", + "from surprise import accuracy\n", + "from surprise.model_selection import train_test_split" + ], + "metadata": { + "id": "cAFO7EL5MUnk" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 484 + }, + "id": "623d62df", + "outputId": "42083862-d7a1-496b-924e-64330ca431db" + }, + "source": [ + "!pip install 'numpy<2'" + ], + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting numpy<2\n", + " Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/61.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.0/61.0 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.0/18.0 MB\u001b[0m \u001b[31m57.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: numpy\n", + " Attempting uninstall: numpy\n", + " Found existing 
installation: numpy 2.0.2\n", + " Uninstalling numpy-2.0.2:\n", + " Successfully uninstalled numpy-2.0.2\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n", + "shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.\n", + "jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n", + "pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n", + "opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\n", + "jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.\n", + "opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= \"3.9\", but you have numpy 1.26.4 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed numpy-1.26.4\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "numpy" + ] + }, + "id": "ee4a414eaf0c42c8be5949f964bcf0e2" + } + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "data = Dataset.load_builtin('ml-100k')\n", + "# 수행 시마다 동일하게 데이터를 분할하기 위해 random.state 값 부여\n", + "trainset, testset = train_test_split(data, test_size=.25, random_state=0)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "IdtAD1THM8_T", + "outputId": "171500e2-107f-4c44-d58b-d1a70e3b0ab2" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Dataset ml-100k could not be found. Do you want to download it? 
[Y/n] Y\n", + "Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...\n", + "Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "algo = SVD(random_state=0)\n", + "algo.fit(trainset)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LeHGfxhgNKWR", + "outputId": "0a0d5db5-69bd-4062-faf7-97051d5745b5" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "predictions = algo.test( testset )\n", + "print('prediction type:', type(predictions), 'size:',len(predictions))\n", + "print('prediction 결과의 최초 5개 추출')\n", + "predictions[:5]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "smXz373RNVOZ", + "outputId": "bb21c1c6-2b26-4fb2-b4ad-2aef3d91757e" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "prediction type: size: 25000\n", + "prediction 결과의 최초 5개 추출\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[Prediction(uid='120', iid='282', r_ui=4.0, est=3.5114147666251547, details={'was_impossible': False}),\n", + " Prediction(uid='882', iid='291', r_ui=4.0, est=3.573872419581491, details={'was_impossible': False}),\n", + " Prediction(uid='535', iid='507', r_ui=5.0, est=4.033583485472447, details={'was_impossible': False}),\n", + " Prediction(uid='697', iid='244', r_ui=5.0, est=3.8463639495936905, details={'was_impossible': False}),\n", + " Prediction(uid='751', iid='385', r_ui=4.0, est=3.1807542478219157, details={'was_impossible': False})]" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "[ (pred.uid, pred.iid, pred.est) for pred in predictions[:3] ]" + ], + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6htAgalXNpc9", + "outputId": "b01b5ec8-9470-4b84-d270-c8681fd828bc" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('120', '282', 3.5114147666251547),\n", + " ('882', '291', 3.573872419581491),\n", + " ('535', '507', 4.033583485472447)]" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "uid = str(196)\n", + "iid = str(302)\n", + "pred = algo.predict(uid,iid)\n", + "print(pred)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_1s6bm2FN5-E", + "outputId": "a2063a9b-ceeb-4513-bae4-3b38683d9eaa" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "user: 196 item: 302 r_ui = None est = 4.49 {'was_impossible': False}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "accuracy.rmse(predictions)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2uAIAVJJOCCd", + "outputId": "0989b014-762e-432c-fe0e-1fe6cabc2ddd" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "RMSE: 0.9467\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9466860806937948" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from surprise import Dataset\n", + "\n", + "# Surprise가 다운로드한 ml-100k 데이터셋의 실제 경로와 파일 이름 (u.data)을 사용합니다.\n", + "# ml-100k 데이터셋의 u.data 파일은 사용자ID, 아이템ID, 평점, 타임스탬프 순서로 구성되어 있으며 탭으로 구분됩니다.\n", + "# Surprise의 Dataset.load_builtin('ml-100k')는 이미 이 데이터를 로드하여 내부적으로 사용하고 있습니다.\n", + "# 만약 pandas를 사용하여 이 데이터를 로드하려면 다음처럼 할 수 있습니다.\n", + "\n", + "# ml-100k 데이터셋의 ratings 파일은 'u.data'입니다.\n", + "# 이 파일은 사용자 ID, 영화 ID, 평점, 타임스탬프로 구성되어 있으며, 탭으로 구분됩니다.\n", + 
"ratings_file_path = '/root/.surprise_data/ml-100k/ml-100k/u.data' # './' 제거하여 절대 경로로 수정\n", + "\n", + "# pandas로 u.data 파일을 읽을 때, 컬럼 이름과 구분자(sep='\\t')를 명시해야 합니다.\n", + "ratings_df = pd.read_csv(ratings_file_path, sep='\\t', header=None,\n", + " names=['user_id', 'item_id', 'rating', 'timestamp'])\n", + "\n", + "# 데이터프레임의 처음 5행을 출력하여 확인합니다.\n", + "print(ratings_df.head())\n", + "\n", + "# 필요한 경우, Surprise에서 요구하는 형식 (user, item, rating)으로 저장할 수 있습니다.\n", + "# 예를 들어, 'ratings_noh.csv' 파일로 저장하려면:\n", + "# ratings_df[['user_id', 'item_id', 'rating']].to_csv(\n", + "# './root/.surprise_data/ml-100k/ml-100k/ratings_noh.csv', index=False, header=False\n", + "# )" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VYWA7JdCOYjk", + "outputId": "7d75dd9e-60b0-404b-9029-c8b328ac31b5" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " user_id item_id rating timestamp\n", + "0 196 242 3 881250949\n", + "1 186 302 3 891717742\n", + "2 22 377 1 878887116\n", + "3 244 51 2 880606923\n", + "4 166 346 1 886397596\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "01f2f836", + "outputId": "7f12bac9-7034-4241-8fa1-f297c116348b" + }, + "source": [ + "!ls -R /root/.surprise_data/ml-100k" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "/root/.surprise_data/ml-100k:\n", + "ml-100k\n", + "\n", + "/root/.surprise_data/ml-100k/ml-100k:\n", + "allbut.pl u1.base u2.test u4.base u5.test ub.base\tu.genre u.occupation\n", + "mku.sh\t u1.test u3.base u4.test ua.base ub.test\tu.info\t u.user\n", + "README\t u2.base u3.test u5.base ua.test u.data\tu.item\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from surprise import Reader\n", + "reader = Reader(line_format='user item rating timestamp', sep='\\t',rating_scale=(0.5, 5))\n", + 
"data=Dataset.load_from_file('/root/.surprise_data/ml-100k/ml-100k/u.data', reader=reader)" + ], + "metadata": { + "id": "yrlL-42FP1wV" + }, + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 1. 데이터셋을 trainset과 testset으로 분할\n", + "# train_test_split은 Trainset 객체와 raw testset (list of tuples)을 반환합니다.\n", + "trainset, testset = train_test_split(data, test_size=0.25, random_state=0)\n", + "\n", + "# 2. SVD 모델 학습\n", + "algo = SVD(n_factors=50, random_state=0)\n", + "algo.fit(trainset) # trainset은 이미 surprise.Trainset 객체입니다.\n", + "\n", + "# 3. 예측은 testset을 사용하여 수행합니다.\n", + "predictions = algo.test(testset) # testset은 raw ratings의 list입니다.\n", + "accuracy.rmse(predictions)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GEvyP645Qrvx", + "outputId": "ccf7e074-b8be-4afc-d515-74aff27f1da3" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "RMSE: 0.9458\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9457855197571977" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from surprise import Reader, Dataset\n", + "ratings = pd.read_csv('/root/.surprise_data/ml-100k/ml-100k/u.data', sep='\\t', header=None, names=['userid', 'movield', 'rating', 'timestamp'])\n", + "reader = Reader(rating_scale=(0.5, 5.0))\n", + "# ratings DataFrame에서 칼럼은 사용자 아이디, 아이템 아이디, 평점 순서를 지켜야 합니다.\n", + "data = Dataset.load_from_df(ratings[['userid', 'movield', 'rating']], reader)\n", + "trainset, testset = train_test_split(data, test_size=.25, random_state=0)\n", + "algo = SVD(n_factors=50, random_state=0)\n", + "algo.fit(trainset)\n", + "predictions = algo.test( testset )\n", + "accuracy.rmse(predictions)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vHaoymFcR2r9", + "outputId": 
"2f7eadec-90b3-403a-a123-af39ad25fdd0" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "RMSE: 0.9458\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.9457855197571977" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from surprise.model_selection import cross_validate\n", + "# 판다스 DataFrame에서 Surprise 데이터 세트로 데이터 로딩\n", + "reader = Reader(rating_scale=(0.5, 5.0))\n", + "data = Dataset.load_from_df(ratings[['userid', 'movield', 'rating']], reader)\n", + "algo = SVD(random_state=0)\n", + "cross_validate(algo, data, measures=[ 'RMSE', 'MAE'], cv=5, verbose=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V9HiDOsQS82q", + "outputId": "cb099636-06d2-49d8-c2ac-90056a011a65" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Evaluating RMSE, MAE of algorithm SVD on 5 split(s).\n", + "\n", + " Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n", + "RMSE (testset) 0.9409 0.9388 0.9277 0.9483 0.9331 0.9378 0.0070 \n", + "MAE (testset) 0.7420 0.7416 0.7293 0.7451 0.7367 0.7389 0.0055 \n", + "Fit time 1.32 1.31 1.37 1.94 1.32 1.45 0.25 \n", + "Test time 0.12 0.25 0.18 0.10 0.27 0.18 0.07 \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'test_rmse': array([0.94093683, 0.93876325, 0.92773331, 0.94829749, 0.93311353]),\n", + " 'test_mae': array([0.74197351, 0.74163748, 0.72931499, 0.74508501, 0.73666611]),\n", + " 'fit_time': (1.3230950832366943,\n", + " 1.310234546661377,\n", + " 1.3650386333465576,\n", + " 1.9425837993621826,\n", + " 1.321216106414795),\n", + " 'test_time': (0.11644506454467773,\n", + " 0.2541470527648926,\n", + " 0.17620587348937988,\n", + " 0.09936308860778809,\n", + " 0.2698531150817871)}" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + 
"cell_type": "code", + "source": [ + "from surprise.model_selection import GridSearchCV\n", + "\n", + "param_grid = {'n_epochs': [20,40,60],'n_factors':[50,100,200]}\n", + "\n", + "gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mse', 'mae'], cv=3)\n", + "gs.fit(data)\n", + "print(gs.best_score['rmse'])\n", + "print(gs.best_params['rmse'])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7MKXNfz8Tb5e", + "outputId": "e1fb642b-4edc-42f3-802b-0050d10027d7" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.943753389457367\n", + "{'n_epochs': 20, 'n_factors': 50}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "data = Dataset.load_from_df(ratings[['userid', 'movield', 'rating']], reader)\n", + "# DatasetAutoFolds 객체에서 전체 학습셋을 만듭니다.\n", + "trainset = data.build_full_trainset()\n", + "algo = SVD(n_factors=50, random_state=0)\n", + "algo.fit(trainset)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LsLvTJr3T3ZQ", + "outputId": "76192413-7cd0-4ea9-8f78-b624ef2d69a4" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 29 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from surprise.dataset import DatasetAutoFolds\n", + "reader = Reader(line_format='user item rating timestamp', sep='\\t', rating_scale=(0.5, 5))\n", + "\n", + "data_folds = DatasetAutoFolds(ratings_file ='/root/.surprise_data/ml-100k/ml-100k/u.data', reader = reader )\n", + "\n", + "trainset = data_folds.build_full_trainset()\n", + "algo = SVD(n_epochs=20, n_factors=50, random_state=0)\n", + "algo.fit(trainset)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ErCgbrzTT-NU", + "outputId": "dfb26184-a398-4e02-ad48-ff3b49573e8f" + }, + "execution_count": 32, + "outputs": [ + { + 
"output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 32 + } + ] + }, + { + "cell_type": "code", + "source": [ + "movies = pd.read_csv('/root/.surprise_data/ml-100k/ml-100k/u.item', sep='|', header=None, encoding='latin1')\n", + "# 'u.item' 파일은 컬럼이 '|' (파이프)로 구분되어 있고 헤더가 없으므로 이를 명시합니다.\n", + "# 또한, 영화 상세 정보 파일에는 'movieid'라는 컬럼 이름이 없으므로, 영화 ID가 첫 번째 컬럼에 있다고 가정하고 적절한 컬럼 이름을 지정하거나 인덱스를 사용하여 접근해야 합니다.\n", + "# 여기서는 임시로 첫 번째 컬럼을 'movieid'로 가정하고 진행합니다.\n", + "movies.columns = ['movieid', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', \"Children's\", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']\n", + "\n", + "# 'ratings' DataFrame에는 'movield' 컬럼이 있고, 'u.item'에서는 'movieid' 컬럼이 사용됩니다.\n", + "# 'movield'를 사용하여 해당 영화 ID를 찾아야 합니다.\n", + "movieIds = ratings[ratings['userid']==9]['movield']\n", + "\n", + "if movieIds[movieIds == 42].count()==0:\n", + " print('사용자 아이디 9는 영화 아이디 42의 평점 없음')\n", + "\n", + "print(movies[movies['movieid']==42])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tP-TG3MFVczP", + "outputId": "ed352f5b-623b-4039-8a99-acfe87d933f3" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "사용자 아이디 9는 영화 아이디 42의 평점 없음\n", + " movieid title release_date video_release_date \\\n", + "41 42 Clerks (1994) 01-Jan-1994 NaN \n", + "\n", + " imdb_url unknown Action \\\n", + "41 http://us.imdb.com/M/title-exact?Clerks%20(1994) 0 0 \n", + "\n", + " Adventure Animation Children's ... Fantasy Film-Noir Horror \\\n", + "41 0 0 0 ... 
def get_unseen_surprise(ratings, movies, userid):
    """Return the movie IDs that ``userid`` has not rated yet.

    Args:
        ratings: DataFrame with a 'userid' column and a 'movield' column
            (the rated movie's ID; the column is spelled this way in this
            notebook).
        movies: DataFrame with a 'movieid' column listing every candidate
            movie.
        userid: User ID compared against ``ratings['userid']``.

    Returns:
        list: the values of ``movies['movieid']``, in their original order,
        that do not appear among the movie IDs rated by ``userid``.

    Side effects:
        Prints the number of rated, recommendable and total movies.
    """
    seen_movies = ratings[ratings['userid'] == userid]['movield'].tolist()
    total_movies = movies['movieid'].tolist()

    # Membership tests against a set keep the filter linear in the number of
    # movies; testing against the list rescans it for every candidate.
    seen_lookup = set(seen_movies)
    unseen_movies = [movie for movie in total_movies if movie not in seen_lookup]

    print('평점 매긴 영화 수:', len(seen_movies),'추천 대상 영화 수:', len(unseen_movies),
          '전체 영화 수:', len(total_movies))
    return unseen_movies
def recomm_movie_by_surprise(algo, userId, unseen_movies, movies, top_n=10):
    """Recommend the ``top_n`` unseen movies with the highest predicted rating.

    Args:
        algo: Fitted recommender exposing ``predict(uid, iid)``; the returned
            object must provide ``iid`` (item ID) and ``est`` (estimated
            rating) attributes.
        userId: User to recommend for; passed to ``predict`` as ``str``.
        unseen_movies: Iterable of candidate movie IDs; each is passed to
            ``predict`` as ``str``.
        movies: DataFrame with 'movieid' and 'title' columns.
        top_n: Maximum number of recommendations to return.

    Returns:
        list[tuple]: ``(movie_id, title, predicted_rating)`` tuples sorted by
        predicted rating, highest first. ``title`` is ``None`` when the ID is
        not present in ``movies``.
    """
    # Predict a rating for every candidate movie.
    predictions = [algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]

    # Highest estimates first; the sort is stable, so ties keep candidate order.
    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:top_n]

    # Look titles up by movie ID. Filtering `movies` with isin() returns rows in
    # DataFrame order rather than ranking order, which pairs titles with the
    # wrong IDs and ratings when zipped against the ranked predictions.
    title_by_id = dict(zip(movies['movieid'], movies['title']))

    top_movie_preds = []
    for pred in top_predictions:
        movie_id = int(pred.iid)
        top_movie_preds.append((movie_id, title_by_id.get(movie_id), pred.est))

    return top_movie_preds
"Wrong Trousers, The (1993) ; 4.882499471175927\n", + "Empire Strikes Back, The (1980) ; 4.87836594561176\n", + "Raiders of the Lost Ark (1981) ; 4.873833196467519\n", + "L.A. Confidential (1997) ; 4.8584916788987496\n", + "One Flew Over the Cuckoo's Nest (1975) ; 4.820999931348582\n", + "Close Shave, A (1995) ; 4.795043987320939\n", + "North by Northwest (1959) ; 4.789671087078756\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git "a/f17ef4a4-e5ca-4bf8-bab9-5302b1a84f41_9\354\236\245_\354\240\225\353\246\254.pdf" "b/f17ef4a4-e5ca-4bf8-bab9-5302b1a84f41_9\354\236\245_\354\240\225\353\246\254.pdf" new file mode 100644 index 0000000..8bbf42a Binary files /dev/null and "b/f17ef4a4-e5ca-4bf8-bab9-5302b1a84f41_9\354\236\245_\354\240\225\353\246\254.pdf" differ