diff --git "a/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\214\341\205\265\341\206\253\341\204\213\341\205\260\341\204\213\341\205\265\341\204\213\341\205\243\341\206\253.ipynb" "b/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\214\341\205\265\341\206\253\341\204\213\341\205\260\341\204\213\341\205\265\341\204\213\341\205\243\341\206\253.ipynb" new file mode 100644 index 0000000..d163e0c --- /dev/null +++ "b/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\214\341\205\265\341\206\253\341\204\213\341\205\260\341\204\213\341\205\265\341\204\213\341\205\243\341\206\253.ipynb" @@ -0,0 +1,752 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "28bb1023", + "metadata": {}, + "source": [ + "# 데이터 전처리" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4ac901a9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1482535, 8)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train_idnameitem_condition_idcategory_namebrand_namepriceshippingitem_description
00MLB Cincinnati Reds T Shirt Size XL3Men/Tops/T-shirtsNaN10.01No description yet
11Razer BlackWidow Chroma Keyboard3Electronics/Computers & Tablets/Components & P...Razer52.00This keyboard is in great condition and works ...
22AVA-VIV Blouse1Women/Tops & Blouses/BlouseTarget10.01Adorable top with a hint of lace and a key hol...
\n", + "
" + ], + "text/plain": [ + " train_id name item_condition_id \\\n", + "0 0 MLB Cincinnati Reds T Shirt Size XL 3 \n", + "1 1 Razer BlackWidow Chroma Keyboard 3 \n", + "2 2 AVA-VIV Blouse 1 \n", + "\n", + " category_name brand_name price \\\n", + "0 Men/Tops/T-shirts NaN 10.0 \n", + "1 Electronics/Computers & Tablets/Components & P... Razer 52.0 \n", + "2 Women/Tops & Blouses/Blouse Target 10.0 \n", + "\n", + " shipping item_description \n", + "0 1 No description yet \n", + "1 0 This keyboard is in great condition and works ... \n", + "2 1 Adorable top with a hint of lace and a key hol... " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import Ridge , LogisticRegression\n", + "from sklearn.model_selection import train_test_split , cross_val_score\n", + "from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer\n", + "import pandas as pd\n", + "\n", + "mercari_df = pd.read_csv('mercari_train.tsv',sep='\\t')\n", + "print(mercari_df.shape)\n", + "mercari_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "26d90c10", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1482535 entries, 0 to 1482534\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 train_id 1482535 non-null int64 \n", + " 1 name 1482535 non-null object \n", + " 2 item_condition_id 1482535 non-null int64 \n", + " 3 category_name 1476208 non-null object \n", + " 4 brand_name 849853 non-null object \n", + " 5 price 1482535 non-null float64\n", + " 6 shipping 1482535 non-null int64 \n", + " 7 item_description 1482529 non-null object \n", + "dtypes: float64(1), int64(3), object(4)\n", + "memory usage: 90.5+ MB\n", + "None\n" + ] + } + ], + "source": [ + "print(mercari_df.info())" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "id": "83871101", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Price Column의 데이터값 분포\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "y_train_df = mercari_df['price']\n", + "plt.figure(figsize=(6,4))\n", + "sns.histplot(y_train_df,bins=100)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b4b34af4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Log Transformation\n", + "import numpy as np\n", + "y_train_df = np.log1p(y_train_df)\n", + "sns.histplot(y_train_df,bins=50)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "152603c5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2.397895\n", + "1 3.970292\n", + "2 2.397895\n", + "Name: price, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 로그 변환한 값을 데이터프레임에 넣기\n", + "mercari_df['price'] = np.log1p(mercari_df['price'])\n", + "mercari_df['price'].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "160f4f7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shipping 값 유형:\n", + " shipping\n", + "0 819435\n", + "1 663100\n", + "Name: count, dtype: int64\n", + "item_condition_id 값 유형:\n", + " item_condition_id\n", + "1 640549\n", + "3 432161\n", + "2 375479\n", + "4 31962\n", + "5 2384\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Shipping & item_condition 값 유형 체크\n", + "print('Shipping 값 유형:\\n',mercari_df['shipping'].value_counts())\n", + "print('item_condition_id 값 유형:\\n',mercari_df['item_condition_id'].value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a3eff2e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "82489" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# null 와 비슷한 \"No description yet\"값 확인 \n", + "boolean_cond = mercari_df['item_description']=='No description yet'\n", + "mercari_df[boolean_cond]['item_description'].count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f41b346", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"대분류 유형 :\n", + " cat_dae\n", + "Women 664385\n", + "Beauty 207828\n", + "Kids 171689\n", + "Electronics 122690\n", + "Men 93680\n", + "Home 67871\n", + "Vintage & Collectibles 46530\n", + "Other 45351\n", + "Handmade 30842\n", + "Sports & Outdoors 25342\n", + "Other_Null 6327\n", + "Name: count, dtype: int64\n", + "중분류 갯수 : 114\n", + "소분류 갯수 : 871\n" + ] + } + ], + "source": [ + "# category_name column을 대,중,소로 분리\n", + "def split_cat(category_name):\n", + " try:\n", + " return category_name.split('/')\n", + " except:\n", + " return ['Other_Null','Other_Null','Other_Null']\n", + "mercari_df['cat_dae'],mercari_df['cat_jung'],mercari_df['cat_so'] = zip(*mercari_df['category_name'].apply(lambda x: split_cat(x)))\n", + "\n", + "print('대분류 유형 :\\n', mercari_df['cat_dae'].value_counts())\n", + "# 갯수만 출력\n", + "print('중분류 갯수 :', mercari_df['cat_jung'].nunique())\n", + "print('소분류 갯수 :', mercari_df['cat_so'].nunique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d16508a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "train_id 0\n", + "name 0\n", + "item_condition_id 0\n", + "category_name 0\n", + "brand_name 0\n", + "price 0\n", + "shipping 0\n", + "item_description 0\n", + "cat_dae 0\n", + "cat_jung 0\n", + "cat_so 0\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fill NaN values with Other_Null\n", + "mercari_df['brand_name'] = mercari_df['brand_name'].fillna(value='Other_Null')\n", + "mercari_df['category_name'] = mercari_df['category_name'].fillna(value='Other_Null')\n", + "mercari_df['item_description'] = mercari_df['item_description'].fillna(value='Other_Null')\n", + "\n", + "# Checking if Null values left\n", + "mercari_df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "4cbb83f2", + "metadata": {}, + "source": [ + "# 피처 인코딩과 피처 벡터화" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": 
"4ff13773", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "brand name 의 유형 건수 : 4810\n", + "brand name sample 5건 : \n", + " brand_name\n", + "Other_Null 632682\n", + "PINK 54088\n", + "Nike 54043\n", + "Victoria's Secret 48036\n", + "LuLaRoe 31024\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print('brand name 의 유형 건수 :', mercari_df['brand_name'].nunique())\n", + "print('brand name sample 5건 : \\n', mercari_df['brand_name'].value_counts()[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4aec6c0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name 의 종류 갯수 : 1225273\n", + "name sample 7건 : \n", + " 0 MLB Cincinnati Reds T Shirt Size XL\n", + "1 Razer BlackWidow Chroma Keyboard\n", + "2 AVA-VIV Blouse\n", + "3 Leather Horse Statues\n", + "4 24K GOLD plated rose\n", + "5 Bundled items requested for Ruie\n", + "6 Acacia pacific tides santorini top\n", + "Name: name, dtype: object\n" + ] + } + ], + "source": [ + "print('name 의 종류 갯수 :', mercari_df['name'].nunique())\n", + "print('name sample 7건 : \\n', mercari_df['name'][:7])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2f12dac0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "item_description 평균 문자열 개수: 145.71139703278507\n" + ] + }, + { + "data": { + "text/plain": [ + "0 No description yet\n", + "1 This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. 
The lights are customizable via the Razer Synapse app on your PC.\n", + "Name: item_description, dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option('max_colwidth',200)\n", + "# item_description의 평균 문자열 개수\n", + "print('item_description 평균 문자열 개수:',mercari_df['item_description'].str.len().mean())\n", + "mercari_df['item_description'][:2]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "eb15eed6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name vectorization shape: (1482535, 105757)\n", + "item_description vectorization shape: (1482535, 50000)\n" + ] + } + ], + "source": [ + "# Feature Vectorization of name feature (CountVectorizer)\n", + "cnt_vec = CountVectorizer()\n", + "X_name = cnt_vec.fit_transform(mercari_df.name)\n", + "\n", + "# Feature Vectorization of item_description feature (TfidfVectorizer)\n", + "tfidf_descp = TfidfVectorizer(max_features = 50000, ngram_range= (1,3) , stop_words='english')\n", + "X_descp = tfidf_descp.fit_transform(mercari_df['item_description'])\n", + "\n", + "print('name vectorization shape:',X_name.shape)\n", + "print('item_description vectorization shape:',X_descp.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4dd4ad1e", + "metadata": {}, + "outputs": [], + "source": [ + "# LabelBinarizer로 원-핫 인코딩 변환\n", + "from sklearn.preprocessing import LabelBinarizer\n", + "# brand_name\n", + "lb_brand_name = LabelBinarizer(sparse_output=True)\n", + "X_brand = lb_brand_name.fit_transform(mercari_df['brand_name'])\n", + "# item_condition\n", + "lb_item_cond_id = LabelBinarizer(sparse_output=True)\n", + "X_item_cond_id = lb_item_cond_id.fit_transform(mercari_df['item_condition_id'])\n", + "# shipping\n", + "lb_shipping= LabelBinarizer(sparse_output=True)\n", + "X_shipping = lb_shipping.fit_transform(mercari_df['shipping'])\n", + "\n", + "# cat_dae, 
cat_jung, cat_so \n", + "lb_cat_dae = LabelBinarizer(sparse_output=True)\n", + "X_cat_dae= lb_cat_dae.fit_transform(mercari_df['cat_dae'])\n", + "\n", + "lb_cat_jung = LabelBinarizer(sparse_output=True)\n", + "X_cat_jung = lb_cat_jung.fit_transform(mercari_df['cat_jung'])\n", + "\n", + "lb_cat_so = LabelBinarizer(sparse_output=True)\n", + "X_cat_so = lb_cat_so.fit_transform(mercari_df['cat_so'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "583474cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \n", + "X_brand_shape:(1482535, 4810), X_item_cond_id shape:(1482535, 5)\n", + "X_shipping shape:(1482535, 1), X_cat_dae shape:(1482535, 11)\n", + "X_cat_jung shape:(1482535, 114), X_cat_so shape:(1482535, 871)\n" + ] + } + ], + "source": [ + "print(type(X_brand), type(X_item_cond_id), type(X_shipping))\n", + "print('X_brand_shape:{0}, X_item_cond_id shape:{1}'.format(X_brand.shape, X_item_cond_id.shape))\n", + "print('X_shipping shape:{0}, X_cat_dae shape:{1}'.format(X_shipping.shape, X_cat_dae.shape))\n", + "print('X_cat_jung shape:{0}, X_cat_so shape:{1}'.format(X_cat_jung.shape, X_cat_so.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c5bc4669", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " (1482535, 161569)\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Constructing feature matrix using hstack\n", + "from scipy.sparse import hstack\n", + "import gc \n", + "sparse_matrix_list = (X_name,X_descp,X_brand,X_item_cond_id,X_shipping,X_cat_dae,X_cat_jung,X_cat_so)\n", + "X_features_sparse = hstack(sparse_matrix_list).tocsr()\n", + "print(type(X_features_sparse),X_features_sparse.shape)\n", + "del X_features_sparse\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "id": "5821ea59", + 
"metadata": {}, + "source": [ + "# Ridge Regression " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0996aa94", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate \n", + "def rmsle(y,y_pred):\n", + " return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y_pred),2)))\n", + "\n", + "def evaluate_org_price(y_test,preds):\n", + " preds_exmpm = np.expm1(preds)\n", + " y_test_exmpm = np.expm1(y_test)\n", + " \n", + " rmsle_result = rmsle(y_test_exmpm, preds_exmpm)\n", + " return rmsle_result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d820cf1", + "metadata": {}, + "outputs": [], + "source": [ + "import gc \n", + "from scipy.sparse import hstack\n", + "\n", + "def model_train_predict(model,matrix_list):\n", + " X= hstack(matrix_list).tocsr() \n", + " \n", + " X_train, X_test, y_train, y_test=train_test_split(X, mercari_df['price'], \n", + " test_size=0.2, random_state=156)\n", + " \n", + " # Train\n", + " model.fit(X_train , y_train)\n", + " # Predict\n", + " preds = model.predict(X_test)\n", + " \n", + " # Free Memory\n", + " del X , X_train , X_test , y_train \n", + " gc.collect()\n", + " \n", + " return preds , y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0bdc39f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Item Description을 제외했을 때 rmsle 값: 0.4984480211612475\n", + "Item Description을 포함한 rmsle 값: 0.4679507419600776\n" + ] + } + ], + "source": [ + "linear_model = Ridge(solver = \"lsqr\", fit_intercept=False)\n", + "\n", + "sparse_matrix_list = (X_name, X_brand, X_item_cond_id,\n", + " X_shipping, X_cat_dae, X_cat_jung, X_cat_so)\n", + "linear_preds , y_test = model_train_predict(model=linear_model ,matrix_list=sparse_matrix_list)\n", + "print('Item Description을 제외했을 때 rmsle 값:', evaluate_org_price(y_test , linear_preds))\n", + "\n", + "sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,\n", + " 
# %% Ridge regression, without and with the item_description features
linear_model = Ridge(solver="lsqr", fit_intercept=False)

sparse_matrix_list = (X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)
linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
print('Item Description을 제외했을 때 rmsle 값:', evaluate_org_price(y_test, linear_preds))

sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)
linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
print('Item Description을 포함한 rmsle 값:', evaluate_org_price(y_test, linear_preds))

# %% LightGBM on the full feature set
from lightgbm import LGBMRegressor

sparse_matrix_list = (X_descp, X_name, X_brand, X_item_cond_id,
                      X_shipping, X_cat_dae, X_cat_jung, X_cat_so)

# force_row_wise=True: the previous run's log shows LightGBM spending ~223 s
# testing threading modes before auto-choosing row-wise; selecting that mode
# up front skips the test, as the log message itself recommends.
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125,
                           random_state=156, force_row_wise=True)
lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)
print('LightGBM rmsle 값:', evaluate_org_price(y_test, lgbm_preds))

# %% Weighted blend of the LightGBM and Ridge predictions
LGBM_WEIGHT = 0.45
RIDGE_WEIGHT = 0.55

# Both prediction vectors must describe the same held-out rows; this relies on
# model_train_predict() being called with the same split settings each time.
assert len(lgbm_preds) == len(linear_preds) == len(y_test), \
    'LightGBM and Ridge predictions come from different test splits'

preds = lgbm_preds * LGBM_WEIGHT + linear_preds * RIDGE_WEIGHT
print('LightGBM과 Ridge를 ensemble한 최종 rmsle 값:', evaluate_org_price(y_test, preds))
"base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}