diff --git "a/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\211\341\205\245\341\204\213\341\205\247\341\206\253.ipynb" "b/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\211\341\205\245\341\204\213\341\205\247\341\206\253.ipynb"
new file mode 100644
index 0000000..4d72780
--- /dev/null
+++ "b/Week13_\341\204\207\341\205\251\341\206\250\341\204\211\341\205\263\341\206\270\341\204\200\341\205\252\341\204\214\341\205\246_\341\204\200\341\205\265\341\206\267\341\204\211\341\205\245\341\204\213\341\205\247\341\206\253.ipynb"
@@ -0,0 +1,1204 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.linear_model import Ridge ,LogisticRegression\n",
+ "from sklearn.model_selection import train_test_split ,cross_val_score\n",
+ "from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer\n",
+ "import pandas as pd\n",
+ "\n",
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# Read the tab-separated Mercari training file from the mounted Drive folder.\n",
+ "mercari_tsv_path = '/content/drive/MyDrive/EuronData/mercari_train.tsv'\n",
+ "mercari_df = pd.read_csv(mercari_tsv_path, sep='\\t')\n",
+ "print(mercari_df.shape)\n",
+ "mercari_df.head(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 320
+ },
+ "id": "lUxDYKvHA9eB",
+ "outputId": "3b79d3ea-e56a-4712-cd8c-73c41ee51d93"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(1482535, 8)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " train_id name item_condition_id \\\n",
+ "0 0 MLB Cincinnati Reds T Shirt Size XL 3 \n",
+ "1 1 Razer BlackWidow Chroma Keyboard 3 \n",
+ "2 2 AVA-VIV Blouse 1 \n",
+ "\n",
+ " category_name brand_name price \\\n",
+ "0 Men/Tops/T-shirts NaN 10.0 \n",
+ "1 Electronics/Computers & Tablets/Components & P... Razer 52.0 \n",
+ "2 Women/Tops & Blouses/Blouse Target 10.0 \n",
+ "\n",
+ " shipping item_description \n",
+ "0 1 No description yet \n",
+ "1 0 This keyboard is in great condition and works ... \n",
+ "2 1 Adorable top with a hint of lace and a key hol... "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " train_id \n",
+ " name \n",
+ " item_condition_id \n",
+ " category_name \n",
+ " brand_name \n",
+ " price \n",
+ " shipping \n",
+ " item_description \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 0 \n",
+ " MLB Cincinnati Reds T Shirt Size XL \n",
+ " 3 \n",
+ " Men/Tops/T-shirts \n",
+ " NaN \n",
+ " 10.0 \n",
+ " 1 \n",
+ " No description yet \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " Razer BlackWidow Chroma Keyboard \n",
+ " 3 \n",
+ " Electronics/Computers & Tablets/Components & P... \n",
+ " Razer \n",
+ " 52.0 \n",
+ " 0 \n",
+ " This keyboard is in great condition and works ... \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 2 \n",
+ " AVA-VIV Blouse \n",
+ " 1 \n",
+ " Women/Tops & Blouses/Blouse \n",
+ " Target \n",
+ " 10.0 \n",
+ " 1 \n",
+ " Adorable top with a hint of lace and a key hol... \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "mercari_df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 2
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# DataFrame.info() writes its report to stdout and returns None, so call it directly;\n",
+ "# wrapping it in print() appended a stray 'None' line to the output.\n",
+ "mercari_df.info()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "JUn0ECTJB5rh",
+ "outputId": "ecab9a4b-b76f-4ca8-a705-0f8cb78e0a5a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 1482535 entries, 0 to 1482534\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 train_id 1482535 non-null int64 \n",
+ " 1 name 1482535 non-null object \n",
+ " 2 item_condition_id 1482535 non-null int64 \n",
+ " 3 category_name 1476208 non-null object \n",
+ " 4 brand_name 849853 non-null object \n",
+ " 5 price 1482535 non-null float64\n",
+ " 6 shipping 1482535 non-null int64 \n",
+ " 7 item_description 1482529 non-null object \n",
+ "dtypes: float64(1), int64(3), object(4)\n",
+ "memory usage: 90.5+ MB\n",
+ "None\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "# Histogram of the raw price values\n",
+ "y_train_df = mercari_df['price']\n",
+ "fig, ax = plt.subplots(figsize=(6, 4))\n",
+ "sns.histplot(y_train_df, bins=100, ax=ax)\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 388
+ },
+ "id": "s0vdW9LgB5ua",
+ "outputId": "b85099f5-d146-454e-c340-0eaf6880d3c7"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Store the log1p-transformed prices under a new name so that re-running this cell\n",
+ "# never applies log1p to an already transformed series.\n",
+ "y_train_log = np.log1p(y_train_df)\n",
+ "sns.histplot(y_train_log, bins=50)\n",
+ "plt.show()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 449
+ },
+ "id": "A1Ytthx4B5w_",
+ "outputId": "db662fca-bd7c-4d0a-8ff8-9f9f0f8bbe69"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ ""
+ ],
+ "image/png": "\n"
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Replace the price column with log1p(price).\n",
+ "# NOTE: the column is overwritten in place, so run this cell only once per data load.\n",
+ "log_price = np.log1p(mercari_df['price'])\n",
+ "mercari_df['price'] = log_price\n",
+ "mercari_df['price'].head(3)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 178
+ },
+ "id": "y-8DTZNyB5zN",
+ "outputId": "8c8f110f-8157-48ba-eec1-0ed2312d7cee"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 2.397895\n",
+ "1 3.970292\n",
+ "2 2.397895\n",
+ "Name: price, dtype: float64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " price \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 2.397895 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 3.970292 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 2.397895 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: float64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Frequency of each distinct value in the two low-cardinality columns\n",
+ "shipping_counts = mercari_df['shipping'].value_counts()\n",
+ "condition_counts = mercari_df['item_condition_id'].value_counts()\n",
+ "print('Shipping 값 유형:\\n', shipping_counts)\n",
+ "print('item_condition_id 값 유형:\\n', condition_counts)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wcqg13iDB53X",
+ "outputId": "0b6baded-f99a-44ad-a5d1-c85cbca4dfc9"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Shipping 값 유형:\n",
+ " shipping\n",
+ "0 819435\n",
+ "1 663100\n",
+ "Name: count, dtype: int64\n",
+ "item_condition_id 값 유형:\n",
+ " item_condition_id\n",
+ "1 640549\n",
+ "3 432161\n",
+ "2 375479\n",
+ "4 31962\n",
+ "5 2384\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Number of rows whose description is exactly the placeholder text\n",
+ "boolean_cond = mercari_df['item_description'] == 'No description yet'\n",
+ "boolean_cond.sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "5u809R1OB55R",
+ "outputId": "e098117b-aa6e-4d51-efca-868a575a2791"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "np.int64(82489)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Called through Series.apply: returns the slash-separated category levels as a list.\n",
+ "def split_cat(category_name):\n",
+ "    \"\"\"Split a '/'-delimited category string into a list of its levels.\n",
+ "\n",
+ "    Non-string input (for example the NaN that marks a missing category) yields\n",
+ "    three 'Other_Null' placeholders instead.\n",
+ "    \"\"\"\n",
+ "    # Check the type explicitly rather than relying on a bare `except:`, which would\n",
+ "    # also swallow KeyboardInterrupt and hide unrelated errors.\n",
+ "    if not isinstance(category_name, str):\n",
+ "        return ['Other_Null', 'Other_Null', 'Other_Null']\n",
+ "    return category_name.split('/')\n",
+ "\n",
+ "# Run split_cat() over category_name and spread the resulting levels into the\n",
+ "# cat_dae (large), cat_jung (medium) and cat_so (small) columns of mercari_df.\n",
+ "cat_levels = zip(*mercari_df['category_name'].apply(split_cat))\n",
+ "mercari_df['cat_dae'], mercari_df['cat_jung'], mercari_df['cat_so'] = cat_levels\n",
+ "\n",
+ "# Show the value counts for the large category only; for the medium and small\n",
+ "# categories just report how many distinct values exist.\n",
+ "print('대분류 유형 :\\n', mercari_df['cat_dae'].value_counts())\n",
+ "print('중분류 갯수 :', mercari_df['cat_jung'].nunique())\n",
+ "print('소분류 갯수 :', mercari_df['cat_so'].nunique())\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4WACvyR_B59K",
+ "outputId": "b2a16b1e-c99e-4918-efd7-223b5ff87eba"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "대분류 유형 :\n",
+ " cat_dae\n",
+ "Women 664385\n",
+ "Beauty 207828\n",
+ "Kids 171689\n",
+ "Electronics 122690\n",
+ "Men 93680\n",
+ "Home 67871\n",
+ "Vintage & Collectibles 46530\n",
+ "Other 45351\n",
+ "Handmade 30842\n",
+ "Sports & Outdoors 25342\n",
+ "Other_Null 6327\n",
+ "Name: count, dtype: int64\n",
+ "중분류 갯수 : 114\n",
+ "소분류 갯수 : 871\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Fill missing values in the text columns with the 'Other_Null' placeholder.\n",
+ "for column in ('brand_name', 'category_name', 'item_description'):\n",
+ "    mercari_df[column] = mercari_df[column].fillna(value='Other_Null')\n",
+ "\n",
+ "# Null count per column; every entry should be 0.\n",
+ "mercari_df.isnull().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 429
+ },
+ "id": "dVsh0vq_B5_y",
+ "outputId": "3a19ffd9-48ea-426d-e82f-ebda1dad0858"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "train_id 0\n",
+ "name 0\n",
+ "item_condition_id 0\n",
+ "category_name 0\n",
+ "brand_name 0\n",
+ "price 0\n",
+ "shipping 0\n",
+ "item_description 0\n",
+ "cat_dae 0\n",
+ "cat_jung 0\n",
+ "cat_so 0\n",
+ "dtype: int64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " train_id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " name \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " item_condition_id \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " category_name \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " brand_name \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " price \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " shipping \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " item_description \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " cat_dae \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " cat_jung \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ " cat_so \n",
+ " 0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: int64 "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Distinct brand count and the five most frequent brands\n",
+ "brand_counts = mercari_df['brand_name'].value_counts()\n",
+ "print('brand name 의 유형 건수 :', mercari_df['brand_name'].nunique())\n",
+ "print('brand name sample 5건 : \\n', brand_counts.head(5))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4qUP4S4JB6Hk",
+ "outputId": "c0921ed3-62d4-4ed6-fd8a-43fbbf231920"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "brand name 의 유형 건수 : 4810\n",
+ "brand name sample 5건 : \n",
+ " brand_name\n",
+ "Other_Null 632682\n",
+ "PINK 54088\n",
+ "Nike 54043\n",
+ "Victoria's Secret 48036\n",
+ "LuLaRoe 31024\n",
+ "Name: count, dtype: int64\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Distinct product-name count and the first seven names\n",
+ "name_column = mercari_df['name']\n",
+ "print('name 의 종류 갯수 :', name_column.nunique())\n",
+ "print('name sample 7건 : \\n', name_column.head(7))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UzsQfwTcB6Jd",
+ "outputId": "7b2ff271-d553-43f5-be0f-4862d2923b3b"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "name 의 종류 갯수 : 1225273\n",
+ "name sample 7건 : \n",
+ " 0 MLB Cincinnati Reds T Shirt Size XL\n",
+ "1 Razer BlackWidow Chroma Keyboard\n",
+ "2 AVA-VIV Blouse\n",
+ "3 Leather Horse Statues\n",
+ "4 24K GOLD plated rose\n",
+ "5 Bundled items requested for Ruie\n",
+ "6 Acacia pacific tides santorini top\n",
+ "Name: name, dtype: object\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Allow up to 200 characters per column when pandas displays a value.\n",
+ "pd.set_option('max_colwidth', 200)\n",
+ "\n",
+ "# Mean number of characters in item_description\n",
+ "descp_lengths = mercari_df['item_description'].str.len()\n",
+ "print('item_description 평균 문자열 개수:', descp_lengths.mean())\n",
+ "\n",
+ "mercari_df['item_description'].head(2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 182
+ },
+ "id": "L6N1JdJjB6Lk",
+ "outputId": "7dcf43f3-c509-4b38-f967-4ea8989c2951"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "item_description 평균 문자열 개수: 145.71139703278507\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0 No description yet\n",
+ "1 This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.\n",
+ "Name: item_description, dtype: object"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " item_description \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " No description yet \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC. \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
dtype: object "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Token-count vectorization of the product name\n",
+ "cnt_vec = CountVectorizer()\n",
+ "X_name = cnt_vec.fit_transform(mercari_df['name'])\n",
+ "\n",
+ "# TF-IDF vectorization of item_description: 1- to 3-grams, at most 50,000 features\n",
+ "tfidf_descp = TfidfVectorizer(\n",
+ "    max_features=50000,\n",
+ "    ngram_range=(1, 3),\n",
+ "    stop_words='english',\n",
+ ")\n",
+ "X_descp = tfidf_descp.fit_transform(mercari_df['item_description'])\n",
+ "\n",
+ "print('name vectorization shape:', X_name.shape)\n",
+ "print('item_description vectorization shape:', X_descp.shape)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "oO9B-q-zB6Nj",
+ "outputId": "7bdea7b2-f9b2-451c-dffb-b8f3c3a7033a"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "name vectorization shape: (1482535, 105757)\n",
+ "item_description vectorization shape: (1482535, 50000)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from sklearn.preprocessing import LabelBinarizer\n",
+ "\n",
+ "def _sparse_one_hot(column_values):\n",
+ "    \"\"\"Fit a sparse-output LabelBinarizer and return (binarizer, encoded matrix).\"\"\"\n",
+ "    binarizer = LabelBinarizer(sparse_output=True)\n",
+ "    return binarizer, binarizer.fit_transform(column_values)\n",
+ "\n",
+ "# Encode brand_name, item_condition_id and shipping as sparse matrices\n",
+ "lb_brand_name, X_brand = _sparse_one_hot(mercari_df['brand_name'])\n",
+ "lb_item_cond_id, X_item_cond_id = _sparse_one_hot(mercari_df['item_condition_id'])\n",
+ "lb_shipping, X_shipping = _sparse_one_hot(mercari_df['shipping'])\n",
+ "\n",
+ "# Encode the cat_dae, cat_jung and cat_so category levels the same way\n",
+ "lb_cat_dae, X_cat_dae = _sparse_one_hot(mercari_df['cat_dae'])\n",
+ "lb_cat_jung, X_cat_jung = _sparse_one_hot(mercari_df['cat_jung'])\n",
+ "lb_cat_so, X_cat_so = _sparse_one_hot(mercari_df['cat_so'])"
+ ],
+ "metadata": {
+ "id": "xvnF4fXuB6Po"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(type(X_brand), type(X_item_cond_id), type(X_shipping))\n",
+ "\n",
+ "# Each entry: a format template followed by the two matrices whose shapes it reports\n",
+ "shape_reports = (\n",
+ "    ('X_brand_shape:{0}, X_item_cond_id shape:{1}', X_brand, X_item_cond_id),\n",
+ "    ('X_shipping shape:{0}, X_cat_dae shape:{1}', X_shipping, X_cat_dae),\n",
+ "    ('X_cat_jung shape:{0}, X_cat_so shape:{1}', X_cat_jung, X_cat_so),\n",
+ ")\n",
+ "for template, first, second in shape_reports:\n",
+ "    print(template.format(first.shape, second.shape))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mKSZFwqCB6Rw",
+ "outputId": "05c81ea3-bccf-4afe-b6fd-543dc251f427"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " \n",
+ "X_brand_shape:(1482535, 4810), X_item_cond_id shape:(1482535, 5)\n",
+ "X_shipping shape:(1482535, 1), X_cat_dae shape:(1482535, 11)\n",
+ "X_cat_jung shape:(1482535, 114), X_cat_so shape:(1482535, 871)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from scipy.sparse import hstack\n",
+ "import gc\n",
+ "\n",
+ "# Feature blocks built by the vectorizers and binarizers, in column order\n",
+ "sparse_matrix_list = (\n",
+ "    X_name, X_descp, X_brand, X_item_cond_id,\n",
+ "    X_shipping, X_cat_dae, X_cat_jung, X_cat_so,\n",
+ ")\n",
+ "\n",
+ "# Join all blocks column-wise with scipy.sparse.hstack and convert the result to CSR.\n",
+ "X_features_sparse = hstack(sparse_matrix_list).tocsr()\n",
+ "print(type(X_features_sparse), X_features_sparse.shape)\n",
+ "\n",
+ "# The combined matrix takes a lot of memory, so drop it once it has been inspected.\n",
+ "del X_features_sparse\n",
+ "gc.collect()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EBSHz5YRB6T3",
+ "outputId": "eb6744cd-e1fa-4539-f391-6257f81adaa5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " (1482535, 161569)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def rmsle(y, y_pred):\n",
+ "    \"\"\"Return the root mean squared logarithmic error between y and y_pred.\"\"\"\n",
+ "    # log1p, i.e. log(1 + x), is used instead of log so a value of 0 stays finite.\n",
+ "    log_gap = np.log1p(y) - np.log1p(y_pred)\n",
+ "    return np.sqrt(np.mean(np.square(log_gap)))\n",
+ "\n",
+ "def evaluate_org_price(y_test, preds):\n",
+ "    \"\"\"Compute RMSLE after mapping both inputs back with expm1.\n",
+ "\n",
+ "    Both arguments are passed through np.expm1, the inverse of log1p, before\n",
+ "    being handed to rmsle().\n",
+ "    \"\"\"\n",
+ "    return rmsle(np.expm1(y_test), np.expm1(preds))"
+ ],
+ "metadata": {
+ "id": "JlCcx3bVB6Wa"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import gc\n",
+ "from scipy.sparse import hstack\n",
+ "\n",
+ "def model_train_predict(model, matrix_list, test_size=0.2, random_state=156):\n",
+ "    \"\"\"Stack the feature blocks, fit `model` on a train split and predict the test split.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    model : estimator\n",
+ "        Object exposing fit(X, y) and predict(X).\n",
+ "    matrix_list : sequence of sparse matrices\n",
+ "        Blocks joined column-wise with scipy.sparse.hstack.\n",
+ "    test_size : float, default 0.2\n",
+ "        Forwarded to train_test_split.\n",
+ "    random_state : int, default 156\n",
+ "        Forwarded to train_test_split. Calls that share the same value (and the same\n",
+ "        mercari_df) get the same split, so their predictions line up row by row.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    tuple\n",
+ "        (preds, y_test): predictions for the test split and the matching targets.\n",
+ "\n",
+ "    The target is read from the global mercari_df['price'] column.\n",
+ "    \"\"\"\n",
+ "    # Combine the sparse blocks into one CSR feature matrix.\n",
+ "    X = hstack(matrix_list).tocsr()\n",
+ "\n",
+ "    X_train, X_test, y_train, y_test = train_test_split(\n",
+ "        X, mercari_df['price'], test_size=test_size, random_state=random_state)\n",
+ "\n",
+ "    # Fit on the training split, then predict the held-out split.\n",
+ "    model.fit(X_train, y_train)\n",
+ "    preds = model.predict(X_test)\n",
+ "\n",
+ "    # Release the large matrices before returning.\n",
+ "    del X, X_train, X_test, y_train\n",
+ "    gc.collect()\n",
+ "\n",
+ "    return preds, y_test"
+ ],
+ "metadata": {
+ "id": "6GOni4z4B6fi"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "linear_model = Ridge(solver=\"lsqr\", fit_intercept=False)\n",
+ "\n",
+ "# Every feature block except the item_description TF-IDF matrix\n",
+ "matrices_without_descp = (X_name, X_brand, X_item_cond_id,\n",
+ "                          X_shipping, X_cat_dae, X_cat_jung, X_cat_so)\n",
+ "\n",
+ "sparse_matrix_list = matrices_without_descp\n",
+ "linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)\n",
+ "print('Item Description을 제외했을 때 rmsle 값:', evaluate_org_price(y_test, linear_preds))\n",
+ "\n",
+ "# The same blocks with the description matrix placed first\n",
+ "sparse_matrix_list = (X_descp,) + matrices_without_descp\n",
+ "linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)\n",
+ "print('Item Description을 포함한 rmsle 값:', evaluate_org_price(y_test, linear_preds))\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "NQe_ycVxB6hW",
+ "outputId": "d6506169-fadc-4d2c-9986-781d0bcc1ce2"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Item Description을 제외했을 때 rmsle 값: 0.4983990938999374\n",
+ "Item Description을 포함한 rmsle 값: 0.4680432471796771\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from lightgbm import LGBMRegressor\n",
+ "\n",
+ "# All feature blocks, with the item_description matrix first\n",
+ "sparse_matrix_list = (\n",
+ "    X_descp, X_name, X_brand, X_item_cond_id,\n",
+ "    X_shipping, X_cat_dae, X_cat_jung, X_cat_so,\n",
+ ")\n",
+ "\n",
+ "lgbm_params = dict(n_estimators=200, learning_rate=0.5, num_leaves=125, random_state=156)\n",
+ "lgbm_model = LGBMRegressor(**lgbm_params)\n",
+ "lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)\n",
+ "print('LightGBM rmsle 값:', evaluate_org_price(y_test, lgbm_preds))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vGJUeJABCJSW",
+ "outputId": "464be022-2a6d-4d64-832c-998c70f80c88"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3450.660095 seconds.\n",
+ "You can set `force_row_wise=true` to remove the overhead.\n",
+ "And if memory is not enough, you can set `force_col_wise=true`.\n",
+ "[LightGBM] [Info] Total Bins 1068323\n",
+ "[LightGBM] [Info] Number of data points in the train set: 1186028, number of used features: 65338\n",
+ "[LightGBM] [Info] Start training from score 2.979514\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.11/dist-packages/sklearn/utils/deprecation.py:151: FutureWarning: 'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "LightGBM rmsle 값: 0.4563962127849484\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Weighted blend of the two models' predictions: LightGBM 0.45, Ridge 0.55\n",
+ "lgbm_weight, linear_weight = 0.45, 0.55\n",
+ "preds = lgbm_preds * lgbm_weight + linear_preds * linear_weight\n",
+ "print('LightGBM과 Ridge를 ensemble한 최종 rmsle 값:', evaluate_org_price(y_test , preds))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "u6zW8DcZCJUy",
+ "outputId": "6aa8b564-9074-4141-89a3-c92254e755f4"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "LightGBM과 Ridge를 ensemble한 최종 rmsle 값: 0.4467272727321774\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file