diff --git "a/week20_nlp_hw_ipynb\354\235\230_\354\202\254\353\263\270.ipynb" "b/week20_nlp_hw_ipynb\354\235\230_\354\202\254\353\263\270.ipynb" new file mode 100644 index 0000000..fef14bf --- /dev/null +++ "b/week20_nlp_hw_ipynb\354\235\230_\354\202\254\353\263\270.ipynb" @@ -0,0 +1,447 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "week20_nlp_hw.ipynb์˜ ์‚ฌ๋ณธ", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "๐Ÿ“Œ week20 ๊ณผ์ œ๋Š” **18์ฃผ์ฐจ์˜ Constituency Parsing TreeRNNS ์‹ค์Šต**์œผ๋กœ ๊ตฌ์„ฑ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.\n", + "\n", + "๐Ÿ“Œ ์œ„ํ‚ค๋…์Šค์˜ ๋”ฅ๋Ÿฌ๋‹์„ ์ด์šฉํ•œ ์ž์—ฐ์–ด ์ฒ˜๋ฆฌ ์ž…๋ฌธ ๊ต์žฌ ์‹ค์Šต, ๊ด€๋ จ ๋ธ”๋กœ๊ทธ ๋“ฑ์˜ ๋ฌธ์„œ ์ž๋ฃŒ๋กœ ๊ตฌ์„ฑ๋˜์–ด ์žˆ๋Š” ๊ณผ์ œ์ž…๋‹ˆ๋‹ค. \n", + "\n", + "๐Ÿ“Œ ์•ˆ๋‚ด๋œ ๋งํฌ์— ๋งž์ถ”์–ด **์ง์ ‘ ์ฝ”๋“œ๋ฅผ ๋”ฐ๋ผ ์น˜๋ฉด์„œ (ํ•„์‚ฌ)** ํ•ด๋‹น nlp task ์˜ ๊ธฐ๋ณธ์ ์ธ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ์™€ ๋ฉ”์„œ๋“œ๋ฅผ ์ˆ™์ง€ํ•ด๋ณด์‹œ๋ฉด ์ข‹์„ ๊ฒƒ ๊ฐ™์Šต๋‹ˆ๋‹ค๐Ÿ˜Š ํ•„์ˆ˜๋ผ๊ณ  ์ฒดํฌํ•œ ๋ถ€๋ถ„์€ ๊ณผ์ œ์— ๋ฐ˜๋“œ์‹œ ํฌํ•จ์‹œ์ผœ์ฃผ์‹œ๊ณ , ์„ ํƒ์œผ๋กœ ์ฒดํฌํ•œ ๋ถ€๋ถ„์€ ์ž์œจ์ ์œผ๋กœ ์Šคํ„ฐ๋”” ํ•˜์‹œ๋ฉด ๋ฉ๋‹ˆ๋‹ค.\n", + "\n", + "๐Ÿ“Œ ๊ถ๊ธˆํ•œ ์‚ฌํ•ญ์€ ๊นƒํ—ˆ๋ธŒ ์ด์Šˆ๋‚˜, ์นดํ†ก๋ฐฉ, ์„ธ์…˜ ๋ฐœํ‘œ ์‹œ์ž‘ ์ด์ „ ์‹œ๊ฐ„ ๋“ฑ์„ ํ™œ์šฉํ•˜์—ฌ ์ž์œ ๋กญ๊ฒŒ ๊ณต์œ ํ•ด์ฃผ์„ธ์š”!" + ], + "metadata": { + "id": "BX3ac8Ag1RPC" + } + }, + { + "cell_type": "markdown", + "source": [ + "๐Ÿฅฐ **์ดํ•˜ ์˜ˆ์ œ๋ฅผ ์‹ค์Šตํ•˜์‹œ๋ฉด ๋ฉ๋‹ˆ๋‹ค.**\n", + "\n", + "**1-2๋Š” ํ•„์ˆ˜๊ณผ์ œ์ž…๋‹ˆ๋‹ค.**\n", + "\n", + "์‹ค์Šต๋ณด๋‹ค๋Š” ๊ฐœ๋… ์ดํ•ด๊ฐ€ ์ฃผ์ธ ๋‹จ์›์ด๋ผ, ์‹ค์Šต ๊ณผ์ œ๋Š” ์ ์Šต๋‹ˆ๋‹ค. TreeRNN์€ ํ˜„์‹ค์ ์œผ๋กœ ์ž˜ ์“ฐ์ด์ง€ ์•Š๊ธฐ ๋•Œ๋ฌธ์ด์ฃ . ์™œ ์ž˜ ์“ฐ์ด์ง€ ์•Š์„๊นŒ์š”? ์ด ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ƒ๊ฐํ•ด ๋ณด๋Š” ๊ฒƒ์œผ๋กœ ์ด๋ฒˆ ๊ณผ์ œ๋ฅผ ์‹œ์ž‘ํ•ด ๋ด…์‹œ๋‹ค." + ], + "metadata": { + "id": "Kq8aMYKGPQR0" + } + }, + { + "cell_type": "markdown", + "source": [ + "`your answer here`" + ], + "metadata": { + "id": "ZJLIfgQ9vlNe" + } + }, + { + "cell_type": "markdown", + "source": [ + "### **1๏ธโƒฃ Probabilistic Parsing ์‹ค์Šต**" + ], + "metadata": { + "id": "SHTPAk95iNtP" + } + }, + { + "cell_type": "markdown", + "source": [ + "๐Ÿ“Œ [Probabilistic Context Free Grammars](https://lost-contact.mit.edu/afs/cs.pitt.edu/projects/nltk/docs/tutorial/pcfg/nochunks.html#pcfg) \n", + "\n", + "TreeRNN์— PCFG rule์„ ์ ์šฉํ•ด ๋งŒ๋“  ๊ฒƒ์ด Syntactically-United RNN์ด์ฃ . PCFG rule์„ ์‹ค์Šตํ•ด ๋ด…์‹œ๋‹ค." + ], + "metadata": { + "id": "9L-jAHPkiBV0" + } + }, + { + "cell_type": "code", + "source": [ + "sentence = \"\"\"At eight o'clock on Thursday morning\n", + "... Arthur didn't feel very good.\"\"\"\n", + "tokens = nltk.word_tokenize(sentence)\n", + "tokens" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XOGPFvyJWoW0", + "outputId": "13a312ce-897d-47ef-b897-ec9c97ba7278" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['At',\n", + " 'eight',\n", + " \"o'clock\",\n", + " 'on',\n", + " 'Thursday',\n", + " 'morning',\n", + " 'Arthur',\n", + " 'did',\n", + " \"n't\",\n", + " 'feel',\n", + " 'very',\n", + " 'good',\n", + " '.']" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "code", + "source": [ + "tagged = nltk.pos_tag(tokens)\n", + "tagged[0:6]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4SmAH4lpU9cS", + "outputId": "2eb44e67-3619-4aa4-96e7-d4d12eac5c5b" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('At', 'IN'),\n", + " ('eight', 'CD'),\n", + " (\"o'clock\", 'NN'),\n", + " ('on', 'IN'),\n", + " ('Thursday', 'NNP'),\n", + " ('morning', 'NN')]" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "source": [ + "entities = nltk.chunk.ne_chunk(tagged)\n", + "entities" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 410 + }, + "id": "ZiOVpEpjeDso", + "outputId": "01430795-0c2a-45a6-9b1e-428c842c11b9" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "error", + "ename": "ModuleNotFoundError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0mmethod\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_real_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprint_method\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mmethod\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 338\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 339\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/nltk/tree/tree.py\u001b[0m in \u001b[0;36m_repr_svg_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 782\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_repr_svg_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 783\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0msvgling\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdraw_tree\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 784\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 785\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdraw_tree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_repr_svg_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'svgling'" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Tree('S', [('At', 'IN'), ('eight', 'CD'), (\"o'clock\", 'NN'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), Tree('PERSON', [('Arthur', 'NNP')]), ('did', 'VBD'), (\"n't\", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')])" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### **2๏ธโƒฃ Tree Based Algorithm**" + ], + "metadata": { + "id": "HfTr_BPwGc8D" + } + }, + { + "cell_type": "markdown", + "source": [ + "๐Ÿ“Œ [Decision Tree](https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/ml-decision-tree/tutorial/) ์‹ค์Šต\n", + "\n", + "์ฐธ๊ณ : [Embedding Graphs with Deep Learning ์ฝ๊ธฐ ์ž๋ฃŒ](https://towardsdatascience.com/embedding-graphs-with-deep-learning-55e0c66d7752)\n", + "\n", + "Decision Tree๋Š” ๋จธ์‹  ๋Ÿฌ๋‹์—์„œ ์ž์ฃผ ์“ฐ์ด๋Š” Tree Algorith์ž…๋‹ˆ๋‹ค. Decision Tree๋ฅผ ์‹ค์Šตํ•ด ๋ณด๋ฉฐ Tree ๊ตฌ์กฐ์™€ ์•Œ๊ณ ๋ฆฌ์ฆ˜์„ ์ดํ•ดํ•ด ๋ด…์‹œ๋‹ค.\n" + ], + "metadata": { + "id": "0HnXNyCAwSHJ" + } + }, + { + "cell_type": "code", + "source": [ + "#Importing required libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.model_selection import train_test_split" + ], + "metadata": { + "id": "svtikbdZatBY" + }, + "execution_count": 50, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Loading the iris data\n", + "data = load_iris()\n", + "print('Classes to predict: ', data.target_names)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "--tIW4ikeyC9", + "outputId": "498d53cd-b5c0-4628-9254-7af085ed5032" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Classes to predict: ['setosa' 'versicolor' 'virginica']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Extracting data attributes\n", + "X = data.data\n", + "### Extracting target/ class labels\n", + "y = data.target\n", + "\n", + "print('Number of examples in the data:', X.shape[0])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J4epi16eezp4", + "outputId": "4130b657-f9bf-4150-b42e-91d3026cd973" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of examples in the data: 150\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#First four rows in the variable 'X'\n", + "X[:4]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oH5lMSude1Jp", + "outputId": "9d5806c8-efd1-493e-c238-5ade5f202cf2" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([[5.1, 3.5, 1.4, 0.2],\n", + " [4.9, 3. , 1.4, 0.2],\n", + " [4.7, 3.2, 1.3, 0.2],\n", + " [4.6, 3.1, 1.5, 0.2]])" + ] + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Using the train_test_split to create train and test sets.\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)" + ], + "metadata": { + "id": "wAM_o-4Ie3QV" + }, + "execution_count": 57, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Importing the Decision tree classifier from the sklearn library.\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "clf = DecisionTreeClassifier(criterion = 'entropy')" + ], + "metadata": { + "id": "FQRJM_dFfoGG" + }, + "execution_count": 64, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Training the decision tree classifier. \n", + "clf.fit(X_train, y_train)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KrSFO54afprP", + "outputId": "0f3d0f9d-b8d5-4de5-a85c-a3ad585168df" + }, + "execution_count": 65, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DecisionTreeClassifier(criterion='entropy')" + ] + }, + "metadata": {}, + "execution_count": 65 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#Predicting labels on the test set.\n", + "y_pred = clf.predict(X_test)" + ], + "metadata": { + "id": "DylDmUIWfrW9" + }, + "execution_count": 66, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#Importing the accuracy metric from sklearn.metrics library\n", + "\n", + "from sklearn.metrics import accuracy_score\n", + "print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))\n", + "print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xyVcWoougS4l", + "outputId": "fc811c1c-e66c-4bbf-fee3-9f1818750aef" + }, + "execution_count": 67, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy Score on train data: 1.0\n", + "Accuracy Score on test data: 0.9736842105263158\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "clf = DecisionTreeClassifier(criterion='entropy', min_samples_split=50)\n", + "clf.fit(X_train, y_train)\n", + "print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))\n", + "print('Accuracy Score on the test data: ', accuracy_score(y_true=y_test, y_pred=clf.predict(X_test)))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-FFc4F2CgUt5", + "outputId": "f4ec459b-dbc4-405a-f4fc-3163b34df2a5" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Accuracy Score on train data: 0.9553571428571429\n", + "Accuracy Score on the test data: 0.9736842105263158\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### ์ฐธ๊ณ  ์ž๋ฃŒ\n", + "\n", + "- [RNTN](https://ratsgo.github.io/deep%20learning/2017/06/24/RNTN/)\n", + "\n", + "- [When and Why Tree-Based Models (Often) Outperform Neural Networks](https://towardsdatascience.com/when-and-why-tree-based-models-often-outperform-neural-networks-ceba9ecd0fd8)\n", + "\n", + "- ์ €๋ฒˆ ์ฃผ์ฐจ ๋‚ด์šฉ๊ณผ ๊ด€๋ จ๋œ ํŽ˜์ดํผ [Parsing with Compositional Vector Grammars](https://nlp.stanford.edu/pubs/SocherBauerManningNg_ACL2013.pdf) ๋„ ๋„์›€์ด ๋  ๊ฑฐ์˜ˆ์š”! :)\n", + "\n", + "- [RNN๊ณผ LSTM์ด ์™œ ์ž˜ ์•ˆ ์“ฐ์ด๋Š”์ง€์— ๋Œ€ํ•œ ์นผ๋Ÿผ](https://medium.com/towards-data-science/the-fall-of-rnn-lstm-2d1594c74ce0)๋„ ํ•œ๋ฒˆ ์ฝ์–ด ๋ณด์‹œ๋ฉด ์ข‹์„ ๊ฒƒ ๊ฐ™์Šต๋‹ˆ๋‹ค." + ], + "metadata": { + "id": "pNDTxHF3sxo8" + } + } + ] +} \ No newline at end of file