diff --git a/TEXT MINING ASSIGNMENT(elon musk).ipynb b/TEXT MINING ASSIGNMENT(elon musk).ipynb new file mode 100644 index 0000000..aab16cf --- /dev/null +++ b/TEXT MINING ASSIGNMENT(elon musk).ipynb @@ -0,0 +1,1100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "fe72c2fa", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import numpy as np \n", + "import string # special operations on strings\n", + "import spacy # language model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9fc118c7", + "metadata": {}, + "outputs": [], + "source": [ + "from matplotlib.pyplot import imread\n", + "from matplotlib import pyplot as plt\n", + "from wordcloud import WordCloud, STOPWORDS\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "2258910b", + "metadata": {}, + "outputs": [], + "source": [ + "# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.\n", + "# `on_bad_lines=\"warn\"` keeps the same behaviour: malformed rows are\n", + "# skipped and a warning is raised for them.\n", + "try:\n", + "    data = pd.read_csv(\"Elon_musk.csv\", on_bad_lines=\"warn\")\n", + "except TypeError:  # pandas < 1.3 has no `on_bad_lines` keyword\n", + "    data = pd.read_csv(\"Elon_musk.csv\", error_bad_lines=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "570caa3a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0Text
01@kunalb11 I�m an alien
12@ID_AA_Carmack Ray tracing on Cyberpunk with H...
23@joerogan @Spotify Great interview!
34@gtera27 Doge is underestimated
45@teslacn Congratulations Tesla China for amazi...
.........
19941995@flcnhvy True, it sounds so surreal, but the n...
19951996@PPathole Make sure to read ur terms & con...
19961997@TeslaGong @PPathole Samwise Gamgee
19971998@PPathole Altho Dumb and Dumber is <U+0001F525...
19981999Progress update August 28
\n", + "

1999 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Text\n", + "0 1 @kunalb11 I�m an alien\n", + "1 2 @ID_AA_Carmack Ray tracing on Cyberpunk with H...\n", + "2 3 @joerogan @Spotify Great interview!\n", + "3 4 @gtera27 Doge is underestimated\n", + "4 5 @teslacn Congratulations Tesla China for amazi...\n", + "... ... ...\n", + "1994 1995 @flcnhvy True, it sounds so surreal, but the n...\n", + "1995 1996 @PPathole Make sure to read ur terms & con...\n", + "1996 1997 @TeslaGong @PPathole Samwise Gamgee\n", + "1997 1998 @PPathole Altho Dumb and Dumber is \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x
0@kunalb11 I�m an alien
1@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2@joerogan @Spotify Great interview!
3@gtera27 Doge is underestimated
4@teslacn Congratulations Tesla China for amazi...
......
1994@flcnhvy True, it sounds so surreal, but the n...
1995@PPathole Make sure to read ur terms &amp; con...
1996@TeslaGong @PPathole Samwise Gamgee
1997@PPathole Altho Dumb and Dumber is <U+0001F525...
1998Progress update August 28
\n", + "

1999 rows × 1 columns

\n", + "" + ], + "text/plain": [ + " x\n", + "0 @kunalb11 I�m an alien\n", + "1 @ID_AA_Carmack Ray tracing on Cyberpunk with H...\n", + "2 @joerogan @Spotify Great interview!\n", + "3 @gtera27 Doge is underestimated\n", + "4 @teslacn Congratulations Tesla China for amazi...\n", + "... ...\n", + "1994 @flcnhvy True, it sounds so surreal, but the n...\n", + "1995 @PPathole Make sure to read ur terms & con...\n", + "1996 @TeslaGong @PPathole Samwise Gamgee\n", + "1997 @PPathole Altho Dumb and Dumber is \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x
0@kunalb11 I�m an alien
1@ID_AA_Carmack Ray tracing on Cyberpunk with H...
2@joerogan @Spotify Great interview!
3@gtera27 Doge is underestimated
4@teslacn Congratulations Tesla China for amazi...
......
1994@flcnhvy True, it sounds so surreal, but the n...
1995@PPathole Make sure to read ur terms &amp; con...
1996@TeslaGong @PPathole Samwise Gamgee
1997@PPathole Altho Dumb and Dumber is <U+0001F525...
1998Progress update August 28
\n", + "

1999 rows × 1 columns

\n", + "" + ], + "text/plain": [ + " x\n", + "0 @kunalb11 I�m an alien\n", + "1 @ID_AA_Carmack Ray tracing on Cyberpunk with H...\n", + "2 @joerogan @Spotify Great interview!\n", + "3 @gtera27 Doge is underestimated\n", + "4 @teslacn Congratulations Tesla China for amazi...\n", + "... ...\n", + "1994 @flcnhvy True, it sounds so surreal, but the n...\n", + "1995 @PPathole Make sure to read ur terms & con...\n", + "1996 @TeslaGong @PPathole Samwise Gamgee\n", + "1997 @PPathole Altho Dumb and Dumber is \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
x
0I�m an alien
1_AA_Carmack Ray tracing on Cyberpunk with HDR ...
2Great interview!
3Doge is underestimated
4Congratulations Tesla China for amazing execu...
\n", + "" + ], + "text/plain": [ + " x\n", + "0 I�m an alien\n", + "1 _AA_Carmack Ray tracing on Cyberpunk with HDR ...\n", + "2 Great interview!\n", + "3 Doge is underestimated\n", + "4 Congratulations Tesla China for amazing execu..." + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Pass every tweet through `cleantext` and store the result back in column \"x\".\n", + "# NOTE: this overwrites the column in place, so re-running the cell applies\n", + "# `cleantext` to text that has already been cleaned.\n", + "data[\"x\"] = data[\"x\"].map(cleantext)\n", + "\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "51b38df2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xsubjectivitypolarity
0I�m an alien0.750000-0.250000
1_AA_Carmack Ray tracing on Cyberpunk with HDR ...0.0000000.000000
2Great interview!0.7500001.000000
3Doge is underestimated0.0000000.000000
4Congratulations Tesla China for amazing execu...0.3666670.345313
............
1994True, it sounds so surreal, but the negative ...0.5083330.111111
1995Make sure to read ur terms &amp; conditions b...0.8888890.625000
1996Samwise Gamgee0.0000000.000000
1997Altho Dumb and Dumber is <U+0001F525><U+0001F...0.500000-0.375000
1998Progress update August 280.0000000.000000
\n", + "

1999 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " x subjectivity \\\n", + "0 I�m an alien 0.750000 \n", + "1 _AA_Carmack Ray tracing on Cyberpunk with HDR ... 0.000000 \n", + "2 Great interview! 0.750000 \n", + "3 Doge is underestimated 0.000000 \n", + "4 Congratulations Tesla China for amazing execu... 0.366667 \n", + "... ... ... \n", + "1994 True, it sounds so surreal, but the negative ... 0.508333 \n", + "1995 Make sure to read ur terms & conditions b... 0.888889 \n", + "1996 Samwise Gamgee 0.000000 \n", + "1997 Altho Dumb and Dumber is " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "from wordcloud import WordCloud\n", + "\n", + "# Merge every tweet in column \"x\" into one space-separated corpus string.\n", + "allwords = \" \".join(data[\"x\"])\n", + "\n", + "wordCloud = WordCloud(\n", + "    width=1000,\n", + "    height=1000,\n", + "    random_state=21,\n", + "    max_font_size=119,\n", + ").generate(allwords)\n", + "\n", + "plt.figure(figsize=(20, 20), dpi=80)\n", + "plt.imshow(wordCloud, interpolation=\"bilinear\")\n", + "plt.axis(\"off\")  # hide the axes around the rendered image\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "1b1a046c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xsubjectivitypolarityanalysis
0I�m an alien0.750000-0.250000Negative
1_AA_Carmack Ray tracing on Cyberpunk with HDR ...0.0000000.000000Neutral
2Great interview!0.7500001.000000Positive
3Doge is underestimated0.0000000.000000Neutral
4Congratulations Tesla China for amazing execu...0.3666670.345313Positive
...............
1994True, it sounds so surreal, but the negative ...0.5083330.111111Positive
1995Make sure to read ur terms &amp; conditions b...0.8888890.625000Positive
1996Samwise Gamgee0.0000000.000000Neutral
1997Altho Dumb and Dumber is <U+0001F525><U+0001F...0.500000-0.375000Negative
1998Progress update August 280.0000000.000000Neutral
\n", + "

1999 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " x subjectivity \\\n", + "0 I�m an alien 0.750000 \n", + "1 _AA_Carmack Ray tracing on Cyberpunk with HDR ... 0.000000 \n", + "2 Great interview! 0.750000 \n", + "3 Doge is underestimated 0.000000 \n", + "4 Congratulations Tesla China for amazing execu... 0.366667 \n", + "... ... ... \n", + "1994 True, it sounds so surreal, but the negative ... 0.508333 \n", + "1995 Make sure to read ur terms & conditions b... 0.888889 \n", + "1996 Samwise Gamgee 0.000000 \n", + "1997 Altho Dumb and Dumber is " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 8))\n", + "\n", + "# Plot all tweets with a single vectorized call: this avoids creating one\n", + "# artist per row and does not depend on the index being exactly 0..n-1.\n", + "plt.scatter(data[\"polarity\"], data[\"subjectivity\"], color=\"Red\")\n", + "\n", + "plt.title(\"Sentiment Analysis\")\n", + "plt.xlabel(\"Polarity\")\n", + "plt.ylabel(\"Subjectivity\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "00502eff", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "vectorizer = CountVectorizer()\n", + "X = vectorizer.fit_transform(data[\"x\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "569273d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "an 474\n", + "alien 438\n", + "_aa_carmack 215\n", + "ray 2969\n", + "tracing 3750\n", + " ... 
\n", + "clicking 921\n", + "accept 363\n", + "samwise 3149\n", + "gamgee 1663\n", + "altho 456\n", + "Length: 4117, dtype: int64" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series(vectorizer.vocabulary_)" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "dbff8b5b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "the 403\n", + "to 390\n", + "is 347\n", + "a 324\n", + "of 301\n", + "& 216\n", + "in 203\n", + "for 177\n", + "be 154\n", + "will 136\n", + "on 124\n", + ": 117\n", + "I 114\n", + "but 109\n", + "that 104\n", + "with 99\n", + "are 95\n", + "it 89\n", + "at 85\n", + "Tesla 85\n", + "The 79\n", + "we 77\n", + "� 76\n", + "and 72\n", + "this 72\n", + "from 70\n", + "have 69\n", + "was 68\n", + "as 63\n", + "This 60\n", + "you 58\n", + "We 58\n", + "has 55\n", + "not 54\n", + "Yes 50\n", + "so 48\n", + "more 48\n", + "just 47\n", + "than 44\n", + "should 43\n", + "an 39\n", + "all 39\n", + "can 39\n", + "or 37\n", + "do 37\n", + "It�s 37\n", + "like 37\n", + "great 36\n", + "would 36\n", + "launch 36\n", + "dtype: int64" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Word frequency: the 50 most frequent whitespace-separated tokens.\n", + "# Counting is case-sensitive, and stop words and punctuation are not removed.\n", + "freq = pd.Series(' '.join(data[\"x\"]).split()).value_counts()[:50]\n", + "freq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7753767a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 
40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}