{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/usha-madithati/esmart.github.io/blob/main/Yet_another_copy_of_SentimentAnalysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 228
        },
        "id": "6fldNk57Fwkt",
        "outputId": "f9c44560-ffa0-4a4a-ccca-1371c31242e3"
      },
      "outputs": [
        {
          "output_type": "error",
          "ename": "IndentationError",
          "evalue": "unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mIndentationError\u001b[0m                          Traceback (most recent call last)",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/compilerop.py\u001b[0m in \u001b[0;36mast_parse\u001b[0;34m(self, source, filename, symbol)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mArguments\u001b[0m \u001b[0mare\u001b[0m \u001b[0mexactly\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32min\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mstandard\u001b[0m \u001b[0mlibrary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m and are passed to the built-in compile function.\"\"\"\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msymbol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0mPyCF_ONLY_AST\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreset_compiler_flags\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mIndentationError\u001b[0m: unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)"
          ]
        }
      ],
      "source": [
        "!pip install numpy\n",
        "import numpy as np\n",
        "print(np.__version__)\n",
        "\n",
        "!pip install pandas\n",
        "import pandas as pd\n",
        "print(pd.__version__)\n",
        "\n",
        "!pip install nltk\n",
        "import nltk\n",
        "print(nltk.__version__)\n",
        "\n",
        "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
        "import re\n",
        "from textblob import TextBlob\n",
        "from wordcloud import WordCloud\n",
        "import seaborn as sns\n",
        "import matplotlib.pyplot as plt\n",
        "import cufflinks as cf\n",
        "%matplotlib inline\n",
        "from plotly.offline import init_notebook_mode, iplot\n",
        "init_notebook_mode(connected=True)\n",
        "cf.go_offline()\n",
        "import plotly.graph_objs as go\n",
        "from plotly.subplots import make_subplots\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "warnings.warn(\"this will not show\")  # suppressed by the filter above\n",
        "pd.set_option('display.max_columns', None)\n",
        "\n",
        "# Load the dataset and drop the unnamed index column\n",
        "df = pd.read_csv(\"/content/amazon.csv\")\n",
        "del df[df.columns[0]]\n",
        "\n",
        "# Reload, then drop rows with missing values in the columns of interest\n",
        "df = pd.read_csv(\"/content/amazon.csv\")\n",
        "columns_to_clean = ['reviewerName', 'overall', 'reviewText', 'reviewTime', 'day_diff', 'helpful_yes', 'helpful_no', 'total_vote', 'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound']\n",
        "df_cleaned = df.dropna(subset=columns_to_clean)\n",
        "df_cleaned.to_csv('cleaned_dataset.csv', index=False)\n",
        "\n",
        "# Check for missing values\n",
        "df = pd.read_csv(\"/content/cleaned_dataset.csv\")\n",
        "missing_values = df.isnull().sum().sum()\n",
        "if missing_values == 0:\n",
        "    print(\"No missing values found.\")\n",
        "else:\n",
        "    print(f\"Missing values found in the dataset. Total: {missing_values}\")\n",
        "\n",
        "# Check for duplicates\n",
        "duplicate_rows = df.duplicated().sum()\n",
        "if duplicate_rows == 0:\n",
        "    print(\"No duplicate rows found.\")\n",
        "else:\n",
        "    print(f\"Duplicate rows found: {duplicate_rows}\")\n",
        "\n",
        "# Validate data types\n",
        "print(\"Data types:\")\n",
        "print(df.dtypes)\n",
        "\n",
        "# Explore outliers (this example assumes numerical columns)\n",
        "outliers = df.describe().loc[['min', 'max']].T\n",
        "print(\"Outliers (min/max values per column):\")\n",
        "print(outliers)\n",
        "\n",
        "def missing_values_analysis(df):\n",
        "    # Count and percentage of missing values per affected column\n",
        "    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]\n",
        "    n_miss = pd.Series(df[na_columns_].isnull().sum())\n",
        "    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)\n",
        "    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Missing Values', 'Ratio'])\n",
        "    return missing_df\n",
        "\n",
        "def check_dataframe(df, head=5, tail=5):\n",
        "    # One-shot overview: shape, dtypes, missing values, duplicates, quantiles\n",
        "    print(\"SHAPE\".center(82, '~'))\n",
        "    print('Rows: {}'.format(df.shape[0]))\n",
        "    print('Columns: {}'.format(df.shape[1]))\n",
        "    print(\"TYPES\".center(82, '~'))\n",
        "    print(df.dtypes)\n",
        "    print(\"MISSING VALUES\".center(82, '~'))\n",
        "    print(missing_values_analysis(df))\n",
        "    print('DUPLICATED VALUES'.center(82, '~'))\n",
        "    print(df.duplicated().sum())\n",
        "    print(\"QUANTILES\".center(82, '~'))\n",
        "    numerical_columns = df.select_dtypes(include=['int', 'float'])\n",
        "    quantiles = numerical_columns.quantile([0, 0.05, 0.50, 0.95, 0.99, 1])\n",
        "    print(quantiles)\n",
        "\n",
        "check_dataframe(df)\n",
        "\n",
        "def check_class(data_frame):\n",
        "    # Number of distinct values per column, most varied first\n",
        "    nunique_df = pd.DataFrame({'Variable': data_frame.columns,\n",
        "                               'Classes': [data_frame[i].nunique()\n",
        "                                           for i in data_frame.columns]})\n",
        "    nunique_df = nunique_df.sort_values('Classes', ascending=False)\n",
        "    nunique_df = nunique_df.reset_index(drop=True)\n",
        "    return nunique_df\n",
        "\n",
        "check_class(df)\n",
        "\n",
        "constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']\n",
        "\n",
        "def categorical_variable_summary(df, column_name):\n",
        "    # Side-by-side count plot and percentage pie chart for one column\n",
        "    fig = make_subplots(rows=1, cols=2,\n",
        "                        subplot_titles=('Countplot', 'Percentage'),\n",
        "                        specs=[[{\"type\": \"xy\"}, {\"type\": \"domain\"}]])  # pie traces need a 'domain' slot\n",
        "    fig.add_trace(go.Bar(y=df[column_name].value_counts().values.tolist(),\n",
        "                         x=[str(i) for i in df[column_name].value_counts().index],\n",
        "                         text=df[column_name].value_counts().values.tolist(),\n",
        "                         textfont=dict(size=14),\n",
        "                         name=column_name,\n",
        "                         textposition='auto',\n",
        "                         showlegend=False,\n",
        "                         marker=dict(color=constraints,\n",
        "                                     line=dict(color='#DBE6EC', width=1))),\n",
        "                  row=1, col=1)\n",
        "    fig.add_trace(go.Pie(labels=df[column_name].value_counts().keys(),\n",
        "                         values=df[column_name].value_counts().values,\n",
        "                         textfont=dict(size=18),\n",
        "                         textposition='auto',\n",
        "                         showlegend=False,\n",
        "                         name=column_name,\n",
        "                         marker=dict(colors=constraints)),\n",
        "                  row=1, col=2)\n",
        "    fig.update_layout(title={'text': column_name,\n",
        "                             'y': 0.9,\n",
        "                             'x': 0.5,\n",
        "                             'xanchor': 'center',\n",
        "                             'yanchor': 'top'},\n",
        "                      template='plotly_white')\n",
        "    display(fig)\n",
        "\n",
        "categorical_variable_summary(df, 'overall')\n"
      ]
    },
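    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`SentimentIntensityAnalyzer` is imported above but never called. A minimal sketch of how it could score the `reviewText` column follows; it assumes the `vader_lexicon` resource still needs downloading and that `df` is the cleaned frame from the cell above."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: VADER polarity scores per review (assumes df and reviewText from above).\n",
        "nltk.download('vader_lexicon')  # one-time download of the VADER lexicon\n",
        "sia = SentimentIntensityAnalyzer()\n",
        "\n",
        "# 'compound' is VADER's normalized overall sentiment score in [-1, 1]\n",
        "df['polarity_score'] = df['reviewText'].astype(str).map(\n",
        "    lambda text: sia.polarity_scores(text)['compound'])\n",
        "print(df[['reviewText', 'polarity_score']].head())\n"
      ]
    },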
    {
      "cell_type": "markdown",
      "source": [
        "# New Section"
      ],
      "metadata": {
        "id": "2tFeBscJiE4L"
      }
    },
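    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`WordCloud` is also imported but unused. A small sketch, assuming the same `df` and `reviewText` column, that renders the most frequent review terms:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: word cloud over all review text (assumes df from the earlier cell).\n",
        "text = \" \".join(df['reviewText'].astype(str))\n",
        "wc = WordCloud(width=800, height=400, background_color='white').generate(text)\n",
        "plt.figure(figsize=(10, 5))\n",
        "plt.imshow(wc, interpolation='bilinear')\n",
        "plt.axis('off')\n",
        "plt.show()\n"
      ]
    },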
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "cBzE5CQXR_tM"
      },
      "outputs": [],
      "source": []
    },
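    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The dataset already ships a `wilson_lower_bound` column. Below is a sketch of how such a score is conventionally computed from `helpful_yes` and `helpful_no` (the lower bound of the Wilson 95% confidence interval for the helpful-vote ratio); that the column was actually derived this way is an assumption."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import math\n",
        "\n",
        "def wilson_lower_bound_score(pos, neg, z=1.96):\n",
        "    # Lower bound of the Wilson score interval; z=1.96 ~ 95% confidence\n",
        "    n = pos + neg\n",
        "    if n == 0:\n",
        "        return 0.0\n",
        "    phat = pos / n\n",
        "    return (phat + z * z / (2 * n)\n",
        "            - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)\n",
        "\n",
        "# Compare the recomputed score against the dataset's own column (assumed derivation)\n",
        "df['wlb_check'] = df.apply(\n",
        "    lambda r: wilson_lower_bound_score(r['helpful_yes'], r['helpful_no']), axis=1)\n",
        "print(df[['helpful_yes', 'helpful_no', 'wilson_lower_bound', 'wlb_check']].head())\n"
      ]
    }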
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "mount_file_id": "1dYDRXWpObORZhMxwjIYOMlmVM5A-Vqiy",
      "authorship_tag": "ABX9TyMPxOvxKC3egh8n8I0nm6SN",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}