Skip to content

Commit 9bd9663

Browse files
Created using Colab
1 parent 51e9b5e commit 9bd9663

File tree

1 file changed

+218
-0
lines changed

1 file changed

+218
-0
lines changed
+218
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"id": "view-in-github",
7+
"colab_type": "text"
8+
},
9+
"source": [
10+
"<a href=\"https://colab.research.google.com/github/usha-madithati/esmart.github.io/blob/main/Yet_another_copy_of_SentimentAnalysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 3,
16+
"metadata": {
17+
"colab": {
18+
"base_uri": "https://localhost:8080/",
19+
"height": 228
20+
},
21+
"id": "6fldNk57Fwkt",
22+
"outputId": "f9c44560-ffa0-4a4a-ccca-1371c31242e3"
23+
},
24+
"outputs": [
25+
{
26+
"output_type": "error",
27+
"ename": "IndentationError",
28+
"evalue": "unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)",
29+
"traceback": [
30+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
31+
"\u001b[0;31mIndentationError\u001b[0m Traceback (most recent call last)",
32+
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/compilerop.py\u001b[0m in \u001b[0;36mast_parse\u001b[0;34m(self, source, filename, symbol)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mArguments\u001b[0m \u001b[0mare\u001b[0m \u001b[0mexactly\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32min\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mstandard\u001b[0m \u001b[0mlibrary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m and are passed to the built-in compile function.\"\"\"\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msymbol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0mPyCF_ONLY_AST\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreset_compiler_flags\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
33+
"\u001b[0;31mIndentationError\u001b[0m: unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)"
34+
]
35+
}
36+
],
37+
"source": [
38+
"\n",
39+
"!pip install numpy\n",
40+
"import numpy as np\n",
41+
"print(np.__version__)\n",
42+
"\n",
43+
"!pip install pandas\n",
44+
"import pandas as pd\n",
45+
"print(pd.__version__)\n",
46+
"!pip install nltk\n",
47+
"import nltk\n",
48+
"print(nltk.__version__)\n",
49+
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
50+
"import re\n",
51+
"from textblob import TextBlob\n",
52+
"from wordcloud import WordCloud\n",
53+
"import seaborn as sns\n",
54+
"import matplotlib.pyplot as plt\n",
55+
"import cufflinks as cf\n",
56+
"%matplotlib inline\n",
57+
"from plotly.offline import init_notebook_mode,iplot\n",
58+
"init_notebook_mode(connected = True)\n",
59+
"cf.go_offline();\n",
60+
"import plotly.graph_objs as go\n",
61+
"from plotly.subplots import make_subplots\n",
62+
"\n",
63+
"import warnings\n",
64+
"warnings.filterwarnings(\"ignore\")\n",
65+
"warnings.warn(\"this will not show\")\n",
66+
"pd.set_option('display.max_columns',None)\n",
67+
"df= pd.read_csv(\"/content/amazon.csv\")\n",
68+
"del df[df.columns[0]]\n",
# --- Load, clean, and sanity-check the review dataset ------------------------
# Drop rows missing any analysis-relevant column, persist the cleaned copy,
# then re-load it and run basic data-quality checks.
df = pd.read_csv("/content/amazon.csv")
columns_to_clean = ['reviewerName', 'overall', 'reviewText', 'reviewTime',
                    'day_diff', 'helpful_yes', 'helpful_no', 'total_vote',
                    'score_pos_neg_diff', 'score_average_rating',
                    'wilson_lower_bound']
df_cleaned = df.dropna(subset=columns_to_clean)
df_cleaned.to_csv('cleaned_dataset.csv', index=False)  # Colab cwd is /content, so this matches the read below

# Check for missing values
df = pd.read_csv("/content/cleaned_dataset.csv")
missing_values = df.isnull().sum().sum()
if missing_values == 0:
    print("No missing values found.")
else:
    # BUG FIX: the original literal lacked the f-prefix, so the text
    # "{missing_values}" was printed instead of the actual count.
    print(f"Missing values found in the dataset.total:{missing_values}")

# Check for duplicates
duplicate_rows = df.duplicated().sum()
if duplicate_rows == 0:
    print("No duplicate rows found.")
else:
    print("Duplicate rows found.")

# Validate data types
print("Data types:")
print(df.dtypes)

# Explore outliers (assumes numerical columns)
outliers = df.describe().loc[['min', 'max']].T
print("Outliers (min/max values per column):")
print(outliers)

df = pd.read_csv("/content/cleaned_dataset.csv")  # re-read kept for parity with the original flow
def missing_values_analysis(df):
    """Summarize missing data per affected column.

    Returns a DataFrame indexed by the columns of ``df`` that contain at
    least one null, with two columns: 'Missing Values' (null count) and
    'Ratio' (null percentage of all rows, rounded to 2 decimals).
    """
    cols_with_na = [c for c in df.columns if df[c].isnull().any()]
    null_counts = df[cols_with_na].isnull().sum()
    null_pct = (null_counts / df.shape[0] * 100).sort_values(ascending=True)
    return pd.concat(
        [pd.Series(null_counts), np.round(null_pct, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )
def check_dataframe(df, head=5, tail=5):
    """Print a structural overview of ``df``: shape, dtypes, missing-value
    summary, duplicate count, and quantiles of the numeric columns.

    Note: ``head`` and ``tail`` are accepted but unused (kept for
    interface compatibility).
    """
    def banner(label, width=82):
        # '~'-padded section separator, matching the original output exactly.
        print(label.center(width, '~'))

    banner("SHAPE")
    print('Rows: {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))

    banner("TYPES")
    print(df.dtypes)

    banner("")
    print(missing_values_analysis(df))

    banner('DUPLICATED VALUES', 83)
    print(df.duplicated().sum())

    banner("QUANTILES")
    numeric_part = df.select_dtypes(include=['int', 'float'])
    print("QUANTILES:")
    print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]))

check_dataframe(df)
def check_class(data_frame):
    """Return each column with its number of distinct values.

    The result has columns 'Variable' and 'Classes', sorted by 'Classes'
    descending, with a fresh 0..n-1 index.

    FIX: the original wrapped the list comprehension with an explicit
    backslash line-continuation inside the dict literal — the spot the
    cell's recorded IndentationError traceback points at (cell line 126).
    Continuations are unnecessary inside brackets, so it is removed.
    """
    nunique_df = pd.DataFrame({
        'Variable': data_frame.columns,
        'Classes': [data_frame[col].nunique() for col in data_frame.columns],
    })
    return nunique_df.sort_values('Classes', ascending=False).reset_index(drop=True)
check_class(df)

# NOTE(review): `go` and `make_subplots` are already imported in the setup
# section above, so the duplicate plotly imports were dropped here.
# Palette shared by the bar and pie traces.
constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']

def categeorical_variable_summary(df, column_name):
    """Display a two-panel figure for one categorical column: a count bar
    chart (left) and a percentage pie chart (right).

    NOTE(review): name keeps the original 'categeorical' spelling so any
    existing callers keep working.
    """
    # FIX: value_counts() was recomputed five times per call; hoist it once.
    counts = df[column_name].value_counts()

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Countplot', 'Percentage'),
                        specs=[[{"type": "xy"}, {'type': "domain"}]])  # pie needs a 'domain' cell
    fig.add_trace(go.Bar(y=counts.values.tolist(),
                         x=[str(i) for i in counts.index],
                         text=counts.values.tolist(),
                         textfont=dict(size=14),
                         name=column_name,
                         textposition='auto',
                         showlegend=False,
                         marker=dict(color=constraints,
                                     line=dict(color='#DBE6EC',
                                               width=1))),
                  row=1, col=1)
    fig.add_trace(go.Pie(labels=counts.keys(),
                         values=counts.values,
                         textfont=dict(size=18),
                         textposition='auto',
                         showlegend=False,
                         name=column_name,
                         marker=dict(colors=constraints)),
                  row=1, col=2)
    fig.update_layout(title={'text': column_name,
                             'y': 0.9,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                      template='plotly_white')
    display(fig)

categeorical_variable_summary(df, 'overall')
167+
"\n",
168+
"\n",
169+
"\n",
170+
"\n",
171+
"\n",
172+
"\n",
173+
"\n",
174+
"\n",
175+
"\n",
176+
"\n",
177+
"\n",
178+
"\n",
179+
"\n"
180+
]
181+
},
182+
{
183+
"cell_type": "markdown",
184+
"source": [
185+
"# New Section"
186+
],
187+
"metadata": {
188+
"id": "2tFeBscJiE4L"
189+
}
190+
},
191+
{
192+
"cell_type": "code",
193+
"execution_count": 1,
194+
"metadata": {
195+
"id": "cBzE5CQXR_tM"
196+
},
197+
"outputs": [],
198+
"source": []
199+
}
200+
],
201+
"metadata": {
202+
"colab": {
203+
"provenance": [],
204+
"mount_file_id": "1dYDRXWpObORZhMxwjIYOMlmVM5A-Vqiy",
205+
"authorship_tag": "ABX9TyMPxOvxKC3egh8n8I0nm6SN",
206+
"include_colab_link": true
207+
},
208+
"kernelspec": {
209+
"display_name": "Python 3",
210+
"name": "python3"
211+
},
212+
"language_info": {
213+
"name": "python"
214+
}
215+
},
216+
"nbformat": 4,
217+
"nbformat_minor": 0
218+
}

0 commit comments

Comments
 (0)