{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/usha-madithati/esmart.github.io/blob/main/Yet_another_copy_of_SentimentAnalysis.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 228
        },
        "id": "6fldNk57Fwkt",
        "outputId": "f9c44560-ffa0-4a4a-ccca-1371c31242e3"
      },
      "outputs": [
        {
          "output_type": "error",
          "ename": "IndentationError",
          "evalue": "unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mIndentationError\u001b[0m                          Traceback (most recent call last)",
            "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/compilerop.py\u001b[0m in \u001b[0;36mast_parse\u001b[0;34m(self, source, filename, symbol)\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0mArguments\u001b[0m \u001b[0mare\u001b[0m \u001b[0mexactly\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32min\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mstandard\u001b[0m \u001b[0mlibrary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m and are passed to the built-in compile function.\"\"\"\n\u001b[0;32m--> 101\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcompile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msymbol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mflags\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0mPyCF_ONLY_AST\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mreset_compiler_flags\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mIndentationError\u001b[0m: unexpected indent (<ipython-input-3-cbcedf656fe3>, line 126)"
          ]
        }
      ],
      "source": [
        "!pip install numpy\n",
        "import numpy as np\n",
        "print(np.__version__)\n",
        "\n",
        "!pip install pandas\n",
        "import pandas as pd\n",
        "print(pd.__version__)\n",
        "\n",
        "!pip install nltk\n",
        "import nltk\n",
        "print(nltk.__version__)\n",
        "\n",
        "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
        "import re\n",
        "from textblob import TextBlob\n",
        "from wordcloud import WordCloud\n",
        "import seaborn as sns\n",
        "import matplotlib.pyplot as plt\n",
        "import cufflinks as cf\n",
        "%matplotlib inline\n",
        "from plotly.offline import init_notebook_mode, iplot\n",
        "init_notebook_mode(connected=True)\n",
        "cf.go_offline()\n",
        "import plotly.graph_objs as go\n",
        "from plotly.subplots import make_subplots\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "warnings.warn(\"this will not show\")  # suppressed by the filter above\n",
        "pd.set_option('display.max_columns', None)\n",
        "\n",
        "# Load the dataset and drop the unnamed index column\n",
        "df = pd.read_csv(\"/content/amazon.csv\")\n",
        "del df[df.columns[0]]\n",
        "\n",
        "# Reload, then drop rows with missing values in the columns of interest\n",
        "df = pd.read_csv(\"/content/amazon.csv\")\n",
        "columns_to_clean = ['reviewerName', 'overall', 'reviewText', 'reviewTime', 'day_diff', 'helpful_yes', 'helpful_no', 'total_vote', 'score_pos_neg_diff', 'score_average_rating', 'wilson_lower_bound']\n",
        "df_cleaned = df.dropna(subset=columns_to_clean)\n",
        "df_cleaned.to_csv('cleaned_dataset.csv', index=False)\n",
        "\n",
        "# Check for missing values\n",
        "df = pd.read_csv(\"/content/cleaned_dataset.csv\")\n",
        "missing_values = df.isnull().sum().sum()\n",
        "if missing_values == 0:\n",
        "    print(\"No missing values found.\")\n",
        "else:\n",
        "    print(f\"Missing values found in the dataset. Total: {missing_values}\")\n",
        "\n",
        "# Check for duplicates\n",
        "duplicate_rows = df.duplicated().sum()\n",
        "if duplicate_rows == 0:\n",
        "    print(\"No duplicate rows found.\")\n",
        "else:\n",
        "    print(f\"Duplicate rows found: {duplicate_rows}\")\n",
        "\n",
        "# Validate data types\n",
        "print(\"Data types:\")\n",
        "print(df.dtypes)\n",
        "\n",
        "# Explore outliers (this example assumes numerical columns)\n",
        "outliers = df.describe().loc[['min', 'max']].T\n",
        "print(\"Outliers (min/max values per column):\")\n",
        "print(outliers)\n",
        "\n",
        "def missing_values_analysis(df):\n",
        "    # Count and percentage of missing values per affected column\n",
        "    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]\n",
        "    n_miss = pd.Series(df[na_columns_].isnull().sum())\n",
        "    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)\n",
        "    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Missing Values', 'Ratio'])\n",
        "    return missing_df\n",
        "\n",
        "def check_dataframe(df, head=5, tail=5):\n",
        "    # One-shot overview: shape, dtypes, missing values, duplicates, quantiles\n",
        "    print(\"SHAPE\".center(82, '~'))\n",
        "    print('Rows: {}'.format(df.shape[0]))\n",
        "    print('Columns: {}'.format(df.shape[1]))\n",
        "    print(\"TYPES\".center(82, '~'))\n",
        "    print(df.dtypes)\n",
        "    print(\"MISSING VALUES\".center(82, '~'))\n",
        "    print(missing_values_analysis(df))\n",
        "    print('DUPLICATED VALUES'.center(82, '~'))\n",
        "    print(df.duplicated().sum())\n",
        "    print(\"QUANTILES\".center(82, '~'))\n",
        "    numerical_columns = df.select_dtypes(include=['int', 'float'])\n",
        "    quantiles = numerical_columns.quantile([0, 0.05, 0.50, 0.95, 0.99, 1])\n",
        "    print(quantiles)\n",
        "\n",
        "check_dataframe(df)\n",
        "\n",
        "def check_class(data_frame):\n",
        "    # Number of distinct values per column, most varied first\n",
        "    nunique_df = pd.DataFrame({'Variable': data_frame.columns,\n",
        "                               'Classes': [data_frame[i].nunique()\n",
        "                                           for i in data_frame.columns]})\n",
        "    nunique_df = nunique_df.sort_values('Classes', ascending=False)\n",
        "    nunique_df = nunique_df.reset_index(drop=True)\n",
        "    return nunique_df\n",
        "\n",
        "check_class(df)\n",
        "\n",
        "constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']\n",
        "\n",
        "def categorical_variable_summary(df, column_name):\n",
        "    # Side-by-side count plot and percentage pie chart for one column\n",
        "    fig = make_subplots(rows=1, cols=2,\n",
        "                        subplot_titles=('Countplot', 'Percentage'),\n",
        "                        specs=[[{\"type\": \"xy\"}, {\"type\": \"domain\"}]])  # pie traces need a 'domain' slot\n",
        "    fig.add_trace(go.Bar(y=df[column_name].value_counts().values.tolist(),\n",
        "                         x=[str(i) for i in df[column_name].value_counts().index],\n",
        "                         text=df[column_name].value_counts().values.tolist(),\n",
        "                         textfont=dict(size=14),\n",
        "                         name=column_name,\n",
        "                         textposition='auto',\n",
        "                         showlegend=False,\n",
        "                         marker=dict(color=constraints,\n",
        "                                     line=dict(color='#DBE6EC', width=1))),\n",
        "                  row=1, col=1)\n",
        "    fig.add_trace(go.Pie(labels=df[column_name].value_counts().keys(),\n",
        "                         values=df[column_name].value_counts().values,\n",
        "                         textfont=dict(size=18),\n",
        "                         textposition='auto',\n",
        "                         showlegend=False,\n",
        "                         name=column_name,\n",
        "                         marker=dict(colors=constraints)),\n",
        "                  row=1, col=2)\n",
        "    fig.update_layout(title={'text': column_name,\n",
        "                             'y': 0.9,\n",
        "                             'x': 0.5,\n",
        "                             'xanchor': 'center',\n",
        "                             'yanchor': 'top'},\n",
        "                      template='plotly_white')\n",
        "    display(fig)\n",
        "\n",
        "categorical_variable_summary(df, 'overall')\n"
      ]
    },
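    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`SentimentIntensityAnalyzer` is imported above but never called. A minimal sketch of how it could score the `reviewText` column follows; it assumes the `vader_lexicon` resource still needs downloading and that `df` is the cleaned frame from the cell above."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: VADER polarity scores per review (assumes df and reviewText from above).\n",
        "nltk.download('vader_lexicon')  # one-time download of the VADER lexicon\n",
        "sia = SentimentIntensityAnalyzer()\n",
        "\n",
        "# 'compound' is VADER's normalized overall sentiment score in [-1, 1]\n",
        "df['polarity_score'] = df['reviewText'].astype(str).map(\n",
        "    lambda text: sia.polarity_scores(text)['compound'])\n",
        "print(df[['reviewText', 'polarity_score']].head())\n"
      ]
    },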
    {
      "cell_type": "markdown",
      "source": [
        "# New Section"
      ],
      "metadata": {
        "id": "2tFeBscJiE4L"
      }
    },
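    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`WordCloud` is also imported but unused. A small sketch, assuming the same `df` and `reviewText` column, that renders the most frequent review terms:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: word cloud over all review text (assumes df from the earlier cell).\n",
        "text = \" \".join(df['reviewText'].astype(str))\n",
        "wc = WordCloud(width=800, height=400, background_color='white').generate(text)\n",
        "plt.figure(figsize=(10, 5))\n",
        "plt.imshow(wc, interpolation='bilinear')\n",
        "plt.axis('off')\n",
        "plt.show()\n"
      ]
    },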
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "cBzE5CQXR_tM"
      },
      "outputs": [],
      "source": []
    },
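    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The dataset already ships a `wilson_lower_bound` column. Below is a sketch of how such a score is conventionally computed from `helpful_yes` and `helpful_no` (the lower bound of the Wilson 95% confidence interval for the helpful-vote ratio); that the column was actually derived this way is an assumption."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import math\n",
        "\n",
        "def wilson_lower_bound_score(pos, neg, z=1.96):\n",
        "    # Lower bound of the Wilson score interval; z=1.96 ~ 95% confidence\n",
        "    n = pos + neg\n",
        "    if n == 0:\n",
        "        return 0.0\n",
        "    phat = pos / n\n",
        "    return (phat + z * z / (2 * n)\n",
        "            - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)\n",
        "\n",
        "# Compare the recomputed score against the dataset's own column (assumed derivation)\n",
        "df['wlb_check'] = df.apply(\n",
        "    lambda r: wilson_lower_bound_score(r['helpful_yes'], r['helpful_no']), axis=1)\n",
        "print(df[['helpful_yes', 'helpful_no', 'wilson_lower_bound', 'wlb_check']].head())\n"
      ]
    }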
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "mount_file_id": "1dYDRXWpObORZhMxwjIYOMlmVM5A-Vqiy",
      "authorship_tag": "ABX9TyMPxOvxKC3egh8n8I0nm6SN",
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}