diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/data.csv b/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/data.csv new file mode 100644 index 00000000..8f00ee5e --- /dev/null +++ b/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/data.csv @@ -0,0 +1,8 @@ +Name,Age,Position,Experience,Salary +Nitin,24,ML Engg,4,22000 +Harshita,23,ML Engg,3,18000 +Arya,,Data Analyst,, +Gagan,,Data Analyst,2,10000 +Randhir,,,4,16000 +Saurav,22,admin,3,8000 +Manoj,31,Manager,11,25000 diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/place_area_price.csv b/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/place_area_price.csv new file mode 100644 index 00000000..ea161858 --- /dev/null +++ b/Machine Learning/Libraries/Data Preprocessing in ML/Dataset/place_area_price.csv @@ -0,0 +1,9 @@ +place,area,price +Lonavala,2600,550000 +Lonavala,3000,565000 +Lonavala,3200,610000 +Karjat,2600,450000 +Karjat,2800,500000 +Khandala,2600,600000 +Khandala,2900,650000 +Khandala,3200,680000 diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Handling_categorical_data.ipynb b/Machine Learning/Libraries/Data Preprocessing in ML/Handling_categorical_data.ipynb new file mode 100644 index 00000000..798997e4 --- /dev/null +++ b/Machine Learning/Libraries/Data Preprocessing in ML/Handling_categorical_data.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# handling categorical data using dummies from pandas\r\n", + "\r\n", + "import pandas as pd" + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "data = pd.read_csv(\"E:\\github\\DS-ScriptsNook\\Machine Learning\\Libraries\\Data Preprocessing in ML\\Dataset\\place_area_price.csv\") #load the data\r\n", + "print(data)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " place area price\n", + "0 Lonavala 2600 550000\n", 
+ "1 Lonavala 3000 565000\n", + "2 Lonavala 3200 610000\n", + "3 Karjat 2600 450000\n", + "4 Karjat 2800 500000\n", + "5 Khandala 2600 600000\n", + "6 Khandala 2900 650000\n", + "7 Khandala 3200 680000\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "dummies = pd.get_dummies(data.place) # dummies is a function in pandas to convert categorical values into numerical ones\r\n", + "print(dummies)\r\n", + "\r\n", + "# the data is converted to numerical values but here we can see that if one col is deleted,\r\n", + "# we can predict the value for other cols\r\n", + "# eg1: if we drop the first row, and if it has to be 1 then the other 2 cols would have values 0\r\n", + "# eg2: if lonavala has 1, then we can say that khandala and karjat would be 0 and hence it is ok to,\r\n", + "# drop the first col which would become easy for the model to compute. This is shown in the eg below" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Karjat Khandala Lonavala\n", + "0 0 0 1\n", + "1 0 0 1\n", + "2 0 0 1\n", + "3 1 0 0\n", + "4 1 0 0\n", + "5 0 1 0\n", + "6 0 1 0\n", + "7 0 1 0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "new_dummies = pd.get_dummies(data.place, drop_first=True)\r\n", + "print(new_dummies)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Khandala Lonavala\n", + "0 0 1\n", + "1 0 1\n", + "2 0 1\n", + "3 0 0\n", + "4 0 0\n", + "5 1 0\n", + "6 1 0\n", + "7 1 0\n" + ] + } + ], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.6.4", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.6.4 64-bit" + }, + 
"interpreter": { + "hash": "f2db1a205d05422567bfea71378eb1163d4d5d7418f0062693a7c6bfcbaf4348" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Handling_numerical_data1.ipynb b/Machine Learning/Libraries/Data Preprocessing in ML/Handling_numerical_data1.ipynb new file mode 100644 index 00000000..39006155 --- /dev/null +++ b/Machine Learning/Libraries/Data Preprocessing in ML/Handling_numerical_data1.ipynb @@ -0,0 +1,269 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# check for null values\r\n", + "\r\n", + "import pandas as pd\r\n", + "data = pd.read_csv(\"E:\\github\\DS-ScriptsNook\\Machine Learning\\Libraries\\Data Preprocessing in ML\\Dataset\\data.csv\")\r\n", + "print(data)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "2 Arya NaN Data Analyst NaN NaN\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", + "4 Randhir NaN NaN 4.0 16000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "# check for null data\r\n", + "res = data.isnull().sum()\r\n", + "print(res)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Name 0\n", + "Age 3\n", + "Position 1\n", + "Experience 1\n", + "Salary 1\n", + "dtype: int64\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "# find out all whose age is null\r\n", + "d1 = data[data.Age.isnull()]\r\n", + "print(d1)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "2 Arya NaN Data Analyst NaN NaN\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", 
+ "4 Randhir NaN NaN 4.0 16000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "# if any value is null\r\n", + "d3 = data[data.isnull().any(axis=1)]\r\n", + "print(d3)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "2 Arya NaN Data Analyst NaN NaN\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", + "4 Randhir NaN NaN 4.0 16000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### How to drop ?" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "# drops if any null values are present\r\n", + "d1 = data.dropna(how=\"any\", axis=0)\r\n", + "print(d1)\r\n", + "\r\n", + "# arya, gagan, randhir got dropped" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "# drop whose salary is null\r\n", + "d2 = data.dropna(subset=[\"Salary\"])\r\n", + "print(d2)\r\n", + "\r\n", + "# arya got dropped" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", + "4 Randhir NaN NaN 4.0 16000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "### How to fill missing values ?" 
+ ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "d1 = data.fillna({\"Age\": data[\"Age\"].mean()}) # fillna fills a col with the mean/median values\r\n", + "print(d1)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "2 Arya 25.0 Data Analyst NaN NaN\n", + "3 Gagan 25.0 Data Analyst 2.0 10000.0\n", + "4 Randhir 25.0 NaN 4.0 16000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "# fill missing values in Experience\r\n", + "d2 = data.fillna({\"Experience\": data[\"Experience\"].mean()})\r\n", + "print(d2)\r\n", + "print(\" \")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "2 Arya NaN Data Analyst 4.5 NaN\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", + "4 Randhir NaN NaN 4.0 16000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n", + " \n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 9, + "source": [ + "# filling null values in Position by a default one\r\n", + "d3 = data.fillna({\"Position\": \"Unallocated\"})\r\n", + "print(d3)\r\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Name Age Position Experience Salary\n", + "0 Nitin 24.0 ML Engg 4.0 22000.0\n", + "1 Harshita 23.0 ML Engg 3.0 18000.0\n", + "2 Arya NaN Data Analyst NaN NaN\n", + "3 Gagan NaN Data Analyst 2.0 10000.0\n", + "4 Randhir NaN Unallocated 4.0 16000.0\n", + "5 Saurav 22.0 admin 3.0 8000.0\n", + "6 Manoj 31.0 Manager 11.0 25000.0\n" + ] + } + ], + "metadata": {} + } + ], + 
"metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.6.4", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.6.4 64-bit" + }, + "interpreter": { + "hash": "f2db1a205d05422567bfea71378eb1163d4d5d7418f0062693a7c6bfcbaf4348" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Images/img1.PNG b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img1.PNG new file mode 100644 index 00000000..e64c95d2 Binary files /dev/null and b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img1.PNG differ diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Images/img2.PNG b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img2.PNG new file mode 100644 index 00000000..ef50c48a Binary files /dev/null and b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img2.PNG differ diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Images/img3.PNG b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img3.PNG new file mode 100644 index 00000000..9bf71552 Binary files /dev/null and b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img3.PNG differ diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Images/img4.PNG b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img4.PNG new file mode 100644 index 00000000..37b60af2 Binary files /dev/null and b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img4.PNG differ diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/Images/img5.PNG b/Machine Learning/Libraries/Data Preprocessing in ML/Images/img5.PNG new file mode 100644 index 00000000..9e2c0c0b Binary files /dev/null and b/Machine 
Learning/Libraries/Data Preprocessing in ML/Images/img5.PNG differ diff --git a/Machine Learning/Libraries/Data Preprocessing in ML/README.md b/Machine Learning/Libraries/Data Preprocessing in ML/README.md new file mode 100644 index 00000000..1cce6136 --- /dev/null +++ b/Machine Learning/Libraries/Data Preprocessing in ML/README.md @@ -0,0 +1,105 @@ +### Data Pre-processing in ML + +### GOAL +To get a basic understanding of how to pre-process the data before feeding it into the model. + + +### PURPOSE +Our machine learning models just cannot take in the real data as it is, since it may contain inconsistent values, missing values or wrong values. Therefore it is highly important to learn this section before moving further. + + +### DESCRIPTION +- Data pre-processing means to understand the data. +- Data preprocessing is a process of preparing the raw data and making it suitable for a machine learning model. + +It is of 2 types: +- one to handle numerical values +- the other to handle categorical values + + +### WHAT I HAD DONE +1. Check for null values +- In order to proceed, we need to check if our data has any null values or not, +this can be checked with: + +**Syntax:** +data.isnull().sum() +This will give the number of null values in each column + +2. Handling missing values +- If you have only a few missing values and they are not that important, then you can directly drop them, + +**Syntax:** +d1 = data.dropna(how="any", axis=0), it drops rows which have any null values +d2 = data.dropna(subset=[column_name]), drops rows with null values in the column specified + +If you want to fill missing values, then you can do so as follows: +**Syntax:** +d1 = data.fillna({"Age": data["Age"].mean()}), for numerical data +d2 = data.fillna({"Position": "Unallocated"}), for categorical data + +3. 
Handling categorical values +- As we know that our model cannot work with categorical values, we have to convert them into numerical data and this can be done using the pandas library + +**Syntax:** +features = pd.get_dummies(data.place), eg: place is a column name in the data +It will convert the categorical data into 0/1 indicator columns, one per category + +new_features = pd.get_dummies(data.place, drop_first=True) +About **drop_first=True**, I've explained it in detail in Handling_categorical_data.ipynb + + +### WORKFLOW OF YOUR PROJECT FILES +1. Load the data using pandas +2. Understand the data, i.e. whether any null values are present or not. +3. If there are some null values in a numerical column, then they can be dropped or filled with the mean/median +4. If there are some null values in a categorical column, we can allocate a default value to them. +5. Lastly, if the data contains some categorical column, then it should be converted into numerical values using the **get_dummies** function in pandas + + +### STATE YOUR PROCEDURE AND UNDERSTANDING FROM YOUR WORK +- This is a simple approach towards data pre-processing and very easy for beginners to learn, and therefore I chose this methodology. +- With this approach as well the cleaning of the data can be done. +- With this repo, I got to revise my ML skills again + +### USAGE + +Data pre-processing should be the first step for any ML enthusiast since your model cannot take in noisy data, and hence it is highly important to learn this. + +## USE CASES + +This step should be done compulsorily in all cases before developing an ML model. 
+ +### LIBRARIES USED + +- pandas (pip install pandas) + +**ADVANTAGES** +- Data pre-processing helps us in feeding the data into the model without any problems + +**DISADVANTAGES** + +None + +**APPLICATIONS** +- Data preprocessing is used in database-driven applications such as customer relationship management and rule-based applications + +**SCREENSHOTS** + +None +None +None + +### CONCLUSION + +Therefore, before building any ML model, one must take care of the data preprocessing step because real-world data is often incomplete, inconsistent, and/or lacking in certain behaviors or trends, and is likely to contain many errors. + +### REFERENCES +https://scikit-learn.org/stable/modules/preprocessing.html +https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9 +https://www.javatpoint.com/data-preprocessing-machine-learning + +### YOUR NAME +Karakattil Dilrose Reji + 