Skip to content

Commit f355be3

Browse files
Merge pull request #124 from dilroseR/issue-#110
Data Preprocessing in ML
2 parents b27d297 + 4d48ed2 commit f355be3

File tree

10 files changed

+514
-0
lines changed

10 files changed

+514
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Name,Age,Position,Experience,Salary
2+
Nitin,24,ML Engg,4,22000
3+
Harshita,23,ML Engg,3,18000
4+
Arya,,Data Analyst,,
5+
Gagan,,Data Analyst,2,10000
6+
Randhir,,,4,16000
7+
Saurav,22,admin,3,8000
8+
Manoj,31,Manager,11,25000
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
place,area,price
2+
Lonavala,2600,550000
3+
Lonavala,3000,565000
4+
Lonavala,3200,610000
5+
Karjat,2600,450000
6+
Karjat,2800,500000
7+
Khandala,2600,600000
8+
Khandala,2900,650000
9+
Khandala,3200,680000
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"source": [
7+
"# handling categorical data using dummies from pandas\r\n",
8+
"\r\n",
9+
"import pandas as pd"
10+
],
11+
"outputs": [],
12+
"metadata": {}
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 3,
17+
"source": [
18+
"data = pd.read_csv(\"E:\\github\\DS-ScriptsNook\\Machine Learning\\Libraries\\Data Preprocessing in ML\\Dataset\\place_area_price.csv\") #load the data\r\n",
19+
"print(data)"
20+
],
21+
"outputs": [
22+
{
23+
"output_type": "stream",
24+
"name": "stdout",
25+
"text": [
26+
" place area price\n",
27+
"0 Lonavala 2600 550000\n",
28+
"1 Lonavala 3000 565000\n",
29+
"2 Lonavala 3200 610000\n",
30+
"3 Karjat 2600 450000\n",
31+
"4 Karjat 2800 500000\n",
32+
"5 Khandala 2600 600000\n",
33+
"6 Khandala 2900 650000\n",
34+
"7 Khandala 3200 680000\n"
35+
]
36+
}
37+
],
38+
"metadata": {}
39+
},
40+
{
41+
"cell_type": "code",
42+
"execution_count": 4,
43+
"source": [
44+
"dummies = pd.get_dummies(data.place) # dummies is a function in pandas to convert categorical values into numerical ones\r\n",
45+
"print(dummies)\r\n",
46+
"\r\n",
47+
"# the data is converted to numerical values but here we can see that if one col is deleted,\r\n",
48+
"# we can predict the value for other cols\r\n",
49+
"# eg1: if we drop the first row, and if it has to be 1 then the other 2 cols would have values 0\r\n",
50+
"# eg2: if lonavala has 1, then we can say that khandala and karjat would be 0 and hence it is ok to,\r\n",
51+
"# drop the first col which would become easy for the model to compute. This is shown in the eg below"
52+
],
53+
"outputs": [
54+
{
55+
"output_type": "stream",
56+
"name": "stdout",
57+
"text": [
58+
" Karjat Khandala Lonavala\n",
59+
"0 0 0 1\n",
60+
"1 0 0 1\n",
61+
"2 0 0 1\n",
62+
"3 1 0 0\n",
63+
"4 1 0 0\n",
64+
"5 0 1 0\n",
65+
"6 0 1 0\n",
66+
"7 0 1 0\n"
67+
]
68+
}
69+
],
70+
"metadata": {}
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 5,
75+
"source": [
76+
"new_dummies = pd.get_dummies(data.place, drop_first=True)\r\n",
77+
"print(new_dummies)"
78+
],
79+
"outputs": [
80+
{
81+
"output_type": "stream",
82+
"name": "stdout",
83+
"text": [
84+
" Khandala Lonavala\n",
85+
"0 0 1\n",
86+
"1 0 1\n",
87+
"2 0 1\n",
88+
"3 0 0\n",
89+
"4 0 0\n",
90+
"5 1 0\n",
91+
"6 1 0\n",
92+
"7 1 0\n"
93+
]
94+
}
95+
],
96+
"metadata": {}
97+
}
98+
],
99+
"metadata": {
100+
"orig_nbformat": 4,
101+
"language_info": {
102+
"name": "python",
103+
"version": "3.6.4",
104+
"mimetype": "text/x-python",
105+
"codemirror_mode": {
106+
"name": "ipython",
107+
"version": 3
108+
},
109+
"pygments_lexer": "ipython3",
110+
"nbconvert_exporter": "python",
111+
"file_extension": ".py"
112+
},
113+
"kernelspec": {
114+
"name": "python3",
115+
"display_name": "Python 3.6.4 64-bit"
116+
},
117+
"interpreter": {
118+
"hash": "f2db1a205d05422567bfea71378eb1163d4d5d7418f0062693a7c6bfcbaf4348"
119+
}
120+
},
121+
"nbformat": 4,
122+
"nbformat_minor": 2
123+
}

0 commit comments

Comments
 (0)