-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWeek10_Flipped_Exercise.R
177 lines (101 loc) · 4.17 KB
/
Week10_Flipped_Exercise.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
##### IE332 Week 10 Flipped Exercise ###########
# Use diabetes data set-------
# Fits a logistic regression on the first 80% of the rows of the diabetes
# data and evaluates it on the remaining rows.

# Initialization: set a specific seed to obtain reproducible results.
# NOTE: nothing in this section draws random numbers (the train/test split
# below is sequential), so the seed only matters if sampling is added later.
set.seed(1234)

#1. Read the data set to a data frame
diabetesData <- read.csv("data/diabetes.csv")

#2. Check out the first few rows
head(diabetesData)

#3. Check the structure of the data set to see if we have correct
#   categorical/numerical variables
str(diabetesData)

#4. Make the "Outcome" column categorical (factor) if needed
#   (glm() with family = binomial also accepts a numeric 0/1 response)
#diabetesData$Outcome <- as.factor(diabetesData$Outcome)

#5. How many patients of each class (diabetes-1, no diabetes-0) are there?
xtabs(~Outcome, data = diabetesData)

#6. Divide the data set into training and test data sets
#   (first 80% of the rows for training, the rest for testing).
#   The split point is derived from the row count instead of being typed in
#   by hand; for the 768-row file this is rows 1-615 / 616-768.
nRows <- nrow(diabetesData)
nTrain <- ceiling(0.8 * nRows)
nTrain
isTrainRow <- seq_len(nRows) <= nTrain
trainSetDiabetes <- diabetesData[isTrainRow, ]
testSetDiabetets <- diabetesData[!isTrainRow, ]

#7. Build a logistic regression model using only training data set
logitDiabetes <- glm(Outcome ~ ., family = binomial, data = trainSetDiabetes)
summary(logitDiabetes)

#8. Use test set to predict the outcome, which is 0 (no) or 1 (yes)
#   type = "response" returns probabilities; 0.5 is the threshold
pHat1 <- predict(logitDiabetes, newdata = testSetDiabetets, type = "response")
yHat1 <- ifelse(pHat1 >= 0.5, 1, 0)

#9. Create confusion matrix manually (rows = actual, columns = predicted).
#   Both factors are pinned to levels 0/1 so the table is always 2 x 2,
#   even if one class never shows up in the predictions.
confMatrix <- table(
  factor(testSetDiabetets$Outcome, levels = c(0, 1)),
  factor(yHat1, levels = c(0, 1)),
  dnn = c("actual", "predicted")
)
confMatrix

#10. Calculate precision, recall, accuracy, sensitivity and specificity
#    Cells are looked up by label rather than by position.
TP <- confMatrix["1", "1"]
FP <- confMatrix["0", "1"]
TN <- confMatrix["0", "0"]
FN <- confMatrix["1", "0"]

# Accuracy rate, computed directly from the predictions
mean(yHat1 == testSetDiabetets$Outcome) * 100

# accuracy = (TP+TN)/(TP+FN+TN+FP)
accuracy <- (TP + TN) / (TP + FN + TN + FP)
accuracy * 100

# precision = TP/(TP+FP)  (NaN if nothing was predicted positive)
precision <- TP / (TP + FP)
100 * precision

# recall = TP/(TP+FN)
recall <- TP / (TP + FN)
100 * recall

# sensitivity = TP/(TP+FN), i.e. the same quantity as recall
sensitivity <- TP / sum(testSetDiabetets$Outcome == 1)
100 * sensitivity

# specificity = TN/(TN+FP)
specificity <- TN / sum(testSetDiabetets$Outcome == 0)
100 * specificity
#11. Go to our e-book and solve Question 33 of Exercise 9.3. (page 97)
# Compares two logistic regression models (y ~ x1 and y ~ x1 + x2) by
# training on rows 1-75 and measuring accuracy on rows 76-100, then refits
# the two-predictor model on all rows to score one new observation.
# Assumes y is coded 0/1, as required by family = binomial -- confirm
# against the worksheet.

# read_excel() comes from the readxl package, which this script never
# attaches, so the call is namespace-qualified.
FlippedDataSet <- readxl::read_excel(
  "data/jaggia_ba_2e_ch09_data.xlsx",
  sheet = "Exercise_9.33"
)
head(FlippedDataSet)
summary(FlippedDataSet)

trainingFlippedSet <- FlippedDataSet[1:75, ]
testFlippedSet <- FlippedDataSet[76:100, ]

# family = binomial is required for a logistic regression; glm() defaults
# to gaussian, which would fit an ordinary linear model instead.
logitFlipped1 <- glm(y ~ x1, family = binomial, data = trainingFlippedSet)
logitFlipped2 <- glm(y ~ x1 + x2, family = binomial, data = trainingFlippedSet)

# Predicted probabilities on the test rows, classified at the 0.5 threshold
pHatFlipped1 <- predict(logitFlipped1, newdata = testFlippedSet,
                        type = "response")
yHatFlipped1 <- ifelse(pHatFlipped1 >= 0.5, 1, 0)
pHatFlipped2 <- predict(logitFlipped2, newdata = testFlippedSet,
                        type = "response")
yHatFlipped2 <- ifelse(pHatFlipped2 >= 0.5, 1, 0)

# Accuracy rate (percent of test rows classified correctly) for each model
mean(yHatFlipped1 == testFlippedSet$y) * 100
mean(yHatFlipped2 == testFlippedSet$y) * 100

# Refit the two-predictor model on the full data set and predict the
# probability for x1 = 25, x2 = 50. type = "response" is needed because
# predict() on a glm returns the linear predictor (log-odds) by default.
logitFlipped3 <- glm(y ~ x1 + x2, family = binomial, data = FlippedDataSet)
predict(logitFlipped3, newdata = data.frame(x1 = 25, x2 = 50),
        type = "response")
#12. Go to our e-book and solve Question 35 of Exercise 9.3. (page 97)
# Trains a logistic regression (y ~ x1 + x2) on rows 1-75, classifies rows
# 76-100 at the 0.5 threshold and reports the usual confusion-matrix metrics.
# Assumes y is coded 0/1, as required by family = binomial -- confirm
# against the worksheet.

# read_excel() comes from the readxl package, which this script never
# attaches, so the call is namespace-qualified.
myDataFlipped35 <- readxl::read_excel(
  "data/jaggia_ba_2e_ch09_data.xlsx",
  sheet = "Exercise_9.35"
)
summary(myDataFlipped35)

trainingFlippedSet2 <- myDataFlipped35[1:75, ]
testFlippedSet2 <- myDataFlipped35[76:100, ]

# family = binomial is required for a logistic regression; glm() defaults
# to gaussian, which would fit an ordinary linear model instead.
logitFlipped35 <- glm(y ~ x1 + x2, family = binomial,
                      data = trainingFlippedSet2)
pHatFlipped35 <- predict(logitFlipped35, newdata = testFlippedSet2,
                         type = "response")
yHatFlipped35 <- ifelse(pHatFlipped35 >= 0.5, 1, 0)

# Accuracy rate, computed directly from the predictions
mean(yHatFlipped35 == testFlippedSet2$y) * 100

# Create confusion matrix manually (rows = actual, columns = predicted).
# The actual labels are the y column only; passing the whole data frame to
# table() alongside the prediction vector fails with
# "all arguments must have the same length".
# Levels are pinned to 0/1 so the table is always 2 x 2.
confMatrix <- table(
  factor(testFlippedSet2$y, levels = c(0, 1)),
  factor(yHatFlipped35, levels = c(0, 1)),
  dnn = c("actual", "predicted")
)
confMatrix

# Calculate precision, recall, accuracy, sensitivity and specificity
# Cells are looked up by label rather than by position.
TP <- confMatrix["1", "1"]
FP <- confMatrix["0", "1"]
TN <- confMatrix["0", "0"]
FN <- confMatrix["1", "0"]

# accuracy = (TP+TN)/(TP+FN+TN+FP)
accuracy <- (TP + TN) / (TP + FN + TN + FP)
accuracy * 100

# precision = TP/(TP+FP)  (NaN if nothing was predicted positive)
precision <- TP / (TP + FP)
100 * precision

# recall = TP/(TP+FN)
recall <- TP / (TP + FN)
100 * recall

# sensitivity = TP/(TP+FN), i.e. the same quantity as recall
sensitivity <- TP / sum(testFlippedSet2$y == 1)
100 * sensitivity

# specificity = TN/(TN+FP)
specificity <- TN / sum(testFlippedSet2$y == 0)
100 * specificity