From e974ebe0235a7d19a250b70aaf84dcafa32d9671 Mon Sep 17 00:00:00 2001
From: la2738 <la2738@tc.columbia.edu>
Date: Sun, 8 Dec 2019 02:43:49 -0500
Subject: [PATCH] :)

---
 .gitignore        |   4 +++
 Assignment7.Rmd   |  65 +++++++++++++++++++++++++++++++++++++---------
 assignment7.Rproj |  13 ++++++++++
 tree1.ps          | Bin 0 -> 5833 bytes
 4 files changed, 70 insertions(+), 12 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 assignment7.Rproj
 create mode 100644 tree1.ps

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b6a065
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/Assignment7.Rmd b/Assignment7.Rmd
index 105cbdf..3c68a55 100644
--- a/Assignment7.Rmd
+++ b/Assignment7.Rmd
@@ -1,7 +1,7 @@
 ---
 title: "Assignment 7 - Answers"
-author: "Charles Lang"
-date: "11/30/2016"
+author: "Ling Ai"
+date: "12/2/2019"
 output: html_document
 ---
 
@@ -11,60 +11,85 @@ In the following assignment you will be looking at data from an one level of an
 
 #Upload data
 ```{r}
-
+D1 <-read.csv("online.data.csv")
 ```
 
 #Visualization 
 ```{r}
+library(ggplot2)
+library(dplyr)
+library(tidyr)
 #Start by creating histograms of the distributions for all variables (#HINT: look up "facet" in the ggplot documentation)
+D1$level.up <- ifelse(D1$level.up == "yes",1,0)
+D2 <- gather(D1, "variable", "score", 2:7)
+
+ggplot(D2, aes(score)) + facet_wrap(~variable, scales = "free") + geom_histogram()
 
 #Then visualize the relationships between variables
+pairs(D1)
 
 #Try to capture an intution about the data and the relationships
 
 ```
 #Classification tree
 ```{r}
+library(rpart)
 #Create a classification tree that predicts whether a student "levels up" in the online course using three variables of your choice (As we did last time, set all controls to their minimums)
+c.tree1 <- rpart(level.up ~ post.test.score + post.test.score + messages + forum.posts + av.assignment.score, method="class", data= D1)
 
 #Plot and generate a CP table for your tree 
+printcp(c.tree1)
+post(c.tree1, file = "tree1.ps", title = "level up")
 
 #Generate a probability value that represents the probability that a student levels up based your classification tree 
 
-D1$pred <- predict(rp, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on.
+D1$pred <- predict(c.tree1, type = "prob")[,2]#Last class we used type = "class" which predicted the classification for us, this time we are using type = "prob" to see the probability that our classififcation is based on.
 ```
 ## Part II
 #Now you can generate the ROC curve for your model. You will need to install the package ROCR to do this.
 ```{r}
+#install.packages("ROCR")
 library(ROCR)
 
 #Plot the curve
 pred.detail <- prediction(D1$pred, D1$level.up) 
-plot(performance(pred.detail, "tpr", "fpr"))
+plot(performance(pred.detail, "tpr", "fpr")) #"tpr" true positive rate, "fpr" false positive rate
 abline(0, 1, lty = 2)
 
 #Calculate the Area Under the Curve
-unlist(slot(performance(Pred2,"auc"), "y.values"))#Unlist liberates the AUC value from the "performance" object created by ROCR
+unlist(slot(performance(pred.detail,"auc"), "y.values"))
+
+#Unlist liberates the AUC value from the "performance" object created by ROCR
+#the area under the curve is 1
 
 #Now repeat this process, but using the variables you did not use for the previous model and compare the plots & results of your two models. Which one do you think was the better model? Why?
+
+pred.detail1 <- prediction(D1$post.test.score, D1$level.up)
+plot(performance(pred.detail1, "tpr", "fpr"))
+abline(0, 1, lty = 2)
+unlist(slot(performance(pred.detail1,"auc"), "y.values"))
+
+# The first model is better because it's AUC value is 1 and the second model has 0.919925.
 ```
 ## Part III
 #Thresholds
 ```{r}
 #Look at the ROC plot for your first model. Based on this plot choose a probability threshold that balances capturing the most correct predictions against false positives. Then generate a new variable in your data set that classifies each student according to your chosen threshold.
 
-threshold.pred1 <- 
+D1$threshold.pred1 <- ifelse(D1$pred >= 0.5, 1, 0)
 
 #Now generate three diagnostics:
 
-D1$accuracy.model1 <-
+D1$accuracy.model1 <- mean(ifelse(D1$level.up == D1$threshold.pred1, 1, 0))
+D1$accuracy.model1 <- as.integer(D1$accuracy.model1)
+accuracy1 <- sum(D1$accuracy.model1) / length(D1$accuracy.model1)
 
-D1$precision.model1 <- 
-
-D1$recall.model1 <- 
+D1$precision.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 1, 1, 0)
+precision1 <- sum(D1$precision.model1) / sum (D1$threshold.pred1) 
+D1$recall.model1 <- ifelse(D1$level.up == 1 & D1$threshold.pred1 == 1, 1, 0)
+recall1 <- sum(D1$precision.model1) / sum(D1$level.up)
 
 #Finally, calculate Kappa for your model according to:
-
 #First generate the table of comparisons
 table1 <- table(D1$level.up, D1$threshold.pred1)
 
@@ -75,7 +100,23 @@ matrix1 <- as.matrix(table1)
 kappa(matrix1, exact = TRUE)/kappa(matrix1)
 
 #Now choose a different threshold value and repeat these diagnostics. What conclusions can you draw about your two thresholds?
+D1$threshold.pred2 <- ifelse(D1$pred >= 0.9, 1, 0)
+
+D1$accuracy.model2 <- mean(ifelse(D1$level.up == D1$threshold.pred2, 1, 0))
+D1$accuracy.model2 <- as.integer(D1$accuracy.model2)
+accuracy2 <- sum(D1$accuracy.model2) / length(D1$accuracy.model2)
+
+D1$precision.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 1, 1, 0)
+precision2 <- sum(D1$precision.model2) / sum (D1$threshold.pred2) 
+D1$recall.model2 <- ifelse(D1$level.up == 1 & D1$threshold.pred2 == 1, 1, 0)
+recall2 <- sum(D1$precision.model2) / sum(D1$level.up)
+
+
+table2 <- table(D1$level.up, D1$threshold.pred2)
+matrix2 <- as.matrix(table2)
+kappa(matrix2, exact = TRUE)/kappa(matrix2)
 
+#For two models the value of kappa are the same.
 ```
 
 ### To Submit Your Assignment
diff --git a/assignment7.Rproj b/assignment7.Rproj
new file mode 100644
index 0000000..8e3c2eb
--- /dev/null
+++ b/assignment7.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
diff --git a/tree1.ps b/tree1.ps
new file mode 100644
index 0000000000000000000000000000000000000000..54ad3cf5ff581be7369b9d9018b195d09e2c565d
GIT binary patch
literal 5833
zcmd5=+iu)O5`E`a)IJE1v6vn<n^(bWAo3+KU`K`>U>^)WaLCq(yA;VKIkE$-{(Vn1
z*}RO#juT)3C$eUWRn=9eZe7&b-`;)L{FK)_y-~t-&dy%c+2K}K{jXZ*I)AUb`p{;&
zyP)g3>gg9<-s!%`Qb_*&=+Wk~F7s2*uXbhepMyTdVwFei55M2;>T==BTb&o_1*JX(
zNkFkL$))or_<_uCMPKR*dQU&MX>(I#9bFyz=Fo%nywz!6x6J%dU-zHWmVJ1a?itQ`
z+UqLk6Rz3<c+<Y9s|$LQR(Y4D4NG3uhbk|s{bl{-f@I7;K@?H!`Cz`R^5^v}N2+5{
zy)UYFZC%!TXS-{N?rGmq_x|Tgc3HSCJnq}H>PncU#HF@|Ma@DFl<RBfjCOnkiKY|H
zcn+B6ob*mZYFAXb5#8=v^Ssp^(5+e8nC@=s&kbM_B=NJ<y(QMYt&zog#VGlCTNc$Z
zqcTTU8YlKQE0xS}va;^<Bi^;Kb6u3>N~JcByS}aey>>h^3~R+D6}ptR*-9C(;iFF;
zqK9)w5R$UG`JEJP<nUa7$!-vC^I-uY;3G6h-5$zg{%H`6WP>NXZ9Hb|JI>5*PX^}R
z=EH(mhP_eWP&j2ET*>SDJ4{61R5@X!H}MQ;vkvG0Yba0$BK)5uRG<FBu|Pw7E^?GY
zQ#Y2)5v}wdQEkVpa0#OJPLyb$vY-C9tdh}Q#;@P8k~ih}$k0+WGnG7<9?;mFs1b+#
zoNCCxvvA<tSW9Ek48>%FFg_eUtj=6PSy?oQ#^Ziz_d6~p-Q!D!QL*k!G&^&%wA5zQ
zLe6|Hu$>V`7Q~iA!pxX^`bgX7uU}rKoz8!{d~ThB?9->TeW9~D*Kgkbx_Ay2LF=CK
zLqo1`RpP}|>SGBb#=q{RClXJ2lJes2;p<{JDhMdebCTv8q{$OLBG}MZ)QW3eb|)-b
z<Zuv99-I8Mez1L;qBg$(Bz<&+FFikqfXkC!7%2uQ6Q%sM&IAr$T%m#}h(dgcWFP|%
zU!+WA;(bd;#;&i}jvIM03h*TfT;<AR=l*$9^!i<0;0lFzkO??E{4Y-2mGHA?v%ODT
zET&SrCe79Zvzw~vQ7N6By{@wIkZXH;Kf1iP(>;#aRpne~4%b1ct35h9FS35PkKoI_
zE&J`O*Dp}lbdUS$f$q`%JOUnQNZ$jtfi^8mXl!3rs5i_Bdi~+*4KCJ7PEYu*BgzhK
zi<^_}fI?9_+gI>G4t)k0TtlY#dB*%9oMG{pF&X-Y0_OciV0JZHU>hI(Q1H}Wgty%O
z77!PYe<+wd#IRa?XL*iW>~{G4e*pPP7X@E&6C3Nep$<zhc3P6ou5o0{3+nJY_LPVd
z-g5=+gRFG?Sa?1SzoSHnFrWaUcn0R!wRVP(^r&-q*L8{KlcN+*IPya&ypYcS`-B8$
zB=ru4Y{0{f?@+T%I3GW;lJ4`gyK$uDVH2N;e_?()60_C_-eRI+sl$g;K}ac~Lh@q8
zuH8B^h=q&*N($vs=|qWe6IduMXEU*C)>l3RFqB;>5^RW$^p%K`L0_>xo--`y51c`t
zSv8C!T0W^L7Czog(kF~DbvC$un7$Kf*A@GUUlv4{VP}7ei^2<nCxpFb-#WgBJHaQJ
zMC{~wQ+K`Su{zDdJ`0brQ;{da5F4@*zV|)Wp8aEGEmWZh6`*;z;2v3H;ia`L_S<`|
z6#Q@QoZ)>1Ap_n7R~fSFiy%bI7;l8j9z@AR>@va8-<T|54TuIJR^X_F34Aia%Lfxt
z3_=D>Ad?<$$i#@D@R-1nhLhlJOoWmNUwFt92i_1^KBSPw1D{VrWJfVX*cGAi0B>Y`
z(GNO-kjSi%oXZUeN$}lLkOO58#}S~0GjMvulsz7(0n=KBPoM@pCEzZo!C!*}l!XG2
zpazr%KcX-M0HFcT#s(U^G>>630R=Tg)WjOp5NwkMrZ@@{!7-iW80+eY)#)q4lTSW$
zOirx36A}*a3Z%&i#7v4!b$~FTnWDyJ;d9k<K-p1~6C3MQ8kPxKX3nUO+BVn2v2ziO
z)nOblE{<ui^)$FX7n-d=2D>ax#AmA6Wii_-=YZPs=Sbt-xKTV<%qRuGX)*yKW+Ad;
ziX8fq2~&K_gAuUVBB%Xvm#~kgyTnvH;9>x-yJYA&laaYrhb}RFx(q3L0Ta;{zsU&t
z!Wf;>Gg6SVnL|HfCfKBc8oOjX{#2I?POS<Eu+S|WgAvM6m!P}QCC2)ySXtwbyTk<h
zs4+0P=9WQkGQmE7f0qnCaaN};;i0xdvI1$l#7v3PUBc+6fb!@vIkB;x(!hXhf`;#N
z=4?t?+jhxtEOIe*<vc@|Wpk=a7?bT12D>cHWz8;&*;Y9R)RsT@{M;onjPQC%%5^#H
zw^$-J%7g?Z9wq{d1<!Z<U1)qbYbsp%s1>}c37PyU%?LL+?%O}9X?L2)=ec$6&iAri
zb_~Mv&~>-IJG1SEf*lIIywI}@Y!3+-LF8>`t-y<U5AjAVIa#xeQOk>0#_T$Smv(GR
z3u7Y#n+E>Cb`GuCROH4yhvcX;n;IFT&Jn6_?T%(W^rqF9(6H<EF!4o<KdSnuCm=8N
lVsXNqe14}(Iy81m()=p;TbmZ8Zt-pWJFdjnmshWx{{gdt^QHg*

literal 0
HcmV?d00001